From laffer1 at midnightbsd.org Sat Feb 8 14:26:24 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:26:24 -0500 (EST) Subject: [Midnightbsd-cvs] src [12301] trunk/sys/xen/interface/io: sync with FreeBSD 11-stable Message-ID: <202002081926.018JQO74060620@stargazer.midnightbsd.org> Revision: 12301 http://svnweb.midnightbsd.org/src/?rev=12301 Author: laffer1 Date: 2020-02-08 14:26:24 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/io/blkif.h trunk/sys/xen/interface/io/console.h trunk/sys/xen/interface/io/fbif.h trunk/sys/xen/interface/io/kbdif.h trunk/sys/xen/interface/io/libxenvchan.h trunk/sys/xen/interface/io/netif.h trunk/sys/xen/interface/io/pciif.h trunk/sys/xen/interface/io/protocols.h trunk/sys/xen/interface/io/ring.h trunk/sys/xen/interface/io/tpmif.h trunk/sys/xen/interface/io/usbif.h trunk/sys/xen/interface/io/vscsiif.h trunk/sys/xen/interface/io/xenbus.h trunk/sys/xen/interface/io/xs_wire.h Modified: trunk/sys/xen/interface/io/blkif.h =================================================================== --- trunk/sys/xen/interface/io/blkif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/blkif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -60,7 +60,7 @@ * All data in the XenStore is stored as strings. Nodes specifying numeric * values are encoded in decimal. Integer value ranges listed below are * expressed as fixed sized integer types capable of storing the conversion - * of a properly formatted node string, without loss of information. + * of a properly formated node string, without loss of information. * * Any specified default value is in effect if the corresponding XenBus node * is not present in the XenStore. @@ -89,10 +89,16 @@ * params * Values: string * - * Data used by the backend driver to locate and configure the backing - * device. The format and semantics of this data vary according to the - * backing device in use and are outside the scope of this specification. + * A free formatted string providing sufficient information for the + * backend driver to open the backing device. (e.g. the path to the + * file or block device representing the backing store.) * + * physical-device + * Values: "MAJOR:MINOR" + * + * MAJOR and MINOR are the major number and minor number of the + * backing device respectively. + * * type * Values: "file", "phy", "tap" * @@ -319,7 +325,7 @@ * access (even when it should be read-only). If the frontend hits the * maximum number of allowed persistently mapped grants, it can fallback * to non persistent mode. This will cause a performance degradation, - * since the the backend driver will still try to map those grants + * since the backend driver will still try to map those grants * persistently. Since the persistent grants protocol is compatible with * the previous protocol, a frontend driver can choose to work in * persistent mode even when the backend doesn't support it. @@ -494,7 +500,7 @@ * discarded region on the device must be rendered unrecoverable before the * command returns. * - * This operation is analogous to performing a trim (ATA) or unmap (SCSI), + * This operation is analogous to performing a trim (ATA) or unamp (SCSI), * command on a native device. * * More information about trim/unmap operations can be found at: @@ -559,7 +565,6 @@ /* @last_sect: last sector in frame to transfer (inclusive). 
*/ uint8_t first_sect, last_sect; }; -typedef struct blkif_request_segment blkif_request_segment_t; /* * Starting ring element for any I/O request. @@ -570,7 +575,7 @@ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; typedef struct blkif_request blkif_request_t; Modified: trunk/sys/xen/interface/io/console.h =================================================================== --- trunk/sys/xen/interface/io/console.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/console.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -44,7 +44,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/fbif.h =================================================================== --- trunk/sys/xen/interface/io/fbif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/fbif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -169,7 +169,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/kbdif.h =================================================================== --- trunk/sys/xen/interface/io/kbdif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/kbdif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -125,7 +125,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/libxenvchan.h =================================================================== --- trunk/sys/xen/interface/io/libxenvchan.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/libxenvchan.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -22,8 +22,7 @@ * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * License along with this library; If not, see <http://www.gnu.org/licenses/>. * * @section DESCRIPTION * Modified: trunk/sys/xen/interface/io/netif.h =================================================================== --- trunk/sys/xen/interface/io/netif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/netif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -32,6 +32,24 @@ #include "../grant_table.h" /* + * Older implementation of Xen network frontend / backend has an + * implicit dependency on the MAX_SKB_FRAGS as the maximum number of + * ring slots a skb can use. Netfront / netback may not work as + * expected when frontend and backend have different MAX_SKB_FRAGS. + * + * A better approach is to add mechanism for netfront / netback to + * negotiate this value. However we cannot fix all possible + * frontends, so we need to define a value which states the minimum + * slots backend must support. + * + * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS + * (18), which is proved to work with most frontends. 
Any new backend + * which doesn't negotiate with frontend should expect frontend to + * send a valid packet using slots up to this value. + */ +#define XEN_NETIF_NR_SLOTS_MIN 18 + +/* * Notifications after enqueuing any type of message should be conditional on * the appropriate req_event or rsp_event field in the shared ring. * If the client sends notification for rx requests then it should specify @@ -40,16 +58,226 @@ */ /* + * "feature-split-event-channels" is introduced to separate guest TX + * and RX notification. Backend either doesn't support this feature or + * advertises it via xenstore as 0 (disabled) or 1 (enabled). + * + * To make use of this feature, frontend should allocate two event + * channels for TX and RX, advertise them to backend as + * "event-channel-tx" and "event-channel-rx" respectively. If frontend + * doesn't want to use this feature, it just writes "event-channel" + * node as before. + */ + +/* + * Multiple transmit and receive queues: + * If supported, the backend will write the key "multi-queue-max-queues" to + * the directory for that vif, and set its value to the maximum supported + * number of queues. + * Frontends that are aware of this feature and wish to use it can write the + * key "multi-queue-num-queues", set to the number they wish to use, which + * must be greater than zero, and no more than the value reported by the backend + * in "multi-queue-max-queues". + * + * Queues replicate the shared rings and event channels. + * "feature-split-event-channels" may optionally be used when using + * multiple queues, but is not mandatory. + * + * Each queue consists of one shared ring pair, i.e. there must be the same + * number of tx and rx rings. + * + * For frontends requesting just one queue, the usual event-channel and + * ring-ref keys are written as before, simplifying the backend processing + * to avoid distinguishing between a frontend that doesn't understand the + * multi-queue feature, and one that does, but requested only one queue. + * + * Frontends requesting two or more queues must not write the toplevel + * event-channel (or event-channel-{tx,rx}) and {tx,rx}-ring-ref keys, + * instead writing those keys under sub-keys having the name "queue-N" where + * N is the integer ID of the queue for which those keys belong. Queues + * are indexed from zero. For example, a frontend with two queues and split + * event channels must write the following set of queue-related keys: + * + * /local/domain/1/device/vif/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vif/0/queue-0 = "" + * /local/domain/1/device/vif/0/queue-0/tx-ring-ref = "<ring-ref-tx0>" + * /local/domain/1/device/vif/0/queue-0/rx-ring-ref = "<ring-ref-rx0>" + * /local/domain/1/device/vif/0/queue-0/event-channel-tx = "<evtchn-tx0>" + * /local/domain/1/device/vif/0/queue-0/event-channel-rx = "<evtchn-rx0>" + * /local/domain/1/device/vif/0/queue-1 = "" + * /local/domain/1/device/vif/0/queue-1/tx-ring-ref = "<ring-ref-tx1>" + * /local/domain/1/device/vif/0/queue-1/rx-ring-ref = "<ring-ref-rx1" + * /local/domain/1/device/vif/0/queue-1/event-channel-tx = "<evtchn-tx1>" + * /local/domain/1/device/vif/0/queue-1/event-channel-rx = "<evtchn-rx1>" + * + * If there is any inconsistency in the XenStore data, the backend may + * choose not to connect any queues, instead treating the request as an + * error. This includes scenarios where more (or fewer) queues were + * requested than the frontend provided details for. 
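
For illustration only (this block is not part of the upstream change), a minimal sketch of the per-queue XenStore layout described above, printed for a frontend that negotiated two queues with split event channels. The frontend path and the placeholder ring/event-channel values are taken from the example key listing; the actual writes would go through the platform's XenStore interface, which is not shown here.

#include <stdio.h>

/*
 * Print the XenStore keys a two-queue frontend with split event channels
 * would populate.  Placeholders stand in for real grant references and
 * event channel ports.
 */
static void
write_queue_keys(const char *front, unsigned int num_queues)
{
    char node[128];
    unsigned int q;

    printf("%s/multi-queue-num-queues = \"%u\"\n", front, num_queues);
    for (q = 0; q < num_queues; q++) {
        snprintf(node, sizeof(node), "%s/queue-%u", front, q);
        printf("%s/tx-ring-ref = \"<ring-ref-tx%u>\"\n", node, q);
        printf("%s/rx-ring-ref = \"<ring-ref-rx%u>\"\n", node, q);
        printf("%s/event-channel-tx = \"<evtchn-tx%u>\"\n", node, q);
        printf("%s/event-channel-rx = \"<evtchn-rx%u>\"\n", node, q);
    }
}

int
main(void)
{
    write_queue_keys("/local/domain/1/device/vif/0", 2);
    return 0;
}
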
+ * + * Mapping of packets to queues is considered to be a function of the + * transmitting system (backend or frontend) and is not negotiated + * between the two. Guests are free to transmit packets on any queue + * they choose, provided it has been set up correctly. Guests must be + * prepared to receive packets on any queue they have requested be set up. + */ + +/* + * "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum + * offload off or on. If it is missing then the feature is assumed to be on. + * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum + * offload on or off. If it is missing then the feature is assumed to be off. + */ + +/* + * "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to + * handle large TCP packets (in IPv4 or IPv6 form respectively). Neither + * frontends nor backends are assumed to be capable unless the flags are + * present. + */ + +/* + * "feature-multicast-control" advertises the capability to filter ethernet + * multicast packets in the backend. To enable use of this capability the + * frontend must set "request-multicast-control" before moving into the + * connected state. + * + * If "request-multicast-control" is set then the backend transmit side should + * no longer flood multicast packets to the frontend, it should instead drop any + * multicast packet that does not match in a filter list. The list is + * amended by the frontend by sending dummy transmit requests containing + * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} extra-info fragments as specified below. + * Once enabled by the frontend, the feature cannot be disabled except by + * closing and re-connecting to the backend. + */ + +/* * This is the 'wire' format for packets: - * Request 1: netif_tx_request -- NETTXF_* (any flags) - * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info) - * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE) - * Request 4: netif_tx_request -- NETTXF_more_data - * Request 5: netif_tx_request -- NETTXF_more_data + * Request 1: netif_tx_request_t -- NETTXF_* (any flags) + * [Request 2: netif_extra_info_t] (only if request 1 has NETTXF_extra_info) + * [Request 3: netif_extra_info_t] (only if request 2 has XEN_NETIF_EXTRA_MORE) + * Request 4: netif_tx_request_t -- NETTXF_more_data + * Request 5: netif_tx_request_t -- NETTXF_more_data * ... - * Request N: netif_tx_request -- 0 + * Request N: netif_tx_request_t -- 0 */ +/* + * Guest transmit + * ============== + * + * Ring slot size is 12 octets, however not all request/response + * structs use the full size. + * + * tx request data (netif_tx_request_t) + * ------------------------------------ + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | grant ref | offset | flags | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | size | + * +-----+-----+-----+-----+ + * + * grant ref: Reference to buffer page. + * offset: Offset within buffer page. + * flags: NETTXF_*. + * id: request identifier, echoed in response. + * size: packet size in bytes. + * + * tx response (netif_tx_response_t) + * --------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | status | unused | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | unused | + * +-----+-----+-----+-----+ + * + * id: reflects id in transmit request + * status: NETIF_RSP_* + * + * Guest receive + * ============= + * + * Ring slot size is 8 octets. 
+ * + * rx request (netif_rx_request_t) + * ------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | pad | gref | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * id: request identifier, echoed in response. + * gref: reference to incoming granted frame. + * + * rx response (netif_rx_response_t) + * --------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | offset | flags | status | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * id: reflects id in receive request + * offset: offset in page of start of received packet + * flags: NETRXF_* + * status: -ve: NETIF_RSP_*; +ve: Rx'ed pkt size. + * + * Extra Info + * ========== + * + * Can be present if initial request has NET{T,R}XF_extra_info, or + * previous extra request has XEN_NETIF_EXTRA_MORE. + * + * The struct therefore needs to fit into either a tx or rx slot and + * is therefore limited to 8 octets. + * + * extra info (netif_extra_info_t) + * ------------------------------- + * + * General format: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| type specfic data | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | padding for tx | + * +-----+-----+-----+-----+ + * + * type: XEN_NETIF_EXTRA_TYPE_* + * flags: XEN_NETIF_EXTRA_FLAG_* + * padding for tx: present only in the tx case due to 8 octet limit + * from rx case. Not shown in type specific entries below. + * + * XEN_NETIF_EXTRA_TYPE_GSO: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| size |type | pad | features | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * type: Must be XEN_NETIF_EXTRA_TYPE_GSO + * flags: XEN_NETIF_EXTRA_FLAG_* + * size: Maximum payload size of each segment. + * type: XEN_NETIF_GSO_TYPE_* + * features: EN_NETIF_GSO_FEAT_* + * + * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| addr | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * type: Must be XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} + * flags: XEN_NETIF_EXTRA_FLAG_* + * addr: address to add/remove + */ + /* Protocol checksum field is blank in the packet (hardware offload)? */ #define _NETTXF_csum_blank (0) #define NETTXF_csum_blank (1U<<_NETTXF_csum_blank) @@ -66,14 +294,13 @@ #define _NETTXF_extra_info (3) #define NETTXF_extra_info (1U<<_NETTXF_extra_info) +#define XEN_NETIF_MAX_TX_SIZE 0xFFFF struct netif_tx_request { grant_ref_t gref; /* Reference to buffer page */ uint16_t offset; /* Offset within buffer page */ uint16_t flags; /* NETTXF_* */ uint16_t id; /* Echoed in response message. */ - uint16_t size; /* For the first request in a packet, the packet - size in bytes. For subsequent requests, the - size of that request's associated data in bytes*/ + uint16_t size; /* Packet size in bytes. */ }; typedef struct netif_tx_request netif_tx_request_t; @@ -84,16 +311,18 @@ #define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */ #define XEN_NETIF_EXTRA_TYPE_MAX (4) -/* netif_extra_info flags. */ +/* netif_extra_info_t flags. */ #define _XEN_NETIF_EXTRA_FLAG_MORE (0) #define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) -/* GSO types - only TCPv4 currently supported. 
*/ +/* GSO types */ +#define XEN_NETIF_GSO_TYPE_NONE (0) #define XEN_NETIF_GSO_TYPE_TCPV4 (1) +#define XEN_NETIF_GSO_TYPE_TCPV6 (2) /* - * This structure needs to fit within both netif_tx_request and - * netif_rx_response for compatibility. + * This structure needs to fit within both netif_tx_request_t and + * netif_rx_response_t for compatibility. */ struct netif_extra_info { uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ @@ -128,14 +357,6 @@ /* * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}: - * Backend advertises availability via 'feature-multicast-control' - * xenbus node containing value '1'. - * Frontend requests this feature by advertising - * 'request-multicast-control' xenbus node containing value '1'. - * If multicast control is requested then multicast flooding is - * disabled and the frontend must explicitly register its interest - * in multicast groups using dummy transmit requests containing - * MCAST_{ADD,DEL} extra-info fragments. */ struct { uint8_t addr[6]; /* Address to add/remove. */ @@ -154,6 +375,7 @@ struct netif_rx_request { uint16_t id; /* Echoed in response message. */ + uint16_t pad; grant_ref_t gref; /* Reference to incoming granted frame */ }; typedef struct netif_rx_request netif_rx_request_t; @@ -174,15 +396,11 @@ #define _NETRXF_extra_info (3) #define NETRXF_extra_info (1U<<_NETRXF_extra_info) -/* GSO Prefix descriptor. */ -#define _NETRXF_gso_prefix (4) -#define NETRXF_gso_prefix (1U<<_NETRXF_gso_prefix) - struct netif_rx_response { uint16_t id; uint16_t offset; /* Offset in page of start of received packet */ uint16_t flags; /* NETRXF_* */ - int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed response size. */ + int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */ }; typedef struct netif_rx_response netif_rx_response_t; @@ -196,7 +414,7 @@ #define NETIF_RSP_DROPPED -2 #define NETIF_RSP_ERROR -1 #define NETIF_RSP_OKAY 0 -/* No response: used for auxiliary requests (e.g., netif_tx_extra). */ +/* No response: used for auxiliary requests (e.g., netif_extra_info_t). */ #define NETIF_RSP_NULL 1 #endif @@ -204,7 +422,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/pciif.h =================================================================== --- trunk/sys/xen/interface/io/pciif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/pciif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -47,6 +47,7 @@ #define XEN_PCI_OP_aer_resume (7) #define XEN_PCI_OP_aer_mmio (8) #define XEN_PCI_OP_aer_slotreset (9) +#define XEN_PCI_OP_enable_multi_msi (10) /* xen_pci_op error numbers */ #define XEN_PCI_ERR_success (0) @@ -117,7 +118,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/protocols.h =================================================================== --- trunk/sys/xen/interface/io/protocols.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/protocols.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -19,6 +19,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
+ * + * Copyright (c) 2008, Keir Fraser */ #ifndef __XEN_PROTOCOLS_H__ @@ -26,7 +28,6 @@ #define XEN_IO_PROTO_ABI_X86_32 "x86_32-abi" #define XEN_IO_PROTO_ABI_X86_64 "x86_64-abi" -#define XEN_IO_PROTO_ABI_IA64 "ia64-abi" #define XEN_IO_PROTO_ABI_ARM "arm-abi" #if defined(__i386__) @@ -33,9 +34,7 @@ # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32 #elif defined(__x86_64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64 -#elif defined(__ia64__) -# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64 -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_ARM #else # error arch fixup needed here Modified: trunk/sys/xen/interface/io/ring.h =================================================================== --- trunk/sys/xen/interface/io/ring.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/ring.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -46,15 +46,9 @@ #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) /* - * The amount of space reserved in the shared ring for accounting information. - */ -#define __RING_HEADER_SIZE(_s) \ - ((intptr_t)(_s)->ring - (intptr_t)(_s)) - -/* * Calculate size of a shared ring, given the total available space for the * ring and indexes (_sz), and the name tag of the request/response structure. - * A ring contains as many entries as will fit, rounded down to the nearest + * A ring contains as many entries as will fit, rounded down to the nearest * power of two (so we can mask with (size-1) to loop around). */ #define __CONST_RING_SIZE(_s, _sz) \ @@ -64,19 +58,9 @@ * The same for passing in an actual pointer instead of a name tag. */ #define __RING_SIZE(_s, _sz) \ - (__RD32(((_sz) - __RING_HEADER_SIZE(_s)) / sizeof((_s)->ring[0]))) + (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) /* - * The number of pages needed to support a given number of request/reponse - * entries. The entry count is rounded down to the nearest power of two - * as required by the ring macros. - */ -#define __RING_PAGES(_s, _entries) \ - ((__RING_HEADER_SIZE(_s) \ - + (__RD32(_entries) * sizeof((_s)->ring[0])) \ - + PAGE_SIZE - 1) / PAGE_SIZE) - -/* * Macros to make the correct C datatypes for a new kind of ring. * * To make a new ring datatype, you need to have two message structures, @@ -128,7 +112,7 @@ uint8_t msg; \ } tapif_user; \ uint8_t pvt_pad[4]; \ - } private; \ + } pvt; \ uint8_t __pad[44]; \ union __name##_sring_entry ring[1]; /* variable-length */ \ }; \ @@ -173,7 +157,7 @@ #define SHARED_RING_INIT(_s) do { \ (_s)->req_prod = (_s)->rsp_prod = 0; \ (_s)->req_event = (_s)->rsp_event = 1; \ - (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \ + (void)memset((_s)->pvt.pvt_pad, 0, sizeof((_s)->pvt.pvt_pad)); \ (void)memset((_s)->__pad, 0, sizeof((_s)->__pad)); \ } while(0) @@ -191,21 +175,6 @@ (_r)->sring = (_s); \ } while (0) -/* Initialize to existing shared indexes -- for recovery */ -#define FRONT_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->req_prod_pvt = (_s)->req_prod; \ - (_r)->rsp_cons = (_s)->rsp_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ -} while (0) - -#define BACK_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ - (_r)->req_cons = (_s)->req_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ -} while (0) - /* How big is this ring? 
*/ #define RING_SIZE(_r) \ ((_r)->nr_ents) @@ -251,6 +220,10 @@ #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) +/* Ill-behaved frontend determination: Can there be this many requests? */ +#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ + (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) + #define RING_PUSH_REQUESTS(_r) do { \ xen_wmb(); /* back sees requests /before/ updated producer index */ \ (_r)->sring->req_prod = (_r)->req_prod_pvt; \ @@ -332,7 +305,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/tpmif.h =================================================================== --- trunk/sys/xen/interface/io/tpmif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/tpmif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -65,12 +65,78 @@ }; typedef struct tpmif_tx_interface tpmif_tx_interface_t; +/****************************************************************************** + * TPM I/O interface for Xen guest OSes, v2 + * + * Author: Daniel De Graaf <dgdegra at tycho.nsa.gov> + * + * This protocol emulates the request/response behavior of a TPM using a Xen + * shared memory interface. All interaction with the TPM is at the direction + * of the frontend, since a TPM (hardware or virtual) is a passive device - + * the backend only processes commands as requested by the frontend. + * + * The frontend sends a request to the TPM by populating the shared page with + * the request packet, changing the state to TPMIF_STATE_SUBMIT, and sending + * and event channel notification. When the backend is finished, it will set + * the state to TPMIF_STATE_FINISH and send an event channel notification. + * + * In order to allow long-running commands to be canceled, the frontend can + * at any time change the state to TPMIF_STATE_CANCEL and send a notification. + * The TPM can either finish the command (changing state to TPMIF_STATE_FINISH) + * or can cancel the command and change the state to TPMIF_STATE_IDLE. The TPM + * can also change the state to TPMIF_STATE_IDLE instead of TPMIF_STATE_FINISH + * if another reason for cancellation is required - for example, a physical + * TPM may cancel a command if the interface is seized by another locality. + * + * The TPM command format is defined by the TCG, and is available at + * http://www.trustedcomputinggroup.org/resources/tpm_main_specification + */ + +enum tpmif_state { + TPMIF_STATE_IDLE, /* no contents / vTPM idle / cancel complete */ + TPMIF_STATE_SUBMIT, /* request ready / vTPM working */ + TPMIF_STATE_FINISH, /* response ready / vTPM idle */ + TPMIF_STATE_CANCEL, /* cancel requested / vTPM working */ +}; +/* Note: The backend should only change state to IDLE or FINISH, while the + * frontend should only change to SUBMIT or CANCEL. Status changes do not need + * to use atomic operations. + */ + + +/* The shared page for vTPM request/response packets looks like: + * + * Offset Contents + * ================================================= + * 0 struct tpmif_shared_page + * 16 [optional] List of grant IDs + * 16+4*nr_extra_pages TPM packet data + * + * If the TPM packet data extends beyond the end of a single page, the grant IDs + * defined in extra_pages are used as if they were mapped immediately following + * the primary shared page. The grants are allocated by the frontend and mapped + * by the backend. 
Before sending a request spanning multiple pages, the + * frontend should verify that the TPM supports such large requests by querying + * the TPM_CAP_PROP_INPUT_BUFFER property from the TPM. + */ +struct tpmif_shared_page { + uint32_t length; /* request/response length in bytes */ + + uint8_t state; /* enum tpmif_state */ + uint8_t locality; /* for the current request */ + uint8_t pad; /* should be zero */ + + uint8_t nr_extra_pages; /* extra pages for long packets; may be zero */ + uint32_t extra_pages[0]; /* grant IDs; length is actually nr_extra_pages */ +}; +typedef struct tpmif_shared_page tpmif_shared_page_t; + #endif /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/usbif.h =================================================================== --- trunk/sys/xen/interface/io/usbif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/usbif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -32,6 +32,76 @@ #include "ring.h" #include "../grant_table.h" +/* + * Feature and Parameter Negotiation + * ================================= + * The two halves of a Xen pvUSB driver utilize nodes within the XenStore to + * communicate capabilities and to negotiate operating parameters. This + * section enumerates these nodes which reside in the respective front and + * backend portions of the XenStore, following the XenBus convention. + * + * Any specified default value is in effect if the corresponding XenBus node + * is not present in the XenStore. + * + * XenStore nodes in sections marked "PRIVATE" are solely for use by the + * driver side whose XenBus tree contains them. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *------------------ Backend Device Identification (PRIVATE) ------------------ + * + * num-ports + * Values: unsigned [1...31] + * + * Number of ports for this (virtual) USB host connector. + * + * usb-ver + * Values: unsigned [1...2] + * + * USB version of this host connector: 1 = USB 1.1, 2 = USB 2.0. + * + * port/[1...31] + * Values: string + * + * Physical USB device connected to the given port, e.g. "3-1.5". + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: unsigned + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * urb-ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. This is the ring + * buffer for urb requests. + * + * conn-ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. This is the ring + * buffer for connection/disconnection requests. + * + * protocol + * Values: string (XEN_IO_PROTO_ABI_*) + * Default Value: XEN_IO_PROTO_ABI_NATIVE + * + * The machine ABI rules governing the format of all ring request and + * response structures. 
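
As an illustrative aside (not part of the upstream change), a short sketch of how a driver might decode the pipe field using the USBIF_PIPE_* helpers introduced further down in this hunk; it assumes it is compiled in the context of this header, which supplies the fixed-width types and macros.

/*
 * Split a usbif pipe value into its fields: port in bits 0-4, unlink flag
 * in bit 5, direction in bit 7, device address in bits 8-14, endpoint in
 * bits 15-18 and transfer type in bits 30-31.
 */
static void
usbif_decode_pipe(uint32_t pipe, unsigned int *port, unsigned int *dev,
    unsigned int *ep, unsigned int *type, int *is_in, int *is_unlink)
{
    *port = usbif_pipeportnum(pipe);
    *dev = usbif_pipedevice(pipe);
    *ep = usbif_pipeendpoint(pipe);
    *type = usbif_pipetype(pipe);   /* ISOC, INT, CTRL or BULK */
    *is_in = usbif_pipein(pipe) ? 1 : 0;
    *is_unlink = usbif_pipeunlink(pipe) ? 1 : 0;
}
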
+ * + */ + enum usb_spec_version { USB_VER_UNKNOWN = 0, USB_VER_USB11, @@ -42,38 +112,65 @@ /* * USB pipe in usbif_request * - * bits 0-5 are specific bits for virtual USB driver. - * bits 7-31 are standard urb pipe. + * - port number: bits 0-4 + * (USB_MAXCHILDREN is 31) * - * - port number(NEW): bits 0-4 - * (USB_MAXCHILDREN is 31) + * - operation flag: bit 5 + * (0 = submit urb, + * 1 = unlink urb) * - * - operation flag(NEW): bit 5 - * (0 = submit urb, - * 1 = unlink urb) - * * - direction: bit 7 - * (0 = Host-to-Device [Out] - * 1 = Device-to-Host [In]) + * (0 = Host-to-Device [Out] + * 1 = Device-to-Host [In]) * * - device address: bits 8-14 * * - endpoint: bits 15-18 * - * - pipe type: bits 30-31 - * (00 = isochronous, 01 = interrupt, - * 10 = control, 11 = bulk) + * - pipe type: bits 30-31 + * (00 = isochronous, 01 = interrupt, + * 10 = control, 11 = bulk) */ -#define usbif_pipeportnum(pipe) ((pipe) & 0x1f) -#define usbif_setportnum_pipe(pipe, portnum) \ - ((pipe)|(portnum)) -#define usbif_pipeunlink(pipe) ((pipe) & 0x20) -#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe)) -#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20)) +#define USBIF_PIPE_PORT_MASK 0x0000001f +#define USBIF_PIPE_UNLINK 0x00000020 +#define USBIF_PIPE_DIR 0x00000080 +#define USBIF_PIPE_DEV_MASK 0x0000007f +#define USBIF_PIPE_DEV_SHIFT 8 +#define USBIF_PIPE_EP_MASK 0x0000000f +#define USBIF_PIPE_EP_SHIFT 15 +#define USBIF_PIPE_TYPE_MASK 0x00000003 +#define USBIF_PIPE_TYPE_SHIFT 30 +#define USBIF_PIPE_TYPE_ISOC 0 +#define USBIF_PIPE_TYPE_INT 1 +#define USBIF_PIPE_TYPE_CTRL 2 +#define USBIF_PIPE_TYPE_BULK 3 -#define USBIF_BACK_MAX_PENDING_REQS (128) +#define usbif_pipeportnum(pipe) ((pipe) & USBIF_PIPE_PORT_MASK) +#define usbif_setportnum_pipe(pipe, portnum) ((pipe) | (portnum)) + +#define usbif_pipeunlink(pipe) ((pipe) & USBIF_PIPE_UNLINK) +#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe)) +#define usbif_setunlink_pipe(pipe) ((pipe) | USBIF_PIPE_UNLINK) + +#define usbif_pipein(pipe) ((pipe) & USBIF_PIPE_DIR) +#define usbif_pipeout(pipe) (!usbif_pipein(pipe)) + +#define usbif_pipedevice(pipe) \ + (((pipe) >> USBIF_PIPE_DEV_SHIFT) & USBIF_PIPE_DEV_MASK) + +#define usbif_pipeendpoint(pipe) \ + (((pipe) >> USBIF_PIPE_EP_SHIFT) & USBIF_PIPE_EP_MASK) + +#define usbif_pipetype(pipe) \ + (((pipe) >> USBIF_PIPE_TYPE_SHIFT) & USBIF_PIPE_TYPE_MASK) +#define usbif_pipeisoc(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_ISOC) +#define usbif_pipeint(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_INT) +#define usbif_pipectrl(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_CTRL) +#define usbif_pipebulk(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_BULK) + #define USBIF_MAX_SEGMENTS_PER_REQUEST (16) +#define USBIF_MAX_PORTNR 31 /* * RING for transferring urbs. @@ -143,6 +240,10 @@ uint16_t id; /* request id */ uint8_t portnum; /* port number */ uint8_t speed; /* usb_device_speed */ +#define USBIF_SPEED_NONE 0 +#define USBIF_SPEED_LOW 1 +#define USBIF_SPEED_FULL 2 +#define USBIF_SPEED_HIGH 3 }; typedef struct usbif_conn_response usbif_conn_response_t; Modified: trunk/sys/xen/interface/io/vscsiif.h =================================================================== --- trunk/sys/xen/interface/io/vscsiif.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/vscsiif.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -1,9 +1,9 @@ /* $MidnightBSD$ */ /****************************************************************************** * vscsiif.h - * + * * Based on the blkif.h code. 
- * + * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the @@ -31,57 +31,212 @@ #include "ring.h" #include "../grant_table.h" -/* command between backend and frontend */ -#define VSCSIIF_ACT_SCSI_CDB 1 /* SCSI CDB command */ -#define VSCSIIF_ACT_SCSI_ABORT 2 /* SCSI Device(Lun) Abort*/ -#define VSCSIIF_ACT_SCSI_RESET 3 /* SCSI Device(Lun) Reset*/ +/* + * Feature and Parameter Negotiation + * ================================= + * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to + * communicate capabilities and to negotiate operating parameters. This + * section enumerates these nodes which reside in the respective front and + * backend portions of the XenStore, following the XenBus convention. + * + * Any specified default value is in effect if the corresponding XenBus node + * is not present in the XenStore. + * + * XenStore nodes in sections marked "PRIVATE" are solely for use by the + * driver side whose XenBus tree contains them. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *------------------ Backend Device Identification (PRIVATE) ------------------ + * + * p-devname + * Values: string + * + * A free string used to identify the physical device (e.g. a disk name). + * + * p-dev + * Values: string + * + * A string specifying the backend device: either a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers), or a WWN (e.g. + * "naa.60014054ac780582"). + * + * v-dev + * Values: string + * + * A string specifying the frontend device in form of a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers). + * + *--------------------------------- Features --------------------------------- + * + * feature-sg-grant + * Values: unsigned [VSCSIIF_SG_TABLESIZE...65535] + * Default Value: 0 + * + * Specifies the maximum number of scatter/gather elements in grant pages + * supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE + * SG elements specified directly in the request. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: unsigned + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. + * + * protocol + * Values: string (XEN_IO_PROTO_ABI_*) + * Default Value: XEN_IO_PROTO_ABI_NATIVE + * + * The machine ABI rules governing the format of all ring request and + * response structures. + */ +/* Requests from the frontend to the backend */ -#define VSCSIIF_BACK_MAX_PENDING_REQS 128 +/* + * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd. + * The target is specified via channel, id and lun. + * + * The operation to be performed is specified via a CDB in cmnd[], the length + * of the CDB is in cmd_len. sc_data_direction specifies the direction of data + * (to the device, from the device, or none at all). 
+ * + * If data is to be transferred to or from the device the buffer(s) in the + * guest memory is/are specified via one or multiple scsiif_request_segment + * descriptors each specifying a memory page via a grant_ref_t, a offset into + * the page and the length of the area in that page. All scsiif_request_segment + * areas concatenated form the resulting data buffer used by the operation. + * If the number of scsiif_request_segment areas is not too large (less than + * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the + * seg[] array and the number of valid scsiif_request_segment elements is to be + * set in nr_segments. + * + * If "feature-sg-grant" in the Xenstore is set it is possible to specify more + * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection. + * The maximum number of allowed scsiif_request_segment elements is the value + * of the "feature-sg-grant" entry from Xenstore. When using indirection the + * seg[] array doesn't contain specifications of the data buffers, but + * references to scsiif_request_segment arrays, which in turn reference the + * data buffers. While nr_segments holds the number of populated seg[] entries + * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment + * elements referencing the target data buffers is calculated from the lengths + * of the seg[] elements (the sum of all valid seg[].length divided by the + * size of one scsiif_request_segment structure). The frontend may use a mix of + * direct and indirect requests. + */ +#define VSCSIIF_ACT_SCSI_CDB 1 /* + * Request abort of a running operation for the specified target given by + * channel, id, lun and the operation's rqid in ref_rqid. + */ +#define VSCSIIF_ACT_SCSI_ABORT 2 + +/* + * Request a device reset of the specified target (channel and id). + */ +#define VSCSIIF_ACT_SCSI_RESET 3 + +/* + * Preset scatter/gather elements for a following request. Deprecated. + * Keeping the define only to avoid usage of the value "4" for other actions. + */ +#define VSCSIIF_ACT_SCSI_SG_PRESET 4 + +/* * Maximum scatter/gather segments per request. * - * Considering balance between allocating al least 16 "vscsiif_request" - * structures on one page (4096bytes) and number of scatter gather - * needed, we decided to use 26 as a magic number. + * Considering balance between allocating at least 16 "vscsiif_request" + * structures on one page (4096 bytes) and the number of scatter/gather + * elements needed, we decided to use 26 as a magic number. + * + * If "feature-sg-grant" is set, more scatter/gather elements can be specified + * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages. + * In this case the vscsiif_request seg elements don't contain references to + * the user data, but to the SG elements referencing the user data. */ #define VSCSIIF_SG_TABLESIZE 26 /* - * base on linux kernel 2.6.18 + * based on Linux kernel 2.6.18, still valid + * + * Changing these values requires support of multiple protocols via the rings + * as "old clients" will blindly use these values and the resulting structure + * sizes. 
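
As an illustrative aside (not part of the upstream change), a sketch of the segment-count calculation described above for requests using the "feature-sg-grant" extension; it assumes the struct scsiif_request_segment, struct vscsiif_request and VSCSIIF_SG_GRANT definitions that follow in this hunk.

/*
 * Number of scsiif_request_segment elements referenced indirectly by a
 * request: the sum of the valid seg[].length values divided by the size
 * of one segment descriptor.  Returns 0 for direct (non-indirect) requests.
 */
static unsigned int
vscsiif_indirect_segments(const struct vscsiif_request *req)
{
    unsigned int nr_seg_pages, i, total_bytes = 0;

    if (!(req->nr_segments & VSCSIIF_SG_GRANT))
        return 0;               /* seg[] holds the data buffers directly */

    nr_seg_pages = req->nr_segments & ~VSCSIIF_SG_GRANT;
    for (i = 0; i < nr_seg_pages; i++)
        total_bytes += req->seg[i].length;

    return total_bytes / sizeof(struct scsiif_request_segment);
}
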
*/ #define VSCSIIF_MAX_COMMAND_SIZE 16 #define VSCSIIF_SENSE_BUFFERSIZE 96 +struct scsiif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; +}; +typedef struct scsiif_request_segment vscsiif_segment_t; +#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment)) + +/* Size of one request is 252 bytes */ struct vscsiif_request { uint16_t rqid; /* private guest value, echoed in resp */ uint8_t act; /* command between backend and frontend */ - uint8_t cmd_len; + uint8_t cmd_len; /* valid CDB bytes */ - uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; - uint16_t timeout_per_command; /* The command is issued by twice - the value in Backend. */ - uint16_t channel, id, lun; - uint16_t padding; - uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) - DMA_FROM_DEVICE(2) - DMA_NONE(3) requests */ - uint8_t nr_segments; /* Number of pieces of scatter-gather */ + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */ + uint16_t timeout_per_command; /* deprecated: timeout in secs, 0=default */ + uint16_t channel, id, lun; /* (virtual) device specification */ + uint16_t ref_rqid; /* command abort reference */ + uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) + DMA_FROM_DEVICE(2) + DMA_NONE(3) requests */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ +/* + * flag in nr_segments: SG elements via grant page + * + * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number + * of grant pages containing SG elements. Usable if "feature-sg-grant" set. + */ +#define VSCSIIF_SG_GRANT 0x80 - struct scsiif_request_segment { - grant_ref_t gref; - uint16_t offset; - uint16_t length; - } seg[VSCSIIF_SG_TABLESIZE]; + vscsiif_segment_t seg[VSCSIIF_SG_TABLESIZE]; uint32_t reserved[3]; }; typedef struct vscsiif_request vscsiif_request_t; +/* + * The following interface is deprecated! + */ +#define VSCSIIF_SG_LIST_SIZE ((sizeof(vscsiif_request_t) - 4) \ + / sizeof(vscsiif_segment_t)) + +struct vscsiif_sg_list { + /* First two fields must match struct vscsiif_request! */ + uint16_t rqid; /* private guest value, must match main req */ + uint8_t act; /* VSCSIIF_ACT_SCSI_SG_PRESET */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ + vscsiif_segment_t seg[VSCSIIF_SG_LIST_SIZE]; +}; +typedef struct vscsiif_sg_list vscsiif_sg_list_t; +/* End of deprecated interface */ + +/* Size of one response is 252 bytes */ struct vscsiif_response { - uint16_t rqid; - uint8_t padding; + uint16_t rqid; /* identifies request */ + uint8_t act; /* deprecated: valid only if SG_PRESET supported */ uint8_t sense_len; uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; int32_t rslt; @@ -98,7 +253,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/xenbus.h =================================================================== --- trunk/sys/xen/interface/io/xenbus.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/xenbus.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -37,9 +37,6 @@ enum xenbus_state { XenbusStateUnknown = 0, - /* - * Initializing: Back-end is initializing. - */ XenbusStateInitialising = 1, /* @@ -53,9 +50,6 @@ */ XenbusStateInitialised = 3, - /* - * Connected: The normal state for a front to backend connection. - */ XenbusStateConnected = 4, /* @@ -63,18 +57,6 @@ */ XenbusStateClosing = 5, - /* - * Closed: No connection exists between front and back end. 
- * - * For backend devices with the "online" attribute, the front can - * request a reconnect at any time. To handle this transition - * gracefully, backend devices must reinitialize any XenStore data - * used to negotiate features with a peer before transitioning to - * the closed state. When a reconnect request occurs, the - * XenBus backend support code will automatically transition the - * backend device from Closed to InitWait, kicking off the ring - * and feature negotiation process. - */ XenbusStateClosed = 6, /* @@ -91,7 +73,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/io/xs_wire.h =================================================================== --- trunk/sys/xen/interface/io/xs_wire.h 2020-02-02 21:31:28 UTC (rev 12300) +++ trunk/sys/xen/interface/io/xs_wire.h 2020-02-08 19:26:24 UTC (rev 12301) @@ -50,7 +50,9 @@ XS_RESUME, XS_SET_TARGET, XS_RESTRICT, - XS_RESET_WATCHES + XS_RESET_WATCHES, + + XS_INVALID = 0xffff /* Guaranteed to remain an invalid type */ }; #define XS_WRITE_NONE "NONE" @@ -84,7 +86,8 @@ XSD_ERROR(EROFS), XSD_ERROR(EBUSY), XSD_ERROR(EAGAIN), - XSD_ERROR(EISCONN) + XSD_ERROR(EISCONN), + XSD_ERROR(E2BIG) }; #endif @@ -104,7 +107,10 @@ XS_WATCH_TOKEN }; -/* Inter-domain shared memory communications. */ +/* + * `incontents 150 xenstore_struct XenStore wire protocol. + * + * Inter-domain shared memory communications. */ #define XENSTORE_RING_SIZE 1024 typedef uint32_t XENSTORE_RING_IDX; #define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1)) @@ -113,6 +119,8 @@ char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */ XENSTORE_RING_IDX req_cons, req_prod; XENSTORE_RING_IDX rsp_cons, rsp_prod; + uint32_t server_features; /* Bitmap of features supported by the server */ + uint32_t connection; }; /* Violating this is very bad. See docs/misc/xenstore.txt. 
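
As an illustrative aside (not part of the upstream change), a minimal sketch of a client placing request bytes into the shared xenstore ring, wrapping the free-running producer index with MASK_XENSTORE_IDX; the req[] array and index fields come from the full struct xenstore_domain_interface definition in this header. The memory barriers and event-channel notification a real client must issue are omitted.

/*
 * Copy up to len bytes into the request ring and return how many were
 * written.  Space is limited by what the server has not yet consumed.
 */
static unsigned int
xs_ring_write(struct xenstore_domain_interface *intf,
    const char *data, unsigned int len)
{
    XENSTORE_RING_IDX prod = intf->req_prod;
    unsigned int written = 0;

    while (written < len &&
        (prod - intf->req_cons) < XENSTORE_RING_SIZE) {
        intf->req[MASK_XENSTORE_IDX(prod)] = data[written];
        prod++;
        written++;
    }
    intf->req_prod = prod;      /* publish after a write barrier */
    return written;
}
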
*/ @@ -122,12 +130,19 @@ #define XENSTORE_ABS_PATH_MAX 3072 #define XENSTORE_REL_PATH_MAX 2048 +/* The ability to reconnect a ring */ +#define XENSTORE_SERVER_FEATURE_RECONNECTION 1 + +/* Valid values for the connection field */ +#define XENSTORE_CONNECTED 0 /* the steady-state */ +#define XENSTORE_RECONNECT 1 /* guest has initiated a reconnect */ + #endif /* _XS_WIRE_H */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil From laffer1 at midnightbsd.org Sat Feb 8 14:26:43 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:26:43 -0500 (EST) Subject: [Midnightbsd-cvs] src [12302] trunk/sys/xen/interface/xsm/flask_op.h: sync with FreeBSD 11-stable Message-ID: <202002081926.018JQhQG060682@stargazer.midnightbsd.org> Revision: 12302 http://svnweb.midnightbsd.org/src/?rev=12302 Author: laffer1 Date: 2020-02-08 14:26:42 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/xsm/flask_op.h Modified: trunk/sys/xen/interface/xsm/flask_op.h =================================================================== --- trunk/sys/xen/interface/xsm/flask_op.h 2020-02-08 19:26:24 UTC (rev 12301) +++ trunk/sys/xen/interface/xsm/flask_op.h 2020-02-08 19:26:42 UTC (rev 12302) @@ -26,6 +26,8 @@ #ifndef __FLASK_OP_H__ #define __FLASK_OP_H__ +#include "../event_channel.h" + #define XEN_FLASK_INTERFACE_VERSION 1 struct xen_flask_load { @@ -143,6 +145,19 @@ uint32_t sid; }; +struct xen_flask_relabel { + /* IN */ + uint32_t domid; + uint32_t sid; +}; + +struct xen_flask_devicetree_label { + /* IN */ + uint32_t sid; + uint32_t length; + XEN_GUEST_HANDLE(char) path; +}; + struct xen_flask_op { uint32_t cmd; #define FLASK_LOAD 1 @@ -168,6 +183,8 @@ #define FLASK_ADD_OCONTEXT 21 #define FLASK_DEL_OCONTEXT 22 #define FLASK_GET_PEER_SID 23 +#define FLASK_RELABEL_DOMAIN 24 +#define FLASK_DEVICETREE_LABEL 25 uint32_t interface_version; /* XEN_FLASK_INTERFACE_VERSION */ union { struct xen_flask_load load; @@ -186,6 +203,8 @@ /* FLASK_ADD_OCONTEXT, FLASK_DEL_OCONTEXT */ struct xen_flask_ocontext ocontext; struct xen_flask_peersid peersid; + struct xen_flask_relabel relabel; + struct xen_flask_devicetree_label devicetree_label; } u; }; typedef struct xen_flask_op xen_flask_op_t; From laffer1 at midnightbsd.org Sat Feb 8 14:27:19 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:27:19 -0500 (EST) Subject: [Midnightbsd-cvs] src [12303] trunk/sys/xen/interface/hvm: sync with FreeBSD 11-stable Message-ID: <202002081927.018JRJKC060748@stargazer.midnightbsd.org> Revision: 12303 http://svnweb.midnightbsd.org/src/?rev=12303 Author: laffer1 Date: 2020-02-08 14:27:19 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/hvm/e820.h trunk/sys/xen/interface/hvm/hvm_info_table.h trunk/sys/xen/interface/hvm/hvm_op.h trunk/sys/xen/interface/hvm/ioreq.h trunk/sys/xen/interface/hvm/params.h trunk/sys/xen/interface/hvm/save.h Added Paths: ----------- trunk/sys/xen/interface/hvm/hvm_xs_strings.h trunk/sys/xen/interface/hvm/pvdrivers.h Modified: trunk/sys/xen/interface/hvm/e820.h =================================================================== --- trunk/sys/xen/interface/hvm/e820.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/e820.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -1,5 +1,4 @@ /* $MidnightBSD$ */ - 
/* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -18,6 +17,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2006, Keir Fraser */ #ifndef __XEN_PUBLIC_HVM_E820_H__ Modified: trunk/sys/xen/interface/hvm/hvm_info_table.h =================================================================== --- trunk/sys/xen/interface/hvm/hvm_info_table.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/hvm_info_table.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -21,6 +21,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2006, Keir Fraser */ #ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ Modified: trunk/sys/xen/interface/hvm/hvm_op.h =================================================================== --- trunk/sys/xen/interface/hvm/hvm_op.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/hvm_op.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -17,6 +17,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007, Keir Fraser */ #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ @@ -24,6 +26,7 @@ #include "../xen.h" #include "../trace.h" +#include "../event_channel.h" /* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */ #define HVMOP_set_param 0 @@ -81,6 +84,7 @@ HVMMEM_ram_rw, /* Normal read/write guest RAM */ HVMMEM_ram_ro, /* Read-only; writes are discarded */ HVMMEM_mmio_dm, /* Reads and write go to the device model */ + HVMMEM_mmio_write_dm /* Read-only; writes go to the device model */ } hvmmem_type_t; /* Following tools-only interfaces may change in future. */ @@ -91,10 +95,10 @@ struct xen_hvm_track_dirty_vram { /* Domain to be tracked. */ domid_t domid; + /* Number of pages to track. */ + uint32_t nr; /* First pfn to track. */ uint64_aligned_t first_pfn; - /* Number of pages to track. */ - uint64_aligned_t nr; /* OUT variable. */ /* Dirty bitmap buffer. */ XEN_GUEST_HANDLE_64(uint8) dirty_bitmap; @@ -107,10 +111,10 @@ struct xen_hvm_modified_memory { /* Domain to be updated. */ domid_t domid; + /* Number of pages. */ + uint32_t nr; /* First pfn. */ uint64_aligned_t first_pfn; - /* Number of pages. */ - uint64_aligned_t nr; }; typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t); @@ -163,49 +167,11 @@ /* Following tools-only interfaces may change in future. 
*/ #if defined(__XEN__) || defined(__XEN_TOOLS__) +/* Deprecated by XENMEM_access_op_set_access */ #define HVMOP_set_mem_access 12 -typedef enum { - HVMMEM_access_n, - HVMMEM_access_r, - HVMMEM_access_w, - HVMMEM_access_rw, - HVMMEM_access_x, - HVMMEM_access_rx, - HVMMEM_access_wx, - HVMMEM_access_rwx, - HVMMEM_access_rx2rw, /* Page starts off as r-x, but automatically - * change to r-w on a write */ - HVMMEM_access_n2rwx, /* Log access: starts off as n, automatically - * goes to rwx, generating an event without - * pausing the vcpu */ - HVMMEM_access_default /* Take the domain default */ -} hvmmem_access_t; -/* Notify that a region of memory is to have specific access types */ -struct xen_hvm_set_mem_access { - /* Domain to be updated. */ - domid_t domid; - /* Memory type */ - uint16_t hvmmem_access; /* hvm_access_t */ - /* Number of pages, ignored on setting default access */ - uint32_t nr; - /* First pfn, or ~0ull to set the default access for new pages */ - uint64_aligned_t first_pfn; -}; -typedef struct xen_hvm_set_mem_access xen_hvm_set_mem_access_t; -DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_access_t); +/* Deprecated by XENMEM_access_op_get_access */ #define HVMOP_get_mem_access 13 -/* Get the specific access type for that region of memory */ -struct xen_hvm_get_mem_access { - /* Domain to be queried. */ - domid_t domid; - /* Memory type: OUT */ - uint16_t hvmmem_access; /* hvm_access_t */ - /* pfn, or ~0ull for default access for new pages. IN */ - uint64_aligned_t pfn; -}; -typedef struct xen_hvm_get_mem_access xen_hvm_get_mem_access_t; -DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_access_t); #define HVMOP_inject_trap 14 /* Inject a trap into a VCPU, which will get taken up on the next @@ -271,6 +237,267 @@ typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_msi_t); +/* + * IOREQ Servers + * + * The interface between an I/O emulator an Xen is called an IOREQ Server. + * A domain supports a single 'legacy' IOREQ Server which is instantiated if + * parameter... + * + * HVM_PARAM_IOREQ_PFN is read (to get the gmfn containing the synchronous + * ioreq structures), or... + * HVM_PARAM_BUFIOREQ_PFN is read (to get the gmfn containing the buffered + * ioreq ring), or... + * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that Xen uses + * to request buffered I/O emulation). + * + * The following hypercalls facilitate the creation of IOREQ Servers for + * 'secondary' emulators which are invoked to implement port I/O, memory, or + * PCI config space ranges which they explicitly register. + */ + +typedef uint16_t ioservid_t; + +/* + * HVMOP_create_ioreq_server: Instantiate a new IOREQ Server for a secondary + * emulator servicing domain <domid>. + * + * The <id> handed back is unique for <domid>. If <handle_bufioreq> is zero + * the buffered ioreq ring will not be allocated and hence all emulation + * requestes to this server will be synchronous. 
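
As an illustrative aside (not part of the upstream change), a minimal sketch of creating an IOREQ Server using the structure and constants defined in this hunk. The HYPERVISOR_hvm_op() wrapper is the conventional guest-side entry point for HVMOP_* calls; its exact spelling is platform specific and is assumed here.

/*
 * Ask Xen to instantiate an IOREQ Server for the given domain, requesting
 * an atomically-updated buffered ioreq ring, and hand back its id.
 */
static int
ioreq_server_create(domid_t domid, ioservid_t *idp)
{
    struct xen_hvm_create_ioreq_server create = {
        .domid = domid,
        .handle_bufioreq = HVM_IOREQSRV_BUFIOREQ_ATOMIC,
    };
    int rc;

    rc = HYPERVISOR_hvm_op(HVMOP_create_ioreq_server, &create);
    if (rc != 0)
        return rc;

    *idp = create.id;
    return 0;
}
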
+ */ +#define HVMOP_create_ioreq_server 17 +struct xen_hvm_create_ioreq_server { + domid_t domid; /* IN - domain to be serviced */ +#define HVM_IOREQSRV_BUFIOREQ_OFF 0 +#define HVM_IOREQSRV_BUFIOREQ_LEGACY 1 +/* + * Use this when read_pointer gets updated atomically and + * the pointer pair gets read atomically: + */ +#define HVM_IOREQSRV_BUFIOREQ_ATOMIC 2 + uint8_t handle_bufioreq; /* IN - should server handle buffered ioreqs */ + ioservid_t id; /* OUT - server id */ +}; +typedef struct xen_hvm_create_ioreq_server xen_hvm_create_ioreq_server_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_create_ioreq_server_t); + +/* + * HVMOP_get_ioreq_server_info: Get all the information necessary to access + * IOREQ Server <id>. + * + * The emulator needs to map the synchronous ioreq structures and buffered + * ioreq ring (if it exists) that Xen uses to request emulation. These are + * hosted in domain <domid>'s gmfns <ioreq_pfn> and <bufioreq_pfn> + * respectively. In addition, if the IOREQ Server is handling buffered + * emulation requests, the emulator needs to bind to event channel + * <bufioreq_port> to listen for them. (The event channels used for + * synchronous emulation requests are specified in the per-CPU ioreq + * structures in <ioreq_pfn>). + * If the IOREQ Server is not handling buffered emulation requests then the + * values handed back in <bufioreq_pfn> and <bufioreq_port> will both be 0. + */ +#define HVMOP_get_ioreq_server_info 18 +struct xen_hvm_get_ioreq_server_info { + domid_t domid; /* IN - domain to be serviced */ + ioservid_t id; /* IN - server id */ + evtchn_port_t bufioreq_port; /* OUT - buffered ioreq port */ + uint64_aligned_t ioreq_pfn; /* OUT - sync ioreq pfn */ + uint64_aligned_t bufioreq_pfn; /* OUT - buffered ioreq pfn */ +}; +typedef struct xen_hvm_get_ioreq_server_info xen_hvm_get_ioreq_server_info_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_ioreq_server_info_t); + +/* + * HVM_map_io_range_to_ioreq_server: Register an I/O range of domain <domid> + * for emulation by the client of IOREQ + * Server <id> + * HVM_unmap_io_range_from_ioreq_server: Deregister an I/O range of <domid> + * for emulation by the client of IOREQ + * Server <id> + * + * There are three types of I/O that can be emulated: port I/O, memory accesses + * and PCI config space accesses. The <type> field denotes which type of range + * the <start> and <end> (inclusive) fields are specifying. + * PCI config space ranges are specified by segment/bus/device/function values + * which should be encoded using the HVMOP_PCI_SBDF helper macro below. + * + * NOTE: unless an emulation request falls entirely within a range mapped + * by a secondary emulator, it will not be passed to that emulator. + */ +#define HVMOP_map_io_range_to_ioreq_server 19 +#define HVMOP_unmap_io_range_from_ioreq_server 20 +struct xen_hvm_io_range { + domid_t domid; /* IN - domain to be serviced */ + ioservid_t id; /* IN - server id */ + uint32_t type; /* IN - type of range */ +# define HVMOP_IO_RANGE_PORT 0 /* I/O port range */ +# define HVMOP_IO_RANGE_MEMORY 1 /* MMIO range */ +# define HVMOP_IO_RANGE_PCI 2 /* PCI segment/bus/dev/func range */ + uint64_aligned_t start, end; /* IN - inclusive start and end of range */ +}; +typedef struct xen_hvm_io_range xen_hvm_io_range_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_io_range_t); + +#define HVMOP_PCI_SBDF(s,b,d,f) \ + ((((s) & 0xffff) << 16) | \ + (((b) & 0xff) << 8) | \ + (((d) & 0x1f) << 3) | \ + ((f) & 0x07)) + +/* + * HVMOP_destroy_ioreq_server: Destroy the IOREQ Server <id> servicing domain + * <domid>. 
+ * + * Any registered I/O ranges will be automatically deregistered. + */ +#define HVMOP_destroy_ioreq_server 21 +struct xen_hvm_destroy_ioreq_server { + domid_t domid; /* IN - domain to be serviced */ + ioservid_t id; /* IN - server id */ +}; +typedef struct xen_hvm_destroy_ioreq_server xen_hvm_destroy_ioreq_server_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_destroy_ioreq_server_t); + +/* + * HVMOP_set_ioreq_server_state: Enable or disable the IOREQ Server <id> servicing + * domain <domid>. + * + * The IOREQ Server will not be passed any emulation requests until it is in the + * enabled state. + * Note that the contents of the ioreq_pfn and bufioreq_fn (see + * HVMOP_get_ioreq_server_info) are not meaningful until the IOREQ Server is in + * the enabled state. + */ +#define HVMOP_set_ioreq_server_state 22 +struct xen_hvm_set_ioreq_server_state { + domid_t domid; /* IN - domain to be serviced */ + ioservid_t id; /* IN - server id */ + uint8_t enabled; /* IN - enabled? */ +}; +typedef struct xen_hvm_set_ioreq_server_state xen_hvm_set_ioreq_server_state_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_ioreq_server_state_t); + #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ +#if defined(__i386__) || defined(__x86_64__) + +/* + * HVMOP_set_evtchn_upcall_vector: Set a <vector> that should be used for event + * channel upcalls on the specified <vcpu>. If set, + * this vector will be used in preference to the + * domain global callback via (see + * HVM_PARAM_CALLBACK_IRQ). + */ +#define HVMOP_set_evtchn_upcall_vector 23 +struct xen_hvm_evtchn_upcall_vector { + uint32_t vcpu; + uint8_t vector; +}; +typedef struct xen_hvm_evtchn_upcall_vector xen_hvm_evtchn_upcall_vector_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_evtchn_upcall_vector_t); + +#endif /* defined(__i386__) || defined(__x86_64__) */ + +#define HVMOP_guest_request_vm_event 24 + +/* HVMOP_altp2m: perform altp2m state operations */ +#define HVMOP_altp2m 25 + +#define HVMOP_ALTP2M_INTERFACE_VERSION 0x00000001 + +struct xen_hvm_altp2m_domain_state { + /* IN or OUT variable on/off */ + uint8_t state; +}; +typedef struct xen_hvm_altp2m_domain_state xen_hvm_altp2m_domain_state_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_domain_state_t); + +struct xen_hvm_altp2m_vcpu_enable_notify { + uint32_t vcpu_id; + uint32_t pad; + /* #VE info area gfn */ + uint64_t gfn; +}; +typedef struct xen_hvm_altp2m_vcpu_enable_notify xen_hvm_altp2m_vcpu_enable_notify_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_vcpu_enable_notify_t); + +struct xen_hvm_altp2m_view { + /* IN/OUT variable */ + uint16_t view; + /* Create view only: default access type + * NOTE: currently ignored */ + uint16_t hvmmem_default_access; /* xenmem_access_t */ +}; +typedef struct xen_hvm_altp2m_view xen_hvm_altp2m_view_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_view_t); + +struct xen_hvm_altp2m_set_mem_access { + /* view */ + uint16_t view; + /* Memory type */ + uint16_t hvmmem_access; /* xenmem_access_t */ + uint32_t pad; + /* gfn */ + uint64_t gfn; +}; +typedef struct xen_hvm_altp2m_set_mem_access xen_hvm_altp2m_set_mem_access_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_set_mem_access_t); + +struct xen_hvm_altp2m_change_gfn { + /* view */ + uint16_t view; + uint16_t pad1; + uint32_t pad2; + /* old gfn */ + uint64_t old_gfn; + /* new gfn, INVALID_GFN (~0UL) means revert */ + uint64_t new_gfn; +}; +typedef struct xen_hvm_altp2m_change_gfn xen_hvm_altp2m_change_gfn_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_change_gfn_t); + +struct xen_hvm_altp2m_op { + uint32_t version; /* HVMOP_ALTP2M_INTERFACE_VERSION */ + 
uint32_t cmd; +/* Get/set the altp2m state for a domain */ +#define HVMOP_altp2m_get_domain_state 1 +#define HVMOP_altp2m_set_domain_state 2 +/* Set the current VCPU to receive altp2m event notifications */ +#define HVMOP_altp2m_vcpu_enable_notify 3 +/* Create a new view */ +#define HVMOP_altp2m_create_p2m 4 +/* Destroy a view */ +#define HVMOP_altp2m_destroy_p2m 5 +/* Switch view for an entire domain */ +#define HVMOP_altp2m_switch_p2m 6 +/* Notify that a page of memory is to have specific access types */ +#define HVMOP_altp2m_set_mem_access 7 +/* Change a p2m entry to have a different gfn->mfn mapping */ +#define HVMOP_altp2m_change_gfn 8 + domid_t domain; + uint16_t pad1; + uint32_t pad2; + union { + struct xen_hvm_altp2m_domain_state domain_state; + struct xen_hvm_altp2m_vcpu_enable_notify enable_notify; + struct xen_hvm_altp2m_view view; + struct xen_hvm_altp2m_set_mem_access set_mem_access; + struct xen_hvm_altp2m_change_gfn change_gfn; + uint8_t pad[64]; + } u; +}; +typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t); + #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ Added: trunk/sys/xen/interface/hvm/hvm_xs_strings.h =================================================================== --- trunk/sys/xen/interface/hvm/hvm_xs_strings.h (rev 0) +++ trunk/sys/xen/interface/hvm/hvm_xs_strings.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -0,0 +1,83 @@ +/* $MidnightBSD$ */ +/****************************************************************************** + * hvm/hvm_xs_strings.h + * + * HVM xenstore strings used in HVMLOADER. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2013, Citrix Systems + */ + +#ifndef __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ +#define __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ + +#define HVM_XS_HVMLOADER "hvmloader" +#define HVM_XS_BIOS "hvmloader/bios" +#define HVM_XS_GENERATION_ID_ADDRESS "hvmloader/generation-id-address" +#define HVM_XS_ALLOW_MEMORY_RELOCATE "hvmloader/allow-memory-relocate" + +/* The following values allow additional ACPI tables to be added to the + * virtual ACPI BIOS that hvmloader constructs. The values specify the guest + * physical address and length of a block of ACPI tables to add. The format of + * the block is simply concatenated raw tables (which specify their own length + * in the ACPI header). 
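A short worked example for the xen_hvm_altp2m_op structure completed above: toggling the per-domain altp2m state. As before, HYPERVISOR_hvm_op() stands for a guest/toolstack hypercall stub that this header does not provide.

#include <string.h>

/* Sketch only: enable or disable altp2m for <domid>. */
static int demo_altp2m_set_state(domid_t domid, uint8_t on)
{
    struct xen_hvm_altp2m_op op;

    memset(&op, 0, sizeof(op));
    op.version = HVMOP_ALTP2M_INTERFACE_VERSION;
    op.cmd     = HVMOP_altp2m_set_domain_state;
    op.domain  = domid;
    op.u.domain_state.state = on;        /* 1 = enable, 0 = disable */

    return HYPERVISOR_hvm_op(HVMOP_altp2m, &op);
}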
+ */ +#define HVM_XS_ACPI_PT_ADDRESS "hvmloader/acpi/address" +#define HVM_XS_ACPI_PT_LENGTH "hvmloader/acpi/length" + +/* Any number of SMBIOS types can be passed through to an HVM guest using + * the following xenstore values. The values specify the guest physical + * address and length of a block of SMBIOS structures for hvmloader to use. + * The block is formatted in the following way: + * + * <length><struct><length><struct>... + * + * Each length separator is a 32b integer indicating the length of the next + * SMBIOS structure. For DMTF defined types (0 - 121), the passed in struct + * will replace the default structure in hvmloader. In addition, any + * OEM/vendortypes (128 - 255) will all be added. + */ +#define HVM_XS_SMBIOS_PT_ADDRESS "hvmloader/smbios/address" +#define HVM_XS_SMBIOS_PT_LENGTH "hvmloader/smbios/length" + +/* Set to 1 to enable SMBIOS default portable battery (type 22) values. */ +#define HVM_XS_SMBIOS_DEFAULT_BATTERY "hvmloader/smbios/default_battery" + +/* The following xenstore values are used to override some of the default + * string values in the SMBIOS table constructed in hvmloader. + */ +#define HVM_XS_BIOS_STRINGS "bios-strings" +#define HVM_XS_BIOS_VENDOR "bios-strings/bios-vendor" +#define HVM_XS_BIOS_VERSION "bios-strings/bios-version" +#define HVM_XS_SYSTEM_MANUFACTURER "bios-strings/system-manufacturer" +#define HVM_XS_SYSTEM_PRODUCT_NAME "bios-strings/system-product-name" +#define HVM_XS_SYSTEM_VERSION "bios-strings/system-version" +#define HVM_XS_SYSTEM_SERIAL_NUMBER "bios-strings/system-serial-number" +#define HVM_XS_ENCLOSURE_MANUFACTURER "bios-strings/enclosure-manufacturer" +#define HVM_XS_ENCLOSURE_SERIAL_NUMBER "bios-strings/enclosure-serial-number" +#define HVM_XS_BATTERY_MANUFACTURER "bios-strings/battery-manufacturer" +#define HVM_XS_BATTERY_DEVICE_NAME "bios-strings/battery-device-name" + +/* 1 to 99 OEM strings can be set in xenstore using values of the form + * below. These strings will be loaded into the SMBIOS type 11 structure. + */ +#define HVM_XS_OEM_STRINGS "bios-strings/oem-%d" + +#endif /* __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ */ Property changes on: trunk/sys/xen/interface/hvm/hvm_xs_strings.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/hvm/ioreq.h =================================================================== --- trunk/sys/xen/interface/hvm/ioreq.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/ioreq.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -35,6 +35,7 @@ #define IOREQ_TYPE_PIO 0 /* pio */ #define IOREQ_TYPE_COPY 1 /* mmio ops */ +#define IOREQ_TYPE_PCI_CONFIG 2 #define IOREQ_TYPE_TIMEOFFSET 7 #define IOREQ_TYPE_INVALIDATE 8 /* mapcache */ @@ -41,7 +42,13 @@ /* * VMExit dispatcher should cooperate with instruction decoder to * prepare this structure and notify service OS and DM by sending - * virq + * virq. 
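Referring back to the SMBIOS pass-through format in the hvm_xs_strings.h hunk above: the helper below sketches how a toolstack might build the <length><struct>... blob that hvmloader reads at HVM_XS_SMBIOS_PT_ADDRESS. The flat little-endian memcpy approach and the demo_* name are illustrative assumptions, not an existing API.

#include <stdint.h>
#include <string.h>

/* Sketch only: append one SMBIOS structure, prefixed by its 32-bit length
 * separator as described above. Returns the new write offset, or 0 if the
 * blob would overflow. Little-endian (x86 HVM) layout is assumed.
 */
static size_t demo_smbios_append(uint8_t *blob, size_t off, size_t blob_size,
                                 const void *smbios_struct, uint32_t len)
{
    if (off + sizeof(len) + len > blob_size)
        return 0;

    memcpy(blob + off, &len, sizeof(len));                /* 32b length */
    memcpy(blob + off + sizeof(len), smbios_struct, len); /* raw structure */
    return off + sizeof(len) + len;
}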
+ * + * For I/O type IOREQ_TYPE_PCI_CONFIG, the physical address is formatted + * as follows: + * + * 63....48|47..40|39..35|34..32|31........0 + * SEGMENT |BUS |DEV |FN |OFFSET */ struct ioreq { uint64_t addr; /* physical address */ @@ -77,30 +84,21 @@ #define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */ struct buffered_iopage { - unsigned int read_pointer; - unsigned int write_pointer; +#ifdef __XEN__ + union bufioreq_pointers { + struct { +#endif + uint32_t read_pointer; + uint32_t write_pointer; +#ifdef __XEN__ + }; + uint64_t full; + } ptrs; +#endif buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM]; }; /* NB. Size of this structure must be no greater than one page. */ typedef struct buffered_iopage buffered_iopage_t; -#if defined(__ia64__) -struct pio_buffer { - uint32_t page_offset; - uint32_t pointer; - uint32_t data_end; - uint32_t buf_size; - void *opaque; -}; - -#define PIO_BUFFER_IDE_PRIMARY 0 /* I/O port = 0x1F0 */ -#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */ -#define PIO_BUFFER_ENTRY_NUM 2 -struct buffered_piopage { - struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM]; - uint8_t buffer[1]; -}; -#endif /* defined(__ia64__) */ - /* * ACPI Control/Event register locations. Location is controlled by a * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION. @@ -133,7 +131,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/hvm/params.h =================================================================== --- trunk/sys/xen/interface/hvm/params.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/params.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -17,6 +17,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007, Keir Fraser */ #ifndef __XEN_PUBLIC_HVM_PARAMS_H__ @@ -55,17 +57,54 @@ #define HVM_PARAM_BUFIOREQ_PFN 6 #define HVM_PARAM_BUFIOREQ_EVTCHN 26 -#ifdef __ia64__ +#if defined(__i386__) || defined(__x86_64__) -#define HVM_PARAM_NVRAM_FD 7 -#define HVM_PARAM_VHPT_SIZE 8 -#define HVM_PARAM_BUFPIOREQ_PFN 9 +/* + * Viridian enlightenments + * + * (See http://download.microsoft.com/download/A/B/4/AB43A34E-BDD0-4FA6-BDEF-79EEF16E880B/Hypervisor%20Top%20Level%20Functional%20Specification%20v4.0.docx) + * + * To expose viridian enlightenments to the guest set this parameter + * to the desired feature mask. The base feature set must be present + * in any valid feature mask. + */ +#define HVM_PARAM_VIRIDIAN 9 -#elif defined(__i386__) || defined(__x86_64__) +/* Base+Freq viridian feature sets: + * + * - Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) + * - APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * - Virtual Processor index MSR (HV_X64_MSR_VP_INDEX) + * - Timer frequency MSRs (HV_X64_MSR_TSC_FREQUENCY and + * HV_X64_MSR_APIC_FREQUENCY) + */ +#define _HVMPV_base_freq 0 +#define HVMPV_base_freq (1 << _HVMPV_base_freq) -/* Expose Viridian interfaces to this HVM guest? */ -#define HVM_PARAM_VIRIDIAN 9 +/* Feature set modifications */ +/* Disable timer frequency MSRs (HV_X64_MSR_TSC_FREQUENCY and + * HV_X64_MSR_APIC_FREQUENCY). + * This modification restores the viridian feature set to the + * original 'base' set exposed in releases prior to Xen 4.4. 
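The address layout documented in the ioreq.h hunk above for IOREQ_TYPE_PCI_CONFIG requests unpacks as follows; this is only the comment restated as code.

#include <stdint.h>

/* Decode the addr field of an IOREQ_TYPE_PCI_CONFIG request:
 * bits 63..48 segment, 47..40 bus, 39..35 device, 34..32 function,
 * 31..0 config space offset.
 */
static void demo_decode_pci_config_addr(uint64_t addr, uint16_t *seg,
                                        uint8_t *bus, uint8_t *dev,
                                        uint8_t *fn, uint32_t *offset)
{
    *seg    = (uint16_t)(addr >> 48);
    *bus    = (uint8_t)((addr >> 40) & 0xff);
    *dev    = (uint8_t)((addr >> 35) & 0x1f);
    *fn     = (uint8_t)((addr >> 32) & 0x07);
    *offset = (uint32_t)(addr & 0xffffffff);
}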
+ */ +#define _HVMPV_no_freq 1 +#define HVMPV_no_freq (1 << _HVMPV_no_freq) + +/* Enable Partition Time Reference Counter (HV_X64_MSR_TIME_REF_COUNT) */ +#define _HVMPV_time_ref_count 2 +#define HVMPV_time_ref_count (1 << _HVMPV_time_ref_count) + +/* Enable Reference TSC Page (HV_X64_MSR_REFERENCE_TSC) */ +#define _HVMPV_reference_tsc 3 +#define HVMPV_reference_tsc (1 << _HVMPV_reference_tsc) + +#define HVMPV_feature_mask \ + (HVMPV_base_freq | \ + HVMPV_no_freq | \ + HVMPV_time_ref_count | \ + HVMPV_reference_tsc) + #endif /* @@ -126,28 +165,34 @@ */ #define HVM_PARAM_ACPI_IOPORTS_LOCATION 19 -/* Enable blocking memory events, async or sync (pause vcpu until response) - * onchangeonly indicates messages only on a change of value */ +/* Deprecated */ #define HVM_PARAM_MEMORY_EVENT_CR0 20 #define HVM_PARAM_MEMORY_EVENT_CR3 21 #define HVM_PARAM_MEMORY_EVENT_CR4 22 #define HVM_PARAM_MEMORY_EVENT_INT3 23 #define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP 25 +#define HVM_PARAM_MEMORY_EVENT_MSR 30 -#define HVMPME_MODE_MASK (3 << 0) -#define HVMPME_mode_disabled 0 -#define HVMPME_mode_async 1 -#define HVMPME_mode_sync 2 -#define HVMPME_onchangeonly (1 << 2) - /* Boolean: Enable nestedhvm (hvm only) */ #define HVM_PARAM_NESTEDHVM 24 /* Params for the mem event rings */ #define HVM_PARAM_PAGING_RING_PFN 27 -#define HVM_PARAM_ACCESS_RING_PFN 28 +#define HVM_PARAM_MONITOR_RING_PFN 28 #define HVM_PARAM_SHARING_RING_PFN 29 -#define HVM_NR_PARAMS 30 +/* SHUTDOWN_* action in case of a triple fault */ +#define HVM_PARAM_TRIPLE_FAULT_REASON 31 +#define HVM_PARAM_IOREQ_SERVER_PFN 32 +#define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33 + +/* Location of the VM Generation ID in guest physical address space. */ +#define HVM_PARAM_VM_GENERATION_ID_ADDR 34 + +/* Boolean: Enable altp2m */ +#define HVM_PARAM_ALTP2M 35 + +#define HVM_NR_PARAMS 36 + #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ Added: trunk/sys/xen/interface/hvm/pvdrivers.h =================================================================== --- trunk/sys/xen/interface/hvm/pvdrivers.h (rev 0) +++ trunk/sys/xen/interface/hvm/pvdrivers.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -0,0 +1,50 @@ +/* $MidnightBSD$ */ +/* + * pvdrivers.h: Register of PV drivers product numbers. + * Copyright (c) 2012, Citrix Systems Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _XEN_PUBLIC_PVDRIVERS_H_ +#define _XEN_PUBLIC_PVDRIVERS_H_ + +/* + * This is the master registry of product numbers for + * PV drivers. 
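Tying the Viridian feature bits above together: a toolstack enables enlightenments by writing a mask of HVMPV_* flags to HVM_PARAM_VIRIDIAN; the base set must be included and the value must stay within HVMPV_feature_mask. How the parameter is actually written (for example through a libxenctrl call) is outside this header, so the sketch below only composes and validates the mask.

#include <stdint.h>

/* Sketch only: build a Viridian feature mask for HVM_PARAM_VIRIDIAN. */
static int demo_viridian_mask(uint64_t *out)
{
    uint64_t mask = HVMPV_base_freq            /* mandatory base set */
                  | HVMPV_time_ref_count       /* partition reference counter */
                  | HVMPV_reference_tsc;       /* reference TSC page */

    if (mask & ~(uint64_t)HVMPV_feature_mask)
        return -1;                             /* unknown bits requested */

    *out = mask;
    return 0;
}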
+ * If you need a new product number allocating, please + * post to xen-devel at lists.xensource.com. You should NOT use + * a product number without allocating one. + * If you maintain a separate versioning and distribution path + * for PV drivers you should have a separate product number so + * that your drivers can be separated from others. + * + * During development, you may use the product ID to + * indicate a driver which is yet to be released. + */ + +#define PVDRIVERS_PRODUCT_LIST(EACH) \ + EACH("xensource-windows", 0x0001) /* Citrix */ \ + EACH("gplpv-windows", 0x0002) /* James Harper */ \ + EACH("linux", 0x0003) \ + EACH("xenserver-windows-v7.0+", 0x0004) /* Citrix */ \ + EACH("xenserver-windows-v7.2+", 0x0005) /* Citrix */ \ + EACH("experimental", 0xffff) + +#endif /* _XEN_PUBLIC_PVDRIVERS_H_ */ Property changes on: trunk/sys/xen/interface/hvm/pvdrivers.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/hvm/save.h =================================================================== --- trunk/sys/xen/interface/hvm/save.h 2020-02-08 19:26:42 UTC (rev 12302) +++ trunk/sys/xen/interface/hvm/save.h 2020-02-08 19:27:19 UTC (rev 12303) @@ -103,9 +103,7 @@ #if defined(__i386__) || defined(__x86_64__) #include "../arch-x86/hvm/save.h" -#elif defined(__ia64__) -#include "../arch-ia64/hvm/save.h" -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) #include "../arch-arm/hvm/save.h" #else #error "unsupported architecture" From laffer1 at midnightbsd.org Sat Feb 8 14:27:35 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:27:35 -0500 (EST) Subject: [Midnightbsd-cvs] src [12304] trunk/sys/xen/interface/arch-x86: sync with FreeBSD 11-stable Message-ID: <202002081927.018JRZsB060802@stargazer.midnightbsd.org> Revision: 12304 http://svnweb.midnightbsd.org/src/?rev=12304 Author: laffer1 Date: 2020-02-08 14:27:35 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/arch-x86/cpuid.h trunk/sys/xen/interface/arch-x86/hvm/save.h trunk/sys/xen/interface/arch-x86/xen-mca.h trunk/sys/xen/interface/arch-x86/xen-x86_32.h trunk/sys/xen/interface/arch-x86/xen-x86_64.h trunk/sys/xen/interface/arch-x86/xen.h Added Paths: ----------- trunk/sys/xen/interface/arch-x86/pmu.h Modified: trunk/sys/xen/interface/arch-x86/cpuid.h =================================================================== --- trunk/sys/xen/interface/arch-x86/cpuid.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/cpuid.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -31,12 +31,20 @@ #ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__ #define __XEN_PUBLIC_ARCH_X86_CPUID_H__ -/* Xen identification leaves start at 0x40000000. */ +/* + * For compatibility with other hypervisor interfaces, the Xen cpuid leaves + * can be found at the first otherwise unused 0x100 aligned boundary starting + * from 0x40000000. + * + * e.g If viridian extensions are enabled for an HVM domain, the Xen cpuid + * leaves will start at 0x40000100 + */ + #define XEN_CPUID_FIRST_LEAF 0x40000000 #define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i)) /* - * Leaf 1 (0x40000000) + * Leaf 1 (0x40000x00) * EAX: Largest Xen-information leaf. 
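Because of the 0x100-stepping rule in the cpuid.h comment above, a guest cannot hard-code leaf 0x40000000; it has to probe for the "XenVMMXenVMM" signature. A sketch of that probe, using GCC/Clang's <cpuid.h> __cpuid macro and an assumed probe window ending at 0x4000ff00:

#include <cpuid.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: locate the base of the Xen cpuid leaves, which sit at the
 * first otherwise unused 0x100-aligned boundary from 0x40000000.
 * Returns 0 if no Xen signature is found in the probed window.
 */
static uint32_t demo_find_xen_cpuid_base(void)
{
    for (uint32_t base = 0x40000000; base < 0x40010000; base += 0x100) {
        uint32_t eax, ebx, ecx, edx;
        char sig[13];

        __cpuid(base, eax, ebx, ecx, edx);
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        sig[12] = '\0';

        /* EAX reports the largest Xen-information leaf; expect at least 2. */
        if (strcmp(sig, "XenVMMXenVMM") == 0 && eax - base >= 2)
            return base;
    }
    return 0;
}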
All leaves up to an including @EAX * are supported by the Xen host. * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification @@ -47,7 +55,7 @@ #define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */ /* - * Leaf 2 (0x40000001) + * Leaf 2 (0x40000x01) * EAX[31:16]: Xen major version. * EAX[15: 0]: Xen minor version. * EBX-EDX: Reserved (currently all zeroes). @@ -54,7 +62,7 @@ */ /* - * Leaf 3 (0x40000002) + * Leaf 3 (0x40000x02) * EAX: Number of hypercall transfer pages. This register is always guaranteed * to specify one hypercall page. * EBX: Base address of Xen-specific MSRs. @@ -66,4 +74,18 @@ #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) +/* + * Leaf 5 (0x40000x04) + * HVM-specific features + * EAX: Features + * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + */ +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ +/* Memory mapped from other domains has valid IOMMU entries */ +#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) +#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ + +#define XEN_CPUID_MAX_NUM_LEAVES 4 + #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ Modified: trunk/sys/xen/interface/arch-x86/hvm/save.h =================================================================== --- trunk/sys/xen/interface/arch-x86/hvm/save.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/hvm/save.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -270,15 +270,18 @@ }; static inline int _hvm_hw_fix_cpu(void *h) { - struct hvm_hw_cpu *new=h; - struct hvm_hw_cpu_compat *old=h; + union hvm_hw_cpu_union { + struct hvm_hw_cpu nat; + struct hvm_hw_cpu_compat cmp; + } *ucpu = (union hvm_hw_cpu_union *)h; + /* If we copy from the end backwards, we should * be able to do the modification in-place */ - new->error_code=old->error_code; - new->pending_event=old->pending_event; - new->tsc=old->tsc; - new->msr_tsc_aux=0; + ucpu->nat.error_code = ucpu->cmp.error_code; + ucpu->nat.pending_event = ucpu->cmp.pending_event; + ucpu->nat.tsc = ucpu->cmp.tsc; + ucpu->nat.msr_tsc_aux = 0; return 0; } @@ -542,7 +545,7 @@ */ struct hvm_hw_cpu_xsave { - uint64_t xfeature_mask; + uint64_t xfeature_mask; /* Ignored */ uint64_t xcr0; /* Updated by XSETBV */ uint64_t xcr0_accum; /* Updated by XSETBV */ struct { @@ -566,6 +569,8 @@ struct hvm_viridian_domain_context { uint64_t hypercall_gpa; uint64_t guest_os_id; + uint64_t time_ref_count; + uint64_t reference_tsc; }; DECLARE_HVM_SAVE_TYPE(VIRIDIAN_DOMAIN, 15, struct hvm_viridian_domain_context); @@ -578,13 +583,49 @@ struct hvm_vmce_vcpu { uint64_t caps; + uint64_t mci_ctl2_bank0; + uint64_t mci_ctl2_bank1; }; DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu); +struct hvm_tsc_adjust { + uint64_t tsc_adjust; +}; + +DECLARE_HVM_SAVE_TYPE(TSC_ADJUST, 19, struct hvm_tsc_adjust); + + +struct hvm_msr { + uint32_t count; + struct hvm_one_msr { + uint32_t index; + uint32_t _rsvd; + uint64_t val; +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + } msr[]; +#elif defined(__GNUC__) + } msr[0]; +#else + } msr[1 /* variable size */]; +#endif +}; + +#define CPU_MSR_CODE 20 + /* * Largest type-code in use */ -#define HVM_SAVE_CODE_MAX 18 +#define HVM_SAVE_CODE_MAX 20 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: 
nil + * End: + */ Added: trunk/sys/xen/interface/arch-x86/pmu.h =================================================================== --- trunk/sys/xen/interface/arch-x86/pmu.h (rev 0) +++ trunk/sys/xen/interface/arch-x86/pmu.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -0,0 +1,168 @@ +/* $MidnightBSD$ */ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_PMU_H__ +#define __XEN_PUBLIC_ARCH_X86_PMU_H__ + +/* x86-specific PMU definitions */ + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; +typedef struct xen_pmu_amd_ctxt xen_pmu_amd_ctxt_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_amd_ctxt_t); + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; +typedef struct xen_pmu_cntr_pair xen_pmu_cntr_pair_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_cntr_pair_t); + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. 
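Both the new CPU_MSR_CODE record (struct hvm_msr) and the PMU contexts in this added pmu.h header end in C99 flexible array members (msr[] and regs[]), so callers size them at run time. A minimal allocation sketch for the MSR record, using the standard idiom and a hypothetical helper name:

#include <stdint.h>
#include <stdlib.h>

/* Sketch only: allocate a struct hvm_msr with room for <n> entries; the
 * flexible array member means the trailing storage is sized by the caller.
 */
static struct hvm_msr *demo_alloc_hvm_msr(uint32_t n)
{
    struct hvm_msr *blk = calloc(1, sizeof(*blk) + n * sizeof(blk->msr[0]));

    if (blk != NULL)
        blk->count = n;
    return blk;
}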
+ */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; +typedef struct xen_pmu_intel_ctxt xen_pmu_intel_ctxt_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_intel_ctxt_t); + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; +typedef struct xen_pmu_regs xen_pmu_regs_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_regs_t); + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* Padding for adding new registers to xen_pmu_regs in the future */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). 
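As a small usage illustration of the xen_pmu_arch layout being defined here: a profiling consumer checks pmu_flags before interpreting the register snapshot, for example to attribute a sample to user or kernel mode. The field names are exactly those in this hunk; the demo_* wrapper is invented.

#include <stdint.h>

/* Sketch only: classify a PMU sample from the shared xen_pmu_arch state. */
struct demo_sample {
    uint64_t ip;        /* sampled instruction pointer */
    int      user;      /* 1 if taken in user mode (PMU_SAMPLE_USER) */
    int      from_pv;   /* 1 if it came from a PV guest (PMU_SAMPLE_PV) */
};

static void demo_read_sample(const struct xen_pmu_arch *p,
                             struct demo_sample *s)
{
    s->ip      = p->r.regs.ip;
    s->user    = (p->pmu_flags & PMU_SAMPLE_USER) != 0;
    s->from_pv = (p->pmu_flags & PMU_SAMPLE_PV)   != 0;
}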
+ * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include MSR banks + * that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; +typedef struct xen_pmu_arch xen_pmu_arch_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_arch_t); + +#endif /* __XEN_PUBLIC_ARCH_X86_PMU_H__ */ +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ + Property changes on: trunk/sys/xen/interface/arch-x86/pmu.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/arch-x86/xen-mca.h =================================================================== --- trunk/sys/xen/interface/arch-x86/xen-mca.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/xen-mca.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -415,7 +415,7 @@ struct xen_mc_inject_v2 { uint32_t flags; - struct xenctl_cpumap cpumap; + struct xenctl_bitmap cpumap; }; #endif Modified: trunk/sys/xen/interface/arch-x86/xen-x86_32.h =================================================================== --- trunk/sys/xen/interface/arch-x86/xen-x86_32.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/xen-x86_32.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -105,6 +105,7 @@ do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0; \ (hnd).p = val; \ } while ( 0 ) +#define int64_aligned_t int64_t __attribute__((aligned(8))) #define uint64_aligned_t uint64_t __attribute__((aligned(8))) #define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name #define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name) @@ -164,7 +165,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/arch-x86/xen-x86_64.h =================================================================== --- trunk/sys/xen/interface/arch-x86/xen-x86_64.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/xen-x86_64.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -195,7 +195,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/arch-x86/xen.h =================================================================== --- trunk/sys/xen/interface/arch-x86/xen.h 2020-02-08 19:27:19 UTC (rev 12303) +++ trunk/sys/xen/interface/arch-x86/xen.h 2020-02-08 19:27:35 UTC (rev 12304) @@ -39,6 +39,14 @@ typedef type * __guest_handle_ ## name #endif +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. 
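The guest handle macros are easiest to see with a toy type. The sketch below is purely illustrative (the demo_* names are invented); it shows the pattern of declaring a handle for a structure and pointing it at guest memory with set_xen_guest_handle(), which reads the same on x86 and ARM even though the underlying handle representations differ, as the comment above notes.

/* Sketch only: declare and initialise a guest handle for a made-up type. */
struct demo_buf {
    uint32_t len;
    uint32_t pad;
};
typedef struct demo_buf demo_buf_t;
DEFINE_XEN_GUEST_HANDLE(demo_buf_t);

struct demo_hypercall_arg {
    uint32_t flags;
    XEN_GUEST_HANDLE(demo_buf_t) buf;   /* guest pointer stored in a struct */
};

static void demo_fill_arg(struct demo_hypercall_arg *arg, demo_buf_t *p)
{
    arg->flags = 0;
    set_xen_guest_handle(arg->buf, p);  /* expands to the per-arch raw setter */
}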
+ */ #define __DEFINE_XEN_GUEST_HANDLE(name, type) \ ___DEFINE_XEN_GUEST_HANDLE(name, type); \ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) @@ -45,6 +53,7 @@ #define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) #define __XEN_GUEST_HANDLE(name) __guest_handle_ ## name #define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name) +#define XEN_GUEST_HANDLE_PARAM(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle_raw(hnd, val) do { (hnd).p = val; } while (0) #ifdef __XEN_TOOLS__ #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) @@ -62,8 +71,12 @@ #define PRI_xen_pfn "lx" #endif +#define XEN_HAVE_PV_GUEST_ENTRY 1 + +#define XEN_HAVE_PV_UPCALL_MASK 1 + /* - * SEGMENT DESCRIPTOR TABLES + * `incontents 200 segdesc Segment Descriptor Tables */ /* * ` enum neg_errnoval @@ -75,11 +88,24 @@ * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) + +/* + * ` enum neg_errnoval + * ` HYPERVISOR_update_descriptor(u64 pa, u64 desc); + * ` + * ` @pa The machine physical address of the descriptor to + * ` update. Must be either a descriptor page or writable. + * ` @desc The descriptor value to update, in the same format as a + * ` native descriptor table entry. + */ + /* Maximum number of virtual CPUs in legacy multi-processor guests. */ #define XEN_LEGACY_MAX_VCPUS 32 @@ -86,6 +112,7 @@ #ifndef __ASSEMBLY__ typedef unsigned long xen_ulong_t; +#define PRI_xen_ulong "lx" /* * ` enum neg_errnoval @@ -128,6 +155,15 @@ /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ @@ -185,14 +221,58 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it has + * been set to invalid e.g. due to the p2m being too large for the 3-level + * p2m tree. In this case the linear mapper p2m list anchored at p2m_vaddr + * is to be used. + */ xen_pfn_t pfn_to_mfn_frame_list_list; unsigned long nmi_reason; - uint64_t pad[32]; + /* + * Following three fields are valid if p2m_cr3 contains a value different + * from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. 
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register state + * and holds the folded machine frame number (via xen_pfn_to_cr3) of a + * L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All entries + * in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 and + * a value with the least significant bit set indicates that a mapping + * update is in progress. This allows guest external software (e.g. in Dom0) + * to verify that read mappings are consistent and whether they have changed + * since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an atomic + * write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ +#ifdef __i386__ + /* There's no room for this field in the generic structure. */ + uint32_t wc_sec_hi; +#endif }; typedef struct arch_shared_info arch_shared_info_t; +#if defined(__XEN__) || defined(__XEN_TOOLS__) +/* + * struct xen_arch_domainconfig's ABI is covered by + * XEN_DOMCTL_INTERFACE_VERSION. + */ +struct xen_arch_domainconfig { + char dummy; +}; +#endif + #endif /* !__ASSEMBLY__ */ /* @@ -230,7 +310,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil From laffer1 at midnightbsd.org Sat Feb 8 14:27:58 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:27:58 -0500 (EST) Subject: [Midnightbsd-cvs] src [12305] trunk/sys/xen/interface/arch-arm/hvm/save.h: sync with FreeBSD 11-stable Message-ID: <202002081927.018JRwMm060861@stargazer.midnightbsd.org> Revision: 12305 http://svnweb.midnightbsd.org/src/?rev=12305 Author: laffer1 Date: 2020-02-08 14:27:58 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/arch-arm/hvm/save.h Modified: trunk/sys/xen/interface/arch-arm/hvm/save.h =================================================================== --- trunk/sys/xen/interface/arch-arm/hvm/save.h 2020-02-08 19:27:35 UTC (rev 12304) +++ trunk/sys/xen/interface/arch-arm/hvm/save.h 2020-02-08 19:27:58 UTC (rev 12305) @@ -32,7 +32,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil From laffer1 at midnightbsd.org Sat Feb 8 14:28:09 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:28:09 -0500 (EST) Subject: [Midnightbsd-cvs] src [12306] trunk/sys/xen/interface: sync with FreeBSD 11-stable Message-ID: <202002081928.018JS90k060912@stargazer.midnightbsd.org> Revision: 12306 http://svnweb.midnightbsd.org/src/?rev=12306 Author: laffer1 Date: 2020-02-08 14:28:08 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/interface/arch-arm.h trunk/sys/xen/interface/callback.h trunk/sys/xen/interface/dom0_ops.h trunk/sys/xen/interface/domctl.h trunk/sys/xen/interface/elfnote.h trunk/sys/xen/interface/event_channel.h trunk/sys/xen/interface/features.h trunk/sys/xen/interface/grant_table.h trunk/sys/xen/interface/kexec.h trunk/sys/xen/interface/memory.h trunk/sys/xen/interface/nmi.h 
trunk/sys/xen/interface/physdev.h trunk/sys/xen/interface/platform.h trunk/sys/xen/interface/sched.h trunk/sys/xen/interface/sysctl.h trunk/sys/xen/interface/tmem.h trunk/sys/xen/interface/trace.h trunk/sys/xen/interface/vcpu.h trunk/sys/xen/interface/version.h trunk/sys/xen/interface/xen-compat.h trunk/sys/xen/interface/xen.h trunk/sys/xen/interface/xenoprof.h Added Paths: ----------- trunk/sys/xen/interface/errno.h trunk/sys/xen/interface/gcov.h trunk/sys/xen/interface/pmu.h trunk/sys/xen/interface/vm_event.h Modified: trunk/sys/xen/interface/arch-arm.h =================================================================== --- trunk/sys/xen/interface/arch-arm.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/arch-arm.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -28,94 +28,254 @@ #ifndef __XEN_PUBLIC_ARCH_ARM_H__ #define __XEN_PUBLIC_ARCH_ARM_H__ -/* hypercall calling convention - * ---------------------------- +/* + * `incontents 50 arm_abi Hypercall Calling Convention * * A hypercall is issued using the ARM HVC instruction. * * A hypercall can take up to 5 arguments. These are passed in - * registers, the first argument in r0, the second argument in r1, the - * third in r2, the forth in r3 and the fifth in r4. + * registers, the first argument in x0/r0 (for arm64/arm32 guests + * respectively irrespective of whether the underlying hypervisor is + * 32- or 64-bit), the second argument in x1/r1, the third in x2/r2, + * the forth in x3/r3 and the fifth in x4/r4. * - * The hypercall number is passed in r12. + * The hypercall number is passed in r12 (arm) or x16 (arm64). In both + * cases the relevant ARM procedure calling convention specifies this + * is an inter-procedure-call scratch register (e.g. for use in linker + * stubs). This use does not conflict with use during a hypercall. * * The HVC ISS must contain a Xen specific TAG: XEN_HYPERCALL_TAG. * - * The return value is in r0. + * The return value is in x0/r0. * - * The hypercall will clobber r12 and the argument registers used by - * that hypercall (except r0 which is the return value) i.e. a 2 - * argument hypercall will clobber r1 and a 4 argument hypercall will - * clobber r1, r2 and r3. + * The hypercall will clobber x16/r12 and the argument registers used + * by that hypercall (except r0 which is the return value) i.e. in + * addition to x16/r12 a 2 argument hypercall will clobber x1/r1 and a + * 4 argument hypercall will clobber x1/r1, x2/r2 and x3/r3. * + * Parameter structs passed to hypercalls are laid out according to + * the Procedure Call Standard for the ARM Architecture (AAPCS, AKA + * EABI) and Procedure Call Standard for the ARM 64-bit Architecture + * (AAPCS64). Where there is a conflict the 64-bit standard should be + * used regardless of guest type. Structures which are passed as + * hypercall arguments are always little endian. + * + * All memory which is shared with other entities in the system + * (including the hypervisor and other guests) must reside in memory + * which is mapped as Normal Inner-cacheable. This applies to: + * - hypercall arguments passed via a pointer to guest memory. + * - memory shared via the grant table mechanism (including PV I/O + * rings etc). + * - memory shared with the hypervisor (struct shared_info, struct + * vcpu_info, the grant table, etc). + * + * Any Inner cache allocation strategy (Write-Back, Write-Through etc) + * is acceptable. There is no restriction on the Outer-cacheability. 
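A concrete, deliberately minimal rendering of the calling convention spelled out above, for a two-argument hypercall on AArch64. Real guests use the hypercall stubs shipped with their OS; this sketch only shows where the number, the arguments, the HVC tag and the result go.

#include <stdint.h>

/* Sketch only: two-argument AArch64 hypercall per the convention above.
 * Number in x16, arguments in x0/x1, HVC immediate is XEN_HYPERCALL_TAG
 * (0xEA1), result returned in x0; x1 and x16 are clobbered.
 */
static inline int64_t demo_xen_hypercall2(uint64_t op, uint64_t a1, uint64_t a2)
{
    register uint64_t r16 __asm__("x16") = op;
    register uint64_t r0  __asm__("x0")  = a1;
    register uint64_t r1  __asm__("x1")  = a2;

    __asm__ __volatile__("hvc #0xea1"          /* XEN_HYPERCALL_TAG */
                         : "+r" (r0), "+r" (r1), "+r" (r16)
                         :
                         : "memory");
    return (int64_t)r0;
}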
*/ +/* + * `incontents 55 arm_hcall Supported Hypercalls + * + * Xen on ARM makes extensive use of hardware facilities and therefore + * only a subset of the potential hypercalls are required. + * + * Since ARM uses second stage paging any machine/physical addresses + * passed to hypercalls are Guest Physical Addresses (Intermediate + * Physical Addresses) unless otherwise noted. + * + * The following hypercalls (and sub operations) are supported on the + * ARM platform. Other hypercalls should be considered + * unavailable/unsupported. + * + * HYPERVISOR_memory_op + * All generic sub-operations + * + * HYPERVISOR_domctl + * All generic sub-operations, with the exception of: + * * XEN_DOMCTL_irq_permission (not yet implemented) + * + * HYPERVISOR_sched_op + * All generic sub-operations, with the exception of: + * * SCHEDOP_block -- prefer wfi hardware instruction + * + * HYPERVISOR_console_io + * All generic sub-operations + * + * HYPERVISOR_xen_version + * All generic sub-operations + * + * HYPERVISOR_event_channel_op + * All generic sub-operations + * + * HYPERVISOR_physdev_op + * No sub-operations are currenty supported + * + * HYPERVISOR_sysctl + * All generic sub-operations, with the exception of: + * * XEN_SYSCTL_page_offline_op + * * XEN_SYSCTL_get_pmstat + * * XEN_SYSCTL_pm_op + * + * HYPERVISOR_hvm_op + * Exactly these sub-operations are supported: + * * HVMOP_set_param + * * HVMOP_get_param + * + * HYPERVISOR_grant_table_op + * All generic sub-operations + * + * HYPERVISOR_vcpu_op + * Exactly these sub-operations are supported: + * * VCPUOP_register_vcpu_info + * * VCPUOP_register_runstate_memory_area + * + * + * Other notes on the ARM ABI: + * + * - struct start_info is not exported to ARM guests. + * + * - struct shared_info is mapped by ARM guests using the + * HYPERVISOR_memory_op sub-op XENMEM_add_to_physmap, passing + * XENMAPSPACE_shared_info as space parameter. + * + * - All the per-cpu struct vcpu_info are mapped by ARM guests using the + * HYPERVISOR_vcpu_op sub-op VCPUOP_register_vcpu_info, including cpu0 + * struct vcpu_info. + * + * - The grant table is mapped using the HYPERVISOR_memory_op sub-op + * XENMEM_add_to_physmap, passing XENMAPSPACE_grant_table as space + * parameter. The memory range specified under the Xen compatible + * hypervisor node on device tree can be used as target gpfn for the + * mapping. + * + * - Xenstore is initialized by using the two hvm_params + * HVM_PARAM_STORE_PFN and HVM_PARAM_STORE_EVTCHN. They can be read + * with the HYPERVISOR_hvm_op sub-op HVMOP_get_param. + * + * - The paravirtualized console is initialized by using the two + * hvm_params HVM_PARAM_CONSOLE_PFN and HVM_PARAM_CONSOLE_EVTCHN. They + * can be read with the HYPERVISOR_hvm_op sub-op HVMOP_get_param. + * + * - Event channel notifications are delivered using the percpu GIC + * interrupt specified under the Xen compatible hypervisor node on + * device tree. + * + * - The device tree Xen compatible node is fully described under Linux + * at Documentation/devicetree/bindings/arm/xen.txt. 
+ */ + #define XEN_HYPERCALL_TAG 0XEA1 +#define int64_aligned_t int64_t __attribute__((aligned(8))) +#define uint64_aligned_t uint64_t __attribute__((aligned(8))) #ifndef __ASSEMBLY__ -#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ - typedef struct { type *p; } __guest_handle_ ## name +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ + typedef union { type *p; unsigned long q; } \ + __guest_handle_ ## name; \ + typedef union { type *p; uint64_aligned_t q; } \ + __guest_handle_64_ ## name; +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. On ARM is always 8 bytes sizes and 8 bytes + * aligned. + * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as an + * hypercall argument. It is 4 bytes on aarch32 and 8 bytes on aarch64. + */ #define __DEFINE_XEN_GUEST_HANDLE(name, type) \ ___DEFINE_XEN_GUEST_HANDLE(name, type); \ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) -#define __XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define __XEN_GUEST_HANDLE(name) __guest_handle_64_ ## name #define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name) -#define set_xen_guest_handle_raw(hnd, val) do { (hnd).p = val; } while (0) +#define XEN_GUEST_HANDLE_PARAM(name) __guest_handle_ ## name +#define set_xen_guest_handle_raw(hnd, val) \ + do { \ + typeof(&(hnd)) _sxghr_tmp = &(hnd); \ + _sxghr_tmp->q = 0; \ + _sxghr_tmp->p = val; \ + } while ( 0 ) #ifdef __XEN_TOOLS__ #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) #endif #define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val) -struct cpu_user_regs +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +/* Anonymous union includes both 32- and 64-bit names (e.g., r0/x0). */ +# define __DECL_REG(n64, n32) union { \ + uint64_t n64; \ + uint32_t n32; \ + } +#else +/* Non-gcc sources must always use the proper 64-bit name (e.g., x0). */ +#define __DECL_REG(n64, n32) uint64_t n64 +#endif + +struct vcpu_guest_core_regs { - uint32_t r0; - uint32_t r1; - uint32_t r2; - uint32_t r3; - uint32_t r4; - uint32_t r5; - uint32_t r6; - uint32_t r7; - uint32_t r8; - uint32_t r9; - uint32_t r10; - union { - uint32_t r11; - uint32_t fp; - }; - uint32_t r12; + /* Aarch64 Aarch32 */ + __DECL_REG(x0, r0_usr); + __DECL_REG(x1, r1_usr); + __DECL_REG(x2, r2_usr); + __DECL_REG(x3, r3_usr); + __DECL_REG(x4, r4_usr); + __DECL_REG(x5, r5_usr); + __DECL_REG(x6, r6_usr); + __DECL_REG(x7, r7_usr); + __DECL_REG(x8, r8_usr); + __DECL_REG(x9, r9_usr); + __DECL_REG(x10, r10_usr); + __DECL_REG(x11, r11_usr); + __DECL_REG(x12, r12_usr); - uint32_t sp; /* r13 - SP: Valid for Hyp. frames only, o/w banked (see below) */ + __DECL_REG(x13, sp_usr); + __DECL_REG(x14, lr_usr); - /* r14 - LR: is the same physical register as LR_usr */ - union { - uint32_t lr; /* r14 - LR: Valid for Hyp. Same physical register as lr_usr. */ - uint32_t lr_usr; - }; + __DECL_REG(x15, __unused_sp_hyp); - uint32_t pc; /* Return IP */ - uint32_t cpsr; /* Return mode */ - uint32_t pad0; /* Doubleword-align the kernel half of the frame */ + __DECL_REG(x16, lr_irq); + __DECL_REG(x17, sp_irq); - /* Outer guest frame only from here on... 
*/ + __DECL_REG(x18, lr_svc); + __DECL_REG(x19, sp_svc); - uint32_t r8_fiq, r9_fiq, r10_fiq, r11_fiq, r12_fiq; + __DECL_REG(x20, lr_abt); + __DECL_REG(x21, sp_abt); - uint32_t sp_usr; /* LR_usr is the same register as LR, see above */ + __DECL_REG(x22, lr_und); + __DECL_REG(x23, sp_und); - uint32_t sp_svc, sp_abt, sp_und, sp_irq, sp_fiq; - uint32_t lr_svc, lr_abt, lr_und, lr_irq, lr_fiq; + __DECL_REG(x24, r8_fiq); + __DECL_REG(x25, r9_fiq); + __DECL_REG(x26, r10_fiq); + __DECL_REG(x27, r11_fiq); + __DECL_REG(x28, r12_fiq); - uint32_t spsr_svc, spsr_abt, spsr_und, spsr_irq, spsr_fiq; + __DECL_REG(x29, sp_fiq); + __DECL_REG(x30, lr_fiq); - uint32_t pad1; /* Doubleword-align the user half of the frame */ + /* Return address and mode */ + __DECL_REG(pc64, pc32); /* ELR_EL2 */ + uint32_t cpsr; /* SPSR_EL2 */ + + union { + uint32_t spsr_el1; /* AArch64 */ + uint32_t spsr_svc; /* AArch32 */ + }; + + /* AArch32 guests only */ + uint32_t spsr_fiq, spsr_irq, spsr_und, spsr_abt; + + /* AArch64 guests only */ + uint64_t sp_el0; + uint64_t sp_el1, elr_el1; }; -typedef struct cpu_user_regs cpu_user_regs_t; -DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); +typedef struct vcpu_guest_core_regs vcpu_guest_core_regs_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_core_regs_t); +#undef __DECL_REG + typedef uint64_t xen_pfn_t; #define PRI_xen_pfn PRIx64 @@ -123,30 +283,77 @@ /* Only one. All other VCPUS must use VCPUOP_register_vcpu_info */ #define XEN_LEGACY_MAX_VCPUS 1 -typedef uint32_t xen_ulong_t; +typedef uint64_t xen_ulong_t; +#define PRI_xen_ulong PRIx64 +#if defined(__XEN__) || defined(__XEN_TOOLS__) struct vcpu_guest_context { - struct cpu_user_regs user_regs; /* User-level CPU registers */ +#define _VGCF_online 0 +#define VGCF_online (1<<_VGCF_online) + uint32_t flags; /* VGCF_* */ + struct vcpu_guest_core_regs user_regs; /* Core CPU registers */ + uint32_t sctlr; - uint32_t ttbr0, ttbr1, ttbcr; + uint64_t ttbcr, ttbr0, ttbr1; }; typedef struct vcpu_guest_context vcpu_guest_context_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); -struct arch_vcpu_info { }; +/* + * struct xen_arch_domainconfig's ABI is covered by + * XEN_DOMCTL_INTERFACE_VERSION. + */ +#define XEN_DOMCTL_CONFIG_GIC_NATIVE 0 +#define XEN_DOMCTL_CONFIG_GIC_V2 1 +#define XEN_DOMCTL_CONFIG_GIC_V3 2 +struct xen_arch_domainconfig { + /* IN/OUT */ + uint8_t gic_version; + /* IN */ + uint32_t nr_spis; + /* + * OUT + * Based on the property clock-frequency in the DT timer node. + * The property may be present when the bootloader/firmware doesn't + * set correctly CNTFRQ which hold the timer frequency. + * + * As it's not possible to trap this register, we have to replicate + * the value in the guest DT. 
+ * + * = 0 => property not present + * > 0 => Value of the property + * + */ + uint32_t clock_frequency; +}; +#endif /* __XEN__ || __XEN_TOOLS__ */ + +struct arch_vcpu_info { +}; typedef struct arch_vcpu_info arch_vcpu_info_t; -struct arch_shared_info { }; +struct arch_shared_info { +}; typedef struct arch_shared_info arch_shared_info_t; typedef uint64_t xen_callback_t; -#endif /* ifndef __ASSEMBLY __ */ +#endif -/* PSR bits (CPSR, SPSR)*/ +#if defined(__XEN__) || defined(__XEN_TOOLS__) -/* 0-4: Mode */ -#define PSR_MODE_MASK 0x1f +/* PSR bits (CPSR, SPSR) */ + +#define PSR_THUMB (1<<5) /* Thumb Mode enable */ +#define PSR_FIQ_MASK (1<<6) /* Fast Interrupt mask */ +#define PSR_IRQ_MASK (1<<7) /* Interrupt mask */ +#define PSR_ABT_MASK (1<<8) /* Asynchronous Abort mask */ +#define PSR_BIG_ENDIAN (1<<9) /* arm32: Big Endian Mode */ +#define PSR_DBG_MASK (1<<9) /* arm64: Debug Exception mask */ +#define PSR_IT_MASK (0x0600fc00) /* Thumb If-Then Mask */ +#define PSR_JAZELLE (1<<24) /* Jazelle Mode */ + +/* 32 bit modes */ #define PSR_MODE_USR 0x10 #define PSR_MODE_FIQ 0x11 #define PSR_MODE_IRQ 0x12 @@ -157,19 +364,102 @@ #define PSR_MODE_UND 0x1b #define PSR_MODE_SYS 0x1f -#define PSR_THUMB (1<<5) /* Thumb Mode enable */ -#define PSR_FIQ_MASK (1<<6) /* Fast Interrupt mask */ -#define PSR_IRQ_MASK (1<<7) /* Interrupt mask */ -#define PSR_ABT_MASK (1<<8) /* Asynchronous Abort mask */ -#define PSR_BIG_ENDIAN (1<<9) /* Big Endian Mode */ -#define PSR_JAZELLE (1<<24) /* Jazelle Mode */ +/* 64 bit modes */ +#define PSR_MODE_BIT 0x10 /* Set iff AArch32 */ +#define PSR_MODE_EL3h 0x0d +#define PSR_MODE_EL3t 0x0c +#define PSR_MODE_EL2h 0x09 +#define PSR_MODE_EL2t 0x08 +#define PSR_MODE_EL1h 0x05 +#define PSR_MODE_EL1t 0x04 +#define PSR_MODE_EL0t 0x00 +#define PSR_GUEST32_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_SVC) +#define PSR_GUEST64_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_EL1h) + +#define SCTLR_GUEST_INIT 0x00c50078 + +/* + * Virtual machine platform (memory layout, interrupts) + * + * These are defined for consistency between the tools and the + * hypervisor. Guests must not rely on these hardcoded values but + * should instead use the FDT. + */ + +/* Physical Address Space */ + +/* + * vGIC mappings: Only one set of mapping is used by the guest. + * Therefore they can overlap. + */ + +/* vGIC v2 mappings */ +#define GUEST_GICD_BASE 0x03001000ULL +#define GUEST_GICD_SIZE 0x00001000ULL +#define GUEST_GICC_BASE 0x03002000ULL +#define GUEST_GICC_SIZE 0x00000100ULL + +/* vGIC v3 mappings */ +#define GUEST_GICV3_GICD_BASE 0x03001000ULL +#define GUEST_GICV3_GICD_SIZE 0x00010000ULL + +#define GUEST_GICV3_RDIST_STRIDE 0x20000ULL +#define GUEST_GICV3_RDIST_REGIONS 1 + +#define GUEST_GICV3_GICR0_BASE 0x03020000ULL /* vCPU0 - vCPU127 */ +#define GUEST_GICV3_GICR0_SIZE 0x01000000ULL + +/* + * 16MB == 4096 pages reserved for guest to use as a region to map its + * grant table in. 
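Pulling together the constants defined above (VGCF_online from the vcpu_guest_context definition, PSR_GUEST64_INIT and SCTLR_GUEST_INIT from this block): a toolstack seeds the boot vCPU of a 64-bit guest roughly as sketched below. A real implementation (libxc, for instance) also programs the TTBRs and more; treat this purely as an illustration of where the PSR and SCTLR values go.

#include <stdint.h>
#include <string.h>

/* Sketch only: minimal boot-time register state for an AArch64 vCPU. */
static void demo_init_arm64_vcpu(struct vcpu_guest_context *ctxt,
                                 uint64_t entry_point)
{
    memset(ctxt, 0, sizeof(*ctxt));
    ctxt->flags          = VGCF_online;
    ctxt->user_regs.pc64 = entry_point;        /* guest kernel entry address */
    ctxt->user_regs.cpsr = PSR_GUEST64_INIT;   /* EL1h, asynchronous exceptions masked */
    ctxt->sctlr          = SCTLR_GUEST_INIT;   /* MMU and caches disabled at boot */
}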
+ */ +#define GUEST_GNTTAB_BASE 0x38000000ULL +#define GUEST_GNTTAB_SIZE 0x01000000ULL + +#define GUEST_MAGIC_BASE 0x39000000ULL +#define GUEST_MAGIC_SIZE 0x01000000ULL + +#define GUEST_RAM_BANKS 2 + +#define GUEST_RAM0_BASE 0x40000000ULL /* 3GB of low RAM @ 1GB */ +#define GUEST_RAM0_SIZE 0xc0000000ULL + +#define GUEST_RAM1_BASE 0x0200000000ULL /* 1016GB of RAM @ 8GB */ +#define GUEST_RAM1_SIZE 0xfe00000000ULL + +#define GUEST_RAM_BASE GUEST_RAM0_BASE /* Lowest RAM address */ +/* Largest amount of actual RAM, not including holes */ +#define GUEST_RAM_MAX (GUEST_RAM0_SIZE + GUEST_RAM1_SIZE) +/* Suitable for e.g. const uint64_t ramfoo[] = GUEST_RAM_BANK_FOOS; */ +#define GUEST_RAM_BANK_BASES { GUEST_RAM0_BASE, GUEST_RAM1_BASE } +#define GUEST_RAM_BANK_SIZES { GUEST_RAM0_SIZE, GUEST_RAM1_SIZE } + +/* Interrupts */ +#define GUEST_TIMER_VIRT_PPI 27 +#define GUEST_TIMER_PHYS_S_PPI 29 +#define GUEST_TIMER_PHYS_NS_PPI 30 +#define GUEST_EVTCHN_PPI 31 + +/* PSCI functions */ +#define PSCI_cpu_suspend 0 +#define PSCI_cpu_off 1 +#define PSCI_cpu_on 2 +#define PSCI_migrate 3 + +#endif + +#ifndef __ASSEMBLY__ +/* Stub definition of PMU structure */ +typedef struct xen_pmu_arch { uint8_t dummy; } xen_pmu_arch_t; +#endif + #endif /* __XEN_PUBLIC_ARCH_ARM_H__ */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/callback.h =================================================================== --- trunk/sys/xen/interface/callback.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/callback.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -37,7 +37,7 @@ * @extra_args == Operation-specific extra arguments (NULL if none). */ -/* ia64, x86: Callback for event delivery. */ +/* x86: Callback for event delivery. */ #define CALLBACKTYPE_event 0 /* x86: Failsafe callback when guest state cannot be restored by Xen. */ @@ -114,7 +114,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/dom0_ops.h =================================================================== --- trunk/sys/xen/interface/dom0_ops.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/dom0_ops.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -113,7 +113,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/domctl.h =================================================================== --- trunk/sys/xen/interface/domctl.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/domctl.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -35,8 +35,10 @@ #include "xen.h" #include "grant_table.h" +#include "hvm/save.h" +#include "memory.h" -#define XEN_DOMCTL_INTERFACE_VERSION 0x00000008 +#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000b /* * NB. xen_domctl.domain is an IN/OUT parameter for this operation. @@ -47,7 +49,7 @@ /* IN parameters */ uint32_t ssidref; xen_domain_handle_t handle; - /* Is this an HVM guest (as opposed to a PV guest)? */ + /* Is this an HVM guest (as opposed to a PVH or PV guest)? */ #define _XEN_DOMCTL_CDF_hvm_guest 0 #define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest) /* Use hardware-assisted paging if available? */ @@ -59,7 +61,11 @@ /* Disable out-of-sync shadow page tables? 
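Stepping back to the guest memory layout earlier in this arch-arm.h hunk: the GUEST_RAM_BANK_* macros are meant to be dropped straight into array initialisers, as the comment beside them suggests. A tiny example:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Using the bank macros exactly as the header comment proposes. */
static const uint64_t demo_bank_base[] = GUEST_RAM_BANK_BASES;
static const uint64_t demo_bank_size[] = GUEST_RAM_BANK_SIZES;

static void demo_dump_guest_ram(void)
{
    for (int i = 0; i < GUEST_RAM_BANKS; i++)
        printf("bank %d: base 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
               i, demo_bank_base[i], demo_bank_size[i]);
}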
*/ #define _XEN_DOMCTL_CDF_oos_off 3 #define XEN_DOMCTL_CDF_oos_off (1U<<_XEN_DOMCTL_CDF_oos_off) + /* Is this a PVH guest (as opposed to an HVM or PV guest)? */ +#define _XEN_DOMCTL_CDF_pvh_guest 4 +#define XEN_DOMCTL_CDF_pvh_guest (1U<<_XEN_DOMCTL_CDF_pvh_guest) uint32_t flags; + struct xen_arch_domainconfig config; }; typedef struct xen_domctl_createdomain xen_domctl_createdomain_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t); @@ -89,6 +95,9 @@ /* Being debugged. */ #define _XEN_DOMINF_debugged 6 #define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged) +/* domain is PVH */ +#define _XEN_DOMINF_pvh_guest 7 +#define XEN_DOMINF_pvh_guest (1U<<_XEN_DOMINF_pvh_guest) /* XEN_DOMINF_shutdown guest-supplied code. */ #define XEN_DOMINF_shutdownmask 255 #define XEN_DOMINF_shutdownshift 16 @@ -95,11 +104,13 @@ uint32_t flags; /* XEN_DOMINF_* */ uint64_aligned_t tot_pages; uint64_aligned_t max_pages; + uint64_aligned_t outstanding_pages; uint64_aligned_t shr_pages; uint64_aligned_t paged_pages; uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */ uint64_aligned_t cpu_time; uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */ +#define XEN_INVALID_MAX_VCPU_ID (~0U) /* Domain has no vcpus? */ uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */ uint32_t ssidref; xen_domain_handle_t handle; @@ -136,30 +147,9 @@ #define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31) #define XEN_DOMCTL_PFINFO_XTAB (0xfU<<28) /* invalid page */ #define XEN_DOMCTL_PFINFO_XALLOC (0xeU<<28) /* allocate-only page */ -#define XEN_DOMCTL_PFINFO_PAGEDTAB (0x8U<<28) +#define XEN_DOMCTL_PFINFO_BROKEN (0xdU<<28) /* broken page */ #define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28) -struct xen_domctl_getpageframeinfo { - /* IN variables. */ - uint64_aligned_t gmfn; /* GMFN to query */ - /* OUT variables. */ - /* Is the page PINNED to a type? */ - uint32_t type; /* see above type defs */ -}; -typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t); - - -/* XEN_DOMCTL_getpageframeinfo2 */ -struct xen_domctl_getpageframeinfo2 { - /* IN variables. */ - uint64_aligned_t num; - /* IN/OUT variables. */ - XEN_GUEST_HANDLE_64(uint32) array; -}; -typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t); - /* XEN_DOMCTL_getpageframeinfo3 */ struct xen_domctl_getpageframeinfo3 { /* IN variables. */ @@ -279,12 +269,47 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); +/* Get/set the NUMA node(s) with which the guest has affinity with. */ +/* XEN_DOMCTL_setnodeaffinity */ +/* XEN_DOMCTL_getnodeaffinity */ +struct xen_domctl_nodeaffinity { + struct xenctl_bitmap nodemap;/* IN */ +}; +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t); + + /* Get/set which physical cpus a vcpu can execute on. */ /* XEN_DOMCTL_setvcpuaffinity */ /* XEN_DOMCTL_getvcpuaffinity */ struct xen_domctl_vcpuaffinity { - uint32_t vcpu; /* IN */ - struct xenctl_cpumap cpumap; /* IN/OUT */ + /* IN variables. */ + uint32_t vcpu; + /* Set/get the hard affinity for vcpu */ +#define _XEN_VCPUAFFINITY_HARD 0 +#define XEN_VCPUAFFINITY_HARD (1U<<_XEN_VCPUAFFINITY_HARD) + /* Set/get the soft affinity for vcpu */ +#define _XEN_VCPUAFFINITY_SOFT 1 +#define XEN_VCPUAFFINITY_SOFT (1U<<_XEN_VCPUAFFINITY_SOFT) + uint32_t flags; + /* + * IN/OUT variables. 
+ * + * Both are IN/OUT for XEN_DOMCTL_setvcpuaffinity, in which case they + * contain effective hard or/and soft affinity. That is, upon successful + * return, cpumap_soft, contains the intersection of the soft affinity, + * hard affinity and the cpupool's online CPUs for the domain (if + * XEN_VCPUAFFINITY_SOFT was set in flags). cpumap_hard contains the + * intersection between hard affinity and the cpupool's online CPUs (if + * XEN_VCPUAFFINITY_HARD was set in flags). + * + * Both are OUT-only for XEN_DOMCTL_getvcpuaffinity, in which case they + * contain the plain hard and/or soft affinity masks that were set during + * previous successful calls to XEN_DOMCTL_setvcpuaffinity (or the + * default values), without intersecting or altering them in any way. + */ + struct xenctl_bitmap cpumap_hard; + struct xenctl_bitmap cpumap_soft; }; typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t); @@ -300,10 +325,12 @@ /* XEN_DOMCTL_scheduler_op */ /* Scheduler types. */ -#define XEN_SCHEDULER_SEDF 4 +/* #define XEN_SCHEDULER_SEDF 4 (Removed) */ #define XEN_SCHEDULER_CREDIT 5 #define XEN_SCHEDULER_CREDIT2 6 #define XEN_SCHEDULER_ARINC653 7 +#define XEN_SCHEDULER_RTDS 8 + /* Set or get info? */ #define XEN_DOMCTL_SCHEDOP_putinfo 0 #define XEN_DOMCTL_SCHEDOP_getinfo 1 @@ -311,13 +338,6 @@ uint32_t sched_id; /* XEN_SCHEDULER_* */ uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */ union { - struct xen_domctl_sched_sedf { - uint64_aligned_t period; - uint64_aligned_t slice; - uint64_aligned_t latency; - uint32_t extratime; - uint32_t weight; - } sedf; struct xen_domctl_sched_credit { uint16_t weight; uint16_t cap; @@ -325,6 +345,10 @@ struct xen_domctl_sched_credit2 { uint16_t weight; } credit2; + struct xen_domctl_sched_rtds { + uint32_t period; + uint32_t budget; + } rtds; } u; }; typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t; @@ -384,29 +408,9 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t); -/* XEN_DOMCTL_arch_setup */ -#define _XEN_DOMAINSETUP_hvm_guest 0 -#define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest) -#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */ -#define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query) -#define _XEN_DOMAINSETUP_sioemu_guest 2 -#define XEN_DOMAINSETUP_sioemu_guest (1UL<<_XEN_DOMAINSETUP_sioemu_guest) -typedef struct xen_domctl_arch_setup { - uint64_aligned_t flags; /* XEN_DOMAINSETUP_* */ -#ifdef __ia64__ - uint64_aligned_t bp; /* mpaddr of boot param area */ - uint64_aligned_t maxmem; /* Highest memory address for MDT. */ - uint64_aligned_t xsi_va; /* Xen shared_info area virtual address. */ - uint32_t hypercall_imm; /* Break imm for Xen hypercalls. */ - int8_t vhpt_size_log2; /* Log2 of VHPT size. 
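The scheduler hunk above drops SEDF and adds the RTDS per-domain parameters (period and budget). The sketch below shows how a toolstack might fill a reduced version of the structure; the struct here is a local mirror of only the RTDS member, submit_scheduler_op() is a stand-in for the real domctl plumbing, and the parameter values are arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from domctl.h above, reduced to the RTDS case. */
    #define XEN_SCHEDULER_RTDS         8
    #define XEN_DOMCTL_SCHEDOP_putinfo 0

    struct sched_op {
        uint32_t sched_id;                        /* XEN_SCHEDULER_* */
        uint32_t cmd;                             /* XEN_DOMCTL_SCHEDOP_* */
        struct { uint32_t period, budget; } rtds;
    };

    /* Stand-in for the real hypercall path (libxc/privcmd); just echoes. */
    static int submit_scheduler_op(const struct sched_op *op)
    {
        printf("sched_id=%u cmd=%u period=%u budget=%u\n",
               (unsigned)op->sched_id, (unsigned)op->cmd,
               (unsigned)op->rtds.period, (unsigned)op->rtds.budget);
        return 0;
    }

    int main(void)
    {
        struct sched_op op = {
            .sched_id = XEN_SCHEDULER_RTDS,
            .cmd      = XEN_DOMCTL_SCHEDOP_putinfo,
            .rtds     = { .period = 10000, .budget = 4000 },   /* example values */
        };
        return submit_scheduler_op(&op);
    }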
*/ -#endif -} xen_domctl_arch_setup_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t); - - /* XEN_DOMCTL_settimeoffset */ struct xen_domctl_settimeoffset { - int32_t time_offset_seconds; /* applied to domain wallclock time */ + int64_aligned_t time_offset_seconds; /* applied to domain wallclock time */ }; typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t); @@ -430,14 +434,6 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t); -/* XEN_DOMCTL_real_mode_area */ -struct xen_domctl_real_mode_area { - uint32_t log; /* log2 of Real Mode Area size */ -}; -typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t); - - /* XEN_DOMCTL_sendtrigger */ #define XEN_DOMCTL_SENDTRIGGER_NMI 0 #define XEN_DOMCTL_SENDTRIGGER_RESET 1 @@ -452,12 +448,33 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t); -/* Assign PCI device to HVM guest. Sets up IOMMU structures. */ +/* Assign a device to a guest. Sets up IOMMU structures. */ /* XEN_DOMCTL_assign_device */ /* XEN_DOMCTL_test_assign_device */ -/* XEN_DOMCTL_deassign_device */ +/* + * XEN_DOMCTL_deassign_device: The behavior of this DOMCTL differs + * between the different type of device: + * - PCI device (XEN_DOMCTL_DEV_PCI) will be reassigned to DOM0 + * - DT device (XEN_DOMCTL_DT_PCI) will left unassigned. DOM0 + * will have to call XEN_DOMCTL_assign_device in order to use the + * device. + */ +#define XEN_DOMCTL_DEV_PCI 0 +#define XEN_DOMCTL_DEV_DT 1 struct xen_domctl_assign_device { - uint32_t machine_sbdf; /* machine PCI ID of assigned device */ + uint32_t dev; /* XEN_DOMCTL_DEV_* */ + union { + struct { + uint32_t machine_sbdf; /* machine PCI ID of assigned device */ + } pci; + struct { + uint32_t size; /* Length of the path */ + XEN_GUEST_HANDLE_64(char) path; /* path to the device tree node */ + } dt; + } u; + /* IN */ +#define XEN_DOMCTL_DEV_RDM_RELAXED 1 + uint32_t flag; /* flag of assigned device */ }; typedef struct xen_domctl_assign_device xen_domctl_assign_device_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t); @@ -481,6 +498,7 @@ PT_IRQ_TYPE_ISA, PT_IRQ_TYPE_MSI, PT_IRQ_TYPE_MSI_TRANSLATE, + PT_IRQ_TYPE_SPI, /* ARM: valid range 32-1019 */ } pt_irq_type_t; struct xen_domctl_bind_pt_irq { uint32_t machine_irq; @@ -501,6 +519,9 @@ uint32_t gflags; uint64_aligned_t gtable; } msi; + struct { + uint16_t spi; + } spi; } u; }; typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t; @@ -508,6 +529,7 @@ /* Bind machine I/O address range -> HVM address range. */ +/* If this returns -E2BIG lower nr_mfns value. 
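XEN_DOMCTL_assign_device now takes a device type plus a union, so PCI and device-tree devices share one call. Below is an illustrative sketch for the PCI case only; the (segment << 16) | (bus << 8) | devfn packing of machine_sbdf is an assumption based on the usual Xen SBDF encoding, and the device address used is made up:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from domctl.h above. */
    #define XEN_DOMCTL_DEV_PCI         0
    #define XEN_DOMCTL_DEV_RDM_RELAXED 1

    /* Assumed packing: segment in the high 16 bits, then bus, then devfn. */
    static uint32_t machine_sbdf(uint16_t seg, uint8_t bus, uint8_t devfn)
    {
        return ((uint32_t)seg << 16) | ((uint32_t)bus << 8) | devfn;
    }

    int main(void)
    {
        uint32_t dev  = XEN_DOMCTL_DEV_PCI;
        uint32_t flag = XEN_DOMCTL_DEV_RDM_RELAXED;        /* relaxed RDM policy */
        uint32_t sbdf = machine_sbdf(0x0000, 0x03, (0x00 << 3) | 0x0); /* 0000:03:00.0 */

        printf("dev=%u flag=%u machine_sbdf=0x%08x\n",
               (unsigned)dev, (unsigned)flag, (unsigned)sbdf);
        return 0;
    }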
*/ /* XEN_DOMCTL_memory_mapping */ #define DPCI_ADD_MAPPING 1 #define DPCI_REMOVE_MAPPING 0 @@ -545,6 +567,7 @@ #define XEN_DOMCTL_MEM_CACHEATTR_WP 5 #define XEN_DOMCTL_MEM_CACHEATTR_WB 6 #define XEN_DOMCTL_MEM_CACHEATTR_UCM 7 +#define XEN_DOMCTL_DELETE_MEM_CACHEATTR (~(uint32_t)0) struct xen_domctl_pin_mem_cacheattr { uint64_aligned_t start, end; uint32_t type; /* XEN_DOMCTL_MEM_CACHEATTR_* */ @@ -572,28 +595,20 @@ uint16_t sysenter_callback_cs; uint8_t syscall32_disables_events; uint8_t sysenter_disables_events; - uint64_aligned_t mcg_cap; +#if defined(__GNUC__) + union { + uint64_aligned_t mcg_cap; + struct hvm_vmce_vcpu vmce; + }; +#else + struct hvm_vmce_vcpu vmce; #endif +#endif }; typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t); /* - * Set optimizaton features for a domain - */ -/* XEN_DOMCTL_set_opt_feature */ -struct xen_domctl_set_opt_feature { -#if defined(__ia64__) - struct xen_ia64_opt_feature optf; -#else - /* Make struct non-empty: do not depend on this field name! */ - uint64_t dummy; -#endif -}; -typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t); - -/* * Set the target domain for a domain */ /* XEN_DOMCTL_set_target */ @@ -617,6 +632,22 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t); #endif +/* + * Arranges that if the domain suspends (specifically, if it shuts + * down with code SHUTDOWN_suspend), this event channel will be + * notified. + * + * This is _instead of_ the usual notification to the global + * VIRQ_DOM_EXC. (In most systems that pirq is owned by xenstored.) + * + * Only one subscription per domain is possible. Last subscriber + * wins; others are silently displaced. + * + * NB that contrary to the rather general name, it only applies to + * domain shutdown with code suspend. Shutdown for other reasons + * (including crash), and domain death, are notified to VIRQ_DOM_EXC + * regardless. + */ /* XEN_DOMCTL_subscribe */ struct xen_domctl_subscribe { uint32_t port; /* IN */ @@ -665,18 +696,13 @@ /* XEN_DOMCTL_gettscinfo */ /* XEN_DOMCTL_settscinfo */ -struct xen_guest_tsc_info { +typedef struct xen_domctl_tsc_info { + /* IN/OUT */ uint32_t tsc_mode; uint32_t gtsc_khz; uint32_t incarnation; uint32_t pad; uint64_aligned_t elapsed_nsec; -}; -typedef struct xen_guest_tsc_info xen_guest_tsc_info_t; -DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t); -typedef struct xen_domctl_tsc_info { - XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */ - xen_guest_tsc_info_t info; /* IN */ } xen_domctl_tsc_info_t; /* XEN_DOMCTL_gdbsx_guestmemio guest mem io */ @@ -706,12 +732,23 @@ }; /* - * Memory event operations + * VM event operations */ -/* XEN_DOMCTL_mem_event_op */ +/* XEN_DOMCTL_vm_event_op */ /* + * There are currently three rings available for VM events: + * sharing, monitor and paging. This hypercall allows one to + * control these rings (enable/disable), as well as to signal + * to the hypervisor to pull responses (resume) from the given + * ring. + */ +#define XEN_VM_EVENT_ENABLE 0 +#define XEN_VM_EVENT_DISABLE 1 +#define XEN_VM_EVENT_RESUME 2 + +/* * Domain memory paging * Page memory in and out. * Domctl interface to set up and tear down the @@ -718,7 +755,7 @@ * pager<->hypervisor interface. Use XENMEM_paging_op* * to perform per-page operations. 
* - * The XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE domctl returns several + * The XEN_VM_EVENT_PAGING_ENABLE domctl returns several * non-standard error codes to indicate why paging could not be enabled: * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest * EMLINK - guest has iommu passthrough enabled @@ -725,35 +762,32 @@ * EXDEV - guest has PoD enabled * EBUSY - guest has or had paging enabled, ring buffer still active */ -#define XEN_DOMCTL_MEM_EVENT_OP_PAGING 1 +#define XEN_DOMCTL_VM_EVENT_OP_PAGING 1 -#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE 0 -#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE 1 - /* - * Access permissions. + * Monitor helper. * * As with paging, use the domctl for teardown/setup of the * helper<->hypervisor interface. * - * There are HVM hypercalls to set the per-page access permissions of every - * page in a domain. When one of these permissions--independent, read, - * write, and execute--is violated, the VCPU is paused and a memory event - * is sent with what happened. (See public/mem_event.h) . + * The monitor interface can be used to register for various VM events. For + * example, there are HVM hypercalls to set the per-page access permissions + * of every page in a domain. When one of these permissions--independent, + * read, write, and execute--is violated, the VCPU is paused and a memory event + * is sent with what happened. The memory event handler can then resume the + * VCPU and redo the access with a XEN_VM_EVENT_RESUME option. * - * The memory event handler can then resume the VCPU and redo the access - * with a XENMEM_access_op_resume hypercall. + * See public/vm_event.h for the list of available events that can be + * subscribed to via the monitor interface. * - * The XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE domctl returns several + * The XEN_VM_EVENT_MONITOR_* domctls returns * non-standard error codes to indicate why access could not be enabled: * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest * EBUSY - guest has or had access enabled, ring buffer still active + * */ -#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS 2 +#define XEN_DOMCTL_VM_EVENT_OP_MONITOR 2 -#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE 0 -#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE 1 - /* * Sharing ENOMEM helper. * @@ -767,21 +801,18 @@ * Note that shring can be turned on (as per the domctl below) * *without* this ring being setup. */ -#define XEN_DOMCTL_MEM_EVENT_OP_SHARING 3 +#define XEN_DOMCTL_VM_EVENT_OP_SHARING 3 -#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE 0 -#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE 1 - /* Use for teardown/setup of helper<->hypervisor interface for paging, * access and sharing.*/ -struct xen_domctl_mem_event_op { - uint32_t op; /* XEN_DOMCTL_MEM_EVENT_OP_*_* */ - uint32_t mode; /* XEN_DOMCTL_MEM_EVENT_OP_* */ +struct xen_domctl_vm_event_op { + uint32_t op; /* XEN_VM_EVENT_* */ + uint32_t mode; /* XEN_DOMCTL_VM_EVENT_OP_* */ uint32_t port; /* OUT: event channel for ring */ }; -typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t; -DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t); +typedef struct xen_domctl_vm_event_op xen_domctl_vm_event_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vm_event_op_t); /* * Memory sharing operations @@ -822,7 +853,7 @@ /* IN: VCPU that this call applies to. */ uint32_t vcpu; /* - * SET: xfeature support mask of struct (IN) + * SET: Ignored. 
* GET: xfeature support mask of struct (IN/OUT) * xfeature mask is served as identifications of the saving format * so that compatible CPUs can have a check on format to decide @@ -850,6 +881,189 @@ typedef struct xen_domctl_set_access_required xen_domctl_set_access_required_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_access_required_t); +struct xen_domctl_set_broken_page_p2m { + uint64_aligned_t pfn; +}; +typedef struct xen_domctl_set_broken_page_p2m xen_domctl_set_broken_page_p2m_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_broken_page_p2m_t); + +/* + * XEN_DOMCTL_set_max_evtchn: sets the maximum event channel port + * number the guest may use. Use this limit the amount of resources + * (global mapping space, xenheap) a guest may use for event channels. + */ +struct xen_domctl_set_max_evtchn { + uint32_t max_port; +}; +typedef struct xen_domctl_set_max_evtchn xen_domctl_set_max_evtchn_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_max_evtchn_t); + +/* + * ARM: Clean and invalidate caches associated with given region of + * guest memory. + */ +struct xen_domctl_cacheflush { + /* IN: page range to flush. */ + xen_pfn_t start_pfn, nr_pfns; +}; +typedef struct xen_domctl_cacheflush xen_domctl_cacheflush_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cacheflush_t); + +#if defined(__i386__) || defined(__x86_64__) +struct xen_domctl_vcpu_msr { + uint32_t index; + uint32_t reserved; + uint64_aligned_t value; +}; +typedef struct xen_domctl_vcpu_msr xen_domctl_vcpu_msr_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msr_t); + +/* + * XEN_DOMCTL_set_vcpu_msrs / XEN_DOMCTL_get_vcpu_msrs. + * + * Input: + * - A NULL 'msrs' guest handle is a request for the maximum 'msr_count'. + * - Otherwise, 'msr_count' is the number of entries in 'msrs'. + * + * Output for get: + * - If 'msr_count' is less than the number Xen needs to write, -ENOBUFS shall + * be returned and 'msr_count' updated to reflect the intended number. + * - On success, 'msr_count' shall indicate the number of MSRs written, which + * may be less than the maximum if some are not currently used by the vcpu. + * + * Output for set: + * - If Xen encounters an error with a specific MSR, -EINVAL shall be returned + * and 'msr_count' shall be set to the offending index, to aid debugging. + */ +struct xen_domctl_vcpu_msrs { + uint32_t vcpu; /* IN */ + uint32_t msr_count; /* IN/OUT */ + XEN_GUEST_HANDLE_64(xen_domctl_vcpu_msr_t) msrs; /* IN/OUT */ +}; +typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t); +#endif + +/* XEN_DOMCTL_setvnumainfo: specifies a virtual NUMA topology for the guest */ +struct xen_domctl_vnuma { + /* IN: number of vNUMA nodes to setup. Shall be greater than 0 */ + uint32_t nr_vnodes; + /* IN: number of memory ranges to setup */ + uint32_t nr_vmemranges; + /* + * IN: number of vCPUs of the domain (used as size of the vcpu_to_vnode + * array declared below). Shall be equal to the domain's max_vcpus. + */ + uint32_t nr_vcpus; + uint32_t pad; /* must be zero */ + + /* + * IN: array for specifying the distances of the vNUMA nodes + * between each others. Shall have nr_vnodes*nr_vnodes elements. + */ + XEN_GUEST_HANDLE_64(uint) vdistance; + /* + * IN: array for specifying to what vNUMA node each vCPU belongs. + * Shall have nr_vcpus elements. + */ + XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode; + /* + * IN: array for specifying on what physical NUMA node each vNUMA + * node is placed. Shall have nr_vnodes elements. 
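The XEN_DOMCTL_get_vcpu_msrs comment above describes a two-step sizing protocol: pass a NULL buffer to learn the maximum count, then retry while the call fails with -ENOBUFS and updates msr_count. A self-contained sketch of that loop follows; get_vcpu_msrs() here is a simulated stand-in for the real domctl, and the MSR indexes it reports are invented for the demonstration:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Mirrors xen_domctl_vcpu_msr above. */
    struct vcpu_msr { uint32_t index; uint32_t reserved; uint64_t value; };

    /* Simulated stand-in: NULL buffer asks for the maximum count; a too-small
     * buffer fails with -ENOBUFS and updates *count, as the header describes. */
    static int get_vcpu_msrs(uint32_t *count, struct vcpu_msr *buf)
    {
        static const struct vcpu_msr fake[3] = {
            { 0x1d9, 0, 0 }, { 0x600, 0, 0 }, { 0x6e0, 0, 0 },
        };
        uint32_t needed = 3;

        if (!buf || *count < needed) {
            *count = needed;
            return buf ? -ENOBUFS : 0;
        }
        memcpy(buf, fake, sizeof(fake));
        *count = needed;          /* number of MSRs actually written */
        return 0;
    }

    int main(void)
    {
        uint32_t count = 0;
        struct vcpu_msr *buf = NULL;
        int rc;

        get_vcpu_msrs(&count, NULL);             /* 1. query maximum count */
        do {                                      /* 2. allocate and fetch  */
            free(buf);
            buf = calloc(count, sizeof(*buf));
            if (!buf)
                return 1;
            rc = get_vcpu_msrs(&count, buf);
        } while (rc == -ENOBUFS);

        printf("fetched %u MSRs, first index 0x%x\n",
               (unsigned)count, (unsigned)buf[0].index);
        free(buf);
        return rc;
    }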
+ */ + XEN_GUEST_HANDLE_64(uint) vnode_to_pnode; + /* + * IN: array for specifying the memory ranges. Shall have + * nr_vmemranges elements. + */ + XEN_GUEST_HANDLE_64(xen_vmemrange_t) vmemrange; +}; +typedef struct xen_domctl_vnuma xen_domctl_vnuma_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t); + +struct xen_domctl_psr_cmt_op { +#define XEN_DOMCTL_PSR_CMT_OP_DETACH 0 +#define XEN_DOMCTL_PSR_CMT_OP_ATTACH 1 +#define XEN_DOMCTL_PSR_CMT_OP_QUERY_RMID 2 + uint32_t cmd; + uint32_t data; +}; +typedef struct xen_domctl_psr_cmt_op xen_domctl_psr_cmt_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cmt_op_t); + +/* XEN_DOMCTL_MONITOR_* + * + * Enable/disable monitoring various VM events. + * This domctl configures what events will be reported to helper apps + * via the ring buffer "MONITOR". The ring has to be first enabled + * with the domctl XEN_DOMCTL_VM_EVENT_OP_MONITOR. + * + * GET_CAPABILITIES can be used to determine which of these features is + * available on a given platform. + * + * NOTICE: mem_access events are also delivered via the "MONITOR" ring buffer; + * however, enabling/disabling those events is performed with the use of + * memory_op hypercalls! + */ +#define XEN_DOMCTL_MONITOR_OP_ENABLE 0 +#define XEN_DOMCTL_MONITOR_OP_DISABLE 1 +#define XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES 2 + +#define XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG 0 +#define XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR 1 +#define XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP 2 +#define XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT 3 +#define XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST 4 + +struct xen_domctl_monitor_op { + uint32_t op; /* XEN_DOMCTL_MONITOR_OP_* */ + + /* + * When used with ENABLE/DISABLE this has to be set to + * the requested XEN_DOMCTL_MONITOR_EVENT_* value. + * With GET_CAPABILITIES this field returns a bitmap of + * events supported by the platform, in the format + * (1 << XEN_DOMCTL_MONITOR_EVENT_*). + */ + uint32_t event; + + /* + * Further options when issuing XEN_DOMCTL_MONITOR_OP_ENABLE. 
+ */ + union { + struct { + /* Which control register */ + uint8_t index; + /* Pause vCPU until response */ + uint8_t sync; + /* Send event only on a change of value */ + uint8_t onchangeonly; + } mov_to_cr; + + struct { + /* Enable the capture of an extended set of MSRs */ + uint8_t extended_capture; + } mov_to_msr; + + struct { + /* Pause vCPU until response */ + uint8_t sync; + } guest_request; + } u; +}; +typedef struct xen_domctl_monitor_op xen_domctl_monitor_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_monitor_op_t); + +struct xen_domctl_psr_cat_op { +#define XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM 0 +#define XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM 1 + uint32_t cmd; /* IN: XEN_DOMCTL_PSR_CAT_OP_* */ + uint32_t target; /* IN */ + uint64_t data; /* IN/OUT */ +}; +typedef struct xen_domctl_psr_cat_op xen_domctl_psr_cat_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cat_op_t); + struct xen_domctl { uint32_t cmd; #define XEN_DOMCTL_createdomain 1 @@ -858,8 +1072,8 @@ #define XEN_DOMCTL_unpausedomain 4 #define XEN_DOMCTL_getdomaininfo 5 #define XEN_DOMCTL_getmemlist 6 -#define XEN_DOMCTL_getpageframeinfo 7 -#define XEN_DOMCTL_getpageframeinfo2 8 +/* #define XEN_DOMCTL_getpageframeinfo 7 Obsolete - use getpageframeinfo3 */ +/* #define XEN_DOMCTL_getpageframeinfo2 8 Obsolete - use getpageframeinfo3 */ #define XEN_DOMCTL_setvcpuaffinity 9 #define XEN_DOMCTL_shadow_op 10 #define XEN_DOMCTL_max_mem 11 @@ -874,10 +1088,10 @@ #define XEN_DOMCTL_iomem_permission 20 #define XEN_DOMCTL_ioport_permission 21 #define XEN_DOMCTL_hypercall_init 22 -#define XEN_DOMCTL_arch_setup 23 +#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */ #define XEN_DOMCTL_settimeoffset 24 #define XEN_DOMCTL_getvcpuaffinity 25 -#define XEN_DOMCTL_real_mode_area 26 +#define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */ #define XEN_DOMCTL_resumedomain 27 #define XEN_DOMCTL_sendtrigger 28 #define XEN_DOMCTL_subscribe 29 @@ -892,7 +1106,7 @@ #define XEN_DOMCTL_pin_mem_cacheattr 41 #define XEN_DOMCTL_set_ext_vcpucontext 42 #define XEN_DOMCTL_get_ext_vcpucontext 43 -#define XEN_DOMCTL_set_opt_feature 44 +#define XEN_DOMCTL_set_opt_feature 44 /* Obsolete IA64 only */ #define XEN_DOMCTL_test_assign_device 45 #define XEN_DOMCTL_set_target 46 #define XEN_DOMCTL_deassign_device 47 @@ -904,7 +1118,7 @@ #define XEN_DOMCTL_suppress_spurious_page_faults 53 #define XEN_DOMCTL_debug_op 54 #define XEN_DOMCTL_gethvmcontext_partial 55 -#define XEN_DOMCTL_mem_event_op 56 +#define XEN_DOMCTL_vm_event_op 56 #define XEN_DOMCTL_mem_sharing_op 57 #define XEN_DOMCTL_disable_migrate 58 #define XEN_DOMCTL_gettscinfo 59 @@ -915,6 +1129,17 @@ #define XEN_DOMCTL_set_access_required 64 #define XEN_DOMCTL_audit_p2m 65 #define XEN_DOMCTL_set_virq_handler 66 +#define XEN_DOMCTL_set_broken_page_p2m 67 +#define XEN_DOMCTL_setnodeaffinity 68 +#define XEN_DOMCTL_getnodeaffinity 69 +#define XEN_DOMCTL_set_max_evtchn 70 +#define XEN_DOMCTL_cacheflush 71 +#define XEN_DOMCTL_get_vcpu_msrs 72 +#define XEN_DOMCTL_set_vcpu_msrs 73 +#define XEN_DOMCTL_setvnumainfo 74 +#define XEN_DOMCTL_psr_cmt_op 75 +#define XEN_DOMCTL_monitor_op 77 +#define XEN_DOMCTL_psr_cat_op 78 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -925,9 +1150,8 @@ struct xen_domctl_createdomain createdomain; struct xen_domctl_getdomaininfo getdomaininfo; struct xen_domctl_getmemlist getmemlist; - struct xen_domctl_getpageframeinfo getpageframeinfo; - struct xen_domctl_getpageframeinfo2 getpageframeinfo2; struct 
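Per the XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES comment above, the returned 'event' field is a bitmap in the form (1 << XEN_DOMCTL_MONITOR_EVENT_*). Here is a minimal sketch of testing that bitmap before enabling an event; the capability value below is made up rather than fetched from Xen:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from domctl.h above. */
    #define XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP          2
    #define XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT 3

    int main(void)
    {
        /* In real code this bitmap comes back in the 'event' field of a
         * GET_CAPABILITIES call; the value here is invented for the demo. */
        uint32_t caps = (1u << XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP) |
                        (1u << XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT);

        if (caps & (1u << XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP))
            printf("singlestep monitoring supported on this platform\n");
        return 0;
    }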
xen_domctl_getpageframeinfo3 getpageframeinfo3; + struct xen_domctl_nodeaffinity nodeaffinity; struct xen_domctl_vcpuaffinity vcpuaffinity; struct xen_domctl_shadow_op shadow_op; struct xen_domctl_max_mem max_mem; @@ -941,11 +1165,9 @@ struct xen_domctl_iomem_permission iomem_permission; struct xen_domctl_ioport_permission ioport_permission; struct xen_domctl_hypercall_init hypercall_init; - struct xen_domctl_arch_setup arch_setup; struct xen_domctl_settimeoffset settimeoffset; struct xen_domctl_disable_migrate disable_migrate; struct xen_domctl_tsc_info tsc_info; - struct xen_domctl_real_mode_area real_mode_area; struct xen_domctl_hvmcontext hvmcontext; struct xen_domctl_hvmcontext_partial hvmcontext_partial; struct xen_domctl_address_size address_size; @@ -957,22 +1179,29 @@ struct xen_domctl_ioport_mapping ioport_mapping; struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr; struct xen_domctl_ext_vcpucontext ext_vcpucontext; - struct xen_domctl_set_opt_feature set_opt_feature; struct xen_domctl_set_target set_target; struct xen_domctl_subscribe subscribe; struct xen_domctl_debug_op debug_op; - struct xen_domctl_mem_event_op mem_event_op; + struct xen_domctl_vm_event_op vm_event_op; struct xen_domctl_mem_sharing_op mem_sharing_op; #if defined(__i386__) || defined(__x86_64__) struct xen_domctl_cpuid cpuid; struct xen_domctl_vcpuextstate vcpuextstate; + struct xen_domctl_vcpu_msrs vcpu_msrs; #endif struct xen_domctl_set_access_required access_required; struct xen_domctl_audit_p2m audit_p2m; struct xen_domctl_set_virq_handler set_virq_handler; + struct xen_domctl_set_max_evtchn set_max_evtchn; struct xen_domctl_gdbsx_memio gdbsx_guest_memio; + struct xen_domctl_set_broken_page_p2m set_broken_page_p2m; + struct xen_domctl_cacheflush cacheflush; struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu; struct xen_domctl_gdbsx_domstatus gdbsx_domstatus; + struct xen_domctl_vnuma vnuma; + struct xen_domctl_psr_cmt_op psr_cmt_op; + struct xen_domctl_monitor_op monitor_op; + struct xen_domctl_psr_cat_op psr_cat_op; uint8_t pad[128]; } u; }; @@ -984,7 +1213,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/elfnote.h =================================================================== --- trunk/sys/xen/interface/elfnote.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/elfnote.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -29,6 +29,8 @@ #define __XEN_PUBLIC_ELFNOTE_H__ /* + * `incontents 200 elfnotes ELF notes + * * The notes should live in a PT_NOTE segment and have "Xen" in the * name field. * @@ -37,6 +39,9 @@ * * LEGACY indicated the fields in the legacy __xen_guest string which * this a note type replaces. + * + * String values (for non-legacy) are NULL terminated ASCII, also known + * as ASCIZ type. */ /* @@ -67,8 +72,8 @@ #define XEN_ELFNOTE_VIRT_BASE 3 /* - * The offset of the ELF paddr field from the acutal required - * psuedo-physical address (numeric). + * The offset of the ELF paddr field from the actual required + * pseudo-physical address (numeric). * * This is used to maintain backwards compatibility with older kernels * which wrote __PAGE_OFFSET into that field. This field defaults to 0 @@ -159,6 +164,9 @@ /* * Whether or not the guest supports cooperative suspend cancellation. + * This is a numeric value. 
+ * + * Default is 0 */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 @@ -256,7 +264,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Added: trunk/sys/xen/interface/errno.h =================================================================== --- trunk/sys/xen/interface/errno.h (rev 0) +++ trunk/sys/xen/interface/errno.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -0,0 +1,96 @@ +/* $MidnightBSD$ */ +#ifndef __XEN_PUBLIC_ERRNO_H__ + +#ifndef __ASSEMBLY__ + +#define XEN_ERRNO(name, value) XEN_##name = value, +enum xen_errno { + +#else /* !__ASSEMBLY__ */ + +#define XEN_ERRNO(name, value) .equ XEN_##name, value + +#endif /* __ASSEMBLY__ */ + +/* ` enum neg_errnoval { [ -Efoo for each Efoo in the list below ] } */ +/* ` enum errnoval { */ + +#endif /* __XEN_PUBLIC_ERRNO_H__ */ + +#ifdef XEN_ERRNO + +/* + * Values originating from x86 Linux. Please consider using respective + * values when adding new definitions here. + * + * The set of identifiers to be added here shouldn't extend beyond what + * POSIX mandates (see e.g. + * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html) + * with the exception that we support some optional (XSR) values + * specified there (but no new ones should be added). + */ + +XEN_ERRNO(EPERM, 1) /* Operation not permitted */ +XEN_ERRNO(ENOENT, 2) /* No such file or directory */ +XEN_ERRNO(ESRCH, 3) /* No such process */ +#ifdef __XEN__ /* Internal only, should never be exposed to the guest. */ +XEN_ERRNO(EINTR, 4) /* Interrupted system call */ +#endif +XEN_ERRNO(EIO, 5) /* I/O error */ +XEN_ERRNO(ENXIO, 6) /* No such device or address */ +XEN_ERRNO(E2BIG, 7) /* Arg list too long */ +XEN_ERRNO(ENOEXEC, 8) /* Exec format error */ +XEN_ERRNO(EBADF, 9) /* Bad file number */ +XEN_ERRNO(ECHILD, 10) /* No child processes */ +XEN_ERRNO(EAGAIN, 11) /* Try again */ +XEN_ERRNO(ENOMEM, 12) /* Out of memory */ +XEN_ERRNO(EACCES, 13) /* Permission denied */ +XEN_ERRNO(EFAULT, 14) /* Bad address */ +XEN_ERRNO(EBUSY, 16) /* Device or resource busy */ +XEN_ERRNO(EEXIST, 17) /* File exists */ +XEN_ERRNO(EXDEV, 18) /* Cross-device link */ +XEN_ERRNO(ENODEV, 19) /* No such device */ +XEN_ERRNO(EINVAL, 22) /* Invalid argument */ +XEN_ERRNO(ENFILE, 23) /* File table overflow */ +XEN_ERRNO(EMFILE, 24) /* Too many open files */ +XEN_ERRNO(ENOSPC, 28) /* No space left on device */ +XEN_ERRNO(EMLINK, 31) /* Too many links */ +XEN_ERRNO(EDOM, 33) /* Math argument out of domain of func */ +XEN_ERRNO(ERANGE, 34) /* Math result not representable */ +XEN_ERRNO(EDEADLK, 35) /* Resource deadlock would occur */ +XEN_ERRNO(ENAMETOOLONG, 36) /* File name too long */ +XEN_ERRNO(ENOLCK, 37) /* No record locks available */ +XEN_ERRNO(ENOSYS, 38) /* Function not implemented */ +XEN_ERRNO(ENODATA, 61) /* No data available */ +XEN_ERRNO(ETIME, 62) /* Timer expired */ +XEN_ERRNO(EBADMSG, 74) /* Not a data message */ +XEN_ERRNO(EOVERFLOW, 75) /* Value too large for defined data type */ +XEN_ERRNO(EILSEQ, 84) /* Illegal byte sequence */ +#ifdef __XEN__ /* Internal only, should never be exposed to the guest. 
*/ +XEN_ERRNO(ERESTART, 85) /* Interrupted system call should be restarted */ +#endif +XEN_ERRNO(ENOTSOCK, 88) /* Socket operation on non-socket */ +XEN_ERRNO(EOPNOTSUPP, 95) /* Operation not supported on transport endpoint */ +XEN_ERRNO(EADDRINUSE, 98) /* Address already in use */ +XEN_ERRNO(EADDRNOTAVAIL, 99) /* Cannot assign requested address */ +XEN_ERRNO(ENOBUFS, 105) /* No buffer space available */ +XEN_ERRNO(EISCONN, 106) /* Transport endpoint is already connected */ +XEN_ERRNO(ENOTCONN, 107) /* Transport endpoint is not connected */ +XEN_ERRNO(ETIMEDOUT, 110) /* Connection timed out */ + +#undef XEN_ERRNO +#endif /* XEN_ERRNO */ + +#ifndef __XEN_PUBLIC_ERRNO_H__ +#define __XEN_PUBLIC_ERRNO_H__ + +/* ` } */ + +#ifndef __ASSEMBLY__ +}; +#endif + +#define XEN_EWOULDBLOCK XEN_EAGAIN /* Operation would block */ +#define XEN_EDEADLOCK XEN_EDEADLK /* Resource deadlock would occur */ + +#endif /* __XEN_PUBLIC_ERRNO_H__ */ Property changes on: trunk/sys/xen/interface/errno.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/event_channel.h =================================================================== --- trunk/sys/xen/interface/event_channel.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/event_channel.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -72,13 +72,13 @@ #define EVTCHNOP_bind_vcpu 8 #define EVTCHNOP_unmask 9 #define EVTCHNOP_reset 10 +#define EVTCHNOP_init_control 11 +#define EVTCHNOP_expand_array 12 +#define EVTCHNOP_set_priority 13 /* ` } */ -#ifndef __XEN_EVTCHN_PORT_DEFINED__ typedef uint32_t evtchn_port_t; DEFINE_XEN_GUEST_HANDLE(evtchn_port_t); -#define __XEN_EVTCHN_PORT_DEFINED__ 1 -#endif /* * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as @@ -102,6 +102,17 @@ * a port that is unbound and marked as accepting bindings from the calling * domain. A fresh port is allocated in the calling domain and returned as * <local_port>. + * + * In case the peer domain has already tried to set our event channel + * pending, before it was bound, EVTCHNOP_bind_interdomain always sets + * the local event channel pending. + * + * The usual pattern of use, in the guest's upcall (or subsequent + * handler) is as follows: (Re-enable the event channel for subsequent + * signalling and then) check for the existence of whatever condition + * is being waited for by other means, and take whatever action is + * needed (if any). + * * NOTES: * 1. <remote_dom> may be DOMID_SELF, allowing loopback connections. */ @@ -254,6 +265,10 @@ * NOTES: * 1. <dom> may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF. + * 3. Destroys all control blocks and event array, resets event channel + * operations to 2-level ABI if called with <dom> == DOMID_SELF and FIFO + * ABI was used. Guests should not bind events during EVTCHNOP_reset call + * as these events are likely to be lost. */ struct evtchn_reset { /* IN parameters. */ @@ -262,6 +277,43 @@ typedef struct evtchn_reset evtchn_reset_t; /* + * EVTCHNOP_init_control: initialize the control block for the FIFO ABI. + * + * Note: any events that are currently pending will not be resent and + * will be lost. Guests should call this before binding any event to + * avoid losing any events. 
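The new errno.h is built around an X-macro: each consumer defines XEN_ERRNO(name, value) to taste and then expands the list, which is how the same file can yield a C enum or assembler .equ definitions. The standalone sketch below copies a few entries from that list to show the idea; it uses a local XEN_ERRNO_LIST macro instead of the header's re-inclusion trick, so treat it as an adaptation of the pattern rather than the header's exact mechanism:

    #include <stdio.h>

    /* A few entries copied from errno.h above. */
    #define XEN_ERRNO_LIST      \
        XEN_ERRNO(EPERM,   1)   \
        XEN_ERRNO(ENOENT,  2)   \
        XEN_ERRNO(EIO,     5)   \
        XEN_ERRNO(ENOMEM, 12)

    /* Expansion 1: an enum, as the C side of the header produces. */
    #define XEN_ERRNO(name, value) XEN_##name = value,
    enum xen_errno { XEN_ERRNO_LIST };
    #undef XEN_ERRNO

    /* Expansion 2: a name table, the sort of thing a tool might build. */
    #define XEN_ERRNO(name, value) [value] = #name,
    static const char *xen_errno_name[] = { XEN_ERRNO_LIST };
    #undef XEN_ERRNO

    int main(void)
    {
        printf("%d -> %s\n", XEN_ENOMEM, xen_errno_name[XEN_ENOMEM]);
        return 0;
    }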
+ */ +struct evtchn_init_control { + /* IN parameters. */ + uint64_t control_gfn; + uint32_t offset; + uint32_t vcpu; + /* OUT parameters. */ + uint8_t link_bits; + uint8_t _pad[7]; +}; +typedef struct evtchn_init_control evtchn_init_control_t; + +/* + * EVTCHNOP_expand_array: add an additional page to the event array. + */ +struct evtchn_expand_array { + /* IN parameters. */ + uint64_t array_gfn; +}; +typedef struct evtchn_expand_array evtchn_expand_array_t; + +/* + * EVTCHNOP_set_priority: set the priority for an event channel. + */ +struct evtchn_set_priority { + /* IN parameters. */ + uint32_t port; + uint32_t priority; +}; +typedef struct evtchn_set_priority evtchn_set_priority_t; + +/* * ` enum neg_errnoval * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op) * ` @@ -285,12 +337,48 @@ typedef struct evtchn_op evtchn_op_t; DEFINE_XEN_GUEST_HANDLE(evtchn_op_t); +/* + * 2-level ABI + */ + +#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) + +/* + * FIFO ABI + */ + +/* Events may have priorities from 0 (highest) to 15 (lowest). */ +#define EVTCHN_FIFO_PRIORITY_MAX 0 +#define EVTCHN_FIFO_PRIORITY_DEFAULT 7 +#define EVTCHN_FIFO_PRIORITY_MIN 15 + +#define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1) + +typedef uint32_t event_word_t; + +#define EVTCHN_FIFO_PENDING 31 +#define EVTCHN_FIFO_MASKED 30 +#define EVTCHN_FIFO_LINKED 29 +#define EVTCHN_FIFO_BUSY 28 + +#define EVTCHN_FIFO_LINK_BITS 17 +#define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1) + +#define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS) + +struct evtchn_fifo_control_block { + uint32_t ready; + uint32_t _rsvd; + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; +typedef struct evtchn_fifo_control_block evtchn_fifo_control_block_t; + #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/features.h =================================================================== --- trunk/sys/xen/interface/features.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/features.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -29,6 +29,20 @@ #define __XEN_PUBLIC_FEATURES_H__ /* + * `incontents 200 elfnotes_features XEN_ELFNOTE_FEATURES + * + * The list of all the features the guest supports. They are set by + * parsing the XEN_ELFNOTE_FEATURES and XEN_ELFNOTE_SUPPORTED_FEATURES + * string. The format is the feature names (as given here without the + * "XENFEAT_" prefix) separated by '|' characters. + * If a feature is required for the kernel to function then the feature name + * must be preceded by a '!' character. + * + * Note that if XEN_ELFNOTE_SUPPORTED_FEATURES is used, then in the + * XENFEAT_dom0 MUST be set if the guest is to be booted as dom0, + */ + +/* * If set, the guest does not need to write-protect its pagetables, and can * update them via direct writes. */ @@ -81,6 +95,14 @@ /* operation as Dom0 is supported */ #define XENFEAT_dom0 11 +/* Xen also maps grant references at pfn = mfn. + * This feature flag is deprecated and should not be used. +#define XENFEAT_grant_map_identity 12 + */ + +/* Guest can use XENMEMF_vnode to specify virtual node for memory op. 
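With the FIFO event channel ABI added above, each channel is a 32-bit event word whose top bits are PENDING/MASKED/LINKED/BUSY and whose low 17 bits link to the next port in the queue. The sketch below decodes such a word using constants mirrored from the header; in a real guest these bits are read and updated atomically, which is omitted here:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from event_channel.h above. */
    typedef uint32_t event_word_t;
    #define EVTCHN_FIFO_PENDING    31
    #define EVTCHN_FIFO_MASKED     30
    #define EVTCHN_FIFO_LINKED     29
    #define EVTCHN_FIFO_LINK_BITS  17
    #define EVTCHN_FIFO_LINK_MASK  ((1 << EVTCHN_FIFO_LINK_BITS) - 1)

    int main(void)
    {
        /* A made-up event word: pending, unmasked, linked to port 42. */
        event_word_t w = (1u << EVTCHN_FIFO_PENDING) |
                         (1u << EVTCHN_FIFO_LINKED) | 42;

        printf("pending=%u masked=%u link=%u\n",
               (unsigned)((w >> EVTCHN_FIFO_PENDING) & 1),
               (unsigned)((w >> EVTCHN_FIFO_MASKED) & 1),
               (unsigned)(w & EVTCHN_FIFO_LINK_MASK));
        return 0;
    }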
*/ +#define XENFEAT_memory_op_vnode_supported 13 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ @@ -88,7 +110,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Added: trunk/sys/xen/interface/gcov.h =================================================================== --- trunk/sys/xen/interface/gcov.h (rev 0) +++ trunk/sys/xen/interface/gcov.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -0,0 +1,116 @@ +/* $MidnightBSD$ */ +/****************************************************************************** + * gcov.h + * + * Coverage structures exported by Xen. + * Structure is different from Gcc one. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2013, Citrix Systems R&D Ltd. + */ + +#ifndef __XEN_PUBLIC_GCOV_H__ +#define __XEN_PUBLIC_GCOV_H__ __XEN_PUBLIC_GCOV_H__ + +#define XENCOV_COUNTERS 5 +#define XENCOV_TAG_BASE 0x58544300u +#define XENCOV_TAG_FILE (XENCOV_TAG_BASE+0x46u) +#define XENCOV_TAG_FUNC (XENCOV_TAG_BASE+0x66u) +#define XENCOV_TAG_COUNTER(n) (XENCOV_TAG_BASE+0x30u+((n)&0xfu)) +#define XENCOV_TAG_END (XENCOV_TAG_BASE+0x2eu) +#define XENCOV_IS_TAG_COUNTER(n) \ + ((n) >= XENCOV_TAG_COUNTER(0) && (n) < XENCOV_TAG_COUNTER(XENCOV_COUNTERS)) +#define XENCOV_COUNTER_NUM(n) ((n)-XENCOV_TAG_COUNTER(0)) + +/* + * The main structure for the blob is + * BLOB := FILE.. END + * FILE := TAG_FILE VERSION STAMP FILENAME COUNTERS FUNCTIONS + * FILENAME := LEN characters + * characters are padded to 32 bit + * LEN := 32 bit value + * COUNTERS := TAG_COUNTER(n) NUM COUNTER.. + * NUM := 32 bit valie + * COUNTER := 64 bit value + * FUNCTIONS := TAG_FUNC NUM FUNCTION.. 
+ * FUNCTION := IDENT CHECKSUM NUM_COUNTERS + * + * All tagged structures are aligned to 8 bytes + */ + +/** + * File information + * Prefixed with XENCOV_TAG_FILE and a string with filename + * Aligned to 8 bytes + */ +struct xencov_file +{ + uint32_t tag; /* XENCOV_TAG_FILE */ + uint32_t version; + uint32_t stamp; + uint32_t fn_len; + char filename[1]; +}; + + +/** + * Counters information + * Prefixed with XENCOV_TAG_COUNTER(n) where n is 0..(XENCOV_COUNTERS-1) + * Aligned to 8 bytes + */ +struct xencov_counter +{ + uint32_t tag; /* XENCOV_TAG_COUNTER(n) */ + uint32_t num; + uint64_t values[1]; +}; + +/** + * Information for each function + * Number of counter is equal to the number of counter structures got before + */ +struct xencov_function +{ + uint32_t ident; + uint32_t checksum; + uint32_t num_counters[1]; +}; + +/** + * Information for all functions + * Aligned to 8 bytes + */ +struct xencov_functions +{ + uint32_t tag; /* XENCOV_TAG_FUNC */ + uint32_t num; + struct xencov_function xencov_function[1]; +}; + +/** + * Terminator + */ +struct xencov_end +{ + uint32_t tag; /* XENCOV_TAG_END */ +}; + +#endif /* __XEN_PUBLIC_GCOV_H__ */ + Property changes on: trunk/sys/xen/interface/gcov.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/grant_table.h =================================================================== --- trunk/sys/xen/interface/grant_table.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/grant_table.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -135,8 +135,10 @@ /* The domain being granted foreign privileges. [GST] */ domid_t domid; /* - * GTF_permit_access: Frame that @domid is allowed to map and access. [GST] - * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN] + * GTF_permit_access: GFN that @domid is allowed to map and access. [GST] + * GTF_accept_transfer: GFN that @domid is allowed to transfer into. [GST] + * GTF_transfer_completed: MFN whose ownership transferred by @domid + * (non-translated guests only). [XEN] */ uint32_t frame; }; @@ -310,6 +312,7 @@ #define GNTTABOP_get_status_frames 9 #define GNTTABOP_get_version 10 #define GNTTABOP_swap_grant_ref 11 +#define GNTTABOP_cache_flush 12 #endif /* __XEN_INTERFACE_VERSION__ */ /* ` } */ @@ -321,7 +324,7 @@ /* * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access * by devices and/or host CPUs. If successful, <handle> is a tracking number - * that must be presented later to destroy the mapping(s). On error, <handle> + * that must be presented later to destroy the mapping(s). On error, <status> * is a negative status code. * NOTES: * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address @@ -386,7 +389,11 @@ uint32_t nr_frames; /* OUT parameters. 
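The coverage blob described in gcov.h is a sequence of tagged records. The following sketch mirrors the XENCOV_* tag macros and shows how a reader could classify a record tag; the tag value being decoded is invented for the example:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from gcov.h above. */
    #define XENCOV_COUNTERS          5
    #define XENCOV_TAG_BASE          0x58544300u
    #define XENCOV_TAG_FILE          (XENCOV_TAG_BASE+0x46u)
    #define XENCOV_TAG_COUNTER(n)    (XENCOV_TAG_BASE+0x30u+((n)&0xfu))
    #define XENCOV_IS_TAG_COUNTER(n) \
        ((n) >= XENCOV_TAG_COUNTER(0) && (n) < XENCOV_TAG_COUNTER(XENCOV_COUNTERS))
    #define XENCOV_COUNTER_NUM(n)    ((n)-XENCOV_TAG_COUNTER(0))

    int main(void)
    {
        uint32_t tag = XENCOV_TAG_COUNTER(2);   /* a made-up tag to decode */

        if (tag == XENCOV_TAG_FILE)
            printf("file record\n");
        else if (XENCOV_IS_TAG_COUNTER(tag))
            printf("counter record #%u\n", (unsigned)XENCOV_COUNTER_NUM(tag));
        return 0;
    }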
*/ int16_t status; /* => enum grant_status */ +#if __XEN_INTERFACE_VERSION__ < 0x00040300 XEN_GUEST_HANDLE(ulong) frame_list; +#else + XEN_GUEST_HANDLE(xen_pfn_t) frame_list; +#endif }; typedef struct gnttab_setup_table gnttab_setup_table_t; DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t); @@ -446,12 +453,10 @@ #define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref) #define _GNTCOPY_dest_gref (1) #define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref) -#define _GNTCOPY_can_fail (2) -#define GNTCOPY_can_fail (1<<_GNTCOPY_can_fail) struct gnttab_copy { /* IN parameters. */ - struct { + struct gnttab_copy_ptr { union { grant_ref_t ref; xen_pfn_t gmfn; @@ -573,6 +578,25 @@ typedef struct gnttab_swap_grant_ref gnttab_swap_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_swap_grant_ref_t); +/* + * Issue one or more cache maintenance operations on a portion of a + * page granted to the calling domain by a foreign domain. + */ +struct gnttab_cache_flush { + union { + uint64_t dev_bus_addr; + grant_ref_t ref; + } a; + uint16_t offset; /* offset from start of grant */ + uint16_t length; /* size within the grant */ +#define GNTTAB_CACHE_CLEAN (1<<0) +#define GNTTAB_CACHE_INVAL (1<<1) +#define GNTTAB_CACHE_SOURCE_GREF (1<<31) + uint32_t op; +}; +typedef struct gnttab_cache_flush gnttab_cache_flush_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_cache_flush_t); + #endif /* __XEN_INTERFACE_VERSION__ */ /* @@ -653,7 +677,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/kexec.h =================================================================== --- trunk/sys/xen/interface/kexec.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/kexec.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -98,9 +98,6 @@ #if defined(__i386__) || defined(__x86_64__) unsigned long page_list[KEXEC_XEN_NO_PAGES]; #endif -#if defined(__ia64__) - unsigned long reboot_code_buffer; -#endif unsigned long indirection_page; unsigned long start_address; } xen_kexec_image_t; @@ -109,6 +106,20 @@ * Perform kexec having previously loaded a kexec or kdump kernel * as appropriate. * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] + * + * Control is transferred to the image entry point with the host in + * the following state. + * + * - The image may be executed on any PCPU and all other PCPUs are + * stopped. + * + * - Local interrupts are disabled. + * + * - Register values are undefined. + * + * - The image segments have writeable 1:1 virtual to machine + * mappings. The location of any page tables is undefined and these + * page table frames are not be mapped. 
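GNTTABOP_cache_flush above lets a domain clean and/or invalidate part of a granted page. A sketch of filling the request for a sub-range of a grant reference follows; the structure is a local mirror (grant_ref_t is a 32-bit value in grant_table.h), and the ref/offset/length values are arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrored from grant_table.h above. */
    typedef uint32_t grant_ref_t;
    #define GNTTAB_CACHE_CLEAN       (1u << 0)
    #define GNTTAB_CACHE_INVAL       (1u << 1)
    #define GNTTAB_CACHE_SOURCE_GREF (1u << 31)

    struct cache_flush {
        union { uint64_t dev_bus_addr; grant_ref_t ref; } a;
        uint16_t offset;   /* offset from start of grant */
        uint16_t length;   /* size within the grant */
        uint32_t op;
    };

    int main(void)
    {
        /* Clean and invalidate 512 bytes at offset 0x100 of grant ref 7. */
        struct cache_flush op = {
            .a.ref  = 7,
            .offset = 0x100,
            .length = 512,
            .op     = GNTTAB_CACHE_CLEAN | GNTTAB_CACHE_INVAL |
                      GNTTAB_CACHE_SOURCE_GREF,
        };
        printf("op=0x%08x ref=%u\n", (unsigned)op.op, (unsigned)op.a.ref);
        return 0;
    }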
*/ #define KEXEC_CMD_kexec 0 typedef struct xen_kexec_exec { @@ -120,12 +131,12 @@ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] * image == relocation information for kexec (ignored for unload) [in] */ -#define KEXEC_CMD_kexec_load 1 -#define KEXEC_CMD_kexec_unload 2 -typedef struct xen_kexec_load { +#define KEXEC_CMD_kexec_load_v1 1 /* obsolete since 0x00040400 */ +#define KEXEC_CMD_kexec_unload_v1 2 /* obsolete since 0x00040400 */ +typedef struct xen_kexec_load_v1 { int type; xen_kexec_image_t image; -} xen_kexec_load_t; +} xen_kexec_load_v1_t; #define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */ #define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */ @@ -135,7 +146,7 @@ * to Xen it exists in a separate EFI * region on ia64, and thus needs to be * inserted into iomem_machine separately */ -#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of +#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* Obsolete: machine address and size of * the ia64_boot_param */ #define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of * of the EFI Memory Map */ @@ -156,12 +167,82 @@ unsigned long start; } xen_kexec_range_t; +#if __XEN_INTERFACE_VERSION__ >= 0x00040400 +/* + * A contiguous chunk of a kexec image and it's destination machine + * address. + */ +typedef struct xen_kexec_segment { + union { + XEN_GUEST_HANDLE(const_void) h; + uint64_t _pad; + } buf; + uint64_t buf_size; + uint64_t dest_maddr; + uint64_t dest_size; +} xen_kexec_segment_t; +DEFINE_XEN_GUEST_HANDLE(xen_kexec_segment_t); + +/* + * Load a kexec image into memory. + * + * For KEXEC_TYPE_DEFAULT images, the segments may be anywhere in RAM. + * The image is relocated prior to being executed. + * + * For KEXEC_TYPE_CRASH images, each segment of the image must reside + * in the memory region reserved for kexec (KEXEC_RANGE_MA_CRASH) and + * the entry point must be within the image. The caller is responsible + * for ensuring that multiple images do not overlap. + * + * All image segments will be loaded to their destination machine + * addresses prior to being executed. The trailing portion of any + * segments with a source buffer (from dest_maddr + buf_size to + * dest_maddr + dest_size) will be zeroed. + * + * Segments with no source buffer will be accessible to the image when + * it is executed. + */ + +#define KEXEC_CMD_kexec_load 4 +typedef struct xen_kexec_load { + uint8_t type; /* One of KEXEC_TYPE_* */ + uint8_t _pad; + uint16_t arch; /* ELF machine type (EM_*). */ + uint32_t nr_segments; + union { + XEN_GUEST_HANDLE(xen_kexec_segment_t) h; + uint64_t _pad; + } segments; + uint64_t entry_maddr; /* image entry point machine address. */ +} xen_kexec_load_t; +DEFINE_XEN_GUEST_HANDLE(xen_kexec_load_t); + +/* + * Unload a kexec image. + * + * Type must be one of KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH. 
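The new KEXEC_CMD_kexec_load comment above states that, for a segment with a source buffer, the bytes from dest_maddr + buf_size up to dest_maddr + dest_size are zero-filled. This small sketch, using a reduced local copy of xen_kexec_segment (the guest handle for the source buffer is left out), prints which bytes are payload and which are zeroed for made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* Follows the layout of xen_kexec_segment above, minus the buf handle. */
    struct kexec_segment {
        uint64_t buf_size;    /* bytes provided by the source buffer */
        uint64_t dest_maddr;  /* destination machine address */
        uint64_t dest_size;   /* total size reserved at the destination */
    };

    int main(void)
    {
        struct kexec_segment seg = {
            .buf_size   = 0x1000,      /* 4 KiB of payload */
            .dest_maddr = 0x100000,
            .dest_size  = 0x4000,      /* 16 KiB destination window */
        };

        printf("payload: 0x%llx - 0x%llx\n",
               (unsigned long long)seg.dest_maddr,
               (unsigned long long)(seg.dest_maddr + seg.buf_size - 1));
        printf("zeroed:  0x%llx - 0x%llx\n",
               (unsigned long long)(seg.dest_maddr + seg.buf_size),
               (unsigned long long)(seg.dest_maddr + seg.dest_size - 1));
        return 0;
    }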
+ */ +#define KEXEC_CMD_kexec_unload 5 +typedef struct xen_kexec_unload { + uint8_t type; +} xen_kexec_unload_t; +DEFINE_XEN_GUEST_HANDLE(xen_kexec_unload_t); + +#else /* __XEN_INTERFACE_VERSION__ < 0x00040400 */ + +#define KEXEC_CMD_kexec_load KEXEC_CMD_kexec_load_v1 +#define KEXEC_CMD_kexec_unload KEXEC_CMD_kexec_unload_v1 +#define xen_kexec_load xen_kexec_load_v1 +#define xen_kexec_load_t xen_kexec_load_v1_t + +#endif + #endif /* _XEN_PUBLIC_KEXEC_H */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/memory.h =================================================================== --- trunk/sys/xen/interface/memory.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/memory.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -29,6 +29,7 @@ #define __XEN_PUBLIC_MEMORY_H__ #include "xen.h" +#include "physdev.h" /* * Increase or decrease the specified domain's memory reservation. Returns the @@ -56,6 +57,8 @@ /* Flag to request allocation only from the node specified */ #define XENMEMF_exact_node_request (1<<17) #define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request) +/* Flag to indicate the node specified is virtual node */ +#define XENMEMF_vnode (1<<18) #endif struct xen_memory_reservation { @@ -69,6 +72,8 @@ * IN: GPFN bases of extents to populate with memory * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) + * XENMEM_claim_pages: + * IN: must be zero */ XEN_GUEST_HANDLE(xen_pfn_t) extent_start; @@ -186,6 +191,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t); /* + * For a compat caller, this is identical to XENMEM_machphys_mfn_list. + * + * For a non compat caller, this functions similarly to + * XENMEM_machphys_mfn_list, but returns the mfns making up the compatibility + * m2p table. + */ +#define XENMEM_machphys_compat_mfn_list 25 + +/* * Returns the location in virtual address space of the machine_to_phys * mapping table. Architectures which do not have a m2p table, or which do not * map it by default into guest address space, do not implement this command. @@ -199,6 +213,16 @@ typedef struct xen_machphys_mapping xen_machphys_mapping_t; DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t); +/* Source mapping space. */ +/* ` enum phys_map_space { */ +#define XENMAPSPACE_shared_info 0 /* shared info page */ +#define XENMAPSPACE_grant_table 1 /* grant table page */ +#define XENMAPSPACE_gmfn 2 /* GMFN */ +#define XENMAPSPACE_gmfn_range 3 /* GMFN range, XENMEM_add_to_physmap only. */ +#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another dom, + * XENMEM_add_to_physmap_batch only. */ +/* ` } */ + /* * Sets the GPFN at which a particular page appears in the specified guest's * pseudophysical address space. @@ -212,24 +236,52 @@ /* Number of pages to go through for gmfn_range */ uint16_t size; - /* Source mapping space. */ -#define XENMAPSPACE_shared_info 0 /* shared info page */ -#define XENMAPSPACE_grant_table 1 /* grant table page */ -#define XENMAPSPACE_gmfn 2 /* GMFN */ -#define XENMAPSPACE_gmfn_range 3 /* GMFN range */ - unsigned int space; + unsigned int space; /* => enum phys_map_space */ #define XENMAPIDX_grant_table_status 0x80000000 - /* Index into source mapping space. */ + /* Index into space being mapped. */ xen_ulong_t idx; - /* GPFN where the source mapping page should appear. */ + /* GPFN in domid where the source mapping page should appear. 
*/ xen_pfn_t gpfn; }; typedef struct xen_add_to_physmap xen_add_to_physmap_t; DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t); +/* A batched version of add_to_physmap. */ +#define XENMEM_add_to_physmap_batch 23 +struct xen_add_to_physmap_batch { + /* IN */ + /* Which domain to change the mapping for. */ + domid_t domid; + uint16_t space; /* => enum phys_map_space */ + + /* Number of pages to go through */ + uint16_t size; + domid_t foreign_domid; /* IFF gmfn_foreign */ + + /* Indexes into space being mapped. */ + XEN_GUEST_HANDLE(xen_ulong_t) idxs; + + /* GPFN in domid where the source mapping page should appear. */ + XEN_GUEST_HANDLE(xen_pfn_t) gpfns; + + /* OUT */ + + /* Per index error code. */ + XEN_GUEST_HANDLE(int) errs; +}; +typedef struct xen_add_to_physmap_batch xen_add_to_physmap_batch_t; +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_batch_t); + +#if __XEN_INTERFACE_VERSION__ < 0x00040400 +#define XENMEM_add_to_physmap_range XENMEM_add_to_physmap_batch +#define xen_add_to_physmap_range xen_add_to_physmap_batch +typedef struct xen_add_to_physmap_batch xen_add_to_physmap_range_t; +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_range_t); +#endif + /* * Unmaps the page appearing at a particular GPFN from the specified guest's * pseudophysical address space. @@ -324,13 +376,9 @@ #define XENMEM_paging_op_evict 1 #define XENMEM_paging_op_prep 2 -#define XENMEM_access_op 21 -#define XENMEM_access_op_resume 0 - -struct xen_mem_event_op { - uint8_t op; /* XENMEM_*_op_* */ +struct xen_mem_paging_op { + uint8_t op; /* XENMEM_paging_op_* */ domid_t domain; - /* PAGING_PREP IN: buffer to immediately fill page in */ uint64_aligned_t buffer; @@ -337,19 +385,69 @@ /* Other OPs */ uint64_aligned_t gfn; /* IN: gfn of page being operated on */ }; -typedef struct xen_mem_event_op xen_mem_event_op_t; -DEFINE_XEN_GUEST_HANDLE(xen_mem_event_op_t); +typedef struct xen_mem_paging_op xen_mem_paging_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_mem_paging_op_t); +#define XENMEM_access_op 21 +#define XENMEM_access_op_set_access 0 +#define XENMEM_access_op_get_access 1 +#define XENMEM_access_op_enable_emulate 2 +#define XENMEM_access_op_disable_emulate 3 + +typedef enum { + XENMEM_access_n, + XENMEM_access_r, + XENMEM_access_w, + XENMEM_access_rw, + XENMEM_access_x, + XENMEM_access_rx, + XENMEM_access_wx, + XENMEM_access_rwx, + /* + * Page starts off as r-x, but automatically + * change to r-w on a write + */ + XENMEM_access_rx2rw, + /* + * Log access: starts off as n, automatically + * goes to rwx, generating an event without + * pausing the vcpu + */ + XENMEM_access_n2rwx, + /* Take the domain default */ + XENMEM_access_default +} xenmem_access_t; + +struct xen_mem_access_op { + /* XENMEM_access_op_* */ + uint8_t op; + /* xenmem_access_t */ + uint8_t access; + domid_t domid; + /* + * Number of pages for set op + * Ignored on setting default access and other ops + */ + uint32_t nr; + /* + * First pfn for set op + * pfn for get op + * ~0ull is used to set and get the default access for pages + */ + uint64_aligned_t pfn; +}; +typedef struct xen_mem_access_op xen_mem_access_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t); + #define XENMEM_sharing_op 22 #define XENMEM_sharing_op_nominate_gfn 0 #define XENMEM_sharing_op_nominate_gref 1 #define XENMEM_sharing_op_share 2 -#define XENMEM_sharing_op_resume 3 -#define XENMEM_sharing_op_debug_gfn 4 -#define XENMEM_sharing_op_debug_mfn 5 -#define XENMEM_sharing_op_debug_gref 6 -#define XENMEM_sharing_op_add_physmap 7 -#define XENMEM_sharing_op_audit 8 +#define 
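XENMEM_add_to_physmap with XENMAPSPACE_shared_info is the usual way a guest asks for the shared info page to appear at a chosen GPFN. The sketch below populates a reduced local mirror of xen_add_to_physmap; the 64-bit widths and the DOMID_SELF value are assumptions taken from xen.h, the gpfn is arbitrary, and actually issuing the memory_op hypercall is omitted:

    #include <stdint.h>
    #include <stdio.h>

    /* Reduced local mirror of xen_add_to_physmap from memory.h above. */
    #define XENMAPSPACE_shared_info 0
    #define DOMID_SELF              0x7FF0   /* from xen.h */

    struct add_to_physmap {
        uint16_t domid;
        uint16_t size;        /* only used for gmfn_range */
        uint32_t space;       /* => enum phys_map_space */
        uint64_t idx;         /* index into the space being mapped */
        uint64_t gpfn;        /* where the page should appear in the guest */
    };

    int main(void)
    {
        /* Ask for the shared info page at guest pfn 0xfefff; a real guest
         * picks a free gpfn of its own. */
        struct add_to_physmap xatp = {
            .domid = DOMID_SELF,
            .space = XENMAPSPACE_shared_info,
            .idx   = 0,
            .gpfn  = 0xfefff,
        };
        printf("space=%u gpfn=0x%llx\n", (unsigned)xatp.space,
               (unsigned long long)xatp.gpfn);
        return 0;
    }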
XENMEM_sharing_op_debug_gfn 3 +#define XENMEM_sharing_op_debug_mfn 4 +#define XENMEM_sharing_op_debug_gref 5 +#define XENMEM_sharing_op_add_physmap 6 +#define XENMEM_sharing_op_audit 7 #define XENMEM_SHARING_OP_S_HANDLE_INVALID (-10) #define XENMEM_SHARING_OP_C_HANDLE_INVALID (-9) @@ -398,14 +496,127 @@ typedef struct xen_mem_sharing_op xen_mem_sharing_op_t; DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t); +/* + * Attempt to stake a claim for a domain on a quantity of pages + * of system RAM, but _not_ assign specific pageframes. Only + * arithmetic is performed so the hypercall is very fast and need + * not be preemptible, thus sidestepping time-of-check-time-of-use + * races for memory allocation. Returns 0 if the hypervisor page + * allocator has atomically and successfully claimed the requested + * number of pages, else non-zero. + * + * Any domain may have only one active claim. When sufficient memory + * has been allocated to resolve the claim, the claim silently expires. + * Claiming zero pages effectively resets any outstanding claim and + * is always successful. + * + * Note that a valid claim may be staked even after memory has been + * allocated for a domain. In this case, the claim is not incremental, + * i.e. if the domain's tot_pages is 3, and a claim is staked for 10, + * only 7 additional pages are claimed. + * + * Caller must be privileged or the hypercall fails. + */ +#define XENMEM_claim_pages 24 + +/* + * XENMEM_claim_pages flags - the are no flags at this time. + * The zero value is appropiate. + */ + +/* + * With some legacy devices, certain guest-physical addresses cannot safely + * be used for other purposes, e.g. to map guest RAM. This hypercall + * enumerates those regions so the toolstack can avoid using them. + */ +#define XENMEM_reserved_device_memory_map 27 +struct xen_reserved_device_memory { + xen_pfn_t start_pfn; + xen_ulong_t nr_pages; +}; +typedef struct xen_reserved_device_memory xen_reserved_device_memory_t; +DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_t); + +struct xen_reserved_device_memory_map { +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */ + /* IN */ + uint32_t flags; + /* + * IN/OUT + * + * Gets set to the required number of entries when too low, + * signaled by error code -ERANGE. + */ + unsigned int nr_entries; + /* OUT */ + XEN_GUEST_HANDLE(xen_reserved_device_memory_t) buffer; + /* IN */ + union { + struct physdev_pci_device pci; + } dev; +}; +typedef struct xen_reserved_device_memory_map xen_reserved_device_memory_map_t; +DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_map_t); + #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ +/* + * XENMEM_get_vnumainfo used by guest to get + * vNUMA topology from hypervisor. + */ +#define XENMEM_get_vnumainfo 26 + +/* vNUMA node memory ranges */ +struct xen_vmemrange { + uint64_t start, end; + unsigned int flags; + unsigned int nid; +}; +typedef struct xen_vmemrange xen_vmemrange_t; +DEFINE_XEN_GUEST_HANDLE(xen_vmemrange_t); + +/* + * vNUMA topology specifies vNUMA node number, distance table, + * memory ranges and vcpu mapping provided for guests. + * XENMEM_get_vnumainfo hypercall expects to see from guest + * nr_vnodes, nr_vmemranges and nr_vcpus to indicate available memory. + * After filling guests structures, nr_vnodes, nr_vmemranges and nr_vcpus + * copied back to guest. Domain returns expected values of nr_vnodes, + * nr_vmemranges and nr_vcpus to guest if the values where incorrect. 
+ */ +struct xen_vnuma_topology_info { + /* IN */ + domid_t domid; + uint16_t pad; + /* IN/OUT */ + unsigned int nr_vnodes; + unsigned int nr_vcpus; + unsigned int nr_vmemranges; + /* OUT */ + union { + XEN_GUEST_HANDLE(uint) h; + uint64_t pad; + } vdistance; + union { + XEN_GUEST_HANDLE(uint) h; + uint64_t pad; + } vcpu_to_vnode; + union { + XEN_GUEST_HANDLE(xen_vmemrange_t) h; + uint64_t pad; + } vmemrange; +}; +typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t; +DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t); + +/* Next available subop number is 28 */ + #endif /* __XEN_PUBLIC_MEMORY_H__ */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/nmi.h =================================================================== --- trunk/sys/xen/interface/nmi.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/nmi.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -37,9 +37,14 @@ /* I/O-check error reported via ISA port 0x61, bit 6. */ #define _XEN_NMIREASON_io_error 0 #define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error) + /* PCI SERR reported via ISA port 0x61, bit 7. */ +#define _XEN_NMIREASON_pci_serr 1 +#define XEN_NMIREASON_pci_serr (1UL << _XEN_NMIREASON_pci_serr) +#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */ /* Parity error reported via ISA port 0x61, bit 7. */ #define _XEN_NMIREASON_parity_error 1 #define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error) +#endif /* Unknown hardware-generated NMI. */ #define _XEN_NMIREASON_unknown 2 #define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown) @@ -73,7 +78,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/physdev.h =================================================================== --- trunk/sys/xen/interface/physdev.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/physdev.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -17,6 +17,8 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2006, Keir Fraser */ #ifndef __XEN_PUBLIC_PHYSDEV_H__ @@ -152,6 +154,7 @@ #define MAP_PIRQ_TYPE_GSI 0x1 #define MAP_PIRQ_TYPE_UNKNOWN 0x2 #define MAP_PIRQ_TYPE_MSI_SEG 0x3 +#define MAP_PIRQ_TYPE_MULTI_MSI 0x4 #define PHYSDEVOP_map_pirq 13 struct physdev_map_pirq { @@ -158,15 +161,15 @@ domid_t domid; /* IN */ int type; - /* IN */ + /* IN (ignored for ..._MULTI_MSI) */ int index; /* IN or OUT */ int pirq; - /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */ + /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */ int bus; /* IN */ int devfn; - /* IN */ + /* IN (also OUT for ..._MULTI_MSI) */ int entry_nr; /* IN */ uint64_t table_base; @@ -293,6 +296,11 @@ uint8_t bus; uint8_t devfn; } physfn; + /* + * Optional parameters array. + * First element ([0]) is PXM domain associated with the device (if + * XEN_PCI_DEV_PXM is set) + */ #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L uint32_t optarr[]; #elif defined(__GNUC__) @@ -304,6 +312,12 @@ #define PHYSDEVOP_pci_device_remove 26 #define PHYSDEVOP_restore_msi_ext 27 +/* + * Dom0 should use these two to announce MMIO resources assigned to + * MSI-X capable devices won't (prepare) or may (release) change. 
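/*
 * A dom0-side sketch of the PHYSDEVOP_prepare_msix/PHYSDEVOP_release_msix
 * pair described above and defined just below: tell Xen a device's MSI-X
 * MMIO resources will not change, and later that they may change again.
 * The HYPERVISOR_physdev_op() wrapper is an assumption about the guest
 * OS, not part of this header.
 */
#include <xen/xen-os.h>
#include <xen/interface/physdev.h>

extern int HYPERVISOR_physdev_op(int cmd, void *arg); /* assumed */

static int msix_freeze_resources(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    struct physdev_pci_device dev = {
        .seg = seg, .bus = bus, .devfn = devfn,
    };

    return HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &dev);
}

static int msix_thaw_resources(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    struct physdev_pci_device dev = {
        .seg = seg, .bus = bus, .devfn = devfn,
    };

    return HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, &dev);
}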
+ */ +#define PHYSDEVOP_prepare_msix 30 +#define PHYSDEVOP_release_msix 31 struct physdev_pci_device { /* IN */ uint16_t seg; @@ -313,6 +327,24 @@ typedef struct physdev_pci_device physdev_pci_device_t; DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t); +#define PHYSDEVOP_DBGP_RESET_PREPARE 1 +#define PHYSDEVOP_DBGP_RESET_DONE 2 + +#define PHYSDEVOP_DBGP_BUS_UNKNOWN 0 +#define PHYSDEVOP_DBGP_BUS_PCI 1 + +#define PHYSDEVOP_dbgp_op 29 +struct physdev_dbgp_op { + /* IN */ + uint8_t op; + uint8_t bus; + union { + struct physdev_pci_device pci; + } u; +}; +typedef struct physdev_dbgp_op physdev_dbgp_op_t; +DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t); + /* * Notify that some PIRQ-bound event channels have been unmasked. * ** This command is obsolete since interface version 0x00030202 and is ** @@ -320,9 +352,11 @@ */ #define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 +#if __XEN_INTERFACE_VERSION__ < 0x00040600 /* * These all-capitals physdev operation names are superceded by the new names - * (defined above) since interface version 0x00030202. + * (defined above) since interface version 0x00030202. The guard above was + * added post-4.5 only though and hence shouldn't check for 0x00030202. */ #define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query #define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl @@ -333,6 +367,7 @@ #define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi #define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared +#endif #if __XEN_INTERFACE_VERSION__ < 0x00040200 #define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v1 @@ -345,7 +380,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/platform.h =================================================================== --- trunk/sys/xen/interface/platform.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/platform.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -36,13 +36,28 @@ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC, * 1 January, 1970 if the current system time was <system_time>. */ -#define XENPF_settime 17 -struct xenpf_settime { +#define XENPF_settime32 17 +struct xenpf_settime32 { /* IN variables. */ uint32_t secs; uint32_t nsecs; uint64_t system_time; }; +#define XENPF_settime64 62 +struct xenpf_settime64 { + /* IN variables. */ + uint64_t secs; + uint32_t nsecs; + uint32_t mbz; + uint64_t system_time; +}; +#if __XEN_INTERFACE_VERSION__ < 0x00040600 +#define XENPF_settime XENPF_settime32 +#define xenpf_settime xenpf_settime32 +#else +#define XENPF_settime XENPF_settime64 +#define xenpf_settime xenpf_settime64 +#endif typedef struct xenpf_settime xenpf_settime_t; DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t); @@ -127,6 +142,26 @@ #define XEN_EFI_query_variable_info 9 #define XEN_EFI_query_capsule_capabilities 10 #define XEN_EFI_update_capsule 11 + +struct xenpf_efi_time { + uint16_t year; + uint8_t month; + uint8_t day; + uint8_t hour; + uint8_t min; + uint8_t sec; + uint32_t ns; + int16_t tz; + uint8_t daylight; +}; + +struct xenpf_efi_guid { + uint32_t data1; + uint16_t data2; + uint16_t data3; + uint8_t data4[8]; +}; + struct xenpf_efi_runtime_call { uint32_t function; /* @@ -135,21 +170,11 @@ * where it holds the single returned value. 
*/ uint32_t misc; - unsigned long status; + xen_ulong_t status; union { #define XEN_EFI_GET_TIME_SET_CLEARS_NS 0x00000001 struct { - struct xenpf_efi_time { - uint16_t year; - uint8_t month; - uint8_t day; - uint8_t hour; - uint8_t min; - uint8_t sec; - uint32_t ns; - int16_t tz; - uint8_t daylight; - } time; + struct xenpf_efi_time time; uint32_t resolution; uint32_t accuracy; } get_time; @@ -169,22 +194,18 @@ #define XEN_EFI_VARIABLE_RUNTIME_ACCESS 0x00000004 struct { XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */ - unsigned long size; + xen_ulong_t size; XEN_GUEST_HANDLE(void) data; - struct xenpf_efi_guid { - uint32_t data1; - uint16_t data2; - uint16_t data3; - uint8_t data4[8]; - } vendor_guid; + struct xenpf_efi_guid vendor_guid; } get_variable, set_variable; struct { - unsigned long size; + xen_ulong_t size; XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */ struct xenpf_efi_guid vendor_guid; } get_next_variable_name; +#define XEN_EFI_VARINFO_BOOT_SNAPSHOT 0x00000001 struct { uint32_t attr; uint64_t max_store_size; @@ -194,14 +215,14 @@ struct { XEN_GUEST_HANDLE(void) capsule_header_array; - unsigned long capsule_count; + xen_ulong_t capsule_count; uint64_t max_capsule_size; - unsigned int reset_type; + uint32_t reset_type; } query_capsule_capabilities; struct { XEN_GUEST_HANDLE(void) capsule_header_array; - unsigned long capsule_count; + xen_ulong_t capsule_count; uint64_t sg_list; /* machine address */ } update_capsule; } u; @@ -219,6 +240,8 @@ #define XEN_FW_EFI_VENDOR 2 #define XEN_FW_EFI_MEM_INFO 3 #define XEN_FW_EFI_RT_VERSION 4 +#define XEN_FW_EFI_PCI_ROM 5 +#define XEN_FW_KBD_SHIFT_FLAGS 5 struct xenpf_firmware_info { /* IN variables. */ uint32_t type; @@ -266,7 +289,21 @@ uint64_t attr; uint32_t type; } mem; + struct { + /* IN variables */ + uint16_t segment; + uint8_t bus; + uint8_t devfn; + uint16_t vendor; + uint16_t devid; + /* OUT variables */ + uint64_t address; + xen_ulong_t size; + } pci_rom; } efi_info; /* XEN_FW_EFI_INFO */ + + /* Int16, Fn02: Get keyboard shift flags. */ + uint8_t kbd_shift_flags; /* XEN_FW_KBD_SHIFT_FLAGS */ } u; }; typedef struct xenpf_firmware_info xenpf_firmware_info_t; @@ -275,10 +312,16 @@ #define XENPF_enter_acpi_sleep 51 struct xenpf_enter_acpi_sleep { /* IN variables */ +#if __XEN_INTERFACE_VERSION__ < 0x00040300 uint16_t pm1a_cnt_val; /* PM1a control value. */ uint16_t pm1b_cnt_val; /* PM1b control value. */ +#else + uint16_t val_a; /* PM1a control / sleep type A. */ + uint16_t val_b; /* PM1b control / sleep type B. */ +#endif uint32_t sleep_state; /* Which state to enter (Sn). */ - uint32_t flags; /* Must be zero. */ +#define XENPF_ACPI_SLEEP_EXTENDED 0x00000001 + uint32_t flags; /* XENPF_ACPI_SLEEP_*. */ }; typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t; DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t); @@ -506,6 +549,67 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_core_parking_t); /* + * Access generic platform resources(e.g., accessing MSR, port I/O, etc) + * in unified way. Batch resource operations in one call are supported and + * they are always non-preemptible and executed in their original order. + * The batch itself returns a negative integer for general errors, or a + * non-negative integer for the number of successful operations. For the latter + * case, the @ret in the failed entry (if any) indicates the exact error. 
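/*
 * A minimal sketch of a single-entry XENPF_resource_op batch that reads
 * one MSR on a chosen CPU, using the xenpf_resource_entry and
 * xenpf_resource_op structures defined just below.  The privileged
 * HYPERVISOR_platform_op() wrapper is an assumption about the dom0
 * environment; set_xen_guest_handle() comes from the public headers.
 */
#include <string.h>
#include <xen/xen-os.h>
#include <xen/interface/platform.h>

extern int HYPERVISOR_platform_op(struct xen_platform_op *op); /* assumed */

static int read_msr_on_cpu(uint32_t cpu, uint64_t msr, uint64_t *val)
{
    struct xenpf_resource_entry entry;
    struct xen_platform_op op;
    int rc;

    memset(&entry, 0, sizeof(entry));
    entry.u.cmd = XEN_RESOURCE_OP_MSR_READ;
    entry.idx = msr;                       /* resource address to access */

    memset(&op, 0, sizeof(op));
    op.cmd = XENPF_resource_op;
    op.interface_version = XENPF_INTERFACE_VERSION;
    op.u.resource_op.nr_entries = 1;
    op.u.resource_op.cpu = cpu;
    set_xen_guest_handle(op.u.resource_op.entries, &entry);

    rc = HYPERVISOR_platform_op(&op);      /* >= 0: nr of successful entries */
    if (rc == 1) {
        *val = entry.val;
        return 0;
    }
    return (rc >= 0) ? entry.u.ret : rc;   /* per-entry or general error */
}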
+ */ +#define XENPF_resource_op 61 + +#define XEN_RESOURCE_OP_MSR_READ 0 +#define XEN_RESOURCE_OP_MSR_WRITE 1 + +/* + * Specially handled MSRs: + * - MSR_IA32_TSC + * READ: Returns the scaled system time(ns) instead of raw timestamp. In + * multiple entry case, if other MSR read is followed by a MSR_IA32_TSC + * read, then both reads are guaranteed to be performed atomically (with + * IRQ disabled). The return time indicates the point of reading that MSR. + * WRITE: Not supported. + */ + +struct xenpf_resource_entry { + union { + uint32_t cmd; /* IN: XEN_RESOURCE_OP_* */ + int32_t ret; /* OUT: return value for failed entry */ + } u; + uint32_t rsvd; /* IN: padding and must be zero */ + uint64_t idx; /* IN: resource address to access */ + uint64_t val; /* IN/OUT: resource value to set/get */ +}; +typedef struct xenpf_resource_entry xenpf_resource_entry_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_resource_entry_t); + +struct xenpf_resource_op { + uint32_t nr_entries; /* number of resource entry */ + uint32_t cpu; /* which cpu to run */ + XEN_GUEST_HANDLE(xenpf_resource_entry_t) entries; +}; +typedef struct xenpf_resource_op xenpf_resource_op_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_resource_op_t); + +#define XENPF_get_symbol 63 +struct xenpf_symdata { + /* IN/OUT variables */ + uint32_t namelen; /* IN: size of name buffer */ + /* OUT: strlen(name) of hypervisor symbol (may be */ + /* larger than what's been copied to guest) */ + uint32_t symnum; /* IN: Symbol to read */ + /* OUT: Next available symbol. If same as IN then */ + /* we reached the end */ + + /* OUT variables */ + XEN_GUEST_HANDLE(char) name; + uint64_t address; + char type; +}; +typedef struct xenpf_symdata xenpf_symdata_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_symdata_t); + +/* * ` enum neg_errnoval * ` HYPERVISOR_platform_op(const struct xen_platform_op*); */ @@ -514,6 +618,8 @@ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ union { struct xenpf_settime settime; + struct xenpf_settime32 settime32; + struct xenpf_settime64 settime64; struct xenpf_add_memtype add_memtype; struct xenpf_del_memtype del_memtype; struct xenpf_read_memtype read_memtype; @@ -531,6 +637,8 @@ struct xenpf_cpu_hotadd cpu_add; struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; + struct xenpf_resource_op resource_op; + struct xenpf_symdata symdata; uint8_t pad[128]; } u; }; @@ -542,7 +650,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Added: trunk/sys/xen/interface/pmu.h =================================================================== --- trunk/sys/xen/interface/pmu.h (rev 0) +++ trunk/sys/xen/interface/pmu.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -0,0 +1,134 @@ +/* $MidnightBSD$ */ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef __XEN_PUBLIC_PMU_H__ +#define __XEN_PUBLIC_PMU_H__ + +#include "xen.h" +#if defined(__i386__) || defined(__x86_64__) +#include "arch-x86/pmu.h" +#elif defined (__arm__) || defined (__aarch64__) +#include "arch-arm.h" +#else +#error "Unsupported architecture" +#endif + +#define XENPMU_VER_MAJ 0 +#define XENPMU_VER_MIN 1 + +/* + * ` enum neg_errnoval + * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args); + * + * @cmd == XENPMU_* (PMU operation) + * @args == struct xenpmu_params + */ +/* ` enum xenpmu_op { */ +#define XENPMU_mode_get 0 /* Also used for getting PMU version */ +#define XENPMU_mode_set 1 +#define XENPMU_feature_get 2 +#define XENPMU_feature_set 3 +#define XENPMU_init 4 +#define XENPMU_finish 5 +#define XENPMU_lvtpc_set 6 +#define XENPMU_flush 7 /* Write cached MSR values to HW */ +/* ` } */ + +/* Parameters structure for HYPERVISOR_xenpmu_op call */ +struct xen_pmu_params { + /* IN/OUT parameters */ + struct { + uint32_t maj; + uint32_t min; + } version; + uint64_t val; + + /* IN parameters */ + uint32_t vcpu; + uint32_t pad; +}; +typedef struct xen_pmu_params xen_pmu_params_t; +DEFINE_XEN_GUEST_HANDLE(xen_pmu_params_t); + +/* PMU modes: + * - XENPMU_MODE_OFF: No PMU virtualization + * - XENPMU_MODE_SELF: Guests can profile themselves + * - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles + * itself and Xen + * - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles + * everyone: itself, the hypervisor and the guests. + */ +#define XENPMU_MODE_OFF 0 +#define XENPMU_MODE_SELF (1<<0) +#define XENPMU_MODE_HV (1<<1) +#define XENPMU_MODE_ALL (1<<2) + +/* + * PMU features: + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) + */ +#define XENPMU_FEATURE_INTEL_BTS 1 + +/* + * Shared PMU data between hypervisor and PV(H) domains. + * + * The hypervisor fills out this structure during PMU interrupt and sends an + * interrupt to appropriate VCPU. + * Architecture-independent fields of xen_pmu_data are WO for the hypervisor + * and RO for the guest but some fields in xen_pmu_arch can be writable + * by both the hypervisor and the guest (see arch-$arch/pmu.h). + */ +struct xen_pmu_data { + /* Interrupted VCPU */ + uint32_t vcpu_id; + + /* + * Physical processor on which the interrupt occurred. On non-privileged + * guests set to vcpu_id; + */ + uint32_t pcpu_id; + + /* + * Domain that was interrupted. On non-privileged guests set to DOMID_SELF. + * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in + * XENPMU_MODE_ALL mode, domain ID of another domain. 
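/*
 * A minimal sketch of selecting a VPMU mode with the xen_pmu_params
 * structure defined above (the requested mode travels in 'val').  The
 * HYPERVISOR_xenpmu_op() wrapper is an assumption about the guest OS.
 */
#include <string.h>
#include <xen/xen-os.h>
#include <xen/interface/pmu.h>

extern int HYPERVISOR_xenpmu_op(unsigned int cmd, void *arg); /* assumed */

static int vpmu_profile_self(void)
{
    xen_pmu_params_t p;

    memset(&p, 0, sizeof(p));
    p.version.maj = XENPMU_VER_MAJ;
    p.version.min = XENPMU_VER_MIN;
    p.val = XENPMU_MODE_SELF;          /* guest profiles only itself */

    return HYPERVISOR_xenpmu_op(XENPMU_mode_set, &p);
}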
+ */ + domid_t domain_id; + + uint8_t pad[6]; + + /* Architecture-specific information */ + struct xen_pmu_arch pmu; +}; + +#endif /* __XEN_PUBLIC_PMU_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ Property changes on: trunk/sys/xen/interface/pmu.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/sched.h =================================================================== --- trunk/sys/xen/interface/sched.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/sched.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -1,9 +1,9 @@ /* $MidnightBSD$ */ /****************************************************************************** * sched.h - * + * * Scheduler state interactions - * + * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the @@ -31,11 +31,21 @@ #include "event_channel.h" /* + * `incontents 150 sched Guest Scheduler Operations + * + * The SCHEDOP interface provides mechanisms for a guest to interact + * with the scheduler, including yield, blocking and shutting itself + * down. + */ + +/* * The prototype for this hypercall is: - * long sched_op(int cmd, void *arg) + * ` long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...) + * * @cmd == SCHEDOP_??? (scheduler operation). * @arg == Operation-specific extra argument(s), as described below. - * + * ... == Additional Operation-specific extra arguments, described below. + * * Versions of Xen prior to 3.0.2 provided only the following legacy version * of this hypercall, supporting only the commands yield, block and shutdown: * long sched_op(int cmd, unsigned long arg) @@ -42,9 +52,12 @@ * @cmd == SCHEDOP_??? (scheduler operation). * @arg == 0 (SCHEDOP_yield and SCHEDOP_block) * == SHUTDOWN_* code (SCHEDOP_shutdown) - * This legacy version is available to new guests as sched_op_compat(). + * + * This legacy version is available to new guests as: + * ` long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg) */ +/* ` enum sched_op { // SCHEDOP_* => struct sched_* */ /* * Voluntarily yield the CPU. * @arg == NULL. @@ -62,53 +75,44 @@ /* * Halt execution of this domain (all VCPUs) and notify the system controller. - * @arg == pointer to sched_shutdown structure. + * @arg == pointer to sched_shutdown_t structure. + * + * If the sched_shutdown_t reason is SHUTDOWN_suspend then + * x86 PV guests must also set RDX (EDX for 32-bit guests) to the MFN + * of the guest's start info page. RDX/EDX is the third hypercall + * argument. + * + * In addition, which reason is SHUTDOWN_suspend this hypercall + * returns 1 if suspend was cancelled or the domain was merely + * checkpointed, and 0 if it is resuming in a new domain. */ #define SCHEDOP_shutdown 2 -struct sched_shutdown { - unsigned int reason; /* SHUTDOWN_* */ -}; -typedef struct sched_shutdown sched_shutdown_t; -DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t); /* * Poll a set of event-channel ports. Return when one or more are pending. An * optional timeout may be specified. - * @arg == pointer to sched_poll structure. 
+ * @arg == pointer to sched_poll_t structure. */ #define SCHEDOP_poll 3 -struct sched_poll { - XEN_GUEST_HANDLE(evtchn_port_t) ports; - unsigned int nr_ports; - uint64_t timeout; -}; -typedef struct sched_poll sched_poll_t; -DEFINE_XEN_GUEST_HANDLE(sched_poll_t); /* * Declare a shutdown for another domain. The main use of this function is * in interpreting shutdown requests and reasons for fully-virtualized * domains. A para-virtualized domain may use SCHEDOP_shutdown directly. - * @arg == pointer to sched_remote_shutdown structure. + * @arg == pointer to sched_remote_shutdown_t structure. */ #define SCHEDOP_remote_shutdown 4 -struct sched_remote_shutdown { - domid_t domain_id; /* Remote domain ID */ - unsigned int reason; /* SHUTDOWN_xxx reason */ -}; -typedef struct sched_remote_shutdown sched_remote_shutdown_t; -DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t); /* * Latch a shutdown code, so that when the domain later shuts down it * reports this code to the control tools. - * @arg == as for SCHEDOP_shutdown. + * @arg == sched_shutdown_t, as for SCHEDOP_shutdown. */ #define SCHEDOP_shutdown_code 5 /* * Setup, poke and destroy a domain watchdog timer. - * @arg == pointer to sched_watchdog structure. + * @arg == pointer to sched_watchdog_t structure. * With id == 0, setup a domain watchdog timer to cause domain shutdown * after timeout, returns watchdog id. * With id != 0 and timeout == 0, destroy domain watchdog timer. @@ -115,6 +119,29 @@ * With id != 0 and timeout != 0, poke watchdog timer and set new timeout. */ #define SCHEDOP_watchdog 6 +/* ` } */ + +struct sched_shutdown { + unsigned int reason; /* SHUTDOWN_* => enum sched_shutdown_reason */ +}; +typedef struct sched_shutdown sched_shutdown_t; +DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t); + +struct sched_poll { + XEN_GUEST_HANDLE(evtchn_port_t) ports; + unsigned int nr_ports; + uint64_t timeout; +}; +typedef struct sched_poll sched_poll_t; +DEFINE_XEN_GUEST_HANDLE(sched_poll_t); + +struct sched_remote_shutdown { + domid_t domain_id; /* Remote domain ID */ + unsigned int reason; /* SHUTDOWN_* => enum sched_shutdown_reason */ +}; +typedef struct sched_remote_shutdown sched_remote_shutdown_t; +DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t); + struct sched_watchdog { uint32_t id; /* watchdog ID */ uint32_t timeout; /* timeout */ @@ -127,11 +154,14 @@ * software to determine the appropriate action. For the most part, Xen does * not care about the shutdown code. */ +/* ` enum sched_shutdown_reason { */ #define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */ #define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */ #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ +#define SHUTDOWN_MAX 4 /* Maximum valid shutdown reason. 
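/*
 * A minimal sketch of SCHEDOP_poll with the sched_poll structure defined
 * above: block this vCPU until the given event-channel port is pending
 * (or the timeout fires).  The HYPERVISOR_sched_op() wrapper is an
 * assumption about the guest OS; set_xen_guest_handle() comes from the
 * public headers.
 */
#include <string.h>
#include <xen/xen-os.h>
#include <xen/interface/sched.h>

extern long HYPERVISOR_sched_op(int cmd, void *arg); /* assumed */

static long poll_one_port(evtchn_port_t port, uint64_t timeout)
{
    struct sched_poll poll;

    memset(&poll, 0, sizeof(poll));
    set_xen_guest_handle(poll.ports, &port);
    poll.nr_ports = 1;
    poll.timeout = timeout;            /* 0 = no timeout */

    return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
}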
*/ +/* ` } */ #endif /* __XEN_PUBLIC_SCHED_H__ */ @@ -138,7 +168,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/sysctl.h =================================================================== --- trunk/sys/xen/interface/sysctl.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/sysctl.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -34,8 +34,10 @@ #include "xen.h" #include "domctl.h" +#include "physdev.h" +#include "tmem.h" -#define XEN_SYSCTL_INTERFACE_VERSION 0x00000009 +#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000C /* * Read console content from Xen buffer ring. @@ -72,7 +74,7 @@ #define XEN_SYSCTL_TBUFOP_disable 5 uint32_t cmd; /* IN/OUT variables */ - struct xenctl_cpumap cpu_mask; + struct xenctl_bitmap cpu_mask; uint32_t evt_mask; /* OUT variables */ uint64_aligned_t buffer_mfn; @@ -102,6 +104,7 @@ uint64_aligned_t total_pages; uint64_aligned_t free_pages; uint64_aligned_t scrub_pages; + uint64_aligned_t outstanding_pages; uint32_t hw_cap[8]; /* XEN_SYSCTL_PHYSCAP_??? */ @@ -226,13 +229,17 @@ uint64_aligned_t idle_time; /* idle time from boot */ XEN_GUEST_HANDLE_64(uint64) triggers; /* Cx trigger counts */ XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */ - uint64_aligned_t pc2; - uint64_aligned_t pc3; - uint64_aligned_t pc6; - uint64_aligned_t pc7; - uint64_aligned_t cc3; - uint64_aligned_t cc6; - uint64_aligned_t cc7; + uint32_t nr_pc; /* entry nr in pc[] */ + uint32_t nr_cc; /* entry nr in cc[] */ + /* + * These two arrays may (and generally will) have unused slots; slots not + * having a corresponding hardware register will not be written by the + * hypervisor. It is therefore up to the caller to put a suitable sentinel + * into all slots before invoking the function. + * Indexing is 1-biased (PC1/CC1 being at index 0). + */ + XEN_GUEST_HANDLE_64(uint64) pc; + XEN_GUEST_HANDLE_64(uint64) cc; }; struct xen_sysctl_get_pmstat { @@ -458,61 +465,76 @@ typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t); -/* XEN_SYSCTL_topologyinfo */ -#define INVALID_TOPOLOGY_ID (~0U) -struct xen_sysctl_topologyinfo { - /* - * IN: maximum addressable entry in the caller-provided arrays. - * OUT: largest cpu identifier in the system. - * If OUT is greater than IN then the arrays are truncated! - * If OUT is leass than IN then the array tails are not written by sysctl. - */ - uint32_t max_cpu_index; +/* XEN_SYSCTL_cputopoinfo */ +#define XEN_INVALID_CORE_ID (~0U) +#define XEN_INVALID_SOCKET_ID (~0U) +#define XEN_INVALID_NODE_ID (~0U) - /* - * If not NULL, these arrays are filled with core/socket/node identifier - * for each cpu. - * If a cpu has no core/socket/node information (e.g., cpu not present) - * then the sentinel value ~0u is written to each array. - * The number of array elements written by the sysctl is: - * min(@max_cpu_index_IN, at max_cpu_index_OUT)+1 - */ - XEN_GUEST_HANDLE_64(uint32) cpu_to_core; - XEN_GUEST_HANDLE_64(uint32) cpu_to_socket; - XEN_GUEST_HANDLE_64(uint32) cpu_to_node; +struct xen_sysctl_cputopo { + uint32_t core; + uint32_t socket; + uint32_t node; }; -typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t; -DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t); +typedef struct xen_sysctl_cputopo xen_sysctl_cputopo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopo_t); +/* + * IN: + * - a NULL 'cputopo' handle is a request for maximun 'num_cpus'. 
+ * - otherwise it's the number of entries in 'cputopo' + * + * OUT: + * - If 'num_cpus' is less than the number Xen wants to write but the handle + * handle is not a NULL one, partial data gets returned and 'num_cpus' gets + * updated to reflect the intended number. + * - Otherwise, 'num_cpus' shall indicate the number of entries written, which + * may be less than the input value. + */ +struct xen_sysctl_cputopoinfo { + uint32_t num_cpus; + XEN_GUEST_HANDLE_64(xen_sysctl_cputopo_t) cputopo; +}; +typedef struct xen_sysctl_cputopoinfo xen_sysctl_cputopoinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopoinfo_t); + /* XEN_SYSCTL_numainfo */ -#define INVALID_NUMAINFO_ID (~0U) +#define XEN_INVALID_MEM_SZ (~0U) +#define XEN_INVALID_NODE_DIST (~0U) + +struct xen_sysctl_meminfo { + uint64_t memsize; + uint64_t memfree; +}; +typedef struct xen_sysctl_meminfo xen_sysctl_meminfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_meminfo_t); + +/* + * IN: + * - Both 'meminfo' and 'distance' handles being null is a request + * for maximum value of 'num_nodes'. + * - Otherwise it's the number of entries in 'meminfo' and square root + * of number of entries in 'distance' (when corresponding handle is + * non-null) + * + * OUT: + * - If 'num_nodes' is less than the number Xen wants to write but either + * handle is not a NULL one, partial data gets returned and 'num_nodes' + * gets updated to reflect the intended number. + * - Otherwise, 'num_nodes' shall indicate the number of entries written, which + * may be less than the input value. + */ + struct xen_sysctl_numainfo { - /* - * IN: maximum addressable entry in the caller-provided arrays. - * OUT: largest node identifier in the system. - * If OUT is greater than IN then the arrays are truncated! - */ - uint32_t max_node_index; + uint32_t num_nodes; - /* NB. Entries are 0 if node is not present. */ - XEN_GUEST_HANDLE_64(uint64) node_to_memsize; - XEN_GUEST_HANDLE_64(uint64) node_to_memfree; + XEN_GUEST_HANDLE_64(xen_sysctl_meminfo_t) meminfo; /* - * Array, of size (max_node_index+1)^2, listing memory access distances - * between nodes. If an entry has no node distance information (e.g., node - * not present) then the value ~0u is written. - * - * Note that the array rows must be indexed by multiplying by the minimum - * of the caller-provided max_node_index and the returned value of - * max_node_index. That is, if the largest node index in the system is - * smaller than the caller can handle, a smaller 2-d array is constructed - * within the space provided by the caller. When this occurs, trailing - * space provided by the caller is not modified. If the largest node index - * in the system is larger than the caller can handle, then a 2-d array of - * the maximum size handleable by the caller is constructed. + * Distance between nodes 'i' and 'j' is stored in index 'i*N + j', + * where N is the number of nodes that will be returned in 'num_nodes' + * (i.e. 
not 'num_nodes' provided by the caller) */ - XEN_GUEST_HANDLE_64(uint32) node_to_node_distance; + XEN_GUEST_HANDLE_64(uint32) distance; }; typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t); @@ -533,7 +555,7 @@ uint32_t domid; /* IN: M */ uint32_t cpu; /* IN: AR */ uint32_t n_dom; /* OUT: I */ - struct xenctl_cpumap cpumap; /* OUT: IF */ + struct xenctl_bitmap cpumap; /* OUT: IF */ }; typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t); @@ -597,6 +619,152 @@ typedef struct xen_sysctl_scheduler_op xen_sysctl_scheduler_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_scheduler_op_t); +/* XEN_SYSCTL_coverage_op */ +/* + * Get total size of information, to help allocate + * the buffer. The pointer points to a 32 bit value. + */ +#define XEN_SYSCTL_COVERAGE_get_total_size 0 + +/* + * Read coverage information in a single run + * You must use a tool to split them. + */ +#define XEN_SYSCTL_COVERAGE_read 1 + +/* + * Reset all the coverage counters to 0 + * No parameters. + */ +#define XEN_SYSCTL_COVERAGE_reset 2 + +/* + * Like XEN_SYSCTL_COVERAGE_read but reset also + * counters to 0 in a single call. + */ +#define XEN_SYSCTL_COVERAGE_read_and_reset 3 + +struct xen_sysctl_coverage_op { + uint32_t cmd; /* XEN_SYSCTL_COVERAGE_* */ + union { + uint32_t total_size; /* OUT */ + XEN_GUEST_HANDLE_64(uint8) raw_info; /* OUT */ + } u; +}; +typedef struct xen_sysctl_coverage_op xen_sysctl_coverage_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t); + +#define XEN_SYSCTL_PSR_CMT_get_total_rmid 0 +#define XEN_SYSCTL_PSR_CMT_get_l3_upscaling_factor 1 +/* The L3 cache size is returned in KB unit */ +#define XEN_SYSCTL_PSR_CMT_get_l3_cache_size 2 +#define XEN_SYSCTL_PSR_CMT_enabled 3 +#define XEN_SYSCTL_PSR_CMT_get_l3_event_mask 4 +struct xen_sysctl_psr_cmt_op { + uint32_t cmd; /* IN: XEN_SYSCTL_PSR_CMT_* */ + uint32_t flags; /* padding variable, may be extended for future use */ + union { + uint64_t data; /* OUT */ + struct { + uint32_t cpu; /* IN */ + uint32_t rsvd; + } l3_cache; + } u; +}; +typedef struct xen_sysctl_psr_cmt_op xen_sysctl_psr_cmt_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cmt_op_t); + +/* XEN_SYSCTL_pcitopoinfo */ +#define XEN_INVALID_DEV (XEN_INVALID_NODE_ID - 1) +struct xen_sysctl_pcitopoinfo { + /* + * IN: Number of elements in 'pcitopo' and 'nodes' arrays. + * OUT: Number of processed elements of those arrays. + */ + uint32_t num_devs; + + /* IN: list of devices for which node IDs are requested. */ + XEN_GUEST_HANDLE_64(physdev_pci_device_t) devs; + + /* + * OUT: node identifier for each device. + * If information for a particular device is not available then + * corresponding entry will be set to XEN_INVALID_NODE_ID. If + * device is not known to the hypervisor then XEN_INVALID_DEV + * will be provided. 
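/*
 * A toolstack-side sketch of an XEN_SYSCTL_pcitopoinfo query for a single
 * device, using the 'devs' handle above and the 'nodes' handle just
 * below.  The do_xen_sysctl() wrapper here is hypothetical; real callers
 * go through the toolstack's privcmd/libxc plumbing.
 */
#include <string.h>
#include <xen/xen-os.h>
#include <xen/interface/sysctl.h>

extern int do_xen_sysctl(struct xen_sysctl *sysctl); /* hypothetical */

static int pci_device_node(uint16_t seg, uint8_t bus, uint8_t devfn,
                           uint32_t *node)
{
    struct physdev_pci_device dev = { .seg = seg, .bus = bus, .devfn = devfn };
    struct xen_sysctl sysctl;

    memset(&sysctl, 0, sizeof(sysctl));
    sysctl.cmd = XEN_SYSCTL_pcitopoinfo;
    sysctl.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
    sysctl.u.pcitopoinfo.num_devs = 1;
    set_xen_guest_handle(sysctl.u.pcitopoinfo.devs, &dev);
    set_xen_guest_handle(sysctl.u.pcitopoinfo.nodes, node);

    /* On success *node is a node ID, XEN_INVALID_NODE_ID or XEN_INVALID_DEV. */
    return do_xen_sysctl(&sysctl);
}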
+ */ + XEN_GUEST_HANDLE_64(uint32) nodes; +}; +typedef struct xen_sysctl_pcitopoinfo xen_sysctl_pcitopoinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_pcitopoinfo_t); + +#define XEN_SYSCTL_PSR_CAT_get_l3_info 0 +struct xen_sysctl_psr_cat_op { + uint32_t cmd; /* IN: XEN_SYSCTL_PSR_CAT_* */ + uint32_t target; /* IN */ + union { + struct { + uint32_t cbm_len; /* OUT: CBM length */ + uint32_t cos_max; /* OUT: Maximum COS */ + } l3_info; + } u; +}; +typedef struct xen_sysctl_psr_cat_op xen_sysctl_psr_cat_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cat_op_t); + +#define XEN_SYSCTL_TMEM_OP_ALL_CLIENTS 0xFFFFU + +#define XEN_SYSCTL_TMEM_OP_THAW 0 +#define XEN_SYSCTL_TMEM_OP_FREEZE 1 +#define XEN_SYSCTL_TMEM_OP_FLUSH 2 +#define XEN_SYSCTL_TMEM_OP_DESTROY 3 +#define XEN_SYSCTL_TMEM_OP_LIST 4 +#define XEN_SYSCTL_TMEM_OP_SET_WEIGHT 5 +#define XEN_SYSCTL_TMEM_OP_SET_CAP 6 +#define XEN_SYSCTL_TMEM_OP_SET_COMPRESS 7 +#define XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB 8 +#define XEN_SYSCTL_TMEM_OP_SAVE_BEGIN 10 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION 11 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS 12 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT 13 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP 14 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS 15 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS 16 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES 17 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID 18 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE 19 +#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV 20 +#define XEN_SYSCTL_TMEM_OP_SAVE_END 21 +#define XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN 30 +#define XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE 32 +#define XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE 33 + +/* + * XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_[PAGE|INV] override the 'buf' in + * xen_sysctl_tmem_op with this structure - sometimes with an extra + * page tackled on. + */ +struct tmem_handle { + uint32_t pool_id; + uint32_t index; + xen_tmem_oid_t oid; +}; + +struct xen_sysctl_tmem_op { + uint32_t cmd; /* IN: XEN_SYSCTL_TMEM_OP_* . */ + int32_t pool_id; /* IN: 0 by default unless _SAVE_*, RESTORE_* .*/ + uint32_t cli_id; /* IN: client id, 0 for XEN_SYSCTL_TMEM_QUERY_FREEABLE_MB + for all others can be the domain id or + XEN_SYSCTL_TMEM_OP_ALL_CLIENTS for all. */ + uint32_t arg1; /* IN: If not applicable to command use 0. */ + uint32_t arg2; /* IN: If not applicable to command use 0. */ + uint32_t pad; /* Padding so structure is the same under 32 and 64. */ + xen_tmem_oid_t oid; /* IN: If not applicable to command use 0s. */ + XEN_GUEST_HANDLE_64(char) buf; /* IN/OUT: Buffer to save and restore ops. 
*/ +}; +typedef struct xen_sysctl_tmem_op xen_sysctl_tmem_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tmem_op_t); + struct xen_sysctl { uint32_t cmd; #define XEN_SYSCTL_readconsole 1 @@ -613,16 +781,22 @@ #define XEN_SYSCTL_pm_op 12 #define XEN_SYSCTL_page_offline_op 14 #define XEN_SYSCTL_lockprof_op 15 -#define XEN_SYSCTL_topologyinfo 16 +#define XEN_SYSCTL_cputopoinfo 16 #define XEN_SYSCTL_numainfo 17 #define XEN_SYSCTL_cpupool_op 18 #define XEN_SYSCTL_scheduler_op 19 +#define XEN_SYSCTL_coverage_op 20 +#define XEN_SYSCTL_psr_cmt_op 21 +#define XEN_SYSCTL_pcitopoinfo 22 +#define XEN_SYSCTL_psr_cat_op 23 +#define XEN_SYSCTL_tmem_op 24 uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ union { struct xen_sysctl_readconsole readconsole; struct xen_sysctl_tbuf_op tbuf_op; struct xen_sysctl_physinfo physinfo; - struct xen_sysctl_topologyinfo topologyinfo; + struct xen_sysctl_cputopoinfo cputopoinfo; + struct xen_sysctl_pcitopoinfo pcitopoinfo; struct xen_sysctl_numainfo numainfo; struct xen_sysctl_sched_id sched_id; struct xen_sysctl_perfc_op perfc_op; @@ -637,6 +811,10 @@ struct xen_sysctl_lockprof_op lockprof_op; struct xen_sysctl_cpupool_op cpupool_op; struct xen_sysctl_scheduler_op scheduler_op; + struct xen_sysctl_coverage_op coverage_op; + struct xen_sysctl_psr_cmt_op psr_cmt_op; + struct xen_sysctl_psr_cat_op psr_cat_op; + struct xen_sysctl_tmem_op tmem_op; uint8_t pad[128]; } u; }; @@ -648,7 +826,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/tmem.h =================================================================== --- trunk/sys/xen/interface/tmem.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/tmem.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -34,48 +34,28 @@ #define TMEM_SPEC_VERSION 1 /* Commands to HYPERVISOR_tmem_op() */ -#define TMEM_CONTROL 0 +#ifdef __XEN__ +#define TMEM_CONTROL 0 /* Now called XEN_SYSCTL_tmem_op */ +#else +#undef TMEM_CONTROL +#endif #define TMEM_NEW_POOL 1 #define TMEM_DESTROY_POOL 2 -#define TMEM_NEW_PAGE 3 #define TMEM_PUT_PAGE 4 #define TMEM_GET_PAGE 5 #define TMEM_FLUSH_PAGE 6 #define TMEM_FLUSH_OBJECT 7 +#if __XEN_INTERFACE_VERSION__ < 0x00040400 +#define TMEM_NEW_PAGE 3 #define TMEM_READ 8 #define TMEM_WRITE 9 #define TMEM_XCHG 10 +#endif /* Privileged commands to HYPERVISOR_tmem_op() */ -#define TMEM_AUTH 101 +#define TMEM_AUTH 101 #define TMEM_RESTORE_NEW 102 -/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */ -#define TMEMC_THAW 0 -#define TMEMC_FREEZE 1 -#define TMEMC_FLUSH 2 -#define TMEMC_DESTROY 3 -#define TMEMC_LIST 4 -#define TMEMC_SET_WEIGHT 5 -#define TMEMC_SET_CAP 6 -#define TMEMC_SET_COMPRESS 7 -#define TMEMC_QUERY_FREEABLE_MB 8 -#define TMEMC_SAVE_BEGIN 10 -#define TMEMC_SAVE_GET_VERSION 11 -#define TMEMC_SAVE_GET_MAXPOOLS 12 -#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13 -#define TMEMC_SAVE_GET_CLIENT_CAP 14 -#define TMEMC_SAVE_GET_CLIENT_FLAGS 15 -#define TMEMC_SAVE_GET_POOL_FLAGS 16 -#define TMEMC_SAVE_GET_POOL_NPAGES 17 -#define TMEMC_SAVE_GET_POOL_UUID 18 -#define TMEMC_SAVE_GET_NEXT_PAGE 19 -#define TMEMC_SAVE_GET_NEXT_INV 20 -#define TMEMC_SAVE_END 21 -#define TMEMC_RESTORE_BEGIN 30 -#define TMEMC_RESTORE_PUT_PAGE 32 -#define TMEMC_RESTORE_FLUSH_PAGE 33 - /* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ #define TMEM_POOL_PERSIST 1 #define TMEM_POOL_SHARED 2 @@ -94,9 +74,16 @@ #define EFROZEN 1000 #define EEMPTY 1001 +struct xen_tmem_oid { + uint64_t oid[3]; +}; +typedef struct 
xen_tmem_oid xen_tmem_oid_t; +DEFINE_XEN_GUEST_HANDLE(xen_tmem_oid_t); #ifndef __ASSEMBLY__ +#if __XEN_INTERFACE_VERSION__ < 0x00040400 typedef xen_pfn_t tmem_cli_mfn_t; +#endif typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t; struct tmem_op { uint32_t cmd; @@ -107,33 +94,22 @@ uint32_t flags; uint32_t arg1; } creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */ - struct { - uint32_t subop; - uint32_t cli_id; - uint32_t arg1; - uint32_t arg2; - uint64_t oid[3]; - tmem_cli_va_t buf; - } ctrl; /* for cmd == TMEM_CONTROL */ struct { - +#if __XEN_INTERFACE_VERSION__ < 0x00040600 uint64_t oid[3]; +#else + xen_tmem_oid_t oid; +#endif uint32_t index; uint32_t tmem_offset; uint32_t pfn_offset; uint32_t len; - tmem_cli_mfn_t cmfn; /* client machine page frame */ + xen_pfn_t cmfn; /* client machine page frame */ } gen; /* for all other cmd ("generic") */ } u; }; typedef struct tmem_op tmem_op_t; DEFINE_XEN_GUEST_HANDLE(tmem_op_t); - -struct tmem_handle { - uint32_t pool_id; - uint32_t index; - uint64_t oid[3]; -}; #endif #endif /* __XEN_PUBLIC_TMEM_H__ */ @@ -141,7 +117,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/trace.h =================================================================== --- trunk/sys/xen/interface/trace.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/trace.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -51,13 +51,41 @@ #define TRC_SUBCLS_SHIFT 12 /* trace subclasses for SVM */ -#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ -#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ +#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ +#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ +#define TRC_HVM_EMUL 0x00084000 /* emulated devices */ #define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */ #define TRC_SCHED_CLASS 0x00022000 /* Scheduler-specific */ #define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */ +/* + * The highest 3 bits of the last 12 bits of TRC_SCHED_CLASS above are + * reserved for encoding what scheduler produced the information. The + * actual event is encoded in the last 9 bits. + * + * This means we have 8 scheduling IDs available (which means at most 8 + * schedulers generating events) and, in each scheduler, up to 512 + * different events. + */ +#define TRC_SCHED_ID_BITS 3 +#define TRC_SCHED_ID_SHIFT (TRC_SUBCLS_SHIFT - TRC_SCHED_ID_BITS) +#define TRC_SCHED_ID_MASK (((1UL<<TRC_SCHED_ID_BITS) - 1) << TRC_SCHED_ID_SHIFT) +#define TRC_SCHED_EVT_MASK (~(TRC_SCHED_ID_MASK)) + +/* Per-scheduler IDs, to identify scheduler specific events */ +#define TRC_SCHED_CSCHED 0 +#define TRC_SCHED_CSCHED2 1 +/* #define XEN_SCHEDULER_SEDF 2 (Removed) */ +#define TRC_SCHED_ARINC653 3 +#define TRC_SCHED_RTDS 4 + +/* Per-scheduler tracing */ +#define TRC_SCHED_CLASS_EVT(_c, _e) \ + ( ( TRC_SCHED_CLASS | \ + ((TRC_SCHED_##_c << TRC_SCHED_ID_SHIFT) & TRC_SCHED_ID_MASK) ) + \ + (_e & TRC_SCHED_EVT_MASK) ) + /* Trace classes for Hardware */ #define TRC_HW_PM 0x00801000 /* Power management traces */ #define TRC_HW_IRQ 0x00802000 /* Traces relating to the handling of IRQs */ @@ -95,21 +123,52 @@ #define TRC_MEM_POD_ZERO_RECLAIM (TRC_MEM + 17) #define TRC_MEM_POD_SUPERPAGE_SPLINTER (TRC_MEM + 18) +#define TRC_PV_ENTRY 0x00201000 /* Hypervisor entry points for PV guests. 
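/*
 * An illustrative sketch of the per-scheduler trace macro defined above:
 * each ID carries the scheduler in the top 3 subclass bits and the event
 * number in the low 9 bits.  The event names below are made up.
 */
#define TRC_RTDS_TICKLE      TRC_SCHED_CLASS_EVT(RTDS, 1)
#define TRC_RTDS_RUNQ_PICK   TRC_SCHED_CLASS_EVT(RTDS, 2)
#define TRC_CSCHED2_CREDIT   TRC_SCHED_CLASS_EVT(CSCHED2, 1)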
*/ +#define TRC_PV_SUBCALL 0x00202000 /* Sub-call in a multicall hypercall */ -#define TRC_PV_HYPERCALL (TRC_PV + 1) -#define TRC_PV_TRAP (TRC_PV + 3) -#define TRC_PV_PAGE_FAULT (TRC_PV + 4) -#define TRC_PV_FORCED_INVALID_OP (TRC_PV + 5) -#define TRC_PV_EMULATE_PRIVOP (TRC_PV + 6) -#define TRC_PV_EMULATE_4GB (TRC_PV + 7) -#define TRC_PV_MATH_STATE_RESTORE (TRC_PV + 8) -#define TRC_PV_PAGING_FIXUP (TRC_PV + 9) -#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10) -#define TRC_PV_PTWR_EMULATION (TRC_PV + 11) -#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12) - /* Indicates that addresses in trace record are 64 bits */ -#define TRC_64_FLAG (0x100) +#define TRC_PV_HYPERCALL (TRC_PV_ENTRY + 1) +#define TRC_PV_TRAP (TRC_PV_ENTRY + 3) +#define TRC_PV_PAGE_FAULT (TRC_PV_ENTRY + 4) +#define TRC_PV_FORCED_INVALID_OP (TRC_PV_ENTRY + 5) +#define TRC_PV_EMULATE_PRIVOP (TRC_PV_ENTRY + 6) +#define TRC_PV_EMULATE_4GB (TRC_PV_ENTRY + 7) +#define TRC_PV_MATH_STATE_RESTORE (TRC_PV_ENTRY + 8) +#define TRC_PV_PAGING_FIXUP (TRC_PV_ENTRY + 9) +#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV_ENTRY + 10) +#define TRC_PV_PTWR_EMULATION (TRC_PV_ENTRY + 11) +#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV_ENTRY + 12) +#define TRC_PV_HYPERCALL_V2 (TRC_PV_ENTRY + 13) +#define TRC_PV_HYPERCALL_SUBCALL (TRC_PV_SUBCALL + 14) +/* + * TRC_PV_HYPERCALL_V2 format + * + * Only some of the hypercall argument are recorded. Bit fields A0 to + * A5 in the first extra word are set if the argument is present and + * the arguments themselves are packed sequentially in the following + * words. + * + * The TRC_64_FLAG bit is not set for these events (even if there are + * 64-bit arguments in the record). + * + * Word + * 0 bit 31 30|29 28|27 26|25 24|23 22|21 20|19 ... 0 + * A5 |A4 |A3 |A2 |A1 |A0 |Hypercall op + * 1 First 32 bit (or low word of first 64 bit) arg in record + * 2 Second 32 bit (or high word of first 64 bit) arg in record + * ... 
+ * + * A0-A5 bitfield values: + * + * 00b Argument not present + * 01b 32-bit argument present + * 10b 64-bit argument present + * 11b Reserved + */ +#define TRC_PV_HYPERCALL_V2_ARG_32(i) (0x1 << (20 + 2*(i))) +#define TRC_PV_HYPERCALL_V2_ARG_64(i) (0x2 << (20 + 2*(i))) +#define TRC_PV_HYPERCALL_V2_ARG_MASK (0xfff00000) + #define TRC_SHADOW_NOT_SHADOW (TRC_SHADOW + 1) #define TRC_SHADOW_FAST_PROPAGATE (TRC_SHADOW + 2) #define TRC_SHADOW_FAST_MMIO (TRC_SHADOW + 3) @@ -173,6 +232,25 @@ #define TRC_HVM_IOPORT_WRITE (TRC_HVM_HANDLER + 0x216) #define TRC_HVM_IOMEM_WRITE (TRC_HVM_HANDLER + 0x217) +/* Trace events for emulated devices */ +#define TRC_HVM_EMUL_HPET_START_TIMER (TRC_HVM_EMUL + 0x1) +#define TRC_HVM_EMUL_PIT_START_TIMER (TRC_HVM_EMUL + 0x2) +#define TRC_HVM_EMUL_RTC_START_TIMER (TRC_HVM_EMUL + 0x3) +#define TRC_HVM_EMUL_LAPIC_START_TIMER (TRC_HVM_EMUL + 0x4) +#define TRC_HVM_EMUL_HPET_STOP_TIMER (TRC_HVM_EMUL + 0x5) +#define TRC_HVM_EMUL_PIT_STOP_TIMER (TRC_HVM_EMUL + 0x6) +#define TRC_HVM_EMUL_RTC_STOP_TIMER (TRC_HVM_EMUL + 0x7) +#define TRC_HVM_EMUL_LAPIC_STOP_TIMER (TRC_HVM_EMUL + 0x8) +#define TRC_HVM_EMUL_PIT_TIMER_CB (TRC_HVM_EMUL + 0x9) +#define TRC_HVM_EMUL_LAPIC_TIMER_CB (TRC_HVM_EMUL + 0xA) +#define TRC_HVM_EMUL_PIC_INT_OUTPUT (TRC_HVM_EMUL + 0xB) +#define TRC_HVM_EMUL_PIC_KICK (TRC_HVM_EMUL + 0xC) +#define TRC_HVM_EMUL_PIC_INTACK (TRC_HVM_EMUL + 0xD) +#define TRC_HVM_EMUL_PIC_POSEDGE (TRC_HVM_EMUL + 0xE) +#define TRC_HVM_EMUL_PIC_NEGEDGE (TRC_HVM_EMUL + 0xF) +#define TRC_HVM_EMUL_PIC_PEND_IRQ_CALL (TRC_HVM_EMUL + 0x10) +#define TRC_HVM_EMUL_LAPIC_PIC_INTR (TRC_HVM_EMUL + 0x11) + /* trace events for per class */ #define TRC_PM_FREQ_CHANGE (TRC_HW_PM + 0x01) #define TRC_PM_IDLE_ENTRY (TRC_HW_PM + 0x02) @@ -188,6 +266,14 @@ #define TRC_HW_IRQ_UNMAPPED_VECTOR (TRC_HW_IRQ + 0x7) #define TRC_HW_IRQ_HANDLED (TRC_HW_IRQ + 0x8) +/* + * Event Flags + * + * Some events (e.g, TRC_PV_TRAP and TRC_HVM_IOMEM_READ) have multiple + * record formats. These event flags distinguish between the + * different formats. + */ +#define TRC_64_FLAG 0x100 /* Addresses are 64 bits (instead of 32 bits) */ /* This structure represents a single trace buffer record. */ struct t_rec { @@ -238,7 +324,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/vcpu.h =================================================================== --- trunk/sys/xen/interface/vcpu.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/vcpu.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -32,7 +32,7 @@ /* * Prototype for this hypercall is: - * int vcpu_op(int cmd, int vcpuid, void *extra_args) + * long vcpu_op(int cmd, unsigned int vcpuid, void *extra_args) * @cmd == VCPUOP_??? (VCPU operation). * @vcpuid == VCPU to operate on. * @extra_args == Operation-specific extra arguments (NULL if none). @@ -233,7 +233,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/version.h =================================================================== --- trunk/sys/xen/interface/version.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/version.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -29,6 +29,8 @@ #ifndef __XEN_PUBLIC_VERSION_H__ #define __XEN_PUBLIC_VERSION_H__ +#include "xen.h" + /* NB. 
All ops return zero on success, except XENVER_{version,pagesize} */ /* arg == NULL; returns major:minor (16:16). */ @@ -59,7 +61,7 @@ #define XENVER_platform_parameters 5 struct xen_platform_parameters { - unsigned long virt_start; + xen_ulong_t virt_start; }; typedef struct xen_platform_parameters xen_platform_parameters_t; @@ -87,7 +89,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Added: trunk/sys/xen/interface/vm_event.h =================================================================== --- trunk/sys/xen/interface/vm_event.h (rev 0) +++ trunk/sys/xen/interface/vm_event.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -0,0 +1,270 @@ +/* $MidnightBSD$ */ +/****************************************************************************** + * vm_event.h + * + * Memory event common structures. + * + * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _XEN_PUBLIC_VM_EVENT_H +#define _XEN_PUBLIC_VM_EVENT_H + +#include "xen.h" + +#define VM_EVENT_INTERFACE_VERSION 0x00000001 + +#if defined(__XEN__) || defined(__XEN_TOOLS__) + +#include "io/ring.h" + +/* + * Memory event flags + */ + +/* + * VCPU_PAUSED in a request signals that the vCPU triggering the event has been + * paused + * VCPU_PAUSED in a response signals to unpause the vCPU + */ +#define VM_EVENT_FLAG_VCPU_PAUSED (1 << 0) +/* Flags to aid debugging vm_event */ +#define VM_EVENT_FLAG_FOREIGN (1 << 1) +/* + * The following flags can be set in response to a mem_access event. + * + * Emulate the fault-causing instruction (if set in the event response flags). + * This will allow the guest to continue execution without lifting the page + * access restrictions. + */ +#define VM_EVENT_FLAG_EMULATE (1 << 2) +/* + * Same as VM_EVENT_FLAG_EMULATE, but with write operations or operations + * potentially having side effects (like memory mapped or port I/O) disabled. + */ +#define VM_EVENT_FLAG_EMULATE_NOWRITE (1 << 3) +/* + * Toggle singlestepping on vm_event response. + * Requires the vCPU to be paused already (synchronous events only). + */ +#define VM_EVENT_FLAG_TOGGLE_SINGLESTEP (1 << 4) +/* + * Data is being sent back to the hypervisor in the event response, to be + * returned by the read function when emulating an instruction. 
+ * This flag is only useful when combined with VM_EVENT_FLAG_EMULATE + * and takes precedence if combined with VM_EVENT_FLAG_EMULATE_NOWRITE + * (i.e. if both VM_EVENT_FLAG_EMULATE_NOWRITE and + * VM_EVENT_FLAG_SET_EMUL_READ_DATA are set, only the latter will be honored). + */ +#define VM_EVENT_FLAG_SET_EMUL_READ_DATA (1 << 5) + /* + * Deny completion of the operation that triggered the event. + * Currently only useful for MSR, CR0, CR3 and CR4 write events. + */ +#define VM_EVENT_FLAG_DENY (1 << 6) +/* + * This flag can be set in a request or a response + * + * On a request, indicates that the event occurred in the alternate p2m specified by + * the altp2m_idx request field. + * + * On a response, indicates that the VCPU should resume in the alternate p2m specified + * by the altp2m_idx response field if possible. + */ +#define VM_EVENT_FLAG_ALTERNATE_P2M (1 << 7) + +/* + * Reasons for the vm event request + */ + +/* Default case */ +#define VM_EVENT_REASON_UNKNOWN 0 +/* Memory access violation */ +#define VM_EVENT_REASON_MEM_ACCESS 1 +/* Memory sharing event */ +#define VM_EVENT_REASON_MEM_SHARING 2 +/* Memory paging event */ +#define VM_EVENT_REASON_MEM_PAGING 3 +/* A control register was updated */ +#define VM_EVENT_REASON_WRITE_CTRLREG 4 +/* An MSR was updated. */ +#define VM_EVENT_REASON_MOV_TO_MSR 5 +/* Debug operation executed (e.g. int3) */ +#define VM_EVENT_REASON_SOFTWARE_BREAKPOINT 6 +/* Single-step (e.g. MTF) */ +#define VM_EVENT_REASON_SINGLESTEP 7 +/* An event has been requested via HVMOP_guest_request_vm_event. */ +#define VM_EVENT_REASON_GUEST_REQUEST 8 + +/* Supported values for the vm_event_write_ctrlreg index. */ +#define VM_EVENT_X86_CR0 0 +#define VM_EVENT_X86_CR3 1 +#define VM_EVENT_X86_CR4 2 +#define VM_EVENT_X86_XCR0 3 + +/* + * Using a custom struct (not hvm_hw_cpu) so as to not fill + * the vm_event ring buffer too quickly. + */ +struct vm_event_regs_x86 { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rflags; + uint64_t dr7; + uint64_t rip; + uint64_t cr0; + uint64_t cr2; + uint64_t cr3; + uint64_t cr4; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t msr_efer; + uint64_t msr_star; + uint64_t msr_lstar; + uint64_t fs_base; + uint64_t gs_base; + uint32_t cs_arbytes; + uint32_t _pad; +}; + +/* + * mem_access flag definitions + * + * These flags are set only as part of a mem_event request. + * + * R/W/X: Defines the type of violation that has triggered the event + * Multiple types can be set in a single violation! 
+ * GLA_VALID: If the gla field holds a guest VA associated with the event + * FAULT_WITH_GLA: If the violation was triggered by accessing gla + * FAULT_IN_GPT: If the violation was triggered during translating gla + */ +#define MEM_ACCESS_R (1 << 0) +#define MEM_ACCESS_W (1 << 1) +#define MEM_ACCESS_X (1 << 2) +#define MEM_ACCESS_RWX (MEM_ACCESS_R | MEM_ACCESS_W | MEM_ACCESS_X) +#define MEM_ACCESS_RW (MEM_ACCESS_R | MEM_ACCESS_W) +#define MEM_ACCESS_RX (MEM_ACCESS_R | MEM_ACCESS_X) +#define MEM_ACCESS_WX (MEM_ACCESS_W | MEM_ACCESS_X) +#define MEM_ACCESS_GLA_VALID (1 << 3) +#define MEM_ACCESS_FAULT_WITH_GLA (1 << 4) +#define MEM_ACCESS_FAULT_IN_GPT (1 << 5) + +struct vm_event_mem_access { + uint64_t gfn; + uint64_t offset; + uint64_t gla; /* if flags has MEM_ACCESS_GLA_VALID set */ + uint32_t flags; /* MEM_ACCESS_* */ + uint32_t _pad; +}; + +struct vm_event_write_ctrlreg { + uint32_t index; + uint32_t _pad; + uint64_t new_value; + uint64_t old_value; +}; + +struct vm_event_debug { + uint64_t gfn; +}; + +struct vm_event_mov_to_msr { + uint64_t msr; + uint64_t value; +}; + +#define MEM_PAGING_DROP_PAGE (1 << 0) +#define MEM_PAGING_EVICT_FAIL (1 << 1) + +struct vm_event_paging { + uint64_t gfn; + uint32_t p2mt; + uint32_t flags; +}; + +struct vm_event_sharing { + uint64_t gfn; + uint32_t p2mt; + uint32_t _pad; +}; + +struct vm_event_emul_read_data { + uint32_t size; + /* The struct is used in a union with vm_event_regs_x86. */ + uint8_t data[sizeof(struct vm_event_regs_x86) - sizeof(uint32_t)]; +}; + +typedef struct vm_event_st { + uint32_t version; /* VM_EVENT_INTERFACE_VERSION */ + uint32_t flags; /* VM_EVENT_FLAG_* */ + uint32_t reason; /* VM_EVENT_REASON_* */ + uint32_t vcpu_id; + uint16_t altp2m_idx; /* may be used during request and response */ + uint16_t _pad[3]; + + union { + struct vm_event_paging mem_paging; + struct vm_event_sharing mem_sharing; + struct vm_event_mem_access mem_access; + struct vm_event_write_ctrlreg write_ctrlreg; + struct vm_event_mov_to_msr mov_to_msr; + struct vm_event_debug software_breakpoint; + struct vm_event_debug singlestep; + } u; + + union { + union { + struct vm_event_regs_x86 x86; + } regs; + + struct vm_event_emul_read_data emul_read_data; + } data; +} vm_event_request_t, vm_event_response_t; + +DEFINE_RING_TYPES(vm_event, vm_event_request_t, vm_event_response_t); + +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ +#endif /* _XEN_PUBLIC_VM_EVENT_H */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ Property changes on: trunk/sys/xen/interface/vm_event.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/interface/xen-compat.h =================================================================== --- trunk/sys/xen/interface/xen-compat.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/xen-compat.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -28,14 +28,17 @@ #ifndef __XEN_PUBLIC_XEN_COMPAT_H__ #define __XEN_PUBLIC_XEN_COMPAT_H__ -#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040200 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040600 #if defined(__XEN__) || defined(__XEN_TOOLS__) /* Xen is built with matching headers and implements the latest interface. 
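/*
 * A minimal consumer-side sketch: classify one request taken off the
 * vm_event ring defined above and build the matching response.  Ring
 * setup and notification are omitted; this only illustrates the
 * flag/reason fields.
 */
#include <string.h>
#include <xen/interface/vm_event.h>

static void fill_vm_event_response(const vm_event_request_t *req,
                                   vm_event_response_t *rsp)
{
    memset(rsp, 0, sizeof(*rsp));
    rsp->version = VM_EVENT_INTERFACE_VERSION;
    rsp->vcpu_id = req->vcpu_id;
    rsp->reason  = req->reason;
    /* Echoing VCPU_PAUSED in the response asks Xen to unpause the vCPU. */
    rsp->flags   = req->flags & VM_EVENT_FLAG_VCPU_PAUSED;

    if (req->reason == VM_EVENT_REASON_MEM_ACCESS &&
        (req->u.mem_access.flags & MEM_ACCESS_W)) {
        /* Write violation; gla is meaningful only with MEM_ACCESS_GLA_VALID. */
    }
}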
*/ #define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__ #elif !defined(__XEN_INTERFACE_VERSION__) -/* Guests which do not specify a version get the legacy interface. */ -#define __XEN_INTERFACE_VERSION__ 0x00000000 +/* + * The interface version is not set if and only if xen/xen-os.h is not + * included. + */ +#error "Please include xen/xen-os.h" #endif #if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__ Modified: trunk/sys/xen/interface/xen.h =================================================================== --- trunk/sys/xen/interface/xen.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/xen.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -32,9 +32,7 @@ #if defined(__i386__) || defined(__x86_64__) #include "arch-x86/xen.h" -#elif defined(__ia64__) -#include "arch-ia64.h" -#elif defined(__arm__) +#elif defined(__arm__) || defined (__aarch64__) #include "arch-arm.h" #else #error "Unsupported architecture" @@ -46,12 +44,15 @@ __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char); DEFINE_XEN_GUEST_HANDLE(int); __DEFINE_XEN_GUEST_HANDLE(uint, unsigned int); +#if __XEN_INTERFACE_VERSION__ < 0x00040300 DEFINE_XEN_GUEST_HANDLE(long); __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long); +#endif DEFINE_XEN_GUEST_HANDLE(void); DEFINE_XEN_GUEST_HANDLE(uint64_t); DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); +DEFINE_XEN_GUEST_HANDLE(xen_ulong_t); #endif /* @@ -101,6 +102,7 @@ #define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ +#define __HYPERVISOR_xenpmu_op 40 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 @@ -160,6 +162,7 @@ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ +#define VIRQ_XENPMU 13 /* V. PMC interrupt */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -277,15 +280,15 @@ * refer to Intel SDM 10.12. The PAT allows to set the caching attributes of * pages instead of using MTRRs. * - * The PAT MSR is as follow (it is a 64-bit value, each entry is 8 bits): - * PAT4 PAT0 - * +---+----+----+----+-----+----+----+ - * WC | WC | WB | UC | UC- | WC | WB | <= Linux - * +---+----+----+----+-----+----+----+ - * WC | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) - * +---+----+----+----+-----+----+----+ - * WC | WP | WC | UC | UC- | WT | WB | <= Xen - * +---+----+----+----+-----+----+----+ + * The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits): + * PAT4 PAT0 + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) + * +-----+-----+----+----+----+-----+----+----+ + * | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen + * +-----+-----+----+----+----+-----+----+----+ * * The lookup of this index table translates to looking up * Bit 7, Bit 4, and Bit 3 of val entry: @@ -319,41 +322,47 @@ /* * MMU EXTENDED OPERATIONS - * - * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. + * + * ` enum neg_errnoval + * ` HYPERVISOR_mmuext_op(mmuext_op_t uops[], + * ` unsigned int count, + * ` unsigned int *pdone, + * ` unsigned int foreigndom) + */ +/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. * A foreigndom (FD) can be specified (or DOMID_SELF for none). 
* Where the FD has some effect, it is described below. - * + * * cmd: MMUEXT_(UN)PIN_*_TABLE * mfn: Machine frame number to be (un)pinned as a p.t. page. * The frame must belong to the FD, if one is specified. - * + * * cmd: MMUEXT_NEW_BASEPTR * mfn: Machine frame number of new page-table base to install in MMU. - * + * * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only] * mfn: Machine frame number of new page-table base to install in MMU * when in user space. - * + * * cmd: MMUEXT_TLB_FLUSH_LOCAL * No additional arguments. Flushes local TLB. - * + * * cmd: MMUEXT_INVLPG_LOCAL * linear_addr: Linear address to be flushed from the local TLB. - * + * * cmd: MMUEXT_TLB_FLUSH_MULTI * vcpumask: Pointer to bitmap of VCPUs to be flushed. - * + * * cmd: MMUEXT_INVLPG_MULTI * linear_addr: Linear address to be flushed. * vcpumask: Pointer to bitmap of VCPUs to be flushed. - * + * * cmd: MMUEXT_TLB_FLUSH_ALL * No additional arguments. Flushes all VCPUs' TLBs. - * + * * cmd: MMUEXT_INVLPG_ALL * linear_addr: Linear address to be flushed from all VCPUs' TLBs. - * + * * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * @@ -360,7 +369,7 @@ * cmd: MMUEXT_FLUSH_CACHE_GLOBAL * No additional arguments. Writes back and flushes cache contents * on all CPUs in the system. - * + * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. @@ -375,6 +384,7 @@ * cmd: MMUEXT_[UN]MARK_SUPER * mfn: Machine frame number of head of superpage to be [un]marked. */ +/* ` enum mmuext_cmd { */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 #define MMUEXT_PIN_L3_TABLE 2 @@ -395,10 +405,11 @@ #define MMUEXT_FLUSH_CACHE_GLOBAL 18 #define MMUEXT_MARK_SUPER 19 #define MMUEXT_UNMARK_SUPER 20 +/* ` } */ #ifndef __ASSEMBLY__ struct mmuext_op { - unsigned int cmd; + unsigned int cmd; /* => enum mmuext_cmd */ union { /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */ @@ -423,9 +434,24 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); #endif +/* + * ` enum neg_errnoval + * ` HYPERVISOR_update_va_mapping(unsigned long va, u64 val, + * ` enum uvm_flags flags) + * ` + * ` enum neg_errnoval + * ` HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, u64 val, + * ` enum uvm_flags flags, + * ` domid_t domid) + * ` + * ` @va: The virtual address whose mapping we want to change + * ` @val: The new page table entry, must contain a machine address + * ` @flags: Control TLB flushes + */ /* These are passed as 'flags' to update_va_mapping. They can be ORed. */ /* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */ /* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */ +/* ` enum uvm_flags { */ #define UVMF_NONE (0UL<<0) /* No flushing at all. */ #define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */ #define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */ @@ -433,6 +459,7 @@ #define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */ #define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */ #define UVMF_ALL (1UL<<2) /* Flush all TLBs. */ +/* ` } */ /* * Commands to HYPERVISOR_console_io(). @@ -462,7 +489,21 @@ /* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 +/* + * x86/64 guests: strictly hide M2P from user mode. 
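/*
 * Illustrative sketch (editor's aside, not part of the diff above): flushing
 * one linear address from the local TLB with MMUEXT_INVLPG_LOCAL, following
 * the HYPERVISOR_mmuext_op() signature documented in the comment above. The
 * arg1.linear_addr member name and DOMID_SELF come from the full xen.h, not
 * from this hunk.
 */
static int
example_invlpg_local(unsigned long va)
{
    struct mmuext_op op;
    unsigned int done = 0;

    op.cmd = MMUEXT_INVLPG_LOCAL;
    op.arg1.linear_addr = va;   /* guest linear address to flush */

    /* One op in the batch, no foreign domain involved. */
    return (HYPERVISOR_mmuext_op(&op, 1, &done, DOMID_SELF));
}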
+ * This allows the guest to control respective hypervisor behavior: + * - when not set, L4 tables get created with the respective slot blank, + * and whenever the L4 table gets used as a kernel one the missing + * mapping gets inserted, + * - when set, L4 tables get created with the respective slot initialized + * as before, and whenever the L4 table gets used as a user one the + * mapping gets zapped. + */ +#define VMASST_TYPE_m2p_strict 32 + +#if __XEN_INTERFACE_VERSION__ < 0x00040600 #define MAX_VMASST_TYPE 3 +#endif #ifndef __ASSEMBLY__ @@ -515,21 +556,28 @@ DEFINE_XEN_GUEST_HANDLE(mmu_update_t); /* - * Send an array of these to HYPERVISOR_multicall(). - * NB. The fields are natural register size for this architecture. + * ` enum neg_errnoval + * ` HYPERVISOR_multicall(multicall_entry_t call_list[], + * ` uint32_t nr_calls); + * + * NB. The fields are logically the natural register size for this + * architecture. In cases where xen_ulong_t is larger than this then + * any unused bits in the upper portion must be zero. */ struct multicall_entry { - unsigned long op, result; - unsigned long args[6]; + xen_ulong_t op, result; + xen_ulong_t args[6]; }; typedef struct multicall_entry multicall_entry_t; DEFINE_XEN_GUEST_HANDLE(multicall_entry_t); +#if __XEN_INTERFACE_VERSION__ < 0x00040400 /* - * Event channel endpoints per domain: + * Event channel endpoints per domain (when using the 2-level ABI): * 1024 if a long is 32 bits; 4096 if a long is 64 bits. */ -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64) +#define NR_EVENT_CHANNELS EVTCHN_2L_NR_CHANNELS +#endif struct vcpu_time_info { /* @@ -585,8 +633,12 @@ * to block: this avoids wakeup-waiting races. */ uint8_t evtchn_upcall_pending; +#ifdef XEN_HAVE_PV_UPCALL_MASK uint8_t evtchn_upcall_mask; - unsigned long evtchn_pending_sel; +#else /* XEN_HAVE_PV_UPCALL_MASK */ + uint8_t pad0; +#endif /* XEN_HAVE_PV_UPCALL_MASK */ + xen_ulong_t evtchn_pending_sel; struct arch_vcpu_info arch; struct vcpu_time_info time; }; /* 64 bytes (x86) */ @@ -595,6 +647,7 @@ #endif /* + * `incontents 200 startofday_shared Start-of-day shared data structure * Xen/kernel shared data -- pointer provided in start_info. * * This structure is defined to be both smaller than a page, and the @@ -636,8 +689,8 @@ * per-vcpu selector word to be set. Each bit in the selector covers a * 'C long' in the PENDING bitfield array. */ - unsigned long evtchn_pending[sizeof(unsigned long) * 8]; - unsigned long evtchn_mask[sizeof(unsigned long) * 8]; + xen_ulong_t evtchn_pending[sizeof(xen_ulong_t) * 8]; + xen_ulong_t evtchn_mask[sizeof(xen_ulong_t) * 8]; /* * Wallclock time: updated only by control software. Guests should base @@ -646,6 +699,12 @@ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ +#if !defined(__i386__) + uint32_t wc_sec_hi; +# define xen_wc_sec_hi wc_sec_hi +#elif !defined(__XEN__) && !defined(__XEN_TOOLS__) +# define xen_wc_sec_hi arch.wc_sec_hi +#endif struct arch_shared_info arch; @@ -655,30 +714,43 @@ #endif /* - * Start-of-day memory layout: + * `incontents 200 startofday Start-of-day memory layout + * * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region ends on an aligned 4MB boundary. * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] + * (may be omitted) * c. 
list of allocated page frames [mfn_list, nr_pages] * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] - * e. bootstrap page tables [pt_base, CR3 (x86)] - * f. bootstrap stack [register ESP (x86)] + * in case of dom0 this page contains the console info, too + * e. unless dom0: xenstore ring page + * f. unless dom0: console ring page + * g. bootstrap page tables [pt_base and CR3 (x86)] + * h. bootstrap stack [register ESP (x86)] * 4. Bootstrap elements are packed together, but each is 4kB-aligned. - * 5. The initial ram disk may be omitted. - * 6. The list of page frames forms a contiguous 'pseudo-physical' memory + * 5. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. - * 7. All bootstrap elements are mapped read-writable for the guest OS. The + * 6. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. - * 8. There is guaranteed to be at least 512kB padding after the final + * 7. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. + * + * Note: Prior to 25833:bb85bbccb1c9. ("x86/32-on-64 adjust Dom0 initial page + * table layout") a bug caused the pt_base (3.g above) and cr3 to not point + * to the start of the guest page tables (it was offset by two pages). + * This only manifested itself on 32-on-64 dom0 kernels and not 32-on-64 domU + * or 64-bit kernels of any colour. The page tables for a 32-on-64 dom0 got + * allocated in the order: 'first L1','first L2', 'first L3', so the offset + * to the page table base is by two pages back. The initial domain if it is + * 32-bit and runs under a 64-bit hypervisor should _NOT_ use two of the + * pages preceding pt_base and mark them as reserved/unused. */ - -#define MAX_GUEST_CMDLINE 1024 +#ifdef XEN_HAVE_PV_GUEST_ENTRY struct start_info { /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ char magic[32]; /* "xen-<version>-<platform>". */ @@ -705,6 +777,7 @@ /* (PFN of pre-loaded module if */ /* SIF_MOD_START_PFN set in flags). */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ +#define MAX_GUEST_CMDLINE 1024 int8_t cmd_line[MAX_GUEST_CMDLINE]; /* The pfn range here covers both page table and p->m table frames. */ unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ @@ -717,6 +790,7 @@ #define console_mfn console.domU.mfn #define console_evtchn console.domU.evtchn #endif +#endif /* XEN_HAVE_PV_GUEST_ENTRY */ /* These flags are passed in the 'flags' field of start_info_t. */ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ @@ -723,6 +797,8 @@ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ #define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ #define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */ + /* P->M making the 3 level tree obsolete? 
*/ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /* @@ -750,7 +826,14 @@ /* Unused, must be zero */ uint32_t pad; }; - +/* + * `incontents 200 startofday_dom0_console Dom0_console + * + * The console structure in start_info.console.dom0 + * + * This structure includes a variety of information required to + * have a working VGA/VESA console. + */ typedef struct dom0_vga_console_info { uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */ #define XEN_VGATYPE_TEXT_MODE_3 0x03 @@ -815,6 +898,9 @@ /* Default definitions for macros used by domctl/sysctl. */ #if defined(__XEN__) || defined(__XEN_TOOLS__) +#ifndef int64_aligned_t +#define int64_aligned_t int64_t +#endif #ifndef uint64_aligned_t #define uint64_aligned_t uint64_t #endif @@ -823,9 +909,9 @@ #endif #ifndef __ASSEMBLY__ -struct xenctl_cpumap { +struct xenctl_bitmap { XEN_GUEST_HANDLE_64(uint8) bitmap; - uint32_t nr_cpus; + uint32_t nr_bits; }; #endif @@ -836,7 +922,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil Modified: trunk/sys/xen/interface/xenoprof.h =================================================================== --- trunk/sys/xen/interface/xenoprof.h 2020-02-08 19:27:58 UTC (rev 12305) +++ trunk/sys/xen/interface/xenoprof.h 2020-02-08 19:28:08 UTC (rev 12306) @@ -145,7 +145,7 @@ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil From laffer1 at midnightbsd.org Sat Feb 8 14:28:38 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:28:38 -0500 (EST) Subject: [Midnightbsd-cvs] src [12307] trunk/sys/xen/xenmem: sync with FreeBSD 11-stable Message-ID: <202002081928.018JScT7060978@stargazer.midnightbsd.org> Revision: 12307 http://svnweb.midnightbsd.org/src/?rev=12307 Author: laffer1 Date: 2020-02-08 14:28:38 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Added Paths: ----------- trunk/sys/xen/xenmem/ trunk/sys/xen/xenmem/xenmem_if.m Added: trunk/sys/xen/xenmem/xenmem_if.m =================================================================== --- trunk/sys/xen/xenmem/xenmem_if.m (rev 0) +++ trunk/sys/xen/xenmem/xenmem_if.m 2020-02-08 19:28:38 UTC (rev 12307) @@ -0,0 +1,95 @@ +#- +# Copyright (c) 2015 Roger Pau Monn? <royger at FreeBSD.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD: stable/11/sys/xen/xenmem/xenmem_if.m 282634 2015-05-08 14:48:40Z royger $ +# $MidnightBSD$ + +#include <sys/bus.h> + +INTERFACE xenmem; + +# +# Default implementations of some methods. +# +CODE { + static struct resource * + xenmem_generic_alloc(device_t dev, device_t child, int *res_id, + size_t size) + { + device_t parent; + + parent = device_get_parent(dev); + if (parent == NULL) + return (NULL); + return (XENMEM_ALLOC(parent, child, res_id, size)); + } + + static int + xenmem_generic_free(device_t dev, device_t child, int res_id, + struct resource *res) + { + device_t parent; + + parent = device_get_parent(dev); + if (parent == NULL) + return (ENXIO); + return (XENMEM_FREE(parent, child, res_id, res)); + } +}; + +/** + * @brief Request for unused physical memory regions. + * + * @param _dev the device whose child was being probed. + * @param _child the child device which failed to probe. + * @param _res_id a pointer to the resource identifier. + * @param _size size of the required memory region. + * + * @returns the resource which was allocated or @c NULL if no + * resource could be allocated. + */ +METHOD struct resource * alloc { + device_t _dev; + device_t _child; + int *_res_id; + size_t _size; +} DEFAULT xenmem_generic_alloc; + +/** + * @brief Free physical memory regions. + * + * @param _dev the device whose child was being probed. + * @param _child the child device which failed to probe. + * @param _res_id the resource identifier. + * @param _res the resource. + * + * @returns 0 on success, otherwise an error code. + */ +METHOD int free { + device_t _dev; + device_t _child; + int _res_id; + struct resource *_res; +} DEFAULT xenmem_generic_free; Property changes on: trunk/sys/xen/xenmem/xenmem_if.m ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:28:55 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:28:55 -0500 (EST) Subject: [Midnightbsd-cvs] src [12308] trunk/sys/xen/evtchn/evtchnvar.h: sync with FreeBSD 11-stable Message-ID: <202002081928.018JStBX061035@stargazer.midnightbsd.org> Revision: 12308 http://svnweb.midnightbsd.org/src/?rev=12308 Author: laffer1 Date: 2020-02-08 14:28:55 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/evtchn/evtchnvar.h Modified: trunk/sys/xen/evtchn/evtchnvar.h =================================================================== --- trunk/sys/xen/evtchn/evtchnvar.h 2020-02-08 19:28:38 UTC (rev 12307) +++ trunk/sys/xen/evtchn/evtchnvar.h 2020-02-08 19:28:55 UTC (rev 12308) @@ -29,7 +29,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
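/*
 * Illustrative sketch (editor's aside, not part of the diffs above): a child
 * driver asking its parent bus for an unused physical memory region through
 * the xenmem interface added in xenmem_if.m above. The XENMEM_ALLOC()
 * accessor is generated from that .m file; the four-page size is an
 * arbitrary example, and routing through the parent device mirrors the
 * generic implementation shown in the CODE block.
 */
static struct resource *
example_reserve_window(device_t dev, int *res_id)
{

    /* Unused physical address space, e.g. for mapping foreign memory. */
    return (XENMEM_ALLOC(device_get_parent(dev), dev, res_id, 4 * PAGE_SIZE));
}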
* - * $FreeBSD: stable/10/sys/xen/evtchn/evtchnvar.h 255040 2013-08-29 19:52:18Z gibbs $ + * $FreeBSD: stable/11/sys/xen/evtchn/evtchnvar.h 255040 2013-08-29 19:52:18Z gibbs $ */ #ifndef __XEN_EVTCHN_EVTCHNVAR_H__ From laffer1 at midnightbsd.org Sat Feb 8 14:29:01 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:29:01 -0500 (EST) Subject: [Midnightbsd-cvs] src [12309] trunk/sys/xen: sync with FreeBSD 11-stable Message-ID: <202002081929.018JT1kQ061079@stargazer.midnightbsd.org> Revision: 12309 http://svnweb.midnightbsd.org/src/?rev=12309 Author: laffer1 Date: 2020-02-08 14:29:01 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xen/blkif.h trunk/sys/xen/evtchn.h trunk/sys/xen/features.c trunk/sys/xen/gnttab.h trunk/sys/xen/hvm.h trunk/sys/xen/hypervisor.h trunk/sys/xen/xen-os.h trunk/sys/xen/xen_intr.h trunk/sys/xen/xenbus/xenbus.c trunk/sys/xen/xenbus/xenbus_if.m trunk/sys/xen/xenbus/xenbusb.c trunk/sys/xen/xenbus/xenbusb.h trunk/sys/xen/xenbus/xenbusb_back.c trunk/sys/xen/xenbus/xenbusb_front.c trunk/sys/xen/xenbus/xenbusb_if.m trunk/sys/xen/xenbus/xenbusvar.h trunk/sys/xen/xenstore/xenstore_internal.h trunk/sys/xen/xenstore/xenstorevar.h Added Paths: ----------- trunk/sys/xen/error.h trunk/sys/xen/privcmd.h trunk/sys/xen/xen_msi.h trunk/sys/xen/xen_pci.h trunk/sys/xen/xen_pv.h Modified: trunk/sys/xen/blkif.h =================================================================== --- trunk/sys/xen/blkif.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/blkif.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -18,7 +18,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * - * $FreeBSD: stable/10/sys/xen/blkif.h 285738 2015-07-21 07:22:18Z royger $ + * $FreeBSD: stable/11/sys/xen/blkif.h 289686 2015-10-21 10:44:07Z royger $ */ #ifndef __XEN_BLKIF_H__ @@ -121,7 +121,7 @@ dst->handle = src->handle; dst->id = src->id; dst->sector_number = src->sector_number; - barrier(); + __compiler_membar(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) @@ -136,7 +136,7 @@ dst->handle = src->handle; dst->id = src->id; dst->sector_number = src->sector_number; - barrier(); + __compiler_membar(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) Added: trunk/sys/xen/error.h =================================================================== --- trunk/sys/xen/error.h (rev 0) +++ trunk/sys/xen/error.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -0,0 +1,102 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2014 Roger Pau Monn? <royger at FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/xen/error.h 301195 2016-06-02 07:45:01Z royger $ + */ + +#ifndef __XEN_ERROR_H__ +#define __XEN_ERROR_H__ + +#include <xen/interface/errno.h> + +/* Translation table */ +static int xen_errors[] = +{ + [XEN_EPERM] = EPERM, + [XEN_ENOENT] = ENOENT, + [XEN_ESRCH] = ESRCH, + [XEN_EIO] = EIO, + [XEN_ENXIO] = ENXIO, + [XEN_E2BIG] = E2BIG, + [XEN_ENOEXEC] = ENOEXEC, + [XEN_EBADF] = EBADF, + [XEN_ECHILD] = ECHILD, + [XEN_EAGAIN] = EAGAIN, + [XEN_ENOMEM] = ENOMEM, + [XEN_EACCES] = EACCES, + [XEN_EFAULT] = EFAULT, + [XEN_EBUSY] = EBUSY, + [XEN_EEXIST] = EEXIST, + [XEN_EXDEV] = EXDEV, + [XEN_ENODEV] = ENODEV, + [XEN_EINVAL] = EINVAL, + [XEN_ENFILE] = ENFILE, + [XEN_EMFILE] = EMFILE, + [XEN_ENOSPC] = ENOSPC, + [XEN_EMLINK] = EMLINK, + [XEN_EDOM] = EDOM, + [XEN_ERANGE] = ERANGE, + [XEN_EDEADLK] = EDEADLK, + [XEN_ENAMETOOLONG] = ENAMETOOLONG, + [XEN_ENOLCK] = ENOLCK, + [XEN_ENOSYS] = ENOSYS, + [XEN_ENODATA] = ENOENT, + [XEN_ETIME] = ETIMEDOUT, + [XEN_EBADMSG] = EBADMSG, + [XEN_EOVERFLOW] = EOVERFLOW, + [XEN_EILSEQ] = EILSEQ, + [XEN_ENOTSOCK] = ENOTSOCK, + [XEN_EOPNOTSUPP] = EOPNOTSUPP, + [XEN_EADDRINUSE] = EADDRINUSE, + [XEN_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [XEN_ENOBUFS] = ENOBUFS, + [XEN_EISCONN] = EISCONN, + [XEN_ENOTCONN] = ENOTCONN, + [XEN_ETIMEDOUT] = ETIMEDOUT, +}; + +static inline int +xen_translate_error(int error) +{ + int bsd_error; + + KASSERT((error < 0), ("Value is not a valid Xen error code")); + + if (-error >= nitems(xen_errors)) { + /* + * We received an error value that cannot be translated, + * return EINVAL. + */ + return (EINVAL); + } + + bsd_error = xen_errors[-error]; + KASSERT((bsd_error != 0), ("Unknown Xen error code")); + + return (bsd_error); +} + +#endif /* !__XEN_ERROR_H__ */ Property changes on: trunk/sys/xen/error.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/evtchn.h =================================================================== --- trunk/sys/xen/evtchn.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/evtchn.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -27,7 +27,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
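/*
 * Illustrative sketch (editor's aside, not part of the diffs above): using
 * the xen_translate_error() helper from the new error.h to turn the negative
 * Xen errno returned by a hypercall into a FreeBSD errno. EVTCHNOP_close,
 * struct evtchn_close and HYPERVISOR_event_channel_op() are standard
 * interfaces from event_channel.h and hypercall.h, assumed to be in scope.
 */
static int
example_close_port(evtchn_port_t port)
{
    struct evtchn_close close = { .port = port };
    int error;

    error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
    return (error < 0 ? xen_translate_error(error) : 0);
}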
* - * $FreeBSD: stable/10/sys/xen/evtchn.h 255040 2013-08-29 19:52:18Z gibbs $ + * $FreeBSD: stable/11/sys/xen/evtchn.h 255040 2013-08-29 19:52:18Z gibbs $ */ #ifndef __XEN_EVTCHN_H__ Modified: trunk/sys/xen/features.c =================================================================== --- trunk/sys/xen/features.c 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/features.c 2020-02-08 19:29:01 UTC (rev 12309) @@ -1,6 +1,6 @@ /* $MidnightBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/xen/features.c 255040 2013-08-29 19:52:18Z gibbs $"); +__FBSDID("$FreeBSD: stable/11/sys/xen/features.c 255040 2013-08-29 19:52:18Z gibbs $"); #include <sys/param.h> #include <sys/systm.h> Modified: trunk/sys/xen/gnttab.h =================================================================== --- trunk/sys/xen/gnttab.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/gnttab.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -36,6 +36,7 @@ */ #ifndef __ASM_GNTTAB_H__ +#define __ASM_GNTTAB_H__ #include <xen/xen-os.h> #include <xen/hypervisor.h> @@ -52,8 +53,6 @@ uint16_t count; }; -int gnttab_init(void); - /* * Allocate a grant table reference and return it in *result. Returns * zero on success or errno on error. @@ -117,7 +116,7 @@ unsigned long pfn); int gnttab_suspend(void); -int gnttab_resume(void); +int gnttab_resume(device_t); #if 0 @@ -129,10 +128,8 @@ { if (flags & GNTMAP_contains_pte) map->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else map->host_addr = vtophys(addr); - else - map->host_addr = addr; map->flags = flags; map->ref = ref; @@ -145,10 +142,8 @@ { if (flags & GNTMAP_contains_pte) unmap->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else unmap->host_addr = vtophys(addr); - else - unmap->host_addr = addr; unmap->handle = handle; unmap->dev_bus_addr = 0; @@ -158,13 +153,8 @@ gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr, vm_paddr_t new_addr, grant_handle_t handle) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { - unmap->host_addr = vtophys(addr); - unmap->new_addr = vtophys(new_addr); - } else { - unmap->host_addr = addr; - unmap->new_addr = new_addr; - } + unmap->host_addr = vtophys(addr); + unmap->new_addr = vtophys(new_addr); unmap->handle = handle; } Modified: trunk/sys/xen/hvm.h =================================================================== --- trunk/sys/xen/hvm.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/hvm.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -18,7 +18,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
* - * $FreeBSD: stable/10/sys/xen/hvm.h 255744 2013-09-20 22:59:22Z gibbs $ + * $FreeBSD: stable/11/sys/xen/hvm.h 255744 2013-09-20 22:59:22Z gibbs $ */ #ifndef __XEN_HVM_H__ Modified: trunk/sys/xen/hypervisor.h =================================================================== --- trunk/sys/xen/hypervisor.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/hypervisor.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -6,28 +6,12 @@ * * Copyright (c) 2002, K A Fraser * - * $FreeBSD: stable/10/sys/xen/hypervisor.h 196322 2009-08-17 14:38:59Z jhb $ + * $FreeBSD: stable/11/sys/xen/hypervisor.h 289686 2015-10-21 10:44:07Z royger $ */ #ifndef __XEN_HYPERVISOR_H__ #define __XEN_HYPERVISOR_H__ -#ifdef XENHVM - -#define is_running_on_xen() (HYPERVISOR_shared_info != NULL) - -#else - -#define is_running_on_xen() 1 - -#endif - -#ifdef PAE -#ifndef CONFIG_X86_PAE -#define CONFIG_X86_PAE -#endif -#endif - #include <sys/cdefs.h> #include <sys/systm.h> #include <xen/interface/xen.h> @@ -39,32 +23,14 @@ #include <xen/interface/memory.h> #include <machine/xen/hypercall.h> -#if defined(__amd64__) -#define MULTI_UVMFLAGS_INDEX 2 -#define MULTI_UVMDOMID_INDEX 3 -#else -#define MULTI_UVMFLAGS_INDEX 3 -#define MULTI_UVMDOMID_INDEX 4 -#endif - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) -#else -#define is_initial_xendomain() 0 -#endif - -extern start_info_t *xen_start_info; - extern uint64_t get_system_time(int ticks); static inline int -HYPERVISOR_console_write(char *str, int count) +HYPERVISOR_console_write(const char *str, int count) { return HYPERVISOR_console_io(CONSOLEIO_write, count, str); } -static inline void HYPERVISOR_crash(void) __dead2; - static inline int HYPERVISOR_yield(void) { @@ -133,23 +99,4 @@ return (rc); } -static inline void -MULTI_update_va_mapping( - multicall_entry_t *mcl, unsigned long va, - uint64_t new_val, unsigned long flags) -{ - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; -#if defined(__amd64__) - mcl->args[1] = new_val; -#elif defined(PAE) - mcl->args[1] = (uint32_t)(new_val & 0xffffffff) ; - mcl->args[2] = (uint32_t)(new_val >> 32); -#else - mcl->args[1] = new_val; - mcl->args[2] = 0; -#endif - mcl->args[MULTI_UVMFLAGS_INDEX] = flags; -} - #endif /* __XEN_HYPERVISOR_H__ */ Added: trunk/sys/xen/privcmd.h =================================================================== --- trunk/sys/xen/privcmd.h (rev 0) +++ trunk/sys/xen/privcmd.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -0,0 +1,59 @@ +/* $MidnightBSD$ */ +/****************************************************************************** + * privcmd.h + * + * Interface to /proc/xen/privcmd. 
+ * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * $FreeBSD: stable/11/sys/xen/privcmd.h 273476 2014-10-22 17:07:20Z royger $ + */ + +#ifndef __XEN_PRIVCMD_H__ +#define __XEN_PRIVCMD_H__ + +struct ioctl_privcmd_hypercall +{ + unsigned long op; /* hypercall number */ + unsigned long arg[5]; /* arguments */ + long retval; /* return value */ +}; + +struct ioctl_privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + unsigned long addr; /* virtual address */ + const xen_pfn_t *arr; /* array of mfns */ + int *err; /* array of error codes */ +}; + +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOWR('E', 0, struct ioctl_privcmd_hypercall) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOWR('E', 1, struct ioctl_privcmd_mmapbatch) + +#endif /* !__XEN_PRIVCMD_H__ */ Property changes on: trunk/sys/xen/privcmd.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/xen-os.h =================================================================== --- trunk/sys/xen/xen-os.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xen-os.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -25,7 +25,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * - * $FreeBSD: stable/10/sys/xen/xen-os.h 315676 2017-03-21 09:38:59Z royger $ + * $FreeBSD: stable/11/sys/xen/xen-os.h 315668 2017-03-21 08:38:12Z royger $ */ #ifndef _XEN_XEN_OS_H_ @@ -48,15 +48,14 @@ /* Everything below this point is not included by assembler (.S) files. */ #ifndef __ASSEMBLY__ -/* Force a proper event-channel callback from Xen. 
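/*
 * Illustrative sketch (editor's aside, not part of the diffs above): a
 * userspace tool issuing a hypercall through the privcmd character device
 * with IOCTL_PRIVCMD_HYPERCALL from the new privcmd.h. The /dev/xen/privcmd
 * node path and the numeric values (__HYPERVISOR_xen_version == 17,
 * XENVER_version == 0) come from the wider Xen headers and the FreeBSD
 * privcmd driver, not from these hunks.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <xen/privcmd.h>

static long
example_xen_version(void)
{
    struct ioctl_privcmd_hypercall hc = {
        .op = 17,                   /* __HYPERVISOR_xen_version */
        .arg = { 0 /* XENVER_version */ },
    };
    int fd, rc;

    fd = open("/dev/xen/privcmd", O_RDWR);
    if (fd < 0)
        return (-1);
    rc = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc);
    close(fd);

    /* On success, retval holds (major << 16) | minor. */
    return (rc == 0 ? hc.retval : -1);
}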
*/ -void force_evtchn_callback(void); - extern shared_info_t *HYPERVISOR_shared_info; +extern start_info_t *HYPERVISOR_start_info; -#ifdef XENHVM +/* XXX: we need to get rid of this and use HYPERVISOR_start_info directly */ +extern char *console_page; + extern int xen_disable_pv_disks; extern int xen_disable_pv_nics; -#endif extern bool xen_suspend_cancelled; @@ -86,6 +85,54 @@ return (xen_domain_type == XEN_HVM_DOMAIN); } +static inline bool +xen_initial_domain(void) +{ + return (xen_domain() && HYPERVISOR_start_info != NULL && + (HYPERVISOR_start_info->flags & SIF_INITDOMAIN) != 0); +} + +/* + * Based on ofed/include/linux/bitops.h + * + * Those helpers are prefixed by xen_ because xen-os.h is widely included + * and we don't want the other drivers using them. + * + */ +#define NBPL (NBBY * sizeof(long)) + +static inline bool +xen_test_bit(int bit, volatile long *addr) +{ + unsigned long mask = 1UL << (bit % NBPL); + + return !!(atomic_load_acq_long(&addr[bit / NBPL]) & mask); +} + +static inline void +xen_set_bit(int bit, volatile long *addr) +{ + atomic_set_long(&addr[bit / NBPL], 1UL << (bit % NBPL)); +} + +static inline void +xen_clear_bit(int bit, volatile long *addr) +{ + atomic_clear_long(&addr[bit / NBPL], 1UL << (bit % NBPL)); +} + +#undef NBPL + +/* + * Functions to allocate/free unused memory in order + * to map memory from other domains. + */ +struct resource *xenmem_alloc(device_t dev, int *res_id, size_t size); +int xenmem_free(device_t dev, int res_id, struct resource *res); + +/* Debug/emergency function, prints directly to hypervisor console */ +void xc_printf(const char *, ...) __printflike(1, 2); + #ifndef xen_mb #define xen_mb() mb() #endif Modified: trunk/sys/xen/xen_intr.h =================================================================== --- trunk/sys/xen/xen_intr.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xen_intr.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -29,16 +29,12 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * - * $FreeBSD: stable/10/sys/xen/xen_intr.h 255331 2013-09-06 22:17:02Z gibbs $ + * $FreeBSD: stable/11/sys/xen/xen_intr.h 340016 2018-11-01 18:34:26Z jhb $ */ #ifndef _XEN_INTR_H_ #define _XEN_INTR_H_ -#ifndef __XEN_EVTCHN_PORT_DEFINED__ -typedef uint32_t evtchn_port_t; -DEFINE_XEN_GUEST_HANDLE(evtchn_port_t); -#define __XEN_EVTCHN_PORT_DEFINED__ 1 -#endif +#include <xen/interface/event_channel.h> /** Registered Xen interrupt callback handle. */ typedef void * xen_intr_handle_t; @@ -46,6 +42,8 @@ /** If non-zero, the hypervisor has been configured to use a direct vector */ extern int xen_vector_callback_enabled; +void xen_intr_handle_upcall(struct trapframe *trap_frame); + /** * Associate an already allocated local event channel port an interrupt * handler. @@ -146,7 +144,6 @@ * interupts and, if successful, associate the port with the specified * interrupt handler. * - * \param dev The device making this bind request. * \param cpu The cpu receiving the IPI. * \param filter The interrupt filter servicing this IPI. * \param irqflags Interrupt handler flags. See sys/bus.h. @@ -155,11 +152,23 @@ * * \returns 0 on success, otherwise an errno. */ -int xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu, +int xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter, enum intr_type irqflags, xen_intr_handle_t *handlep); /** + * Register a physical interrupt vector and setup the interrupt source. + * + * \param vector The global vector to use. + * \param trig Default trigger method. 
+ * \param pol Default polarity of the interrupt. + * + * \returns 0 on success, otherwise an errno. + */ +int xen_register_pirq(int vector, enum intr_trigger trig, + enum intr_polarity pol); + +/** * Unbind an interrupt handler from its interrupt source. * * \param handlep A pointer to the opaque handle that was initialized @@ -213,4 +222,55 @@ */ evtchn_port_t xen_intr_port(xen_intr_handle_t handle); +/** + * Setup MSI vector interrupt(s). + * + * \param dev The device that requests the binding. + * + * \param vector Requested initial vector to bind the MSI interrupt(s) to. + * + * \param count Number of vectors to allocate. + * + * \returns 0 on success, otherwise an errno. + */ +int xen_register_msi(device_t dev, int vector, int count); + +/** + * Teardown a MSI vector interrupt. + * + * \param vector Requested vector to release. + * + * \returns 0 on success, otherwise an errno. + */ +int xen_release_msi(int vector); + +/** + * Bind an event channel port with a handler + * + * \param dev The device making this bind request. + * \param filter An interrupt filter handler. Specify NULL + * to always dispatch to the ithread handler. + * \param handler An interrupt ithread handler. Optional (can + * specify NULL) if all necessary event actions + * are performed by filter. + * \param arg Argument to present to both filter and handler. + * \param irqflags Interrupt handler flags. See sys/bus.h. + * \param handle Opaque handle used to manage this registration. + * + * \returns 0 on success, otherwise an errno. + */ +int xen_intr_add_handler(const char *name, driver_filter_t filter, + driver_intr_t handler, void *arg, enum intr_type flags, + xen_intr_handle_t handle); + +/** + * Register the IO-APIC PIRQs when running in legacy PVH Dom0 mode. + * + * \param pic PIC instance. + * + * NB: this should be removed together with the support for legacy PVH mode. + */ +struct pic; +void xenpv_register_pirqs(struct pic *pic); + #endif /* _XEN_INTR_H_ */ Added: trunk/sys/xen/xen_msi.h =================================================================== --- trunk/sys/xen/xen_msi.h (rev 0) +++ trunk/sys/xen/xen_msi.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -0,0 +1,40 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
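/*
 * Illustrative sketch (editor's aside, not part of the diffs above):
 * attaching an ithread handler to an event channel that was previously bound
 * with one of the xen_intr_*_bind_*() routines, using the
 * xen_intr_add_handler() prototype documented above. The "xbd0" name and the
 * xbd_intr() stub are made up for the example.
 */
static void
xbd_intr(void *arg)
{
    /* Acknowledge the event and process the device's completed work here. */
}

static int
example_attach_handler(xen_intr_handle_t handle, void *softc)
{

    return (xen_intr_add_handler("xbd0", NULL /* no filter */, xbd_intr,
        softc, INTR_TYPE_BIO | INTR_MPSAFE, handle));
}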
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/xen/xen_msi.h 276767 2015-01-06 21:26:35Z imp $ + */ + +#ifndef __XEN_MSI_H__ +#define __XEN_MSI_H__ + +void xen_msi_init(void); +int xen_msi_map(int irq, uint64_t *addr, uint32_t *data); +int xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs); +int xen_msi_release(int *irqs, int count); +int xen_msix_alloc(device_t dev, int *irq); +int xen_msix_release(int irq); + +#endif /* !__XEN_MSI_H__ */ Property changes on: trunk/sys/xen/xen_msi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/xen/xen_pci.h =================================================================== --- trunk/sys/xen/xen_pci.h (rev 0) +++ trunk/sys/xen/xen_pci.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -0,0 +1,38 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: stable/11/sys/xen/xen_pci.h 275649 2014-12-09 18:03:25Z royger $ + */ + +#ifndef __XEN_PCI_H__ +#define __XEN_PCI_H__ + +void xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address, + uint16_t data); +void xen_pci_disable_msi_method(device_t dev, device_t child); +void xen_pci_child_added_method(device_t dev, device_t child); + +#endif /* !__XEN_PCI_H__ */ Property changes on: trunk/sys/xen/xen_pci.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/xen/xen_pv.h =================================================================== --- trunk/sys/xen/xen_pv.h (rev 0) +++ trunk/sys/xen/xen_pv.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -0,0 +1,35 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: stable/11/sys/xen/xen_pv.h 267536 2014-06-16 08:54:04Z royger $ + */ + +#ifndef __XEN_PV_H__ +#define __XEN_PV_H__ + +extern struct apic_ops xen_apic_ops; + +#endif Property changes on: trunk/sys/xen/xen_pv.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/xen/xenbus/xenbus.c =================================================================== --- trunk/sys/xen/xenbus/xenbus.c 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbus.c 2020-02-08 19:29:01 UTC (rev 12309) @@ -41,7 +41,7 @@ #endif #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbus.c 255040 2013-08-29 19:52:18Z gibbs $"); +__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbus.c 255040 2013-08-29 19:52:18Z gibbs $"); #include <sys/cdefs.h> #include <sys/param.h> Modified: trunk/sys/xen/xenbus/xenbus_if.m =================================================================== --- trunk/sys/xen/xenbus/xenbus_if.m 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbus_if.m 2020-02-08 19:29:01 UTC (rev 12309) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ #- # Copyright (c) 2008 Doug Rabson # All rights reserved. @@ -23,8 +24,8 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # -# $FreeBSD: stable/10/sys/xen/xenbus/xenbus_if.m 255040 2013-08-29 19:52:18Z gibbs $ -# $MidnightBSD$ +# $FreeBSD: stable/11/sys/xen/xenbus/xenbus_if.m 255040 2013-08-29 19:52:18Z gibbs $ +# #include <sys/bus.h> Modified: trunk/sys/xen/xenbus/xenbusb.c =================================================================== --- trunk/sys/xen/xenbus/xenbusb.c 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusb.c 2020-02-08 19:29:01 UTC (rev 12309) @@ -53,7 +53,7 @@ * xnb1 */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb.c 315676 2017-03-21 09:38:59Z royger $"); +__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb.c 315668 2017-03-21 08:38:12Z royger $"); #include <sys/param.h> #include <sys/bus.h> @@ -331,7 +331,7 @@ default: return (EINVAL); } - return (SYSCTL_OUT(req, value, strlen(value))); + return (SYSCTL_OUT_STR(req, value)); } /** Modified: trunk/sys/xen/xenbus/xenbusb.h =================================================================== --- trunk/sys/xen/xenbus/xenbusb.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusb.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -31,7 +31,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * - * $FreeBSD: stable/10/sys/xen/xenbus/xenbusb.h 222975 2011-06-11 04:59:01Z gibbs $ + * $FreeBSD: stable/11/sys/xen/xenbus/xenbusb.h 222975 2011-06-11 04:59:01Z gibbs $ */ #ifndef _XEN_XENBUS_XENBUSB_H #define _XEN_XENBUS_XENBUSB_H Modified: trunk/sys/xen/xenbus/xenbusb_back.c =================================================================== --- trunk/sys/xen/xenbus/xenbusb_back.c 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusb_back.c 2020-02-08 19:29:01 UTC (rev 12309) @@ -37,7 +37,7 @@ * Xen split devices. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb_back.c 225704 2011-09-20 23:44:34Z gibbs $"); +__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb_back.c 225704 2011-09-20 23:44:34Z gibbs $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/xen/xenbus/xenbusb_front.c =================================================================== --- trunk/sys/xen/xenbus/xenbusb_front.c 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusb_front.c 2020-02-08 19:29:01 UTC (rev 12309) @@ -37,7 +37,7 @@ * Xen split devices. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb_front.c 255040 2013-08-29 19:52:18Z gibbs $"); +__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb_front.c 255040 2013-08-29 19:52:18Z gibbs $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/xen/xenbus/xenbusb_if.m =================================================================== --- trunk/sys/xen/xenbus/xenbusb_if.m 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusb_if.m 2020-02-08 19:29:01 UTC (rev 12309) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ #- # Copyright (c) 2010 Spectra Logic Corporation # All rights reserved. @@ -27,8 +28,8 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGES. # -# $FreeBSD: stable/10/sys/xen/xenbus/xenbusb_if.m 222975 2011-06-11 04:59:01Z gibbs $ -# $MidnightBSD$ +# $FreeBSD: stable/11/sys/xen/xenbus/xenbusb_if.m 222975 2011-06-11 04:59:01Z gibbs $ +# #include <sys/bus.h> #include <sys/lock.h> Modified: trunk/sys/xen/xenbus/xenbusvar.h =================================================================== --- trunk/sys/xen/xenbus/xenbusvar.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenbus/xenbusvar.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -24,7 +24,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * - * $FreeBSD: stable/10/sys/xen/xenbus/xenbusvar.h 255040 2013-08-29 19:52:18Z gibbs $ + * $FreeBSD: stable/11/sys/xen/xenbus/xenbusvar.h 294090 2016-01-15 14:34:31Z royger $ */ /** @@ -83,7 +83,13 @@ }; /** - * Simplified accessors for xenbus devices + * Simplified accessors for xenbus devices: + * + * xenbus_get_node + * xenbus_get_type + * xenbus_get_state + * xenbus_get_otherend_id + * xenbus_get_otherend_path */ #define XENBUS_ACCESSOR(var, ivar, type) \ __BUS_ACCESSOR(xenbus, var, XENBUS, ivar, type) Modified: trunk/sys/xen/xenstore/xenstore_internal.h =================================================================== --- trunk/sys/xen/xenstore/xenstore_internal.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenstore/xenstore_internal.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -30,11 +30,8 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * - * $FreeBSD: stable/10/sys/xen/xenstore/xenstore_internal.h 214077 2010-10-19 20:53:30Z gibbs $ + * $FreeBSD: stable/11/sys/xen/xenstore/xenstore_internal.h 272318 2014-09-30 17:31:04Z royger $ */ -/* Initialize support for userspace access to the XenStore. */ -void xs_dev_init(void); - /* Used by the XenStore character device to borrow kernel's store connection. 
*/ int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result); Modified: trunk/sys/xen/xenstore/xenstorevar.h =================================================================== --- trunk/sys/xen/xenstore/xenstorevar.h 2020-02-08 19:28:55 UTC (rev 12308) +++ trunk/sys/xen/xenstore/xenstorevar.h 2020-02-08 19:29:01 UTC (rev 12309) @@ -29,7 +29,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * - * $FreeBSD: stable/10/sys/xen/xenstore/xenstorevar.h 315675 2017-03-21 09:27:24Z royger $ + * $FreeBSD: stable/11/sys/xen/xenstore/xenstorevar.h 315667 2017-03-21 08:36:25Z royger $ */ #ifndef _XEN_XENSTORE_XENSTOREVAR_H From laffer1 at midnightbsd.org Sat Feb 8 14:32:42 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:32:42 -0500 (EST) Subject: [Midnightbsd-cvs] src [12310] trunk/sys/x86: sync with FreeBSD 11-stable Message-ID: <202002081932.018JWgDh061873@stargazer.midnightbsd.org> Revision: 12310 http://svnweb.midnightbsd.org/src/?rev=12310 Author: laffer1 Date: 2020-02-08 14:32:41 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/x86/iommu/busdma_dmar.c trunk/sys/x86/iommu/busdma_dmar.h trunk/sys/x86/iommu/intel_ctx.c trunk/sys/x86/iommu/intel_dmar.h trunk/sys/x86/iommu/intel_drv.c trunk/sys/x86/iommu/intel_fault.c trunk/sys/x86/iommu/intel_gas.c trunk/sys/x86/iommu/intel_idpgtbl.c trunk/sys/x86/iommu/intel_qi.c trunk/sys/x86/iommu/intel_quirks.c trunk/sys/x86/iommu/intel_reg.h trunk/sys/x86/iommu/intel_utils.c trunk/sys/x86/isa/atpic.c trunk/sys/x86/isa/atrtc.c trunk/sys/x86/isa/clock.c trunk/sys/x86/isa/elcr.c trunk/sys/x86/isa/icu.h trunk/sys/x86/isa/isa.c trunk/sys/x86/isa/isa_dma.c trunk/sys/x86/isa/nmi.c trunk/sys/x86/isa/orm.c trunk/sys/x86/pci/pci_bus.c trunk/sys/x86/pci/qpi.c trunk/sys/x86/x86/bus_machdep.c trunk/sys/x86/x86/busdma_bounce.c trunk/sys/x86/x86/busdma_machdep.c trunk/sys/x86/x86/dump_machdep.c trunk/sys/x86/x86/fdt_machdep.c trunk/sys/x86/x86/identcpu.c trunk/sys/x86/x86/intr_machdep.c trunk/sys/x86/x86/io_apic.c trunk/sys/x86/x86/legacy.c trunk/sys/x86/x86/local_apic.c trunk/sys/x86/x86/mca.c trunk/sys/x86/x86/mptable.c trunk/sys/x86/x86/mptable_pci.c trunk/sys/x86/x86/msi.c trunk/sys/x86/x86/nexus.c trunk/sys/x86/x86/tsc.c trunk/sys/x86/xen/hvm.c trunk/sys/x86/xen/xen_intr.c Added Paths: ----------- trunk/sys/x86/iommu/intel_intrmap.c trunk/sys/x86/iommu/iommu_intrmap.h trunk/sys/x86/x86/autoconf.c trunk/sys/x86/x86/cpu_machdep.c trunk/sys/x86/x86/delay.c trunk/sys/x86/x86/mp_watchdog.c trunk/sys/x86/x86/mp_x86.c trunk/sys/x86/x86/pvclock.c trunk/sys/x86/x86/stack_machdep.c trunk/sys/x86/x86/ucode.c trunk/sys/x86/x86/x86_mem.c trunk/sys/x86/xen/pv.c trunk/sys/x86/xen/pvcpu_enum.c trunk/sys/x86/xen/xen_apic.c trunk/sys/x86/xen/xen_msi.c trunk/sys/x86/xen/xen_nexus.c trunk/sys/x86/xen/xen_pci_bus.c trunk/sys/x86/xen/xenpv.c Modified: trunk/sys/x86/iommu/busdma_dmar.c =================================================================== --- trunk/sys/x86/iommu/busdma_dmar.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/busdma_dmar.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.c 284021 2015-06-05 08:36:25Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.c 316392 2017-04-02 07:11:15Z kib $"); #include <sys/param.h> #include <sys/systm.h> @@ -48,6 +48,7 @@ #include <sys/taskqueue.h> 
#include <sys/tree.h> #include <sys/uio.h> +#include <sys/vmem.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #include <vm/vm.h> @@ -74,14 +75,34 @@ dmar_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func) { char str[128], *env; + int default_bounce; + bool ret; + static const char bounce_str[] = "bounce"; + static const char dmar_str[] = "dmar"; - snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d.bounce", + default_bounce = 0; + env = kern_getenv("hw.busdma.default"); + if (env != NULL) { + if (strcmp(env, bounce_str) == 0) + default_bounce = 1; + else if (strcmp(env, dmar_str) == 0) + default_bounce = 0; + freeenv(env); + } + + snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d", domain, bus, slot, func); - env = getenv(str); + env = kern_getenv(str); if (env == NULL) - return (false); + return (default_bounce != 0); + if (strcmp(env, bounce_str) == 0) + ret = true; + else if (strcmp(env, dmar_str) == 0) + ret = false; + else + ret = default_bounce != 0; freeenv(env); - return (true); + return (ret); } /* @@ -93,7 +114,7 @@ * domain, and must collectively be assigned to use either DMAR or * bounce mapping. */ -static device_t +device_t dmar_get_requester(device_t dev, uint16_t *rid) { devclass_t pci_class; @@ -225,7 +246,7 @@ disabled = dmar_bus_dma_is_dev_disabled(pci_get_domain(requester), pci_get_bus(requester), pci_get_slot(requester), pci_get_function(requester)); - ctx = dmar_get_ctx(dmar, requester, rid, disabled, rmrr); + ctx = dmar_get_ctx_for_dev(dmar, requester, rid, disabled, rmrr); if (ctx == NULL) return (NULL); if (disabled) { @@ -256,6 +277,8 @@ /* Not in scope of any DMAR ? */ if (dmar == NULL) return (NULL); + if (!dmar->dma_enabled) + return (NULL); dmar_quirks_pre_use(dmar); dmar_instantiate_rmrr_ctxs(dmar); @@ -369,16 +392,18 @@ { struct bus_dma_tag_dmar *tag; struct bus_dmamap_dmar *map; + struct dmar_domain *domain; tag = (struct bus_dma_tag_dmar *)dmat; map = (struct bus_dmamap_dmar *)map1; if (map != NULL) { - DMAR_CTX_LOCK(tag->ctx); + domain = tag->ctx->domain; + DMAR_DOMAIN_LOCK(domain); if (!TAILQ_EMPTY(&map->map_entries)) { - DMAR_CTX_UNLOCK(tag->ctx); + DMAR_DOMAIN_UNLOCK(domain); return (EBUSY); } - DMAR_CTX_UNLOCK(tag->ctx); + DMAR_DOMAIN_UNLOCK(domain); free(map, M_DMAR_DMAMAP); } tag->map_count--; @@ -455,6 +480,7 @@ struct dmar_map_entries_tailq *unroll_list) { struct dmar_ctx *ctx; + struct dmar_domain *domain; struct dmar_map_entry *entry; dmar_gaddr_t size; bus_size_t buflen1; @@ -464,6 +490,7 @@ if (segs == NULL) segs = tag->segments; ctx = tag->ctx; + domain = ctx->domain; seg = *segp; error = 0; idx = 0; @@ -485,7 +512,7 @@ if (seg + 1 < tag->common.nsegments) gas_flags |= DMAR_GM_CANSPLIT; - error = dmar_gas_map(ctx, &tag->common, size, offset, + error = dmar_gas_map(domain, &tag->common, size, offset, DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE, gas_flags, ma + idx, &entry); if (error != 0) @@ -532,10 +559,10 @@ (uintmax_t)entry->start, (uintmax_t)entry->end, (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz)); - DMAR_CTX_LOCK(ctx); + DMAR_DOMAIN_LOCK(domain); TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link); entry->flags |= DMAR_MAP_ENTRY_MAP; - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link); segs[seg].ds_addr = entry->start + offset; @@ -557,11 +584,13 @@ int flags, bus_dma_segment_t *segs, int *segp) { struct dmar_ctx *ctx; + struct dmar_domain *domain; struct dmar_map_entry *entry, *entry1; struct dmar_map_entries_tailq unroll_list; int error; ctx 
= tag->ctx; + domain = ctx->domain; atomic_add_long(&ctx->loads, 1); TAILQ_INIT(&unroll_list); @@ -573,7 +602,7 @@ * partial buffer load, so unfortunately we have to * revert all work done. */ - DMAR_CTX_LOCK(ctx); + DMAR_DOMAIN_LOCK(domain); TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link, entry1) { /* @@ -584,12 +613,12 @@ */ TAILQ_REMOVE(&map->map_entries, entry, dmamap_link); TAILQ_REMOVE(&unroll_list, entry, unroll_link); - TAILQ_INSERT_TAIL(&ctx->unload_entries, entry, + TAILQ_INSERT_TAIL(&domain->unload_entries, entry, dmamap_link); } - DMAR_CTX_UNLOCK(ctx); - taskqueue_enqueue(ctx->dmar->delayed_taskqueue, - &ctx->unload_task); + DMAR_DOMAIN_UNLOCK(domain); + taskqueue_enqueue(domain->dmar->delayed_taskqueue, + &domain->unload_task); } if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 && @@ -596,7 +625,7 @@ !map->cansleep) error = EINPROGRESS; if (error == EINPROGRESS) - dmar_bus_schedule_dmamap(ctx->dmar, map); + dmar_bus_schedule_dmamap(domain->dmar, map); return (error); } @@ -762,6 +791,7 @@ struct bus_dma_tag_dmar *tag; struct bus_dmamap_dmar *map; struct dmar_ctx *ctx; + struct dmar_domain *domain; #if defined(__amd64__) struct dmar_map_entries_tailq entries; #endif @@ -769,20 +799,22 @@ tag = (struct bus_dma_tag_dmar *)dmat; map = (struct bus_dmamap_dmar *)map1; ctx = tag->ctx; + domain = ctx->domain; atomic_add_long(&ctx->unloads, 1); #if defined(__i386__) - DMAR_CTX_LOCK(ctx); - TAILQ_CONCAT(&ctx->unload_entries, &map->map_entries, dmamap_link); - DMAR_CTX_UNLOCK(ctx); - taskqueue_enqueue(ctx->dmar->delayed_taskqueue, &ctx->unload_task); + DMAR_DOMAIN_LOCK(domain); + TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link); + DMAR_DOMAIN_UNLOCK(domain); + taskqueue_enqueue(domain->dmar->delayed_taskqueue, + &domain->unload_task); #else /* defined(__amd64__) */ TAILQ_INIT(&entries); - DMAR_CTX_LOCK(ctx); + DMAR_DOMAIN_LOCK(domain); TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link); - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); THREAD_NO_SLEEPING(); - dmar_ctx_unload(ctx, &entries, false); + dmar_domain_unload(domain, &entries, false); THREAD_SLEEPING_OK(); KASSERT(TAILQ_EMPTY(&entries), ("lazy dmar_ctx_unload %p", ctx)); #endif @@ -855,6 +887,8 @@ dmar_init_busdma(struct dmar_unit *unit) { + unit->dma_enabled = 1; + TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled); TAILQ_INIT(&unit->delayed_maps); TASK_INIT(&unit->dmamap_load_task, 0, dmar_bus_task_dmamap, unit); unit->delayed_taskqueue = taskqueue_create("dmar", M_WAITOK, Modified: trunk/sys/x86/iommu/busdma_dmar.h =================================================================== --- trunk/sys/x86/iommu/busdma_dmar.h 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/busdma_dmar.h 2020-02-08 19:32:41 UTC (rev 12310) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $ + * $FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $ */ #ifndef __X86_IOMMU_BUSDMA_DMAR_H Modified: trunk/sys/x86/iommu/intel_ctx.c =================================================================== --- trunk/sys/x86/iommu/intel_ctx.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_ctx.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_ctx.c 279485 2015-03-01 10:35:54Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_ctx.c 320357 2017-06-26 12:30:39Z kib $"); #include <sys/param.h> #include <sys/systm.h> @@ -49,6 +49,7 @@ #include <sys/taskqueue.h> #include <sys/tree.h> #include <sys/uio.h> +#include <sys/vmem.h> #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_kern.h> @@ -68,8 +69,12 @@ #include <dev/pci/pcivar.h> static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context"); +static MALLOC_DEFINE(M_DMAR_DOMAIN, "dmar_dom", "Intel DMAR Domain"); -static void dmar_ctx_unload_task(void *arg, int pending); +static void dmar_domain_unload_task(void *arg, int pending); +static void dmar_unref_domain_locked(struct dmar_unit *dmar, + struct dmar_domain *domain); +static void dmar_domain_destroy(struct dmar_domain *domain); static void dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus) @@ -108,8 +113,8 @@ { dmar_ctx_entry_t *ctxp; - ctxp = dmar_map_pgtbl(ctx->dmar->ctx_obj, 1 + PCI_RID2BUS(ctx->rid), - DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp); + ctxp = dmar_map_pgtbl(ctx->domain->dmar->ctx_obj, 1 + + PCI_RID2BUS(ctx->rid), DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp); ctxp += ctx->rid & 0xff; return (ctxp); } @@ -119,7 +124,7 @@ { bus_addr_t maxaddr; - maxaddr = MIN(ctx->end, BUS_SPACE_MAXADDR); + maxaddr = MIN(ctx->domain->end, BUS_SPACE_MAXADDR); ctx->ctx_tag.common.ref_count = 1; /* Prevent free */ ctx->ctx_tag.common.impl = &bus_dma_dmar_impl; ctx->ctx_tag.common.boundary = PCI_DMA_BOUNDARY; @@ -130,33 +135,42 @@ ctx->ctx_tag.common.maxsegsz = maxaddr; ctx->ctx_tag.ctx = ctx; ctx->ctx_tag.owner = dev; - /* XXXKIB initialize tag further */ } static void -ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp) +ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp, bool move) { struct dmar_unit *unit; + struct dmar_domain *domain; vm_page_t ctx_root; - unit = ctx->dmar; - KASSERT(ctxp->ctx1 == 0 && ctxp->ctx2 == 0, + domain = ctx->domain; + unit = domain->dmar; + KASSERT(move || (ctxp->ctx1 == 0 && ctxp->ctx2 == 0), ("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx", unit->unit, pci_get_bus(ctx->ctx_tag.owner), pci_get_slot(ctx->ctx_tag.owner), pci_get_function(ctx->ctx_tag.owner), - ctxp->ctx1, - ctxp->ctx2)); - ctxp->ctx2 = DMAR_CTX2_DID(ctx->domain); - ctxp->ctx2 |= ctx->awlvl; - if ((ctx->flags & DMAR_CTX_IDMAP) != 0 && + ctxp->ctx1, ctxp->ctx2)); + /* + * For update due to move, the store is not atomic. It is + * possible that DMAR read upper doubleword, while low + * doubleword is not yet updated. The domain id is stored in + * the upper doubleword, while the table pointer in the lower. + * + * There is no good solution, for the same reason it is wrong + * to clear P bit in the ctx entry for update. 
+ */ + dmar_pte_store1(&ctxp->ctx2, DMAR_CTX2_DID(domain->domain) | + domain->awlvl); + if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0 && (unit->hw_ecap & DMAR_ECAP_PT) != 0) { - KASSERT(ctx->pgtbl_obj == NULL, + KASSERT(domain->pgtbl_obj == NULL, ("ctx %p non-null pgtbl_obj", ctx)); - dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P); + dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P); } else { - ctx_root = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_NOALLOC); - dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_UNTR | + ctx_root = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_NOALLOC); + dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_UNTR | (DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) | DMAR_CTX1_P); } @@ -164,8 +178,32 @@ } static int -ctx_init_rmrr(struct dmar_ctx *ctx, device_t dev) +dmar_flush_for_ctx_entry(struct dmar_unit *dmar, bool force) { + int error; + + /* + * If dmar declares Caching Mode as Set, follow 11.5 "Caching + * Mode Consideration" and do the (global) invalidation of the + * negative TLB entries. + */ + if ((dmar->hw_cap & DMAR_CAP_CM) == 0 && !force) + return (0); + if (dmar->qi_enabled) { + dmar_qi_invalidate_ctx_glob_locked(dmar); + if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force) + dmar_qi_invalidate_iotlb_glob_locked(dmar); + return (0); + } + error = dmar_inv_ctx_glob(dmar); + if (error == 0 && ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force)) + error = dmar_inv_iotlb_glob(dmar); + return (error); +} + +static int +domain_init_rmrr(struct dmar_domain *domain, device_t dev) +{ struct dmar_map_entries_tailq rmrr_entries; struct dmar_map_entry *entry, *entry1; vm_page_t *ma; @@ -175,7 +213,7 @@ error = 0; TAILQ_INIT(&rmrr_entries); - dmar_ctx_parse_rmrr(ctx, dev, &rmrr_entries); + dmar_dev_parse_rmrr(domain, dev, &rmrr_entries); TAILQ_FOREACH_SAFE(entry, &rmrr_entries, unroll_link, entry1) { /* * VT-d specification requires that the start of an @@ -195,7 +233,7 @@ if (bootverbose) { device_printf(dev, "BIOS bug: dmar%d RMRR " "region (%jx, %jx) corrected\n", - ctx->dmar->unit, start, end); + domain->dmar->unit, start, end); } entry->end += DMAR_PAGE_SIZE * 0x20; } @@ -205,8 +243,9 @@ ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i, VM_MEMATTR_DEFAULT); } - error1 = dmar_gas_map_region(ctx, entry, DMAR_MAP_ENTRY_READ | - DMAR_MAP_ENTRY_WRITE, DMAR_GM_CANWAIT, ma); + error1 = dmar_gas_map_region(domain, entry, + DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE, + DMAR_GM_CANWAIT, ma); /* * Non-failed RMRR entries are owned by context rb * tree. Get rid of the failed entry, but do not stop @@ -214,18 +253,19 @@ * loaded and removed on the context destruction. 
*/ if (error1 == 0 && entry->end != entry->start) { - DMAR_LOCK(ctx->dmar); - ctx->flags |= DMAR_CTX_RMRR; - DMAR_UNLOCK(ctx->dmar); + DMAR_LOCK(domain->dmar); + domain->refs++; /* XXXKIB prevent free */ + domain->flags |= DMAR_DOMAIN_RMRR; + DMAR_UNLOCK(domain->dmar); } else { if (error1 != 0) { device_printf(dev, "dmar%d failed to map RMRR region (%jx, %jx) %d\n", - ctx->dmar->unit, start, end, error1); + domain->dmar->unit, start, end, error1); error = error1; } TAILQ_REMOVE(&rmrr_entries, entry, unroll_link); - dmar_gas_free_entry(ctx, entry); + dmar_gas_free_entry(domain, entry); } for (i = 0; i < size; i++) vm_page_putfake(ma[i]); @@ -234,47 +274,144 @@ return (error); } +static struct dmar_domain * +dmar_domain_alloc(struct dmar_unit *dmar, bool id_mapped) +{ + struct dmar_domain *domain; + int error, id, mgaw; + + id = alloc_unr(dmar->domids); + if (id == -1) + return (NULL); + domain = malloc(sizeof(*domain), M_DMAR_DOMAIN, M_WAITOK | M_ZERO); + domain->domain = id; + LIST_INIT(&domain->contexts); + RB_INIT(&domain->rb_root); + TAILQ_INIT(&domain->unload_entries); + TASK_INIT(&domain->unload_task, 0, dmar_domain_unload_task, domain); + mtx_init(&domain->lock, "dmardom", NULL, MTX_DEF); + domain->dmar = dmar; + + /* + * For now, use the maximal usable physical address of the + * installed memory to calculate the mgaw on id_mapped domain. + * It is useful for the identity mapping, and less so for the + * virtualized bus address space. + */ + domain->end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR; + mgaw = dmar_maxaddr2mgaw(dmar, domain->end, !id_mapped); + error = domain_set_agaw(domain, mgaw); + if (error != 0) + goto fail; + if (!id_mapped) + /* Use all supported address space for remapping. */ + domain->end = 1ULL << (domain->agaw - 1); + + dmar_gas_init_domain(domain); + + if (id_mapped) { + if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) { + domain->pgtbl_obj = domain_get_idmap_pgtbl(domain, + domain->end); + } + domain->flags |= DMAR_DOMAIN_IDMAP; + } else { + error = domain_alloc_pgtbl(domain); + if (error != 0) + goto fail; + /* Disable local apic region access */ + error = dmar_gas_reserve_region(domain, 0xfee00000, + 0xfeefffff + 1); + if (error != 0) + goto fail; + } + return (domain); + +fail: + dmar_domain_destroy(domain); + return (NULL); +} + static struct dmar_ctx * -dmar_get_ctx_alloc(struct dmar_unit *dmar, uint16_t rid) +dmar_ctx_alloc(struct dmar_domain *domain, uint16_t rid) { struct dmar_ctx *ctx; ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO); - RB_INIT(&ctx->rb_root); - TAILQ_INIT(&ctx->unload_entries); - TASK_INIT(&ctx->unload_task, 0, dmar_ctx_unload_task, ctx); - mtx_init(&ctx->lock, "dmarctx", NULL, MTX_DEF); - ctx->dmar = dmar; + ctx->domain = domain; ctx->rid = rid; + ctx->refs = 1; return (ctx); } static void -dmar_ctx_dtr(struct dmar_ctx *ctx, bool gas_inited, bool pgtbl_inited) +dmar_ctx_link(struct dmar_ctx *ctx) { + struct dmar_domain *domain; - if (gas_inited) { - DMAR_CTX_LOCK(ctx); - dmar_gas_fini_ctx(ctx); - DMAR_CTX_UNLOCK(ctx); + domain = ctx->domain; + DMAR_ASSERT_LOCKED(domain->dmar); + KASSERT(domain->refs >= domain->ctx_cnt, + ("dom %p ref underflow %d %d", domain, domain->refs, + domain->ctx_cnt)); + domain->refs++; + domain->ctx_cnt++; + LIST_INSERT_HEAD(&domain->contexts, ctx, link); +} + +static void +dmar_ctx_unlink(struct dmar_ctx *ctx) +{ + struct dmar_domain *domain; + + domain = ctx->domain; + DMAR_ASSERT_LOCKED(domain->dmar); + KASSERT(domain->refs > 0, + ("domain %p ctx dtr refs %d", domain, domain->refs)); + 
KASSERT(domain->ctx_cnt >= domain->refs, + ("domain %p ctx dtr refs %d ctx_cnt %d", domain, + domain->refs, domain->ctx_cnt)); + domain->refs--; + domain->ctx_cnt--; + LIST_REMOVE(ctx, link); +} + +static void +dmar_domain_destroy(struct dmar_domain *domain) +{ + + KASSERT(TAILQ_EMPTY(&domain->unload_entries), + ("unfinished unloads %p", domain)); + KASSERT(LIST_EMPTY(&domain->contexts), + ("destroying dom %p with contexts", domain)); + KASSERT(domain->ctx_cnt == 0, + ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt)); + KASSERT(domain->refs == 0, + ("destroying dom %p with refs %d", domain, domain->refs)); + if ((domain->flags & DMAR_DOMAIN_GAS_INITED) != 0) { + DMAR_DOMAIN_LOCK(domain); + dmar_gas_fini_domain(domain); + DMAR_DOMAIN_UNLOCK(domain); } - if (pgtbl_inited) { - if (ctx->pgtbl_obj != NULL) - DMAR_CTX_PGLOCK(ctx); - ctx_free_pgtbl(ctx); + if ((domain->flags & DMAR_DOMAIN_PGTBL_INITED) != 0) { + if (domain->pgtbl_obj != NULL) + DMAR_DOMAIN_PGLOCK(domain); + domain_free_pgtbl(domain); } - mtx_destroy(&ctx->lock); - free(ctx, M_DMAR_CTX); + mtx_destroy(&domain->lock); + free_unr(domain->dmar->domids, domain->domain); + free(domain, M_DMAR_DOMAIN); } struct dmar_ctx * -dmar_get_ctx(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped, - bool rmrr_init) +dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid, + bool id_mapped, bool rmrr_init) { + struct dmar_domain *domain, *domain1; struct dmar_ctx *ctx, *ctx1; dmar_ctx_entry_t *ctxp; struct sf_buf *sf; - int bus, slot, func, error, mgaw; + int bus, slot, func, error; bool enable; bus = pci_get_bus(dev); @@ -292,67 +429,20 @@ */ DMAR_UNLOCK(dmar); dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid)); - ctx1 = dmar_get_ctx_alloc(dmar, rid); - - if (id_mapped) { - /* - * For now, use the maximal usable physical - * address of the installed memory to - * calculate the mgaw. It is useful for the - * identity mapping, and less so for the - * virtualized bus address space. - */ - ctx1->end = ptoa(Maxmem); - mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, false); - error = ctx_set_agaw(ctx1, mgaw); - if (error != 0) { - dmar_ctx_dtr(ctx1, false, false); - TD_PINNED_ASSERT; - return (NULL); - } - } else { - ctx1->end = BUS_SPACE_MAXADDR; - mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, true); - error = ctx_set_agaw(ctx1, mgaw); - if (error != 0) { - dmar_ctx_dtr(ctx1, false, false); - TD_PINNED_ASSERT; - return (NULL); - } - /* Use all supported address space for remapping. 
*/ - ctx1->end = 1ULL << (ctx1->agaw - 1); + domain1 = dmar_domain_alloc(dmar, id_mapped); + if (domain1 == NULL) { + TD_PINNED_ASSERT; + return (NULL); } - - - dmar_gas_init_ctx(ctx1); - if (id_mapped) { - if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) { - ctx1->pgtbl_obj = ctx_get_idmap_pgtbl(ctx1, - ctx1->end); - } - ctx1->flags |= DMAR_CTX_IDMAP; - } else { - error = ctx_alloc_pgtbl(ctx1); + if (!id_mapped) { + error = domain_init_rmrr(domain1, dev); if (error != 0) { - dmar_ctx_dtr(ctx1, true, false); + dmar_domain_destroy(domain1); TD_PINNED_ASSERT; return (NULL); } - /* Disable local apic region access */ - error = dmar_gas_reserve_region(ctx1, 0xfee00000, - 0xfeefffff + 1); - if (error != 0) { - dmar_ctx_dtr(ctx1, true, true); - TD_PINNED_ASSERT; - return (NULL); - } - error = ctx_init_rmrr(ctx1, dev); - if (error != 0) { - dmar_ctx_dtr(ctx1, true, true); - TD_PINNED_ASSERT; - return (NULL); - } } + ctx1 = dmar_ctx_alloc(domain1, rid); ctxp = dmar_map_ctx_entry(ctx1, &sf); DMAR_LOCK(dmar); @@ -362,16 +452,10 @@ */ ctx = dmar_find_ctx_locked(dmar, rid); if (ctx == NULL) { + domain = domain1; ctx = ctx1; + dmar_ctx_link(ctx); ctx->ctx_tag.owner = dev; - ctx->domain = alloc_unrl(dmar->domids); - if (ctx->domain == -1) { - DMAR_UNLOCK(dmar); - dmar_unmap_pgtbl(sf); - dmar_ctx_dtr(ctx, true, true); - TD_PINNED_ASSERT; - return (NULL); - } ctx_tag_init(ctx, dev); /* @@ -379,46 +463,35 @@ * DMAR unit. Enable the translation after * everything is set up. */ - if (LIST_EMPTY(&dmar->contexts)) + if (LIST_EMPTY(&dmar->domains)) enable = true; - LIST_INSERT_HEAD(&dmar->contexts, ctx, link); - ctx_id_entry_init(ctx, ctxp); + LIST_INSERT_HEAD(&dmar->domains, domain, link); + ctx_id_entry_init(ctx, ctxp, false); device_printf(dev, "dmar%d pci%d:%d:%d:%d rid %x domain %d mgaw %d " "agaw %d %s-mapped\n", dmar->unit, dmar->segment, bus, slot, - func, rid, ctx->domain, ctx->mgaw, ctx->agaw, - id_mapped ? "id" : "re"); + func, rid, domain->domain, domain->mgaw, + domain->agaw, id_mapped ? "id" : "re"); + dmar_unmap_pgtbl(sf); } else { - dmar_ctx_dtr(ctx1, true, true); + dmar_unmap_pgtbl(sf); + dmar_domain_destroy(domain1); + /* Nothing needs to be done to destroy ctx1. */ + free(ctx1, M_DMAR_CTX); + domain = ctx->domain; + ctx->refs++; /* tag referenced us */ } - dmar_unmap_pgtbl(sf); + } else { + domain = ctx->domain; + ctx->refs++; /* tag referenced us */ } - ctx->refs++; - if ((ctx->flags & DMAR_CTX_RMRR) != 0) - ctx->refs++; /* XXXKIB */ - /* - * If dmar declares Caching Mode as Set, follow 11.5 "Caching - * Mode Consideration" and do the (global) invalidation of the - * negative TLB entries. 
- */ - if ((dmar->hw_cap & DMAR_CAP_CM) != 0 || enable) { - if (dmar->qi_enabled) { - dmar_qi_invalidate_ctx_glob_locked(dmar); - if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0) - dmar_qi_invalidate_iotlb_glob_locked(dmar); - } else { - error = dmar_inv_ctx_glob(dmar); - if (error == 0 && - (dmar->hw_ecap & DMAR_ECAP_DI) != 0) - error = dmar_inv_iotlb_glob(dmar); - if (error != 0) { - dmar_free_ctx_locked(dmar, ctx); - TD_PINNED_ASSERT; - return (NULL); - } - } + error = dmar_flush_for_ctx_entry(dmar, enable); + if (error != 0) { + dmar_free_ctx_locked(dmar, ctx); + TD_PINNED_ASSERT; + return (NULL); } /* @@ -439,11 +512,74 @@ return (ctx); } +int +dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx) +{ + struct dmar_unit *dmar; + struct dmar_domain *old_domain; + dmar_ctx_entry_t *ctxp; + struct sf_buf *sf; + int error; + + dmar = domain->dmar; + old_domain = ctx->domain; + if (domain == old_domain) + return (0); + KASSERT(old_domain->dmar == dmar, + ("domain %p %u moving between dmars %u %u", domain, + domain->domain, old_domain->dmar->unit, domain->dmar->unit)); + TD_PREP_PINNED_ASSERT; + + ctxp = dmar_map_ctx_entry(ctx, &sf); + DMAR_LOCK(dmar); + dmar_ctx_unlink(ctx); + ctx->domain = domain; + dmar_ctx_link(ctx); + ctx_id_entry_init(ctx, ctxp, true); + dmar_unmap_pgtbl(sf); + error = dmar_flush_for_ctx_entry(dmar, true); + /* If flush failed, rolling back would not work as well. */ + printf("dmar%d rid %x domain %d->%d %s-mapped\n", + dmar->unit, ctx->rid, old_domain->domain, domain->domain, + (domain->flags & DMAR_DOMAIN_IDMAP) != 0 ? "id" : "re"); + dmar_unref_domain_locked(dmar, old_domain); + TD_PINNED_ASSERT; + return (error); +} + +static void +dmar_unref_domain_locked(struct dmar_unit *dmar, struct dmar_domain *domain) +{ + + DMAR_ASSERT_LOCKED(dmar); + KASSERT(domain->refs >= 1, + ("dmar %d domain %p refs %u", dmar->unit, domain, domain->refs)); + KASSERT(domain->refs > domain->ctx_cnt, + ("dmar %d domain %p refs %d ctx_cnt %d", dmar->unit, domain, + domain->refs, domain->ctx_cnt)); + + if (domain->refs > 1) { + domain->refs--; + DMAR_UNLOCK(dmar); + return; + } + + KASSERT((domain->flags & DMAR_DOMAIN_RMRR) == 0, + ("lost ref on RMRR domain %p", domain)); + + LIST_REMOVE(domain, link); + DMAR_UNLOCK(dmar); + + taskqueue_drain(dmar->delayed_taskqueue, &domain->unload_task); + dmar_domain_destroy(domain); +} + void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx) { struct sf_buf *sf; dmar_ctx_entry_t *ctxp; + struct dmar_domain *domain; DMAR_ASSERT_LOCKED(dmar); KASSERT(ctx->refs >= 1, @@ -459,8 +595,6 @@ return; } - KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0, - ("lost ref on RMRR ctx %p", ctx)); KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0, ("lost ref on disabled ctx %p", ctx)); @@ -488,8 +622,6 @@ return; } - KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0, - ("lost ref on RMRR ctx %p", ctx)); KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0, ("lost ref on disabled ctx %p", ctx)); @@ -507,19 +639,11 @@ else dmar_inv_iotlb_glob(dmar); } - LIST_REMOVE(ctx, link); - DMAR_UNLOCK(dmar); - - /* - * The rest of the destruction is invisible for other users of - * the dmar unit. 
- */ - taskqueue_drain(dmar->delayed_taskqueue, &ctx->unload_task); - KASSERT(TAILQ_EMPTY(&ctx->unload_entries), - ("unfinished unloads %p", ctx)); dmar_unmap_pgtbl(sf); - free_unr(dmar->domids, ctx->domain); - dmar_ctx_dtr(ctx, true, true); + domain = ctx->domain; + dmar_ctx_unlink(ctx); + free(ctx, M_DMAR_CTX); + dmar_unref_domain_locked(dmar, domain); TD_PINNED_ASSERT; } @@ -528,86 +652,101 @@ { struct dmar_unit *dmar; - dmar = ctx->dmar; + dmar = ctx->domain->dmar; DMAR_LOCK(dmar); dmar_free_ctx_locked(dmar, ctx); } +/* + * Returns with the domain locked. + */ struct dmar_ctx * dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid) { + struct dmar_domain *domain; struct dmar_ctx *ctx; DMAR_ASSERT_LOCKED(dmar); - LIST_FOREACH(ctx, &dmar->contexts, link) { - if (ctx->rid == rid) - return (ctx); + LIST_FOREACH(domain, &dmar->domains, link) { + LIST_FOREACH(ctx, &domain->contexts, link) { + if (ctx->rid == rid) + return (ctx); + } } return (NULL); } void -dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free) +dmar_domain_free_entry(struct dmar_map_entry *entry, bool free) { - struct dmar_ctx *ctx; + struct dmar_domain *domain; - ctx = entry->ctx; - DMAR_CTX_LOCK(ctx); + domain = entry->domain; + DMAR_DOMAIN_LOCK(domain); if ((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0) - dmar_gas_free_region(ctx, entry); + dmar_gas_free_region(domain, entry); else - dmar_gas_free_space(ctx, entry); - DMAR_CTX_UNLOCK(ctx); + dmar_gas_free_space(domain, entry); + DMAR_DOMAIN_UNLOCK(domain); if (free) - dmar_gas_free_entry(ctx, entry); + dmar_gas_free_entry(domain, entry); else entry->flags = 0; } void -dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free) +dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free) { struct dmar_unit *unit; - unit = entry->ctx->dmar; + unit = entry->domain->dmar; if (unit->qi_enabled) { DMAR_LOCK(unit); - dmar_qi_invalidate_locked(entry->ctx, entry->start, - entry->end - entry->start, &entry->gseq); + dmar_qi_invalidate_locked(entry->domain, entry->start, + entry->end - entry->start, &entry->gseq, true); if (!free) entry->flags |= DMAR_MAP_ENTRY_QI_NF; TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link); DMAR_UNLOCK(unit); } else { - ctx_flush_iotlb_sync(entry->ctx, entry->start, entry->end - - entry->start); - dmar_ctx_free_entry(entry, free); + domain_flush_iotlb_sync(entry->domain, entry->start, + entry->end - entry->start); + dmar_domain_free_entry(entry, free); } } +static bool +dmar_domain_unload_emit_wait(struct dmar_domain *domain, + struct dmar_map_entry *entry) +{ + + if (TAILQ_NEXT(entry, dmamap_link) == NULL) + return (true); + return (domain->batch_no++ % dmar_batch_coalesce == 0); +} + void -dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries, - bool cansleep) +dmar_domain_unload(struct dmar_domain *domain, + struct dmar_map_entries_tailq *entries, bool cansleep) { struct dmar_unit *unit; struct dmar_map_entry *entry, *entry1; - struct dmar_qi_genseq gseq; int error; - unit = ctx->dmar; + unit = domain->dmar; TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) { KASSERT((entry->flags & DMAR_MAP_ENTRY_MAP) != 0, - ("not mapped entry %p %p", ctx, entry)); - error = ctx_unmap_buf(ctx, entry->start, entry->end - + ("not mapped entry %p %p", domain, entry)); + error = domain_unmap_buf(domain, entry->start, entry->end - entry->start, cansleep ? 
DMAR_PGF_WAITOK : 0); - KASSERT(error == 0, ("unmap %p error %d", ctx, error)); + KASSERT(error == 0, ("unmap %p error %d", domain, error)); if (!unit->qi_enabled) { - ctx_flush_iotlb_sync(ctx, entry->start, + domain_flush_iotlb_sync(domain, entry->start, entry->end - entry->start); TAILQ_REMOVE(entries, entry, dmamap_link); - dmar_ctx_free_entry(entry, true); + dmar_domain_free_entry(entry, true); } } if (TAILQ_EMPTY(entries)) @@ -616,36 +755,30 @@ KASSERT(unit->qi_enabled, ("loaded entry left")); DMAR_LOCK(unit); TAILQ_FOREACH(entry, entries, dmamap_link) { - entry->gseq.gen = 0; - entry->gseq.seq = 0; - dmar_qi_invalidate_locked(ctx, entry->start, entry->end - - entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ? - &gseq : NULL); + dmar_qi_invalidate_locked(domain, entry->start, entry->end - + entry->start, &entry->gseq, + dmar_domain_unload_emit_wait(domain, entry)); } - TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) { - entry->gseq = gseq; - TAILQ_REMOVE(entries, entry, dmamap_link); - TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link); - } + TAILQ_CONCAT(&unit->tlb_flush_entries, entries, dmamap_link); DMAR_UNLOCK(unit); } static void -dmar_ctx_unload_task(void *arg, int pending) +dmar_domain_unload_task(void *arg, int pending) { - struct dmar_ctx *ctx; + struct dmar_domain *domain; struct dmar_map_entries_tailq entries; - ctx = arg; + domain = arg; TAILQ_INIT(&entries); for (;;) { - DMAR_CTX_LOCK(ctx); - TAILQ_SWAP(&ctx->unload_entries, &entries, dmar_map_entry, + DMAR_DOMAIN_LOCK(domain); + TAILQ_SWAP(&domain->unload_entries, &entries, dmar_map_entry, dmamap_link); - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); if (TAILQ_EMPTY(&entries)) break; - dmar_ctx_unload(ctx, &entries, true); + dmar_domain_unload(domain, &entries, true); } } Modified: trunk/sys/x86/iommu/intel_dmar.h =================================================================== --- trunk/sys/x86/iommu/intel_dmar.h 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_dmar.h 2020-02-08 19:32:41 UTC (rev 12310) @@ -1,6 +1,6 @@ /* $MidnightBSD$ */ /*- - * Copyright (c) 2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov <kib at FreeBSD.org> @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/iommu/intel_dmar.h 281545 2015-04-15 06:56:51Z kib $ + * $FreeBSD: stable/11/sys/x86/iommu/intel_dmar.h 320357 2017-06-26 12:30:39Z kib $ */ #ifndef __X86_IOMMU_INTEL_DMAR_H @@ -51,10 +51,10 @@ current R/B tree node */ u_int flags; TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */ - RB_ENTRY(dmar_map_entry) rb_entry; /* Links for ctx entries */ + RB_ENTRY(dmar_map_entry) rb_entry; /* Links for domain entries */ TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after dmamap_load failure */ - struct dmar_ctx *ctx; + struct dmar_domain *domain; struct dmar_qi_genseq gseq; }; @@ -74,51 +74,85 @@ #define DMAR_MAP_ENTRY_SNOOP 0x4000 /* Snoop */ #define DMAR_MAP_ENTRY_TM 0x8000 /* Transient */ +/* + * Locking annotations: + * (u) - Protected by dmar unit lock + * (d) - Protected by domain lock + * (c) - Immutable after initialization + */ + +/* + * The domain abstraction. Most non-constant members of the domain + * are protected by owning dmar unit lock, not by the domain lock. + * Most important, the dmar lock protects the contexts list. 
+ * + * The domain lock protects the address map for the domain, and list + * of unload entries delayed. + * + * Page tables pages and pages content is protected by the vm object + * lock pgtbl_obj, which contains the page tables pages. + */ +struct dmar_domain { + int domain; /* (c) DID, written in context entry */ + int mgaw; /* (c) Real max address width */ + int agaw; /* (c) Adjusted guest address width */ + int pglvl; /* (c) The pagelevel */ + int awlvl; /* (c) The pagelevel as the bitmask, + to set in context entry */ + dmar_gaddr_t end; /* (c) Highest address + 1 in + the guest AS */ + u_int ctx_cnt; /* (u) Number of contexts owned */ + u_int refs; /* (u) Refs, including ctx */ + struct dmar_unit *dmar; /* (c) */ + struct mtx lock; /* (c) */ + LIST_ENTRY(dmar_domain) link; /* (u) Member in the dmar list */ + LIST_HEAD(, dmar_ctx) contexts; /* (u) */ + vm_object_t pgtbl_obj; /* (c) Page table pages */ + u_int flags; /* (u) */ + u_int entries_cnt; /* (d) */ + struct dmar_gas_entries_tree rb_root; /* (d) */ + struct dmar_map_entries_tailq unload_entries; /* (d) Entries to + unload */ + struct dmar_map_entry *first_place, *last_place; /* (d) */ + struct task unload_task; /* (c) */ + u_int batch_no; +}; + struct dmar_ctx { - uint16_t rid; /* pci RID */ - int domain; /* DID */ - int mgaw; /* Real max address width */ - int agaw; /* Adjusted guest address width */ - int pglvl; /* The pagelevel */ - int awlvl; /* The pagelevel as the bitmask, to set in - context entry */ - dmar_gaddr_t end;/* Highest address + 1 in the guest AS */ - u_int refs; /* References to the context, from tags */ - struct dmar_unit *dmar; - struct bus_dma_tag_dmar ctx_tag; /* Root tag */ - struct mtx lock; - LIST_ENTRY(dmar_ctx) link; /* Member in the dmar list */ - vm_object_t pgtbl_obj; /* Page table pages */ - u_int flags; /* Protected by dmar lock */ + struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */ + uint16_t rid; /* (c) pci RID */ uint64_t last_fault_rec[2]; /* Last fault reported */ - u_int entries_cnt; - u_long loads; - u_long unloads; - struct dmar_gas_entries_tree rb_root; - struct dmar_map_entries_tailq unload_entries; /* Entries to unload */ - struct dmar_map_entry *first_place, *last_place; - struct task unload_task; + struct dmar_domain *domain; /* (c) */ + LIST_ENTRY(dmar_ctx) link; /* (u) Member in the domain list */ + u_int refs; /* (u) References from tags */ + u_int flags; /* (u) */ + u_long loads; /* atomic updates, for stat only */ + u_long unloads; /* same */ }; +#define DMAR_DOMAIN_GAS_INITED 0x0001 +#define DMAR_DOMAIN_PGTBL_INITED 0x0002 +#define DMAR_DOMAIN_IDMAP 0x0010 /* Domain uses identity + page table */ +#define DMAR_DOMAIN_RMRR 0x0020 /* Domain contains RMRR entry, + cannot be turned off */ + /* struct dmar_ctx flags */ #define DMAR_CTX_FAULTED 0x0001 /* Fault was reported, last_fault_rec is valid */ -#define DMAR_CTX_IDMAP 0x0002 /* Context uses identity page table */ -#define DMAR_CTX_RMRR 0x0004 /* Context contains RMRR entry, - cannot be turned off */ -#define DMAR_CTX_DISABLED 0x0008 /* Device is disabled, the +#define DMAR_CTX_DISABLED 0x0002 /* Device is disabled, the ephemeral reference is kept to prevent context destruction */ -#define DMAR_CTX_PGLOCK(ctx) VM_OBJECT_WLOCK((ctx)->pgtbl_obj) -#define DMAR_CTX_PGTRYLOCK(ctx) VM_OBJECT_TRYWLOCK((ctx)->pgtbl_obj) -#define DMAR_CTX_PGUNLOCK(ctx) VM_OBJECT_WUNLOCK((ctx)->pgtbl_obj) -#define DMAR_CTX_ASSERT_PGLOCKED(ctx) \ - VM_OBJECT_ASSERT_WLOCKED((ctx)->pgtbl_obj) +#define DMAR_DOMAIN_PGLOCK(dom) 
VM_OBJECT_WLOCK((dom)->pgtbl_obj) +#define DMAR_DOMAIN_PGTRYLOCK(dom) VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj) +#define DMAR_DOMAIN_PGUNLOCK(dom) VM_OBJECT_WUNLOCK((dom)->pgtbl_obj) +#define DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \ + VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj) -#define DMAR_CTX_LOCK(ctx) mtx_lock(&(ctx)->lock) -#define DMAR_CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->lock) -#define DMAR_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->lock, MA_OWNED) +#define DMAR_DOMAIN_LOCK(dom) mtx_lock(&(dom)->lock) +#define DMAR_DOMAIN_UNLOCK(dom) mtx_unlock(&(dom)->lock) +#define DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED) struct dmar_msi_data { int irq; @@ -158,7 +192,7 @@ /* Data for being a dmar */ struct mtx lock; - LIST_HEAD(, dmar_ctx) contexts; + LIST_HEAD(, dmar_domain) domains; struct unrhdr *domids; vm_object_t ctx_obj; u_int barrier_flags; @@ -186,6 +220,13 @@ u_int inv_seq_waiters; /* count of waiters for seq */ u_int inv_queue_full; /* informational counter */ + /* IR */ + int ir_enabled; + vm_paddr_t irt_phys; + dmar_irte_t *irt; + u_int irte_cnt; + vmem_t *irtids; + /* Delayed freeing of map entries queue processing */ struct dmar_map_entries_tailq tlb_flush_entries; struct task qi_task; @@ -195,6 +236,8 @@ struct task dmamap_load_task; TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps; struct taskqueue *delayed_taskqueue; + + int dma_enabled; }; #define DMAR_LOCK(dmar) mtx_lock(&(dmar)->lock) @@ -207,6 +250,8 @@ #define DMAR_IS_COHERENT(dmar) (((dmar)->hw_ecap & DMAR_ECAP_C) != 0) #define DMAR_HAS_QI(dmar) (((dmar)->hw_ecap & DMAR_ECAP_QI) != 0) +#define DMAR_X2APIC(dmar) \ + (x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0) /* Barrier ids */ #define DMAR_BARRIER_RMRR 0 @@ -213,16 +258,18 @@ #define DMAR_BARRIER_USEQ 1 struct dmar_unit *dmar_find(device_t dev); +struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid); +struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid); u_int dmar_nd2mask(u_int nd); bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl); -int ctx_set_agaw(struct dmar_ctx *ctx, int mgaw); -int dmar_maxaddr2mgaw(struct dmar_unit* unit, dmar_gaddr_t maxaddr, +int domain_set_agaw(struct dmar_domain *domain, int mgaw); +int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr, bool allow_less); vm_pindex_t pglvl_max_pages(int pglvl); -int ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl); +int domain_is_sp_lvl(struct dmar_domain *domain, int lvl); dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl); -dmar_gaddr_t ctx_page_size(struct dmar_ctx *ctx, int lvl); +dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl); int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size, dmar_gaddr_t *isizep); struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags); @@ -239,8 +286,13 @@ void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst); int dmar_enable_translation(struct dmar_unit *unit); int dmar_disable_translation(struct dmar_unit *unit); +int dmar_load_irt_ptr(struct dmar_unit *unit); +int dmar_enable_ir(struct dmar_unit *unit); +int dmar_disable_ir(struct dmar_unit *unit); bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id); void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id); +uint64_t dmar_get_timeout(void); +void dmar_update_timeout(uint64_t newval); int dmar_fault_intr(void *arg); void dmar_enable_fault_intr(struct dmar_unit *unit); @@ -253,52 +305,61 @@ void dmar_disable_qi_intr(struct dmar_unit *unit); int dmar_init_qi(struct dmar_unit 
*unit); void dmar_fini_qi(struct dmar_unit *unit); -void dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t start, - dmar_gaddr_t size, struct dmar_qi_genseq *pseq); +void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start, + dmar_gaddr_t size, struct dmar_qi_genseq *psec, bool emit_wait); void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit); void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit); +void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit); +void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt); -vm_object_t ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr); +vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain, + dmar_gaddr_t maxaddr); void put_idmap_pgtbl(vm_object_t obj); -int ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, - vm_page_t *ma, uint64_t pflags, int flags); -int ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, - int flags); -void ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, +int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags); +int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size, int flags); +void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size); -int ctx_alloc_pgtbl(struct dmar_ctx *ctx); -void ctx_free_pgtbl(struct dmar_ctx *ctx); +int domain_alloc_pgtbl(struct dmar_domain *domain); +void domain_free_pgtbl(struct dmar_domain *domain); struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev, bool rmrr); -struct dmar_ctx *dmar_get_ctx(struct dmar_unit *dmar, device_t dev, +struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init); +int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx); void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx); void dmar_free_ctx(struct dmar_ctx *ctx); struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid); -void dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free); -void dmar_ctx_unload(struct dmar_ctx *ctx, +void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free); +void dmar_domain_unload(struct dmar_domain *domain, struct dmar_map_entries_tailq *entries, bool cansleep); -void dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free); +void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free); int dmar_init_busdma(struct dmar_unit *unit); void dmar_fini_busdma(struct dmar_unit *unit); +device_t dmar_get_requester(device_t dev, uint16_t *rid); -void dmar_gas_init_ctx(struct dmar_ctx *ctx); -void dmar_gas_fini_ctx(struct dmar_ctx *ctx); -struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags); -void dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry); -void dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry); -int dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common, - dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma, - struct dmar_map_entry **res); -void dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry); -int dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, - u_int eflags, u_int flags, vm_page_t *ma); -int dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start, +void 
dmar_gas_init_domain(struct dmar_domain *domain); +void dmar_gas_fini_domain(struct dmar_domain *domain); +struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain, + u_int flags); +void dmar_gas_free_entry(struct dmar_domain *domain, + struct dmar_map_entry *entry); +void dmar_gas_free_space(struct dmar_domain *domain, + struct dmar_map_entry *entry); +int dmar_gas_map(struct dmar_domain *domain, + const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset, + u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res); +void dmar_gas_free_region(struct dmar_domain *domain, + struct dmar_map_entry *entry); +int dmar_gas_map_region(struct dmar_domain *domain, + struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma); +int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start, dmar_gaddr_t end); -void dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev, +void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev, struct dmar_map_entries_tailq *rmrr_entries); int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar); @@ -305,6 +366,9 @@ void dmar_quirks_post_ident(struct dmar_unit *dmar); void dmar_quirks_pre_use(struct dmar_unit *dmar); +int dmar_init_irt(struct dmar_unit *unit); +void dmar_fini_irt(struct dmar_unit *unit); + #define DMAR_GM_CANWAIT 0x0001 #define DMAR_GM_CANSPLIT 0x0002 @@ -318,6 +382,7 @@ extern int haw; extern int dmar_tbl_pagecnt; extern int dmar_match_verbose; +extern int dmar_batch_coalesce; extern int dmar_check_free; static inline uint32_t @@ -375,13 +440,16 @@ * containing the P or R and W bits, is set only after the high word * is written. For clear, the P bit is cleared first, then the high * word is cleared. + * + * dmar_pte_update updates the pte. For amd64, the update is atomic. + * For i386, it first disables the entry by clearing the word + * containing the P bit, and then defer to dmar_pte_store. The locked + * cmpxchg8b is probably available on any machine having DMAR support, + * but interrupt translation table may be mapped uncached. 
*/ static inline void -dmar_pte_store(volatile uint64_t *dst, uint64_t val) +dmar_pte_store1(volatile uint64_t *dst, uint64_t val) { - - KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx", - dst, (uintmax_t)*dst, (uintmax_t)val)); #ifdef __i386__ volatile uint32_t *p; uint32_t hi, lo; @@ -397,6 +465,28 @@ } static inline void +dmar_pte_store(volatile uint64_t *dst, uint64_t val) +{ + + KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx", + dst, (uintmax_t)*dst, (uintmax_t)val)); + dmar_pte_store1(dst, val); +} + +static inline void +dmar_pte_update(volatile uint64_t *dst, uint64_t val) +{ + +#ifdef __i386__ + volatile uint32_t *p; + + p = (volatile uint32_t *)dst; + *p = 0; +#endif + dmar_pte_store1(dst, val); +} + +static inline void dmar_pte_clear(volatile uint64_t *dst) { #ifdef __i386__ @@ -420,6 +510,36 @@ return (start + size <= ((start + boundary) & ~(boundary - 1))); } +extern struct timespec dmar_hw_timeout; + +#define DMAR_WAIT_UNTIL(cond) \ +{ \ + struct timespec last, curr; \ + bool forever; \ + \ + if (dmar_hw_timeout.tv_sec == 0 && \ + dmar_hw_timeout.tv_nsec == 0) { \ + forever = true; \ + } else { \ + forever = false; \ + nanouptime(&curr); \ + last = curr; \ + timespecadd(&last, &dmar_hw_timeout); \ + } \ + for (;;) { \ + if (cond) { \ + error = 0; \ + break; \ + } \ + nanouptime(&curr); \ + if (!forever && timespeccmp(&last, &curr, <)) { \ + error = ETIMEDOUT; \ + break; \ + } \ + cpu_spinwait(); \ + } \ +} + #ifdef INVARIANTS #define TD_PREP_PINNED_ASSERT \ int old_td_pinned; \ Modified: trunk/sys/x86/iommu/intel_drv.c =================================================================== --- trunk/sys/x86/iommu/intel_drv.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_drv.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -1,6 +1,6 @@ /* $MidnightBSD$ */ /*- - * Copyright (c) 2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 The FreeBSD Foundation * All rights reserved. 
* * This software was developed by Konstantin Belousov <kib at FreeBSD.org> @@ -29,10 +29,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_drv.c 279470 2015-03-01 04:22:06Z rstone $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_drv.c 323921 2017-09-22 10:51:32Z kib $"); #include "opt_acpi.h" -#if defined(__amd64__) /* || defined(__ia64__) */ +#if defined(__amd64__) #define DEV_APIC #else #include "opt_apic.h" @@ -51,6 +51,7 @@ #include <sys/smp.h> #include <sys/taskqueue.h> #include <sys/tree.h> +#include <sys/vmem.h> #include <machine/bus.h> #include <contrib/dev/acpica/include/acpi.h> #include <contrib/dev/acpica/include/accommon.h> @@ -66,10 +67,14 @@ #include <x86/iommu/intel_reg.h> #include <x86/iommu/busdma_dmar.h> #include <x86/iommu/intel_dmar.h> +#include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #ifdef DEV_APIC #include "pcib_if.h" +#include <machine/intr_machdep.h> +#include <x86/apicreg.h> +#include <x86/apicvar.h> #endif #define DMAR_FAULT_IRQ_RID 0 @@ -108,6 +113,7 @@ if (!iter(dmarh, arg)) break; } + AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl); } struct find_iter_args { @@ -183,6 +189,7 @@ (unsigned)dmartbl->Flags, "\020\001INTR_REMAP\002X2APIC_OPT_OUT"); } + AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl); dmar_iterate_tbl(dmar_count_iter, NULL); if (dmar_devcnt == 0) @@ -244,6 +251,7 @@ int i; dmar_fini_busdma(unit); + dmar_fini_irt(unit); dmar_fini_qi(unit); dmar_fini_fault_log(unit); for (i = 0; i < DMAR_INTR_TOTAL; i++) @@ -304,7 +312,7 @@ dmd->name, error); goto err4; } - bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, dmd->name); + bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, "%s", dmd->name); error = PCIB_MAP_MSI(pcib, dev, dmd->irq, &msi_addr, &msi_data); if (error != 0) { device_printf(dev, "cannot map %s interrupt, %d\n", @@ -398,6 +406,7 @@ { struct dmar_unit *unit; ACPI_DMAR_HARDWARE_UNIT *dmaru; + uint64_t timeout; int i, error; unit = device_get_softc(dev); @@ -422,6 +431,10 @@ dmar_print_caps(dev, unit, dmaru); dmar_quirks_post_ident(unit); + timeout = dmar_get_timeout(); + TUNABLE_UINT64_FETCH("hw.dmar.timeout", &timeout); + dmar_update_timeout(timeout); + for (i = 0; i < DMAR_INTR_TOTAL; i++) unit->intrs[i].irq = -1; @@ -457,6 +470,7 @@ mtx_init(&unit->lock, "dmarhw", NULL, MTX_DEF); unit->domids = new_unrhdr(0, dmar_nd2mask(DMAR_CAP_ND(unit->hw_cap)), &unit->lock); + LIST_INIT(&unit->domains); /* * 9.2 "Context Entry": @@ -510,6 +524,11 @@ dmar_release_resources(dev, unit); return (error); } + error = dmar_init_irt(unit); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } error = dmar_init_busdma(unit); if (error != 0) { dmar_release_resources(dev, unit); @@ -764,8 +783,87 @@ return (device_get_softc(dmar_dev)); } +static struct dmar_unit * +dmar_find_nonpci(u_int id, u_int entry_type, uint16_t *rid) +{ + device_t dmar_dev; + struct dmar_unit *unit; + ACPI_DMAR_HARDWARE_UNIT *dmarh; + ACPI_DMAR_DEVICE_SCOPE *devscope; + ACPI_DMAR_PCI_PATH *path; + char *ptr, *ptrend; +#ifdef DEV_APIC + int error; +#endif + int i; + + for (i = 0; i < dmar_devcnt; i++) { + dmar_dev = dmar_devs[i]; + if (dmar_dev == NULL) + continue; + unit = (struct dmar_unit *)device_get_softc(dmar_dev); + dmarh = dmar_find_by_index(i); + if (dmarh == NULL) + continue; + ptr = (char *)dmarh + sizeof(*dmarh); + ptrend = (char *)dmarh + dmarh->Header.Length; + for (;;) { + if (ptr >= ptrend) + break; + devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr; + ptr += devscope->Length; + if (devscope->EntryType != 
entry_type) + continue; + if (devscope->EnumerationId != id) + continue; +#ifdef DEV_APIC + if (entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) { + error = ioapic_get_rid(id, rid); + /* + * If our IOAPIC has PCI bindings then + * use the PCI device rid. + */ + if (error == 0) + return (unit); + } +#endif + if (devscope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE) + == 2) { + if (rid != NULL) { + path = (ACPI_DMAR_PCI_PATH *) + (devscope + 1); + *rid = PCI_RID(devscope->Bus, + path->Device, path->Function); + } + return (unit); + } + printf( + "dmar_find_nonpci: id %d type %d path length != 2\n", + id, entry_type); + break; + } + } + return (NULL); +} + + +struct dmar_unit * +dmar_find_hpet(device_t dev, uint16_t *rid) +{ + + return (dmar_find_nonpci(hpet_get_uid(dev), ACPI_DMAR_SCOPE_TYPE_HPET, + rid)); +} + +struct dmar_unit * +dmar_find_ioapic(u_int apic_id, uint16_t *rid) +{ + + return (dmar_find_nonpci(apic_id, ACPI_DMAR_SCOPE_TYPE_IOAPIC, rid)); +} + struct rmrr_iter_args { - struct dmar_ctx *ctx; + struct dmar_domain *domain; device_t dev; int dev_domain; int dev_busno; @@ -810,7 +908,8 @@ if (match == 1) { if (dmar_match_verbose) printf("matched\n"); - entry = dmar_gas_alloc_entry(ria->ctx, DMAR_PGF_WAITOK); + entry = dmar_gas_alloc_entry(ria->domain, + DMAR_PGF_WAITOK); entry->start = resmem->BaseAddress; /* The RMRR entry end address is inclusive. */ entry->end = resmem->EndAddress; @@ -825,7 +924,7 @@ } void -dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev, +dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev, struct dmar_map_entries_tailq *rmrr_entries) { struct rmrr_iter_args ria; @@ -841,7 +940,7 @@ dev_path); } - ria.ctx = ctx; + ria.domain = domain; ria.dev = dev; ria.dev_path = dev_path; ria.rmrr_entries = rmrr_entries; @@ -961,7 +1060,7 @@ printf("dmar%d: instantiating RMRR contexts\n", dmar->unit); dmar_iterate_tbl(dmar_inst_rmrr_iter, &iria); DMAR_LOCK(dmar); - if (!LIST_EMPTY(&dmar->contexts)) { + if (!LIST_EMPTY(&dmar->domains)) { KASSERT((dmar->hw_gcmd & DMAR_GCMD_TE) == 0, ("dmar%d: RMRR not handled but translation is already enabled", dmar->unit)); @@ -976,7 +1075,7 @@ #include <ddb/db_lex.h> static void -dmar_print_ctx_entry(const struct dmar_map_entry *entry) +dmar_print_domain_entry(const struct dmar_map_entry *entry) { struct dmar_map_entry *l, *r; @@ -1000,24 +1099,39 @@ } static void -dmar_print_ctx(struct dmar_ctx *ctx, bool show_mappings) +dmar_print_ctx(struct dmar_ctx *ctx) { - struct dmar_map_entry *entry; db_printf( - " @%p pci%d:%d:%d dom %d mgaw %d agaw %d pglvl %d end %jx\n" - " refs %d flags %x pgobj %p map_ents %u loads %lu unloads %lu\n", + " @%p pci%d:%d:%d refs %d flags %x loads %lu unloads %lu\n", ctx, pci_get_bus(ctx->ctx_tag.owner), pci_get_slot(ctx->ctx_tag.owner), - pci_get_function(ctx->ctx_tag.owner), ctx->domain, ctx->mgaw, - ctx->agaw, ctx->pglvl, (uintmax_t)ctx->end, ctx->refs, - ctx->flags, ctx->pgtbl_obj, ctx->entries_cnt, ctx->loads, - ctx->unloads); + pci_get_function(ctx->ctx_tag.owner), ctx->refs, ctx->flags, + ctx->loads, ctx->unloads); +} + +static void +dmar_print_domain(struct dmar_domain *domain, bool show_mappings) +{ + struct dmar_map_entry *entry; + struct dmar_ctx *ctx; + + db_printf( + " @%p dom %d mgaw %d agaw %d pglvl %d end %jx refs %d\n" + " ctx_cnt %d flags %x pgobj %p map_ents %u\n", + domain, domain->domain, domain->mgaw, domain->agaw, domain->pglvl, + (uintmax_t)domain->end, domain->refs, domain->ctx_cnt, + domain->flags, domain->pgtbl_obj, domain->entries_cnt); + if (!LIST_EMPTY(&domain->contexts)) { + 
db_printf(" Contexts:\n"); + LIST_FOREACH(ctx, &domain->contexts, link) + dmar_print_ctx(ctx); + } if (!show_mappings) return; db_printf(" mapped:\n"); - RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) { - dmar_print_ctx_entry(entry); + RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) { + dmar_print_domain_entry(entry); if (db_pager_quit) break; } @@ -1024,19 +1138,20 @@ if (db_pager_quit) return; db_printf(" unloading:\n"); - TAILQ_FOREACH(entry, &ctx->unload_entries, dmamap_link) { - dmar_print_ctx_entry(entry); + TAILQ_FOREACH(entry, &domain->unload_entries, dmamap_link) { + dmar_print_domain_entry(entry); if (db_pager_quit) break; } } -DB_FUNC(dmar_ctx, db_dmar_print_ctx, db_show_table, CS_OWN, NULL) +DB_FUNC(dmar_domain, db_dmar_print_domain, db_show_table, CS_OWN, NULL) { struct dmar_unit *unit; + struct dmar_domain *domain; struct dmar_ctx *ctx; bool show_mappings, valid; - int domain, bus, device, function, i, t; + int pci_domain, bus, device, function, i, t; db_expr_t radix; valid = false; @@ -1057,7 +1172,7 @@ show_mappings = false; } if (t == tNUMBER) { - domain = db_tok_number; + pci_domain = db_tok_number; t = db_read_token(); if (t == tNUMBER) { bus = db_tok_number; @@ -1075,19 +1190,24 @@ db_radix = radix; db_skip_to_eol(); if (!valid) { - db_printf("usage: show dmar_ctx [/m] " + db_printf("usage: show dmar_domain [/m] " "<domain> <bus> <device> <func>\n"); return; } for (i = 0; i < dmar_devcnt; i++) { unit = device_get_softc(dmar_devs[i]); - LIST_FOREACH(ctx, &unit->contexts, link) { - if (domain == unit->segment && - bus == pci_get_bus(ctx->ctx_tag.owner) && - device == pci_get_slot(ctx->ctx_tag.owner) && - function == pci_get_function(ctx->ctx_tag.owner)) { - dmar_print_ctx(ctx, show_mappings); - goto out; + LIST_FOREACH(domain, &unit->domains, link) { + LIST_FOREACH(ctx, &domain->contexts, link) { + if (pci_domain == unit->segment && + bus == pci_get_bus(ctx->ctx_tag.owner) && + device == + pci_get_slot(ctx->ctx_tag.owner) && + function == + pci_get_function(ctx->ctx_tag.owner)) { + dmar_print_domain(domain, + show_mappings); + goto out; + } } } } @@ -1095,10 +1215,10 @@ } static void -dmar_print_one(int idx, bool show_ctxs, bool show_mappings) +dmar_print_one(int idx, bool show_domains, bool show_mappings) { struct dmar_unit *unit; - struct dmar_ctx *ctx; + struct dmar_domain *domain; int i, frir; unit = device_get_softc(dmar_devs[idx]); @@ -1110,6 +1230,10 @@ dmar_read4(unit, DMAR_GSTS_REG), dmar_read4(unit, DMAR_FSTS_REG), dmar_read4(unit, DMAR_FECTL_REG)); + if (unit->ir_enabled) { + db_printf("ir is enabled; IRT @%p phys 0x%jx maxcnt %d\n", + unit->irt, (uintmax_t)unit->irt_phys, unit->irte_cnt); + } db_printf("fed 0x%x fea 0x%x feua 0x%x\n", dmar_read4(unit, DMAR_FEDATA_REG), dmar_read4(unit, DMAR_FEADDR_REG), @@ -1148,10 +1272,10 @@ db_printf("qi is disabled\n"); } } - if (show_ctxs) { - db_printf("contexts:\n"); - LIST_FOREACH(ctx, &unit->contexts, link) { - dmar_print_ctx(ctx, show_mappings); + if (show_domains) { + db_printf("domains:\n"); + LIST_FOREACH(domain, &unit->domains, link) { + dmar_print_domain(domain, show_mappings); if (db_pager_quit) break; } @@ -1160,27 +1284,27 @@ DB_SHOW_COMMAND(dmar, db_dmar_print) { - bool show_ctxs, show_mappings; + bool show_domains, show_mappings; - show_ctxs = strchr(modif, 'c') != NULL; + show_domains = strchr(modif, 'd') != NULL; show_mappings = strchr(modif, 'm') != NULL; if (!have_addr) { - db_printf("usage: show dmar [/c] [/m] index\n"); + db_printf("usage: show dmar [/d] [/m] index\n"); return; } - 
dmar_print_one((int)addr, show_ctxs, show_mappings); + dmar_print_one((int)addr, show_domains, show_mappings); } DB_SHOW_ALL_COMMAND(dmars, db_show_all_dmars) { int i; - bool show_ctxs, show_mappings; + bool show_domains, show_mappings; - show_ctxs = strchr(modif, 'c') != NULL; + show_domains = strchr(modif, 'd') != NULL; show_mappings = strchr(modif, 'm') != NULL; for (i = 0; i < dmar_devcnt; i++) { - dmar_print_one(i, show_ctxs, show_mappings); + dmar_print_one(i, show_domains, show_mappings); if (db_pager_quit) break; } Modified: trunk/sys/x86/iommu/intel_fault.c =================================================================== --- trunk/sys/x86/iommu/intel_fault.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_fault.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_fault.c 279485 2015-03-01 10:35:54Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_fault.c 309882 2016-12-12 09:43:48Z kib $"); #include "opt_acpi.h" @@ -42,6 +42,7 @@ #include <sys/rman.h> #include <sys/taskqueue.h> #include <sys/tree.h> +#include <sys/vmem.h> #include <machine/bus.h> #include <contrib/dev/acpica/include/acpi.h> #include <contrib/dev/acpica/include/accommon.h> @@ -179,7 +180,7 @@ } if (enqueue) { - taskqueue_enqueue_fast(unit->fault_taskqueue, + taskqueue_enqueue(unit->fault_taskqueue, &unit->fault_task); } return (FILTER_HANDLED); @@ -271,7 +272,7 @@ M_DEVBUF, M_WAITOK | M_ZERO); TASK_INIT(&unit->fault_task, 0, dmar_fault_task, unit); - unit->fault_taskqueue = taskqueue_create_fast("dmar", M_WAITOK, + unit->fault_taskqueue = taskqueue_create_fast("dmarff", M_WAITOK, taskqueue_thread_enqueue, &unit->fault_taskqueue); taskqueue_start_threads(&unit->fault_taskqueue, 1, PI_AV, "dmar%d fault taskq", unit->unit); Modified: trunk/sys/x86/iommu/intel_gas.c =================================================================== --- trunk/sys/x86/iommu/intel_gas.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_gas.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_gas.c 281545 2015-04-15 06:56:51Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_gas.c 329942 2018-02-25 00:32:42Z markj $"); #define RB_AUGMENT(entry) dmar_gas_augment_entry(entry) @@ -50,6 +50,7 @@ #include <sys/taskqueue.h> #include <sys/tree.h> #include <sys/uio.h> +#include <sys/vmem.h> #include <dev/pci/pcivar.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -79,12 +80,12 @@ dmar_map_entry_zone = uma_zcreate("DMAR_MAP_ENTRY", sizeof(struct dmar_map_entry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP); } SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL); struct dmar_map_entry * -dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags) +dmar_gas_alloc_entry(struct dmar_domain *domain, u_int flags) { struct dmar_map_entry *res; @@ -94,20 +95,20 @@ res = uma_zalloc(dmar_map_entry_zone, ((flags & DMAR_PGF_WAITOK) != 0 ? 
M_WAITOK : M_NOWAIT) | M_ZERO); if (res != NULL) { - res->ctx = ctx; - atomic_add_int(&ctx->entries_cnt, 1); + res->domain = domain; + atomic_add_int(&domain->entries_cnt, 1); } return (res); } void -dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_free_entry(struct dmar_domain *domain, struct dmar_map_entry *entry) { - KASSERT(ctx == entry->ctx, - ("mismatched free ctx %p entry %p entry->ctx %p", ctx, - entry, entry->ctx)); - atomic_subtract_int(&ctx->entries_cnt, 1); + KASSERT(domain == entry->domain, + ("mismatched free domain %p entry %p entry->domain %p", domain, + entry, entry->domain)); + atomic_subtract_int(&domain->entries_cnt, 1); uma_zfree(dmar_map_entry_zone, entry); } @@ -158,12 +159,12 @@ dmar_gas_cmp_entries); static void -dmar_gas_fix_free(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_fix_free(struct dmar_domain *domain, struct dmar_map_entry *entry) { struct dmar_map_entry *next; - next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); - entry->free_after = (next != NULL ? next->start : ctx->end) - + next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry); + entry->free_after = (next != NULL ? next->start : domain->end) - entry->end; dmar_gas_augment_entry(entry); } @@ -170,18 +171,18 @@ #ifdef INVARIANTS static void -dmar_gas_check_free(struct dmar_ctx *ctx) +dmar_gas_check_free(struct dmar_domain *domain) { struct dmar_map_entry *entry, *next, *l, *r; dmar_gaddr_t v; - RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) { - KASSERT(ctx == entry->ctx, - ("mismatched free ctx %p entry %p entry->ctx %p", ctx, - entry, entry->ctx)); - next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) { + KASSERT(domain == entry->domain, + ("mismatched free domain %p entry %p entry->domain %p", + domain, entry, entry->domain)); + next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry); if (next == NULL) { - MPASS(entry->free_after == ctx->end - entry->end); + MPASS(entry->free_after == domain->end - entry->end); } else { MPASS(entry->free_after = next->start - entry->end); MPASS(entry->end <= next->start); @@ -198,7 +199,7 @@ l->free_down)); } else { v = MAX(entry->free_after, l->free_down); - v = MAX(entry->free_down, r->free_down); + v = MAX(v, r->free_down); MPASS(entry->free_down == v); } } @@ -206,93 +207,95 @@ #endif static bool -dmar_gas_rb_insert(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_rb_insert(struct dmar_domain *domain, struct dmar_map_entry *entry) { struct dmar_map_entry *prev, *found; - found = RB_INSERT(dmar_gas_entries_tree, &ctx->rb_root, entry); - dmar_gas_fix_free(ctx, entry); - prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); + found = RB_INSERT(dmar_gas_entries_tree, &domain->rb_root, entry); + dmar_gas_fix_free(domain, entry); + prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry); if (prev != NULL) - dmar_gas_fix_free(ctx, prev); + dmar_gas_fix_free(domain, prev); return (found == NULL); } static void -dmar_gas_rb_remove(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_rb_remove(struct dmar_domain *domain, struct dmar_map_entry *entry) { struct dmar_map_entry *prev; - prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); - RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); + prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry); + RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry); if (prev != NULL) - dmar_gas_fix_free(ctx, prev); + 
dmar_gas_fix_free(domain, prev); } void -dmar_gas_init_ctx(struct dmar_ctx *ctx) +dmar_gas_init_domain(struct dmar_domain *domain) { struct dmar_map_entry *begin, *end; - begin = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); - end = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); + begin = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK); + end = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK); - DMAR_CTX_LOCK(ctx); - KASSERT(ctx->entries_cnt == 2, ("dirty ctx %p", ctx)); - KASSERT(RB_EMPTY(&ctx->rb_root), ("non-empty entries %p", ctx)); + DMAR_DOMAIN_LOCK(domain); + KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain)); + KASSERT(RB_EMPTY(&domain->rb_root), ("non-empty entries %p", domain)); begin->start = 0; begin->end = DMAR_PAGE_SIZE; - begin->free_after = ctx->end - begin->end; + begin->free_after = domain->end - begin->end; begin->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED; - dmar_gas_rb_insert(ctx, begin); + dmar_gas_rb_insert(domain, begin); - end->start = ctx->end; - end->end = ctx->end; + end->start = domain->end; + end->end = domain->end; end->free_after = 0; end->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED; - dmar_gas_rb_insert(ctx, end); + dmar_gas_rb_insert(domain, end); - ctx->first_place = begin; - ctx->last_place = end; - DMAR_CTX_UNLOCK(ctx); + domain->first_place = begin; + domain->last_place = end; + domain->flags |= DMAR_DOMAIN_GAS_INITED; + DMAR_DOMAIN_UNLOCK(domain); } void -dmar_gas_fini_ctx(struct dmar_ctx *ctx) +dmar_gas_fini_domain(struct dmar_domain *domain) { struct dmar_map_entry *entry, *entry1; - DMAR_CTX_ASSERT_LOCKED(ctx); - KASSERT(ctx->entries_cnt == 2, ("ctx still in use %p", ctx)); + DMAR_DOMAIN_ASSERT_LOCKED(domain); + KASSERT(domain->entries_cnt == 2, ("domain still in use %p", domain)); - entry = RB_MIN(dmar_gas_entries_tree, &ctx->rb_root); - KASSERT(entry->start == 0, ("start entry start %p", ctx)); - KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", ctx)); + entry = RB_MIN(dmar_gas_entries_tree, &domain->rb_root); + KASSERT(entry->start == 0, ("start entry start %p", domain)); + KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", domain)); KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE, - ("start entry flags %p", ctx)); - RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); - dmar_gas_free_entry(ctx, entry); + ("start entry flags %p", domain)); + RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry); + dmar_gas_free_entry(domain, entry); - entry = RB_MAX(dmar_gas_entries_tree, &ctx->rb_root); - KASSERT(entry->start == ctx->end, ("end entry start %p", ctx)); - KASSERT(entry->end == ctx->end, ("end entry end %p", ctx)); - KASSERT(entry->free_after == 0, ("end entry free_after%p", ctx)); + entry = RB_MAX(dmar_gas_entries_tree, &domain->rb_root); + KASSERT(entry->start == domain->end, ("end entry start %p", domain)); + KASSERT(entry->end == domain->end, ("end entry end %p", domain)); + KASSERT(entry->free_after == 0, ("end entry free_after %p", domain)); KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE, - ("end entry flags %p", ctx)); - RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); - dmar_gas_free_entry(ctx, entry); + ("end entry flags %p", domain)); + RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry); + dmar_gas_free_entry(domain, entry); - RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &ctx->rb_root, entry1) { + RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &domain->rb_root, + entry1) { KASSERT((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0, - ("non-RMRR entry left %p", ctx)); - 
RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); - dmar_gas_free_entry(ctx, entry); + ("non-RMRR entry left %p", domain)); + RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry); + dmar_gas_free_entry(domain, entry); } } struct dmar_gas_match_args { - struct dmar_ctx *ctx; + struct dmar_domain *domain; dmar_gaddr_t size; int offset; const struct bus_dma_tag_common *common; @@ -325,8 +328,8 @@ * the boundary. Check if there is enough space after the * next boundary after the prev->end. */ - bs = (a->entry->start + a->offset + a->common->boundary) & - ~(a->common->boundary - 1); + bs = rounddown2(a->entry->start + a->offset + a->common->boundary, + a->common->boundary); start = roundup2(bs, a->common->alignment); /* DMAR_PAGE_SIZE to create gap after new entry. */ if (start + a->offset + a->size + DMAR_PAGE_SIZE <= @@ -371,12 +374,12 @@ */ a->entry->end = a->entry->start + a->size; - next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev); + next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, prev); KASSERT(next->start >= a->entry->end && next->start - a->entry->start >= a->size && prev->end <= a->entry->end, ("dmar_gas_match_insert hole failed %p prev (%jx, %jx) " - "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->ctx, + "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->domain, (uintmax_t)prev->start, (uintmax_t)prev->end, (uintmax_t)prev->free_after, (uintmax_t)next->start, (uintmax_t)next->end, @@ -385,19 +388,19 @@ prev->free_after = a->entry->start - prev->end; a->entry->free_after = next->start - a->entry->end; - found = dmar_gas_rb_insert(a->ctx, a->entry); + found = dmar_gas_rb_insert(a->domain, a->entry); KASSERT(found, ("found dup %p start %jx size %jx", - a->ctx, (uintmax_t)a->entry->start, (uintmax_t)a->size)); + a->domain, (uintmax_t)a->entry->start, (uintmax_t)a->size)); a->entry->flags = DMAR_MAP_ENTRY_MAP; - KASSERT(RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, + KASSERT(RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, a->entry) == prev, ("entry %p prev %p inserted prev %p", a->entry, prev, - RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry))); - KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, + RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, a->entry))); + KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, a->entry) == next, ("entry %p next %p inserted next %p", a->entry, next, - RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry))); + RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, a->entry))); } static int @@ -434,11 +437,12 @@ struct dmar_map_entry *next, *prev, find_entry; find_entry.start = a->common->highaddr; - next = RB_NFIND(dmar_gas_entries_tree, &a->ctx->rb_root, &find_entry); + next = RB_NFIND(dmar_gas_entries_tree, &a->domain->rb_root, + &find_entry); if (next == NULL) return (ENOMEM); - prev = RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, next); - KASSERT(prev != NULL, ("no prev %p %jx", a->ctx, + prev = RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, next); + KASSERT(prev != NULL, ("no prev %p %jx", a->domain, (uintmax_t)find_entry.start)); for (;;) { a->entry->start = prev->start + DMAR_PAGE_SIZE; @@ -446,7 +450,7 @@ a->entry->start = a->common->highaddr; a->entry->start = roundup2(a->entry->start, a->common->alignment); - if (dmar_gas_match_one(a, prev, a->ctx->end)) { + if (dmar_gas_match_one(a, prev, a->domain->end)) { dmar_gas_match_insert(a, prev); return (0); } @@ -459,16 +463,17 @@ * non-optimal way. 
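 * Each pass below simply advances prev/next to the following pair
 * of entries and retries the fit there, returning ENOMEM once the
 * candidate region would reach the end of the domain address space.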
*/ prev = next; - next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev); - KASSERT(next != NULL, ("no next %p %jx", a->ctx, + next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, + prev); + KASSERT(next != NULL, ("no next %p %jx", a->domain, (uintmax_t)find_entry.start)); - if (next->end >= a->ctx->end) + if (next->end >= a->domain->end) return (ENOMEM); } } static int -dmar_gas_find_space(struct dmar_ctx *ctx, +dmar_gas_find_space(struct dmar_domain *domain, const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset, u_int flags, struct dmar_map_entry *entry) { @@ -475,11 +480,11 @@ struct dmar_gas_match_args a; int error; - DMAR_CTX_ASSERT_LOCKED(ctx); - KASSERT(entry->flags == 0, ("dirty entry %p %p", ctx, entry)); + DMAR_DOMAIN_ASSERT_LOCKED(domain); + KASSERT(entry->flags == 0, ("dirty entry %p %p", domain, entry)); KASSERT((size & DMAR_PAGE_MASK) == 0, ("size %jx", (uintmax_t)size)); - a.ctx = ctx; + a.domain = domain; a.size = size; a.offset = offset; a.common = common; @@ -488,7 +493,7 @@ /* Handle lower region. */ if (common->lowaddr > 0) { - error = dmar_gas_lowermatch(&a, RB_ROOT(&ctx->rb_root)); + error = dmar_gas_lowermatch(&a, RB_ROOT(&domain->rb_root)); if (error == 0) return (0); KASSERT(error == ENOMEM, @@ -495,7 +500,7 @@ ("error %d from dmar_gas_lowermatch", error)); } /* Handle upper region. */ - if (common->highaddr >= ctx->end) + if (common->highaddr >= domain->end) return (ENOMEM); error = dmar_gas_uppermatch(&a); KASSERT(error == ENOMEM, @@ -504,13 +509,13 @@ } static int -dmar_gas_alloc_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, +dmar_gas_alloc_region(struct dmar_domain *domain, struct dmar_map_entry *entry, u_int flags) { struct dmar_map_entry *next, *prev; bool found; - DMAR_CTX_ASSERT_LOCKED(ctx); + DMAR_DOMAIN_ASSERT_LOCKED(domain); if ((entry->start & DMAR_PAGE_MASK) != 0 || (entry->end & DMAR_PAGE_MASK) != 0) @@ -517,13 +522,13 @@ return (EINVAL); if (entry->start >= entry->end) return (EINVAL); - if (entry->end >= ctx->end) + if (entry->end >= domain->end) return (EINVAL); - next = RB_NFIND(dmar_gas_entries_tree, &ctx->rb_root, entry); - KASSERT(next != NULL, ("next must be non-null %p %jx", ctx, + next = RB_NFIND(dmar_gas_entries_tree, &domain->rb_root, entry); + KASSERT(next != NULL, ("next must be non-null %p %jx", domain, (uintmax_t)entry->start)); - prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, next); + prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, next); /* prev could be NULL */ /* @@ -551,23 +556,23 @@ if (prev != NULL && prev->end > entry->start) { /* This assumes that prev is the placeholder entry. 
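 * The placeholders removed here are the domain's first_place /
 * last_place markers; dmar_gas_free_region() re-inserts them when
 * the RMRR region is released, as seen further below in this file.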
*/ - dmar_gas_rb_remove(ctx, prev); + dmar_gas_rb_remove(domain, prev); prev = NULL; } if (next != NULL && next->start < entry->end) { - dmar_gas_rb_remove(ctx, next); + dmar_gas_rb_remove(domain, next); next = NULL; } - found = dmar_gas_rb_insert(ctx, entry); + found = dmar_gas_rb_insert(domain, entry); KASSERT(found, ("found RMRR dup %p start %jx end %jx", - ctx, (uintmax_t)entry->start, (uintmax_t)entry->end)); + domain, (uintmax_t)entry->start, (uintmax_t)entry->end)); entry->flags = DMAR_MAP_ENTRY_RMRR; #ifdef INVARIANTS struct dmar_map_entry *ip, *in; - ip = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); - in = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + ip = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry); + in = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry); KASSERT(prev == NULL || ip == prev, ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)", entry, entry->start, entry->end, prev, @@ -584,47 +589,47 @@ } void -dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_free_space(struct dmar_domain *domain, struct dmar_map_entry *entry) { - DMAR_CTX_ASSERT_LOCKED(ctx); + DMAR_DOMAIN_ASSERT_LOCKED(domain); KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR | DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_MAP, - ("permanent entry %p %p", ctx, entry)); + ("permanent entry %p %p", domain, entry)); - dmar_gas_rb_remove(ctx, entry); + dmar_gas_rb_remove(domain, entry); entry->flags &= ~DMAR_MAP_ENTRY_MAP; #ifdef INVARIANTS if (dmar_check_free) - dmar_gas_check_free(ctx); + dmar_gas_check_free(domain); #endif } void -dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +dmar_gas_free_region(struct dmar_domain *domain, struct dmar_map_entry *entry) { struct dmar_map_entry *next, *prev; - DMAR_CTX_ASSERT_LOCKED(ctx); + DMAR_DOMAIN_ASSERT_LOCKED(domain); KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR | DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_RMRR, - ("non-RMRR entry %p %p", ctx, entry)); + ("non-RMRR entry %p %p", domain, entry)); - prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); - next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); - dmar_gas_rb_remove(ctx, entry); + prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry); + next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry); + dmar_gas_rb_remove(domain, entry); entry->flags &= ~DMAR_MAP_ENTRY_RMRR; if (prev == NULL) - dmar_gas_rb_insert(ctx, ctx->first_place); + dmar_gas_rb_insert(domain, domain->first_place); if (next == NULL) - dmar_gas_rb_insert(ctx, ctx->last_place); + dmar_gas_rb_insert(domain, domain->last_place); } int -dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common, - dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma, - struct dmar_map_entry **res) +dmar_gas_map(struct dmar_domain *domain, + const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset, + u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res) { struct dmar_map_entry *entry; int error; @@ -632,29 +637,31 @@ KASSERT((flags & ~(DMAR_GM_CANWAIT | DMAR_GM_CANSPLIT)) == 0, ("invalid flags 0x%x", flags)); - entry = dmar_gas_alloc_entry(ctx, (flags & DMAR_GM_CANWAIT) != 0 ? + entry = dmar_gas_alloc_entry(domain, (flags & DMAR_GM_CANWAIT) != 0 ? 
DMAR_PGF_WAITOK : 0); if (entry == NULL) return (ENOMEM); - DMAR_CTX_LOCK(ctx); - error = dmar_gas_find_space(ctx, common, size, offset, flags, entry); + DMAR_DOMAIN_LOCK(domain); + error = dmar_gas_find_space(domain, common, size, offset, flags, + entry); if (error == ENOMEM) { - DMAR_CTX_UNLOCK(ctx); - dmar_gas_free_entry(ctx, entry); + DMAR_DOMAIN_UNLOCK(domain); + dmar_gas_free_entry(domain, entry); return (error); } #ifdef INVARIANTS if (dmar_check_free) - dmar_gas_check_free(ctx); + dmar_gas_check_free(domain); #endif KASSERT(error == 0, ("unexpected error %d from dmar_gas_find_entry", error)); - KASSERT(entry->end < ctx->end, ("allocated GPA %jx, max GPA %jx", - (uintmax_t)entry->end, (uintmax_t)ctx->end)); + KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx", + (uintmax_t)entry->end, (uintmax_t)domain->end)); entry->flags |= eflags; - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); - error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, ma, + error = domain_map_buf(domain, entry->start, entry->end - entry->start, + ma, ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) | ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) | ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) | @@ -661,11 +668,11 @@ ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0), (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0); if (error == ENOMEM) { - dmar_ctx_unload_entry(entry, true); + dmar_domain_unload_entry(entry, true); return (error); } KASSERT(error == 0, - ("unexpected error %d from ctx_map_buf", error)); + ("unexpected error %d from domain_map_buf", error)); *res = entry; return (0); @@ -672,30 +679,30 @@ } int -dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, +dmar_gas_map_region(struct dmar_domain *domain, struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma) { dmar_gaddr_t start; int error; - KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", ctx, + KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain, entry, entry->flags)); KASSERT((flags & ~(DMAR_GM_CANWAIT)) == 0, ("invalid flags 0x%x", flags)); start = entry->start; - DMAR_CTX_LOCK(ctx); - error = dmar_gas_alloc_region(ctx, entry, flags); + DMAR_DOMAIN_LOCK(domain); + error = dmar_gas_alloc_region(domain, entry, flags); if (error != 0) { - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); return (error); } entry->flags |= eflags; - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); if (entry->end == entry->start) return (0); - error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, + error = domain_map_buf(domain, entry->start, entry->end - entry->start, ma + OFF_TO_IDX(start - entry->start), ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) | ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) | @@ -703,31 +710,31 @@ ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0), (flags & DMAR_GM_CANWAIT) != 0 ? 
DMAR_PGF_WAITOK : 0); if (error == ENOMEM) { - dmar_ctx_unload_entry(entry, false); + dmar_domain_unload_entry(entry, false); return (error); } KASSERT(error == 0, - ("unexpected error %d from ctx_map_buf", error)); + ("unexpected error %d from domain_map_buf", error)); return (0); } int -dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start, +dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start, dmar_gaddr_t end) { struct dmar_map_entry *entry; int error; - entry = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); + entry = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK); entry->start = start; entry->end = end; - DMAR_CTX_LOCK(ctx); - error = dmar_gas_alloc_region(ctx, entry, DMAR_GM_CANWAIT); + DMAR_DOMAIN_LOCK(domain); + error = dmar_gas_alloc_region(domain, entry, DMAR_GM_CANWAIT); if (error == 0) entry->flags |= DMAR_MAP_ENTRY_UNMAPPED; - DMAR_CTX_UNLOCK(ctx); + DMAR_DOMAIN_UNLOCK(domain); if (error != 0) - dmar_gas_free_entry(ctx, entry); + dmar_gas_free_entry(domain, entry); return (error); } Modified: trunk/sys/x86/iommu/intel_idpgtbl.c =================================================================== --- trunk/sys/x86/iommu/intel_idpgtbl.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_idpgtbl.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_idpgtbl.c 286854 2015-08-17 18:36:16Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_idpgtbl.c 286777 2015-08-14 13:51:59Z kib $"); #include <sys/param.h> #include <sys/systm.h> @@ -49,6 +49,7 @@ #include <sys/taskqueue.h> #include <sys/tree.h> #include <sys/uio.h> +#include <sys/vmem.h> #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_kern.h> @@ -66,8 +67,8 @@ #include <x86/iommu/busdma_dmar.h> #include <x86/iommu/intel_dmar.h> -static int ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, - dmar_gaddr_t size, int flags); +static int domain_unmap_buf_locked(struct dmar_domain *domain, + dmar_gaddr_t base, dmar_gaddr_t size, int flags); /* * The cache of the identity mapping page tables for the DMARs. Using @@ -105,7 +106,7 @@ * mapped by the page table page. */ static void -ctx_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx, +domain_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx, dmar_gaddr_t addr) { vm_page_t m1; @@ -124,7 +125,7 @@ pg_sz = pglvl_page_size(tbl->pglvl, lvl); if (lvl != tbl->leaf) { for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz) - ctx_idmap_nextlvl(tbl, lvl + 1, base + i, f); + domain_idmap_nextlvl(tbl, lvl + 1, base + i, f); } VM_OBJECT_WUNLOCK(tbl->pgtbl_obj); pte = dmar_map_pgtbl(tbl->pgtbl_obj, idx, DMAR_PGF_WAITOK, &sf); @@ -146,7 +147,7 @@ VM_PAGE_TO_PHYS(m1)) | DMAR_PTE_R | DMAR_PTE_W; } } - /* ctx_get_idmap_pgtbl flushes CPU cache if needed. */ + /* domain_get_idmap_pgtbl flushes CPU cache if needed. */ dmar_unmap_pgtbl(sf); VM_OBJECT_WLOCK(tbl->pgtbl_obj); } @@ -160,7 +161,7 @@ * maxaddr is typically mapped. */ vm_object_t -ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr) +domain_get_idmap_pgtbl(struct dmar_domain *domain, dmar_gaddr_t maxaddr) { struct dmar_unit *unit; struct idpgtbl *tbl; @@ -173,8 +174,8 @@ /* * First, determine where to stop the paging structures. 
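 * The leaf is the first level that may terminate a translation:
 * either the last page table level or the shallowest level for
 * which the unit supports superpages, so the identity map is built
 * from the largest page size available.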
*/ - for (i = 0; i < ctx->pglvl; i++) { - if (i == ctx->pglvl - 1 || ctx_is_sp_lvl(ctx, i)) { + for (i = 0; i < domain->pglvl; i++) { + if (i == domain->pglvl - 1 || domain_is_sp_lvl(domain, i)) { leaf = i; break; } @@ -191,12 +192,12 @@ sx_slock(&idpgtbl_lock); LIST_FOREACH(tbl, &idpgtbls, link) { if (tbl->maxaddr >= maxaddr && - dmar_pglvl_supported(ctx->dmar, tbl->pglvl) && + dmar_pglvl_supported(domain->dmar, tbl->pglvl) && tbl->leaf == leaf) { res = tbl->pgtbl_obj; vm_object_reference(res); sx_sunlock(&idpgtbl_lock); - ctx->pglvl = tbl->pglvl; /* XXXKIB ? */ + domain->pglvl = tbl->pglvl; /* XXXKIB ? */ goto end; } } @@ -210,12 +211,12 @@ sx_xlock(&idpgtbl_lock); LIST_FOREACH(tbl, &idpgtbls, link) { if (tbl->maxaddr >= maxaddr && - dmar_pglvl_supported(ctx->dmar, tbl->pglvl) && + dmar_pglvl_supported(domain->dmar, tbl->pglvl) && tbl->leaf == leaf) { res = tbl->pgtbl_obj; vm_object_reference(res); sx_xunlock(&idpgtbl_lock); - ctx->pglvl = tbl->pglvl; /* XXXKIB ? */ + domain->pglvl = tbl->pglvl; /* XXXKIB ? */ return (res); } } @@ -224,13 +225,13 @@ * Still not found, create new page table. */ tbl = malloc(sizeof(*tbl), M_DMAR_IDPGTBL, M_WAITOK); - tbl->pglvl = ctx->pglvl; + tbl->pglvl = domain->pglvl; tbl->leaf = leaf; tbl->maxaddr = maxaddr; tbl->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL, IDX_TO_OFF(pglvl_max_pages(tbl->pglvl)), 0, 0, NULL); VM_OBJECT_WLOCK(tbl->pgtbl_obj); - ctx_idmap_nextlvl(tbl, 0, 0, 0); + domain_idmap_nextlvl(tbl, 0, 0, 0); VM_OBJECT_WUNLOCK(tbl->pgtbl_obj); LIST_INSERT_HEAD(&idpgtbls, tbl, link); res = tbl->pgtbl_obj; @@ -251,7 +252,7 @@ * If DMAR cannot look into the chipset write buffer, flush it * as well. */ - unit = ctx->dmar; + unit = domain->dmar; if (!DMAR_IS_COHERENT(unit)) { VM_OBJECT_WLOCK(res); for (m = vm_page_lookup(res, 0); m != NULL; @@ -320,10 +321,11 @@ * the level lvl. */ static int -ctx_pgtbl_pte_off(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl) +domain_pgtbl_pte_off(struct dmar_domain *domain, dmar_gaddr_t base, int lvl) { - base >>= DMAR_PAGE_SHIFT + (ctx->pglvl - lvl - 1) * DMAR_NPTEPGSHIFT; + base >>= DMAR_PAGE_SHIFT + (domain->pglvl - lvl - 1) * + DMAR_NPTEPGSHIFT; return (base & DMAR_PTEMASK); } @@ -333,21 +335,24 @@ * lvl. 
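 * The page table pages are linearized into the backing object: the
 * root sits at pindex 0, and a page at level l has
 * pindex = domain_pgtbl_pte_off(base, l - 1) + parent_pindex * DMAR_NPTEPG + 1,
 * which is what the loop below accumulates level by level.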
*/ static vm_pindex_t -ctx_pgtbl_get_pindex(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl) +domain_pgtbl_get_pindex(struct dmar_domain *domain, dmar_gaddr_t base, int lvl) { vm_pindex_t idx, pidx; int i; - KASSERT(lvl >= 0 && lvl < ctx->pglvl, ("wrong lvl %p %d", ctx, lvl)); + KASSERT(lvl >= 0 && lvl < domain->pglvl, + ("wrong lvl %p %d", domain, lvl)); - for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) - idx = ctx_pgtbl_pte_off(ctx, base, i) + pidx * DMAR_NPTEPG + 1; + for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) { + idx = domain_pgtbl_pte_off(domain, base, i) + + pidx * DMAR_NPTEPG + 1; + } return (idx); } static dmar_pte_t * -ctx_pgtbl_map_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags, - vm_pindex_t *idxp, struct sf_buf **sf) +domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, + int flags, vm_pindex_t *idxp, struct sf_buf **sf) { vm_page_t m; struct sf_buf *sfp; @@ -354,10 +359,10 @@ dmar_pte_t *pte, *ptep; vm_pindex_t idx, idx1; - DMAR_CTX_ASSERT_PGLOCKED(ctx); + DMAR_DOMAIN_ASSERT_PGLOCKED(domain); KASSERT((flags & DMAR_PGF_OBJL) != 0, ("lost PGF_OBJL")); - idx = ctx_pgtbl_get_pindex(ctx, base, lvl); + idx = domain_pgtbl_get_pindex(domain, base, lvl); if (*sf != NULL && idx == *idxp) { pte = (dmar_pte_t *)sf_buf_kva(*sf); } else { @@ -365,15 +370,16 @@ dmar_unmap_pgtbl(*sf); *idxp = idx; retry: - pte = dmar_map_pgtbl(ctx->pgtbl_obj, idx, flags, sf); + pte = dmar_map_pgtbl(domain->pgtbl_obj, idx, flags, sf); if (pte == NULL) { - KASSERT(lvl > 0, ("lost root page table page %p", ctx)); + KASSERT(lvl > 0, + ("lost root page table page %p", domain)); /* * Page table page does not exist, allocate * it and create a pte in the preceeding page level * to reference the allocated page table page. */ - m = dmar_pgalloc(ctx->pgtbl_obj, idx, flags | + m = dmar_pgalloc(domain->pgtbl_obj, idx, flags | DMAR_PGF_ZERO); if (m == NULL) return (NULL); @@ -381,25 +387,26 @@ /* * Prevent potential free while pgtbl_obj is * unlocked in the recursive call to - * ctx_pgtbl_map_pte(), if other thread did - * pte write and clean while the lock if + * domain_pgtbl_map_pte(), if other thread did + * pte write and clean while the lock is * dropped. 
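 * The temporary wire_count reference taken below keeps the new page
 * table page from being freed while pgtbl_obj is unlocked; it is
 * dropped again once the parent pte referencing the page has been
 * stored.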
*/ m->wire_count++; sfp = NULL; - ptep = ctx_pgtbl_map_pte(ctx, base, lvl - 1, flags, - &idx1, &sfp); + ptep = domain_pgtbl_map_pte(domain, base, lvl - 1, + flags, &idx1, &sfp); if (ptep == NULL) { KASSERT(m->pindex != 0, - ("loosing root page %p", ctx)); + ("loosing root page %p", domain)); m->wire_count--; - dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags); + dmar_pgfree(domain->pgtbl_obj, m->pindex, + flags); return (NULL); } dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W | VM_PAGE_TO_PHYS(m)); - dmar_flush_pte_to_ram(ctx->dmar, ptep); + dmar_flush_pte_to_ram(domain->dmar, ptep); sf_buf_page(sfp)->wire_count += 1; m->wire_count--; dmar_unmap_pgtbl(sfp); @@ -407,13 +414,13 @@ goto retry; } } - pte += ctx_pgtbl_pte_off(ctx, base, lvl); + pte += domain_pgtbl_pte_off(domain, base, lvl); return (pte); } static int -ctx_map_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, - vm_page_t *ma, uint64_t pflags, int flags) +domain_map_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags) { dmar_pte_t *pte; struct sf_buf *sf; @@ -422,7 +429,7 @@ int lvl; bool superpage; - DMAR_CTX_ASSERT_PGLOCKED(ctx); + DMAR_DOMAIN_ASSERT_PGLOCKED(domain); base1 = base; size1 = size; @@ -432,15 +439,15 @@ for (sf = NULL, pi = 0; size > 0; base += pg_sz, size -= pg_sz, pi += run_sz) { for (lvl = 0, c = 0, superpage = false;; lvl++) { - pg_sz = ctx_page_size(ctx, lvl); + pg_sz = domain_page_size(domain, lvl); run_sz = pg_sz >> DMAR_PAGE_SHIFT; - if (lvl == ctx->pglvl - 1) + if (lvl == domain->pglvl - 1) break; /* * Check if the current base suitable for the * superpage mapping. First, verify the level. */ - if (!ctx_is_sp_lvl(ctx, lvl)) + if (!domain_is_sp_lvl(domain, lvl)) continue; /* * Next, look at the size of the mapping and @@ -464,22 +471,23 @@ } } KASSERT(size >= pg_sz, - ("mapping loop overflow %p %jx %jx %jx", ctx, + ("mapping loop overflow %p %jx %jx %jx", domain, (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz)); KASSERT(pg_sz > 0, ("pg_sz 0 lvl %d", lvl)); - pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf); + pte = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf); if (pte == NULL) { KASSERT((flags & DMAR_PGF_WAITOK) == 0, - ("failed waitable pte alloc %p", ctx)); + ("failed waitable pte alloc %p", domain)); if (sf != NULL) dmar_unmap_pgtbl(sf); - ctx_unmap_buf_locked(ctx, base1, base - base1, flags); + domain_unmap_buf_locked(domain, base1, base - base1, + flags); TD_PINNED_ASSERT; return (ENOMEM); } dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags | (superpage ? 
DMAR_PTE_SP : 0)); - dmar_flush_pte_to_ram(ctx->dmar, pte); + dmar_flush_pte_to_ram(domain->dmar, pte); sf_buf_page(sf)->wire_count += 1; } if (sf != NULL) @@ -489,32 +497,32 @@ } int -ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, +domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags) { struct dmar_unit *unit; int error; - unit = ctx->dmar; + unit = domain->dmar; - KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0, - ("modifying idmap pagetable ctx %p", ctx)); + KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0, + ("modifying idmap pagetable domain %p", domain)); KASSERT((base & DMAR_PAGE_MASK) == 0, - ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base, + ("non-aligned base %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); KASSERT((size & DMAR_PAGE_MASK) == 0, - ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base, + ("non-aligned size %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); - KASSERT(size > 0, ("zero size %p %jx %jx", ctx, (uintmax_t)base, + KASSERT(size > 0, ("zero size %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); - KASSERT(base < (1ULL << ctx->agaw), - ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, - (uintmax_t)size, ctx->agaw)); - KASSERT(base + size < (1ULL << ctx->agaw), - ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, - (uintmax_t)size, ctx->agaw)); + KASSERT(base < (1ULL << domain->agaw), + ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base, + (uintmax_t)size, domain->agaw)); + KASSERT(base + size < (1ULL << domain->agaw), + ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base, + (uintmax_t)size, domain->agaw)); KASSERT(base + size > base, - ("size overflow %p %jx %jx", ctx, (uintmax_t)base, + ("size overflow %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); KASSERT((pflags & (DMAR_PTE_R | DMAR_PTE_W)) != 0, ("neither read nor write %jx", (uintmax_t)pflags)); @@ -524,21 +532,21 @@ KASSERT((pflags & DMAR_PTE_SNP) == 0 || (unit->hw_ecap & DMAR_ECAP_SC) != 0, ("PTE_SNP for dmar without snoop control %p %jx", - ctx, (uintmax_t)pflags)); + domain, (uintmax_t)pflags)); KASSERT((pflags & DMAR_PTE_TM) == 0 || (unit->hw_ecap & DMAR_ECAP_DI) != 0, ("PTE_TM for dmar without DIOTLB %p %jx", - ctx, (uintmax_t)pflags)); + domain, (uintmax_t)pflags)); KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags)); - DMAR_CTX_PGLOCK(ctx); - error = ctx_map_buf_locked(ctx, base, size, ma, pflags, flags); - DMAR_CTX_PGUNLOCK(ctx); + DMAR_DOMAIN_PGLOCK(domain); + error = domain_map_buf_locked(domain, base, size, ma, pflags, flags); + DMAR_DOMAIN_PGUNLOCK(domain); if (error != 0) return (error); if ((unit->hw_cap & DMAR_CAP_CM) != 0) - ctx_flush_iotlb_sync(ctx, base, size); + domain_flush_iotlb_sync(domain, base, size); else if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) { /* See 11.1 Write Buffer Flushing. 
*/ DMAR_LOCK(unit); @@ -548,11 +556,13 @@ return (0); } -static void ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, - int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_fs); +static void domain_unmap_clear_pte(struct dmar_domain *domain, + dmar_gaddr_t base, int lvl, int flags, dmar_pte_t *pte, + struct sf_buf **sf, bool free_fs); static void -ctx_free_pgtbl_pde(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags) +domain_free_pgtbl_pde(struct dmar_domain *domain, dmar_gaddr_t base, + int lvl, int flags) { struct sf_buf *sf; dmar_pte_t *pde; @@ -559,18 +569,18 @@ vm_pindex_t idx; sf = NULL; - pde = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf); - ctx_unmap_clear_pte(ctx, base, lvl, flags, pde, &sf, true); + pde = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf); + domain_unmap_clear_pte(domain, base, lvl, flags, pde, &sf, true); } static void -ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, +domain_unmap_clear_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_sf) { vm_page_t m; dmar_pte_clear(&pte->pte); - dmar_flush_pte_to_ram(ctx->dmar, pte); + dmar_flush_pte_to_ram(domain->dmar, pte); m = sf_buf_page(*sf); if (free_sf) { dmar_unmap_pgtbl(*sf); @@ -580,13 +590,13 @@ if (m->wire_count != 0) return; KASSERT(lvl != 0, - ("lost reference (lvl) on root pg ctx %p base %jx lvl %d", - ctx, (uintmax_t)base, lvl)); + ("lost reference (lvl) on root pg domain %p base %jx lvl %d", + domain, (uintmax_t)base, lvl)); KASSERT(m->pindex != 0, - ("lost reference (idx) on root pg ctx %p base %jx lvl %d", - ctx, (uintmax_t)base, lvl)); - dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags); - ctx_free_pgtbl_pde(ctx, base, lvl - 1, flags); + ("lost reference (idx) on root pg domain %p base %jx lvl %d", + domain, (uintmax_t)base, lvl)); + dmar_pgfree(domain->pgtbl_obj, m->pindex, flags); + domain_free_pgtbl_pde(domain, base, lvl - 1, flags); } /* @@ -593,7 +603,7 @@ * Assumes that the unmap is never partial. 
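 * That is, callers always unmap whole previously established
 * mappings, so each iteration below can clear a leaf or superpage
 * pte that covers the current base outright, and never has to
 * split a superpage.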
*/ static int -ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, +domain_unmap_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size, int flags) { dmar_pte_t *pte; @@ -602,26 +612,26 @@ dmar_gaddr_t pg_sz; int lvl; - DMAR_CTX_ASSERT_PGLOCKED(ctx); + DMAR_DOMAIN_ASSERT_PGLOCKED(domain); if (size == 0) return (0); - KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0, - ("modifying idmap pagetable ctx %p", ctx)); + KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0, + ("modifying idmap pagetable domain %p", domain)); KASSERT((base & DMAR_PAGE_MASK) == 0, - ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base, + ("non-aligned base %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); KASSERT((size & DMAR_PAGE_MASK) == 0, - ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base, + ("non-aligned size %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); - KASSERT(base < (1ULL << ctx->agaw), - ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, - (uintmax_t)size, ctx->agaw)); - KASSERT(base + size < (1ULL << ctx->agaw), - ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, - (uintmax_t)size, ctx->agaw)); + KASSERT(base < (1ULL << domain->agaw), + ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base, + (uintmax_t)size, domain->agaw)); + KASSERT(base + size < (1ULL << domain->agaw), + ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base, + (uintmax_t)size, domain->agaw)); KASSERT(base + size > base, - ("size overflow %p %jx %jx", ctx, (uintmax_t)base, + ("size overflow %p %jx %jx", domain, (uintmax_t)base, (uintmax_t)size)); KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags)); @@ -630,26 +640,27 @@ TD_PREP_PINNED_ASSERT; for (sf = NULL; size > 0; base += pg_sz, size -= pg_sz) { - for (lvl = 0; lvl < ctx->pglvl; lvl++) { - if (lvl != ctx->pglvl - 1 && !ctx_is_sp_lvl(ctx, lvl)) + for (lvl = 0; lvl < domain->pglvl; lvl++) { + if (lvl != domain->pglvl - 1 && + !domain_is_sp_lvl(domain, lvl)) continue; - pg_sz = ctx_page_size(ctx, lvl); + pg_sz = domain_page_size(domain, lvl); if (pg_sz > size) continue; - pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, + pte = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf); KASSERT(pte != NULL, ("sleeping or page missed %p %jx %d 0x%x", - ctx, (uintmax_t)base, lvl, flags)); + domain, (uintmax_t)base, lvl, flags)); if ((pte->pte & DMAR_PTE_SP) != 0 || - lvl == ctx->pglvl - 1) { - ctx_unmap_clear_pte(ctx, base, lvl, flags, - pte, &sf, false); + lvl == domain->pglvl - 1) { + domain_unmap_clear_pte(domain, base, lvl, + flags, pte, &sf, false); break; } } KASSERT(size >= pg_sz, - ("unmapping loop overflow %p %jx %jx %jx", ctx, + ("unmapping loop overflow %p %jx %jx %jx", domain, (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz)); } if (sf != NULL) @@ -664,54 +675,58 @@ } int -ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, - int flags) +domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size, int flags) { int error; - DMAR_CTX_PGLOCK(ctx); - error = ctx_unmap_buf_locked(ctx, base, size, flags); - DMAR_CTX_PGUNLOCK(ctx); + DMAR_DOMAIN_PGLOCK(domain); + error = domain_unmap_buf_locked(domain, base, size, flags); + DMAR_DOMAIN_PGUNLOCK(domain); return (error); } int -ctx_alloc_pgtbl(struct dmar_ctx *ctx) +domain_alloc_pgtbl(struct dmar_domain *domain) { vm_page_t m; - KASSERT(ctx->pgtbl_obj == NULL, ("already initialized %p", ctx)); + KASSERT(domain->pgtbl_obj == NULL, + ("already initialized %p", domain)); - ctx->pgtbl_obj = 
vm_pager_allocate(OBJT_PHYS, NULL, - IDX_TO_OFF(pglvl_max_pages(ctx->pglvl)), 0, 0, NULL); - DMAR_CTX_PGLOCK(ctx); - m = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_WAITOK | + domain->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL, + IDX_TO_OFF(pglvl_max_pages(domain->pglvl)), 0, 0, NULL); + DMAR_DOMAIN_PGLOCK(domain); + m = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_WAITOK | DMAR_PGF_ZERO | DMAR_PGF_OBJL); /* No implicit free of the top level page table page. */ m->wire_count = 1; - DMAR_CTX_PGUNLOCK(ctx); + DMAR_DOMAIN_PGUNLOCK(domain); + DMAR_LOCK(domain->dmar); + domain->flags |= DMAR_DOMAIN_PGTBL_INITED; + DMAR_UNLOCK(domain->dmar); return (0); } void -ctx_free_pgtbl(struct dmar_ctx *ctx) +domain_free_pgtbl(struct dmar_domain *domain) { vm_object_t obj; vm_page_t m; - obj = ctx->pgtbl_obj; + obj = domain->pgtbl_obj; if (obj == NULL) { - KASSERT((ctx->dmar->hw_ecap & DMAR_ECAP_PT) != 0 && - (ctx->flags & DMAR_CTX_IDMAP) != 0, - ("lost pagetable object ctx %p", ctx)); + KASSERT((domain->dmar->hw_ecap & DMAR_ECAP_PT) != 0 && + (domain->flags & DMAR_DOMAIN_IDMAP) != 0, + ("lost pagetable object domain %p", domain)); return; } - DMAR_CTX_ASSERT_PGLOCKED(ctx); - ctx->pgtbl_obj = NULL; + DMAR_DOMAIN_ASSERT_PGLOCKED(domain); + domain->pgtbl_obj = NULL; - if ((ctx->flags & DMAR_CTX_IDMAP) != 0) { + if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0) { put_idmap_pgtbl(obj); - ctx->flags &= ~DMAR_CTX_IDMAP; + domain->flags &= ~DMAR_DOMAIN_IDMAP; return; } @@ -724,7 +739,7 @@ } static inline uint64_t -ctx_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro) +domain_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro) { uint64_t iotlbr; @@ -740,7 +755,8 @@ } void -ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size) +domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size) { struct dmar_unit *unit; dmar_gaddr_t isize; @@ -747,14 +763,14 @@ uint64_t iotlbr; int am, iro; - unit = ctx->dmar; + unit = domain->dmar; KASSERT(!unit->qi_enabled, ("dmar%d: sync iotlb flush call", unit->unit)); iro = DMAR_ECAP_IRO(unit->hw_ecap) * 16; DMAR_LOCK(unit); if ((unit->hw_cap & DMAR_CAP_PSI) == 0 || size > 2 * 1024 * 1024) { - iotlbr = ctx_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM | - DMAR_IOTLB_DID(ctx->domain), iro); + iotlbr = domain_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM | + DMAR_IOTLB_DID(domain->domain), iro); KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) != DMAR_IOTLB_IAIG_INVLD, ("dmar%d: invalidation failed %jx", unit->unit, @@ -763,9 +779,9 @@ for (; size > 0; base += isize, size -= isize) { am = calc_am(unit, base, size, &isize); dmar_write8(unit, iro, base | am); - iotlbr = ctx_wait_iotlb_flush(unit, - DMAR_IOTLB_IIRG_PAGE | DMAR_IOTLB_DID(ctx->domain), - iro); + iotlbr = domain_wait_iotlb_flush(unit, + DMAR_IOTLB_IIRG_PAGE | + DMAR_IOTLB_DID(domain->domain), iro); KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) != DMAR_IOTLB_IAIG_INVLD, ("dmar%d: PSI invalidation failed " Added: trunk/sys/x86/iommu/intel_intrmap.c =================================================================== --- trunk/sys/x86/iommu/intel_intrmap.c (rev 0) +++ trunk/sys/x86/iommu/intel_intrmap.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,381 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib at FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_intrmap.c 340016 2018-11-01 18:34:26Z jhb $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <sys/vmem.h> +#include <machine/bus.h> +#include <machine/intr_machdep.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <x86/include/apicreg.h> +#include <x86/include/apicvar.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> +#include <dev/pci/pcivar.h> +#include <x86/iommu/iommu_intrmap.h> + +static struct dmar_unit *dmar_ir_find(device_t src, uint16_t *rid, + int *is_dmar); +static void dmar_ir_program_irte(struct dmar_unit *unit, u_int idx, + uint64_t low, uint16_t rid); +static int dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie); + +int +iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count) +{ + struct dmar_unit *unit; + vmem_addr_t vmem_res; + u_int idx, i; + int error; + + unit = dmar_ir_find(src, NULL, NULL); + if (unit == NULL || !unit->ir_enabled) { + for (i = 0; i < count; i++) + cookies[i] = -1; + return (EOPNOTSUPP); + } + + error = vmem_alloc(unit->irtids, count, M_FIRSTFIT | M_NOWAIT, + &vmem_res); + if (error != 0) { + KASSERT(error != EOPNOTSUPP, + ("impossible EOPNOTSUPP from vmem")); + return (error); + } + idx = vmem_res; + for (i = 0; i < count; i++) + cookies[i] = idx + i; + return (0); +} + +int +iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie, + uint64_t *addr, uint32_t *data) +{ + struct dmar_unit *unit; + uint64_t low; + uint16_t rid; + int is_dmar; + + unit = dmar_ir_find(src, &rid, &is_dmar); + if (is_dmar) { + KASSERT(unit == NULL, ("DMAR cannot translate itself")); + + /* + * See VT-d specification, 5.1.6 Remapping Hardware - + * Interrupt Programming. 
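+	 * The DMAR unit's own interrupt is never remapped, so the
+	 * message is built in compatibility format: the low 8 bits of
+	 * the target APIC ID go into address bits 12:19, and in x2APIC
+	 * mode the remaining ID bits are placed in the upper address
+	 * dword.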
+ */ + *data = vector; + *addr = MSI_INTEL_ADDR_BASE | ((cpu & 0xff) << 12); + if (x2apic_mode) + *addr |= ((uint64_t)cpu & 0xffffff00) << 32; + else + KASSERT(cpu <= 0xff, ("cpu id too big %d", cpu)); + return (0); + } + if (unit == NULL || !unit->ir_enabled || cookie == -1) + return (EOPNOTSUPP); + + low = (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) : + DMAR_IRTE1_DST_xAPIC(cpu)) | DMAR_IRTE1_V(vector) | + DMAR_IRTE1_DLM_FM | DMAR_IRTE1_TM_EDGE | DMAR_IRTE1_RH_DIRECT | + DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P; + dmar_ir_program_irte(unit, cookie, low, rid); + + if (addr != NULL) { + /* + * See VT-d specification, 5.1.5.2 MSI and MSI-X + * Register Programming. + */ + *addr = MSI_INTEL_ADDR_BASE | ((cookie & 0x7fff) << 5) | + ((cookie & 0x8000) << 2) | 0x18; + *data = 0; + } + return (0); +} + +int +iommu_unmap_msi_intr(device_t src, u_int cookie) +{ + struct dmar_unit *unit; + + if (cookie == -1) + return (0); + unit = dmar_ir_find(src, NULL, NULL); + return (dmar_ir_free_irte(unit, cookie)); +} + +int +iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge, + bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo) +{ + struct dmar_unit *unit; + vmem_addr_t vmem_res; + uint64_t low, iorte; + u_int idx; + int error; + uint16_t rid; + + unit = dmar_find_ioapic(ioapic_id, &rid); + if (unit == NULL || !unit->ir_enabled) { + *cookie = -1; + return (EOPNOTSUPP); + } + + error = vmem_alloc(unit->irtids, 1, M_FIRSTFIT | M_NOWAIT, &vmem_res); + if (error != 0) { + KASSERT(error != EOPNOTSUPP, + ("impossible EOPNOTSUPP from vmem")); + return (error); + } + idx = vmem_res; + low = 0; + switch (irq) { + case IRQ_EXTINT: + low |= DMAR_IRTE1_DLM_ExtINT; + break; + case IRQ_NMI: + low |= DMAR_IRTE1_DLM_NMI; + break; + case IRQ_SMI: + low |= DMAR_IRTE1_DLM_SMI; + break; + default: + KASSERT(vector != 0, ("No vector for IRQ %u", irq)); + low |= DMAR_IRTE1_DLM_FM | DMAR_IRTE1_V(vector); + break; + } + low |= (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) : + DMAR_IRTE1_DST_xAPIC(cpu)) | + (edge ? DMAR_IRTE1_TM_EDGE : DMAR_IRTE1_TM_LEVEL) | + DMAR_IRTE1_RH_DIRECT | DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P; + dmar_ir_program_irte(unit, idx, low, rid); + + if (hi != NULL) { + /* + * See VT-d specification, 5.1.5.1 I/OxAPIC + * Programming. + */ + iorte = (1ULL << 48) | ((uint64_t)(idx & 0x7fff) << 49) | + ((idx & 0x8000) != 0 ? (1 << 11) : 0) | + (edge ? IOART_TRGREDG : IOART_TRGRLVL) | + (activehi ? IOART_INTAHI : IOART_INTALO) | + IOART_DELFIXED | vector; + *hi = iorte >> 32; + *lo = iorte; + } + *cookie = idx; + return (0); +} + +int +iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie) +{ + struct dmar_unit *unit; + u_int idx; + + idx = *cookie; + if (idx == -1) + return (0); + *cookie = -1; + unit = dmar_find_ioapic(ioapic_id, NULL); + KASSERT(unit != NULL && unit->ir_enabled, + ("unmap: cookie %d unit %p", idx, unit)); + return (dmar_ir_free_irte(unit, idx)); +} + +static struct dmar_unit * +dmar_ir_find(device_t src, uint16_t *rid, int *is_dmar) +{ + devclass_t src_class; + struct dmar_unit *unit; + + /* + * We need to determine if the interrupt source generates FSB + * interrupts. If yes, it is either DMAR, in which case + * interrupts are not remapped. Or it is HPET, and interrupts + * are remapped. For HPET, source id is reported by HPET + * record in DMAR ACPI table. 
+ */ + if (is_dmar != NULL) + *is_dmar = FALSE; + src_class = device_get_devclass(src); + if (src_class == devclass_find("dmar")) { + unit = NULL; + if (is_dmar != NULL) + *is_dmar = TRUE; + } else if (src_class == devclass_find("hpet")) { + unit = dmar_find_hpet(src, rid); + } else { + unit = dmar_find(src); + if (unit != NULL && rid != NULL) + dmar_get_requester(src, rid); + } + return (unit); +} + +static void +dmar_ir_program_irte(struct dmar_unit *unit, u_int idx, uint64_t low, + uint16_t rid) +{ + dmar_irte_t *irte; + uint64_t high; + + KASSERT(idx < unit->irte_cnt, + ("bad cookie %d %d", idx, unit->irte_cnt)); + irte = &(unit->irt[idx]); + high = DMAR_IRTE2_SVT_RID | DMAR_IRTE2_SQ_RID | + DMAR_IRTE2_SID_RID(rid); + device_printf(unit->dev, + "programming irte[%d] rid %#x high %#jx low %#jx\n", + idx, rid, (uintmax_t)high, (uintmax_t)low); + DMAR_LOCK(unit); + if ((irte->irte1 & DMAR_IRTE1_P) != 0) { + /* + * The rte is already valid. Assume that the request + * is to remap the interrupt for balancing. Only low + * word of rte needs to be changed. Assert that the + * high word contains expected value. + */ + KASSERT(irte->irte2 == high, + ("irte2 mismatch, %jx %jx", (uintmax_t)irte->irte2, + (uintmax_t)high)); + dmar_pte_update(&irte->irte1, low); + } else { + dmar_pte_store(&irte->irte2, high); + dmar_pte_store(&irte->irte1, low); + } + dmar_qi_invalidate_iec(unit, idx, 1); + DMAR_UNLOCK(unit); + +} + +static int +dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie) +{ + dmar_irte_t *irte; + + KASSERT(unit != NULL && unit->ir_enabled, + ("unmap: cookie %d unit %p", cookie, unit)); + KASSERT(cookie < unit->irte_cnt, + ("bad cookie %u %u", cookie, unit->irte_cnt)); + irte = &(unit->irt[cookie]); + dmar_pte_clear(&irte->irte1); + dmar_pte_clear(&irte->irte2); + DMAR_LOCK(unit); + dmar_qi_invalidate_iec(unit, cookie, 1); + DMAR_UNLOCK(unit); + vmem_free(unit->irtids, cookie, 1); + return (0); +} + +static u_int +clp2(u_int v) +{ + + return (powerof2(v) ? v : 1 << fls(v)); +} + +int +dmar_init_irt(struct dmar_unit *unit) +{ + + if ((unit->hw_ecap & DMAR_ECAP_IR) == 0) + return (0); + unit->ir_enabled = 1; + TUNABLE_INT_FETCH("hw.dmar.ir", &unit->ir_enabled); + if (!unit->ir_enabled) + return (0); + if (!unit->qi_enabled) { + unit->ir_enabled = 0; + if (bootverbose) + device_printf(unit->dev, + "QI disabled, disabling interrupt remapping\n"); + return (0); + } + unit->irte_cnt = clp2(num_io_irqs); + unit->irt = (dmar_irte_t *)(uintptr_t)kmem_alloc_contig(kernel_arena, + unit->irte_cnt * sizeof(dmar_irte_t), M_ZERO | M_WAITOK, 0, + dmar_high, PAGE_SIZE, 0, DMAR_IS_COHERENT(unit) ? + VM_MEMATTR_DEFAULT : VM_MEMATTR_UNCACHEABLE); + if (unit->irt == NULL) + return (ENOMEM); + unit->irt_phys = pmap_kextract((vm_offset_t)unit->irt); + unit->irtids = vmem_create("dmarirt", 0, unit->irte_cnt, 1, 0, + M_FIRSTFIT | M_NOWAIT); + DMAR_LOCK(unit); + dmar_load_irt_ptr(unit); + dmar_qi_invalidate_iec_glob(unit); + DMAR_UNLOCK(unit); + + /* + * Initialize mappings for already configured interrupt pins. + * Required, because otherwise the interrupts fault without + * irtes. 
+ */ + intr_reprogram(); + + DMAR_LOCK(unit); + dmar_enable_ir(unit); + DMAR_UNLOCK(unit); + return (0); +} + +void +dmar_fini_irt(struct dmar_unit *unit) +{ + + unit->ir_enabled = 0; + if (unit->irt != NULL) { + dmar_disable_ir(unit); + dmar_qi_invalidate_iec_glob(unit); + vmem_destroy(unit->irtids); + kmem_free(kernel_arena, (vm_offset_t)unit->irt, + unit->irte_cnt * sizeof(dmar_irte_t)); + } +} Property changes on: trunk/sys/x86/iommu/intel_intrmap.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/iommu/intel_qi.c =================================================================== --- trunk/sys/x86/iommu/intel_qi.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_qi.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_qi.c 284019 2015-06-05 08:23:33Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_qi.c 320357 2017-06-26 12:30:39Z kib $"); #include "opt_acpi.h" @@ -41,7 +41,9 @@ #include <sys/module.h> #include <sys/rman.h> #include <sys/taskqueue.h> +#include <sys/time.h> #include <sys/tree.h> +#include <sys/vmem.h> #include <machine/bus.h> #include <contrib/dev/acpica/include/acpi.h> #include <contrib/dev/acpica/include/accommon.h> @@ -70,27 +72,27 @@ static int dmar_enable_qi(struct dmar_unit *unit) { + int error; DMAR_ASSERT_LOCKED(unit); unit->hw_gcmd |= DMAR_GCMD_QIE; dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) == 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) + != 0)); + return (error); } static int dmar_disable_qi(struct dmar_unit *unit) { + int error; DMAR_ASSERT_LOCKED(unit); unit->hw_gcmd &= ~DMAR_GCMD_QIE; dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) != 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) + == 0)); + return (error); } static void @@ -170,7 +172,8 @@ } static void -dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq) +dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq, + bool emit_wait) { struct dmar_qi_genseq gsec; uint32_t seq; @@ -191,17 +194,21 @@ seq = unit->inv_waitd_seq++; pseq->gen = unit->inv_waitd_gen; pseq->seq = seq; - dmar_qi_emit_wait_descr(unit, seq, true, true, false); + if (emit_wait) { + dmar_qi_ensure(unit, 1); + dmar_qi_emit_wait_descr(unit, seq, true, true, false); + } } static void -dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq) +dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq, + bool nowait) { DMAR_ASSERT_LOCKED(unit); unit->inv_seq_waiters++; while (!dmar_qi_seq_processed(unit, gseq)) { - if (cold) { + if (cold || nowait) { cpu_spinwait(); } else { msleep(&unit->inv_seq_waiters, &unit->lock, 0, @@ -212,14 +219,14 @@ } void -dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, - dmar_gaddr_t size, struct dmar_qi_genseq *pseq) +dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t base, + dmar_gaddr_t size, struct 
dmar_qi_genseq *pseq, bool emit_wait) { struct dmar_unit *unit; dmar_gaddr_t isize; int am; - unit = ctx->dmar; + unit = domain->dmar; DMAR_ASSERT_LOCKED(unit); for (; size > 0; base += isize, size -= isize) { am = calc_am(unit, base, size, &isize); @@ -227,13 +234,10 @@ dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_PAGE | DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR | - DMAR_IQ_DESCR_IOTLB_DID(ctx->domain), + DMAR_IQ_DESCR_IOTLB_DID(domain->domain), base | am); } - if (pseq != NULL) { - dmar_qi_ensure(unit, 1); - dmar_qi_emit_wait_seq(unit, pseq); - } + dmar_qi_emit_wait_seq(unit, pseq, emit_wait); dmar_qi_advance_tail(unit); } @@ -245,9 +249,9 @@ DMAR_ASSERT_LOCKED(unit); dmar_qi_ensure(unit, 2); dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0); - dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_emit_wait_seq(unit, &gseq, true); dmar_qi_advance_tail(unit); - dmar_qi_wait_for_seq(unit, &gseq); + dmar_qi_wait_for_seq(unit, &gseq, false); } void @@ -259,11 +263,64 @@ dmar_qi_ensure(unit, 2); dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB | DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0); - dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_emit_wait_seq(unit, &gseq, true); dmar_qi_advance_tail(unit); - dmar_qi_wait_for_seq(unit, &gseq); + dmar_qi_wait_for_seq(unit, &gseq, false); } +void +dmar_qi_invalidate_iec_glob(struct dmar_unit *unit) +{ + struct dmar_qi_genseq gseq; + + DMAR_ASSERT_LOCKED(unit); + dmar_qi_ensure(unit, 2); + dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV, 0); + dmar_qi_emit_wait_seq(unit, &gseq, true); + dmar_qi_advance_tail(unit); + dmar_qi_wait_for_seq(unit, &gseq, false); +} + +void +dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt) +{ + struct dmar_qi_genseq gseq; + u_int c, l; + + DMAR_ASSERT_LOCKED(unit); + KASSERT(start < unit->irte_cnt && start < start + cnt && + start + cnt <= unit->irte_cnt, + ("inv iec overflow %d %d %d", unit->irte_cnt, start, cnt)); + for (; cnt > 0; cnt -= c, start += c) { + l = ffs(start | cnt) - 1; + c = 1 << l; + dmar_qi_ensure(unit, 1); + dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV | + DMAR_IQ_DESCR_IEC_IDX | DMAR_IQ_DESCR_IEC_IIDX(start) | + DMAR_IQ_DESCR_IEC_IM(l), 0); + } + dmar_qi_ensure(unit, 1); + dmar_qi_emit_wait_seq(unit, &gseq, true); + dmar_qi_advance_tail(unit); + + /* + * The caller of the function, in particular, + * dmar_ir_program_irte(), may be called from the context + * where the sleeping is forbidden (in fact, the + * intr_table_lock mutex may be held, locked from + * intr_shuffle_irqs()). Wait for the invalidation completion + * using the busy wait. + * + * The impact on the interrupt input setup code is small, the + * expected overhead is comparable with the chipset register + * read. It is more harmful for the parallel DMA operations, + * since we own the dmar unit lock until whole invalidation + * queue is processed, which includes requests possibly issued + * before our request. 
+ */ + dmar_qi_wait_for_seq(unit, &gseq, true); +} + int dmar_qi_intr(void *arg) { @@ -271,7 +328,7 @@ unit = arg; KASSERT(unit->qi_enabled, ("dmar%d: QI is not enabled", unit->unit)); - taskqueue_enqueue_fast(unit->qi_taskqueue, &unit->qi_task); + taskqueue_enqueue(unit->qi_taskqueue, &unit->qi_task); return (FILTER_HANDLED); } @@ -289,12 +346,11 @@ entry = TAILQ_FIRST(&unit->tlb_flush_entries); if (entry == NULL) break; - if ((entry->gseq.gen == 0 && entry->gseq.seq == 0) || - !dmar_qi_seq_processed(unit, &entry->gseq)) + if (!dmar_qi_seq_processed(unit, &entry->gseq)) break; TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link); DMAR_UNLOCK(unit); - dmar_ctx_free_entry(entry, (entry->flags & + dmar_domain_free_entry(entry, (entry->flags & DMAR_MAP_ENTRY_QI_NF) == 0); DMAR_LOCK(unit); } @@ -324,7 +380,7 @@ TAILQ_INIT(&unit->tlb_flush_entries); TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit); - unit->qi_taskqueue = taskqueue_create_fast("dmar", M_WAITOK, + unit->qi_taskqueue = taskqueue_create_fast("dmarqf", M_WAITOK, taskqueue_thread_enqueue, &unit->qi_taskqueue); taskqueue_start_threads(&unit->qi_taskqueue, 1, PI_AV, "dmar%d qi taskq", unit->unit); @@ -377,9 +433,9 @@ DMAR_LOCK(unit); /* quisce */ dmar_qi_ensure(unit, 1); - dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_emit_wait_seq(unit, &gseq, true); dmar_qi_advance_tail(unit); - dmar_qi_wait_for_seq(unit, &gseq); + dmar_qi_wait_for_seq(unit, &gseq, false); /* only after the quisce, disable queue */ dmar_disable_qi_intr(unit); dmar_disable_qi(unit); Modified: trunk/sys/x86/iommu/intel_quirks.c =================================================================== --- trunk/sys/x86/iommu/intel_quirks.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_quirks.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -1,6 +1,6 @@ /* $MidnightBSD$ */ /*- - * Copyright (c) 2013 The FreeBSD Foundation + * Copyright (c) 2013, 2015 The FreeBSD Foundation * All rights reserved. 
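The deferred-free loop in dmar_qi_task() above, like the emit-wait-then-block pattern in the global invalidation routines, hinges on comparing a (generation, sequence) pair against the most recently completed wait descriptor. The real comparison lives in dmar_qi_seq_processed(), which is not part of this hunk; the sketch below is only a rough, hypothetical model of such a check and deliberately ignores generation wraparound.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct genseq {
	uint32_t gen;	/* bumped when the sequence counter wraps */
	uint32_t seq;
};

/* Hypothetical: has "done" reached "want"? (wraparound not handled) */
static bool
seq_processed(const struct genseq *done, const struct genseq *want)
{
	return (done->gen > want->gen ||
	    (done->gen == want->gen && done->seq >= want->seq));
}

int
main(void)
{
	struct genseq done = { .gen = 1, .seq = 10 };
	struct genseq want = { .gen = 1, .seq = 12 };

	printf("%d\n", seq_processed(&done, &want));	/* 0: still pending */
	done.seq = 12;
	printf("%d\n", seq_processed(&done, &want));	/* 1: completed */
	return (0);
}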
* * This software was developed by Konstantin Belousov <kib at FreeBSD.org> @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_quirks.c 257251 2013-10-28 13:33:29Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_quirks.c 280260 2015-03-19 13:57:47Z kib $"); #include <sys/param.h> #include <sys/bus.h> @@ -43,6 +43,7 @@ #include <sys/smp.h> #include <sys/taskqueue.h> #include <sys/tree.h> +#include <sys/vmem.h> #include <machine/bus.h> #include <contrib/dev/acpica/include/acpi.h> #include <contrib/dev/acpica/include/accommon.h> @@ -60,7 +61,7 @@ #include <x86/iommu/intel_dmar.h> #include <dev/pci/pcivar.h> -typedef void (*dmar_quirk_fun)(struct dmar_unit *); +typedef void (*dmar_quirk_cpu_fun)(struct dmar_unit *); struct intel_dmar_quirk_cpu { u_int ext_family; @@ -68,17 +69,21 @@ u_int family_code; u_int model; u_int stepping; - dmar_quirk_fun quirk; + dmar_quirk_cpu_fun quirk; const char *descr; }; +typedef void (*dmar_quirk_nb_fun)(struct dmar_unit *, device_t nb); + struct intel_dmar_quirk_nb { u_int dev_id; u_int rev_no; - dmar_quirk_fun quirk; + dmar_quirk_nb_fun quirk; const char *descr; }; +#define QUIRK_NB_ALL_REV 0xffffffff + static void dmar_match_quirks(struct dmar_unit *dmar, const struct intel_dmar_quirk_nb *nb_quirks, int nb_quirks_len, @@ -100,13 +105,14 @@ for (i = 0; i < nb_quirks_len; i++) { nb_quirk = &nb_quirks[i]; if (nb_quirk->dev_id == dev_id && - nb_quirk->rev_no == rev_no) { + (nb_quirk->rev_no == rev_no || + nb_quirk->rev_no == QUIRK_NB_ALL_REV)) { if (bootverbose) { device_printf(dmar->dev, "NB IOMMU quirk %s\n", nb_quirk->descr); } - nb_quirk->quirk(dmar); + nb_quirk->quirk(dmar, nb); } } } else { @@ -140,12 +146,29 @@ } static void -nb_5400_no_low_high_prot_mem(struct dmar_unit *unit) +nb_5400_no_low_high_prot_mem(struct dmar_unit *unit, device_t nb __unused) { unit->hw_cap &= ~(DMAR_CAP_PHMR | DMAR_CAP_PLMR); } +static void +nb_no_ir(struct dmar_unit *unit, device_t nb __unused) +{ + + unit->hw_ecap &= ~(DMAR_ECAP_IR | DMAR_ECAP_EIM); +} + +static void +nb_5500_no_ir_rev13(struct dmar_unit *unit, device_t nb) +{ + u_int rev_no; + + rev_no = pci_get_revid(nb); + if (rev_no <= 0x13) + nb_no_ir(unit, nb); +} + static const struct intel_dmar_quirk_nb pre_use_nb[] = { { .dev_id = 0x4001, .rev_no = 0x20, @@ -157,6 +180,26 @@ .quirk = nb_5400_no_low_high_prot_mem, .descr = "5400 E23" /* no low/high protected memory */ }, + { + .dev_id = 0x3403, .rev_no = QUIRK_NB_ALL_REV, + .quirk = nb_5500_no_ir_rev13, + .descr = "5500 E47, E53" /* interrupt remapping does not work */ + }, + { + .dev_id = 0x3405, .rev_no = QUIRK_NB_ALL_REV, + .quirk = nb_5500_no_ir_rev13, + .descr = "5500 E47, E53" /* interrupt remapping does not work */ + }, + { + .dev_id = 0x3405, .rev_no = 0x22, + .quirk = nb_no_ir, + .descr = "5500 E47, E53" /* interrupt remapping does not work */ + }, + { + .dev_id = 0x3406, .rev_no = QUIRK_NB_ALL_REV, + .quirk = nb_5500_no_ir_rev13, + .descr = "5500 E47, E53" /* interrupt remapping does not work */ + }, }; static void Modified: trunk/sys/x86/iommu/intel_reg.h =================================================================== --- trunk/sys/x86/iommu/intel_reg.h 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_reg.h 2020-02-08 19:32:41 UTC (rev 12310) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
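The pre_use_nb[] table above now mixes exact-revision entries with QUIRK_NB_ALL_REV wildcards, and nb_5500_no_ir_rev13() re-checks the revision on its own. A minimal standalone restatement of the matching rule follows; the struct and function names here are invented for illustration and are not from the source tree.

#include <stdbool.h>
#include <stdio.h>

#define QUIRK_NB_ALL_REV	0xffffffffu

struct nb_quirk {
	unsigned int dev_id;
	unsigned int rev_no;	/* exact revision, or QUIRK_NB_ALL_REV */
	const char *descr;
};

static bool
nb_quirk_matches(const struct nb_quirk *q, unsigned int dev_id,
    unsigned int rev_no)
{
	return (q->dev_id == dev_id &&
	    (q->rev_no == rev_no || q->rev_no == QUIRK_NB_ALL_REV));
}

int
main(void)
{
	static const struct nb_quirk q = {
		.dev_id = 0x3403, .rev_no = QUIRK_NB_ALL_REV,
		.descr = "5500 E47, E53"
	};

	/* The wildcard entry matches any revision of device 0x3403. */
	printf("0x3403 rev 0x12: %d\n", nb_quirk_matches(&q, 0x3403, 0x12));
	printf("0x3406 rev 0x12: %d\n", nb_quirk_matches(&q, 0x3406, 0x12));
	return (0);
}

The 5500 wildcard entries then defer to the quirk handler itself, which only disables interrupt remapping when the revision read from the bridge is at or below 0x13.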
* - * $FreeBSD: stable/10/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $ + * $FreeBSD: stable/11/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $ */ #ifndef __X86_IOMMU_INTEL_REG_H Modified: trunk/sys/x86/iommu/intel_utils.c =================================================================== --- trunk/sys/x86/iommu/intel_utils.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/iommu/intel_utils.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_utils.c 279470 2015-03-01 04:22:06Z rstone $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_utils.c 327785 2018-01-10 20:39:26Z markj $"); #include <sys/param.h> #include <sys/bus.h> @@ -47,7 +47,9 @@ #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +#include <sys/time.h> #include <sys/tree.h> +#include <sys/vmem.h> #include <dev/pci/pcivar.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -58,6 +60,8 @@ #include <vm/vm_pageout.h> #include <machine/bus.h> #include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <x86/include/apicvar.h> #include <x86/include/busdma_impl.h> #include <x86/iommu/intel_reg.h> #include <x86/iommu/busdma_dmar.h> @@ -98,7 +102,6 @@ {.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL, .pglvl = 6} }; -#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0])) bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl) @@ -105,7 +108,7 @@ { int i; - for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + for (i = 0; i < nitems(sagaw_bits); i++) { if (sagaw_bits[i].pglvl != pglvl) continue; if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0) @@ -115,26 +118,23 @@ } int -ctx_set_agaw(struct dmar_ctx *ctx, int mgaw) +domain_set_agaw(struct dmar_domain *domain, int mgaw) { int sagaw, i; - ctx->mgaw = mgaw; - sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap); - for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + domain->mgaw = mgaw; + sagaw = DMAR_CAP_SAGAW(domain->dmar->hw_cap); + for (i = 0; i < nitems(sagaw_bits); i++) { if (sagaw_bits[i].agaw >= mgaw) { - ctx->agaw = sagaw_bits[i].agaw; - ctx->pglvl = sagaw_bits[i].pglvl; - ctx->awlvl = sagaw_bits[i].awlvl; + domain->agaw = sagaw_bits[i].agaw; + domain->pglvl = sagaw_bits[i].pglvl; + domain->awlvl = sagaw_bits[i].awlvl; return (0); } } - device_printf(ctx->dmar->dev, - "context request mgaw %d for pci%d:%d:%d:%d, " - "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment, - pci_get_bus(ctx->ctx_tag.owner), - pci_get_slot(ctx->ctx_tag.owner), - pci_get_function(ctx->ctx_tag.owner), sagaw); + device_printf(domain->dmar->dev, + "context request mgaw %d: no agaw found, sagaw %x\n", + mgaw, sagaw); return (EINVAL); } @@ -150,18 +150,18 @@ { int i; - for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + for (i = 0; i < nitems(sagaw_bits); i++) { if ((1ULL << sagaw_bits[i].agaw) >= maxaddr && (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0) break; } - if (allow_less && i == SIZEOF_SAGAW_BITS) { + if (allow_less && i == nitems(sagaw_bits)) { do { i--; } while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) == 0); } - if (i < SIZEOF_SAGAW_BITS) + if (i < nitems(sagaw_bits)) return (sagaw_bits[i].agaw); KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d", (uintmax_t) maxaddr, allow_less)); @@ -190,7 +190,7 @@ * the context ctx. 
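The private SIZEOF_SAGAW_BITS macro removed above is replaced by nitems(), the element-count idiom provided by <sys/param.h> in the FreeBSD kernel. For reference, the equivalent userland sketch (the array here is just an example):

#include <stdio.h>

#define nitems(x)	(sizeof(x) / sizeof((x)[0]))

int
main(void)
{
	static const int agaw_values[] = { 30, 39, 48, 57, 64 };

	printf("%zu entries\n", nitems(agaw_values));	/* prints "5 entries" */
	return (0);
}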
*/ int -ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl) +domain_is_sp_lvl(struct dmar_domain *domain, int lvl) { int alvl, cap_sps; static const int sagaw_sp[] = { @@ -200,10 +200,9 @@ DMAR_CAP_SPS_1T }; - alvl = ctx->pglvl - lvl - 1; - cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap); - return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) && - (sagaw_sp[alvl] & cap_sps) != 0); + alvl = domain->pglvl - lvl - 1; + cap_sps = DMAR_CAP_SPS(domain->dmar->hw_cap); + return (alvl < nitems(sagaw_sp) && (sagaw_sp[alvl] & cap_sps) != 0); } dmar_gaddr_t @@ -222,16 +221,15 @@ KASSERT(lvl >= 0 && lvl < total_pglvl, ("total %d lvl %d", total_pglvl, lvl)); rlvl = total_pglvl - lvl - 1; - KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]), - ("sizeof pg_sz lvl %d", lvl)); + KASSERT(rlvl < nitems(pg_sz), ("sizeof pg_sz lvl %d", lvl)); return (pg_sz[rlvl]); } dmar_gaddr_t -ctx_page_size(struct dmar_ctx *ctx, int lvl) +domain_page_size(struct dmar_domain *domain, int lvl) { - return (pglvl_page_size(ctx->pglvl, lvl)); + return (pglvl_page_size(domain->pglvl, lvl)); } int @@ -260,9 +258,12 @@ dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags) { vm_page_t m; - int zeroed; + int zeroed, aflags; zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0; + aflags = zeroed | VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | + ((flags & DMAR_PGF_WAITOK) != 0 ? VM_ALLOC_WAITFAIL : + VM_ALLOC_NOWAIT); for (;;) { if ((flags & DMAR_PGF_OBJL) == 0) VM_OBJECT_WLOCK(obj); @@ -272,8 +273,7 @@ VM_OBJECT_WUNLOCK(obj); break; } - m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY | - VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0, + m = vm_page_alloc_contig(obj, idx, aflags, 1, 0, dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((flags & DMAR_PGF_OBJL) == 0) VM_OBJECT_WUNLOCK(obj); @@ -285,11 +285,6 @@ } if ((flags & DMAR_PGF_WAITOK) == 0) break; - if ((flags & DMAR_PGF_OBJL) != 0) - VM_OBJECT_WUNLOCK(obj); - VM_WAIT; - if ((flags & DMAR_PGF_OBJL) != 0) - VM_OBJECT_WLOCK(obj); } return (m); } @@ -405,6 +400,7 @@ dmar_load_root_entry_ptr(struct dmar_unit *unit) { vm_page_t root_entry; + int error; /* * Access to the GCMD register must be serialized while the @@ -417,10 +413,9 @@ VM_OBJECT_RUNLOCK(unit->ctx_obj); dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry)); dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) + != 0)); + return (error); } /* @@ -430,6 +425,7 @@ int dmar_inv_ctx_glob(struct dmar_unit *unit) { + int error; /* * Access to the CCMD register must be serialized while the @@ -445,10 +441,9 @@ * writes the upper dword last. */ dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) + == 0)); + return (error); } /* @@ -457,7 +452,7 @@ int dmar_inv_iotlb_glob(struct dmar_unit *unit) { - int reg; + int error, reg; DMAR_ASSERT_LOCKED(unit); KASSERT(!unit->qi_enabled, ("QI enabled")); @@ -466,11 +461,9 @@ /* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. 
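The open-coded cpu_spinwait() loops above, each tagged "XXXKIB should have a timeout", are replaced by DMAR_WAIT_UNTIL(), which leaves 0 or an error in the local error variable. The macro itself is defined outside this hunk (in intel_dmar.h) and uses the dmar_hw_timeout value introduced further below; the userland macro here is only a hypothetical model of a bounded poll of that shape, not the committed implementation.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical bounded poll; the in-kernel DMAR_WAIT_UNTIL() may differ. */
#define WAIT_UNTIL(cond, timeout_ns, error) do {			\
	struct timespec _ts;						\
	uint64_t _now, _deadline;					\
									\
	clock_gettime(CLOCK_MONOTONIC, &_ts);				\
	_deadline = (uint64_t)_ts.tv_sec * 1000000000ULL +		\
	    _ts.tv_nsec + (timeout_ns);					\
	(error) = 0;							\
	while (!(cond)) {						\
		clock_gettime(CLOCK_MONOTONIC, &_ts);			\
		_now = (uint64_t)_ts.tv_sec * 1000000000ULL + _ts.tv_nsec; \
		if (_now >= _deadline) {				\
			(error) = ETIMEDOUT;				\
			break;						\
		}							\
	}								\
} while (0)

int
main(void)
{
	volatile int ready = 0;		/* stands in for a status register bit */
	int error;

	WAIT_UNTIL(ready != 0, 1000000 /* 1 ms */, error);
	printf("error = %d%s\n", error,
	    error == ETIMEDOUT ? " (timed out, as expected)" : "");
	return (0);
}

The practical effect of the change is that a wedged register poll now fails with ETIMEDOUT instead of spinning forever.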
*/ dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT | DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) & - DMAR_IOTLB_IVT32) != 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) & + DMAR_IOTLB_IVT32) == 0)); + return (error); } /* @@ -480,6 +473,7 @@ int dmar_flush_write_bufs(struct dmar_unit *unit) { + int error; DMAR_ASSERT_LOCKED(unit); @@ -490,38 +484,86 @@ ("dmar%d: no RWBF", unit->unit)); dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) + != 0)); + return (error); } int dmar_enable_translation(struct dmar_unit *unit) { + int error; DMAR_ASSERT_LOCKED(unit); unit->hw_gcmd |= DMAR_GCMD_TE; dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) + != 0)); + return (error); } int dmar_disable_translation(struct dmar_unit *unit) { + int error; DMAR_ASSERT_LOCKED(unit); unit->hw_gcmd &= ~DMAR_GCMD_TE; dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); - /* XXXKIB should have a timeout */ - while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0) - cpu_spinwait(); - return (0); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) + == 0)); + return (error); } +int +dmar_load_irt_ptr(struct dmar_unit *unit) +{ + uint64_t irta, s; + int error; + + DMAR_ASSERT_LOCKED(unit); + irta = unit->irt_phys; + if (DMAR_X2APIC(unit)) + irta |= DMAR_IRTA_EIME; + s = fls(unit->irte_cnt) - 2; + KASSERT(unit->irte_cnt >= 2 && s <= DMAR_IRTA_S_MASK && + powerof2(unit->irte_cnt), + ("IRTA_REG_S overflow %x", unit->irte_cnt)); + irta |= s; + dmar_write8(unit, DMAR_IRTA_REG, irta); + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SIRTP); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRTPS) + != 0)); + return (error); +} + +int +dmar_enable_ir(struct dmar_unit *unit) +{ + int error; + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd |= DMAR_GCMD_IRE; + unit->hw_gcmd &= ~DMAR_GCMD_CFI; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES) + != 0)); + return (error); +} + +int +dmar_disable_ir(struct dmar_unit *unit) +{ + int error; + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd &= ~DMAR_GCMD_IRE; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES) + == 0)); + return (error); +} + #define BARRIER_F \ u_int f_done, f_inproc, f_wakeup; \ \ @@ -573,18 +615,62 @@ } int dmar_match_verbose; +int dmar_batch_coalesce = 100; +struct timespec dmar_hw_timeout = { + .tv_sec = 0, + .tv_nsec = 1000000 +}; -static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, - ""); -SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN, +static const uint64_t d = 1000000000; + +void +dmar_update_timeout(uint64_t newval) +{ + + /* XXXKIB not atomic */ + dmar_hw_timeout.tv_sec = newval / d; + dmar_hw_timeout.tv_nsec = newval % d; +} + +uint64_t +dmar_get_timeout(void) +{ + + return ((uint64_t)dmar_hw_timeout.tv_sec * d + + dmar_hw_timeout.tv_nsec); +} + +static int 
+dmar_timeout_sysctl(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int error; + + val = dmar_get_timeout(); + error = sysctl_handle_long(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + dmar_update_timeout(val); + return (error); +} + +static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, ""); +SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD, &dmar_tbl_pagecnt, 0, "Count of pages used for DMAR pagetables"); -SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN, +SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN, &dmar_match_verbose, 0, "Verbose matching of the PCI devices to DMAR paths"); +SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RWTUN, + &dmar_batch_coalesce, 0, + "Number of qi batches between interrupt"); +SYSCTL_PROC(_hw_dmar, OID_AUTO, timeout, + CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + dmar_timeout_sysctl, "QU", + "Timeout for command wait, in nanoseconds"); #ifdef INVARIANTS int dmar_check_free; -SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN, +SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN, &dmar_check_free, 0, "Check the GPA RBtree for free_down and free_after validity"); #endif Added: trunk/sys/x86/iommu/iommu_intrmap.h =================================================================== --- trunk/sys/x86/iommu/iommu_intrmap.h (rev 0) +++ trunk/sys/x86/iommu/iommu_intrmap.h 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,44 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib at FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
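The SYSCTL_PROC above exposes the command-wait timeout as hw.dmar.timeout, in nanoseconds. A small userland sketch of reading and updating it through sysctlbyname(3) follows; setting it requires root and is equivalent to running sysctl hw.dmar.timeout=2000000.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cur, want = 2000000;	/* 2 ms */
	size_t len = sizeof(cur);

	if (sysctlbyname("hw.dmar.timeout", &cur, &len, NULL, 0) == -1) {
		perror("sysctlbyname(hw.dmar.timeout)");
		return (1);
	}
	printf("current DMAR command timeout: %ju ns\n", (uintmax_t)cur);

	/* Write the new value back; requires root privileges. */
	if (sysctlbyname("hw.dmar.timeout", NULL, NULL, &want,
	    sizeof(want)) == -1)
		perror("sysctlbyname(set)");
	return (0);
}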
+ * + * $FreeBSD: stable/11/sys/x86/iommu/iommu_intrmap.h 280260 2015-03-19 13:57:47Z kib $ + */ + +#ifndef __X86_IOMMU_IOMMU_INTRMAP_H +#define __X86_IOMMU_IOMMU_INTRMAP_H + +int iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count); +int iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie, + uint64_t *addr, uint32_t *data); +int iommu_unmap_msi_intr(device_t src, u_int cookie); +int iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge, + bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo); +int iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie); + +#endif Property changes on: trunk/sys/x86/iommu/iommu_intrmap.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/isa/atpic.c =================================================================== --- trunk/sys/x86/isa/atpic.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/atpic.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -30,10 +30,11 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atpic.c 262192 2014-02-18 20:27:17Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atpic.c 340016 2018-11-01 18:34:26Z jhb $"); #include "opt_auto_eoi.h" #include "opt_isa.h" +#include "opt_mca.h" #include <sys/param.h> #include <sys/systm.h> @@ -55,9 +56,12 @@ #ifdef PC98 #include <pc98/cbus/cbus.h> #else -#include <x86/isa/isa.h> +#include <isa/isareg.h> #endif #include <isa/isavar.h> +#ifdef DEV_MCA +#include <i386/bios/mca_machdep.h> +#endif #ifdef __amd64__ #define SDT_ATPIC SDT_SYSIGT @@ -70,12 +74,12 @@ #define MASTER 0 #define SLAVE 1 +#define IMEN_MASK(ai) (IRQ_MASK((ai)->at_irq)) + #define NUM_ISA_IRQS 16 static void atpic_init(void *dummy); -unsigned int imen; /* XXX */ - inthand_t IDTVEC(atpic_intr0), IDTVEC(atpic_intr1), IDTVEC(atpic_intr2), IDTVEC(atpic_intr3), IDTVEC(atpic_intr4), IDTVEC(atpic_intr5), @@ -83,19 +87,42 @@ IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11), IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14), IDTVEC(atpic_intr15); +/* XXXKIB i386 uses stubs until pti comes */ +inthand_t + IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti), + IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti), + IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti), + IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti), + IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti), + IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti), + IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti), + IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti); #define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq) -#define ATPIC(io, base, eoi, imenptr) \ - { { atpic_enable_source, atpic_disable_source, (eoi), \ - atpic_enable_intr, atpic_disable_intr, atpic_vector, \ - atpic_source_pending, NULL, atpic_resume, atpic_config_intr,\ - atpic_assign_cpu }, (io), (base), IDT_IO_INTS + (base), \ - (imenptr) } +#define ATPIC(io, base, eoi) { \ + .at_pic = { \ + .pic_register_sources = atpic_register_sources, \ + .pic_enable_source = atpic_enable_source, \ + .pic_disable_source = atpic_disable_source, \ + .pic_eoi_source = (eoi), \ + .pic_enable_intr = atpic_enable_intr, \ + .pic_disable_intr = atpic_disable_intr, \ + .pic_vector = atpic_vector, \ + .pic_source_pending = atpic_source_pending, \ + 
.pic_resume = atpic_resume, \ + .pic_config_intr = atpic_config_intr, \ + .pic_assign_cpu = atpic_assign_cpu \ + }, \ + .at_ioaddr = (io), \ + .at_irqbase = (base), \ + .at_intbase = IDT_IO_INTS + (base), \ + .at_imen = 0xff, \ + } #define INTSRC(irq) \ { { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \ - (irq) % 8 } + IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 } struct atpic { struct pic at_pic; @@ -102,12 +129,12 @@ int at_ioaddr; int at_irqbase; uint8_t at_intbase; - uint8_t *at_imen; + uint8_t at_imen; }; struct atpic_intsrc { struct intsrc at_intsrc; - inthand_t *at_intr; + inthand_t *at_intr, *at_intr_pti; int at_irq; /* Relative to PIC base. */ enum intr_trigger at_trigger; u_long at_count; @@ -114,6 +141,7 @@ u_long at_straycount; }; +static void atpic_register_sources(struct pic *pic); static void atpic_enable_source(struct intsrc *isrc); static void atpic_disable_source(struct intsrc *isrc, int eoi); static void atpic_eoi_master(struct intsrc *isrc); @@ -129,8 +157,8 @@ static void i8259_init(struct atpic *pic, int slave); static struct atpic atpics[] = { - ATPIC(IO_ICU1, 0, atpic_eoi_master, (uint8_t *)&imen), - ATPIC(IO_ICU2, 8, atpic_eoi_slave, ((uint8_t *)&imen) + 1) + ATPIC(IO_ICU1, 0, atpic_eoi_master), + ATPIC(IO_ICU2, 8, atpic_eoi_slave) }; static struct atpic_intsrc atintrs[] = { @@ -152,7 +180,7 @@ INTSRC(15), }; -CTASSERT(sizeof(atintrs) / sizeof(atintrs[0]) == NUM_ISA_IRQS); +CTASSERT(nitems(atintrs) == NUM_ISA_IRQS); static __inline void _atpic_eoi_master(struct intsrc *isrc) @@ -184,6 +212,42 @@ } static void +atpic_register_sources(struct pic *pic) +{ + struct atpic *ap = (struct atpic *)pic; + struct atpic_intsrc *ai; + int i; + + /* + * If any of the ISA IRQs have an interrupt source already, then + * assume that the I/O APICs are being used and don't register any + * of our interrupt sources. This makes sure we don't accidentally + * use mixed mode. The "accidental" use could otherwise occur on + * machines that route the ACPI SCI interrupt to a different ISA + * IRQ (at least one machine routes it to IRQ 13) thus disabling + * that APIC ISA routing and allowing the ATPIC source for that IRQ + * to leak through. We used to depend on this feature for routing + * IRQ0 via mixed mode, but now we don't use mixed mode at all. + * + * To avoid the slave not register sources after the master + * registers its sources, register all IRQs when this function is + * called on the master. + */ + if (ap != &atpics[MASTER]) + return; + for (i = 0; i < NUM_ISA_IRQS; i++) + if (intr_lookup_source(i) != NULL) + return; + + /* Loop through all interrupt sources and add them. 
*/ + for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) { + if (i == ICU_SLAVEID) + continue; + intr_register_source(&ai->at_intsrc); + } +} + +static void atpic_enable_source(struct intsrc *isrc) { struct atpic_intsrc *ai = (struct atpic_intsrc *)isrc; @@ -190,9 +254,9 @@ struct atpic *ap = (struct atpic *)isrc->is_pic; spinlock_enter(); - if (*ap->at_imen & IMEN_MASK(ai)) { - *ap->at_imen &= ~IMEN_MASK(ai); - outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen); + if (ap->at_imen & IMEN_MASK(ai)) { + ap->at_imen &= ~IMEN_MASK(ai); + outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen); } spinlock_exit(); } @@ -205,8 +269,8 @@ spinlock_enter(); if (ai->at_trigger != INTR_TRIGGER_EDGE) { - *ap->at_imen |= IMEN_MASK(ai); - outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen); + ap->at_imen |= IMEN_MASK(ai); + outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen); } /* @@ -400,7 +464,7 @@ outb(imr_addr, MASTER_MODE); /* Set interrupt enable mask. */ - outb(imr_addr, *pic->at_imen); + outb(imr_addr, pic->at_imen); /* Reset is finished, default to IRR on read. */ outb(pic->at_ioaddr, OCW3_SEL | OCW3_RR); @@ -420,7 +484,6 @@ int i; /* Start off with all interrupts disabled. */ - imen = 0xffff; i8259_init(&atpics[MASTER], 0); i8259_init(&atpics[SLAVE], 1); atpic_enable_source((struct intsrc *)&atintrs[ICU_SLAVEID]); @@ -432,7 +495,8 @@ ai->at_intsrc.is_count = &ai->at_count; ai->at_intsrc.is_straycount = &ai->at_straycount; setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase + - ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC); + ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC, + SEL_KPL, GSEL_ATPIC); } #ifdef DEV_MCA @@ -492,8 +556,6 @@ static void atpic_init(void *dummy __unused) { - struct atpic_intsrc *ai; - int i; /* * Register our PICs, even if we aren't going to use any of their @@ -503,29 +565,10 @@ intr_register_pic(&atpics[1].at_pic) != 0) panic("Unable to register ATPICs"); - /* - * If any of the ISA IRQs have an interrupt source already, then - * assume that the APICs are being used and don't register any - * of our interrupt sources. This makes sure we don't accidentally - * use mixed mode. The "accidental" use could otherwise occur on - * machines that route the ACPI SCI interrupt to a different ISA - * IRQ (at least one machines routes it to IRQ 13) thus disabling - * that APIC ISA routing and allowing the ATPIC source for that IRQ - * to leak through. We used to depend on this feature for routing - * IRQ0 via mixed mode, but now we don't use mixed mode at all. - */ - for (i = 0; i < NUM_ISA_IRQS; i++) - if (intr_lookup_source(i) != NULL) - return; - - /* Loop through all interrupt sources and add them. */ - for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) { - if (i == ICU_SLAVEID) - continue; - intr_register_source(&ai->at_intsrc); - } + if (num_io_irqs == 0) + num_io_irqs = NUM_ISA_IRQS; } -SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_SECOND + 1, atpic_init, NULL); +SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_FOURTH, atpic_init, NULL); void atpic_handle_intr(u_int vector, struct trapframe *frame) Modified: trunk/sys/x86/isa/atrtc.c =================================================================== --- trunk/sys/x86/isa/atrtc.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/atrtc.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -25,12 +25,13 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
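The mask handling above replaces the old shared imen word with a per-PIC uint8_t at_imen, initialized to 0xff (everything masked) in the ATPIC() initializer. The standalone sketch below shows the same bit manipulation in isolation, with printf() standing in for the outb() to the IMR; it is an illustration, not the driver code.

#include <stdint.h>
#include <stdio.h>

struct pic_state {
	uint8_t imen;		/* interrupt mask register shadow, 1 = masked */
};

static void
source_enable(struct pic_state *p, int irq)
{
	if (p->imen & (1 << irq)) {
		p->imen &= ~(1 << irq);
		printf("write IMR <- 0x%02x\n", p->imen);
	}
}

static void
source_disable(struct pic_state *p, int irq)
{
	p->imen |= (1 << irq);
	printf("write IMR <- 0x%02x\n", p->imen);
}

int
main(void)
{
	struct pic_state master = { .imen = 0xff };

	source_enable(&master, 2);	/* unmask the slave cascade input */
	source_enable(&master, 1);
	source_disable(&master, 1);
	return (0);
}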
* - * $FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $ + * $FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $"); +#include "opt_acpi.h" #include "opt_isa.h" #include <sys/param.h> @@ -53,10 +54,24 @@ #endif #include <machine/intr_machdep.h> #include "clock_if.h" +#ifdef DEV_ACPI +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <machine/md_var.h> +#endif -#define RTC_LOCK do { if (!kdb_active) mtx_lock_spin(&clock_lock); } while (0) -#define RTC_UNLOCK do { if (!kdb_active) mtx_unlock_spin(&clock_lock); } while (0) +/* + * atrtc_lock protects low-level access to individual hardware registers. + * atrtc_time_lock protects the entire sequence of accessing multiple registers + * to read or write the date and time. + */ +static struct mtx atrtc_lock; +MTX_SYSINIT(atrtc_lock_init, &atrtc_lock, "atrtc", MTX_SPIN); +struct mtx atrtc_time_lock; +MTX_SYSINIT(atrtc_time_lock_init, &atrtc_time_lock, "atrtc_time", MTX_DEF); + int atrtcclock_disable = 0; static int rtc_reg = -1; @@ -63,16 +78,19 @@ static u_char rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF; static u_char rtc_statusb = RTCSB_24HR; +#ifdef DEV_ACPI +#define _COMPONENT ACPI_TIMER +ACPI_MODULE_NAME("ATRTC") +#endif + /* * RTC support routines */ -int -rtcin(int reg) +static inline u_char +rtcin_locked(int reg) { - u_char val; - RTC_LOCK; if (rtc_reg != reg) { inb(0x84); outb(IO_RTC, reg); @@ -79,16 +97,13 @@ rtc_reg = reg; inb(0x84); } - val = inb(IO_RTC + 1); - RTC_UNLOCK; - return (val); + return (inb(IO_RTC + 1)); } -void -writertc(int reg, u_char val) +static inline void +rtcout_locked(int reg, u_char val) { - RTC_LOCK; if (rtc_reg != reg) { inb(0x84); outb(IO_RTC, reg); @@ -97,21 +112,36 @@ } outb(IO_RTC + 1, val); inb(0x84); - RTC_UNLOCK; } -static __inline int -readrtc(int port) +int +rtcin(int reg) { - return(bcd2bin(rtcin(port))); + u_char val; + + mtx_lock_spin(&atrtc_lock); + val = rtcin_locked(reg); + mtx_unlock_spin(&atrtc_lock); + return (val); } +void +writertc(int reg, u_char val) +{ + + mtx_lock_spin(&atrtc_lock); + rtcout_locked(reg, val); + mtx_unlock_spin(&atrtc_lock); +} + static void atrtc_start(void) { - writertc(RTC_STATUSA, rtc_statusa); - writertc(RTC_STATUSB, RTCSB_24HR); + mtx_lock_spin(&atrtc_lock); + rtcout_locked(RTC_STATUSA, rtc_statusa); + rtcout_locked(RTC_STATUSB, RTCSB_24HR); + mtx_unlock_spin(&atrtc_lock); } static void @@ -127,8 +157,10 @@ { rtc_statusb |= RTCSB_PINTR; - writertc(RTC_STATUSB, rtc_statusb); - rtcin(RTC_INTR); + mtx_lock_spin(&atrtc_lock); + rtcout_locked(RTC_STATUSB, rtc_statusb); + rtcin_locked(RTC_INTR); + mtx_unlock_spin(&atrtc_lock); } static void @@ -136,8 +168,10 @@ { rtc_statusb &= ~RTCSB_PINTR; - writertc(RTC_STATUSB, rtc_statusb); - rtcin(RTC_INTR); + mtx_lock_spin(&atrtc_lock); + rtcout_locked(RTC_STATUSB, rtc_statusb); + rtcin_locked(RTC_INTR); + mtx_unlock_spin(&atrtc_lock); } void @@ -145,11 +179,13 @@ { /* Restore all of the RTC's "status" (actually, control) registers. 
*/ - rtcin(RTC_STATUSA); /* dummy to get rtc_reg set */ - writertc(RTC_STATUSB, RTCSB_24HR); - writertc(RTC_STATUSA, rtc_statusa); - writertc(RTC_STATUSB, rtc_statusb); - rtcin(RTC_INTR); + mtx_lock_spin(&atrtc_lock); + rtcin_locked(RTC_STATUSA); /* dummy to get rtc_reg set */ + rtcout_locked(RTC_STATUSB, RTCSB_24HR); + rtcout_locked(RTC_STATUSA, rtc_statusa); + rtcout_locked(RTC_STATUSB, rtc_statusb); + rtcin_locked(RTC_INTR); + mtx_unlock_spin(&atrtc_lock); } /********************************************************************** @@ -162,6 +198,9 @@ struct resource *intr_res; void *intr_handler; struct eventtimer et; +#ifdef DEV_ACPI + ACPI_HANDLE acpi_handle; +#endif }; static int @@ -216,7 +255,145 @@ return(flag ? FILTER_HANDLED : FILTER_STRAY); } +#ifdef DEV_ACPI /* + * ACPI RTC CMOS address space handler + */ +#define ATRTC_LAST_REG 0x40 + +static void +rtcin_region(int reg, void *buf, int len) +{ + u_char *ptr = buf; + + /* Drop lock after each IO as intr and settime have greater priority */ + while (len-- > 0) + *ptr++ = rtcin(reg++) & 0xff; +} + +static void +rtcout_region(int reg, const void *buf, int len) +{ + const u_char *ptr = buf; + + while (len-- > 0) + writertc(reg++, *ptr++); +} + +static bool +atrtc_check_cmos_access(bool is_read, ACPI_PHYSICAL_ADDRESS addr, UINT32 len) +{ + + /* Block address space wrapping on out-of-bound access */ + if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG) + return (false); + + if (is_read) { + /* Reading 0x0C will muck with interrupts */ + if (addr <= RTC_INTR && addr + len > RTC_INTR) + return (false); + } else { + /* + * Allow single-byte writes to alarm registers and + * multi-byte writes to addr >= 0x30, else deny. + */ + if (!((len == 1 && (addr == RTC_SECALRM || + addr == RTC_MINALRM || + addr == RTC_HRSALRM)) || + addr >= 0x30)) + return (false); + } + return (true); +} + +static ACPI_STATUS +atrtc_acpi_cmos_handler(UINT32 func, ACPI_PHYSICAL_ADDRESS addr, + UINT32 bitwidth, UINT64 *value, void *context, void *region_context) +{ + device_t dev = context; + UINT32 bytewidth = howmany(bitwidth, 8); + bool is_read = func == ACPI_READ; + + /* ACPICA is very verbose on CMOS handler failures, so we, too */ +#define CMOS_HANDLER_ERR(fmt, ...) \ + device_printf(dev, "ACPI [SystemCMOS] handler: " fmt, ##__VA_ARGS__) + + ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); + + if (value == NULL) { + CMOS_HANDLER_ERR("NULL parameter\n"); + return (AE_BAD_PARAMETER); + } + if (bitwidth == 0 || (bitwidth & 0x07) != 0) { + CMOS_HANDLER_ERR("Invalid bitwidth: %u\n", bitwidth); + return (AE_BAD_PARAMETER); + } + if (!atrtc_check_cmos_access(is_read, addr, bytewidth)) { + CMOS_HANDLER_ERR("%s access rejected: addr=%#04jx, len=%u\n", + is_read ? "Read" : "Write", (uintmax_t)addr, bytewidth); + return (AE_BAD_PARAMETER); + } + + switch (func) { + case ACPI_READ: + rtcin_region(addr, value, bytewidth); + break; + case ACPI_WRITE: + rtcout_region(addr, value, bytewidth); + break; + default: + CMOS_HANDLER_ERR("Invalid function: %u\n", func); + return (AE_BAD_PARAMETER); + } + + ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), + "ACPI RTC CMOS %s access: addr=%#04x, len=%u, val=%*D\n", + is_read ? "read" : "write", (unsigned)addr, bytewidth, + bytewidth, value, " "); + + return (AE_OK); +} + +static int +atrtc_reg_acpi_cmos_handler(device_t dev) +{ + struct atrtc_softc *sc = device_get_softc(dev); + + ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__); + + /* Don't handle address space events if driver is disabled. 
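The policy enforced by atrtc_check_cmos_access() above can be restated compactly: accesses must stay inside the CMOS bank, reads must not cover register C (which would clear pending interrupt flags), and writes are limited to single-byte alarm updates or the NVRAM area at 0x30 and above. The standalone restatement below adds a few example queries; register numbers follow the usual MC146818 layout, and this is an illustration rather than the driver code.

#include <stdbool.h>
#include <stdio.h>

#define RTC_SECALRM	0x01
#define RTC_MINALRM	0x03
#define RTC_HRSALRM	0x05
#define RTC_INTR	0x0c
#define ATRTC_LAST_REG	0x40

static bool
cmos_access_ok(bool is_read, unsigned int addr, unsigned int len)
{
	/* No wrapping past the end of the CMOS bank. */
	if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG)
		return (false);
	if (is_read)
		/* Reading register C would eat pending interrupt flags. */
		return (!(addr <= RTC_INTR && addr + len > RTC_INTR));
	/* Writes: single-byte alarm registers, or the NVRAM area >= 0x30. */
	return ((len == 1 && (addr == RTC_SECALRM || addr == RTC_MINALRM ||
	    addr == RTC_HRSALRM)) || addr >= 0x30);
}

int
main(void)
{
	printf("%d\n", cmos_access_ok(true, 0x00, 10));	/* 1: plain read */
	printf("%d\n", cmos_access_ok(true, 0x0a, 4));	/* 0: covers reg C */
	printf("%d\n", cmos_access_ok(false, RTC_HRSALRM, 1));	/* 1: alarm */
	printf("%d\n", cmos_access_ok(false, 0x10, 2));	/* 0: not NVRAM */
	printf("%d\n", cmos_access_ok(false, 0x32, 8));	/* 1: NVRAM area */
	return (0);
}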
*/ + if (acpi_disabled("atrtc")) + return (ENXIO); + + sc->acpi_handle = acpi_get_handle(dev); + if (sc->acpi_handle == NULL || + ACPI_FAILURE(AcpiInstallAddressSpaceHandler(sc->acpi_handle, + ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler, NULL, dev))) { + sc->acpi_handle = NULL; + device_printf(dev, + "Can't register ACPI CMOS address space handler\n"); + return (ENXIO); + } + + return (0); +} + +static int +atrtc_unreg_acpi_cmos_handler(device_t dev) +{ + struct atrtc_softc *sc = device_get_softc(dev); + + ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__); + + if (sc->acpi_handle != NULL) + AcpiRemoveAddressSpaceHandler(sc->acpi_handle, + ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler); + + return (0); +} +#endif /* DEV_ACPI */ + +/* * Attach to the ISA PnP descriptors for the timer and realtime clock. */ static struct isa_pnp_id atrtc_ids[] = { @@ -242,7 +419,7 @@ atrtc_attach(device_t dev) { struct atrtc_softc *sc; - u_long s; + rman_res_t s; int i; sc = device_get_softc(dev); @@ -288,6 +465,37 @@ } static int +atrtc_isa_attach(device_t dev) +{ + + return (atrtc_attach(dev)); +} + +#ifdef DEV_ACPI +static int +atrtc_acpi_attach(device_t dev) +{ + int ret; + + ret = atrtc_attach(dev); + if (ret) + return (ret); + + (void)atrtc_reg_acpi_cmos_handler(dev); + + return (0); +} + +static int +atrtc_acpi_detach(device_t dev) +{ + + (void)atrtc_unreg_acpi_cmos_handler(dev); + return (0); +} +#endif /* DEV_ACPI */ + +static int atrtc_resume(device_t dev) { @@ -298,28 +506,38 @@ static int atrtc_settime(device_t dev __unused, struct timespec *ts) { - struct clocktime ct; + struct bcd_clocktime bct; - clock_ts_to_ct(ts, &ct); + clock_ts_to_bcd(ts, &bct, false); + clock_dbgprint_bcd(dev, CLOCK_DBG_WRITE, &bct); - /* Disable RTC updates and interrupts. */ - writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); + mtx_lock(&atrtc_time_lock); + mtx_lock_spin(&atrtc_lock); - writertc(RTC_SEC, bin2bcd(ct.sec)); /* Write back Seconds */ - writertc(RTC_MIN, bin2bcd(ct.min)); /* Write back Minutes */ - writertc(RTC_HRS, bin2bcd(ct.hour)); /* Write back Hours */ + /* Disable RTC updates and interrupts. */ + rtcout_locked(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); - writertc(RTC_WDAY, ct.dow + 1); /* Write back Weekday */ - writertc(RTC_DAY, bin2bcd(ct.day)); /* Write back Day */ - writertc(RTC_MONTH, bin2bcd(ct.mon)); /* Write back Month */ - writertc(RTC_YEAR, bin2bcd(ct.year % 100)); /* Write back Year */ + /* Write all the time registers. */ + rtcout_locked(RTC_SEC, bct.sec); + rtcout_locked(RTC_MIN, bct.min); + rtcout_locked(RTC_HRS, bct.hour); + rtcout_locked(RTC_WDAY, bct.dow + 1); + rtcout_locked(RTC_DAY, bct.day); + rtcout_locked(RTC_MONTH, bct.mon); + rtcout_locked(RTC_YEAR, bct.year & 0xff); #ifdef USE_RTC_CENTURY - writertc(RTC_CENTURY, bin2bcd(ct.year / 100)); /* ... and Century */ + rtcout_locked(RTC_CENTURY, bct.year >> 8); #endif - /* Reenable RTC updates and interrupts. */ - writertc(RTC_STATUSB, rtc_statusb); - rtcin(RTC_INTR); + /* + * Re-enable RTC updates and interrupts. + */ + rtcout_locked(RTC_STATUSB, rtc_statusb); + rtcin_locked(RTC_INTR); + + mtx_unlock_spin(&atrtc_lock); + mtx_unlock(&atrtc_time_lock); + return (0); } @@ -326,7 +544,7 @@ static int atrtc_gettime(device_t dev, struct timespec *ts) { - struct clocktime ct; + struct bcd_clocktime bct; /* Look if we have a RTC present and the time is valid */ if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) { @@ -341,32 +559,32 @@ * to make sure that no more than 240us pass after we start reading, * and try again if so. 
*/ + mtx_lock(&atrtc_time_lock); while (rtcin(RTC_STATUSA) & RTCSA_TUP) continue; - critical_enter(); - ct.nsec = 0; - ct.sec = readrtc(RTC_SEC); - ct.min = readrtc(RTC_MIN); - ct.hour = readrtc(RTC_HRS); - ct.day = readrtc(RTC_DAY); - ct.dow = readrtc(RTC_WDAY) - 1; - ct.mon = readrtc(RTC_MONTH); - ct.year = readrtc(RTC_YEAR); + mtx_lock_spin(&atrtc_lock); + bct.sec = rtcin_locked(RTC_SEC); + bct.min = rtcin_locked(RTC_MIN); + bct.hour = rtcin_locked(RTC_HRS); + bct.day = rtcin_locked(RTC_DAY); + bct.mon = rtcin_locked(RTC_MONTH); + bct.year = rtcin_locked(RTC_YEAR); #ifdef USE_RTC_CENTURY - ct.year += readrtc(RTC_CENTURY) * 100; -#else - ct.year += (ct.year < 80 ? 2000 : 1900); + bct.year |= rtcin_locked(RTC_CENTURY) << 8; #endif - critical_exit(); - /* Set dow = -1 because some clocks don't set it correctly. */ - ct.dow = -1; - return (clock_ct_to_ts(&ct, ts)); + mtx_unlock_spin(&atrtc_lock); + mtx_unlock(&atrtc_time_lock); + /* dow is unused in timespec conversion and we have no nsec info. */ + bct.dow = 0; + bct.nsec = 0; + clock_dbgprint_bcd(dev, CLOCK_DBG_READ, &bct); + return (clock_bcd_to_ts(&bct, ts, false)); } -static device_method_t atrtc_methods[] = { +static device_method_t atrtc_isa_methods[] = { /* Device interface */ DEVMETHOD(device_probe, atrtc_probe), - DEVMETHOD(device_attach, atrtc_attach), + DEVMETHOD(device_attach, atrtc_isa_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), @@ -380,26 +598,38 @@ { 0, 0 } }; -static driver_t atrtc_driver = { +static driver_t atrtc_isa_driver = { "atrtc", - atrtc_methods, + atrtc_isa_methods, sizeof(struct atrtc_softc), }; -static devclass_t atrtc_devclass; +#ifdef DEV_ACPI +static device_method_t atrtc_acpi_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, atrtc_probe), + DEVMETHOD(device_attach, atrtc_acpi_attach), + DEVMETHOD(device_detach, atrtc_acpi_detach), + /* XXX stop statclock? */ + DEVMETHOD(device_resume, atrtc_resume), -DRIVER_MODULE(atrtc, isa, atrtc_driver, atrtc_devclass, 0, 0); -DRIVER_MODULE(atrtc, acpi, atrtc_driver, atrtc_devclass, 0, 0); + /* clock interface */ + DEVMETHOD(clock_gettime, atrtc_gettime), + DEVMETHOD(clock_settime, atrtc_settime), -#include "opt_ddb.h" -#ifdef DDB -#include <ddb/ddb.h> + { 0, 0 } +}; -DB_SHOW_COMMAND(rtc, rtc) -{ - printf("%02x/%02x/%02x %02x:%02x:%02x, A = %02x, B = %02x, C = %02x\n", - rtcin(RTC_YEAR), rtcin(RTC_MONTH), rtcin(RTC_DAY), - rtcin(RTC_HRS), rtcin(RTC_MIN), rtcin(RTC_SEC), - rtcin(RTC_STATUSA), rtcin(RTC_STATUSB), rtcin(RTC_INTR)); -} -#endif /* DDB */ +static driver_t atrtc_acpi_driver = { + "atrtc", + atrtc_acpi_methods, + sizeof(struct atrtc_softc), +}; +#endif /* DEV_ACPI */ + +static devclass_t atrtc_devclass; + +DRIVER_MODULE(atrtc, isa, atrtc_isa_driver, atrtc_devclass, 0, 0); +#ifdef DEV_ACPI +DRIVER_MODULE(atrtc, acpi, atrtc_acpi_driver, atrtc_devclass, 0, 0); +#endif Modified: trunk/sys/x86/isa/clock.c =================================================================== --- trunk/sys/x86/isa/clock.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/clock.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -35,7 +35,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/clock.c 254373 2013-08-15 17:21:06Z brooks $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/clock.c 331722 2018-03-29 02:50:57Z eadler $"); /* * Routines to handle clock hardware. 
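The settime/gettime paths above now move packed BCD values directly between the RTC registers and struct bcd_clocktime via clock_ts_to_bcd()/clock_bcd_to_ts(), instead of converting each field with bin2bcd()/bcd2bin() at the register level. For reference, the arithmetic behind those two conversions is simply the following; the trailing underscores avoid clashing with the libkern names, and the program is an illustration only.

#include <stdio.h>

static unsigned char
bin2bcd_(unsigned char v)	/* 0..99 -> packed BCD */
{
	return ((v / 10) << 4 | (v % 10));
}

static unsigned char
bcd2bin_(unsigned char v)	/* packed BCD -> 0..99 */
{
	return ((v >> 4) * 10 + (v & 0x0f));
}

int
main(void)
{

	/* Prints: 59 -> 0x59 -> 59 */
	printf("59 -> 0x%02x -> %u\n", bin2bcd_(59), bcd2bin_(bin2bcd_(59)));
	return (0);
}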
@@ -66,6 +66,7 @@ #include <machine/intr_machdep.h> #include <machine/ppireg.h> #include <machine/timerreg.h> +#include <x86/init.h> #ifdef PC98 #include <pc98/pc98/pc98_machdep.h> @@ -98,7 +99,7 @@ int i8254_max_count; static int i8254_timecounter = 1; -struct mtx clock_lock; +static struct mtx clock_lock; static struct intsrc *i8254_intsrc; static uint16_t i8254_lastcount; static uint16_t i8254_offset; @@ -140,6 +141,15 @@ static unsigned i8254_get_timecount(struct timecounter *tc); static void set_i8254_freq(int mode, uint32_t period); +void +clock_init(void) +{ + /* Init the clock lock */ + mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE); + /* Init the clock in order to use DELAY */ + init_ops.early_clock_source_init(); +} + static int clkintr(void *arg) { @@ -157,7 +167,7 @@ mtx_unlock_spin(&clock_lock); } - if (sc && sc->et.et_active && sc->mode != MODE_STOP) + if (sc->et.et_active && sc->mode != MODE_STOP) sc->et.et_event_cb(&sc->et, sc->et.et_arg); #ifdef DEV_MCA @@ -248,54 +258,6 @@ return ((high << 8) | low); } -#ifndef DELAYDEBUG -static u_int -get_tsc(__unused struct timecounter *tc) -{ - - return (rdtsc32()); -} - -static __inline int -delay_tc(int n) -{ - struct timecounter *tc; - timecounter_get_t *func; - uint64_t end, freq, now; - u_int last, mask, u; - - tc = timecounter; - freq = atomic_load_acq_64(&tsc_freq); - if (tsc_is_invariant && freq != 0) { - func = get_tsc; - mask = ~0u; - } else { - if (tc->tc_quality <= 0) - return (0); - func = tc->tc_get_timecount; - mask = tc->tc_counter_mask; - freq = tc->tc_frequency; - } - now = 0; - end = freq * n / 1000000; - if (func == get_tsc) - sched_pin(); - last = func(tc) & mask; - do { - cpu_spinwait(); - u = func(tc) & mask; - if (u < last) - now += mask - last + u + 1; - else - now += u - last; - last = u; - } while (now < end); - if (func == get_tsc) - sched_unpin(); - return (1); -} -#endif - /* * Wait "n" microseconds. * Relies on timer 1 counting down from (i8254_freq / hz) @@ -302,7 +264,7 @@ * Note: timer had better have been programmed before this is first used! 
*/ void -DELAY(int n) +i8254_delay(int n) { int delta, prev_tick, tick, ticks_left; #ifdef DELAYDEBUG @@ -318,9 +280,6 @@ } if (state == 1) printf("DELAY(%d)...", n); -#else - if (delay_tc(n)) - return; #endif /* * Read the counter first, so that the rest of the setup overhead is @@ -500,7 +459,6 @@ i8254_init(void) { - mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE); #ifdef PC98 if (pc98_machine_type & M_8M) i8254_freq = 1996800L; /* 1.9968 MHz */ @@ -518,8 +476,27 @@ void cpu_initclocks(void) { +#ifdef EARLY_AP_STARTUP + struct thread *td; + int i; + td = curthread; cpu_initclocks_bsp(); + CPU_FOREACH(i) { + if (i == 0) + continue; + thread_lock(td); + sched_bind(td, i); + thread_unlock(td); + cpu_initclocks_ap(); + } + thread_lock(td); + if (sched_is_bound(td)) + sched_unbind(td); + thread_unlock(td); +#else + cpu_initclocks_bsp(); +#endif } static int @@ -699,7 +676,7 @@ attimer_attach(device_t dev) { struct attimer_softc *sc; - u_long s; + rman_res_t s; int i; attimer_sc = sc = device_get_softc(dev); Modified: trunk/sys/x86/isa/elcr.c =================================================================== --- trunk/sys/x86/isa/elcr.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/elcr.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/elcr.c 262192 2014-02-18 20:27:17Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/elcr.c 261520 2014-02-05 18:13:27Z jhb $"); /* * The ELCR is a register that controls the trigger mode and polarity of Modified: trunk/sys/x86/isa/icu.h =================================================================== --- trunk/sys/x86/isa/icu.h 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/icu.h 2020-02-08 19:32:41 UTC (rev 12310) @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)icu.h 5.6 (Berkeley) 5/9/91 - * $FreeBSD: stable/10/sys/x86/isa/icu.h 233031 2012-03-16 12:13:44Z nyan $ + * $FreeBSD: stable/11/sys/x86/isa/icu.h 339928 2018-10-30 19:10:41Z jhb $ */ /* @@ -88,7 +88,6 @@ #endif #define IRQ_MASK(irq) (1 << (irq)) -#define IMEN_MASK(ai) (IRQ_MASK((ai)->at_irq)) void atpic_handle_intr(u_int vector, struct trapframe *frame); void atpic_startup(void); Modified: trunk/sys/x86/isa/isa.c =================================================================== --- trunk/sys/x86/isa/isa.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/isa.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa.c 221526 2011-05-06 13:48:53Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa.c 295832 2016-02-20 01:32:58Z jhibbits $"); /*- * Modifications for Intel architecture by Garrett A. Wollman. @@ -89,13 +89,13 @@ */ struct resource * isa_alloc_resource(device_t bus, device_t child, int type, int *rid, - u_long start, u_long end, u_long count, u_int flags) + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { /* * Consider adding a resource definition. */ int passthrough = (device_get_parent(child) != bus); - int isdefault = (start == 0UL && end == ~0UL); + int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); struct isa_device* idev = DEVTOISA(child); struct resource_list *rl = &idev->id_resources; struct resource_list_entry *rle; @@ -242,3 +242,8 @@ * On this platform, isa can also attach to the legacy bus. */ DRIVER_MODULE(isa, legacy, isa_driver, isa_devclass, 0, 0); + +/* + * Attach the ISA bus to the xenpv bus in order to get syscons. 
+ */ +DRIVER_MODULE(isa, xenpv, isa_driver, isa_devclass, 0, 0); Modified: trunk/sys/x86/isa/isa_dma.c =================================================================== --- trunk/sys/x86/isa/isa_dma.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/isa_dma.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa_dma.c 233675 2012-03-29 18:58:02Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa_dma.c 332304 2018-04-08 20:52:09Z emaste $"); /* * code to manage AT bus @@ -62,7 +62,7 @@ #include <isa/isavar.h> #include <isa/isa_dmareg.h> -#define ISARAM_END RAM_END +#define ISARAM_END 0x1000000 static int isa_dmarangecheck(caddr_t va, u_int length, int chan); @@ -145,8 +145,7 @@ * in open() or during its initialization. */ int -isa_dma_acquire(chan) - int chan; +isa_dma_acquire(int chan) { #ifdef DIAGNOSTIC if (chan & ~VALID_DMA_MASK) @@ -171,8 +170,7 @@ * during close() or during its shutdown. */ void -isa_dma_release(chan) - int chan; +isa_dma_release(int chan) { #ifdef DIAGNOSTIC if (chan & ~VALID_DMA_MASK) @@ -206,8 +204,7 @@ * external dma control by a board. */ void -isa_dmacascade(chan) - int chan; +isa_dmacascade(int chan) { #ifdef DIAGNOSTIC if (chan & ~VALID_DMA_MASK) Modified: trunk/sys/x86/isa/nmi.c =================================================================== --- trunk/sys/x86/isa/nmi.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/nmi.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/nmi.c 204309 2010-02-25 14:13:39Z attilio $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/nmi.c 331722 2018-03-29 02:50:57Z eadler $"); #include "opt_mca.h" Modified: trunk/sys/x86/isa/orm.c =================================================================== --- trunk/sys/x86/isa/orm.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/isa/orm.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/isa/orm.c 204309 2010-02-25 14:13:39Z attilio $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/isa/orm.c 299392 2016-05-10 22:28:06Z bz $"); /* * Driver to take care of holes in ISA I/O memory occupied @@ -59,7 +59,7 @@ { 0, NULL }, }; -#define MAX_ROMS 16 +#define MAX_ROMS 32 struct orm_softc { int rnum; @@ -92,6 +92,9 @@ struct orm_softc *sc; u_int8_t buf[3]; + if (resource_disabled("orm", 0)) + return; + child = BUS_ADD_CHILD(parent, ISA_ORDER_SENSITIVE, "orm", -1); device_set_driver(child, driver); isa_set_logicalid(child, ORM_ID); @@ -98,7 +101,7 @@ isa_set_vendorid(child, ORM_ID); sc = device_get_softc(child); sc->rnum = 0; - while (chunk < IOMEM_END) { + while (sc->rnum < MAX_ROMS && chunk < IOMEM_END) { bus_set_resource(child, SYS_RES_MEMORY, sc->rnum, chunk, IOMEM_STEP); rid = sc->rnum; Modified: trunk/sys/x86/pci/pci_bus.c =================================================================== --- trunk/sys/x86/pci/pci_bus.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/pci/pci_bus.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/pci/pci_bus.c 280970 2015-04-01 21:48:54Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/pci/pci_bus.c 294883 2016-01-27 02:23:54Z jhibbits $"); #include "opt_cpu.h" @@ -525,7 +525,7 @@ device_probe_and_attach(pir); } #endif - device_add_child(dev, "pci", bus); + device_add_child(dev, "pci", -1); return bus_generic_attach(dev); } @@ -576,12 +576,11 @@ 
SYSCTL_DECL(_hw_pci); static unsigned long host_mem_start = 0x80000000; -TUNABLE_ULONG("hw.pci.host_mem_start", &host_mem_start); SYSCTL_ULONG(_hw_pci, OID_AUTO, host_mem_start, CTLFLAG_RDTUN, &host_mem_start, 0, "Limit the host bridge memory to being above this address."); -u_long -hostb_alloc_start(int type, u_long start, u_long end, u_long count) +rman_res_t +hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count) { if (start + count - 1 != end) { @@ -595,7 +594,7 @@ struct resource * legacy_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, - u_long start, u_long end, u_long count, u_int flags) + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { #if defined(NEW_PCIB) && defined(PCI_RES_BUS) @@ -611,7 +610,7 @@ #if defined(NEW_PCIB) && defined(PCI_RES_BUS) int legacy_pcib_adjust_resource(device_t dev, device_t child, int type, - struct resource *r, u_long start, u_long end) + struct resource *r, rman_res_t start, rman_res_t end) { if (type == PCI_RES_BUS) Modified: trunk/sys/x86/pci/qpi.c =================================================================== --- trunk/sys/x86/pci/qpi.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/pci/qpi.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -27,14 +27,14 @@ */ /* - * This driver provides a psuedo-bus to enumerate the PCI buses - * present on a sytem using a QPI chipset. It creates a qpi0 bus that - * is a child of nexus0 and then creates two Host-PCI bridges as a + * This driver provides a pseudo-bus to enumerate the PCI buses + * present on a system using a QPI chipset. It creates a qpi0 bus that + * is a child of nexus0 and then creates Host-PCI bridges as a * child of that. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/pci/qpi.c 283927 2015-06-02 19:20:39Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/pci/qpi.c 323609 2017-09-15 09:03:01Z kib $"); #include <sys/param.h> #include <sys/bus.h> @@ -64,17 +64,23 @@ static void qpi_identify(driver_t *driver, device_t parent) { + int do_qpi; - /* Check CPUID to ensure this is an i7 CPU of some sort. */ - if (!(cpu_vendor_id == CPU_VENDOR_INTEL && - CPUID_TO_FAMILY(cpu_id) == 0x6 && - (CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2c))) - return; + /* Check CPUID to ensure this is an i7 CPU of some sort. */ + if (cpu_vendor_id != CPU_VENDOR_INTEL || + CPUID_TO_FAMILY(cpu_id) != 0x6) + return; - /* PCI config register access is required. */ - if (pci_cfgregopen() == 0) - return; + /* Only discover buses with configuration devices if allowed by user */ + do_qpi = 0; + TUNABLE_INT_FETCH("hw.attach_intel_csr_pci", &do_qpi); + if (!do_qpi) + return; + /* PCI config register access is required. */ + if (pci_cfgregopen() == 0) + return; + /* Add a qpi bus device. */ if (BUS_ADD_CHILD(parent, 20, "qpi", -1) == NULL) panic("Failed to add qpi bus"); @@ -98,6 +104,7 @@ struct qpi_device *qdev; device_t child; uint32_t devid; + int s; /* * If a PCI bus already exists for this bus number, then @@ -107,18 +114,23 @@ return (EEXIST); /* - * Attempt to read the device id for device 0, function 0 on - * the bus. A value of 0xffffffff means that the bus is not - * present. + * Attempt to read the device id for every slot, function 0 on + * the bus. If all read values are 0xffffffff this means that + * the bus is not present. 
*/ - devid = pci_cfgregread(bus, 0, 0, PCIR_DEVVENDOR, 4); + for (s = 0; s <= PCI_SLOTMAX; s++) { + devid = pci_cfgregread(bus, s, 0, PCIR_DEVVENDOR, 4); + if (devid != 0xffffffff) + break; + } if (devid == 0xffffffff) return (ENOENT); if ((devid & 0xffff) != 0x8086) { - device_printf(dev, - "Device at pci%d.0.0 has non-Intel vendor 0x%x\n", bus, - devid & 0xffff); + if (bootverbose) + device_printf(dev, + "Device at pci%d.%d.0 has non-Intel vendor 0x%x\n", + bus, s, devid & 0xffff); return (ENXIO); } @@ -138,12 +150,12 @@ int bus; /* - * Each processor socket has a dedicated PCI bus counting down from - * 255. We keep probing buses until one fails. + * Each processor socket has a dedicated PCI bus, sometimes + * not enumerated by ACPI. Probe all unattached buses from 0 + * to 255. */ - for (bus = 255;; bus--) - if (qpi_probe_pcib(dev, bus) != 0) - break; + for (bus = PCI_BUSMAX; bus >= 0; bus--) + qpi_probe_pcib(dev, bus); return (bus_generic_attach(dev)); } @@ -219,8 +231,8 @@ qpi_pcib_attach(device_t dev) { - device_add_child(dev, "pci", pcib_get_bus(dev)); - return (bus_generic_attach(dev)); + device_add_child(dev, "pci", -1); + return (bus_generic_attach(dev)); } static int @@ -242,7 +254,7 @@ #if defined(NEW_PCIB) && defined(PCI_RES_BUS) static struct resource * qpi_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, - u_long start, u_long end, u_long count, u_int flags) + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { if (type == PCI_RES_BUS) Added: trunk/sys/x86/x86/autoconf.c =================================================================== --- trunk/sys/x86/x86/autoconf.c (rev 0) +++ trunk/sys/x86/x86/autoconf.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,162 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
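qpi_probe_pcib() above now declares a bus absent only after every slot's vendor/device register reads back as all-ones, rather than probing slot 0 alone. The sketch below isolates that test; fake_cfgread() is an invented stand-in for pci_cfgregread() and the program is an illustration, not driver code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PCI_SLOTMAX	31

static bool
bus_has_device(uint32_t (*cfgread)(int bus, int slot), int bus,
    uint32_t *devid)
{
	int s;

	for (s = 0; s <= PCI_SLOTMAX; s++) {
		*devid = cfgread(bus, s);
		if (*devid != 0xffffffff)
			return (true);
	}
	return (false);
}

static uint32_t
fake_cfgread(int bus, int slot)
{
	/* Pretend only bus 254, slot 8 answers, with an Intel ID. */
	return (bus == 254 && slot == 8 ? 0x34068086 : 0xffffffff);
}

int
main(void)
{
	uint32_t devid;

	if (bus_has_device(fake_cfgread, 254, &devid))
		printf("bus 254: devid 0x%08x, vendor 0x%04x\n",
		    devid, devid & 0xffff);
	if (!bus_has_device(fake_cfgread, 255, &devid))
		printf("bus 255: empty\n");
	return (0);
}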
+ * + * from: @(#)autoconf.c 7.1 (Berkeley) 5/9/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/autoconf.c 332304 2018-04-08 20:52:09Z emaste $"); + +/* + * Setup the system to run on the current machine. + * + * Configure() is called at boot time and initializes the vba + * device tables and the memory controller monitoring. Available + * devices are determined (from possibilities mentioned in ioconf.c), + * and the drivers are initialized. + */ +#include "opt_bootp.h" +#include "opt_isa.h" +#include "opt_bus.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/reboot.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/cons.h> + +#include <sys/socket.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/ethernet.h> +#include <netinet/in.h> + +#ifdef PC98 +#include <machine/bootinfo.h> +#endif +#include <machine/md_var.h> + +#ifdef DEV_ISA +#include <isa/isavar.h> + +device_t isa_bus_device = 0; +#endif + +static void configure_first(void *); +static void configure(void *); +static void configure_final(void *); + +SYSINIT(configure1, SI_SUB_CONFIGURE, SI_ORDER_FIRST, configure_first, NULL); +/* SI_ORDER_SECOND is hookable */ +SYSINIT(configure2, SI_SUB_CONFIGURE, SI_ORDER_THIRD, configure, NULL); +/* SI_ORDER_MIDDLE is hookable */ +SYSINIT(configure3, SI_SUB_CONFIGURE, SI_ORDER_ANY, configure_final, NULL); + +/* + * Determine i/o configuration for a machine. + */ +static void +configure_first(void *dummy) +{ + + /* nexus0 is the top of the x86 device tree */ + device_add_child(root_bus, "nexus", 0); +} + +static void +configure(void *dummy) +{ + + /* initialize new bus architecture */ + root_bus_configure(); + +#ifdef DEV_ISA + /* + * Explicitly probe and attach ISA last. The isa bus saves + * it's device node at attach time for us here. + */ + if (isa_bus_device) + isa_probe_children(isa_bus_device); +#endif +} + +static void +configure_final(void *dummy) +{ + + cninit_finish(); + + if (bootverbose) { +#ifdef PC98 + int i; + + /* + * Print out the BIOS's idea of the disk geometries. + */ + printf("BIOS Geometries:\n"); + for (i = 0; i < N_BIOS_GEOM; i++) { + unsigned long bios_geom; + int max_cylinder, max_head, max_sector; + + bios_geom = bootinfo.bi_bios_geom[i]; + + /* + * XXX the bootstrap punts a 1200K floppy geometry + * when the get-disk-geometry interrupt fails. Skip + * drives that have this geometry. 
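The packed PC98 geometry word decoded just below stores the maximum cylinder in the upper 16 bits, the maximum head in bits 8-15 and the maximum sector in the low byte. A minimal decoder, exercised on the 0x4f020f value the code skips:

#include <stdio.h>

/* Decode a PC98 BIOS geometry word: [31:16]=max cyl, [15:8]=max head, [7:0]=max sector. */
static void decode_geom(unsigned long g)
{
	int max_cylinder = g >> 16;
	int max_head = (g >> 8) & 0xff;
	int max_sector = g & 0xff;

	printf("%08lx -> 0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n",
	    g, max_cylinder, max_cylinder + 1, max_head, max_head + 1,
	    max_sector, max_sector);
}

int main(void)
{
	decode_geom(0x4f020ful);	/* the 1200K floppy geometry the code skips */
	return (0);
}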
+ */ + if (bios_geom == 0x4f020f) + continue; + + printf(" %x:%08lx ", i, bios_geom); + max_cylinder = bios_geom >> 16; + max_head = (bios_geom >> 8) & 0xff; + max_sector = bios_geom & 0xff; + printf( + "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n", + max_cylinder, max_cylinder + 1, + max_head, max_head + 1, + max_sector, max_sector); + } + printf(" %d accounted for\n", bootinfo.bi_n_bios_used); +#endif + + printf("Device configuration finished.\n"); + } + cold = 0; +} Property changes on: trunk/sys/x86/x86/autoconf.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/x86/bus_machdep.c =================================================================== --- trunk/sys/x86/x86/bus_machdep.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/bus_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/bus_machdep.c 287126 2015-08-25 14:39:40Z marcel $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/bus_machdep.c 286667 2015-08-12 15:26:32Z marcel $"); #include <sys/param.h> #include <sys/systm.h> Modified: trunk/sys/x86/x86/busdma_bounce.c =================================================================== --- trunk/sys/x86/x86/busdma_bounce.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/busdma_bounce.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_bounce.c 318977 2017-05-27 08:17:59Z hselasky $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_bounce.c 343361 2019-01-23 20:49:14Z kib $"); #include <sys/param.h> #include <sys/systm.h> @@ -80,7 +80,8 @@ vm_offset_t vaddr; /* kva of bounce buffer */ bus_addr_t busaddr; /* Physical address */ vm_offset_t datavaddr; /* kva of client data */ - bus_addr_t dataaddr; /* client physical address */ + vm_offset_t dataoffs; /* page offset of client data */ + vm_page_t datapage[2]; /* physical page(s) of client data */ bus_size_t datacount; /* client data count */ STAILQ_ENTRY(bounce_page) links; }; @@ -135,10 +136,9 @@ static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int commit); static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, - vm_offset_t vaddr, bus_addr_t addr, - bus_size_t size); + vm_offset_t vaddr, bus_addr_t addr1, + bus_addr_t addr2, bus_size_t size); static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage); -int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr); static void _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map, pmap_t pmap, void *buf, bus_size_t buflen, int flags); @@ -148,11 +148,6 @@ static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags); -#ifdef XEN -#undef pmap_kextract -#define pmap_kextract pmap_kextract_ma -#endif - /* * Allocate a device specific dma_tag. 
*/ @@ -494,7 +489,8 @@ while (buflen != 0) { sgsize = MIN(buflen, dmat->common.maxsegsz); if (bus_dma_run_filter(&dmat->common, curaddr)) { - sgsize = MIN(sgsize, PAGE_SIZE); + sgsize = MIN(sgsize, + PAGE_SIZE - (curaddr & PAGE_MASK)); map->pagesneeded++; } curaddr += sgsize; @@ -544,6 +540,51 @@ } } +static void +_bus_dmamap_count_ma(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma, + int ma_offs, bus_size_t buflen, int flags) +{ + bus_size_t sg_len, max_sgsize; + int page_index; + vm_paddr_t paddr; + + if ((map != &nobounce_dmamap && map->pagesneeded == 0)) { + CTR4(KTR_BUSDMA, "lowaddr= %d Maxmem= %d, boundary= %d, " + "alignment= %d", dmat->common.lowaddr, + ptoa((vm_paddr_t)Maxmem), + dmat->common.boundary, dmat->common.alignment); + CTR3(KTR_BUSDMA, "map= %p, nobouncemap= %p, pagesneeded= %d", + map, &nobounce_dmamap, map->pagesneeded); + + /* + * Count the number of bounce pages + * needed in order to complete this transfer + */ + page_index = 0; + while (buflen > 0) { + paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs; + sg_len = PAGE_SIZE - ma_offs; + max_sgsize = MIN(buflen, dmat->common.maxsegsz); + sg_len = MIN(sg_len, max_sgsize); + if (bus_dma_run_filter(&dmat->common, paddr) != 0) { + sg_len = roundup2(sg_len, + dmat->common.alignment); + sg_len = MIN(sg_len, max_sgsize); + KASSERT((sg_len & (dmat->common.alignment - 1)) + == 0, ("Segment size is not aligned")); + map->pagesneeded++; + } + if (((ma_offs + sg_len) & ~PAGE_MASK) != 0) + page_index++; + ma_offs = (ma_offs + sg_len) & PAGE_MASK; + KASSERT(buflen >= sg_len, + ("Segment length overruns original buffer")); + buflen -= sg_len; + } + CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded); + } +} + static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags) { @@ -648,8 +689,8 @@ if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) && map->pagesneeded != 0 && bus_dma_run_filter(&dmat->common, curaddr)) { - sgsize = MIN(sgsize, PAGE_SIZE); - curaddr = add_bounce_page(dmat, map, 0, curaddr, + sgsize = MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK)); + curaddr = add_bounce_page(dmat, map, 0, curaddr, 0, sgsize); } sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs, @@ -677,7 +718,7 @@ { bus_size_t sgsize, max_sgsize; bus_addr_t curaddr; - vm_offset_t vaddr; + vm_offset_t kvaddr, vaddr; int error; if (map == NULL) @@ -700,22 +741,25 @@ /* * Get the physical address for this segment. */ - if (pmap == kernel_pmap) + if (pmap == kernel_pmap) { curaddr = pmap_kextract(vaddr); - else + kvaddr = vaddr; + } else { curaddr = pmap_extract(pmap, vaddr); + kvaddr = 0; + } /* * Compute the segment size, and adjust counts. */ max_sgsize = MIN(buflen, dmat->common.maxsegsz); - sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK); + sgsize = PAGE_SIZE - (curaddr & PAGE_MASK); if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) && map->pagesneeded != 0 && bus_dma_run_filter(&dmat->common, curaddr)) { sgsize = roundup2(sgsize, dmat->common.alignment); sgsize = MIN(sgsize, max_sgsize); - curaddr = add_bounce_page(dmat, map, vaddr, curaddr, + curaddr = add_bounce_page(dmat, map, kvaddr, curaddr, 0, sgsize); } else { sgsize = MIN(sgsize, max_sgsize); @@ -734,6 +778,88 @@ return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? 
*/ } +static int +bounce_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map, + struct vm_page **ma, bus_size_t buflen, int ma_offs, int flags, + bus_dma_segment_t *segs, int *segp) +{ + vm_paddr_t paddr, next_paddr; + int error, page_index; + bus_size_t sgsize, max_sgsize; + + if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) { + /* + * If we have to keep the offset of each page this function + * is not suitable, switch back to bus_dmamap_load_ma_triv + * which is going to do the right thing in this case. + */ + error = bus_dmamap_load_ma_triv(dmat, map, ma, buflen, ma_offs, + flags, segs, segp); + return (error); + } + + if (map == NULL) + map = &nobounce_dmamap; + + if (segs == NULL) + segs = dmat->segments; + + if ((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) { + _bus_dmamap_count_ma(dmat, map, ma, ma_offs, buflen, flags); + if (map->pagesneeded != 0) { + error = _bus_dmamap_reserve_pages(dmat, map, flags); + if (error) + return (error); + } + } + + page_index = 0; + while (buflen > 0) { + /* + * Compute the segment size, and adjust counts. + */ + paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs; + max_sgsize = MIN(buflen, dmat->common.maxsegsz); + sgsize = PAGE_SIZE - ma_offs; + if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) && + map->pagesneeded != 0 && + bus_dma_run_filter(&dmat->common, paddr)) { + sgsize = roundup2(sgsize, dmat->common.alignment); + sgsize = MIN(sgsize, max_sgsize); + KASSERT((sgsize & (dmat->common.alignment - 1)) == 0, + ("Segment size is not aligned")); + /* + * Check if two pages of the user provided buffer + * are used. + */ + if ((ma_offs + sgsize) > PAGE_SIZE) + next_paddr = + VM_PAGE_TO_PHYS(ma[page_index + 1]); + else + next_paddr = 0; + paddr = add_bounce_page(dmat, map, 0, paddr, + next_paddr, sgsize); + } else { + sgsize = MIN(sgsize, max_sgsize); + } + sgsize = _bus_dmamap_addseg(dmat, map, paddr, sgsize, segs, + segp); + if (sgsize == 0) + break; + KASSERT(buflen >= sgsize, + ("Segment length overruns original buffer")); + buflen -= sgsize; + if (((ma_offs + sgsize) & ~PAGE_MASK) != 0) + page_index++; + ma_offs = (ma_offs + sgsize) & PAGE_MASK; + } + + /* + * Did we fit? + */ + return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */ +} + static void bounce_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map, struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg) @@ -779,6 +905,8 @@ bus_dmasync_op_t op) { struct bounce_page *bpage; + vm_offset_t datavaddr, tempvaddr; + bus_size_t datacount1, datacount2; if (map == NULL || (bpage = STAILQ_FIRST(&map->bpages)) == NULL) return; @@ -792,13 +920,40 @@ if ((op & BUS_DMASYNC_PREWRITE) != 0) { while (bpage != NULL) { - if (bpage->datavaddr != 0) { - bcopy((void *)bpage->datavaddr, - (void *)bpage->vaddr, bpage->datacount); - } else { - physcopyout(bpage->dataaddr, - (void *)bpage->vaddr, bpage->datacount); + tempvaddr = 0; + datavaddr = bpage->datavaddr; + datacount1 = bpage->datacount; + if (datavaddr == 0) { + tempvaddr = + pmap_quick_enter_page(bpage->datapage[0]); + datavaddr = tempvaddr | bpage->dataoffs; + datacount1 = min(PAGE_SIZE - bpage->dataoffs, + datacount1); } + + bcopy((void *)datavaddr, + (void *)bpage->vaddr, datacount1); + + if (tempvaddr != 0) + pmap_quick_remove_page(tempvaddr); + + if (bpage->datapage[1] == 0) { + KASSERT(datacount1 == bpage->datacount, + ("Mismatch between data size and provided memory space")); + goto next_w; + } + + /* + * We are dealing with an unmapped buffer that expands + * over two pages. 
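The copy split performed here when an unmapped client buffer crosses a page boundary comes down to simple arithmetic; a self-contained sketch with the page size hard-coded and made-up sample offsets:

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* How a bounce copy of 'datacount' bytes starting at page offset 'dataoffs'
 * is split between the first and (possibly) second client page. */
static void split_copy(size_t dataoffs, size_t datacount)
{
	size_t first = PAGE_SIZE - dataoffs;
	size_t datacount1 = datacount < first ? datacount : first;
	size_t datacount2 = datacount - datacount1;

	printf("offs %4zu len %4zu -> page0: %4zu bytes, page1: %4zu bytes\n",
	    dataoffs, datacount, datacount1, datacount2);
}

int main(void)
{
	split_copy(0, 512);	/* fits entirely in the first page */
	split_copy(3800, 1024);	/* spills 728 bytes into the second page */
	return (0);
}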
+ */ + datavaddr = pmap_quick_enter_page(bpage->datapage[1]); + datacount2 = bpage->datacount - datacount1; + bcopy((void *)datavaddr, + (void *)(bpage->vaddr + datacount1), datacount2); + pmap_quick_remove_page(datavaddr); + +next_w: bpage = STAILQ_NEXT(bpage, links); } dmat->bounce_zone->total_bounced++; @@ -806,14 +961,40 @@ if ((op & BUS_DMASYNC_POSTREAD) != 0) { while (bpage != NULL) { - if (bpage->datavaddr != 0) { - bcopy((void *)bpage->vaddr, - (void *)bpage->datavaddr, - bpage->datacount); - } else { - physcopyin((void *)bpage->vaddr, - bpage->dataaddr, bpage->datacount); + tempvaddr = 0; + datavaddr = bpage->datavaddr; + datacount1 = bpage->datacount; + if (datavaddr == 0) { + tempvaddr = + pmap_quick_enter_page(bpage->datapage[0]); + datavaddr = tempvaddr | bpage->dataoffs; + datacount1 = min(PAGE_SIZE - bpage->dataoffs, + datacount1); } + + bcopy((void *)bpage->vaddr, (void *)datavaddr, + datacount1); + + if (tempvaddr != 0) + pmap_quick_remove_page(tempvaddr); + + if (bpage->datapage[1] == 0) { + KASSERT(datacount1 == bpage->datacount, + ("Mismatch between data size and provided memory space")); + goto next_r; + } + + /* + * We are dealing with an unmapped buffer that expands + * over two pages. + */ + datavaddr = pmap_quick_enter_page(bpage->datapage[1]); + datacount2 = bpage->datacount - datacount1; + bcopy((void *)(bpage->vaddr + datacount1), + (void *)datavaddr, datacount2); + pmap_quick_remove_page(datavaddr); + +next_r: bpage = STAILQ_NEXT(bpage, links); } dmat->bounce_zone->total_bounced++; @@ -979,7 +1160,7 @@ static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, - bus_addr_t addr, bus_size_t size) + bus_addr_t addr1, bus_addr_t addr2, bus_size_t size) { struct bounce_zone *bz; struct bounce_page *bpage; @@ -1009,11 +1190,16 @@ if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. 
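In the hunk that follows, BUS_DMA_KEEP_PG_OFFSET folds the client's page offset into the otherwise page-aligned bounce addresses. A minimal illustration; the addresses and page mask are made up:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK 0xfffu

/* With BUS_DMA_KEEP_PG_OFFSET the bounce page must present the client's
 * page offset, so the offset bits of the client address are folded into
 * the (page-aligned) bounce address. */
static uint64_t keep_offset(uint64_t bounce_base, uint64_t client_addr)
{
	return (bounce_base | (client_addr & PAGE_MASK));
}

int main(void)
{
	uint64_t busaddr = keep_offset(0x7f000000, 0x12345678);

	printf("bounce busaddr: 0x%llx\n", (unsigned long long)busaddr);	/* 0x7f000678 */
	return (0);
}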
*/ - bpage->vaddr |= addr & PAGE_MASK; - bpage->busaddr |= addr & PAGE_MASK; + bpage->vaddr |= addr1 & PAGE_MASK; + bpage->busaddr |= addr1 & PAGE_MASK; + KASSERT(addr2 == 0, + ("Trying to bounce multiple pages with BUS_DMA_KEEP_PG_OFFSET")); } bpage->datavaddr = vaddr; - bpage->dataaddr = addr; + bpage->datapage[0] = PHYS_TO_VM_PAGE(addr1); + KASSERT((addr2 & PAGE_MASK) == 0, ("Second page is not aligned")); + bpage->datapage[1] = PHYS_TO_VM_PAGE(addr2); + bpage->dataoffs = addr1 & PAGE_MASK; bpage->datacount = size; STAILQ_INSERT_TAIL(&(map->bpages), bpage, links); return (bpage->busaddr); @@ -1085,7 +1271,7 @@ .mem_free = bounce_bus_dmamem_free, .load_phys = bounce_bus_dmamap_load_phys, .load_buffer = bounce_bus_dmamap_load_buffer, - .load_ma = bus_dmamap_load_ma_triv, + .load_ma = bounce_bus_dmamap_load_ma, .map_waitok = bounce_bus_dmamap_waitok, .map_complete = bounce_bus_dmamap_complete, .map_unload = bounce_bus_dmamap_unload, Modified: trunk/sys/x86/x86/busdma_machdep.c =================================================================== --- trunk/sys/x86/x86/busdma_machdep.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/busdma_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_machdep.c 259511 2013-12-17 13:39:50Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_machdep.c 257230 2013-10-27 22:05:10Z kib $"); #include <sys/param.h> #include <sys/systm.h> Added: trunk/sys/x86/x86/cpu_machdep.c =================================================================== --- trunk/sys/x86/x86/cpu_machdep.c (rev 0) +++ trunk/sys/x86/x86/cpu_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,1359 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/cpu_machdep.c 355701 2019-12-13 06:54:41Z scottl $"); + +#include "opt_atpic.h" +#include "opt_compat.h" +#include "opt_cpu.h" +#include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_isa.h" +#include "opt_kdb.h" +#include "opt_kstack_pages.h" +#include "opt_maxmem.h" +#include "opt_mp_watchdog.h" +#include "opt_perfmon.h" +#include "opt_platform.h" +#ifdef __i386__ +#include "opt_apic.h" +#include "opt_xbox.h" +#endif + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cpu.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <machine/clock.h> +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/specialreg.h> +#include <machine/md_var.h> +#include <machine/mp_watchdog.h> +#ifdef PERFMON +#include <machine/perfmon.h> +#endif +#include <machine/tss.h> +#ifdef SMP +#include <machine/smp.h> +#endif +#ifdef CPU_ELAN +#include <machine/elan_mmcr.h> +#endif +#include <x86/acpica_machdep.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_param.h> + +#ifndef PC98 +#include <isa/isareg.h> +#endif + +#define STATE_RUNNING 0x0 +#define STATE_MWAIT 0x1 +#define STATE_SLEEPING 0x2 + +#ifdef SMP +static u_int cpu_reset_proxyid; +static volatile u_int cpu_reset_proxy_active; +#endif + +struct msr_op_arg { + u_int msr; + int op; + uint64_t arg1; +}; + +static void +x86_msr_op_one(void *argp) +{ + struct msr_op_arg *a; + uint64_t v; + + a = argp; + switch (a->op) { + case MSR_OP_ANDNOT: + v = rdmsr(a->msr); + v &= ~a->arg1; + wrmsr(a->msr, v); + break; + case MSR_OP_OR: + v = rdmsr(a->msr); + v |= a->arg1; + wrmsr(a->msr, v); + break; + case MSR_OP_WRITE: + wrmsr(a->msr, a->arg1); + break; + } +} + +#define MSR_OP_EXMODE_MASK 0xf0000000 +#define MSR_OP_OP_MASK 0x000000ff + +void +x86_msr_op(u_int msr, u_int op, uint64_t arg1) +{ + struct thread *td; + struct msr_op_arg a; + u_int exmode; + int bound_cpu, i, is_bound; + + a.op = op & MSR_OP_OP_MASK; + MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR || + a.op == MSR_OP_WRITE); + exmode = op & MSR_OP_EXMODE_MASK; + MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED || + exmode == MSR_OP_RENDEZVOUS); + a.msr = msr; + a.arg1 = arg1; + switch (exmode) { + case MSR_OP_LOCAL: + x86_msr_op_one(&a); + break; + case MSR_OP_SCHED: + td = curthread; + thread_lock(td); + is_bound = sched_is_bound(td); + bound_cpu = td->td_oncpu; + CPU_FOREACH(i) { + sched_bind(td, i); + x86_msr_op_one(&a); + } + if (is_bound) + sched_bind(td, bound_cpu); + else + sched_unbind(td); + 
thread_unlock(td); + break; + case MSR_OP_RENDEZVOUS: + smp_rendezvous(NULL, x86_msr_op_one, NULL, &a); + break; + } +} + +/* + * Machine dependent boot() routine + * + * I haven't seen anything to put here yet + * Possibly some stuff might be grafted back here from boot() + */ +void +cpu_boot(int howto) +{ +} + +/* + * Flush the D-cache for non-DMA I/O so that the I-cache can + * be made coherent later. + */ +void +cpu_flush_dcache(void *ptr, size_t len) +{ + /* Not applicable */ +} + +void +acpi_cpu_c1(void) +{ + + __asm __volatile("sti; hlt"); +} + +/* + * Use mwait to pause execution while waiting for an interrupt or + * another thread to signal that there is more work. + * + * NOTE: Interrupts will cause a wakeup; however, this function does + * not enable interrupt handling. The caller is responsible to enable + * interrupts. + */ +void +acpi_cpu_idle_mwait(uint32_t mwait_hint) +{ + int *state; + uint64_t v; + + /* + * A comment in Linux patch claims that 'CPUs run faster with + * speculation protection disabled. All CPU threads in a core + * must disable speculation protection for it to be + * disabled. Disable it while we are idle so the other + * hyperthread can run fast.' + * + * XXXKIB. Software coordination mode should be supported, + * but all Intel CPUs provide hardware coordination. + */ + + state = (int *)PCPU_PTR(monitorbuf); + KASSERT(atomic_load_int(state) == STATE_SLEEPING, + ("cpu_mwait_cx: wrong monitorbuf state")); + atomic_store_int(state, STATE_MWAIT); + if (PCPU_GET(ibpb_set) || hw_ssb_active) { + v = rdmsr(MSR_IA32_SPEC_CTRL); + wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS | + IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD)); + } else { + v = 0; + } + cpu_monitor(state, 0, 0); + if (atomic_load_int(state) == STATE_MWAIT) + cpu_mwait(MWAIT_INTRBREAK, mwait_hint); + + /* + * SSB cannot be disabled while we sleep, or rather, if it was + * disabled, the sysctl thread will bind to our cpu to tweak + * MSR. + */ + if (v != 0) + wrmsr(MSR_IA32_SPEC_CTRL, v); + + /* + * We should exit on any event that interrupts mwait, because + * that event might be a wanted interrupt. + */ + atomic_store_int(state, STATE_RUNNING); +} + +/* Get current clock frequency for the given cpu id. */ +int +cpu_est_clockrate(int cpu_id, uint64_t *rate) +{ + uint64_t tsc1, tsc2; + uint64_t acnt, mcnt, perf; + register_t reg; + + if (pcpu_find(cpu_id) == NULL || rate == NULL) + return (EINVAL); +#ifdef __i386__ + if ((cpu_feature & CPUID_TSC) == 0) + return (EOPNOTSUPP); +#endif + + /* + * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, + * DELAY(9) based logic fails. + */ + if (tsc_is_invariant && !tsc_perf_stat) + return (EOPNOTSUPP); + +#ifdef SMP + if (smp_cpus > 1) { + /* Schedule ourselves on the indicated cpu. */ + thread_lock(curthread); + sched_bind(curthread, cpu_id); + thread_unlock(curthread); + } +#endif + + /* Calibrate by measuring a short delay. 
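The calibration that follows measures a TSC delta over a 1000 us delay and, on invariant-TSC CPUs, scales it by the APERF/MPERF ratio. Reduced to plain arithmetic with made-up counter values:

#include <stdint.h>
#include <stdio.h>

/* Estimate the effective clock rate from a TSC delta measured over a
 * 1000 us delay, scaled by the APERF/MPERF ratio (invariant-TSC case).
 * All counter values below are illustrative only. */
static uint64_t est_rate(uint64_t tsc_delta_1ms, uint64_t acnt, uint64_t mcnt)
{
	uint64_t perf = 1000 * acnt / mcnt;	/* actual-vs-reference ratio, x1000 */

	return (tsc_delta_1ms * perf);		/* ticks/ms * 1000 = Hz, scaled */
}

int main(void)
{
	/* 3.0 GHz reference TSC, CPU currently running at ~80% of that. */
	printf("estimated rate: %llu Hz\n",
	    (unsigned long long)est_rate(3000000, 800, 1000));
	return (0);
}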
*/ + reg = intr_disable(); + if (tsc_is_invariant) { + wrmsr(MSR_MPERF, 0); + wrmsr(MSR_APERF, 0); + tsc1 = rdtsc(); + DELAY(1000); + mcnt = rdmsr(MSR_MPERF); + acnt = rdmsr(MSR_APERF); + tsc2 = rdtsc(); + intr_restore(reg); + perf = 1000 * acnt / mcnt; + *rate = (tsc2 - tsc1) * perf; + } else { + tsc1 = rdtsc(); + DELAY(1000); + tsc2 = rdtsc(); + intr_restore(reg); + *rate = (tsc2 - tsc1) * 1000; + } + +#ifdef SMP + if (smp_cpus > 1) { + thread_lock(curthread); + sched_unbind(curthread); + thread_unlock(curthread); + } +#endif + + return (0); +} + +/* + * Shutdown the CPU as much as possible + */ +void +cpu_halt(void) +{ + for (;;) + halt(); +} + +static void +cpu_reset_real(void) +{ + struct region_descriptor null_idt; +#ifndef PC98 + int b; +#endif + + disable_intr(); +#ifdef CPU_ELAN + if (elan_mmcr != NULL) + elan_mmcr->RESCFG = 1; +#endif +#ifdef __i386__ + if (cpu == CPU_GEODE1100) { + /* Attempt Geode's own reset */ + outl(0xcf8, 0x80009044ul); + outl(0xcfc, 0xf); + } +#endif +#ifdef PC98 + /* + * Attempt to do a CPU reset via CPU reset port. + */ + if ((inb(0x35) & 0xa0) != 0xa0) { + outb(0x37, 0x0f); /* SHUT0 = 0. */ + outb(0x37, 0x0b); /* SHUT1 = 0. */ + } + outb(0xf0, 0x00); /* Reset. */ +#else +#if !defined(BROKEN_KEYBOARD_RESET) + /* + * Attempt to do a CPU reset via the keyboard controller, + * do not turn off GateA20, as any machine that fails + * to do the reset here would then end up in no man's land. + */ + outb(IO_KBD + 4, 0xFE); + DELAY(500000); /* wait 0.5 sec to see if that did it */ +#endif + + /* + * Attempt to force a reset via the Reset Control register at + * I/O port 0xcf9. Bit 2 forces a system reset when it + * transitions from 0 to 1. Bit 1 selects the type of reset + * to attempt: 0 selects a "soft" reset, and 1 selects a + * "hard" reset. We try a "hard" reset. The first write sets + * bit 1 to select a "hard" reset and clears bit 2. The + * second write forces a 0 -> 1 transition in bit 2 to trigger + * a reset. + */ + outb(0xcf9, 0x2); + outb(0xcf9, 0x6); + DELAY(500000); /* wait 0.5 sec to see if that did it */ + + /* + * Attempt to force a reset via the Fast A20 and Init register + * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. + * Bit 0 asserts INIT# when set to 1. We are careful to only + * preserve bit 1 while setting bit 0. We also must clear bit + * 0 before setting it if it isn't already clear. + */ + b = inb(0x92); + if (b != 0xff) { + if ((b & 0x1) != 0) + outb(0x92, b & 0xfe); + outb(0x92, b | 0x1); + DELAY(500000); /* wait 0.5 sec to see if that did it */ + } +#endif /* PC98 */ + + printf("No known reset method worked, attempting CPU shutdown\n"); + DELAY(1000000); /* wait 1 sec for printf to complete */ + + /* Wipe the IDT. */ + null_idt.rd_limit = 0; + null_idt.rd_base = 0; + lidt(&null_idt); + + /* "good night, sweet prince .... 
<THUNK!>" */ + breakpoint(); + + /* NOTREACHED */ + while(1); +} + +#ifdef SMP +static void +cpu_reset_proxy(void) +{ + + cpu_reset_proxy_active = 1; + while (cpu_reset_proxy_active == 1) + ia32_pause(); /* Wait for other cpu to see that we've started */ + + printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); + DELAY(1000000); + cpu_reset_real(); +} +#endif + +void +cpu_reset(void) +{ +#ifdef SMP + cpuset_t map; + u_int cnt; + + if (smp_started) { + map = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &map); + CPU_NAND(&map, &stopped_cpus); + if (!CPU_EMPTY(&map)) { + printf("cpu_reset: Stopping other CPUs\n"); + stop_cpus(map); + } + + if (PCPU_GET(cpuid) != 0) { + cpu_reset_proxyid = PCPU_GET(cpuid); + cpustop_restartfunc = cpu_reset_proxy; + cpu_reset_proxy_active = 0; + printf("cpu_reset: Restarting BSP\n"); + + /* Restart CPU #0. */ + CPU_SETOF(0, &started_cpus); + wmb(); + + cnt = 0; + while (cpu_reset_proxy_active == 0 && cnt < 10000000) { + ia32_pause(); + cnt++; /* Wait for BSP to announce restart */ + } + if (cpu_reset_proxy_active == 0) { + printf("cpu_reset: Failed to restart BSP\n"); + } else { + cpu_reset_proxy_active = 2; + while (1) + ia32_pause(); + /* NOTREACHED */ + } + } + + DELAY(1000000); + } +#endif + cpu_reset_real(); + /* NOTREACHED */ +} + +bool +cpu_mwait_usable(void) +{ + + return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & + (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == + (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); +} + +void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ +static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ +static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ +SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, + 0, "Use MONITOR/MWAIT for short idle"); + +#ifndef PC98 +static void +cpu_idle_acpi(sbintime_t sbt) +{ + int *state; + + state = (int *)PCPU_PTR(monitorbuf); + atomic_store_int(state, STATE_SLEEPING); + + /* See comments in cpu_idle_hlt(). */ + disable_intr(); + if (sched_runnable()) + enable_intr(); + else if (cpu_idle_hook) + cpu_idle_hook(sbt); + else + acpi_cpu_c1(); + atomic_store_int(state, STATE_RUNNING); +} +#endif /* !PC98 */ + +static void +cpu_idle_hlt(sbintime_t sbt) +{ + int *state; + + state = (int *)PCPU_PTR(monitorbuf); + atomic_store_int(state, STATE_SLEEPING); + + /* + * Since we may be in a critical section from cpu_idle(), if + * an interrupt fires during that critical section we may have + * a pending preemption. If the CPU halts, then that thread + * may not execute until a later interrupt awakens the CPU. + * To handle this race, check for a runnable thread after + * disabling interrupts and immediately return if one is + * found. Also, we must absolutely guarentee that hlt is + * the next instruction after sti. This ensures that any + * interrupt that fires after the call to disable_intr() will + * immediately awaken the CPU from hlt. Finally, please note + * that on x86 this works fine because of interrupts enabled only + * after the instruction following sti takes place, while IF is set + * to 1 immediately, allowing hlt instruction to acknowledge the + * interrupt. + */ + disable_intr(); + if (sched_runnable()) + enable_intr(); + else + acpi_cpu_c1(); + atomic_store_int(state, STATE_RUNNING); +} + +static void +cpu_idle_mwait(sbintime_t sbt) +{ + int *state; + + state = (int *)PCPU_PTR(monitorbuf); + atomic_store_int(state, STATE_MWAIT); + + /* See comments in cpu_idle_hlt(). 
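The handshake between cpu_idle_mwait() here and cpu_idle_wakeup() further down can be sketched with C11 atomics; cpu_monitor()/cpu_mwait() are stubbed out, so this only shows the state-machine logic, not the actual instructions:

#include <stdatomic.h>
#include <stdio.h>

enum { STATE_RUNNING, STATE_MWAIT, STATE_SLEEPING };

static _Atomic int state = STATE_RUNNING;

/* Stand-ins for cpu_monitor()/cpu_mwait(); the real ones are CPU instructions. */
static void fake_monitor(void) { }
static void fake_mwait(void)   { printf("cpu would sleep in mwait here\n"); }

/* Idle side: arm the monitor, then only sleep if nobody changed the state. */
static void idle_mwait(void)
{
	atomic_store(&state, STATE_MWAIT);
	fake_monitor();				/* arm monitoring of &state */
	if (atomic_load(&state) == STATE_MWAIT)	/* re-check: a wakeup may have raced */
		fake_mwait();
	atomic_store(&state, STATE_RUNNING);
}

/* Wakeup side: a store to the monitored word is enough to break MWAIT. */
static int idle_wakeup(void)
{
	if (atomic_load(&state) == STATE_MWAIT) {
		atomic_store(&state, STATE_RUNNING);
		return (1);			/* no IPI needed */
	}
	return (0);
}

int main(void)
{
	idle_mwait();
	printf("wakeup without IPI: %d\n", idle_wakeup());
	return (0);
}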
*/ + disable_intr(); + if (sched_runnable()) { + atomic_store_int(state, STATE_RUNNING); + enable_intr(); + return; + } + + cpu_monitor(state, 0, 0); + if (atomic_load_int(state) == STATE_MWAIT) + __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); + else + enable_intr(); + atomic_store_int(state, STATE_RUNNING); +} + +static void +cpu_idle_spin(sbintime_t sbt) +{ + int *state; + int i; + + state = (int *)PCPU_PTR(monitorbuf); + atomic_store_int(state, STATE_RUNNING); + + /* + * The sched_runnable() call is racy but as long as there is + * a loop missing it one time will have just a little impact if any + * (and it is much better than missing the check at all). + */ + for (i = 0; i < 1000; i++) { + if (sched_runnable()) + return; + cpu_spinwait(); + } +} + +/* + * C1E renders the local APIC timer dead, so we disable it by + * reading the Interrupt Pending Message register and clearing + * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). + * + * Reference: + * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" + * #32559 revision 3.00+ + */ +#define MSR_AMDK8_IPM 0xc0010055 +#define AMDK8_SMIONCMPHALT (1ULL << 27) +#define AMDK8_C1EONCMPHALT (1ULL << 28) +#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) + +void +cpu_probe_amdc1e(void) +{ + + /* + * Detect the presence of C1E capability mostly on latest + * dual-cores (or future) k8 family. + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + (cpu_id & 0x00000f00) == 0x00000f00 && + (cpu_id & 0x0fff0000) >= 0x00040000) { + cpu_ident_amdc1e = 1; + } +} + +#if defined(__i386__) && defined(PC98) +void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; +#else +void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; +#endif + +void +cpu_idle(int busy) +{ + uint64_t msr; + sbintime_t sbt = -1; + + CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", + busy, curcpu); +#ifdef MP_WATCHDOG + ap_watchdog(PCPU_GET(cpuid)); +#endif + + /* If we are busy - try to use fast methods. */ + if (busy) { + if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { + cpu_idle_mwait(busy); + goto out; + } + } + + /* If we have time - switch timers into idle mode. */ + if (!busy) { + critical_enter(); + sbt = cpu_idleclock(); + } + + /* Apply AMD APIC timer C1E workaround. */ + if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { + msr = rdmsr(MSR_AMDK8_IPM); + if (msr & AMDK8_CMPHALT) + wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); + } + + /* Call main idle method. */ + cpu_idle_fn(sbt); + + /* Switch timers back into active mode. */ + if (!busy) { + cpu_activeclock(); + critical_exit(); + } +out: + CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", + busy, curcpu); +} + +static int cpu_idle_apl31_workaround; +SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW, + &cpu_idle_apl31_workaround, 0, + "Apollo Lake APL31 MWAIT bug workaround"); + +int +cpu_idle_wakeup(int cpu) +{ + int *state; + + state = (int *)pcpu_find(cpu)->pc_monitorbuf; + switch (atomic_load_int(state)) { + case STATE_SLEEPING: + return (0); + case STATE_MWAIT: + atomic_store_int(state, STATE_RUNNING); + return (cpu_idle_apl31_workaround ? 0 : 1); + case STATE_RUNNING: + return (1); + default: + panic("bad monitor state"); + return (1); + } +} + +/* + * Ordered by speed/power consumption. 
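The idle-method table that follows and the name-based selector behind the machdep.idle sysctl boil down to a lookup like the following sketch, with the idle routines reduced to stubs:

#include <stdio.h>
#include <string.h>

static void idle_spin(void)  { puts("spin");  }
static void idle_mwait(void) { puts("mwait"); }
static void idle_hlt(void)   { puts("hlt");   }

static struct {
	void (*id_fn)(void);
	const char *id_name;
} idle_tbl[] = {
	{ idle_spin,  "spin"  },
	{ idle_mwait, "mwait" },
	{ idle_hlt,   "hlt"   },
};

static void (*cpu_idle_fn)(void) = idle_hlt;

/* Pick an idle routine by name, as the machdep.idle sysctl/tunable does. */
static int idle_select(const char *name)
{
	for (unsigned i = 0; i < sizeof(idle_tbl) / sizeof(idle_tbl[0]); i++) {
		if (strcmp(idle_tbl[i].id_name, name) == 0) {
			cpu_idle_fn = idle_tbl[i].id_fn;
			return (1);
		}
	}
	return (0);
}

int main(void)
{
	idle_select("mwait");
	cpu_idle_fn();
	return (0);
}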
+ */ +static struct { + void *id_fn; + char *id_name; + int id_cpuid2_flag; +} idle_tbl[] = { + { .id_fn = cpu_idle_spin, .id_name = "spin" }, + { .id_fn = cpu_idle_mwait, .id_name = "mwait", + .id_cpuid2_flag = CPUID2_MON }, + { .id_fn = cpu_idle_hlt, .id_name = "hlt" }, +#if !defined(__i386__) || !defined(PC98) + { .id_fn = cpu_idle_acpi, .id_name = "acpi" }, +#endif +}; + +static int +idle_sysctl_available(SYSCTL_HANDLER_ARGS) +{ + char *avail, *p; + int error; + int i; + + avail = malloc(256, M_TEMP, M_WAITOK); + p = avail; + for (i = 0; i < nitems(idle_tbl); i++) { + if (idle_tbl[i].id_cpuid2_flag != 0 && + (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) + continue; +#if !defined(__i386__) || !defined(PC98) + if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && + cpu_idle_hook == NULL) + continue; +#endif + p += sprintf(p, "%s%s", p != avail ? ", " : "", + idle_tbl[i].id_name); + } + error = sysctl_handle_string(oidp, avail, 0, req); + free(avail, M_TEMP); + return (error); +} + +SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, + 0, 0, idle_sysctl_available, "A", "list of available idle functions"); + +static bool +cpu_idle_selector(const char *new_idle_name) +{ + int i; + + for (i = 0; i < nitems(idle_tbl); i++) { + if (idle_tbl[i].id_cpuid2_flag != 0 && + (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) + continue; +#if !defined(__i386__) || !defined(PC98) + if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && + cpu_idle_hook == NULL) + continue; +#endif + if (strcmp(idle_tbl[i].id_name, new_idle_name)) + continue; + cpu_idle_fn = idle_tbl[i].id_fn; + if (bootverbose) + printf("CPU idle set to %s\n", idle_tbl[i].id_name); + return (true); + } + return (false); +} + +static int +cpu_idle_sysctl(SYSCTL_HANDLER_ARGS) +{ + char buf[16], *p; + int error, i; + + p = "unknown"; + for (i = 0; i < nitems(idle_tbl); i++) { + if (idle_tbl[i].id_fn == cpu_idle_fn) { + p = idle_tbl[i].id_name; + break; + } + } + strncpy(buf, p, sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + return (cpu_idle_selector(buf) ? 0 : EINVAL); +} + +SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, + cpu_idle_sysctl, "A", "currently selected idle function"); + +static void +cpu_idle_tun(void *unused __unused) +{ + char tunvar[16]; + + if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar))) + cpu_idle_selector(tunvar); + else if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) { + /* Ryzen erratas 1057, 1109. */ + cpu_idle_selector("hlt"); + idle_mwait = 0; + } + + if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) { + /* + * Apollo Lake errata APL31 (public errata APL30). + * Stores to the armed address range may not trigger + * MWAIT to resume execution. OS needs to use + * interrupts to wake processors from MWAIT-induced + * sleep states. 
+ */ + cpu_idle_apl31_workaround = 1; + } + TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround); +} +SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL); + +static int panic_on_nmi = 1; +SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, + &panic_on_nmi, 0, + "Panic on NMI raised by hardware failure"); +int nmi_is_broadcast = 1; +SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, + &nmi_is_broadcast, 0, + "Chipset NMI is broadcast"); +#ifdef KDB +int kdb_on_nmi = 1; +SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, + &kdb_on_nmi, 0, + "Go to KDB on NMI with unknown source"); +#endif + +void +nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) +{ + bool claimed = false; + +#ifdef DEV_ISA + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(frame->tf_err)) { + claimed = true; + if (panic_on_nmi) + panic("NMI indicates hardware failure"); + } +#endif /* DEV_ISA */ +#ifdef KDB + if (!claimed && kdb_on_nmi) { + /* + * NMI can be hooked up to a pushbutton for debugging. + */ + printf("NMI/cpu%d ... going to debugger\n", cpu); + kdb_trap(type, 0, frame); + } +#endif /* KDB */ +} + +void +nmi_handle_intr(u_int type, struct trapframe *frame) +{ + +#ifdef SMP + if (nmi_is_broadcast) { + nmi_call_kdb_smp(type, frame); + return; + } +#endif + nmi_call_kdb(PCPU_GET(cpuid), type, frame); +} + +int hw_ibrs_active; +int hw_ibrs_disable = 1; + +SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0, + "Indirect Branch Restricted Speculation active"); + +void +hw_ibrs_recalculate(void) +{ + if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) { + x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL | + (hw_ibrs_disable ? MSR_OP_ANDNOT : MSR_OP_OR), + IA32_SPEC_CTRL_IBRS); + return; + } + hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 && + !hw_ibrs_disable; +} + +static int +hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = hw_ibrs_disable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + hw_ibrs_disable = val != 0; + hw_ibrs_recalculate(); + return (0); +} +SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN | + CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", + "Disable Indirect Branch Restricted Speculation"); + +int hw_ssb_active; +int hw_ssb_disable; + +SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD, + &hw_ssb_active, 0, + "Speculative Store Bypass Disable active"); + +static void +hw_ssb_set(bool enable, bool for_all_cpus) +{ + + if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) { + hw_ssb_active = 0; + return; + } + hw_ssb_active = enable; + x86_msr_op(MSR_IA32_SPEC_CTRL, + (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | + (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD); +} + +void +hw_ssb_recalculate(bool all_cpus) +{ + + switch (hw_ssb_disable) { + default: + hw_ssb_disable = 0; + /* FALLTHROUGH */ + case 0: /* off */ + hw_ssb_set(false, all_cpus); + break; + case 1: /* on */ + hw_ssb_set(true, all_cpus); + break; + case 2: /* auto */ + hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ? 
+ false : true, all_cpus); + break; + } +} + +static int +hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = hw_ssb_disable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + hw_ssb_disable = val; + hw_ssb_recalculate(true); + return (0); +} +SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT | + CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, + hw_ssb_disable_handler, "I", + "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto"); + +int hw_mds_disable; + +/* + * Handler for Microarchitectural Data Sampling issues. Really not a + * pointer to C function: on amd64 the code must not change any CPU + * architectural state except possibly %rflags. Also, it is always + * called with interrupts disabled. + */ +void mds_handler_void(void); +void mds_handler_verw(void); +void mds_handler_ivb(void); +void mds_handler_bdw(void); +void mds_handler_skl_sse(void); +void mds_handler_skl_avx(void); +void mds_handler_skl_avx512(void); +void mds_handler_silvermont(void); +void (*mds_handler)(void) = mds_handler_void; + +static int +sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS) +{ + const char *state; + + if (mds_handler == mds_handler_void) + state = "inactive"; + else if (mds_handler == mds_handler_verw) + state = "VERW"; + else if (mds_handler == mds_handler_ivb) + state = "software IvyBridge"; + else if (mds_handler == mds_handler_bdw) + state = "software Broadwell"; + else if (mds_handler == mds_handler_skl_sse) + state = "software Skylake SSE"; + else if (mds_handler == mds_handler_skl_avx) + state = "software Skylake AVX"; + else if (mds_handler == mds_handler_skl_avx512) + state = "software Skylake AVX512"; + else if (mds_handler == mds_handler_silvermont) + state = "software Silvermont"; + else + state = "unknown"; + return (SYSCTL_OUT(req, state, strlen(state))); +} + +SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_hw_mds_disable_state_handler, "A", + "Microarchitectural Data Sampling Mitigation state"); + +_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512"); + +void +hw_mds_recalculate(void) +{ + struct pcpu *pc; + vm_offset_t b64; + u_long xcr0; + int i; + + /* + * Allow user to force VERW variant even if MD_CLEAR is not + * reported. For instance, hypervisor might unknowingly + * filter the cap out. + * For the similar reasons, and for testing, allow to enable + * mitigation even for RDCL_NO or MDS_NO caps. 
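A heavily simplified view of the selection that follows: how the mds_disable setting and the MD_CLEAR / RDCL_NO / MDS_NO capability bits pick a handler. The real code also dispatches on CPU family and model, which is omitted here:

#include <stdio.h>

/* Simplified decision for which MDS mitigation to use, given the
 * machdep.mds_disable setting (0 off, 1 VERW, 2 SW, 3 AUTO) and two
 * capability flags. Model-specific software handlers are collapsed
 * into a single case. */
static const char *
mds_choice(int mds_disable, int mds_no, int md_clear)
{
	if (mds_disable == 0 || (mds_no && mds_disable == 3))
		return ("inactive");		/* off, or AUTO on an immune CPU */
	if (mds_disable == 1 || (md_clear && mds_disable == 3))
		return ("VERW");		/* microcode-assisted flush */
	return ("software sequence");		/* model-specific fallback */
}

int main(void)
{
	printf("auto, MDS_NO set:   %s\n", mds_choice(3, 1, 0));
	printf("auto, MD_CLEAR set: %s\n", mds_choice(3, 0, 1));
	printf("auto, neither:      %s\n", mds_choice(3, 0, 0));
	return (0);
}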
+ */ + if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 || + ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO | + IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) { + mds_handler = mds_handler_void; + } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 && + hw_mds_disable == 3) || hw_mds_disable == 1) { + mds_handler = mds_handler_verw; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e || + CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a || + CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 || + CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d || + CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e || + CPUID_TO_MODEL(cpu_id) == 0x3a) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Nehalem, SandyBridge, IvyBridge + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc(672, M_TEMP, + M_WAITOK); + bzero(pc->pc_mds_buf, 16); + } + } + mds_handler = mds_handler_ivb; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c || + CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 || + CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f || + CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Haswell, Broadwell + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc(1536, M_TEMP, + M_WAITOK); + bzero(pc->pc_mds_buf, 16); + } + } + mds_handler = mds_handler_bdw; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id & + CPUID_STEPPING) <= 5) || + CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e || + (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id & + CPUID_STEPPING) <= 0xb) || + (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id & + CPUID_STEPPING) <= 0xc)) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Skylake, KabyLake, CoffeeLake, WhiskeyLake, + * CascadeLake + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc(6 * 1024, + M_TEMP, M_WAITOK); + b64 = (vm_offset_t)malloc(64 + 63, + M_TEMP, M_WAITOK); + pc->pc_mds_buf64 = (void *)roundup2(b64, 64); + bzero(pc->pc_mds_buf64, 64); + } + } + xcr0 = rxcr(0); + if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 && + (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0) + mds_handler = mds_handler_skl_avx512; + else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 && + (cpu_feature2 & CPUID2_AVX) != 0) + mds_handler = mds_handler_skl_avx; + else + mds_handler = mds_handler_skl_sse; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + ((CPUID_TO_MODEL(cpu_id) == 0x37 || + CPUID_TO_MODEL(cpu_id) == 0x4a || + CPUID_TO_MODEL(cpu_id) == 0x4c || + CPUID_TO_MODEL(cpu_id) == 0x4d || + CPUID_TO_MODEL(cpu_id) == 0x5a || + CPUID_TO_MODEL(cpu_id) == 0x5d || + CPUID_TO_MODEL(cpu_id) == 0x6e || + CPUID_TO_MODEL(cpu_id) == 0x65 || + CPUID_TO_MODEL(cpu_id) == 0x75 || + CPUID_TO_MODEL(cpu_id) == 0x1c || + CPUID_TO_MODEL(cpu_id) == 0x26 || + CPUID_TO_MODEL(cpu_id) == 0x27 || + CPUID_TO_MODEL(cpu_id) == 0x35 || + CPUID_TO_MODEL(cpu_id) == 0x36 || + CPUID_TO_MODEL(cpu_id) == 0x7a))) { + /* Silvermont, Airmont */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) + pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK); + } + mds_handler = mds_handler_silvermont; + } 
else { + hw_mds_disable = 0; + mds_handler = mds_handler_void; + } +} + +static void +hw_mds_recalculate_boot(void *arg __unused) +{ + + hw_mds_recalculate(); +} +SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL); + +static int +sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = hw_mds_disable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val < 0 || val > 3) + return (EINVAL); + hw_mds_disable = val; + hw_mds_recalculate(); + return (0); +} + +SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT | + CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, + sysctl_mds_disable_handler, "I", + "Microarchitectural Data Sampling Mitigation " + "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO"); + + +/* + * Intel Transactional Memory Asynchronous Abort Mitigation + * CVE-2019-11135 + */ +int x86_taa_enable; +int x86_taa_state; +enum { + TAA_NONE = 0, /* No mitigation enabled */ + TAA_TSX_DISABLE = 1, /* Disable TSX via MSR */ + TAA_VERW = 2, /* Use VERW mitigation */ + TAA_AUTO = 3, /* Automatically select the mitigation */ + + /* The states below are not selectable by the operator */ + + TAA_TAA_UC = 4, /* Mitigation present in microcode */ + TAA_NOT_PRESENT = 5 /* TSX is not present */ +}; + +static void +taa_set(bool enable, bool all) +{ + + x86_msr_op(MSR_IA32_TSX_CTRL, + (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | + (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL), + IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR); +} + +void +x86_taa_recalculate(void) +{ + static int taa_saved_mds_disable = 0; + int taa_need = 0, taa_state = 0; + int mds_disable = 0, need_mds_recalc = 0; + + /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */ + if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 || + (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) { + /* TSX is not present */ + x86_taa_state = TAA_NOT_PRESENT; + return; + } + + /* Check to see what mitigation options the CPU gives us */ + if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) { + /* CPU is not suseptible to TAA */ + taa_need = TAA_TAA_UC; + } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) { + /* + * CPU can turn off TSX. This is the next best option + * if TAA_NO hardware mitigation isn't present + */ + taa_need = TAA_TSX_DISABLE; + } else { + /* No TSX/TAA specific remedies are available. */ + if (x86_taa_enable == TAA_TSX_DISABLE) { + if (bootverbose) + printf("TSX control not available\n"); + return; + } else + taa_need = TAA_VERW; + } + + /* Can we automatically take action, or are we being forced? */ + if (x86_taa_enable == TAA_AUTO) + taa_state = taa_need; + else + taa_state = x86_taa_enable; + + /* No state change, nothing to do */ + if (taa_state == x86_taa_state) { + if (bootverbose) + printf("No TSX change made\n"); + return; + } + + /* Does the MSR need to be turned on or off? */ + if (taa_state == TAA_TSX_DISABLE) + taa_set(true, true); + else if (x86_taa_state == TAA_TSX_DISABLE) + taa_set(false, true); + + /* Does MDS need to be set to turn on VERW? 
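The capability checks above that choose the TAA mitigation level can be condensed into a small helper; a sketch using the same TAA_* states, with the relevant CPUID/MSR bits passed in as plain flags:

#include <stdio.h>

enum { TAA_NONE, TAA_TSX_DISABLE, TAA_VERW, TAA_AUTO, TAA_TAA_UC, TAA_NOT_PRESENT };

/* Condensed version of the checks above: pick the strongest remedy the
 * hardware offers. */
static int
taa_need(int has_tsx, int taa_no, int tsx_ctrl)
{
	if (!has_tsx)
		return (TAA_NOT_PRESENT);	/* nothing to mitigate */
	if (taa_no)
		return (TAA_TAA_UC);		/* fixed in microcode/hardware */
	if (tsx_ctrl)
		return (TAA_TSX_DISABLE);	/* can simply turn TSX off */
	return (TAA_VERW);			/* fall back to the MDS VERW flush */
}

int main(void)
{
	printf("no TSX:        %d\n", taa_need(0, 0, 0));
	printf("TAA_NO:        %d\n", taa_need(1, 1, 0));
	printf("TSX_CTRL only: %d\n", taa_need(1, 0, 1));
	printf("neither:       %d\n", taa_need(1, 0, 0));
	return (0);
}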
*/ + if (taa_state == TAA_VERW) { + taa_saved_mds_disable = hw_mds_disable; + mds_disable = hw_mds_disable = 1; + need_mds_recalc = 1; + } else if (x86_taa_state == TAA_VERW) { + mds_disable = hw_mds_disable = taa_saved_mds_disable; + need_mds_recalc = 1; + } + if (need_mds_recalc) { + hw_mds_recalculate(); + if (mds_disable != hw_mds_disable) { + if (bootverbose) + printf("Cannot change MDS state for TAA\n"); + /* Don't update our state */ + return; + } + } + + x86_taa_state = taa_state; + return; +} + +static void +taa_recalculate_boot(void * arg __unused) +{ + + x86_taa_recalculate(); +} +SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL); + +SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0, + "TSX Asynchronous Abort Mitigation"); + +static int +sysctl_taa_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = x86_taa_enable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val < TAA_NONE || val > TAA_AUTO) + return (EINVAL); + x86_taa_enable = val; + x86_taa_recalculate(); + return (0); +} + +SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT | + CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, + sysctl_taa_handler, "I", + "TAA Mitigation enablement control " + "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO"); + +static int +sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS) +{ + const char *state; + + switch (x86_taa_state) { + case TAA_NONE: + state = "inactive"; + break; + case TAA_TSX_DISABLE: + state = "TSX disabled"; + break; + case TAA_VERW: + state = "VERW"; + break; + case TAA_TAA_UC: + state = "Mitigated in microcode"; + break; + case TAA_NOT_PRESENT: + state = "TSX not present"; + break; + default: + state = "unknown"; + } + + return (SYSCTL_OUT(req, state, strlen(state))); +} + +SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_taa_state_handler, "A", + "TAA Mitigation state"); + Property changes on: trunk/sys/x86/x86/cpu_machdep.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/x86/delay.c =================================================================== --- trunk/sys/x86/x86/delay.c (rev 0) +++ trunk/sys/x86/x86/delay.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,138 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org> + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/delay.c 340270 2018-11-08 22:42:55Z jhb $"); + +/* Generic x86 routines to handle delay */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/timetc.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/sched.h> + +#include <machine/clock.h> +#include <machine/cpu.h> +#include <x86/init.h> + +static void +delay_tsc(int n) +{ + uint64_t end, now; + + /* + * Pin the current thread ensure correct behavior if the TSCs + * on different CPUs are not in sync. + */ + sched_pin(); + now = rdtsc(); + end = now + tsc_freq * n / 1000000; + do { + cpu_spinwait(); + now = rdtsc(); + } while (now < end); + sched_unpin(); +} + +static int +delay_tc(int n) +{ + struct timecounter *tc; + timecounter_get_t *func; + uint64_t end, freq, now; + u_int last, mask, u; + + /* + * Only use the TSC if it is P-state invariant. If the TSC is + * not P-state invariant and the CPU is not running at the + * "full" P-state, then the TSC will increment at some rate + * less than tsc_freq and delay_tsc() will wait too long. + */ + if (tsc_is_invariant && tsc_freq != 0) { + delay_tsc(n); + return (1); + } + tc = timecounter; + if (tc->tc_quality <= 0) + return (0); + func = tc->tc_get_timecount; + mask = tc->tc_counter_mask; + freq = tc->tc_frequency; + now = 0; + end = freq * n / 1000000; + last = func(tc) & mask; + do { + cpu_spinwait(); + u = func(tc) & mask; + if (u < last) + now += mask - last + u + 1; + else + now += u - last; + last = u; + } while (now < end); + return (1); +} + +void +DELAY(int n) +{ + + if (delay_tc(n)) + return; + + init_ops.early_delay(n); +} + +void +cpu_lock_delay(void) +{ + + /* + * Use TSC to wait for a usec if present, otherwise fall back + * to reading from port 0x84. We can't call into timecounters + * for this delay since timecounters might use spin locks. + * + * Note that unlike delay_tc(), this uses the TSC even if it + * is not P-state invariant. For this function it is ok to + * wait even a few usecs. 
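The timecounter loop in delay_tc() above accumulates elapsed ticks from a counter that wraps at tc_counter_mask; the wrap handling, shown with a tiny 8-bit counter for readability:

#include <stdio.h>

/* Accumulate elapsed ticks from a free-running counter that wraps at
 * 'mask', as delay_tc() does. */
int main(void)
{
	const unsigned mask = 0xff;
	unsigned samples[] = { 200, 240, 10, 60 };	/* counter wraps 255 -> 0 */
	unsigned last = samples[0], now = 0;

	for (int i = 1; i < 4; i++) {
		unsigned u = samples[i] & mask;

		if (u < last)
			now += mask - last + u + 1;	/* wrapped around */
		else
			now += u - last;
		last = u;
	}
	printf("elapsed ticks: %u\n", now);		/* 40 + 26 + 50 = 116 */
	return (0);
}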
+ */ + if (tsc_freq != 0) + delay_tsc(1); + else + inb(0x84); +} Property changes on: trunk/sys/x86/x86/delay.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/x86/dump_machdep.c =================================================================== --- trunk/sys/x86/x86/dump_machdep.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/dump_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,355 +26,30 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/dump_machdep.c 236503 2012-06-03 08:01:12Z avg $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/dump_machdep.c 276772 2015-01-07 01:01:39Z markj $"); #include "opt_watchdog.h" #include <sys/param.h> -#include <sys/systm.h> #include <sys/conf.h> -#include <sys/cons.h> +#include <sys/kerneldump.h> #include <sys/sysctl.h> -#include <sys/kernel.h> -#include <sys/kerneldump.h> -#include <sys/watchdog.h> +#include <sys/systm.h> #include <vm/vm.h> #include <vm/pmap.h> -#include <machine/elf.h> -#include <machine/md_var.h> -#ifdef __amd64__ -#define KERNELDUMP_VERSION KERNELDUMP_AMD64_VERSION -#define EM_VALUE EM_X86_64 -#else -#define KERNELDUMP_VERSION KERNELDUMP_I386_VERSION -#define EM_VALUE EM_386 -#endif - -CTASSERT(sizeof(struct kerneldumpheader) == 512); - int do_minidump = 1; -TUNABLE_INT("debug.minidump", &do_minidump); -SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RW, &do_minidump, 0, +SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RWTUN, &do_minidump, 0, "Enable mini crash dumps"); -/* - * Don't touch the first SIZEOF_METADATA bytes on the dump device. This - * is to protect us from metadata and to protect metadata from us. - */ -#define SIZEOF_METADATA (64*1024) - -#define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK) -#define DEV_ALIGN(x) (((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1)) - -struct md_pa { - vm_paddr_t md_start; - vm_paddr_t md_size; -}; - -typedef int callback_t(struct md_pa *, int, void *); - -static struct kerneldumpheader kdh; -static off_t dumplo, fileofs; - -/* Handle buffered writes. 
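The buffered-write helpers being removed immediately below only ever hand whole DEV_BSIZE blocks to the dump device; their logic, with dump_write() replaced by a stub:

#include <stdio.h>
#include <string.h>

#define DEV_BSIZE 512

static char buffer[DEV_BSIZE];
static size_t fragsz;
static unsigned long long dumplo;	/* byte offset on the dump device */

/* Stand-in for dump_write(): just record that a full block went out. */
static void write_block(void)
{
	printf("write %d bytes at offset %llu\n", DEV_BSIZE, dumplo);
	dumplo += DEV_BSIZE;
	fragsz = 0;
}

/* Buffer arbitrary-sized writes so the device only ever sees whole blocks. */
static void buf_write(const char *ptr, size_t sz)
{
	while (sz > 0) {
		size_t len = DEV_BSIZE - fragsz;
		if (len > sz)
			len = sz;
		memcpy(buffer + fragsz, ptr, len);
		fragsz += len;
		ptr += len;
		sz -= len;
		if (fragsz == DEV_BSIZE)
			write_block();
	}
}

static void buf_flush(void)
{
	if (fragsz != 0)
		write_block();
}

int main(void)
{
	char junk[300];

	memset(junk, 0xab, sizeof(junk));
	buf_write(junk, sizeof(junk));	/* 300 bytes: still buffered */
	buf_write(junk, sizeof(junk));	/* 600 total: one full block written */
	buf_flush();			/* remaining 88 bytes flushed as a final block */
	return (0);
}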
*/ -static char buffer[DEV_BSIZE]; -static size_t fragsz; - -/* 20 phys_avail entry pairs correspond to 10 md_pa's */ -static struct md_pa dump_map[10]; - -static void -md_pa_init(void) -{ - int n, idx; - - bzero(dump_map, sizeof(dump_map)); - for (n = 0; n < sizeof(dump_map) / sizeof(dump_map[0]); n++) { - idx = n * 2; - if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0) - break; - dump_map[n].md_start = dump_avail[idx]; - dump_map[n].md_size = dump_avail[idx + 1] - dump_avail[idx]; - } -} - -static struct md_pa * -md_pa_first(void) -{ - - return (&dump_map[0]); -} - -static struct md_pa * -md_pa_next(struct md_pa *mdp) -{ - - mdp++; - if (mdp->md_size == 0) - mdp = NULL; - return (mdp); -} - -static int -buf_write(struct dumperinfo *di, char *ptr, size_t sz) -{ - size_t len; - int error; - - while (sz) { - len = DEV_BSIZE - fragsz; - if (len > sz) - len = sz; - bcopy(ptr, buffer + fragsz, len); - fragsz += len; - ptr += len; - sz -= len; - if (fragsz == DEV_BSIZE) { - error = dump_write(di, buffer, 0, dumplo, - DEV_BSIZE); - if (error) - return error; - dumplo += DEV_BSIZE; - fragsz = 0; - } - } - - return (0); -} - -static int -buf_flush(struct dumperinfo *di) -{ - int error; - - if (fragsz == 0) - return (0); - - error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE); - dumplo += DEV_BSIZE; - fragsz = 0; - return (error); -} - -#define PG2MB(pgs) ((pgs + (1 << 8) - 1) >> 8) - -static int -cb_dumpdata(struct md_pa *mdp, int seqnr, void *arg) -{ - struct dumperinfo *di = (struct dumperinfo*)arg; - vm_paddr_t a, pa; - void *va; - uint64_t pgs; - size_t counter, sz, chunk; - int i, c, error, twiddle; - u_int maxdumppgs; - - error = 0; /* catch case in which chunk size is 0 */ - counter = 0; /* Update twiddle every 16MB */ - twiddle = 0; - va = 0; - pgs = mdp->md_size / PAGE_SIZE; - pa = mdp->md_start; - maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS); - if (maxdumppgs == 0) /* seatbelt */ - maxdumppgs = 1; - - printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs), - (uintmax_t)pgs); - - while (pgs) { - chunk = pgs; - if (chunk > maxdumppgs) - chunk = maxdumppgs; - sz = chunk << PAGE_SHIFT; - counter += sz; - if (counter >> 24) { - printf(" %ju", (uintmax_t)PG2MB(pgs)); - counter &= (1<<24) - 1; - } - for (i = 0; i < chunk; i++) { - a = pa + i * PAGE_SIZE; - va = pmap_kenter_temporary(trunc_page(a), i); - } - - wdog_kern_pat(WD_LASTVAL); - - error = dump_write(di, va, 0, dumplo, sz); - if (error) - break; - dumplo += sz; - pgs -= chunk; - pa += sz; - - /* Check for user abort. */ - c = cncheckc(); - if (c == 0x03) - return (ECANCELED); - if (c != -1) - printf(" (CTRL-C to abort) "); - } - printf(" ... %s\n", (error) ? 
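
buf_write() and buf_flush() above implement simple sector staging: callers hand in arbitrary-sized chunks, full DEV_BSIZE blocks are pushed to the dump device, and the partial tail goes out in the final flush. A minimal sketch of the same pattern; write_block() is a hypothetical stand-in for dump_write() and the kernel's dumplo bookkeeping is omitted:

    #include <stddef.h>
    #include <string.h>

    #define BLKSZ   512

    static char     blkbuf[BLKSZ];
    static size_t   fill;

    static int
    write_block(const char *buf)
    {
            /* Stand-in for dump_write(); a real consumer would persist buf. */
            (void)buf;
            return (0);
    }

    static int
    buffered_write(const char *p, size_t sz)
    {
            size_t len;
            int error;

            while (sz > 0) {
                    len = BLKSZ - fill;
                    if (len > sz)
                            len = sz;
                    memcpy(blkbuf + fill, p, len);
                    fill += len;
                    p += len;
                    sz -= len;
                    if (fill == BLKSZ) {            /* full block: push it out */
                            error = write_block(blkbuf);
                            if (error)
                                    return (error);
                            fill = 0;
                    }
            }
            return (0);
    }

    static int
    buffered_flush(void)
    {
            int error;

            if (fill == 0)
                    return (0);
            error = write_block(blkbuf);    /* whole block, like buf_flush() */
            fill = 0;
            return (error);
    }

    int
    main(void)
    {
            const char msg[] = "dump payload";
            int i;

            for (i = 0; i < 100; i++)
                    if (buffered_write(msg, sizeof(msg) - 1) != 0)
                            return (1);
            return (buffered_flush());
    }
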
"fail" : "ok"); - return (error); -} - -static int -cb_dumphdr(struct md_pa *mdp, int seqnr, void *arg) -{ - struct dumperinfo *di = (struct dumperinfo*)arg; - Elf_Phdr phdr; - uint64_t size; - int error; - - size = mdp->md_size; - bzero(&phdr, sizeof(phdr)); - phdr.p_type = PT_LOAD; - phdr.p_flags = PF_R; /* XXX */ - phdr.p_offset = fileofs; - phdr.p_vaddr = mdp->md_start; - phdr.p_paddr = mdp->md_start; - phdr.p_filesz = size; - phdr.p_memsz = size; - phdr.p_align = PAGE_SIZE; - - error = buf_write(di, (char*)&phdr, sizeof(phdr)); - fileofs += phdr.p_filesz; - return (error); -} - -static int -cb_size(struct md_pa *mdp, int seqnr, void *arg) -{ - uint64_t *sz = (uint64_t*)arg; - - *sz += (uint64_t)mdp->md_size; - return (0); -} - -static int -foreach_chunk(callback_t cb, void *arg) -{ - struct md_pa *mdp; - int error, seqnr; - - seqnr = 0; - mdp = md_pa_first(); - while (mdp != NULL) { - error = (*cb)(mdp, seqnr++, arg); - if (error) - return (-error); - mdp = md_pa_next(mdp); - } - return (seqnr); -} - void -dumpsys(struct dumperinfo *di) +dumpsys_map_chunk(vm_paddr_t pa, size_t chunk, void **va) { - Elf_Ehdr ehdr; - uint64_t dumpsize; - off_t hdrgap; - size_t hdrsz; - int error; + int i; + vm_paddr_t a; - if (do_minidump) { - minidumpsys(di); - return; + for (i = 0; i < chunk; i++) { + a = pa + i * PAGE_SIZE; + *va = pmap_kenter_temporary(trunc_page(a), i); } - bzero(&ehdr, sizeof(ehdr)); - ehdr.e_ident[EI_MAG0] = ELFMAG0; - ehdr.e_ident[EI_MAG1] = ELFMAG1; - ehdr.e_ident[EI_MAG2] = ELFMAG2; - ehdr.e_ident[EI_MAG3] = ELFMAG3; - ehdr.e_ident[EI_CLASS] = ELF_CLASS; -#if BYTE_ORDER == LITTLE_ENDIAN - ehdr.e_ident[EI_DATA] = ELFDATA2LSB; -#else - ehdr.e_ident[EI_DATA] = ELFDATA2MSB; -#endif - ehdr.e_ident[EI_VERSION] = EV_CURRENT; - ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */ - ehdr.e_type = ET_CORE; - ehdr.e_machine = EM_VALUE; - ehdr.e_phoff = sizeof(ehdr); - ehdr.e_flags = 0; - ehdr.e_ehsize = sizeof(ehdr); - ehdr.e_phentsize = sizeof(Elf_Phdr); - ehdr.e_shentsize = sizeof(Elf_Shdr); - - md_pa_init(); - - /* Calculate dump size. */ - dumpsize = 0L; - ehdr.e_phnum = foreach_chunk(cb_size, &dumpsize); - hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize; - fileofs = MD_ALIGN(hdrsz); - dumpsize += fileofs; - hdrgap = fileofs - DEV_ALIGN(hdrsz); - - /* Determine dump offset on device. */ - if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) { - error = ENOSPC; - goto fail; - } - dumplo = di->mediaoffset + di->mediasize - dumpsize; - dumplo -= sizeof(kdh) * 2; - - mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_VERSION, dumpsize, - di->blocksize); - - printf("Dumping %llu MB (%d chunks)\n", (long long)dumpsize >> 20, - ehdr.e_phnum); - - /* Dump leader */ - error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); - if (error) - goto fail; - dumplo += sizeof(kdh); - - /* Dump ELF header */ - error = buf_write(di, (char*)&ehdr, sizeof(ehdr)); - if (error) - goto fail; - - /* Dump program headers */ - error = foreach_chunk(cb_dumphdr, di); - if (error < 0) - goto fail; - buf_flush(di); - - /* - * All headers are written using blocked I/O, so we know the - * current offset is (still) block aligned. Skip the alignement - * in the file to have the segment contents aligned at page - * boundary. We cannot use MD_ALIGN on dumplo, because we don't - * care and may very well be unaligned within the dump device. 
- */ - dumplo += hdrgap; - - /* Dump memory chunks (updates dumplo) */ - error = foreach_chunk(cb_dumpdata, di); - if (error < 0) - goto fail; - - /* Dump trailer */ - error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); - if (error) - goto fail; - - /* Signal completion, signoff and exit stage left. */ - dump_write(di, NULL, 0, 0, 0); - printf("\nDump complete\n"); - return; - - fail: - if (error < 0) - error = -error; - - if (error == ECANCELED) - printf("\nDump aborted\n"); - else if (error == ENOSPC) - printf("\nDump failed. Partition too small.\n"); - else - printf("\n** DUMP FAILED (ERROR %d) **\n", error); } Modified: trunk/sys/x86/x86/fdt_machdep.c =================================================================== --- trunk/sys/x86/x86/fdt_machdep.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/fdt_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/fdt_machdep.c 250840 2013-05-21 03:05:49Z marcel $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/fdt_machdep.c 287000 2015-08-21 15:57:57Z royger $"); #include "opt_platform.h" @@ -55,7 +55,7 @@ mdp = preload_search_by_type("elf kernel"); if (mdp == NULL) mdp = preload_search_by_type("elf32 kernel"); - dtbp = (mdp != NULL) ? MD_FETCH(mdp, MODINFOMD_DTBP, void *) : NULL; + dtbp = MD_FETCH(mdp, MODINFOMD_DTBP, void *); #if defined(FDT_DTB_STATIC) /* Modified: trunk/sys/x86/x86/identcpu.c =================================================================== --- trunk/sys/x86/x86/identcpu.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/identcpu.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -40,7 +40,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/identcpu.c 332743 2018-04-19 00:11:02Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/identcpu.c 354658 2019-11-12 19:35:46Z scottl $"); #include "opt_cpu.h" @@ -84,9 +84,46 @@ static void print_via_padlock_info(void); static void print_vmx_info(void); +#ifdef __i386__ +int cpu; /* Are we 386, 386sx, 486, etc? 
*/ int cpu_class; +#endif +u_int cpu_feature; /* Feature flags */ +u_int cpu_feature2; /* Feature flags */ +u_int amd_feature; /* AMD feature flags */ +u_int amd_feature2; /* AMD feature flags */ +u_int amd_pminfo; /* AMD advanced power management info */ +u_int amd_extended_feature_extensions; +u_int via_feature_rng; /* VIA RNG features */ +u_int via_feature_xcrypt; /* VIA ACE features */ +u_int cpu_high; /* Highest arg to CPUID */ +u_int cpu_exthigh; /* Highest arg to extended CPUID */ +u_int cpu_id; /* Stepping ID */ +u_int cpu_procinfo; /* HyperThreading Info / Brand Index / CLFUSH */ +u_int cpu_procinfo2; /* Multicore info */ +char cpu_vendor[20]; /* CPU Origin code */ +u_int cpu_vendor_id; /* CPU vendor ID */ +u_int cpu_fxsr; /* SSE enabled */ +u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */ +u_int cpu_clflush_line_size = 32; +u_int cpu_stdext_feature; /* %ebx */ +u_int cpu_stdext_feature2; /* %ecx */ +u_int cpu_stdext_feature3; /* %edx */ +uint64_t cpu_ia32_arch_caps; +u_int cpu_max_ext_state_size; +u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */ +u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */ +u_int cpu_mon_max_size; /* MONITOR minimum range size, bytes */ +u_int cpu_maxphyaddr; /* Max phys addr width in bits */ char machine[] = MACHINE; +SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD, + &via_feature_rng, 0, + "VIA RNG feature available in CPU"); +SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD, + &via_feature_xcrypt, 0, + "VIA xcrypt feature available in CPU"); + #ifdef __amd64__ #ifdef SCTL_MASK32 extern int adaptive_machine_arch; @@ -109,8 +146,8 @@ return (error); } -SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD, - NULL, 0, sysctl_hw_machine, "A", "Machine class"); +SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine, "A", "Machine class"); #else SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "Machine class"); @@ -117,7 +154,7 @@ #endif static char cpu_model[128]; -SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, +SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD | CTLFLAG_MPSAFE, cpu_model, 0, "Machine model"); static int hw_clockrate; @@ -126,8 +163,8 @@ u_int hv_high; char hv_vendor[16]; -SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD, hv_vendor, 0, - "Hypervisor vendor"); +SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD | CTLFLAG_MPSAFE, hv_vendor, + 0, "Hypervisor vendor"); static eventhandler_tag tsc_post_tag; @@ -147,13 +184,11 @@ NULL, "Intel Pentium 4" }; -#endif static struct { char *cpu_name; int cpu_class; } cpus[] = { -#ifdef __i386__ { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ @@ -171,11 +206,8 @@ { "Pentium II", CPUCLASS_686 }, /* CPU_PII */ { "Pentium III", CPUCLASS_686 }, /* CPU_PIII */ { "Pentium 4", CPUCLASS_686 }, /* CPU_P4 */ -#else - { "Clawhammer", CPUCLASS_K8 }, /* CPU_CLAWHAMMER */ - { "Sledgehammer", CPUCLASS_K8 }, /* CPU_SLEDGEHAMMER */ +}; #endif -}; static struct { char *vendor; @@ -205,9 +237,13 @@ u_int regs[4], i; char *brand; + printf("CPU: "); +#ifdef __i386__ cpu_class = cpus[cpu].cpu_class; - printf("CPU: "); strncpy(cpu_model, cpus[cpu].cpu_name, sizeof (cpu_model)); +#else + strncpy(cpu_model, "Hammer", sizeof (cpu_model)); +#endif /* Check for extended CPUID information and a processor name. 
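
The cpu_vendor, cpu_feature and cpu_feature2 globals declared above are filled from CPUID leaf 0 (vendor string in EBX, EDX, ECX) and leaf 1 (feature words in EDX and ECX). Roughly the same data can be read from userland with the GCC/Clang <cpuid.h> helpers; this sketch is for orientation only and is not part of the commit:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char vendor[13];

            if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                    return (1);
            /* Vendor string is the concatenation of EBX, EDX, ECX. */
            memcpy(vendor + 0, &ebx, 4);
            memcpy(vendor + 4, &edx, 4);
            memcpy(vendor + 8, &ecx, 4);
            vendor[12] = '\0';

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return (1);
            /* EDX/ECX correspond to the kernel's cpu_feature/cpu_feature2. */
            printf("%s features=%#x features2=%#x\n", vendor, edx, ecx);
            return (0);
    }
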
*/ if (cpu_exthigh >= 0x80000004) { @@ -660,8 +696,8 @@ (intmax_t)(tsc_freq + 4999) / 1000000, (u_int)((tsc_freq + 4999) / 10000) % 100); } +#ifdef __i386__ switch(cpu_class) { -#ifdef __i386__ case CPUCLASS_286: printf("286"); break; @@ -683,14 +719,12 @@ printf("686"); break; #endif -#else - case CPUCLASS_K8: - printf("K8"); - break; -#endif default: printf("Unknown"); /* will panic below... */ } +#else + printf("K8"); +#endif printf("-class CPU)\n"); if (*cpu_vendor) printf(" Origin=\"%s\"", cpu_vendor); @@ -914,6 +948,7 @@ "\020PQE" /* AVX512 Foundation */ "\021AVX512F" + "\022AVX512DQ" /* Enhanced NRBG */ "\023RDSEED" /* ADCX + ADOX */ @@ -920,12 +955,17 @@ "\024ADX" /* Supervisor Mode Access Prevention */ "\025SMAP" + "\026AVX512IFMA" + "\027PCOMMIT" "\030CLFLUSHOPT" + "\031CLWB" "\032PROCTRACE" "\033AVX512PF" "\034AVX512ER" "\035AVX512CD" "\036SHA" + "\037AVX512BW" + "\040AVX512VL" ); } @@ -934,14 +974,35 @@ cpu_stdext_feature2, "\020" "\001PREFETCHWT1" + "\002AVX512VBMI" "\003UMIP" "\004PKU" "\005OSPKE" + "\006WAITPKG" + "\011GFNI" "\027RDPID" + "\032CLDEMOTE" + "\034MOVDIRI" + "\035MOVDIRI64B" "\037SGXLC" ); } + if (cpu_stdext_feature3 != 0) { + printf("\n Structured Extended Features3=0x%b", + cpu_stdext_feature3, + "\020" + "\013MD_CLEAR" + "\016TSXFA" + "\033IBPB" + "\034STIBP" + "\035L1DFL" + "\036ARCH_CAP" + "\037CORE_CAP" + "\040SSBD" + ); + } + if ((cpu_feature2 & CPUID2_XSAVE) != 0) { cpuid_count(0xd, 0x1, regs); if (regs[0] != 0) { @@ -955,6 +1016,31 @@ } } + if (cpu_ia32_arch_caps != 0) { + printf("\n IA32_ARCH_CAPS=0x%b", + (u_int)cpu_ia32_arch_caps, + "\020" + "\001RDCL_NO" + "\002IBRS_ALL" + "\003RSBA" + "\004SKIP_L1DFL_VME" + "\005SSB_NO" + "\006MDS_NO" + "\010TSX_CTRL" + "\011TAA_NO" + ); + } + + if (amd_extended_feature_extensions != 0) { + printf("\n " + "AMD Extended Feature Extensions ID EBX=" + "0x%b", amd_extended_feature_extensions, + "\020" + "\001CLZERO" + "\002IRPerf" + "\003XSaveErPtr"); + } + if (via_feature_rng != 0 || via_feature_xcrypt != 0) print_via_padlock_info(); @@ -1008,11 +1094,11 @@ print_hypervisor_info(); } +#ifdef __i386__ void panicifcpuunsupported(void) { -#ifdef __i386__ #if !defined(lint) #if !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs @@ -1019,17 +1105,11 @@ #endif #else /* lint */ #endif /* lint */ -#else /* __amd64__ */ -#ifndef HAMMER -#error "You need to specify a cpu type" -#endif -#endif /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. 
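
The long "\020..." strings above drive the kernel printf's %b conversion: the first byte is the numeric base used to print the raw value (\020 is 16, so hex) and each following \NNN byte is a 1-based bit position followed by that bit's name. Outside the kernel the same decoding is an ordinary table walk; the table below covers only a handful of leaf-7 EBX bits and is purely illustrative:

    #include <stdio.h>
    #include <stdint.h>

    /* A few CPUID.(7,0):EBX bits, for illustration only (not the full set). */
    static const struct {
            int             bit;    /* 0-based bit position */
            const char      *name;
    } stdext_bits[] = {
            { 0,  "FSGSBASE" },
            { 7,  "SMEP" },
            { 16, "AVX512F" },
            { 18, "RDSEED" },
            { 20, "SMAP" },
            { 29, "SHA" },
    };

    static void
    print_features(uint32_t val)
    {
            size_t i;

            printf("0x%08x <", val);
            for (i = 0; i < sizeof(stdext_bits) / sizeof(stdext_bits[0]); i++)
                    if (val & (1u << stdext_bits[i].bit))
                            printf(" %s", stdext_bits[i].name);
            printf(" >\n");
    }

    int
    main(void)
    {
            print_features((1u << 16) | (1u << 20));        /* AVX512F + SMAP */
            return (0);
    }
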
*/ switch (cpu_class) { -#ifdef __i386__ case CPUCLASS_286: /* a 286 should not make it this far, anyway */ case CPUCLASS_386: #if !defined(I486_CPU) @@ -1041,12 +1121,6 @@ #if !defined(I686_CPU) case CPUCLASS_686: #endif -#else /* __amd64__ */ - case CPUCLASS_X86: -#ifndef HAMMER - case CPUCLASS_K8: -#endif -#endif panic("CPU class not configured"); default: break; @@ -1053,7 +1127,6 @@ } } -#ifdef __i386__ static volatile u_int trap_by_rdmsr; /* @@ -1210,7 +1283,6 @@ SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL); -#ifndef XEN static const char *const vm_bnames[] = { "QEMU", /* QEMU */ "Plex86", /* Plex86 */ @@ -1270,6 +1342,10 @@ vm_guest = VM_GUEST_VMWARE; else if (strcmp(hv_vendor, "Microsoft Hv") == 0) vm_guest = VM_GUEST_HV; + else if (strcmp(hv_vendor, "KVMKVMKVM") == 0) + vm_guest = VM_GUEST_KVM; + else if (strcmp(hv_vendor, "bhyve bhyve") == 0) + vm_guest = VM_GUEST_BHYVE; } return; } @@ -1277,7 +1353,7 @@ /* * Examine SMBIOS strings for older hypervisors. */ - p = getenv("smbios.system.serial"); + p = kern_getenv("smbios.system.serial"); if (p != NULL) { if (strncmp(p, "VMware-", 7) == 0 || strncmp(p, "VMW", 3) == 0) { vmware_hvcall(VMW_HVCMD_GETVERSION, regs); @@ -1294,7 +1370,7 @@ * XXX: Some of these entries may not be needed since they were * added to FreeBSD before the checks above. */ - p = getenv("smbios.bios.vendor"); + p = kern_getenv("smbios.bios.vendor"); if (p != NULL) { for (i = 0; vm_bnames[i] != NULL; i++) if (strcmp(p, vm_bnames[i]) == 0) { @@ -1304,7 +1380,7 @@ } freeenv(p); } - p = getenv("smbios.system.product"); + p = kern_getenv("smbios.system.product"); if (p != NULL) { for (i = 0; vm_pnames[i] != NULL; i++) if (strcmp(p, vm_pnames[i]) == 0) { @@ -1315,7 +1391,6 @@ freeenv(p); } } -#endif bool fix_cpuid(void) @@ -1360,9 +1435,8 @@ return (false); } -#ifdef __amd64__ void -identify_cpu(void) +identify_cpu1(void) { u_int regs[4]; @@ -1379,8 +1453,34 @@ cpu_feature = regs[3]; cpu_feature2 = regs[2]; } -#endif +void +identify_cpu2(void) +{ + u_int regs[4], cpu_stdext_disable; + + if (cpu_high >= 7) { + cpuid_count(7, 0, regs); + cpu_stdext_feature = regs[1]; + + /* + * Some hypervisors failed to filter out unsupported + * extended features. Allow to disable the + * extensions, activation of which requires setting a + * bit in CR4, and which VM monitors do not support. + */ + cpu_stdext_disable = 0; + TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable); + cpu_stdext_feature &= ~cpu_stdext_disable; + + cpu_stdext_feature2 = regs[2]; + cpu_stdext_feature3 = regs[3]; + + if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0) + cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP); + } +} + /* * Final stage of CPU identification. */ @@ -1387,7 +1487,7 @@ void finishidentcpu(void) { - u_int regs[4], cpu_stdext_disable; + u_int regs[4]; #ifdef __i386__ u_char ccr3; #endif @@ -1406,26 +1506,8 @@ cpu_mon_max_size = regs[1] & CPUID5_MON_MAX_SIZE; } - if (cpu_high >= 7) { - cpuid_count(7, 0, regs); - cpu_stdext_feature = regs[1]; + identify_cpu2(); - /* - * Some hypervisors fail to filter out unsupported - * extended features. For now, disable the - * extensions, activation of which requires setting a - * bit in CR4, and which VM monitors do not support. 
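
identify_hypervisor() keys off CPUID leaf 0x40000000, where a hypervisor reports a 12-byte signature in EBX, ECX, EDX ("KVMKVMKVM", "bhyve bhyve", "Microsoft Hv", and so on), which is exactly what the new KVM and bhyve cases above compare against. A userland sketch of the probe; the leaf is only meaningful when the hypervisor-present bit (leaf 1, ECX bit 31) is set, and __cpuid/__get_cpuid are GCC/Clang compiler helpers, not kernel interfaces:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char sig[13];

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) ||
                (ecx & (1u << 31)) == 0) {
                    printf("no hypervisor reported\n");
                    return (0);
            }
            __cpuid(0x40000000, eax, ebx, ecx, edx);
            memcpy(sig + 0, &ebx, 4);
            memcpy(sig + 4, &ecx, 4);
            memcpy(sig + 8, &edx, 4);
            sig[12] = '\0';
            printf("hypervisor signature: \"%s\"\n", sig);
            return (0);
    }
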
- */ - if (cpu_feature2 & CPUID2_HV) { - cpu_stdext_disable = CPUID_STDEXT_FSGSBASE | - CPUID_STDEXT_SMEP; - } else - cpu_stdext_disable = 0; - TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable); - cpu_stdext_feature &= ~cpu_stdext_disable; - cpu_stdext_feature2 = regs[2]; - } - #ifdef __i386__ if (cpu_high > 0 && (cpu_vendor_id == CPU_VENDOR_INTEL || @@ -1457,6 +1539,7 @@ if (cpu_exthigh >= 0x80000008) { do_cpuid(0x80000008, regs); cpu_maxphyaddr = regs[0] & 0xff; + amd_extended_feature_extensions = regs[1]; cpu_procinfo2 = regs[2]; } else { cpu_maxphyaddr = (cpu_feature & CPUID_PAE) != 0 ? 36 : 32; @@ -1550,18 +1633,26 @@ return; } } -#else - /* XXX */ - cpu = CPU_CLAWHAMMER; #endif } +int +pti_get_default(void) +{ + + if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0) + return (0); + if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0) + return (0); + return (1); +} + static u_int find_cpu_vendor_id(void) { int i; - for (i = 0; i < sizeof(cpu_vendors) / sizeof(cpu_vendors[0]); i++) + for (i = 0; i < nitems(cpu_vendors); i++) if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0) return (cpu_vendors[i].vendor_id); return (0); Modified: trunk/sys/x86/x86/intr_machdep.c =================================================================== --- trunk/sys/x86/x86/intr_machdep.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/intr_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/x86/intr_machdep.c 307244 2016-10-14 02:03:53Z sephe $ + * $FreeBSD: stable/11/sys/x86/x86/intr_machdep.c 340016 2018-11-01 18:34:26Z jhb $ */ /* @@ -37,6 +37,7 @@ #include "opt_atpic.h" #include "opt_ddb.h" +#include "opt_smp.h" #include <sys/param.h> #include <sys/bus.h> @@ -44,6 +45,7 @@ #include <sys/ktr.h> #include <sys/kernel.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/smp.h> @@ -50,6 +52,7 @@ #include <sys/sx.h> #include <sys/syslog.h> #include <sys/systm.h> +#include <sys/vmmeter.h> #include <machine/clock.h> #include <machine/intr_machdep.h> #include <machine/smp.h> @@ -65,7 +68,7 @@ #ifdef PC98 #include <pc98/cbus/cbus.h> #else -#include <x86/isa/isa.h> +#include <isa/isareg.h> #endif #endif @@ -74,22 +77,26 @@ typedef void (*mask_fn)(void *); static int intrcnt_index; -static struct intsrc *interrupt_sources[NUM_IO_INTS]; +static struct intsrc **interrupt_sources; static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; +u_int num_io_irqs; -#ifdef SMP +#if defined(SMP) && !defined(EARLY_AP_STARTUP) static int assign_cpu; #endif -u_long intrcnt[INTRCNT_COUNT]; -char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)]; +u_long *intrcnt; +char *intrnames; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); +int nintrcnt; -static int intr_assign_cpu(void *arg, u_char cpu); +static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources"); + +static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); @@ -97,6 +104,18 @@ static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); +/* + * SYSINIT levels for SI_SUB_INTR: + * + * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init + * SI_ORDER_SECOND: Xen PICs + * SI_ORDER_THIRD: Add I/O APIC PICs, alloc 
MSI and Xen IRQ ranges + * SI_ORDER_FOURTH: Add 8259A PICs + * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources + * SI_ORDER_MIDDLE: SMP interrupt counters + * SI_ORDER_ANY: Enable interrupts on BSP + */ + static int intr_pic_registered(struct pic *pic) { @@ -132,6 +151,56 @@ } /* + * Allocate interrupt source arrays and register interrupt sources + * once the number of interrupts is known. + */ +static void +intr_init_sources(void *arg) +{ + struct pic *pic; + + MPASS(num_io_irqs > 0); + + interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources), + M_INTR, M_WAITOK | M_ZERO); + + /* + * - 1 ??? dummy counter. + * - 2 counters for each I/O interrupt. + * - 1 counter for each CPU for lapic timer. + * - 1 counter for each CPU for the Hyper-V vmbus driver. + * - 8 counters for each CPU for IPI counters for SMP. + */ + nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2; +#ifdef COUNT_IPIS + if (mp_ncpus > 1) + nintrcnt += 8 * mp_ncpus; +#endif + intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK | + M_ZERO); + intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK | + M_ZERO); + sintrcnt = nintrcnt * sizeof(u_long); + sintrnames = nintrcnt * (MAXCOMLEN + 1); + + intrcnt_setname("???", 0); + intrcnt_index = 1; + + /* + * NB: intrpic_lock is not held here to avoid LORs due to + * malloc() in intr_register_source(). However, we are still + * single-threaded at this point in startup so the list of + * PICs shouldn't change. + */ + TAILQ_FOREACH(pic, &pics, pics) { + if (pic->pic_register_sources != NULL) + pic->pic_register_sources(pic); + } +} +SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources, + NULL); + +/* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. @@ -143,6 +212,8 @@ KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); + KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector, + num_io_irqs)); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, @@ -168,6 +239,8 @@ intr_lookup_source(int vector) { + if (vector < 0 || vector >= num_io_irqs) + return (NULL); return (interrupt_sources[vector]); } @@ -308,17 +381,24 @@ } static int -intr_assign_cpu(void *arg, u_char cpu) +intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; +#ifdef EARLY_AP_STARTUP + MPASS(mp_ncpus == 1 || smp_started); + + /* Nothing to do if there is only a single CPU. */ + if (mp_ncpus > 1 && cpu != NOCPU) { +#else /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. 
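
The sizing in intr_init_sources() is straightforward once num_io_irqs is final: one slack entry, two counters per I/O IRQ (the handler and its stray counter), two per CPU (LAPIC timer and Hyper-V vmbus), and eight more per CPU for IPIs when COUNT_IPIS is enabled. Worked through for a made-up machine with num_io_irqs = 64 and mp_ncpus = 4:

    #include <stdio.h>

    int
    main(void)
    {
            unsigned int num_io_irqs = 64, mp_ncpus = 4, nintrcnt;

            nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;  /* 1 + 128 + 8 = 137 */
    #ifdef COUNT_IPIS
            if (mp_ncpus > 1)
                    nintrcnt += 8 * mp_ncpus;               /* + 32 = 169 */
    #endif
            printf("nintrcnt = %u\n", nintrcnt);
            return (0);
    }
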
*/ if (assign_cpu && cpu != NOCPU) { +#endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); @@ -353,6 +433,7 @@ KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); + MPASS(intrcnt_index + 2 <= nintrcnt); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", @@ -369,6 +450,7 @@ { mtx_lock_spin(&intrcnt_lock); + MPASS(intrcnt_index < nintrcnt); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; @@ -379,8 +461,6 @@ intr_init(void *dummy __unused) { - intrcnt_setname("???", 0); - intrcnt_index = 1; TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); @@ -388,6 +468,21 @@ } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); +static void +intr_init_final(void *dummy __unused) +{ + + /* + * Enable interrupts on the BSP after all of the interrupt + * controllers are initialized. Device interrupts are still + * disabled in the interrupt controllers until interrupt + * handlers are registered. Interrupts are enabled on each AP + * after their first context switch. + */ + enable_intr(); +} +SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); + #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. */ void @@ -427,6 +522,23 @@ return (0); } +void +intr_reprogram(void) +{ + struct intsrc *is; + u_int v; + + sx_xlock(&intrsrc_lock); + for (v = 0; v < num_io_irqs; v++) { + is = interrupt_sources[v]; + if (is == NULL) + continue; + if (is->is_pic->pic_reprogram_pin != NULL) + is->is_pic->pic_reprogram_pin(is); + } + sx_xunlock(&intrsrc_lock); +} + #ifdef DDB /* * Dump data about interrupt handlers @@ -434,7 +546,8 @@ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; - int i, verbose; + u_int i; + int verbose; if (strcmp(modif, "v") == 0) verbose = 1; @@ -441,7 +554,7 @@ else verbose = 0; isrc = interrupt_sources; - for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++) + for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } @@ -453,7 +566,7 @@ * allocate CPUs round-robin. */ -static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); +cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu; /* @@ -465,9 +578,15 @@ { u_int apic_id; +#ifdef EARLY_AP_STARTUP + MPASS(mp_ncpus == 1 || smp_started); + if (mp_ncpus == 1) + return (PCPU_GET(apic_id)); +#else /* Leave all interrupts on the BSP during boot. */ if (!assign_cpu) return (PCPU_GET(apic_id)); +#endif mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu]; @@ -509,6 +628,7 @@ CPU_SET(cpu, &intr_cpus); } +#ifndef EARLY_AP_STARTUP /* * Distribute all the interrupt sources among the available CPUs once the * AP's have been launched. @@ -517,15 +637,8 @@ intr_shuffle_irqs(void *arg __unused) { struct intsrc *isrc; - int i; + u_int i; -#ifdef XEN - /* - * Doesn't work yet - */ - return; -#endif - /* Don't bother on UP. */ if (mp_ncpus == 1) return; @@ -533,7 +646,7 @@ /* Round-robin assign a CPU to each enabled source. 
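
The round-robin mentioned in the comment above is implemented by intr_next_cpu(), which hands out APIC IDs from the intr_cpus set in rotating order. Reduced to its essentials over a plain array of example APIC IDs, with the cpuset walk and icu_lock left out:

    #include <stdio.h>
    #include <stddef.h>

    static const unsigned int cpu_apic_ids[] = { 0, 2, 4, 6 };  /* example IDs */
    static size_t current_cpu;

    /* Return the APIC ID of the next CPU in round-robin order. */
    static unsigned int
    next_cpu_apic_id(void)
    {
            unsigned int id;

            id = cpu_apic_ids[current_cpu];
            current_cpu = (current_cpu + 1) %
                (sizeof(cpu_apic_ids) / sizeof(cpu_apic_ids[0]));
            return (id);
    }

    int
    main(void)
    {
            int i;

            for (i = 0; i < 6; i++)
                    printf("%u ", next_cpu_apic_id());
            printf("\n");           /* 0 2 4 6 0 2 */
            return (0);
    }
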
*/ sx_xlock(&intrsrc_lock); assign_cpu = 1; - for (i = 0; i < NUM_IO_INTS; i++) { + for (i = 0; i < num_io_irqs; i++) { isrc = interrupt_sources[i]; if (isrc != NULL && isrc->is_handlers > 0) { /* @@ -556,6 +669,7 @@ } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); +#endif #else /* * Always route interrupts to the current processor in the UP case. Modified: trunk/sys/x86/x86/io_apic.c =================================================================== --- trunk/sys/x86/x86/io_apic.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/io_apic.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,8 +26,9 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/io_apic.c 330959 2018-03-14 23:59:52Z marius $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/io_apic.c 340016 2018-11-01 18:34:26Z jhb $"); +#include "opt_acpi.h" #include "opt_isa.h" #include <sys/param.h> @@ -38,6 +39,7 @@ #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> +#include <sys/rman.h> #include <sys/sysctl.h> #include <dev/pci/pcireg.h> @@ -49,9 +51,10 @@ #include <x86/apicreg.h> #include <machine/frame.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> #include <machine/resource.h> #include <machine/segments.h> +#include <x86/iommu/iommu_intrmap.h> #define IOAPIC_ISA_INTS 16 #define IOAPIC_MEM_REGION 32 @@ -58,11 +61,6 @@ #define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) #define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) -#define IRQ_EXTINT (NUM_IO_INTS + 1) -#define IRQ_NMI (NUM_IO_INTS + 2) -#define IRQ_SMI (NUM_IO_INTS + 3) -#define IRQ_DISABLED (NUM_IO_INTS + 4) - static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures"); /* @@ -81,15 +79,16 @@ struct ioapic_intsrc { struct intsrc io_intsrc; - u_int io_irq; + int io_irq; u_int io_intpin:8; u_int io_vector:8; - u_int io_cpu:8; + u_int io_cpu; u_int io_activehi:1; u_int io_edgetrigger:1; u_int io_masked:1; int io_bus:4; uint32_t io_lowreg; + u_int io_remap_cookie; }; struct ioapic { @@ -98,9 +97,13 @@ u_int io_apic_id:4; u_int io_intbase:8; /* System Interrupt base */ u_int io_numintr:8; + u_int io_haseoi:1; volatile ioapic_t *io_addr; /* XXX: should use bus_space */ vm_paddr_t io_paddr; STAILQ_ENTRY(ioapic) io_next; + device_t pci_dev; /* matched pci device, if found */ + struct resource *pci_wnd; /* BAR 0, should be same or alias to + io_paddr */ struct ioapic_intsrc io_pins[0]; }; @@ -108,6 +111,7 @@ static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); static const char *ioapic_bus_string(int bus_type); static void ioapic_print_irq(struct ioapic_intsrc *intpin); +static void ioapic_register_sources(struct pic *pic); static void ioapic_enable_source(struct intsrc *isrc); static void ioapic_disable_source(struct intsrc *isrc, int eoi); static void ioapic_eoi_source(struct intsrc *isrc); @@ -120,27 +124,79 @@ static void ioapic_resume(struct pic *pic, bool suspend_cancelled); static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id); static void ioapic_program_intpin(struct ioapic_intsrc *intpin); +static void ioapic_reprogram_intpin(struct intsrc *isrc); static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); -struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source, - ioapic_eoi_source, ioapic_enable_intr, - ioapic_disable_intr, ioapic_vector, - ioapic_source_pending, NULL, ioapic_resume, - ioapic_config_intr, ioapic_assign_cpu }; +struct pic ioapic_template = { + 
.pic_register_sources = ioapic_register_sources, + .pic_enable_source = ioapic_enable_source, + .pic_disable_source = ioapic_disable_source, + .pic_eoi_source = ioapic_eoi_source, + .pic_enable_intr = ioapic_enable_intr, + .pic_disable_intr = ioapic_disable_intr, + .pic_vector = ioapic_vector, + .pic_source_pending = ioapic_source_pending, + .pic_suspend = NULL, + .pic_resume = ioapic_resume, + .pic_config_intr = ioapic_config_intr, + .pic_assign_cpu = ioapic_assign_cpu, + .pic_reprogram_pin = ioapic_reprogram_intpin, +}; -static int next_ioapic_base; +static u_int next_ioapic_base; static u_int next_id; -static SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options"); static int enable_extint; SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); -TUNABLE_INT("hw.apic.enable_extint", &enable_extint); -static __inline void -_ioapic_eoi_source(struct intsrc *isrc) +static void +_ioapic_eoi_source(struct intsrc *isrc, int locked) { + struct ioapic_intsrc *src; + struct ioapic *io; + volatile uint32_t *apic_eoi; + uint32_t low1; + lapic_eoi(); + if (!lapic_eoi_suppression) + return; + src = (struct ioapic_intsrc *)isrc; + if (src->io_edgetrigger) + return; + io = (struct ioapic *)isrc->is_pic; + + /* + * Handle targeted EOI for level-triggered pins, if broadcast + * EOI suppression is supported by LAPICs. + */ + if (io->io_haseoi) { + /* + * If IOAPIC has EOI Register, simply write vector + * number into the reg. + */ + apic_eoi = (volatile uint32_t *)((volatile char *) + io->io_addr + IOAPIC_EOIR); + *apic_eoi = src->io_vector; + } else { + /* + * Otherwise, if IO-APIC is too old to provide EOIR, + * do what Intel did for the Linux kernel. Temporary + * switch the pin to edge-trigger and back, masking + * the pin during the trick. + */ + if (!locked) + mtx_lock_spin(&icu_lock); + low1 = src->io_lowreg; + low1 &= ~IOART_TRGRLVL; + low1 |= IOART_TRGREDG | IOART_INTMSET; + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), + low1); + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), + src->io_lowreg); + if (!locked) + mtx_unlock_spin(&icu_lock); + } } static u_int @@ -195,7 +251,7 @@ printf("SMI"); break; default: - printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), + printf("%s IRQ %d", ioapic_bus_string(intpin->io_bus), intpin->io_irq); } } @@ -233,7 +289,7 @@ } if (eoi == PIC_EOI) - _ioapic_eoi_source(isrc); + _ioapic_eoi_source(isrc, 1); mtx_unlock_spin(&icu_lock); } @@ -242,7 +298,7 @@ ioapic_eoi_source(struct intsrc *isrc) { - _ioapic_eoi_source(isrc); + _ioapic_eoi_source(isrc, 0); } /* @@ -254,6 +310,9 @@ { struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; uint32_t low, high; +#ifdef ACPI_DMAR + int error; +#endif /* * If a pin is completely invalid or if it is valid but hasn't @@ -260,7 +319,7 @@ * been enabled yet, just ensure that the pin is masked. 
*/ mtx_assert(&icu_lock, MA_OWNED); - if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS && + if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq >= 0 && intpin->io_vector == 0)) { low = ioapic_read(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin)); @@ -268,9 +327,34 @@ ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low | IOART_INTMSET); +#ifdef ACPI_DMAR + mtx_unlock_spin(&icu_lock); + iommu_unmap_ioapic_intr(io->io_apic_id, + &intpin->io_remap_cookie); + mtx_lock_spin(&icu_lock); +#endif return; } +#ifdef ACPI_DMAR + mtx_unlock_spin(&icu_lock); + error = iommu_map_ioapic_intr(io->io_apic_id, + intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, + intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, + &high, &low); + mtx_lock_spin(&icu_lock); + if (error == 0) { + ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), + high); + intpin->io_lowreg = low; + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), + low); + return; + } else if (error != EOPNOTSUPP) { + return; + } +#endif + /* * Set the destination. Note that with Intel interrupt remapping, * the previously reserved bits 55:48 now have a purpose so ensure @@ -318,6 +402,15 @@ ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); } +static void +ioapic_reprogram_intpin(struct intsrc *isrc) +{ + + mtx_lock_spin(&icu_lock); + ioapic_program_intpin((struct ioapic_intsrc *)isrc); + mtx_unlock_spin(&icu_lock); +} + static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) { @@ -537,6 +630,8 @@ io = malloc(sizeof(struct ioapic) + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); io->io_pic = ioapic_template; + io->pci_dev = NULL; + io->pci_wnd = NULL; mtx_lock_spin(&icu_lock); io->io_id = next_id++; io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; @@ -557,11 +652,29 @@ io->io_id, intbase, next_ioapic_base); io->io_intbase = intbase; next_ioapic_base = intbase + numintr; + if (next_ioapic_base > num_io_irqs) + num_io_irqs = next_ioapic_base; io->io_numintr = numintr; io->io_addr = apic; io->io_paddr = addr; + if (bootverbose) { + printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id, + (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR) + >> MAXREDIRSHIFT); + } /* + * The summary information about IO-APIC versions is taken from + * the Linux kernel source: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * IO-APICs with version >= 0x20 have working EOIR register. + */ + io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20; + + /* * Initialize pins. Start off with interrupts disabled. Default * to active-hi and edge-triggered for ISA interrupts and active-lo * and level-triggered for all others. 
@@ -599,6 +712,15 @@ intpin->io_cpu = PCPU_GET(apic_id); value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); +#ifdef ACPI_DMAR + /* dummy, but sets cookie */ + mtx_unlock_spin(&icu_lock); + iommu_map_ioapic_intr(io->io_apic_id, + intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, + intpin->io_activehi, intpin->io_irq, + &intpin->io_remap_cookie, NULL, NULL); + mtx_lock_spin(&icu_lock); +#endif } mtx_unlock_spin(&icu_lock); @@ -640,7 +762,7 @@ io = (struct ioapic *)cookie; if (pin >= io->io_numintr || vector < 0) return (EINVAL); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); io->io_pins[pin].io_irq = vector; if (bootverbose) @@ -659,7 +781,7 @@ io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); if (io->io_pins[pin].io_bus == bus_type) return (0); @@ -680,7 +802,7 @@ return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_NMI) return (0); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_NMI; @@ -703,7 +825,7 @@ return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_SMI) return (0); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_SMI; @@ -726,7 +848,7 @@ return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_EXTINT) return (0); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_EXTINT; @@ -751,7 +873,7 @@ io = (struct ioapic *)cookie; if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) return (EINVAL); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); activehi = (pol == INTR_POLARITY_HIGH); if (io->io_pins[pin].io_activehi == activehi) @@ -772,7 +894,7 @@ io = (struct ioapic *)cookie; if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); - if (io->io_pins[pin].io_irq >= NUM_IO_INTS) + if (io->io_pins[pin].io_irq < 0) return (EINVAL); edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (io->io_pins[pin].io_edgetrigger == edgetrigger) @@ -808,14 +930,26 @@ /* * Reprogram pins to handle special case pins (such as NMI and - * SMI) and register valid pins as interrupt sources. + * SMI) and disable normal pins until a handler is registered. */ intr_register_pic(&io->io_pic); + for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) + ioapic_reprogram_intpin(&pin->io_intsrc); +} + +/* + * Add interrupt sources for I/O APIC interrupt pins. + */ +static void +ioapic_register_sources(struct pic *pic) +{ + struct ioapic_intsrc *pin; + struct ioapic *io; + int i; + + io = (struct ioapic *)pic; for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) { - mtx_lock_spin(&icu_lock); - ioapic_program_intpin(pin); - mtx_unlock_spin(&icu_lock); - if (pin->io_irq < NUM_IO_INTS) + if (pin->io_irq >= 0) intr_register_source(&pin->io_intsrc); } } @@ -846,7 +980,72 @@ static int ioapic_pci_attach(device_t dev) { + struct resource *res; + volatile ioapic_t *apic; + struct ioapic *io; + int rid; + u_int apic_id; + /* + * Try to match the enumerated ioapic. Match BAR start + * against io_paddr. 
Due to a fear that PCI window is not the + * same as the MADT reported io window, but an alias, read the + * APIC ID from the mapped BAR and match against it. + */ + rid = PCIR_BAR(0); + res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, + RF_ACTIVE | RF_SHAREABLE); + if (res == NULL) { + if (bootverbose) + device_printf(dev, "cannot activate BAR0\n"); + return (ENXIO); + } + apic = (volatile ioapic_t *)rman_get_virtual(res); + if (rman_get_size(res) < IOAPIC_WND_SIZE) { + if (bootverbose) + device_printf(dev, + "BAR0 too small (%jd) for IOAPIC window\n", + (uintmax_t)rman_get_size(res)); + goto fail; + } + mtx_lock_spin(&icu_lock); + apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; + /* First match by io window address */ + STAILQ_FOREACH(io, &ioapic_list, io_next) { + if (io->io_paddr == (vm_paddr_t)rman_get_start(res)) + goto found; + } + /* Then by apic id */ + STAILQ_FOREACH(io, &ioapic_list, io_next) { + if (io->io_apic_id == apic_id) + goto found; + } + mtx_unlock_spin(&icu_lock); + if (bootverbose) + device_printf(dev, + "cannot match pci bar apic id %d against MADT\n", + apic_id); +fail: + bus_release_resource(dev, SYS_RES_MEMORY, rid, res); + return (ENXIO); +found: + KASSERT(io->pci_dev == NULL, + ("ioapic %d pci_dev not NULL", io->io_id)); + KASSERT(io->pci_wnd == NULL, + ("ioapic %d pci_wnd not NULL", io->io_id)); + + io->pci_dev = dev; + io->pci_wnd = res; + if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) || + io->io_apic_id != apic_id)) { + device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d " + "MADT id %d paddr@%jx\n", + pci_get_domain(dev), pci_get_bus(dev), + pci_get_slot(dev), pci_get_function(dev), + (uintmax_t)rman_get_start(res), apic_id, + io->io_apic_id, (uintmax_t)io->io_paddr); + } + mtx_unlock_spin(&icu_lock); return (0); } @@ -863,6 +1062,28 @@ static devclass_t ioapic_devclass; DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0); +int +ioapic_get_rid(u_int apic_id, uint16_t *ridp) +{ + struct ioapic *io; + uintptr_t rid; + int error; + + mtx_lock_spin(&icu_lock); + STAILQ_FOREACH(io, &ioapic_list, io_next) { + if (io->io_apic_id == apic_id) + break; + } + mtx_unlock_spin(&icu_lock); + if (io == NULL || io->pci_dev == NULL) + return (EINVAL); + error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid); + if (error != 0) + return (error); + *ridp = rid; + return (0); +} + /* * A new-bus driver to consume the memory resources associated with * the APICs in the system. On some systems ACPI or PnPBIOS system @@ -896,19 +1117,11 @@ { int error; -#ifdef PAE - /* - * Resources use long's to track resources, so we can't - * include memory regions above 4GB. - */ - if (base >= ~0ul) - return; -#endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length); if (error) panic("apic_add_resource: resource %d failed set with %d", rid, error); - bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); + bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE); } static int @@ -918,7 +1131,7 @@ int i; /* Reserve the local APIC. 
*/ - apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t)); + apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION); i = 1; STAILQ_FOREACH(io, &ioapic_list, io_next) { apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION); Modified: trunk/sys/x86/x86/legacy.c =================================================================== --- trunk/sys/x86/x86/legacy.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/legacy.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -33,7 +33,7 @@ #include "opt_mca.h" #endif #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $"); /* * This code implements a system driver for legacy systems that do not Modified: trunk/sys/x86/x86/local_apic.c =================================================================== --- trunk/sys/x86/x86/local_apic.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/local_apic.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -33,11 +33,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/local_apic.c 314662 2017-03-04 12:04:24Z avg $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/local_apic.c 351757 2019-09-03 16:27:23Z emaste $"); #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" -#include "opt_kdtrace.h" #include "opt_ddb.h" @@ -51,6 +50,7 @@ #include <sys/proc.h> #include <sys/sched.h> #include <sys/smp.h> +#include <sys/sysctl.h> #include <sys/timeet.h> #include <vm/vm.h> @@ -58,14 +58,16 @@ #include <x86/apicreg.h> #include <machine/clock.h> +#include <machine/cpufunc.h> #include <machine/cputypes.h> #include <machine/frame.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> #include <x86/mca.h> #include <machine/md_var.h> #include <machine/smp.h> #include <machine/specialreg.h> +#include <x86/init.h> #ifdef DDB #include <sys/interrupt.h> @@ -88,12 +90,24 @@ CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); -/* Magic IRQ values for the timer and syscalls. */ -#define IRQ_TIMER (NUM_IO_INTS + 1) -#define IRQ_SYSCALL (NUM_IO_INTS + 2) -#define IRQ_DTRACE_RET (NUM_IO_INTS + 3) -#define IRQ_EVTCHN (NUM_IO_INTS + 4) +/* + * I/O interrupts use non-negative IRQ values. These values are used + * to mark unused IDT entries or IDT entries reserved for a non-I/O + * interrupt. + */ +#define IRQ_FREE -1 +#define IRQ_TIMER -2 +#define IRQ_SYSCALL -3 +#define IRQ_DTRACE_RET -4 +#define IRQ_EVTCHN -5 +enum lat_timer_mode { + LAT_MODE_UNDEF = 0, + LAT_MODE_PERIODIC = 1, + LAT_MODE_ONESHOT = 2, + LAT_MODE_DEADLINE = 3, +}; + /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts @@ -114,14 +128,16 @@ struct lapic { struct lvt la_lvts[APIC_LVT_MAX + 1]; + struct lvt la_elvts[APIC_ELVT_MAX + 1]; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; - u_long la_timer_period; - u_int la_timer_mode; - uint32_t lvt_timer_cache; + uint64_t la_timer_period; + enum lat_timer_mode la_timer_mode; + uint32_t lvt_timer_base; + uint32_t lvt_timer_last; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static lapics[MAX_APIC_ID + 1]; @@ -137,6 +153,14 @@ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; +/* Global defaults for AMD local APIC ELVT entries. 
*/ +static struct lvt elvts[APIC_ELVT_MAX + 1] = { + { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, + { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT }, + { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, + { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, +}; + static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ @@ -148,6 +172,16 @@ IDTVEC(apic_isr7), /* 224 - 255 */ }; +static inthand_t *ioint_pti_handlers[] = { + NULL, /* 0 - 31 */ + IDTVEC(apic_isr1_pti), /* 32 - 63 */ + IDTVEC(apic_isr2_pti), /* 64 - 95 */ + IDTVEC(apic_isr3_pti), /* 96 - 127 */ + IDTVEC(apic_isr4_pti), /* 128 - 159 */ + IDTVEC(apic_isr5_pti), /* 160 - 191 */ + IDTVEC(apic_isr6_pti), /* 192 - 223 */ + IDTVEC(apic_isr7_pti), /* 224 - 255 */ +}; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, @@ -154,42 +188,223 @@ APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; -extern inthand_t IDTVEC(rsvd); +extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd); -volatile lapic_t *lapic; +volatile char *lapic_map; vm_paddr_t lapic_paddr; -static u_long lapic_timer_divisor; +int x2apic_mode; +int lapic_eoi_suppression; +static int lapic_timer_tsc_deadline; +static u_long lapic_timer_divisor, count_freq; static struct eventtimer lapic_et; #ifdef SMP static uint64_t lapic_ipi_wait_mult; #endif +SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options"); +SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, ""); +SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD, + &lapic_eoi_suppression, 0, ""); +SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD, + &lapic_timer_tsc_deadline, 0, ""); + +static void lapic_calibrate_initcount(struct lapic *la); +static void lapic_calibrate_deadline(struct lapic *la); + +static uint32_t +lapic_read32(enum LAPIC_REGISTERS reg) +{ + uint32_t res; + + if (x2apic_mode) { + res = rdmsr32(MSR_APIC_000 + reg); + } else { + res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL); + } + return (res); +} + +static void +lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val) +{ + + if (x2apic_mode) { + mfence(); + lfence(); + wrmsr(MSR_APIC_000 + reg, val); + } else { + *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; + } +} + +static void +lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val) +{ + + if (x2apic_mode) { + wrmsr(MSR_APIC_000 + reg, val); + } else { + *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; + } +} + +#ifdef SMP +static uint64_t +lapic_read_icr(void) +{ + uint64_t v; + uint32_t vhi, vlo; + + if (x2apic_mode) { + v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO); + } else { + vhi = lapic_read32(LAPIC_ICR_HI); + vlo = lapic_read32(LAPIC_ICR_LO); + v = ((uint64_t)vhi << 32) | vlo; + } + return (v); +} + +static uint64_t +lapic_read_icr_lo(void) +{ + + return (lapic_read32(LAPIC_ICR_LO)); +} + +static void +lapic_write_icr(uint32_t vhi, uint32_t vlo) +{ + uint64_t v; + + if (x2apic_mode) { + v = ((uint64_t)vhi << 32) | vlo; + mfence(); + wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v); + } else { + lapic_write32(LAPIC_ICR_HI, vhi); + lapic_write32(LAPIC_ICR_LO, vlo); + } +} +#endif /* SMP */ + +static void +native_lapic_enable_x2apic(void) +{ + uint64_t apic_base; + + apic_base = rdmsr(MSR_APICBASE); + apic_base |= APICBASE_X2APIC | APICBASE_ENABLED; + wrmsr(MSR_APICBASE, apic_base); +} + +static bool +native_lapic_is_x2apic(void) +{ + uint64_t apic_base; + + apic_base = rdmsr(MSR_APICBASE); + return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) == + (APICBASE_X2APIC | 
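
lapic_read32()/lapic_write32() above hide the two addressing schemes: in xAPIC mode a register index is scaled by LAPIC_MEM_MUL (0x10) into the MMIO window, while in x2APIC mode the same index is added to MSR_APIC_000 (0x800) and accessed with rdmsr/wrmsr. The constants below are restated locally for illustration (they mirror the apicreg.h values rather than include it), with the conventional 0xFEE00000 MMIO base assumed:

    #include <stdio.h>
    #include <stdint.h>

    #define LAPIC_MEM_BASE  0xfee00000u     /* typical default MMIO base */
    #define LAPIC_MEM_MUL   0x10
    #define MSR_APIC_000    0x800

    /* Register indices as used by the lapic_read32()/lapic_write32() accessors. */
    #define LAPIC_EOI       0x0b
    #define LAPIC_ICR_LO    0x30

    int
    main(void)
    {
            printf("EOI:  mmio %#x  msr %#x\n",
                LAPIC_MEM_BASE + LAPIC_EOI * LAPIC_MEM_MUL,
                MSR_APIC_000 + LAPIC_EOI);
            printf("ICR:  mmio %#x  msr %#x\n",
                LAPIC_MEM_BASE + LAPIC_ICR_LO * LAPIC_MEM_MUL,
                MSR_APIC_000 + LAPIC_ICR_LO);
            return (0);
    }

In x2APIC mode the ICR also collapses into a single 64-bit MSR, which is why lapic_write_icr() above needs only one wrmsr on that path.
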
APICBASE_ENABLED)); +} + static void lapic_enable(void); static void lapic_resume(struct pic *pic, bool suspend_cancelled); -static void lapic_timer_oneshot(struct lapic *, - u_int count, int enable_int); -static void lapic_timer_periodic(struct lapic *, - u_int count, int enable_int); +static void lapic_timer_oneshot(struct lapic *); +static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t); +static void lapic_timer_periodic(struct lapic *); +static void lapic_timer_deadline(struct lapic *); static void lapic_timer_stop(struct lapic *); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, - sbintime_t first, sbintime_t period); + sbintime_t first, sbintime_t period); static int lapic_et_stop(struct eventtimer *et); +static u_int apic_idt_to_irq(u_int apic_id, u_int vector); +static void lapic_set_tpr(u_int vector); struct pic lapic_pic = { .pic_resume = lapic_resume }; +/* Forward declarations for apic_ops */ +static void native_lapic_create(u_int apic_id, int boot_cpu); +static void native_lapic_init(vm_paddr_t addr); +static void native_lapic_xapic_mode(void); +static void native_lapic_setup(int boot); +static void native_lapic_dump(const char *str); +static void native_lapic_disable(void); +static void native_lapic_eoi(void); +static int native_lapic_id(void); +static int native_lapic_intr_pending(u_int vector); +static u_int native_apic_cpuid(u_int apic_id); +static u_int native_apic_alloc_vector(u_int apic_id, u_int irq); +static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs, + u_int count, u_int align); +static void native_apic_disable_vector(u_int apic_id, u_int vector); +static void native_apic_enable_vector(u_int apic_id, u_int vector); +static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq); +static void native_lapic_set_logical_id(u_int apic_id, u_int cluster, + u_int cluster_id); +static int native_lapic_enable_pmc(void); +static void native_lapic_disable_pmc(void); +static void native_lapic_reenable_pmc(void); +static void native_lapic_enable_cmc(void); +static int native_lapic_enable_mca_elvt(void); +static int native_lapic_set_lvt_mask(u_int apic_id, u_int lvt, + u_char masked); +static int native_lapic_set_lvt_mode(u_int apic_id, u_int lvt, + uint32_t mode); +static int native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, + enum intr_polarity pol); +static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, + enum intr_trigger trigger); +#ifdef SMP +static void native_lapic_ipi_raw(register_t icrlo, u_int dest); +static void native_lapic_ipi_vectored(u_int vector, int dest); +static int native_lapic_ipi_wait(int delay); +#endif /* SMP */ +static int native_lapic_ipi_alloc(inthand_t *ipifunc); +static void native_lapic_ipi_free(int vector); + +struct apic_ops apic_ops = { + .create = native_lapic_create, + .init = native_lapic_init, + .xapic_mode = native_lapic_xapic_mode, + .is_x2apic = native_lapic_is_x2apic, + .setup = native_lapic_setup, + .dump = native_lapic_dump, + .disable = native_lapic_disable, + .eoi = native_lapic_eoi, + .id = native_lapic_id, + .intr_pending = native_lapic_intr_pending, + .set_logical_id = native_lapic_set_logical_id, + .cpuid = native_apic_cpuid, + .alloc_vector = native_apic_alloc_vector, + .alloc_vectors = native_apic_alloc_vectors, + .enable_vector = native_apic_enable_vector, + .disable_vector = native_apic_disable_vector, + .free_vector = native_apic_free_vector, + 
.enable_pmc = native_lapic_enable_pmc, + .disable_pmc = native_lapic_disable_pmc, + .reenable_pmc = native_lapic_reenable_pmc, + .enable_cmc = native_lapic_enable_cmc, + .enable_mca_elvt = native_lapic_enable_mca_elvt, +#ifdef SMP + .ipi_raw = native_lapic_ipi_raw, + .ipi_vectored = native_lapic_ipi_vectored, + .ipi_wait = native_lapic_ipi_wait, +#endif + .ipi_alloc = native_lapic_ipi_alloc, + .ipi_free = native_lapic_ipi_free, + .set_lvt_mask = native_lapic_set_lvt_mask, + .set_lvt_mode = native_lapic_set_lvt_mode, + .set_lvt_polarity = native_lapic_set_lvt_polarity, + .set_lvt_triggermode = native_lapic_set_lvt_triggermode, +}; + static uint32_t -lvt_mode(struct lapic *la, u_int pin, uint32_t value) +lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value) { - struct lvt *lvt; - KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin)); - if (la->la_lvts[pin].lvt_active) - lvt = &la->la_lvts[pin]; - else - lvt = &lvts[pin]; - value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) @@ -204,7 +419,7 @@ case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: - if (!lvt->lvt_edgetrigger) { + if (!lvt->lvt_edgetrigger && bootverbose) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value &= ~APIC_LVT_TM; @@ -220,23 +435,70 @@ return (value); } +static uint32_t +lvt_mode(struct lapic *la, u_int pin, uint32_t value) +{ + struct lvt *lvt; + + KASSERT(pin <= APIC_LVT_MAX, + ("%s: pin %u out of range", __func__, pin)); + if (la->la_lvts[pin].lvt_active) + lvt = &la->la_lvts[pin]; + else + lvt = &lvts[pin]; + + return (lvt_mode_impl(la, lvt, pin, value)); +} + +static uint32_t +elvt_mode(struct lapic *la, u_int idx, uint32_t value) +{ + struct lvt *elvt; + + KASSERT(idx <= APIC_ELVT_MAX, + ("%s: idx %u out of range", __func__, idx)); + + elvt = &la->la_elvts[idx]; + KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx)); + KASSERT(elvt->lvt_edgetrigger, + ("%s: ELVT%u is not edge triggered", __func__, idx)); + KASSERT(elvt->lvt_activehi, + ("%s: ELVT%u is not active high", __func__, idx)); + return (lvt_mode_impl(la, elvt, idx, value)); +} + /* * Map the local APIC and setup necessary interrupt vectors. */ -void -lapic_init(vm_paddr_t addr) +static void +native_lapic_init(vm_paddr_t addr) { #ifdef SMP uint64_t r, r1, r2, rx; #endif + uint32_t ver; u_int regs[4]; int i, arat; - /* Map the local APIC and setup the spurious interrupt handler. */ + /* + * Enable x2APIC mode if possible. Map the local APIC + * registers page. + * + * Keep the LAPIC registers page mapped uncached for x2APIC + * mode too, to have direct map page attribute set to + * uncached. This is needed to work around CPU errata present + * on all Intel processors. + */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic_paddr = addr; - lapic = pmap_mapdev(addr, sizeof(lapic_t)); + lapic_map = pmap_mapdev(addr, PAGE_SIZE); + if (x2apic_mode) { + native_lapic_enable_x2apic(); + lapic_map = NULL; + } + + /* Setup the spurious interrupt handler. */ setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); @@ -247,15 +509,18 @@ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ - setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); + setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint), + SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. 
*/ - setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); + setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), + SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ - setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), + SDT_APICT, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; @@ -264,6 +529,9 @@ do_cpuid(0x06, regs); if ((regs[0] & CPUTPM1_ARAT) != 0) arat = 1; + } else if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) >= 0x12) { + arat = 1; } bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; @@ -272,8 +540,16 @@ lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; - lapic_et.et_quality -= 200; + lapic_et.et_quality = 100; } + if ((cpu_feature & CPUID_TSC) != 0 && + (cpu_feature2 & CPUID2_TSCDLT) != 0 && + tsc_is_invariant && tsc_freq != 0) { + lapic_timer_tsc_deadline = 1; + TUNABLE_INT_FETCH("hw.lapic_tsc_deadline", + &lapic_timer_tsc_deadline); + } + lapic_et.et_frequency = 0; /* We don't know frequency yet, so trying to guess. */ lapic_et.et_min_period = 0x00001000LL; @@ -284,6 +560,29 @@ et_register(&lapic_et); } + /* + * Set lapic_eoi_suppression after lapic_enable(), to not + * enable suppression in the hardware prematurely. Note that + * we by default enable suppression even when system only has + * one IO-APIC, since EOI is broadcasted to all APIC agents, + * including CPUs, otherwise. + * + * It seems that at least some KVM versions report + * EOI_SUPPRESSION bit, but auto-EOI does not work. + */ + ver = lapic_read32(LAPIC_VERSION); + if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) { + lapic_eoi_suppression = 1; + if (vm_guest == VM_GUEST_KVM) { + if (bootverbose) + printf( + "KVM -- disabling lapic eoi suppression\n"); + lapic_eoi_suppression = 0; + } + TUNABLE_INT_FETCH("hw.lapic_eoi_suppression", + &lapic_eoi_suppression); + } + #ifdef SMP #define LOOPS 100000 /* @@ -299,20 +598,22 @@ */ KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0, ("TSC not initialized")); - r = rdtsc(); - for (rx = 0; rx < LOOPS; rx++) { - (void)lapic->icr_lo; - ia32_pause(); + if (!x2apic_mode) { + r = rdtsc(); + for (rx = 0; rx < LOOPS; rx++) { + (void)lapic_read_icr_lo(); + ia32_pause(); + } + r = rdtsc() - r; + r1 = tsc_freq * LOOPS; + r2 = r * 1000000; + lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; + if (bootverbose) { + printf("LAPIC: ipi_wait() us multiplier %ju (r %ju " + "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult, + (uintmax_t)r, (uintmax_t)tsc_freq); + } } - r = rdtsc() - r; - r1 = tsc_freq * LOOPS; - r2 = r * 1000000; - lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; - if (bootverbose) { - printf("LAPIC: ipi_wait() us multiplier %ju (r %ju tsc %ju)\n", - (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r, - (uintmax_t)tsc_freq); - } #undef LOOPS #endif /* SMP */ } @@ -320,8 +621,8 @@ /* * Create a local APIC instance. 
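Buried in the converted SMP block above is the calibration that gives lapic_ipi_wait(delay) its meaning: it times LOOPS (100000) reads of ICR_LO against the TSC and derives how many such reads fit in one microsecond. Restated with the same quantities as the hunk (illustrative only):

    /*
     * r = TSC cycles spent on LOOPS reads of ICR_LO, so
     *   reads/second      = tsc_freq * LOOPS / r
     *   reads/microsecond = tsc_freq * LOOPS / (r * 1000000)
     */
    lapic_ipi_wait_mult = (tsc_freq * LOOPS >= r * 1000000) ?
        tsc_freq * LOOPS / (r * 1000000) : 1;

    /* lapic_ipi_wait(delay) then polls ICR_LO at most
     * lapic_ipi_wait_mult * delay times to wait ~delay microseconds;
     * in x2APIC mode there is no delivery-status bit to poll, so it
     * simply reports success. */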
*/ -void -lapic_create(u_int apic_id, int boot_cpu) +static void +native_lapic_create(u_int apic_id, int boot_cpu) { int i; @@ -344,8 +645,12 @@ lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } + for (i = 0; i <= APIC_ELVT_MAX; i++) { + lapics[apic_id].la_elvts[i] = elvts[i]; + lapics[apic_id].la_elvts[i].lvt_active = 0; + } for (i = 0; i <= APIC_NUM_IOINTS; i++) - lapics[apic_id].la_ioint_irqs[i] = -1; + lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; @@ -363,41 +668,100 @@ #endif } +static inline uint32_t +amd_read_ext_features(void) +{ + uint32_t version; + + if (cpu_vendor_id != CPU_VENDOR_AMD) + return (0); + version = lapic_read32(LAPIC_VERSION); + if ((version & APIC_VER_AMD_EXT_SPACE) != 0) + return (lapic_read32(LAPIC_EXT_FEATURES)); + else + return (0); +} + +static inline uint32_t +amd_read_elvt_count(void) +{ + uint32_t extf; + uint32_t count; + + extf = amd_read_ext_features(); + count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; + count = min(count, APIC_ELVT_MAX + 1); + return (count); +} + /* * Dump contents of local APIC registers */ -void -lapic_dump(const char* str) +static void +native_lapic_dump(const char* str) { + uint32_t version; uint32_t maxlvt; + uint32_t extf; + int elvt_count; + int i; - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + version = lapic_read32(LAPIC_VERSION); + maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; printf("cpu%d %s:\n", PCPU_GET(cpuid), str); - printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n", - lapic->id, lapic->version, lapic->ldr, lapic->dfr); - printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", - lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); + printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x", + lapic_read32(LAPIC_ID), version, + lapic_read32(LAPIC_LDR), x2apic_mode ? 
0 : lapic_read32(LAPIC_DFR)); + if ((cpu_feature2 & CPUID2_X2APIC) != 0) + printf(" x2APIC: %d", x2apic_mode); + printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", + lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1), + lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR)); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x", - lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error); + lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL), + lapic_read32(LAPIC_LVT_ERROR)); if (maxlvt >= APIC_LVT_PMC) - printf(" pmc: 0x%08x", lapic->lvt_pcint); + printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT)); printf("\n"); if (maxlvt >= APIC_LVT_CMCI) - printf(" cmci: 0x%08x\n", lapic->lvt_cmci); + printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI)); + extf = amd_read_ext_features(); + if (extf != 0) { + printf(" AMD ext features: 0x%08x\n", extf); + elvt_count = amd_read_elvt_count(); + for (i = 0; i < elvt_count; i++) + printf(" AMD elvt%d: 0x%08x\n", i, + lapic_read32(LAPIC_EXT_LVT0 + i)); + } } -void -lapic_setup(int boot) +static void +native_lapic_xapic_mode(void) { + register_t saveintr; + + saveintr = intr_disable(); + if (x2apic_mode) + native_lapic_enable_x2apic(); + intr_restore(saveintr); +} + +static void +native_lapic_setup(int boot) +{ struct lapic *la; - u_int32_t maxlvt; + uint32_t version; + uint32_t maxlvt; register_t saveintr; - char buf[MAXCOMLEN + 1]; + int elvt_count; + int i; + saveintr = intr_disable(); + la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); - saveintr = intr_disable(); - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + version = lapic_read32(LAPIC_VERSION); + maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); @@ -406,54 +770,103 @@ lapic_enable(); /* Program LINT[01] LVT entries. */ - lapic->lvt_lint0 = lvt_mode(la, APIC_LVT_LINT0, lapic->lvt_lint0); - lapic->lvt_lint1 = lvt_mode(la, APIC_LVT_LINT1, lapic->lvt_lint1); + lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0, + lapic_read32(LAPIC_LVT_LINT0))); + lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1, + lapic_read32(LAPIC_LVT_LINT1))); /* Program the PMC LVT entry if present. */ - if (maxlvt >= APIC_LVT_PMC) - lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint); + if (maxlvt >= APIC_LVT_PMC) { + lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, + LAPIC_LVT_PCINT)); + } - /* Program timer LVT and setup handler. */ - la->lvt_timer_cache = lapic->lvt_timer = - lvt_mode(la, APIC_LVT_TIMER, lapic->lvt_timer); - if (boot) { - snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid)); - intrcnt_add(buf, &la->la_timer_count); + /* Program timer LVT. */ + la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER, + lapic_read32(LAPIC_LVT_TIMER)); + la->lvt_timer_last = la->lvt_timer_base; + lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base); + + /* Calibrate the timer parameters using BSP. */ + if (boot && IS_BSP()) { + lapic_calibrate_initcount(la); + if (lapic_timer_tsc_deadline) + lapic_calibrate_deadline(la); } /* Setup the timer if configured. 
*/ - if (la->la_timer_mode != 0) { + if (la->la_timer_mode != LAT_MODE_UNDEF) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); - lapic_timer_set_divisor(lapic_timer_divisor); - if (la->la_timer_mode == 1) - lapic_timer_periodic(la, la->la_timer_period, 1); - else - lapic_timer_oneshot(la, la->la_timer_period, 1); + switch (la->la_timer_mode) { + case LAT_MODE_PERIODIC: + lapic_timer_set_divisor(lapic_timer_divisor); + lapic_timer_periodic(la); + break; + case LAT_MODE_ONESHOT: + lapic_timer_set_divisor(lapic_timer_divisor); + lapic_timer_oneshot(la); + break; + case LAT_MODE_DEADLINE: + lapic_timer_deadline(la); + break; + default: + panic("corrupted la_timer_mode %p %d", la, + la->la_timer_mode); + } } /* Program error LVT and clear any existing errors. */ - lapic->lvt_error = lvt_mode(la, APIC_LVT_ERROR, lapic->lvt_error); - lapic->esr = 0; + lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR, + lapic_read32(LAPIC_LVT_ERROR))); + lapic_write32(LAPIC_ESR, 0); /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. */ - if (maxlvt >= APIC_LVT_CMCI) - lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci); + if (maxlvt >= APIC_LVT_CMCI) { + lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI, + lapic_read32(LAPIC_LVT_CMCI))); + } + elvt_count = amd_read_elvt_count(); + for (i = 0; i < elvt_count; i++) { + if (la->la_elvts[i].lvt_active) + lapic_write32(LAPIC_EXT_LVT0 + i, + elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i))); + } + intr_restore(saveintr); } -void -lapic_reenable_pmc(void) +static void +native_lapic_intrcnt(void *dummy __unused) { + struct pcpu *pc; + struct lapic *la; + char buf[MAXCOMLEN + 1]; + + STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { + la = &lapics[pc->pc_apic_id]; + if (!la->la_present) + continue; + + snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid); + intrcnt_add(buf, &la->la_timer_count); + } +} +SYSINIT(native_lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, native_lapic_intrcnt, + NULL); + +static void +native_lapic_reenable_pmc(void) +{ #ifdef HWPMC_HOOKS uint32_t value; - value = lapic->lvt_pcint; + value = lapic_read32(LAPIC_LVT_PCINT); value &= ~APIC_LVT_M; - lapic->lvt_pcint = value; + lapic_write32(LAPIC_LVT_PCINT, value); #endif } @@ -464,27 +877,32 @@ struct lapic *la; la = &lapics[lapic_id()]; - lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint); + lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, + lapic_read32(LAPIC_LVT_PCINT))); } #endif -int -lapic_enable_pmc(void) +static int +native_lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ - if (lapic == NULL) + if (!x2apic_mode && lapic_map == NULL) return (0); /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return (0); lvts[APIC_LVT_PMC].lvt_masked = 0; +#ifdef EARLY_AP_STARTUP + MPASS(mp_ncpus == 1 || smp_started); + smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); +#else #ifdef SMP /* * If hwpmc was loaded at boot time then the APs may not be @@ -496,6 +914,7 @@ else #endif lapic_update_pmc(NULL); +#endif return (1); #else return (0); @@ -502,18 +921,18 @@ #endif } -void -lapic_disable_pmc(void) +static void +native_lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. 
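The switch above is driven by the new enum that replaces the old magic 0/1/2 timer-mode values; the declaration sits higher up in local_apic.c, outside this hunk, and looks approximately like:

    enum lat_timer_mode {
        LAT_MODE_UNDEF = 0,     /* timer not programmed                   */
        LAT_MODE_PERIODIC = 1,  /* divided APIC timer, auto-reloading     */
        LAT_MODE_ONESHOT = 2,   /* divided APIC timer, single shot        */
        LAT_MODE_DEADLINE = 3,  /* absolute TSC value in MSR_TSC_DEADLINE */
    };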
*/ - if (lapic == NULL) + if (!x2apic_mode && lapic_map == NULL) return; /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return; @@ -527,45 +946,89 @@ #endif } +static void +lapic_calibrate_initcount(struct lapic *la) +{ + u_long value; + + /* Start off with a divisor of 2 (power on reset default). */ + lapic_timer_divisor = 2; + /* Try to calibrate the local APIC timer. */ + do { + lapic_timer_set_divisor(lapic_timer_divisor); + lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT); + DELAY(1000000); + value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER); + if (value != APIC_TIMER_MAX_COUNT) + break; + lapic_timer_divisor <<= 1; + } while (lapic_timer_divisor <= 128); + if (lapic_timer_divisor > 128) + panic("lapic: Divisor too big"); + if (bootverbose) { + printf("lapic: Divisor %lu, Frequency %lu Hz\n", + lapic_timer_divisor, value); + } + count_freq = value; +} + +static void +lapic_calibrate_deadline(struct lapic *la __unused) +{ + + if (bootverbose) { + printf("lapic: deadline tsc mode, Frequency %ju Hz\n", + (uintmax_t)tsc_freq); + } +} + +static void +lapic_change_mode(struct eventtimer *et, struct lapic *la, + enum lat_timer_mode newmode) +{ + + if (la->la_timer_mode == newmode) + return; + switch (newmode) { + case LAT_MODE_PERIODIC: + lapic_timer_set_divisor(lapic_timer_divisor); + et->et_frequency = count_freq; + break; + case LAT_MODE_DEADLINE: + et->et_frequency = tsc_freq; + break; + case LAT_MODE_ONESHOT: + lapic_timer_set_divisor(lapic_timer_divisor); + et->et_frequency = count_freq; + break; + default: + panic("lapic_change_mode %d", newmode); + } + la->la_timer_mode = newmode; + et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; + et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; +} + static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct lapic *la; - u_long value; la = &lapics[PCPU_GET(apic_id)]; - if (et->et_frequency == 0) { - /* Start off with a divisor of 2 (power on reset default). */ - lapic_timer_divisor = 2; - /* Try to calibrate the local APIC timer. 
*/ - do { - lapic_timer_set_divisor(lapic_timer_divisor); - lapic_timer_oneshot(la, APIC_TIMER_MAX_COUNT, 0); - DELAY(1000000); - value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer; - if (value != APIC_TIMER_MAX_COUNT) - break; - lapic_timer_divisor <<= 1; - } while (lapic_timer_divisor <= 128); - if (lapic_timer_divisor > 128) - panic("lapic: Divisor too big"); - if (bootverbose) - printf("lapic: Divisor %lu, Frequency %lu Hz\n", - lapic_timer_divisor, value); - et->et_frequency = value; - et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; - et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; - } - if (la->la_timer_mode == 0) - lapic_timer_set_divisor(lapic_timer_divisor); if (period != 0) { - la->la_timer_mode = 1; - la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32; - lapic_timer_periodic(la, la->la_timer_period, 1); + lapic_change_mode(et, la, LAT_MODE_PERIODIC); + la->la_timer_period = ((uint32_t)et->et_frequency * period) >> + 32; + lapic_timer_periodic(la); + } else if (lapic_timer_tsc_deadline) { + lapic_change_mode(et, la, LAT_MODE_DEADLINE); + la->la_timer_period = (et->et_frequency * first) >> 32; + lapic_timer_deadline(la); } else { - la->la_timer_mode = 2; - la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32; - lapic_timer_oneshot(la, la->la_timer_period, 1); + lapic_change_mode(et, la, LAT_MODE_ONESHOT); + la->la_timer_period = ((uint32_t)et->et_frequency * first) >> + 32; + lapic_timer_oneshot(la); } return (0); } @@ -573,34 +1036,37 @@ static int lapic_et_stop(struct eventtimer *et) { - struct lapic *la = &lapics[PCPU_GET(apic_id)]; + struct lapic *la; - la->la_timer_mode = 0; + la = &lapics[PCPU_GET(apic_id)]; lapic_timer_stop(la); + la->la_timer_mode = LAT_MODE_UNDEF; return (0); } -void -lapic_disable(void) +static void +native_lapic_disable(void) { uint32_t value; /* Software disable the local APIC. */ - value = lapic->svr; + value = lapic_read32(LAPIC_SVR); value &= ~APIC_SVR_SWEN; - lapic->svr = value; + lapic_write32(LAPIC_SVR, value); } static void lapic_enable(void) { - u_int32_t value; + uint32_t value; /* Program the spurious vector to enable the local APIC. */ - value = lapic->svr; + value = lapic_read32(LAPIC_SVR); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); - value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); - lapic->svr = value; + value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT; + if (lapic_eoi_suppression) + value |= APIC_SVR_EOI_SUPPRESSION; + lapic_write32(LAPIC_SVR, value); } /* Reset the local APIC on the BSP during resume. */ @@ -611,34 +1077,36 @@ lapic_setup(0); } -int -lapic_id(void) +static int +native_lapic_id(void) { + uint32_t v; - KASSERT(lapic != NULL, ("local APIC is not mapped")); - return (lapic->id >> APIC_ID_SHIFT); + KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped")); + v = lapic_read32(LAPIC_ID); + if (!x2apic_mode) + v >>= APIC_ID_SHIFT; + return (v); } -int -lapic_intr_pending(u_int vector) +static int +native_lapic_intr_pending(u_int vector) { - volatile u_int32_t *irr; + uint32_t irr; /* - * The IRR registers are an array of 128-bit registers each of - * which only describes 32 interrupts in the low 32 bits.. Thus, - * we divide the vector by 32 to get the 128-bit index. We then - * multiply that index by 4 to get the equivalent index from - * treating the IRR as an array of 32-bit registers. Finally, we - * modulus the vector by 32 to determine the individual bit to - * test. 
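Two numbers fall out of lapic_calibrate_initcount() above: lapic_timer_divisor, doubled until one second of counting fits in the 32-bit count register, and count_freq, the ticks actually counted during that DELAY(1000000), i.e. the divided timer frequency. Event-timer periods arrive as sbintime_t, a 32.32 fixed-point count of seconds, which is why lapic_et_start() converts them with a multiply and a 32-bit shift; a hypothetical helper makes the conversion explicit:

    /* Hypothetical helper mirroring the expressions in lapic_et_start():
     * convert a 32.32 fixed-point second count into timer ticks. */
    static uint32_t
    sbt_to_ticks(uint32_t freq, sbintime_t sbt)
    {

        return ((uint64_t)freq * (uint64_t)sbt >> 32);
    }

    /* periodic/one-shot: LAPIC_ICR_TIMER  <- sbt_to_ticks(count_freq, period)
     * deadline:          MSR_TSC_DEADLINE <- rdtsc() +
     *                                        ((tsc_freq * first) >> 32)      */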
+ * The IRR registers are an array of registers each of which + * only describes 32 interrupts in the low 32 bits. Thus, we + * divide the vector by 32 to get the register index. + * Finally, we modulus the vector by 32 to determine the + * individual bit to test. */ - irr = &lapic->irr0; - return (irr[(vector / 32) * 4] & 1 << (vector % 32)); + irr = lapic_read32(LAPIC_IRR0 + vector / 32); + return (irr & 1 << (vector % 32)); } -void -lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) +static void +native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; @@ -653,8 +1121,8 @@ la->la_cluster_id = cluster_id; } -int -lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) +static int +native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > APIC_LVT_MAX) @@ -676,8 +1144,8 @@ return (0); } -int -lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) +static int +native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; @@ -732,8 +1200,8 @@ return (0); } -int -lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) +static int +native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM) @@ -757,8 +1225,9 @@ return (0); } -int -lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) +static int +native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin, + enum intr_trigger trigger) { if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM) @@ -786,25 +1255,25 @@ * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. */ -void +static void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR - lapic->tpr = vector; + lapic_write32(LAPIC_TPR, vector); #else - u_int32_t tpr; + uint32_t tpr; - tpr = lapic->tpr & ~APIC_TPR_PRIO; + tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO; tpr |= vector; - lapic->tpr = tpr; + lapic_write32(LAPIC_TPR, tpr); #endif } -void -lapic_eoi(void) +static void +native_lapic_eoi(void) { - lapic->eoi = 0; + lapic_write32_nofence(LAPIC_EOI, 0); } void @@ -864,48 +1333,82 @@ { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); - KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) / - sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor)); - lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1]; + KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors), + ("lapic: invalid divisor %u", divisor)); + lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]); } static void -lapic_timer_oneshot(struct lapic *la, u_int count, int enable_int) +lapic_timer_oneshot(struct lapic *la) { - u_int32_t value; + uint32_t value; - value = la->lvt_timer_cache; - value &= ~APIC_LVTT_TM; + value = la->lvt_timer_base; + value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_ONE_SHOT; - if (enable_int) - value &= ~APIC_LVT_M; - lapic->lvt_timer = value; - lapic->icr_timer = count; + la->lvt_timer_last = value; + lapic_write32(LAPIC_LVT_TIMER, value); + lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void -lapic_timer_periodic(struct lapic *la, u_int count, int enable_int) +lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count) { - u_int32_t value; + uint32_t value; - value = la->lvt_timer_cache; + value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; + value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M; + la->lvt_timer_last = value; + lapic_write32(LAPIC_LVT_TIMER, value); + 
lapic_write32(LAPIC_ICR_TIMER, count); +} + +static void +lapic_timer_periodic(struct lapic *la) +{ + uint32_t value; + + value = la->lvt_timer_base; + value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_PERIODIC; - if (enable_int) - value &= ~APIC_LVT_M; - lapic->lvt_timer = value; - lapic->icr_timer = count; + la->lvt_timer_last = value; + lapic_write32(LAPIC_LVT_TIMER, value); + lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void +lapic_timer_deadline(struct lapic *la) +{ + uint32_t value; + + value = la->lvt_timer_base; + value &= ~(APIC_LVTT_TM | APIC_LVT_M); + value |= APIC_LVTT_TM_TSCDLT; + if (value != la->lvt_timer_last) { + la->lvt_timer_last = value; + lapic_write32_nofence(LAPIC_LVT_TIMER, value); + if (!x2apic_mode) + mfence(); + } + wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc()); +} + +static void lapic_timer_stop(struct lapic *la) { - u_int32_t value; + uint32_t value; - value = la->lvt_timer_cache; - value &= ~APIC_LVTT_TM; - value |= APIC_LVT_M; - lapic->lvt_timer = value; + if (la->la_timer_mode == LAT_MODE_DEADLINE) { + wrmsr(MSR_TSC_DEADLINE, 0); + mfence(); + } else { + value = la->lvt_timer_base; + value &= ~APIC_LVTT_TM; + value |= APIC_LVT_M; + la->lvt_timer_last = value; + lapic_write32(LAPIC_LVT_TIMER, value); + } } void @@ -922,13 +1425,13 @@ * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. */ -void -lapic_enable_cmc(void) +static void +native_lapic_enable_cmc(void) { u_int apic_id; #ifdef DEV_ATPIC - if (lapic == NULL) + if (!x2apic_mode && lapic_map == NULL) return; #endif apic_id = PCPU_GET(apic_id); @@ -940,10 +1443,41 @@ printf("lapic%u: CMCI unmasked\n", apic_id); } +static int +native_lapic_enable_mca_elvt(void) +{ + u_int apic_id; + uint32_t value; + int elvt_count; + +#ifdef DEV_ATPIC + if (lapic_map == NULL) + return (-1); +#endif + + apic_id = PCPU_GET(apic_id); + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + elvt_count = amd_read_elvt_count(); + if (elvt_count <= APIC_ELVT_MCA) + return (-1); + + value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA); + if ((value & APIC_LVT_M) == 0) { + printf("AMD MCE Thresholding Extended LVT is already active\n"); + return (-1); + } + lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0; + lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1; + if (bootverbose) + printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id); + return (APIC_ELVT_MCA); +} + void lapic_handle_error(void) { - u_int32_t esr; + uint32_t esr; /* * Read the contents of the error status register. Write to @@ -951,15 +1485,15 @@ * to update its value to indicate any errors that have * occurred since the previous write to the register. */ - lapic->esr = 0; - esr = lapic->esr; + lapic_write32(LAPIC_ESR, 0); + esr = lapic_read32(LAPIC_ESR); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); } -u_int -apic_cpuid(u_int apic_id) +static u_int +native_apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; @@ -969,12 +1503,12 @@ } /* Request a free IDT vector to be used by the specified IRQ. */ -u_int -apic_alloc_vector(u_int apic_id, u_int irq) +static u_int +native_apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; - KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); + KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); /* * Search for a free vector. 
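For LAT_MODE_DEADLINE the LVT only has to be rewritten when its value actually changes (lvt_timer_last caches it), so the hot re-arm path is normally just one WRMSR of an absolute TSC value. Condensed from the functions above:

    /* arm: fire roughly la_timer_period TSC cycles from now */
    wrmsr(MSR_TSC_DEADLINE, rdtsc() + la->la_timer_period);

    /* disarm (see lapic_timer_stop()): a deadline of 0 cancels any
     * pending one */
    wrmsr(MSR_TSC_DEADLINE, 0);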
Currently we just use a very simple @@ -982,7 +1516,7 @@ */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { - if (lapics[apic_id].la_ioint_irqs[vector] != -1) + if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); @@ -998,8 +1532,8 @@ * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. */ -u_int -apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) +static u_int +native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; @@ -1008,7 +1542,7 @@ KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) - KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u", + KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif @@ -1022,7 +1556,7 @@ for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. */ - if (lapics[apic_id].la_ioint_irqs[vector] != -1) { + if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) { run = 0; first = 0; continue; @@ -1058,8 +1592,8 @@ * which do not have the vector configured would report spurious interrupts * should it fire. */ -void -apic_enable_vector(u_int apic_id, u_int vector) +static void +native_apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); @@ -1069,12 +1603,12 @@ KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif - setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL, - GSEL_APIC); + setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], + SDT_APIC, SEL_KPL, GSEL_APIC); } -void -apic_disable_vector(u_int apic_id, u_int vector) +static void +native_apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); @@ -1089,13 +1623,14 @@ * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ - setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ -void -apic_free_vector(u_int apic_id, u_int vector, u_int irq) +static void +native_apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; @@ -1102,7 +1637,7 @@ KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); - KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); + KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS @@ -1123,7 +1658,7 @@ thread_unlock(td); } mtx_lock_spin(&icu_lock); - lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; + lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); @@ -1133,7 +1668,7 @@ } /* Map an IDT vector (APIC) to an IRQ (interrupt source). 
*/ -u_int +static u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; @@ -1174,7 +1709,7 @@ db_printf("Interrupts bound to lapic %u\n", apic_id); for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) { irq = lapics[apic_id].la_ioint_irqs[i]; - if (irq == -1 || irq == IRQ_SYSCALL) + if (irq == IRQ_FREE || irq == IRQ_SYSCALL) continue; #ifdef KDTRACE_HOOKS if (irq == IRQ_DTRACE_RET) @@ -1187,7 +1722,7 @@ db_printf("vec 0x%2x -> ", i + APIC_IO_INTS); if (irq == IRQ_TIMER) db_printf("lapic timer\n"); - else if (irq < NUM_IO_INTS) { + else if (irq < num_io_irqs) { isrc = intr_lookup_source(irq); if (isrc == NULL || verbose == 0) db_printf("IRQ %u\n", irq); @@ -1224,48 +1759,49 @@ uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); - v = lapic->version; + v = lapic_read32(LAPIC_VERSION); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); - v = lapic->svr; + v = lapic_read32(LAPIC_SVR); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); - db_printf("TPR = %02x\n", lapic->tpr); + db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR)); -#define dump_field(prefix, index) \ - dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index, \ +#define dump_field(prefix, regn, index) \ + dump_mask(__XSTRING(prefix ## index), \ + lapic_read32(LAPIC_ ## regn ## index), \ index * 32) db_printf("In-service Interrupts:\n"); - dump_field(isr, 0); - dump_field(isr, 1); - dump_field(isr, 2); - dump_field(isr, 3); - dump_field(isr, 4); - dump_field(isr, 5); - dump_field(isr, 6); - dump_field(isr, 7); + dump_field(isr, ISR, 0); + dump_field(isr, ISR, 1); + dump_field(isr, ISR, 2); + dump_field(isr, ISR, 3); + dump_field(isr, ISR, 4); + dump_field(isr, ISR, 5); + dump_field(isr, ISR, 6); + dump_field(isr, ISR, 7); db_printf("TMR Interrupts:\n"); - dump_field(tmr, 0); - dump_field(tmr, 1); - dump_field(tmr, 2); - dump_field(tmr, 3); - dump_field(tmr, 4); - dump_field(tmr, 5); - dump_field(tmr, 6); - dump_field(tmr, 7); + dump_field(tmr, TMR, 0); + dump_field(tmr, TMR, 1); + dump_field(tmr, TMR, 2); + dump_field(tmr, TMR, 3); + dump_field(tmr, TMR, 4); + dump_field(tmr, TMR, 5); + dump_field(tmr, TMR, 6); + dump_field(tmr, TMR, 7); db_printf("IRR Interrupts:\n"); - dump_field(irr, 0); - dump_field(irr, 1); - dump_field(irr, 2); - dump_field(irr, 3); - dump_field(irr, 4); - dump_field(irr, 5); - dump_field(irr, 6); - dump_field(irr, 7); + dump_field(irr, IRR, 0); + dump_field(irr, IRR, 1); + dump_field(irr, IRR, 2); + dump_field(irr, IRR, 3); + dump_field(irr, IRR, 4); + dump_field(irr, IRR, 5); + dump_field(irr, IRR, 6); + dump_field(irr, IRR, 7); #undef dump_field } @@ -1391,20 +1927,18 @@ * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ -#ifndef XEN intr_register_pic(&lapic_pic); -#endif retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); -#ifdef XEN - return; -#endif + /* - * Finish setting up the local APIC on the BSP once we know how to - * properly program the LINT pins. + * Finish setting up the local APIC on the BSP once we know + * how to properly program the LINT pins. In particular, this + * enables the EOI suppression mode, if LAPIC support it and + * user did not disabled the mode. */ lapic_setup(1); if (bootverbose) @@ -1411,9 +1945,13 @@ lapic_dump("BSP"); /* Enable the MSI "pic". 
*/ - msi_init(); + init_ops.msi_init(); + +#ifdef XENHVM + xen_intr_alloc_irqs(); +#endif } -SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL); +SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL); #ifdef SMP /* @@ -1426,13 +1964,18 @@ * Wait delay microseconds for IPI to be sent. If delay is -1, we * wait forever. */ -int -lapic_ipi_wait(int delay) +static int +native_lapic_ipi_wait(int delay) { uint64_t rx; + /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */ + if (x2apic_mode) + return (1); + for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) { - if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) + if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) == + APIC_DELSTAT_IDLE) return (1); ia32_pause(); } @@ -1439,33 +1982,51 @@ return (0); } -void -lapic_ipi_raw(register_t icrlo, u_int dest) +static void +native_lapic_ipi_raw(register_t icrlo, u_int dest) { - register_t value, saveintr; + uint64_t icr; + uint32_t vhi, vlo; + register_t saveintr; /* XXX: Need more sanity checking of icrlo? */ - KASSERT(lapic != NULL, ("%s called too early", __func__)); - KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, + KASSERT(x2apic_mode || lapic_map != NULL, + ("%s called too early", __func__)); + KASSERT(x2apic_mode || + (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, ("%s: reserved bits set in ICR LO register", __func__)); /* Set destination in ICR HI register if it is being used. */ - saveintr = intr_disable(); + if (!x2apic_mode) { + saveintr = intr_disable(); + icr = lapic_read_icr(); + } + if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { - value = lapic->icr_hi; - value &= ~APIC_ID_MASK; - value |= dest << APIC_ID_SHIFT; - lapic->icr_hi = value; + if (x2apic_mode) { + vhi = dest; + } else { + vhi = icr >> 32; + vhi &= ~APIC_ID_MASK; + vhi |= dest << APIC_ID_SHIFT; + } + } else { + vhi = 0; } /* Program the contents of the IPI and dispatch it. */ - value = lapic->icr_lo; - value &= APIC_ICRLO_RESV_MASK; - value |= icrlo; - lapic->icr_lo = value; - intr_restore(saveintr); + if (x2apic_mode) { + vlo = icrlo; + } else { + vlo = icr; + vlo &= APIC_ICRLO_RESV_MASK; + vlo |= icrlo; + } + lapic_write_icr(vhi, vlo); + if (!x2apic_mode) + intr_restore(saveintr); } #define BEFORE_SPIN 50000 @@ -1473,8 +2034,8 @@ #define AFTER_SPIN 50 #endif -void -lapic_ipi_vectored(u_int vector, int dest) +static void +native_lapic_ipi_vectored(u_int vector, int dest) { register_t icrlo, destfield; @@ -1484,11 +2045,10 @@ icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT; /* - * IPI_STOP_HARD is just a "fake" vector used to send a NMI. - * Use special rules regard NMI if passed, otherwise specify - * the vector. + * NMI IPIs are just fake vectors used to send a NMI. Use special rules + * regarding NMIs if passed, otherwise specify the vector. */ - if (vector == IPI_STOP_HARD) + if (vector >= IPI_NMI_FIRST) icrlo |= APIC_DELMODE_NMI; else icrlo |= vector | APIC_DELMODE_FIXED; @@ -1504,7 +2064,8 @@ icrlo |= APIC_DEST_ALLESELF; break; default: - KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, + KASSERT(x2apic_mode || + (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid destination 0x%x", __func__, dest)); destfield = dest; } @@ -1541,10 +2102,70 @@ printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until mesage is sent without a timeout. 
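native_lapic_ipi_raw() above only assembles vhi/vlo; the actual dispatch helper, lapic_write_icr(), is defined earlier in the file. The point of the split is that x2APIC exposes ICR as a single 64-bit MSR (and has no delivery-status bit to poll, hence the early return in lapic_ipi_wait()), while xAPIC needs ICR_HI written before the ICR_LO write that actually sends the IPI. Approximately, using the same assumed constant names as the accessor sketch earlier:

    static void
    lapic_write_icr(uint32_t vhi, uint32_t vlo)
    {
        uint64_t v;

        if (x2apic_mode) {
            v = ((uint64_t)vhi << 32) | vlo;
            mfence();           /* publish prior stores before the send */
            wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
        } else {
            lapic_write32(LAPIC_ICR_HI, vhi);
            lapic_write32(LAPIC_ICR_LO, vlo);   /* this write starts the IPI */
        }
    }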
*/ - while (lapic->icr_lo & APIC_DELSTAT_PEND) + while (lapic_read_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } #endif /* DETECT_DEADLOCK */ } + #endif /* SMP */ + +/* + * Since the IDT is shared by all CPUs the IPI slot update needs to be globally + * visible. + * + * Consider the case where an IPI is generated immediately after allocation: + * vector = lapic_ipi_alloc(ipifunc); + * ipi_selected(other_cpus, vector); + * + * In xAPIC mode a write to ICR_LO has serializing semantics because the + * APIC page is mapped as an uncached region. In x2APIC mode there is an + * explicit 'mfence' before the ICR MSR is written. Therefore in both cases + * the IDT slot update is globally visible before the IPI is delivered. + */ +static int +native_lapic_ipi_alloc(inthand_t *ipifunc) +{ + struct gate_descriptor *ip; + long func; + int idx, vector; + + KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti), + ("invalid ipifunc %p", ipifunc)); + + vector = -1; + mtx_lock_spin(&icu_lock); + for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) { + ip = &idt[idx]; + func = (ip->gd_hioffset << 16) | ip->gd_looffset; + if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) || + (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) { + vector = idx; + setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC); + break; + } + } + mtx_unlock_spin(&icu_lock); + return (vector); +} + +static void +native_lapic_ipi_free(int vector) +{ + struct gate_descriptor *ip; + long func; + + KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST, + ("%s: invalid vector %d", __func__, vector)); + + mtx_lock_spin(&icu_lock); + ip = &idt[vector]; + func = (ip->gd_hioffset << 16) | ip->gd_looffset; + KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && + func != (uintptr_t)&IDTVEC(rsvd_pti), + ("invalid idtfunc %#lx", func)); + setidt(vector, pti ? 
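The comment above already gives the intended calling sequence; spelled out as a hypothetical consumer of the dynamic IPI slots:

    int vector;

    vector = lapic_ipi_alloc(ipi_handler);  /* ipi_handler: an inthand_t */
    if (vector < 0)
        panic("out of dynamic IPI vectors");
    ipi_selected(other_cpus, vector);       /* IDT update already visible */
    /* ... */
    lapic_ipi_free(vector);                 /* gate returns to rsvd/rsvd_pti */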
&IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); + mtx_unlock_spin(&icu_lock); +} Modified: trunk/sys/x86/x86/mca.c =================================================================== --- trunk/sys/x86/x86/mca.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/mca.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mca.c 314667 2017-03-04 13:03:31Z avg $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mca.c 333159 2018-05-02 07:38:38Z kib $"); #ifdef __amd64__ #define DEV_APIC @@ -53,7 +53,7 @@ #include <sys/systm.h> #include <sys/taskqueue.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> #include <machine/cpu.h> #include <machine/cputypes.h> #include <x86/mca.h> @@ -76,6 +76,11 @@ int max_threshold; time_t last_intr; }; + +struct amd_et_state { + int cur_threshold; + time_t last_intr; +}; #endif struct mca_internal { @@ -93,22 +98,20 @@ "Machine Check Architecture"); static int mca_enabled = 1; -TUNABLE_INT("hw.mca.enabled", &mca_enabled); SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, "Administrative toggle for machine check support"); static int amd10h_L1TP = 1; -TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP); SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, "Administrative toggle for logging of level one TLB parity (L1TP) errors"); static int intel6h_HSD131; -TUNABLE_INT("hw.mca.intel6h_hsd131", &intel6h_HSD131); SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0, "Administrative toggle for logging of spurious corrected errors"); int workaround_erratum383; -SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0, +SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN, + &workaround_erratum383, 0, "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); static STAILQ_HEAD(, mca_internal) mca_freelist; @@ -121,8 +124,18 @@ static struct mtx mca_lock; #ifdef DEV_APIC -static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */ +static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */ +static struct amd_et_state *amd_et_state; /* Indexed by cpuid. */ static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ + +static int amd_elvt = -1; + +static inline bool +amd_thresholding_supported(void) +{ + return (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16); +} #endif static int @@ -511,8 +524,8 @@ STAILQ_INSERT_TAIL(&mca_records, rec, link); mca_count++; mtx_unlock_spin(&mca_lock); - if (mode == CMCI) - taskqueue_enqueue_fast(mca_tq, &mca_refill_task); + if (mode == CMCI && !cold) + taskqueue_enqueue(mca_tq, &mca_refill_task); } #ifdef DEV_APIC @@ -524,19 +537,15 @@ * cmc_throttle seconds or the periodic scan. If a periodic scan * finds that the threshold is too high, it is lowered. */ -static void -cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +static int +update_threshold(enum scan_mode mode, int valid, int last_intr, int count, + int cur_threshold, int max_threshold) { - struct cmc_state *cc; - uint64_t ctl; u_int delta; - int count, limit; + int limit; - /* Fetch the current limit for this bank. 
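On the mca.c side, the deleted TUNABLE_INT() lines are not lost functionality: CTLFLAG_RDTUN marks an OID as both a read-only sysctl and a loader tunable, and the sysctl registration fetches the matching kenv value itself, so the explicit fetch was redundant. Side by side:

    /* before: two declarations per knob */
    TUNABLE_INT("hw.mca.enabled", &mca_enabled);
    SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
        "Administrative toggle for machine check support");

    /* after: one declaration; the tunable is picked up at registration */
    SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
        "Administrative toggle for machine check support");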
*/ - cc = &cmc_state[PCPU_GET(cpuid)][bank]; - ctl = rdmsr(MSR_MC_CTL2(bank)); - count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; - delta = (u_int)(time_uptime - cc->last_intr); + delta = (u_int)(time_uptime - last_intr); + limit = cur_threshold; /* * If an interrupt was received less than cmc_throttle seconds @@ -545,16 +554,11 @@ * double the threshold up to the max. */ if (mode == CMCI && valid) { - limit = ctl & MC_CTL2_THRESHOLD; if (delta < cmc_throttle && count >= limit && - limit < cc->max_threshold) { - limit = min(limit << 1, cc->max_threshold); - ctl &= ~MC_CTL2_THRESHOLD; - ctl |= limit; - wrmsr(MSR_MC_CTL2(bank), ctl); + limit < max_threshold) { + limit = min(limit << 1, max_threshold); } - cc->last_intr = time_uptime; - return; + return (limit); } /* @@ -562,11 +566,11 @@ * should be lowered. */ if (mode != POLLED) - return; + return (limit); /* If a CMCI occured recently, do nothing for now. */ if (delta < cmc_throttle) - return; + return (limit); /* * Compute a new limit based on the average rate of events per @@ -573,20 +577,70 @@ * cmc_throttle seconds since the last interrupt. */ if (valid) { - count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; limit = count * cmc_throttle / delta; if (limit <= 0) limit = 1; - else if (limit > cc->max_threshold) - limit = cc->max_threshold; - } else + else if (limit > max_threshold) + limit = max_threshold; + } else { limit = 1; - if ((ctl & MC_CTL2_THRESHOLD) != limit) { + } + return (limit); +} + +static void +cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +{ + struct cmc_state *cc; + uint64_t ctl; + int cur_threshold, new_threshold; + int count; + + /* Fetch the current limit for this bank. */ + cc = &cmc_state[PCPU_GET(cpuid)][bank]; + ctl = rdmsr(MSR_MC_CTL2(bank)); + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + cur_threshold = ctl & MC_CTL2_THRESHOLD; + + new_threshold = update_threshold(mode, valid, cc->last_intr, count, + cur_threshold, cc->max_threshold); + + if (mode == CMCI && valid) + cc->last_intr = time_uptime; + if (new_threshold != cur_threshold) { ctl &= ~MC_CTL2_THRESHOLD; - ctl |= limit; + ctl |= new_threshold; wrmsr(MSR_MC_CTL2(bank), ctl); } } + +static void +amd_thresholding_update(enum scan_mode mode, int bank, int valid) +{ + struct amd_et_state *cc; + uint64_t misc; + int new_threshold; + int count; + + KASSERT(bank == MC_AMDNB_BANK, + ("%s: unexpected bank %d", __func__, bank)); + cc = &amd_et_state[PCPU_GET(cpuid)]; + misc = rdmsr(MSR_MC_MISC(bank)); + count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT; + count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold); + + new_threshold = update_threshold(mode, valid, cc->last_intr, count, + cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX); + + cc->cur_threshold = new_threshold; + misc &= ~MC_MISC_AMDNB_CNT_MASK; + misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold) + << MC_MISC_AMDNB_CNT_SHIFT; + misc &= ~MC_MISC_AMDNB_OVERFLOW; + wrmsr(MSR_MC_MISC(bank), misc); + if (mode == CMCI && valid) + cc->last_intr = time_uptime; +} #endif /* @@ -600,7 +654,7 @@ * count of the number of valid MC records found. */ static int -mca_scan(enum scan_mode mode) +mca_scan(enum scan_mode mode, int *recoverablep) { struct mca_record rec; uint64_t mcg_cap, ucmask; @@ -641,13 +695,19 @@ * If this is a bank this CPU monitors via CMCI, * update the threshold. 
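update_threshold() factors the Intel CMCI threshold policy out of cmci_update() so the new AMD path can reuse it. The policy is easier to see stated flatly (cmc_throttle defaults to 60 seconds; the numbers below are just a worked example):

    /*
     * CMCI context (mode == CMCI, valid record):
     *   if the previous CMCI was < 60 s ago and count >= threshold,
     *   double the threshold (1 -> 2 -> 4 -> ...) up to max_threshold,
     *   so a storm of corrected errors backs the interrupt rate off.
     *
     * Polled context (mode == POLLED), last CMCI >= 60 s ago:
     *   threshold = clamp(count * 60 / delta, 1, max_threshold)
     *   e.g. 5 corrected errors seen 300 s after the last interrupt
     *        give 5 * 60 / 300 = 1, decaying the threshold back down.
     */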
*/ - if (PCPU_GET(cmci_mask) & 1 << i) - cmci_update(mode, i, valid, &rec); + if (PCPU_GET(cmci_mask) & 1 << i) { + if (cmc_state != NULL) + cmci_update(mode, i, valid, &rec); + else + amd_thresholding_update(mode, i, valid); + } #endif } if (mode == POLLED) mca_fill_freelist(); - return (mode == MCE ? recoverable : count); + if (recoverablep != NULL) + *recoverablep = recoverable; + return (count); } /* @@ -669,7 +729,7 @@ CPU_FOREACH(cpu) { sched_bind(td, cpu); thread_unlock(td); - count += mca_scan(POLLED); + count += mca_scan(POLLED, NULL); thread_lock(td); sched_unbind(td); } @@ -690,7 +750,7 @@ mca_periodic_scan(void *arg) { - taskqueue_enqueue_fast(mca_tq, &mca_scan_task); + taskqueue_enqueue(mca_tq, &mca_scan_task); callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL); } @@ -704,7 +764,7 @@ if (error) return (error); if (i) - taskqueue_enqueue_fast(mca_tq, &mca_scan_task); + taskqueue_enqueue(mca_tq, &mca_scan_task); return (0); } @@ -717,6 +777,9 @@ mca_tq = taskqueue_create_fast("mca", M_WAITOK, taskqueue_thread_enqueue, &mca_tq); taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq"); + + /* CMCIs during boot may have claimed items from the freelist. */ + mca_fill_freelist(); } SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL); @@ -729,7 +792,11 @@ callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL); } +#ifdef EARLY_AP_STARTUP +SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL); +#else SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); +#endif #ifdef DEV_APIC static void @@ -747,6 +814,18 @@ &cmc_throttle, 0, sysctl_positive_int, "I", "Interval in seconds to throttle corrected MC interrupts"); } + +static void +amd_thresholding_setup(void) +{ + + amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state), + M_MCA, M_WAITOK | M_ZERO); + SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, + "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &cmc_throttle, 0, sysctl_positive_int, "I", + "Interval in seconds to throttle corrected MC interrupts"); +} #endif static void @@ -785,6 +864,8 @@ #ifdef DEV_APIC if (mcg_cap & MCG_CAP_CMCI_P) cmci_setup(); + else if (amd_thresholding_supported()) + amd_thresholding_setup(); #endif } @@ -859,6 +940,82 @@ ctl |= MC_CTL2_CMCI_EN | 1; wrmsr(MSR_MC_CTL2(i), ctl); } + +static void +amd_thresholding_start(struct amd_et_state *cc) +{ + uint64_t misc; + + KASSERT(amd_elvt >= 0, ("ELVT offset is not set")); + misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK)); + misc &= ~MC_MISC_AMDNB_INT_MASK; + misc |= MC_MISC_AMDNB_INT_LVT; + misc &= ~MC_MISC_AMDNB_LVT_MASK; + misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT; + misc &= ~MC_MISC_AMDNB_CNT_MASK; + misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold) + << MC_MISC_AMDNB_CNT_SHIFT; + misc &= ~MC_MISC_AMDNB_OVERFLOW; + misc |= MC_MISC_AMDNB_CNTEN; + + wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc); +} + +static void +amd_thresholding_init(void) +{ + struct amd_et_state *cc; + uint64_t misc; + + /* The counter must be valid and present. */ + misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK)); + if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) != + (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) + return; + + /* The register should not be locked. */ + if ((misc & MC_MISC_AMDNB_LOCK) != 0) + return; + + /* + * If counter is enabled then either the firmware or another CPU + * has already claimed it. 
+ */ + if ((misc & MC_MISC_AMDNB_CNTEN) != 0) + return; + + /* + * Configure an Extended Interrupt LVT register for reporting + * counter overflows if that feature is supported and the first + * extended register is available. + */ + amd_elvt = lapic_enable_mca_elvt(); + if (amd_elvt < 0) + return; + + /* Re-use Intel CMC support infrastructure. */ + cc = &amd_et_state[PCPU_GET(cpuid)]; + cc->cur_threshold = 1; + amd_thresholding_start(cc); + + /* Mark the NB bank as monitored. */ + PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK); +} + +static void +amd_thresholding_resume(void) +{ + struct amd_et_state *cc; + + /* Nothing to do if this CPU doesn't monitor the NB bank. */ + if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0) + return; + + cc = &amd_et_state[PCPU_GET(cpuid)]; + cc->last_intr = 0; + cc->cur_threshold = 1; + amd_thresholding_start(cc); +} #endif /* @@ -884,7 +1041,7 @@ if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); - if (PCPU_GET(cpuid) == 0 && boot) + if (IS_BSP() && boot) mca_setup(mcg_cap); /* @@ -900,6 +1057,14 @@ if ((mask & (1UL << 5)) == 0) wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5)); } + + /* + * The cmci_monitor() must not be executed + * simultaneously by several CPUs. + */ + if (boot) + mtx_lock_spin(&mca_lock); + for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { /* By default enable logging of all errors. */ ctl = 0xffffffffffffffffUL; @@ -934,10 +1099,30 @@ /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } + if (boot) + mtx_unlock_spin(&mca_lock); #ifdef DEV_APIC - if (PCPU_GET(cmci_mask) != 0 && boot) + /* + * AMD Processors from families 10h - 16h provide support + * for Machine Check Error Thresholding. + * The processors support counters of MC errors and they + * can be configured to generate an interrupt when a counter + * overflows. + * The counters are all associated with Bank 4 and each + * of them covers a group of errors reported via that bank. + * At the moment only the DRAM Error Threshold Group is + * supported. + */ + if (amd_thresholding_supported() && + (mcg_cap & MCG_CAP_COUNT) >= 4) { + if (boot) + amd_thresholding_init(); + else + amd_thresholding_resume(); + } else if (PCPU_GET(cmci_mask) != 0 && boot) { lapic_enable_cmc(); + } #endif } @@ -978,7 +1163,7 @@ mca_intr(void) { uint64_t mcg_status; - int old_count, recoverable; + int recoverable, count; if (!(cpu_feature & CPUID_MCA)) { /* @@ -992,8 +1177,7 @@ } /* Scan the banks and check for any non-recoverable errors. */ - old_count = mca_count; - recoverable = mca_scan(MCE); + count = mca_scan(MCE, &recoverable); mcg_status = rdmsr(MSR_MCG_STATUS); if (!(mcg_status & MCG_STATUS_RIPV)) recoverable = 0; @@ -1000,12 +1184,11 @@ if (!recoverable) { /* - * Wait for at least one error to be logged before - * panic'ing. Some errors will assert a machine check - * on all CPUs, but only certain CPUs will find a valid - * bank to log. + * Only panic if the error was detected local to this CPU. + * Some errors will assert a machine check on all CPUs, but + * only certain CPUs will find a valid bank to log. */ - while (mca_count == old_count) + while (count == 0) cpu_spinwait(); panic("Unrecoverable machine check exception"); @@ -1027,7 +1210,7 @@ * Serialize MCA bank scanning to prevent collisions from * sibling threads. */ - count = mca_scan(CMCI); + count = mca_scan(CMCI, NULL); /* If we found anything, log them to the console. 
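The AMD bank-4 thresholding counter counts up and raises its ELVT interrupt when the ErrCnt field overflows past MC_MISC_AMDNB_CNT_MAX, so "interrupt me after N more corrected errors" is expressed by seeding the counter with CNT_MAX - N. That is all the arithmetic in amd_thresholding_start() and amd_thresholding_update() is doing:

    /* arm: overflow (and interrupt) after cur_threshold more errors */
    misc &= ~MC_MISC_AMDNB_CNT_MASK;
    misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
        << MC_MISC_AMDNB_CNT_SHIFT;
    misc &= ~MC_MISC_AMDNB_OVERFLOW;    /* clear the sticky overflow bit */
    misc |= MC_MISC_AMDNB_CNTEN;
    wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);

    /* read back: corrected errors seen since arming */
    count = ((misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT) -
        (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);

The resulting count then feeds the same update_threshold() policy shared with the Intel CMCI path.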
*/ if (count != 0) { Added: trunk/sys/x86/x86/mp_watchdog.c =================================================================== --- trunk/sys/x86/x86/mp_watchdog.c (rev 0) +++ trunk/sys/x86/x86/mp_watchdog.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,211 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2004 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/x86/mp_watchdog.c 303912 2016-08-10 13:38:44Z kib $ + */ + +#include "opt_mp_watchdog.h" +#include "opt_sched.h" + +#ifdef SCHED_ULE +#error MP_WATCHDOG cannot currently be used with SCHED_ULE +#endif + +#include <sys/param.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <machine/smp.h> +#include <x86/apicreg.h> +#include <x86/apicvar.h> +#include <machine/mp_watchdog.h> + +/* + * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work + * from being scheduled there, and uses it as a "watchdog" to detect kernel + * failure on other CPUs. This is made reasonable by inclusion of logical + * processors in Xeon hardware. The watchdog is configured by setting the + * debug.watchdog sysctl/tunable to the CPU of interest. A callout will then + * begin executing reseting a timer that is gradually lowered by the watching + * thread. If the timer reaches 0, the watchdog fires by ether dropping + * directly to the debugger, or by sending an NMI IPI to the boot processor. + * This is a somewhat less efficient substitute for dedicated watchdog + * hardware, but can be quite an effective tool for debugging hangs. + * + * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but + * doesn't yet. + */ +static int watchdog_cpu = -1; +static int watchdog_dontfire = 1; +static int watchdog_timer = -1; +static int watchdog_nmi = 1; + +SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RWTUN, &watchdog_nmi, 0, + "IPI the boot processor with an NMI to enter the debugger"); + +static struct callout watchdog_callout; + +static void watchdog_change(int wdcpu); + +/* + * Number of seconds before the watchdog will fire if the callout fails to + * reset the timer. 
+ */ +#define WATCHDOG_THRESHOLD 10 + +static void +watchdog_init(void *arg) +{ + + callout_init(&watchdog_callout, 1); + if (watchdog_cpu != -1) + watchdog_change(watchdog_cpu); +} + +/* + * This callout resets a timer until the watchdog kicks in. It acquires some + * critical locks to make sure things haven't gotten wedged with those locks + * held. + */ +static void +watchdog_function(void *arg) +{ + + /* + * Since the timer ran, we must not be wedged. Acquire some critical + * locks to make sure. Then reset the timer. + */ + mtx_lock(&Giant); + watchdog_timer = WATCHDOG_THRESHOLD; + mtx_unlock(&Giant); + callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); +} +SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL); + +static void +watchdog_change(int wdcpu) +{ + + if (wdcpu == -1 || wdcpu == 0xffffffff) { + /* + * Disable the watchdog. + */ + watchdog_cpu = -1; + watchdog_dontfire = 1; + callout_stop(&watchdog_callout); + printf("watchdog stopped\n"); + } else { + watchdog_timer = WATCHDOG_THRESHOLD; + watchdog_dontfire = 0; + watchdog_cpu = wdcpu; + callout_reset(&watchdog_callout, 1 * hz, watchdog_function, + NULL); + } +} + +/* + * This sysctl sets which CPU is the watchdog CPU. Set to -1 or 0xffffffff + * to disable the watchdog. + */ +static int +sysctl_watchdog(SYSCTL_HANDLER_ARGS) +{ + int error, temp; + + temp = watchdog_cpu; + error = sysctl_handle_int(oidp, &temp, 0, req); + if (error) + return (error); + + if (req->newptr != NULL) + watchdog_change(temp); + return (0); +} +SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0, + sysctl_watchdog, "I", ""); + +/* + * Drop into the debugger by sending an IPI NMI to the boot processor. + */ +static void +watchdog_ipi_nmi(void) +{ + + /* + * Deliver NMI to the boot processor. Why not? + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI, + boot_cpu_id); + lapic_ipi_wait(-1); +} + +/* + * ap_watchdog() is called by the SMP idle loop code. It works on the same + * premise that the disabling of logical processors does: that if the cpu is + * idle, then it can ignore the world from then on, as nothing will be + * scheduled on it. Leaving aside multi-runqueue schedulers (SCHED_ULE) and + * explicit process migration (sched_bind()), this is not an unreasonable + * assumption. + */ +void +ap_watchdog(u_int cpuid) +{ + char old_pcomm[MAXCOMLEN + 1]; + struct proc *p; + + if (watchdog_cpu != cpuid) + return; + + printf("watchdog started on cpu %d\n", cpuid); + p = curproc; + bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1); + snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid); + while (1) { + DELAY(1000000); /* One second. 
*/ + if (watchdog_cpu != cpuid) + break; + atomic_subtract_int(&watchdog_timer, 1); + if (watchdog_timer < 4) + printf("Watchdog timer: %d\n", watchdog_timer); + if (watchdog_timer == 0 && watchdog_dontfire == 0) { + printf("Watchdog firing!\n"); + watchdog_dontfire = 1; + if (watchdog_nmi) + watchdog_ipi_nmi(); + else + kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog"); + } + } + bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1); + printf("watchdog stopped on cpu %d\n", cpuid); +} Property changes on: trunk/sys/x86/x86/mp_watchdog.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/x86/mp_x86.c =================================================================== --- trunk/sys/x86/x86/mp_x86.c (rev 0) +++ trunk/sys/x86/x86/mp_x86.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,1640 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1996, by Steve Passe + * Copyright (c) 2003, by Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 349958 2019-07-12 22:31:12Z jhb $"); + +#ifdef __i386__ +#include "opt_apic.h" +#endif +#include "opt_cpu.h" +#include "opt_kstack_pages.h" +#include "opt_pmap.h" +#include "opt_sched.h" +#include "opt_smp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cons.h> /* cngetc() */ +#include <sys/cpuset.h> +#ifdef GPROF +#include <sys/gmon.h> +#endif +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> + +#include <x86/apicreg.h> +#include <machine/clock.h> +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <x86/mca.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <x86/ucode.h> + +/* lock region used by kernel profiling */ +int mcount_lock; + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ + +extern struct pcpu __pcpu[]; + +/* AP uses this during bootstrap. Do not staticize. */ +char *bootSTK; +int bootAP; + +/* Free these after use */ +void *bootstacks[MAXCPU]; +void *dpcpu; + +struct pcb stoppcbs[MAXCPU]; +struct susppcb **susppcbs; + +#ifdef COUNT_IPIS +/* Interrupt counts. */ +static u_long *ipi_preempt_counts[MAXCPU]; +static u_long *ipi_ast_counts[MAXCPU]; +u_long *ipi_invltlb_counts[MAXCPU]; +u_long *ipi_invlrng_counts[MAXCPU]; +u_long *ipi_invlpg_counts[MAXCPU]; +u_long *ipi_invlcache_counts[MAXCPU]; +u_long *ipi_rendezvous_counts[MAXCPU]; +static u_long *ipi_hardclock_counts[MAXCPU]; +#endif + +/* Default cpu_ops implementation. */ +struct cpu_ops cpu_ops; + +/* + * Local data and functions. + */ + +static volatile cpuset_t ipi_stop_nmi_pending; + +volatile cpuset_t resuming_cpus; +volatile cpuset_t toresume_cpus; + +/* used to hold the AP's until we are ready to release them */ +struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. + */ +struct cpu_info cpu_info[MAX_APIC_ID + 1]; +int apic_cpuids[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; + +/* Holds pending bitmap based IPIs per CPU */ +volatile u_int cpu_ipi_pending[MAXCPU]; + +static void release_aps(void *dummy); +static void cpustop_handler_post(u_int cpu); + +static int hyperthreading_allowed = 1; +SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, + &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); + +static struct topo_node topo_root; + +static int pkg_id_shift; +static int core_id_shift; +static int disabled_cpus; + +struct cache_info { + int id_shift; + int present; +} static caches[MAX_CACHE_LEVELS]; + +void +mem_range_AP_init(void) +{ + + if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) + mem_range_softc.mr_op->initAP(&mem_range_softc); +} + +/* + * Round up to the next power of two, if necessary, and then + * take log2. + * Returns -1 if argument is zero. 
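As a standalone worked example (not from the patch itself) of what the mask_width() helper defined just below computes: the number of low APIC ID bits needed to enumerate n items, i.e. ceil(log2(n)), with -1 for n == 0. The width() function here is a portable equivalent for small counts, not the kernel's fls()-based implementation.

    #include <stdio.h>

    static int
    width(unsigned int n)
    {
            int bits;

            if (n == 0)
                    return (-1);
            for (bits = 0; (1u << bits) < n; bits++)
                    ;
            return (bits);
    }

    int
    main(void)
    {

            printf("%d %d %d %d %d\n", width(1), width(2), width(3),
                width(6), width(8));        /* prints: 0 1 2 3 3 */
            return (0);
    }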
+ */ +static __inline int +mask_width(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} + +/* + * Add a cache level to the cache topology description. + */ +static int +add_deterministic_cache(int type, int level, int share_count) +{ + + if (type == 0) + return (0); + if (type > 3) { + printf("unexpected cache type %d\n", type); + return (1); + } + if (type == 2) /* ignore instruction cache */ + return (1); + if (level == 0 || level > MAX_CACHE_LEVELS) { + printf("unexpected cache level %d\n", type); + return (1); + } + + if (caches[level - 1].present) { + printf("WARNING: multiple entries for L%u data cache\n", level); + printf("%u => %u\n", caches[level - 1].id_shift, + mask_width(share_count)); + } + caches[level - 1].id_shift = mask_width(share_count); + caches[level - 1].present = 1; + + if (caches[level - 1].id_shift > pkg_id_shift) { + printf("WARNING: L%u data cache covers more " + "APIC IDs than a package\n", level); + printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift); + caches[level - 1].id_shift = pkg_id_shift; + } + if (caches[level - 1].id_shift < core_id_shift) { + printf("WARNING: L%u data cache covers less " + "APIC IDs than a core\n", level); + printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift); + caches[level - 1].id_shift = core_id_shift; + } + + return (1); +} + +/* + * Determine topology of processing units and caches for AMD CPUs. + * See: + * - AMD CPUID Specification (Publication # 25481) + * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) + * - BKDG For AMD Family 10h Processors (Publication # 31116) + * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) + * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) + * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) + */ +static void +topo_probe_amd(void) +{ + u_int p[4]; + uint64_t v; + int level; + int nodes_per_socket; + int share_count; + int type; + int i; + + /* No multi-core capability. */ + if ((amd_feature2 & AMDID2_CMP) == 0) + return; + + /* For families 10h and newer. */ + pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> + AMDID_COREID_SIZE_SHIFT; + + /* For 0Fh family. */ + if (pkg_id_shift == 0) + pkg_id_shift = + mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); + + /* + * Families prior to 16h define the following value as + * cores per compute unit and we don't really care about the AMD + * compute units at the moment. Perhaps we should treat them as + * cores and cores within the compute units as hardware threads, + * but that's up for debate. + * Later families define the value as threads per compute unit, + * so we are following AMD's nomenclature here. 
+ */ + if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && + CPUID_TO_FAMILY(cpu_id) >= 0x16) { + cpuid_count(0x8000001e, 0, p); + share_count = ((p[1] >> 8) & 0xff) + 1; + core_id_shift = mask_width(share_count); + } + + if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { + for (i = 0; ; i++) { + cpuid_count(0x8000001d, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; + } + } else { + if (cpu_exthigh >= 0x80000005) { + cpuid_count(0x80000005, 0, p); + if (((p[2] >> 24) & 0xff) != 0) { + caches[0].id_shift = 0; + caches[0].present = 1; + } + } + if (cpu_exthigh >= 0x80000006) { + cpuid_count(0x80000006, 0, p); + if (((p[2] >> 16) & 0xffff) != 0) { + caches[1].id_shift = 0; + caches[1].present = 1; + } + if (((p[3] >> 18) & 0x3fff) != 0) { + nodes_per_socket = 1; + if ((amd_feature2 & AMDID2_NODE_ID) != 0) { + /* + * Handle multi-node processors that + * have multiple chips, each with its + * own L3 cache, on the same die. + */ + v = rdmsr(0xc001100c); + nodes_per_socket = 1 + ((v >> 3) & 0x7); + } + caches[2].id_shift = + pkg_id_shift - mask_width(nodes_per_socket); + caches[2].present = 1; + } + } + } +} + +/* + * Determine topology of processing units for Intel CPUs + * using CPUID Leaf 1 and Leaf 4, if supported. + * See: + * - Intel 64 Architecture Processor Topology Enumeration + * - Intel 64 and IA-32 ArchitecturesSoftware Developer?s Manual, + * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS + * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS + */ +static void +topo_probe_intel_0x4(void) +{ + u_int p[4]; + int max_cores; + int max_logical; + + /* Both zero and one here mean one logical processor per package. */ + max_logical = (cpu_feature & CPUID_HTT) != 0 ? + (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; + if (max_logical <= 1) + return; + + if (cpu_high >= 0x4) { + cpuid_count(0x04, 0, p); + max_cores = ((p[0] >> 26) & 0x3f) + 1; + } else + max_cores = 1; + + core_id_shift = mask_width(max_logical/max_cores); + KASSERT(core_id_shift >= 0, + ("intel topo: max_cores > max_logical\n")); + pkg_id_shift = core_id_shift + mask_width(max_cores); +} + +/* + * Determine topology of processing units for Intel CPUs + * using CPUID Leaf 11, if supported. + * See: + * - Intel 64 Architecture Processor Topology Enumeration + * - Intel 64 and IA-32 ArchitecturesSoftware Developer?s Manual, + * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS + * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS + */ +static void +topo_probe_intel_0xb(void) +{ + u_int p[4]; + int bits; + int type; + int i; + + /* Fall back if CPU leaf 11 doesn't really exist. */ + cpuid_count(0x0b, 0, p); + if (p[1] == 0) { + topo_probe_intel_0x4(); + return; + } + + /* We only support three levels for now. */ + for (i = 0; ; i++) { + cpuid_count(0x0b, i, p); + + bits = p[0] & 0x1f; + type = (p[2] >> 8) & 0xff; + + if (type == 0) + break; + + /* TODO: check for duplicate (re-)assignment */ + if (type == CPUID_TYPE_SMT) + core_id_shift = bits; + else if (type == CPUID_TYPE_CORE) + pkg_id_shift = bits; + else + printf("unknown CPU level type %d\n", type); + } + + if (pkg_id_shift < core_id_shift) { + printf("WARNING: core covers more APIC IDs than a package\n"); + core_id_shift = pkg_id_shift; + } +} + +/* + * Determine topology of caches for Intel CPUs. 
+ * See: + * - Intel 64 Architecture Processor Topology Enumeration + * - Intel 64 and IA-32 Architectures Software Developer?s Manual + * Volume 2A: Instruction Set Reference, A-M, + * CPUID instruction + */ +static void +topo_probe_intel_caches(void) +{ + u_int p[4]; + int level; + int share_count; + int type; + int i; + + if (cpu_high < 0x4) { + /* + * Available cache level and sizes can be determined + * via CPUID leaf 2, but that requires a huge table of hardcoded + * values, so for now just assume L1 and L2 caches potentially + * shared only by HTT processing units, if HTT is present. + */ + caches[0].id_shift = pkg_id_shift; + caches[0].present = 1; + caches[1].id_shift = pkg_id_shift; + caches[1].present = 1; + return; + } + + for (i = 0; ; i++) { + cpuid_count(0x4, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; + } +} + +/* + * Determine topology of processing units and caches for Intel CPUs. + * See: + * - Intel 64 Architecture Processor Topology Enumeration + */ +static void +topo_probe_intel(void) +{ + + /* + * Note that 0x1 <= cpu_high < 4 case should be + * compatible with topo_probe_intel_0x4() logic when + * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) + * or it should trigger the fallback otherwise. + */ + if (cpu_high >= 0xb) + topo_probe_intel_0xb(); + else if (cpu_high >= 0x1) + topo_probe_intel_0x4(); + + topo_probe_intel_caches(); +} + +/* + * Topology information is queried only on BSP, on which this + * code runs and for which it can query CPUID information. + * Then topology is extrapolated on all packages using an + * assumption that APIC ID to hardware component ID mapping is + * homogenious. + * That doesn't necesserily imply that the topology is uniform. + */ +void +topo_probe(void) +{ + static int cpu_topo_probed = 0; + struct x86_topo_layer { + int type; + int subtype; + int id_shift; + } topo_layers[MAX_CACHE_LEVELS + 3]; + struct topo_node *parent; + struct topo_node *node; + int layer; + int nlayers; + int node_id; + int i; + + if (cpu_topo_probed) + return; + + CPU_ZERO(&logical_cpus_mask); + + if (mp_ncpus <= 1) + ; /* nothing */ + else if (cpu_vendor_id == CPU_VENDOR_AMD) + topo_probe_amd(); + else if (cpu_vendor_id == CPU_VENDOR_INTEL) + topo_probe_intel(); + + KASSERT(pkg_id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + nlayers = 0; + bzero(topo_layers, sizeof(topo_layers)); + + topo_layers[nlayers].type = TOPO_TYPE_PKG; + topo_layers[nlayers].id_shift = pkg_id_shift; + if (bootverbose) + printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); + nlayers++; + + /* + * Consider all caches to be within a package/chip + * and "in front" of all sub-components like + * cores and hardware threads. 
+ */ + for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { + if (caches[i].present) { + KASSERT(caches[i].id_shift <= pkg_id_shift, + ("bug in APIC topology discovery")); + KASSERT(caches[i].id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + topo_layers[nlayers].type = TOPO_TYPE_CACHE; + topo_layers[nlayers].subtype = i + 1; + topo_layers[nlayers].id_shift = caches[i].id_shift; + if (bootverbose) + printf("L%u cache ID shift: %u\n", + topo_layers[nlayers].subtype, + topo_layers[nlayers].id_shift); + nlayers++; + } + } + + if (pkg_id_shift > core_id_shift) { + topo_layers[nlayers].type = TOPO_TYPE_CORE; + topo_layers[nlayers].id_shift = core_id_shift; + if (bootverbose) + printf("Core ID shift: %u\n", + topo_layers[nlayers].id_shift); + nlayers++; + } + + topo_layers[nlayers].type = TOPO_TYPE_PU; + topo_layers[nlayers].id_shift = 0; + nlayers++; + + topo_init_root(&topo_root); + for (i = 0; i <= MAX_APIC_ID; ++i) { + if (!cpu_info[i].cpu_present) + continue; + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = i >> topo_layers[layer].id_shift; + parent = topo_add_node_by_hwid(parent, node_id, + topo_layers[layer].type, + topo_layers[layer].subtype); + } + } + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = boot_cpu_id >> topo_layers[layer].id_shift; + node = topo_find_node_by_hwid(parent, node_id, + topo_layers[layer].type, + topo_layers[layer].subtype); + topo_promote_child(node); + parent = node; + } + + cpu_topo_probed = 1; +} + +/* + * Assign logical CPU IDs to local APICs. + */ +void +assign_cpu_ids(void) +{ + struct topo_node *node; + u_int smt_mask; + + smt_mask = (1u << core_id_shift) - 1; + + /* + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 is always assigned to the BSP. + */ + mp_ncpus = 0; + TOPO_FOREACH(node, &topo_root) { + if (node->type != TOPO_TYPE_PU) + continue; + + if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) + cpu_info[node->hwid].cpu_hyperthread = 1; + + if (resource_disabled("lapic", node->hwid)) { + if (node->hwid != boot_cpu_id) + cpu_info[node->hwid].cpu_disabled = 1; + else + printf("Cannot disable BSP, APIC ID = %d\n", + node->hwid); + } + + if (!hyperthreading_allowed && + cpu_info[node->hwid].cpu_hyperthread) + cpu_info[node->hwid].cpu_disabled = 1; + + if (mp_ncpus >= MAXCPU) + cpu_info[node->hwid].cpu_disabled = 1; + + if (cpu_info[node->hwid].cpu_disabled) { + disabled_cpus++; + continue; + } + + cpu_apic_ids[mp_ncpus] = node->hwid; + apic_cpuids[node->hwid] = mp_ncpus; + topo_set_pu_id(node, mp_ncpus); + mp_ncpus++; + } + + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * Print various information about the SMP system hardware and setup. 
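To make the shift arithmetic used by topo_probe() above concrete, here is a small sketch (not part of the diff) of how an APIC ID decomposes into package, core and thread IDs. The shift values (pkg_id_shift = 4, core_id_shift = 1, i.e. 16 APIC IDs per package and 2 hardware threads per core) and the APIC ID 0x0b are hypothetical examples only.

    #include <stdio.h>

    int
    main(void)
    {
            unsigned int apic_id = 0x0b;
            int pkg_id_shift = 4, core_id_shift = 1;

            printf("APIC ID %#x: package %u, core %u, thread %u\n",
                apic_id, apic_id >> pkg_id_shift, apic_id >> core_id_shift,
                apic_id & ((1u << core_id_shift) - 1));
            return (0);
    }

This prints "APIC ID 0xb: package 0, core 5, thread 1", which is exactly the node_id computation done per layer in the topo_layers loop.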
+ */ +void +cpu_mp_announce(void) +{ + struct topo_node *node; + const char *hyperthread; + int pkg_count; + int cores_per_pkg; + int thrs_per_core; + + printf("FreeBSD/SMP: "); + if (topo_analyze(&topo_root, 1, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + + if (disabled_cpus) { + printf("FreeBSD/SMP Online: "); + if (topo_analyze(&topo_root, 0, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + } + + if (!bootverbose) + return; + + TOPO_FOREACH(node, &topo_root) { + switch (node->type) { + case TOPO_TYPE_PKG: + printf("Package HW ID = %u (%#x)\n", + node->hwid, node->hwid); + break; + case TOPO_TYPE_CORE: + printf("\tCore HW ID = %u (%#x)\n", + node->hwid, node->hwid); + break; + case TOPO_TYPE_PU: + if (cpu_info[node->hwid].cpu_hyperthread) + hyperthread = "/HT"; + else + hyperthread = ""; + + if (node->subtype == 0) + printf("\t\tCPU (AP%s): APIC ID: %u (%#x)" + "(disabled)\n", hyperthread, node->hwid, + node->hwid); + else if (node->id == 0) + printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n", + node->hwid, node->hwid); + else + printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n", + node->id, hyperthread, node->hwid, + node->hwid); + break; + default: + /* ignored */ + break; + } + } +} + +/* + * Add a scheduling group, a group of logical processors sharing + * a particular cache (and, thus having an affinity), to the scheduling + * topology. + * This function recursively works on lower level caches. + */ +static void +x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) +{ + struct topo_node *node; + int nchildren; + int ncores; + int i; + + KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE, + ("x86topo_add_sched_group: bad type: %u", root->type)); + CPU_COPY(&root->cpuset, &cg_root->cg_mask); + cg_root->cg_count = root->cpu_count; + if (root->type == TOPO_TYPE_SYSTEM) + cg_root->cg_level = CG_SHARE_NONE; + else + cg_root->cg_level = root->subtype; + + /* + * Check how many core nodes we have under the given root node. + * If we have multiple logical processors, but not multiple + * cores, then those processors must be hardware threads. + */ + ncores = 0; + node = root; + while (node != NULL) { + if (node->type != TOPO_TYPE_CORE) { + node = topo_next_node(root, node); + continue; + } + + ncores++; + node = topo_next_nonchild_node(root, node); + } + + if (cg_root->cg_level != CG_SHARE_NONE && + root->cpu_count > 1 && ncores < 2) + cg_root->cg_flags = CG_FLAG_SMT; + + /* + * Find out how many cache nodes we have under the given root node. + * We ignore cache nodes that cover all the same processors as the + * root node. Also, we do not descend below found cache nodes. + * That is, we count top-level "non-redundant" caches under the root + * node. 
+ */ + nchildren = 0; + node = root; + while (node != NULL) { + if (node->type != TOPO_TYPE_CACHE || + (root->type != TOPO_TYPE_SYSTEM && + CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + node = topo_next_node(root, node); + continue; + } + nchildren++; + node = topo_next_nonchild_node(root, node); + } + + cg_root->cg_child = smp_topo_alloc(nchildren); + cg_root->cg_children = nchildren; + + /* + * Now find again the same cache nodes as above and recursively + * build scheduling topologies for them. + */ + node = root; + i = 0; + while (node != NULL) { + if (node->type != TOPO_TYPE_CACHE || + (root->type != TOPO_TYPE_SYSTEM && + CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + node = topo_next_node(root, node); + continue; + } + cg_root->cg_child[i].cg_parent = cg_root; + x86topo_add_sched_group(node, &cg_root->cg_child[i]); + i++; + node = topo_next_nonchild_node(root, node); + } +} + +/* + * Build the MI scheduling topology from the discovered hardware topology. + */ +struct cpu_group * +cpu_topo(void) +{ + struct cpu_group *cg_root; + + if (mp_ncpus <= 1) + return (smp_topo_none()); + + cg_root = smp_topo_alloc(1); + x86topo_add_sched_group(&topo_root, cg_root); + return (cg_root); +} + + +/* + * Add a logical CPU to the topology. + */ +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id > MAX_APIC_ID) { + panic("SMP: APIC ID %d too high", apic_id); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + if (mp_ncpus < MAXCPU) { + mp_ncpus++; + mp_maxid = mp_ncpus - 1; + } + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); +} + +void +cpu_mp_setmaxid(void) +{ + + /* + * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). + * If there were no calls to cpu_add() assume this is a UP system. + */ + if (mp_ncpus == 0) + mp_ncpus = 1; +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + CPU_SETOF(0, &all_cpus); + return (mp_ncpus > 1); +} + +/* + * AP CPU's call this to initialize themselves. + */ +void +init_secondary_tail(void) +{ + u_int cpuid; + + pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); + + /* + * On real hardware, switch to x2apic mode if possible. Do it + * after aps_ready was signalled, to avoid manipulating the + * mode while BSP might still want to send some IPI to us + * (second startup IPI is ignored on modern hardware etc). + */ + lapic_xapic_mode(); + + /* Initialize the PAT MSR. */ + pmap_init_pat(); + + /* set up CPU registers and state */ + cpu_setregs(); + + /* set up SSE/NX */ + initializecpu(); + + /* set up FPU state on the AP */ +#ifdef __amd64__ + fpuinit(); +#else + npxinit(false); +#endif + + if (cpu_ops.cpu_init) + cpu_ops.cpu_init(); + + /* A quick check from sanity claus */ + cpuid = PCPU_GET(cpuid); + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", cpuid); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + panic("cpuid mismatch! boom!!"); + } + + /* Initialize curthread. 
*/ + KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); + PCPU_SET(curthread, PCPU_GET(idlethread)); + + mtx_lock_spin(&ap_boot_mtx); + + mca_init(); + + /* Init local apic for irq's */ + lapic_setup(1); + + /* Set memory range attributes for this CPU to match the BSP */ + mem_range_AP_init(); + + smp_cpus++; + + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); + printf("SMP: AP CPU #%d Launched!\n", cpuid); + + /* Determine if we are a logical CPU. */ + if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) + CPU_SET(cpuid, &logical_cpus_mask); + + if (bootverbose) + lapic_dump("AP"); + + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + } + +#ifdef __amd64__ + /* + * Enable global pages TLB extension + * This also implicitly flushes the TLB + */ + load_cr4(rcr4() | CR4_PGE); + if (pmap_pcid_enabled) + load_cr4(rcr4() | CR4_PCIDE); + load_ds(_udatasel); + load_es(_udatasel); + load_fs(_ufssel); +#endif + + mtx_unlock_spin(&ap_boot_mtx); + + /* Wait until all the AP's are up. */ + while (atomic_load_acq_int(&smp_started) == 0) + ia32_pause(); + +#ifndef EARLY_AP_STARTUP + /* Start per-CPU event timers. */ + cpu_initclocks_ap(); +#endif + + sched_throw(NULL); + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * We tell the I/O APIC code about all the CPUs we want to receive + * interrupts. If we don't want certain CPUs to receive IRQs we + * can simply not tell the I/O APIC code about them in this function. + * We also do not tell it about the BSP since it tells itself about + * the BSP internally to work with UP kernels and on UP machines. + */ +void +set_interrupt_apic_ids(void) +{ + u_int i, apic_id; + + for (i = 0; i < MAXCPU; i++) { + apic_id = cpu_apic_ids[i]; + if (apic_id == -1) + continue; + if (cpu_info[apic_id].cpu_bsp) + continue; + if (cpu_info[apic_id].cpu_disabled) + continue; + + /* Don't let hyperthreads service interrupts. */ + if (cpu_info[apic_id].cpu_hyperthread) + continue; + + intr_add_cpu(i); + } +} + + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); +#endif /* COUNT_XINVLTLB_HITS */ + +/* + * Init and startup IPI. + */ +void +ipi_startup(int apic_id, int vector) +{ + + /* + * This attempts to follow the algorithm described in the + * Intel Multiprocessor Specification v1.4 in section B.4. + * For each IPI, we allow the local APIC ~20us to deliver the + * IPI. If that times out, we panic. + */ + + /* + * first we do an INIT IPI: this INIT IPI might be run, resetting + * and running the target CPU. 
OR this INIT IPI might be latched (P5 + * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be + * ignored. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); + lapic_ipi_wait(100); + + /* Explicitly deassert the INIT IPI. */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, + apic_id); + + DELAY(10000); /* wait ~10mS */ + + /* + * next we do a STARTUP IPI: the previous INIT IPI might still be + * latched, (P5 bug) this 1st STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. OR + * the previous INIT IPI has already run. and this STARTUP IPI will + * run. OR the previous INIT IPI was ignored. and this STARTUP IPI + * will run. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + if (!lapic_ipi_wait(100)) + panic("Failed to deliver first STARTUP IPI to APIC %d", + apic_id); + DELAY(200); /* wait ~200uS */ + + /* + * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF + * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR + * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + if (!lapic_ipi_wait(100)) + panic("Failed to deliver second STARTUP IPI to APIC %d", + apic_id); + + DELAY(200); /* wait ~200uS */ +} + +/* + * Send an IPI to specified CPU handling the bitmap logic. + */ +void +ipi_send_cpu(int cpu, u_int ipi) +{ + u_int bitmap, old_pending, new_pending; + + KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); + + if (IPI_IS_BITMAPED(ipi)) { + bitmap = 1 << ipi; + ipi = IPI_BITMAP_VECTOR; + do { + old_pending = cpu_ipi_pending[cpu]; + new_pending = old_pending | bitmap; + } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], + old_pending, new_pending)); + if (old_pending) + return; + } + lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); +} + +void +ipi_bitmap_handler(struct trapframe frame) +{ + struct trapframe *oldframe; + struct thread *td; + int cpu = PCPU_GET(cpuid); + u_int ipi_bitmap; + + critical_enter(); + td = curthread; + td->td_intr_nesting_level++; + oldframe = td->td_intr_frame; + td->td_intr_frame = &frame; + ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); + if (ipi_bitmap & (1 << IPI_PREEMPT)) { +#ifdef COUNT_IPIS + (*ipi_preempt_counts[cpu])++; +#endif + sched_preempt(td); + } + if (ipi_bitmap & (1 << IPI_AST)) { +#ifdef COUNT_IPIS + (*ipi_ast_counts[cpu])++; +#endif + /* Nothing to do for AST */ + } + if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { +#ifdef COUNT_IPIS + (*ipi_hardclock_counts[cpu])++; +#endif + hardclockintr(); + } + td->td_intr_frame = oldframe; + td->td_intr_nesting_level--; + critical_exit(); +} + +/* + * send an IPI to a set of cpus. + */ +void +ipi_selected(cpuset_t cpus, u_int ipi) +{ + int cpu; + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. 
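A minimal sketch (editorial, not from the patch) of the coalescing done by ipi_send_cpu() above for bitmapped IPIs: the sender sets its bit in the per-CPU pending word and only raises a real vector when it is the first bit to go in, while the handler drains the whole word at once. C11 atomics stand in for atomic_cmpset_int()/atomic_readandclear_int(), and the names pending, post_bitmapped_ipi and drain_pending are made up for this example.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned int pending;        /* plays cpu_ipi_pending[cpu] */

    /* Returns true only when a real vector must be sent to the target CPU. */
    static bool
    post_bitmapped_ipi(unsigned int bit)
    {
            unsigned int oldv, newv;

            do {
                    oldv = atomic_load(&pending);
                    newv = oldv | (1u << bit);
            } while (!atomic_compare_exchange_weak(&pending, &oldv, newv));
            return (oldv == 0);
    }

    /* The handler side drains everything in one go. */
    static unsigned int
    drain_pending(void)
    {

            return (atomic_exchange(&pending, 0));
    }

    int
    main(void)
    {

            printf("send vector? %d\n", post_bitmapped_ipi(1)); /* 1: yes */
            printf("send vector? %d\n", post_bitmapped_ipi(3)); /* 0: coalesced */
            printf("pending bitmap: %#x\n", drain_pending());   /* 0xa */
            return (0);
    }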
+ */ + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); + + while ((cpu = CPU_FFS(&cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &cpus); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); + } +} + +/* + * send an IPI to a specific CPU. + */ +void +ipi_cpu(int cpu, u_int ipi) +{ + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); + + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + cpuset_t other_cpus; + + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + if (IPI_IS_BITMAPED(ipi)) { + ipi_selected(other_cpus, ipi); + return; + } + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); +} + +int +ipi_nmi_handler(void) +{ + u_int cpuid; + + /* + * As long as there is not a simple way to know about a NMI's + * source, if the bitmask for the current CPU is present in + * the global pending bitword an IPI_STOP_HARD has been issued + * and should be handled. + */ + cpuid = PCPU_GET(cpuid); + if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) + return (1); + + CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); + cpustop_handler(); + return (0); +} + +int nmi_kdb_lock; + +void +nmi_call_kdb_smp(u_int type, struct trapframe *frame) +{ + int cpu; + bool call_post; + + cpu = PCPU_GET(cpuid); + if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { + nmi_call_kdb(cpu, type, frame); + call_post = false; + } else { + savectx(&stoppcbs[cpu]); + CPU_SET_ATOMIC(cpu, &stopped_cpus); + while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) + ia32_pause(); + call_post = true; + } + atomic_store_rel_int(&nmi_kdb_lock, 0); + if (call_post) + cpustop_handler_post(cpu); +} + +/* + * Handle an IPI_STOP by saving our current context and spinning until we + * are resumed. + */ +void +cpustop_handler(void) +{ + u_int cpu; + + cpu = PCPU_GET(cpuid); + + savectx(&stoppcbs[cpu]); + + /* Indicate that we are stopped */ + CPU_SET_ATOMIC(cpu, &stopped_cpus); + + /* Wait for restart */ + while (!CPU_ISSET(cpu, &started_cpus)) + ia32_pause(); + + cpustop_handler_post(cpu); +} + +static void +cpustop_handler_post(u_int cpu) +{ + + CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_CLR_ATOMIC(cpu, &stopped_cpus); + +#if defined(__amd64__) && defined(DDB) + amd64_db_resume_dbreg(); +#endif + + if (cpu == 0 && cpustop_restartfunc != NULL) { + cpustop_restartfunc(); + cpustop_restartfunc = NULL; + } +} + +/* + * Handle an IPI_SUSPEND by saving our current context and spinning until we + * are resumed. + */ +void +cpususpend_handler(void) +{ + u_int cpu; + + mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); + + cpu = PCPU_GET(cpuid); + if (savectx(&susppcbs[cpu]->sp_pcb)) { +#ifdef __amd64__ + fpususpend(susppcbs[cpu]->sp_fpususpend); +#else + npxsuspend(susppcbs[cpu]->sp_fpususpend); +#endif + /* + * suspended_cpus is cleared shortly after each AP is restarted + * by a Startup IPI, so that the BSP can proceed to restarting + * the next AP. 
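A rough model (not part of the diff) of the check performed by ipi_nmi_handler() above: an NMI is claimed as an IPI_STOP_HARD only if the sender set this CPU's bit in the pending set beforehand; otherwise it is passed on to other NMI consumers. A plain atomic word stands in for the cpuset_t, and nmi_claimed is an invented name.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned int stop_nmi_pending;

    static bool
    nmi_claimed(int cpu)
    {
            unsigned int bit = 1u << cpu;

            if ((atomic_load(&stop_nmi_pending) & bit) == 0)
                    return (false);         /* unknown source; pass it on */
            atomic_fetch_and(&stop_nmi_pending, ~bit);
            return (true);                  /* ours: park in cpustop_handler() */
    }

    int
    main(void)
    {

            atomic_fetch_or(&stop_nmi_pending, 1u << 2);  /* sender marks CPU 2 */
            printf("cpu1: %d, cpu2: %d\n", nmi_claimed(1), nmi_claimed(2));
            return (0);
    }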
+ * + * resuming_cpus gets cleared when the AP completes + * initialization after having been released by the BSP. + * resuming_cpus is probably not the best name for the + * variable, because it is actually a set of processors that + * haven't resumed yet and haven't necessarily started resuming. + * + * Note that suspended_cpus is meaningful only for ACPI suspend + * as it's not really used for Xen suspend since the APs are + * automatically restored to the running state and the correct + * context. For the same reason resumectx is never called in + * that case. + */ + CPU_SET_ATOMIC(cpu, &suspended_cpus); + CPU_SET_ATOMIC(cpu, &resuming_cpus); + + /* + * Invalidate the cache after setting the global status bits. + * The last AP to set its bit may end up being an Owner of the + * corresponding cache line in MOESI protocol. The AP may be + * stopped before the cache line is written to the main memory. + */ + wbinvd(); + } else { +#ifdef __amd64__ + fpuresume(susppcbs[cpu]->sp_fpususpend); +#else + npxresume(susppcbs[cpu]->sp_fpususpend); +#endif + pmap_init_pat(); + initializecpu(); + PCPU_SET(switchtime, 0); + PCPU_SET(switchticks, ticks); + + /* Indicate that we have restarted and restored the context. */ + CPU_CLR_ATOMIC(cpu, &suspended_cpus); + } + + /* Wait for resume directive */ + while (!CPU_ISSET(cpu, &toresume_cpus)) + ia32_pause(); + + /* Re-apply microcode updates. */ + ucode_reload(); + + if (cpu_ops.cpu_resume) + cpu_ops.cpu_resume(); +#ifdef __amd64__ + if (vmm_resume_p) + vmm_resume_p(); +#endif + + /* Resume MCA and local APIC */ + lapic_xapic_mode(); + mca_resume(); + lapic_setup(0); + + /* Indicate that we are resumed */ + CPU_CLR_ATOMIC(cpu, &resuming_cpus); + CPU_CLR_ATOMIC(cpu, &suspended_cpus); + CPU_CLR_ATOMIC(cpu, &toresume_cpus); +} + + +void +invlcache_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_IPIS + (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + /* + * Reading the generation here allows greater parallelism + * since wbinvd is a serializing instruction. Without the + * temporary, we'd wait for wbinvd to complete, then the read + * would execute, then the dependent write, which must then + * complete before return from interrupt. + */ + generation = smp_tlb_generation; + wbinvd(); + PCPU_SET(smp_tlb_done, generation); +} + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +static void +release_aps(void *dummy __unused) +{ + + if (mp_ncpus == 1) + return; + atomic_store_rel_int(&aps_ready, 1); + while (smp_started == 0) + ia32_pause(); +} +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); + +#ifdef COUNT_IPIS +/* + * Setup interrupt counters for IPI handlers. 
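The generation handshake used by invlcache_handler() above (and by the TLB shootdown code further down) can be modelled in a few lines; this sketch is editorial, with C11 atomics standing in for the kernel primitives and direct function calls standing in for delivering IPIs. The target reads the generation before the serializing flush and acknowledges it afterwards; the initiator bumps the generation and waits for every target to catch up.

    #include <stdatomic.h>
    #include <stdio.h>

    #define NCPU 4

    static _Atomic unsigned int tlb_generation;     /* smp_tlb_generation */
    static _Atomic unsigned int tlb_done[NCPU];     /* per-CPU pc_smp_tlb_done */

    /* Target-CPU side: read the generation first, flush, then acknowledge. */
    static void
    shootdown_handler(int cpu)
    {
            unsigned int gen;

            gen = atomic_load(&tlb_generation);
            /* ... invalidate TLB / flush cache here ... */
            atomic_store(&tlb_done[cpu], gen);
    }

    int
    main(void)
    {
            unsigned int gen;
            int cpu;

            /* Initiator: bump the generation, "send" the IPIs, then wait. */
            gen = atomic_fetch_add(&tlb_generation, 1) + 1;
            for (cpu = 0; cpu < NCPU; cpu++)
                    shootdown_handler(cpu);
            for (cpu = 0; cpu < NCPU; cpu++)
                    while (atomic_load(&tlb_done[cpu]) != gen)
                            ;               /* ia32_pause() in the kernel */
            printf("all %d CPUs caught up to generation %u\n", NCPU, gen);
            return (0);
    }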
+ */ +static void +mp_ipi_intrcnt(void *dummy) +{ + char buf[64]; + int i; + + CPU_FOREACH(i) { + snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); + intrcnt_add(buf, &ipi_invltlb_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); + intrcnt_add(buf, &ipi_invlrng_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); + intrcnt_add(buf, &ipi_invlpg_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); + intrcnt_add(buf, &ipi_invlcache_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:preempt", i); + intrcnt_add(buf, &ipi_preempt_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:ast", i); + intrcnt_add(buf, &ipi_ast_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); + intrcnt_add(buf, &ipi_rendezvous_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); + intrcnt_add(buf, &ipi_hardclock_counts[i]); + } +} +SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); +#endif + +/* + * Flush the TLB on other CPU's + */ + +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1, smp_tlb_addr2; +pmap_t smp_tlb_pmap; +volatile uint32_t smp_tlb_generation; + +#ifdef __amd64__ +#define read_eflags() read_rflags() +#endif + +static void +smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, + vm_offset_t addr1, vm_offset_t addr2) +{ + cpuset_t other_cpus; + volatile uint32_t *p_cpudone; + uint32_t generation; + int cpu; + + /* + * Check for other cpus. Return if none. + */ + if (CPU_ISFULLSET(&mask)) { + if (mp_ncpus <= 1) + return; + } else { + CPU_CLR(PCPU_GET(cpuid), &mask); + if (CPU_EMPTY(&mask)) + return; + } + + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_ipi_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + smp_tlb_pmap = pmap; + generation = ++smp_tlb_generation; + if (CPU_ISFULLSET(&mask)) { + ipi_all_but_self(vector); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + } else { + other_cpus = mask; + while ((cpu = CPU_FFS(&mask)) != 0) { + cpu--; + CPU_CLR(cpu, &mask); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, + cpu, vector); + ipi_send_cpu(cpu, vector); + } + } + while ((cpu = CPU_FFS(&other_cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &other_cpus); + p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; + while (*p_cpudone != generation) + ia32_pause(); + } + mtx_unlock_spin(&smp_ipi_mtx); +} + +void +smp_masked_invltlb(cpuset_t mask, pmap_t pmap) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } +} + +void +smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +} + +void +smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, + pmap_t pmap) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, + addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +} + +void +smp_cache_flush(void) +{ + + if (smp_started) { + smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, + 0, 0); + } +} + +/* + * Handlers for TLB related IPIs + */ +void +invltlb_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_gbl[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + /* + * 
Reading the generation here allows greater parallelism + * since invalidating the TLB is a serializing operation. + */ + generation = smp_tlb_generation; + if (smp_tlb_pmap == kernel_pmap) + invltlb_glob(); + else + invltlb(); + PCPU_SET(smp_tlb_done, generation); +} + +void +invlpg_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_pg[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + generation = smp_tlb_generation; /* Overlap with serialization */ + invlpg(smp_tlb_addr1); + PCPU_SET(smp_tlb_done, generation); +} + +void +invlrng_handler(void) +{ + vm_offset_t addr, addr2; + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_rng[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); + + PCPU_SET(smp_tlb_done, generation); +} Property changes on: trunk/sys/x86/x86/mp_x86.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/x86/mptable.c =================================================================== --- trunk/sys/x86/x86/mptable.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/mptable.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable.c 262141 2014-02-18 01:15:32Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable.c 261087 2014-01-23 20:10:22Z jhb $"); #include "opt_mptable_force_htt.h" #include <sys/param.h> @@ -51,7 +51,7 @@ #include <x86/mptable.h> #include <machine/frame.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> #include <machine/md_var.h> #ifdef NEW_PCIB #include <machine/resource.h> @@ -79,6 +79,13 @@ typedef void mptable_entry_handler(u_char *entry, void *arg); typedef void mptable_extended_entry_handler(ext_entry_ptr entry, void *arg); +/* descriptions of MP table entries */ +typedef struct BASETABLE_ENTRY { + uint8_t type; + uint8_t length; + uint8_t name[16]; +} basetable_entry; + static basetable_entry basetable_entry_types[] = { {0, 20, "Processor"}, Modified: trunk/sys/x86/x86/mptable_pci.c =================================================================== --- trunk/sys/x86/x86/mptable_pci.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/mptable_pci.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable_pci.c 280970 2015-04-01 21:48:54Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable_pci.c 294883 2016-01-27 02:23:54Z jhibbits $"); #include <sys/param.h> #include <sys/systm.h> @@ -70,13 +70,13 @@ #ifdef NEW_PCIB mptable_pci_host_res_init(dev); #endif - device_add_child(dev, "pci", pcib_get_bus(dev)); + device_add_child(dev, "pci", -1); return (bus_generic_attach(dev)); } #ifdef NEW_PCIB static int -mptable_is_isa_range(u_long start, u_long end) +mptable_is_isa_range(rman_res_t start, rman_res_t end) { if (end >= 0x10000) @@ -89,7 +89,7 @@ } static int 
-mptable_is_vga_range(u_long start, u_long end) +mptable_is_vga_range(rman_res_t start, rman_res_t end) { if (end >= 0x10000) return (0); @@ -102,7 +102,7 @@ static struct resource * mptable_hostb_alloc_resource(device_t dev, device_t child, int type, int *rid, - u_long start, u_long end, u_long count, u_int flags) + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct mptable_hostb_softc *sc; @@ -143,7 +143,7 @@ static int mptable_hostb_adjust_resource(device_t dev, device_t child, int type, - struct resource *r, u_long start, u_long end) + struct resource *r, rman_res_t start, rman_res_t end) { struct mptable_hostb_softc *sc; Modified: trunk/sys/x86/x86/msi.c =================================================================== --- trunk/sys/x86/x86/msi.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/msi.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -36,11 +36,14 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/msi.c 333126 2018-04-30 20:29:28Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/msi.c 344912 2019-03-08 01:04:19Z jhb $"); +#include "opt_acpi.h" + #include <sys/param.h> #include <sys/bus.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> @@ -52,7 +55,8 @@ #include <machine/md_var.h> #include <machine/frame.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> +#include <x86/iommu/iommu_intrmap.h> #include <machine/specialreg.h> #include <dev/pci/pcivar.h> @@ -113,10 +117,11 @@ u_int msi_irq; /* IRQ cookie. */ u_int msi_msix; /* MSI-X message. */ u_int msi_vector:8; /* IDT vector. */ - u_int msi_cpu:8; /* Local APIC ID. (g) */ + u_int msi_cpu; /* Local APIC ID. (g) */ u_int msi_count:8; /* Messages in this group. (g) */ u_int msi_maxcount:8; /* Alignment for this group. (g) */ - int *msi_irqs; /* Group's IRQ list. (g) */ + u_int *msi_irqs; /* Group's IRQ list. 
(g) */ + u_int msi_remap_cookie; }; static void msi_create_source(void); @@ -131,11 +136,27 @@ enum intr_polarity pol); static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id); -struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source, - msi_enable_intr, msi_disable_intr, msi_vector, - msi_source_pending, NULL, NULL, msi_config_intr, - msi_assign_cpu }; +struct pic msi_pic = { + .pic_enable_source = msi_enable_source, + .pic_disable_source = msi_disable_source, + .pic_eoi_source = msi_eoi_source, + .pic_enable_intr = msi_enable_intr, + .pic_disable_intr = msi_disable_intr, + .pic_vector = msi_vector, + .pic_source_pending = msi_source_pending, + .pic_suspend = NULL, + .pic_resume = NULL, + .pic_config_intr = msi_config_intr, + .pic_assign_cpu = msi_assign_cpu, + .pic_reprogram_pin = NULL, +}; +u_int first_msi_irq; + +u_int num_msi_irqs = 512; +SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, &num_msi_irqs, 0, + "Number of IRQs reserved for MSI and MSI-X interrupts"); + #ifdef SMP /** * Xen hypervisors prior to 4.6.0 do not properly handle updates to @@ -153,7 +174,7 @@ #endif static int msi_enabled; -static int msi_last_irq; +static u_int msi_last_irq; static struct mtx msi_lock; static void @@ -314,6 +335,14 @@ } #endif + if (num_msi_irqs == 0) + return; + + first_msi_irq = max(MINIMUM_MSI_INT, num_io_irqs); + if (num_msi_irqs > UINT_MAX - first_msi_irq) + panic("num_msi_irqs too high"); + num_io_irqs = first_msi_irq + num_msi_irqs; + msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); @@ -326,11 +355,11 @@ u_int irq; mtx_lock(&msi_lock); - if (msi_last_irq >= NUM_MSI_INTS) { + if (msi_last_irq >= num_msi_irqs) { mtx_unlock(&msi_lock); return; } - irq = msi_last_irq + FIRST_MSI_INT; + irq = msi_last_irq + first_msi_irq; msi_last_irq++; mtx_unlock(&msi_lock); @@ -348,8 +377,12 @@ msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; - u_int cpu; - int cnt, i, *mirqs, vector; + u_int cpu, *mirqs; + int cnt, i, vector; +#ifdef ACPI_DMAR + u_int cookies[count]; + int error; +#endif if (!msi_enabled) return (ENXIO); @@ -363,7 +396,7 @@ /* Try to find 'count' free IRQs. */ cnt = 0; - for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { + for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ @@ -382,7 +415,7 @@ /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. */ - if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { + if (i + (count - cnt) > first_msi_irq + num_msi_irqs) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); @@ -409,6 +442,24 @@ return (ENOSPC); } +#ifdef ACPI_DMAR + mtx_unlock(&msi_lock); + error = iommu_alloc_msi_intr(dev, cookies, count); + mtx_lock(&msi_lock); + if (error == EOPNOTSUPP) + error = 0; + if (error != 0) { + for (i = 0; i < count; i++) + apic_free_vector(cpu, vector + i, irqs[i]); + free(mirqs, M_MSI); + return (error); + } + for (i = 0; i < count; i++) { + msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); + msi->msi_remap_cookie = cookies[i]; + } +#endif + /* Assign IDT vectors and make these messages owned by 'dev'. 
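For a back-of-the-envelope view of the IRQ number space set up by the msi_init() changes above (I/O APIC pins first, then a block of num_msi_irqs MSI/MSI-X IRQs starting at first_msi_irq): the snippet below is illustrative only, and the values chosen for MINIMUM_MSI_INT and num_io_irqs are made-up examples, not the real constants.

    #include <stdio.h>

    #define MINIMUM_MSI_INT 24              /* illustrative value only */

    int
    main(void)
    {
            unsigned int num_io_irqs = 48;          /* e.g. two 24-pin I/O APICs */
            unsigned int num_msi_irqs = 512;        /* machdep.num_msi_irqs default */
            unsigned int first_msi_irq;

            first_msi_irq = num_io_irqs > MINIMUM_MSI_INT ?
                num_io_irqs : MINIMUM_MSI_INT;
            num_io_irqs = first_msi_irq + num_msi_irqs;
            printf("MSI IRQs: %u..%u, total IRQ space: %u\n",
                first_msi_irq, first_msi_irq + num_msi_irqs - 1, num_io_irqs);
            return (0);
    }

This prints "MSI IRQs: 48..559, total IRQ space: 560", showing how the tunable replaces the old fixed FIRST_MSI_INT/NUM_MSI_INTS window.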
*/ fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]); for (i = 0; i < count; i++) { @@ -430,7 +481,6 @@ bcopy(irqs, mirqs, count * sizeof(*mirqs)); fsrc->msi_irqs = mirqs; mtx_unlock(&msi_lock); - return (0); } @@ -474,6 +524,9 @@ msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); KASSERT(msi->msi_first == first, ("message not in group")); KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch")); +#ifdef ACPI_DMAR + iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie); +#endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); @@ -481,6 +534,11 @@ } /* Clear out the first message. */ +#ifdef ACPI_DMAR + mtx_unlock(&msi_lock); + iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie); + mtx_lock(&msi_lock); +#endif first->msi_first = NULL; first->msi_dev = NULL; apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq); @@ -498,6 +556,11 @@ msi_map(int irq, uint64_t *addr, uint32_t *data) { struct msi_intsrc *msi; + int error; +#ifdef ACPI_DMAR + struct msi_intsrc *msi1; + int i, k; +#endif mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); @@ -525,10 +588,36 @@ msi = msi->msi_first; } - *addr = INTEL_ADDR(msi); - *data = INTEL_DATA(msi); +#ifdef ACPI_DMAR + if (!msi->msi_msix) { + for (k = msi->msi_count - 1, i = first_msi_irq; k > 0 && + i < first_msi_irq + num_msi_irqs; i++) { + if (i == msi->msi_irq) + continue; + msi1 = (struct msi_intsrc *)intr_lookup_source(i); + if (!msi1->msi_msix && msi1->msi_first == msi) { + mtx_unlock(&msi_lock); + iommu_map_msi_intr(msi1->msi_dev, + msi1->msi_cpu, msi1->msi_vector, + msi1->msi_remap_cookie, NULL, NULL); + k--; + mtx_lock(&msi_lock); + } + } + } mtx_unlock(&msi_lock); - return (0); + error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu, + msi->msi_vector, msi->msi_remap_cookie, addr, data); +#else + mtx_unlock(&msi_lock); + error = EOPNOTSUPP; +#endif + if (error == EOPNOTSUPP) { + *addr = INTEL_ADDR(msi); + *data = INTEL_DATA(msi); + error = 0; + } + return (error); } int @@ -537,6 +626,10 @@ struct msi_intsrc *msi; u_int cpu; int i, vector; +#ifdef ACPI_DMAR + u_int cookie; + int error; +#endif if (!msi_enabled) return (ENXIO); @@ -545,7 +638,7 @@ mtx_lock(&msi_lock); /* Find a free IRQ. */ - for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { + for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ @@ -558,7 +651,7 @@ } /* Are all IRQs in use? */ - if (i == FIRST_MSI_INT + NUM_MSI_INTS) { + if (i == first_msi_irq + num_msi_irqs) { mtx_unlock(&msi_lock); return (ENXIO); } @@ -579,6 +672,22 @@ mtx_unlock(&msi_lock); return (ENOSPC); } + + msi->msi_dev = dev; +#ifdef ACPI_DMAR + mtx_unlock(&msi_lock); + error = iommu_alloc_msi_intr(dev, &cookie, 1); + mtx_lock(&msi_lock); + if (error == EOPNOTSUPP) + error = 0; + if (error != 0) { + msi->msi_dev = NULL; + apic_free_vector(cpu, vector, i); + return (error); + } + msi->msi_remap_cookie = cookie; +#endif + if (bootverbose) printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n", msi->msi_irq, cpu, vector); @@ -585,7 +694,6 @@ /* Setup source. */ msi->msi_cpu = cpu; - msi->msi_dev = dev; msi->msi_first = msi; msi->msi_vector = vector; msi->msi_msix = 1; @@ -621,6 +729,11 @@ KASSERT(msi->msi_dev != NULL, ("unowned message")); /* Clear out the message. 
*/ +#ifdef ACPI_DMAR + mtx_unlock(&msi_lock); + iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie); + mtx_lock(&msi_lock); +#endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); Modified: trunk/sys/x86/x86/nexus.c =================================================================== --- trunk/sys/x86/x86/nexus.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/nexus.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/nexus.c 221324 2011-05-02 14:13:12Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/nexus.c 340016 2018-11-01 18:34:26Z jhb $"); /* * This code implements a `root nexus' for Intel Architecture @@ -64,7 +64,6 @@ #include <machine/vmparam.h> #include <vm/vm.h> #include <vm/pmap.h> -#include <machine/pmap.h> #include <machine/metadata.h> #include <machine/nexusvar.h> @@ -80,7 +79,7 @@ #ifdef PC98 #include <pc98/cbus/cbus.h> #else -#include <x86/isa/isa.h> +#include <isa/isareg.h> #endif #endif #include <sys/rtprio.h> @@ -100,9 +99,10 @@ static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit); static struct resource *nexus_alloc_resource(device_t, device_t, int, int *, - u_long, u_long, u_long, u_int); + rman_res_t, rman_res_t, rman_res_t, + u_int); static int nexus_adjust_resource(device_t, device_t, int, struct resource *, - u_long, u_long); + rman_res_t, rman_res_t); #ifdef SMP static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif @@ -115,6 +115,12 @@ struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, struct resource *); +static int nexus_map_resource(device_t bus, device_t child, int type, + struct resource *r, + struct resource_map_request *argsp, + struct resource_map *map); +static int nexus_unmap_resource(device_t bus, device_t child, int type, + struct resource *r, struct resource_map *map); static int nexus_release_resource(device_t, device_t, int, int, struct resource *); static int nexus_setup_intr(device_t, device_t, struct resource *, int flags, @@ -123,9 +129,13 @@ static int nexus_teardown_intr(device_t, device_t, struct resource *, void *); static struct resource_list *nexus_get_reslist(device_t dev, device_t child); -static int nexus_set_resource(device_t, device_t, int, int, u_long, u_long); -static int nexus_get_resource(device_t, device_t, int, int, u_long *, u_long *); +static int nexus_set_resource(device_t, device_t, int, int, + rman_res_t, rman_res_t); +static int nexus_get_resource(device_t, device_t, int, int, + rman_res_t *, rman_res_t *); static void nexus_delete_resource(device_t, device_t, int, int); +static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, + cpuset_t *); #ifdef DEV_APIC static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs); static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs); @@ -151,6 +161,8 @@ DEVMETHOD(bus_release_resource, nexus_release_resource), DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), + DEVMETHOD(bus_map_resource, nexus_map_resource), + DEVMETHOD(bus_unmap_resource, nexus_unmap_resource), DEVMETHOD(bus_setup_intr, nexus_setup_intr), DEVMETHOD(bus_teardown_intr, nexus_teardown_intr), #ifdef SMP @@ -162,6 +174,7 @@ DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), 
DEVMETHOD(bus_delete_resource, nexus_delete_resource), + DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* pcib interface */ #ifdef DEV_APIC @@ -214,7 +227,7 @@ irq_rman.rm_start = 0; irq_rman.rm_type = RMAN_ARRAY; irq_rman.rm_descr = "Interrupt request lines"; - irq_rman.rm_end = NUM_IO_INTS - 1; + irq_rman.rm_end = num_io_irqs - 1; if (rman_init(&irq_rman)) panic("nexus_init_resources irq_rman"); @@ -222,7 +235,7 @@ * We search for regions of existing IRQs and add those to the IRQ * resource manager. */ - for (irq = 0; irq < NUM_IO_INTS; irq++) + for (irq = 0; irq < num_io_irqs; irq++) if (intr_lookup_source(irq) != NULL) if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("nexus_init_resources irq_rman add"); @@ -260,11 +273,15 @@ panic("nexus_init_resources port_rman"); mem_rman.rm_start = 0; - mem_rman.rm_end = ~0ul; +#ifndef PAE + mem_rman.rm_end = BUS_SPACE_MAXADDR; +#else + mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1); +#endif mem_rman.rm_type = RMAN_ARRAY; mem_rman.rm_descr = "I/O memory addresses"; if (rman_init(&mem_rman) - || rman_manage_region(&mem_rman, 0, ~0)) + || rman_manage_region(&mem_rman, 0, mem_rman.rm_end)) panic("nexus_init_resources mem_rman"); } @@ -296,9 +313,9 @@ if (STAILQ_FIRST(rl)) retval += printf(" at"); - retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx"); - retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx"); - retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld"); + retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); + retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); + retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); return retval; } @@ -360,7 +377,8 @@ */ static struct resource * nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, - u_long start, u_long end, u_long count, u_int flags) + rman_res_t start, rman_res_t end, rman_res_t count, + u_int flags) { struct nexus_device *ndev = DEVTONX(child); struct resource *rv; @@ -369,12 +387,13 @@ int needactivate = flags & RF_ACTIVE; /* - * If this is an allocation of the "default" range for a given RID, and - * we know what the resources for this device are (ie. they aren't maintained - * by a child bus), then work out the start/end values. + * If this is an allocation of the "default" range for a given + * RID, and we know what the resources for this device are + * (ie. they aren't maintained by a child bus), then work out + * the start/end values. 
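The "default range" check rewritten just below drops the open-coded (start == 0UL && end == ~0UL) test, presumably because resource ranges are now carried in rman_res_t, a type wide enough for 64-bit addresses even on 32-bit platforms, where ~0UL no longer equals the full-width maximum end. A tiny illustration (editorial, with a stand-in typedef assumed to be 64-bit):

    #include <inttypes.h>
    #include <stdio.h>

    typedef uint64_t rman_res_t;        /* stand-in for the kernel typedef */

    int
    main(void)
    {
            rman_res_t max_end = ~(rman_res_t)0;

            printf("~0UL           = %#jx\n", (uintmax_t)~0UL);
            printf("~(rman_res_t)0 = %#jx\n", (uintmax_t)max_end);
            return (0);
    }

On an LP64 system both lines match; on an ILP32 system the first prints 0xffffffff while the second prints 0xffffffffffffffff, which is why a macro that compares against the proper full-width maximum is the safer test.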
*/ - if ((start == 0UL) && (end == ~0UL) && (count == 1)) { - if (ndev == NULL) + if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) { + if (device_get_parent(child) != bus || ndev == NULL) return(NULL); rle = resource_list_find(&ndev->nx_resources, type, *rid); if (rle == NULL) @@ -390,7 +409,7 @@ return (NULL); rv = rman_reserve_resource(rm, start, end, count, flags, child); - if (rv == 0) + if (rv == NULL) return 0; rman_set_rid(rv, *rid); @@ -406,7 +425,7 @@ static int nexus_adjust_resource(device_t bus, device_t child, int type, - struct resource *r, u_long start, u_long end) + struct resource *r, rman_res_t start, rman_res_t end) { struct rman *rm; @@ -422,12 +441,82 @@ nexus_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { + struct resource_map map; + int error; + + error = rman_activate_resource(r); + if (error != 0) + return (error); + + if (!(rman_get_flags(r) & RF_UNMAPPED) && + (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { + error = nexus_map_resource(bus, child, type, r, NULL, &map); + if (error) { + rman_deactivate_resource(r); + return (error); + } + + rman_set_mapping(r,&map); + } + return (0); +} + +static int +nexus_deactivate_resource(device_t bus, device_t child, int type, int rid, + struct resource *r) +{ + struct resource_map map; + int error; + + error = rman_deactivate_resource(r); + if (error) + return (error); + + if (!(rman_get_flags(r) & RF_UNMAPPED) && + (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { + rman_get_mapping(r, &map); + nexus_unmap_resource(bus, child, type, r, &map); + } + return (0); +} + +static int +nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, + struct resource_map_request *argsp, struct resource_map *map) +{ + struct resource_map_request args; + rman_res_t end, length, start; #ifdef PC98 - bus_space_handle_t bh; int error; #endif - void *vaddr; + /* Resources must be active to be mapped. */ + if (!(rman_get_flags(r) & RF_ACTIVE)) + return (ENXIO); + + /* Mappings are only supported on I/O and memory resources. */ + switch (type) { + case SYS_RES_IOPORT: + case SYS_RES_MEMORY: + break; + default: + return (EINVAL); + } + + resource_init_map_request(&args); + if (argsp != NULL) + bcopy(argsp, &args, imin(argsp->size, args.size)); + start = rman_get_start(r) + args.offset; + if (args.length == 0) + length = rman_get_size(r); + else + length = args.length; + end = start + length - 1; + if (start > rman_get_end(r) || start < rman_get_start(r)) + return (EINVAL); + if (end > rman_get_end(r) || end < start) + return (EINVAL); + /* * If this is a memory resource, map it into the kernel. 
*/ @@ -435,58 +524,64 @@ case SYS_RES_IOPORT: #ifdef PC98 error = i386_bus_space_handle_alloc(X86_BUS_SPACE_IO, - rman_get_start(r), rman_get_size(r), &bh); + start, length, &map->r_bushandle); if (error) return (error); - rman_set_bushandle(r, bh); #else - rman_set_bushandle(r, rman_get_start(r)); + map->r_bushandle = start; #endif - rman_set_bustag(r, X86_BUS_SPACE_IO); + map->r_bustag = X86_BUS_SPACE_IO; + map->r_size = length; + map->r_vaddr = NULL; break; case SYS_RES_MEMORY: #ifdef PC98 error = i386_bus_space_handle_alloc(X86_BUS_SPACE_MEM, - rman_get_start(r), rman_get_size(r), &bh); + start, length, &map->r_bushandle); if (error) return (error); #endif - vaddr = pmap_mapdev(rman_get_start(r), rman_get_size(r)); - rman_set_virtual(r, vaddr); - rman_set_bustag(r, X86_BUS_SPACE_MEM); + map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr); + map->r_bustag = X86_BUS_SPACE_MEM; + map->r_size = length; + + /* + * PC-98 stores the virtual address as a member of the + * structure in the handle. On plain x86, the handle is + * the virtual address. + */ #ifdef PC98 - /* PC-98: the type of bus_space_handle_t is the structure. */ - bh->bsh_base = (bus_addr_t) vaddr; - rman_set_bushandle(r, bh); + map->r_bushandle->bsh_base = (bus_addr_t)map->r_vaddr; #else - /* IBM-PC: the type of bus_space_handle_t is u_int */ - rman_set_bushandle(r, (bus_space_handle_t) vaddr); + map->r_bushandle = (bus_space_handle_t)map->r_vaddr; #endif + break; } - return (rman_activate_resource(r)); + return (0); } static int -nexus_deactivate_resource(device_t bus, device_t child, int type, int rid, - struct resource *r) +nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, + struct resource_map *map) { - + /* * If this is a memory resource, unmap it. 
*/ - if (type == SYS_RES_MEMORY) { - pmap_unmapdev((vm_offset_t)rman_get_virtual(r), - rman_get_size(r)); - } + switch (type) { + case SYS_RES_MEMORY: + pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size); + /* FALLTHROUGH */ + case SYS_RES_IOPORT: #ifdef PC98 - if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) { - bus_space_handle_t bh; - - bh = rman_get_bushandle(r); - i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz); + i386_bus_space_handle_free(map->r_bustag, map->r_bushandle, + map->r_bushandle->bsh_sz); +#endif + break; + default: + return (EINVAL); } -#endif - return (rman_deactivate_resource(r)); + return (0); } static int @@ -493,6 +588,7 @@ nexus_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { + if (rman_get_flags(r) & RF_ACTIVE) { int error = bus_deactivate_resource(child, type, rid, r); if (error) @@ -518,7 +614,7 @@ if (irq == NULL) panic("nexus_setup_intr: NULL irq resource!"); - *cookiep = 0; + *cookiep = NULL; if ((rman_get_flags(irq) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; @@ -573,7 +669,8 @@ } static int -nexus_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count) +nexus_set_resource(device_t dev, device_t child, int type, int rid, + rman_res_t start, rman_res_t count) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; @@ -584,7 +681,8 @@ } static int -nexus_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp) +nexus_get_resource(device_t dev, device_t child, int type, int rid, + rman_res_t *startp, rman_res_t *countp) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; @@ -609,6 +707,24 @@ resource_list_delete(rl, type, rid); } +static int +nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, + cpuset_t *cpuset) +{ + + switch (op) { +#ifdef SMP + case INTR_CPUS: + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + *cpuset = intr_cpus; + return (0); +#endif + default: + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + } +} + /* Called from the MSI code to add new IRQs to the IRQ rman. */ void nexus_add_irq(u_long irq) @@ -689,11 +805,8 @@ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type(ELF_KERN_STR); - if (kmdp != NULL) - smapbase = (struct bios_smap *)preload_search_info(kmdp, - MODINFO_METADATA | MODINFOMD_SMAP); - else - smapbase = NULL; + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); Added: trunk/sys/x86/x86/pvclock.c =================================================================== --- trunk/sys/x86/x86/pvclock.c (rev 0) +++ trunk/sys/x86/x86/pvclock.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,204 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2009 Adrian Chadd + * Copyright (c) 2012 Spectra Logic Corporation + * Copyright (c) 2014 Bryan Venteicher + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/pvclock.c 278184 2015-02-04 08:33:04Z bryanv $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <machine/cpufunc.h> +#include <machine/cpu.h> +#include <machine/atomic.h> +#include <machine/pvclock.h> + +/* + * Last time; this guarantees a monotonically increasing clock for when + * a stable TSC is not provided. + */ +static volatile uint64_t pvclock_last_cycles; + +void +pvclock_resume(void) +{ + + atomic_store_rel_64(&pvclock_last_cycles, 0); +} + +uint64_t +pvclock_get_last_cycles(void) +{ + + return (atomic_load_acq_64(&pvclock_last_cycles)); +} + +uint64_t +pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) +{ + uint64_t freq; + + freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; + + if (ti->tsc_shift < 0) + freq <<= -ti->tsc_shift; + else + freq >>= ti->tsc_shift; + + return (freq); +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline uint64_t +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#if defined(__i386__) + { + uint32_t tmp1, tmp2; + + /** + * For i386, the formula looks like: + * + * lower = (mul_frac * (delta & UINT_MAX)) >> 32 + * upper = mul_frac * (delta >> 32) + * product = lower + upper + */ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), + "2" (mul_frac) ); + } +#elif defined(__amd64__) + { + unsigned long tmp; + + __asm__ ( + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" + : [lo]"=a" (product), [hi]"=d" (tmp) + : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac)); + } +#else +#error "pvclock: unsupported x86 architecture?" 
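/*
 * An aside, not part of the committed diff: both assembly variants above
 * compute the same 64x32 -> 64 bit fixed-point product, namely
 * (delta * mul_frac) >> 32 after delta has been pre-shifted by tsc_shift.
 * A portable sketch of that arithmetic, assuming a compiler with 128-bit
 * integer support (the function name is illustrative only):
 */
#if 0
static inline uint64_t
pvclock_scale_delta_sketch(uint64_t delta, uint32_t mul_frac, int shift)
{

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* Keep bits 32..95 of the 96-bit product, as the asm above does. */
	return ((uint64_t)(((unsigned __int128)delta * mul_frac) >> 32));
}
#endif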
+#endif + + return (product); +} + +static uint64_t +pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti) +{ + uint64_t delta; + + delta = rdtsc() - ti->tsc_timestamp; + + return (pvclock_scale_delta(delta, ti->tsc_to_system_mul, + ti->tsc_shift)); +} + +static void +pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, + uint64_t *cycles, uint8_t *flags) +{ + uint32_t version; + + do { + version = ti->version; + rmb(); + *cycles = ti->system_time + pvclock_get_nsec_offset(ti); + *flags = ti->flags; + rmb(); + } while ((ti->version & 1) != 0 || ti->version != version); +} + +static void +pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec, + uint32_t *nsec) +{ + uint32_t version; + + do { + version = wc->version; + rmb(); + *sec = wc->sec; + *nsec = wc->nsec; + rmb(); + } while ((wc->version & 1) != 0 || wc->version != version); +} + +uint64_t +pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) +{ + uint64_t now, last; + uint8_t flags; + + pvclock_read_time_info(ti, &now, &flags); + + if (flags & PVCLOCK_FLAG_TSC_STABLE) + return (now); + + /* + * Enforce a monotonically increasing clock time across all VCPUs. + * If our time is too old, use the last time and return. Otherwise, + * try to update the last time. + */ + do { + last = atomic_load_acq_64(&pvclock_last_cycles); + if (last > now) + return (last); + } while (!atomic_cmpset_64(&pvclock_last_cycles, last, now)); + + return (now); +} + +void +pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) +{ + uint32_t sec, nsec; + + pvclock_read_wall_clock(wc, &sec, &nsec); + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} Property changes on: trunk/sys/x86/x86/pvclock.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/x86/stack_machdep.c =================================================================== --- trunk/sys/x86/x86/stack_machdep.c (rev 0) +++ trunk/sys/x86/x86/stack_machdep.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,182 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 EMC Corporation + * Copyright (c) 2005 Antoine Brodin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/stack_machdep.c 337976 2018-08-17 16:04:59Z markj $"); + +#include "opt_stack.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/stack.h> + +#include <machine/pcb.h> +#include <machine/smp.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <x86/stack.h> + +#ifdef __i386__ +#define PCB_FP(pcb) ((pcb)->pcb_ebp) +#define TF_FLAGS(tf) ((tf)->tf_eflags) +#define TF_FP(tf) ((tf)->tf_ebp) +#define TF_PC(tf) ((tf)->tf_eip) + +typedef struct i386_frame *x86_frame_t; +#else +#define PCB_FP(pcb) ((pcb)->pcb_rbp) +#define TF_FLAGS(tf) ((tf)->tf_rflags) +#define TF_FP(tf) ((tf)->tf_rbp) +#define TF_PC(tf) ((tf)->tf_rip) + +typedef struct amd64_frame *x86_frame_t; +#endif + +#ifdef STACK +static struct stack *nmi_stack; +static volatile struct thread *nmi_pending; + +#ifdef SMP +static struct mtx nmi_lock; +MTX_SYSINIT(nmi_lock, &nmi_lock, "stack_nmi", MTX_SPIN); +#endif +#endif + +static void +stack_capture(struct thread *td, struct stack *st, register_t fp) +{ + x86_frame_t frame; + vm_offset_t callpc; + + stack_zero(st); + frame = (x86_frame_t)fp; + while (1) { + if ((vm_offset_t)frame < td->td_kstack || + (vm_offset_t)frame >= td->td_kstack + + td->td_kstack_pages * PAGE_SIZE) + break; + callpc = frame->f_retaddr; + if (!INKERNEL(callpc)) + break; + if (stack_put(st, callpc) == -1) + break; + if (frame->f_frame <= frame) + break; + frame = frame->f_frame; + } +} + +int +stack_nmi_handler(struct trapframe *tf) +{ + +#ifdef STACK + /* Don't consume an NMI that wasn't meant for us. */ + if (nmi_stack == NULL || curthread != nmi_pending) + return (0); + + if (!TRAPF_USERMODE(tf) && (TF_FLAGS(tf) & PSL_I) != 0) + stack_capture(curthread, nmi_stack, TF_FP(tf)); + else + /* We were running in usermode or had interrupts disabled. 
*/ + nmi_stack->depth = 0; + + atomic_store_rel_ptr((long *)&nmi_pending, (long)NULL); + return (1); +#else + return (0); +#endif +} + +void +stack_save_td(struct stack *st, struct thread *td) +{ + + if (TD_IS_SWAPPED(td)) + panic("stack_save_td: swapped"); + if (TD_IS_RUNNING(td)) + panic("stack_save_td: running"); + + stack_capture(td, st, PCB_FP(td->td_pcb)); +} + +int +stack_save_td_running(struct stack *st, struct thread *td) +{ + +#ifdef STACK + THREAD_LOCK_ASSERT(td, MA_OWNED); + MPASS(TD_IS_RUNNING(td)); + + if (td == curthread) { + stack_save(st); + return (0); + } + +#ifdef SMP + mtx_lock_spin(&nmi_lock); + + nmi_stack = st; + nmi_pending = td; + ipi_cpu(td->td_oncpu, IPI_TRACE); + while ((void *)atomic_load_acq_ptr((long *)&nmi_pending) != NULL) + cpu_spinwait(); + nmi_stack = NULL; + + mtx_unlock_spin(&nmi_lock); + + if (st->depth == 0) + return (EAGAIN); +#else /* !SMP */ + KASSERT(0, ("curthread isn't running")); +#endif /* SMP */ + return (0); +#else /* !STACK */ + return (EOPNOTSUPP); +#endif /* STACK */ +} + +void +stack_save(struct stack *st) +{ + register_t fp; + +#ifdef __i386__ + __asm __volatile("movl %%ebp,%0" : "=g" (fp)); +#else + __asm __volatile("movq %%rbp,%0" : "=g" (fp)); +#endif + stack_capture(curthread, st, fp); +} Property changes on: trunk/sys/x86/x86/stack_machdep.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/x86/tsc.c =================================================================== --- trunk/sys/x86/x86/tsc.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/x86/tsc.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/x86/tsc.c 280973 2015-04-02 01:02:42Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/tsc.c 353007 2019-10-02 13:46:40Z kib $"); #include "opt_compat.h" #include "opt_clock.h" @@ -49,6 +49,7 @@ #include <machine/md_var.h> #include <machine/specialreg.h> #include <x86/vmware.h> +#include <dev/acpica/acpi_hpet.h> #include "cpufreq_if.h" @@ -60,34 +61,28 @@ SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); -TUNABLE_INT("kern.timecounter.invariant_tsc", &tsc_is_invariant); #ifdef SMP int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); -TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc); int smp_tsc_adjust = 0; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN, &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP"); -TUNABLE_INT("kern.timecounter.smp_tsc_adjust", &smp_tsc_adjust); #endif static int tsc_shift = 1; SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN, &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency"); -TUNABLE_INT("kern.timecounter.tsc_shift", &tsc_shift); static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); -TUNABLE_INT("machdep.disable_tsc", &tsc_disabled); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable TSC frequency calibration"); 
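A note on the TUNABLE_INT() removals running through this tsc.c hunk: they are not a functional change. On FreeBSD 11 a sysctl declared with CTLFLAG_RDTUN fetches the like-named loader tunable by itself, so the separate TUNABLE_INT() registrations had become redundant. The tunables still work exactly as before, e.g. in /boot/loader.conf (illustrative values):

kern.timecounter.smp_tsc_adjust=1
machdep.disable_tsc_calibration=1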
-TUNABLE_INT("machdep.disable_tsc_calibration", &tsc_skip_calibration); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); @@ -100,14 +95,22 @@ static unsigned tsc_get_timecount_mfence(struct timecounter *tc); static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); +static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, + struct timecounter *tc); +#ifdef COMPAT_FREEBSD32 +static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, + struct timecounter *tc); +#endif static struct timecounter tsc_timecounter = { - tsc_get_timecount, /* get_timecount */ - 0, /* no poll_pps */ - ~0u, /* counter_mask */ - 0, /* frequency */ - "TSC", /* name */ - 800, /* quality (adjusted in code) */ + .tc_get_timecount = tsc_get_timecount, + .tc_counter_mask = ~0u, + .tc_name = "TSC", + .tc_quality = 800, /* adjusted in code */ + .tc_fill_vdso_timehands = x86_tsc_vdso_timehands, +#ifdef COMPAT_FREEBSD32 + .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32, +#endif }; static void @@ -126,6 +129,40 @@ tsc_is_invariant = 1; } +/* + * Calculate TSC frequency using information from the CPUID leaf 0x15 + * 'Time Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 + * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor + * Frequency Information'. Leaf 0x16 is described in the SDM as + * informational only, but if 0x15 did not work, and TSC calibration + * is disabled, it is the best we can get at all. It should still be + * an improvement over the parsing of the CPU model name in + * tsc_freq_intel(), when available. + */ +static bool +tsc_freq_cpuid(void) +{ + u_int regs[4]; + + if (cpu_high < 0x15) + return (false); + do_cpuid(0x15, regs); + if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) { + tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0]; + return (true); + } + + if (cpu_high < 0x16) + return (false); + do_cpuid(0x16, regs); + if (regs[0] != 0) { + tsc_freq = (uint64_t)regs[0] * 1000000; + return (true); + } + + return (false); +} + static void tsc_freq_intel(void) { @@ -250,18 +287,19 @@ } if (tsc_skip_calibration) { - if (cpu_vendor_id == CPU_VENDOR_INTEL) + if (tsc_freq_cpuid()) + ; + else if (cpu_vendor_id == CPU_VENDOR_INTEL) tsc_freq_intel(); - return; + } else { + if (bootverbose) + printf("Calibrating TSC clock ... "); + tsc1 = rdtsc(); + DELAY(1000000); + tsc2 = rdtsc(); + tsc_freq = tsc2 - tsc1; } - if (bootverbose) - printf("Calibrating TSC clock ... 
"); - tsc1 = rdtsc(); - DELAY(1000000); - tsc2 = rdtsc(); - tsc_freq = tsc2 - tsc1; - if (bootverbose) printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } @@ -427,7 +465,7 @@ } static int -test_tsc(void) +test_tsc(int adj_max_count) { uint64_t *data, *tsc; u_int i, size, adj; @@ -441,12 +479,12 @@ for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ - smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc, - smp_no_rendevous_barrier, data); - if (!smp_tsc && adj < smp_tsc_adjust) { + smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc, + smp_no_rendezvous_barrier, data); + if (!smp_tsc && adj < adj_max_count) { adj++; - smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc, - smp_no_rendevous_barrier, data); + smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc, + smp_no_rendezvous_barrier, data); goto retry; } free(data, M_TEMP); @@ -481,19 +519,6 @@ #undef N -#else - -/* - * The function is not called, it is provided to avoid linking failure - * on uniprocessor kernel. - */ -static int -test_tsc(void) -{ - - return (0); -} - #endif /* SMP */ static void @@ -529,17 +554,22 @@ } /* - * We cannot use the TSC if it stops incrementing while idle. * Intel CPUs without a C-state invariant TSC can stop the TSC - * in either C2 or C3. + * in either C2 or C3. Disable use of C2 and C3 while using + * the TSC as the timecounter. The timecounter can be changed + * to enable C2 and C3. + * + * Note that the TSC is used as the cputicker for computing + * thread runtime regardless of the timecounter setting, so + * using an alternate timecounter and enabling C2 or C3 can + * result incorrect runtimes for kernel idle threads (but not + * for any non-idle threads). */ - if (cpu_deepest_sleep >= 2 && cpu_vendor_id == CPU_VENDOR_INTEL && + if (cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { - tsc_timecounter.tc_quality = -1000; tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP; if (bootverbose) - printf("TSC timecounter disabled: C2/C3 may halt it.\n"); - goto init; + printf("TSC timecounter disables C2 and C3.\n"); } /* @@ -549,9 +579,12 @@ * non-zero value. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ +#ifdef SMP if (mp_ncpus > 1) - tsc_timecounter.tc_quality = test_tsc(); - else if (tsc_is_invariant) + tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust); + else +#endif /* SMP */ + if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; max_freq >>= tsc_shift; @@ -586,6 +619,32 @@ } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); +void +resume_TSC(void) +{ +#ifdef SMP + int quality; + + /* If TSC was not good on boot, it is unlikely to become good now. */ + if (tsc_timecounter.tc_quality < 0) + return; + /* Nothing to do with UP. */ + if (mp_ncpus < 2) + return; + + /* + * If TSC was good, a single synchronization should be enough, + * but honour smp_tsc_adjust if it's set. + */ + quality = test_tsc(MAX(smp_tsc_adjust, 1)); + if (quality != tsc_timecounter.tc_quality) { + printf("TSC timecounter quality changed: %d -> %d\n", + tsc_timecounter.tc_quality, quality); + tsc_timecounter.tc_quality = quality; + } +#endif /* SMP */ +} + /* * When cpufreq levels change, find out about the (new) max frequency. We * use this to update CPU accounting in case it got a lower estimate at boot. 
@@ -726,22 +785,27 @@ return (tsc_get_timecount_low(tc)); } -uint32_t -cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th) +static uint32_t +x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { - vdso_th->th_x86_shift = (int)(intptr_t)timecounter->tc_priv; + vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; + vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; + vdso_th->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); - return (timecounter == &tsc_timecounter); + return (1); } #ifdef COMPAT_FREEBSD32 -uint32_t -cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) +static uint32_t +x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, + struct timecounter *tc) { - vdso_th32->th_x86_shift = (int)(intptr_t)timecounter->tc_priv; + vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; + vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; + vdso_th32->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); - return (timecounter == &tsc_timecounter); + return (1); } #endif Added: trunk/sys/x86/x86/ucode.c =================================================================== --- trunk/sys/x86/x86/ucode.c (rev 0) +++ trunk/sys/x86/x86/ucode.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,402 @@ +/* $MidnightBSD$ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/ucode.c 347700 2019-05-16 14:42:16Z markj $"); + +#include <sys/param.h> +#include <sys/cpuset.h> +#include <sys/kernel.h> +#include <sys/linker.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <x86/specialreg.h> +#include <machine/stdarg.h> +#include <x86/ucode.h> +#include <x86/x86_smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> + +static void *ucode_intel_match(uint8_t *data, size_t *len); +static int ucode_intel_verify(struct ucode_intel_header *hdr, + size_t resid); + +static struct ucode_ops { + const char *vendor; + int (*load)(void *, bool, uint64_t *, uint64_t *); + void *(*match)(uint8_t *, size_t *); +} loaders[] = { + { + .vendor = INTEL_VENDOR_ID, + .load = ucode_intel_load, + .match = ucode_intel_match, + }, +}; + +/* Selected microcode update data. */ +static void *early_ucode_data; +static void *ucode_data; +static struct ucode_ops *ucode_loader; + +/* Variables used for reporting success or failure. */ +enum { + NO_ERROR, + NO_MATCH, + VERIFICATION_FAILED, +} ucode_error = NO_ERROR; +static uint64_t ucode_nrev, ucode_orev; + +static void +log_msg(void *arg __unused) +{ + + if (ucode_nrev != 0) { + printf("CPU microcode: updated from %#jx to %#jx\n", + (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev); + return; + } + + switch (ucode_error) { + case NO_MATCH: + printf("CPU microcode: no matching update found\n"); + break; + case VERIFICATION_FAILED: + printf("CPU microcode: microcode verification failed\n"); + break; + default: + break; + } +} +SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL); + +int +ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp) +{ + uint64_t nrev, orev; + uint32_t cpuid[4]; + + orev = rdmsr(MSR_BIOS_SIGN) >> 32; + + /* + * Perform update. Flush caches first to work around seemingly + * undocumented errata applying to some Broadwell CPUs. + */ + wbinvd(); + if (unsafe) + wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data); + else + wrmsr(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data); + wrmsr(MSR_BIOS_SIGN, 0); + + /* + * Serialize instruction flow. + */ + do_cpuid(0, cpuid); + + /* + * Verify that the microcode revision changed. 
+ */ + nrev = rdmsr(MSR_BIOS_SIGN) >> 32; + if (nrevp != NULL) + *nrevp = nrev; + if (orevp != NULL) + *orevp = orev; + if (nrev <= orev) + return (EEXIST); + return (0); +} + +static int +ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid) +{ + uint32_t cksum, *data, size; + int i; + + if (resid < sizeof(struct ucode_intel_header)) + return (1); + size = hdr->total_size; + if (size == 0) + size = UCODE_INTEL_DEFAULT_DATA_SIZE + + sizeof(struct ucode_intel_header); + + if (hdr->header_version != 1) + return (1); + if (size % 16 != 0) + return (1); + if (resid < size) + return (1); + + cksum = 0; + data = (uint32_t *)hdr; + for (i = 0; i < size / sizeof(uint32_t); i++) + cksum += data[i]; + if (cksum != 0) + return (1); + return (0); +} + +static void * +ucode_intel_match(uint8_t *data, size_t *len) +{ + struct ucode_intel_header *hdr; + struct ucode_intel_extsig_table *table; + struct ucode_intel_extsig *entry; + uint64_t platformid; + size_t resid; + uint32_t data_size, flags, regs[4], sig, total_size; + int i; + + do_cpuid(1, regs); + sig = regs[0]; + + platformid = rdmsr(MSR_IA32_PLATFORM_ID); + flags = 1 << ((platformid >> 50) & 0x7); + + for (resid = *len; resid > 0; data += total_size, resid -= total_size) { + hdr = (struct ucode_intel_header *)data; + if (ucode_intel_verify(hdr, resid) != 0) { + ucode_error = VERIFICATION_FAILED; + break; + } + + data_size = hdr->data_size; + total_size = hdr->total_size; + if (data_size == 0) + data_size = UCODE_INTEL_DEFAULT_DATA_SIZE; + if (total_size == 0) + total_size = UCODE_INTEL_DEFAULT_DATA_SIZE + + sizeof(struct ucode_intel_header); + if (data_size > total_size + sizeof(struct ucode_intel_header)) + table = (struct ucode_intel_extsig_table *) + ((uint8_t *)(hdr + 1) + data_size); + else + table = NULL; + + if (hdr->processor_signature == sig) { + if ((hdr->processor_flags & flags) != 0) { + *len = data_size; + return (hdr + 1); + } + } else if (table != NULL) { + for (i = 0; i < table->signature_count; i++) { + entry = &table->entries[i]; + if (entry->processor_signature == sig && + (entry->processor_flags & flags) != 0) { + *len = data_size; + return (hdr + 1); + } + } + } + } + return (NULL); +} + +/* + * Release any memory backing unused microcode blobs back to the system. + * We copy the selected update and free the entire microcode file. 
+ */ +static void +ucode_release(void *arg __unused) +{ + char *name, *type; + caddr_t file; + int release; + + if (early_ucode_data == NULL) + return; + release = 1; + TUNABLE_INT_FETCH("debug.ucode.release", &release); + if (!release) + return; + +restart: + file = 0; + for (;;) { + file = preload_search_next_name(file); + if (file == 0) + break; + type = (char *)preload_search_info(file, MODINFO_TYPE); + if (type == NULL || strcmp(type, "cpu_microcode") != 0) + continue; + + name = preload_search_info(file, MODINFO_NAME); + preload_delete_name(name); + goto restart; + } +} +SYSINIT(ucode_release, SI_SUB_KMEM + 1, SI_ORDER_ANY, ucode_release, NULL); + +void +ucode_load_ap(int cpu) +{ +#ifdef SMP + KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present, + ("cpu %d not present", cpu)); + + if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread) + return; +#endif + + if (ucode_data != NULL) + (void)ucode_loader->load(ucode_data, false, NULL, NULL); +} + +static void * +map_ucode(uintptr_t free, size_t len) +{ +#ifdef __i386__ + uintptr_t va; + + for (va = free; va < free + len; va += PAGE_SIZE) + pmap_kenter(va, (vm_paddr_t)va); +#else + (void)len; +#endif + return ((void *)free); +} + +static void +unmap_ucode(uintptr_t free, size_t len) +{ +#ifdef __i386__ + uintptr_t va; + + for (va = free; va < free + len; va += PAGE_SIZE) + pmap_kremove(va); +#else + (void)free; + (void)len; +#endif +} + +/* + * Search for an applicable microcode update, and load it. APs will load the + * selected update once they come online. + * + * "free" is the address of the next free physical page. If a microcode update + * is selected, it will be copied to this region prior to loading in order to + * satisfy alignment requirements. + */ +size_t +ucode_load_bsp(uintptr_t free) +{ + union { + uint32_t regs[4]; + char vendor[13]; + } cpuid; + uint8_t *addr, *fileaddr, *match; + char *type; + uint64_t nrev, orev; + caddr_t file; + size_t i, len; + int error; + + KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free)); + + do_cpuid(0, cpuid.regs); + cpuid.regs[0] = cpuid.regs[1]; + cpuid.regs[1] = cpuid.regs[3]; + cpuid.vendor[12] = '\0'; + for (i = 0; i < nitems(loaders); i++) + if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) { + ucode_loader = &loaders[i]; + break; + } + if (ucode_loader == NULL) + return (0); + + file = 0; + fileaddr = match = NULL; + for (;;) { + file = preload_search_next_name(file); + if (file == 0) + break; + type = (char *)preload_search_info(file, MODINFO_TYPE); + if (type == NULL || strcmp(type, "cpu_microcode") != 0) + continue; + + fileaddr = preload_fetch_addr(file); + len = preload_fetch_size(file); + match = ucode_loader->match(fileaddr, &len); + if (match != NULL) { + addr = map_ucode(free, len); + /* We can't use memcpy() before ifunc resolution. */ + for (i = 0; i < len; i++) + addr[i] = ((volatile uint8_t *)match)[i]; + match = addr; + + error = ucode_loader->load(match, false, &nrev, &orev); + if (error == 0) { + ucode_data = early_ucode_data = match; + ucode_nrev = nrev; + ucode_orev = orev; + return (len); + } + unmap_ucode(free, len); + } + } + if (fileaddr != NULL && ucode_error == NO_ERROR) + ucode_error = NO_MATCH; + return (0); +} + +/* + * Reload microcode following an ACPI resume. + */ +void +ucode_reload(void) +{ + + ucode_load_ap(PCPU_GET(cpuid)); +} + +/* + * Replace an existing microcode update. 
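For context on where ucode_load_bsp() gets its input: it walks the loader's preloaded files for one whose MODINFO_TYPE is "cpu_microcode", so the update blob has to be staged by the boot loader. That is normally wired up through loader.conf; the exact knob names ship in the base system's defaults, but the shape is roughly the following (paths and names illustrative, check /boot/defaults/loader.conf on the installed system):

cpu_microcode_load="YES"
cpu_microcode_name="/boot/firmware/intel-ucode.bin"
cpu_microcode_type="cpu_microcode"

The type line is what makes the preloaded file match the strcmp(type, "cpu_microcode") tests above.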
+ */ +void * +ucode_update(void *newdata) +{ + + newdata = (void *)atomic_swap_ptr((void *)&ucode_data, + (uintptr_t)newdata); + if (newdata == early_ucode_data) + newdata = NULL; + return (newdata); +} Property changes on: trunk/sys/x86/x86/ucode.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/x86/x86_mem.c =================================================================== --- trunk/sys/x86/x86/x86_mem.c (rev 0) +++ trunk/sys/x86/x86/x86_mem.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,729 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1999 Michael Smith <msmith at freebsd.org> + * Copyright (c) 2017 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/x86/x86_mem.c 314591 2017-03-03 10:30:30Z kib $"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> + +/* + * Pentium Pro+ memory range operations + * + * This code will probably be impenetrable without reference to the + * Intel Pentium Pro documentation or x86-64 programmers manual vol 2. + */ + +static char *mem_owner_bios = "BIOS"; + +#define MR686_FIXMTRR (1<<0) + +#define mrwithin(mr, a) \ + (((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len))) +#define mroverlap(mra, mrb) \ + (mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base)) + +#define mrvalid(base, len) \ + ((!(base & ((1 << 12) - 1))) && /* base is multiple of 4k */ \ + ((len) >= (1 << 12)) && /* length is >= 4k */ \ + powerof2((len)) && /* ... 
and power of two */ \ + !((base) & ((len) - 1))) /* range is not discontiuous */ + +#define mrcopyflags(curr, new) \ + (((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK)) + +static int mtrrs_disabled; +SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN, + &mtrrs_disabled, 0, + "Disable MTRRs."); + +static void x86_mrinit(struct mem_range_softc *sc); +static int x86_mrset(struct mem_range_softc *sc, + struct mem_range_desc *mrd, int *arg); +static void x86_mrAPinit(struct mem_range_softc *sc); +static void x86_mrreinit(struct mem_range_softc *sc); + +static struct mem_range_ops x86_mrops = { + x86_mrinit, + x86_mrset, + x86_mrAPinit, + x86_mrreinit +}; + +/* XXX for AP startup hook */ +static u_int64_t mtrrcap, mtrrdef; + +/* The bitmask for the PhysBase and PhysMask fields of the variable MTRRs. */ +static u_int64_t mtrr_physmask; + +static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc, + struct mem_range_desc *mrd); +static void x86_mrfetch(struct mem_range_softc *sc); +static int x86_mtrrtype(int flags); +static int x86_mrt2mtrr(int flags, int oldval); +static int x86_mtrrconflict(int flag1, int flag2); +static void x86_mrstore(struct mem_range_softc *sc); +static void x86_mrstoreone(void *arg); +static struct mem_range_desc *x86_mtrrfixsearch(struct mem_range_softc *sc, + u_int64_t addr); +static int x86_mrsetlow(struct mem_range_softc *sc, + struct mem_range_desc *mrd, int *arg); +static int x86_mrsetvariable(struct mem_range_softc *sc, + struct mem_range_desc *mrd, int *arg); + +/* ia32 MTRR type to memory range type conversion */ +static int x86_mtrrtomrt[] = { + MDF_UNCACHEABLE, + MDF_WRITECOMBINE, + MDF_UNKNOWN, + MDF_UNKNOWN, + MDF_WRITETHROUGH, + MDF_WRITEPROTECT, + MDF_WRITEBACK +}; + +#define MTRRTOMRTLEN nitems(x86_mtrrtomrt) + +static int +x86_mtrr2mrt(int val) +{ + + if (val < 0 || val >= MTRRTOMRTLEN) + return (MDF_UNKNOWN); + return (x86_mtrrtomrt[val]); +} + +/* + * x86 MTRR conflicts. Writeback and uncachable may overlap. + */ +static int +x86_mtrrconflict(int flag1, int flag2) +{ + + flag1 &= MDF_ATTRMASK; + flag2 &= MDF_ATTRMASK; + if ((flag1 & MDF_UNKNOWN) || (flag2 & MDF_UNKNOWN)) + return (1); + if (flag1 == flag2 || + (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) || + (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE)) + return (0); + return (1); +} + +/* + * Look for an exactly-matching range. + */ +static struct mem_range_desc * +mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd) +{ + struct mem_range_desc *cand; + int i; + + for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++) + if ((cand->mr_base == mrd->mr_base) && + (cand->mr_len == mrd->mr_len)) + return (cand); + return (NULL); +} + +/* + * Ensure that the direct map region does not contain any mappings + * that span MTRRs of different types. However, the fixed MTRRs can + * be ignored, because a large page mapping the first 1 MB of physical + * memory is a special case that the processor handles. Invalidate + * any old TLB entries that might hold inconsistent memory type + * information. + */ +static void +x86_mr_split_dmap(struct mem_range_softc *sc __unused) +{ +#ifdef __amd64__ + struct mem_range_desc *mrd; + int i; + + i = (sc->mr_cap & MR686_FIXMTRR) ? 
MTRR_N64K + MTRR_N16K + MTRR_N4K : 0; + mrd = sc->mr_desc + i; + for (; i < sc->mr_ndesc; i++, mrd++) { + if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE) + pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, TRUE); + } +#endif +} + +/* + * Fetch the current mtrr settings from the current CPU (assumed to + * all be in sync in the SMP case). Note that if we are here, we + * assume that MTRRs are enabled, and we may or may not have fixed + * MTRRs. + */ +static void +x86_mrfetch(struct mem_range_softc *sc) +{ + struct mem_range_desc *mrd; + u_int64_t msrv; + int i, j, msr; + + mrd = sc->mr_desc; + + /* Get fixed-range MTRRs. */ + if (sc->mr_cap & MR686_FIXMTRR) { + msr = MSR_MTRR64kBase; + for (i = 0; i < (MTRR_N64K / 8); i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = + (mrd->mr_flags & ~MDF_ATTRMASK) | + x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + msr = MSR_MTRR16kBase; + for (i = 0; i < MTRR_N16K / 8; i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = + (mrd->mr_flags & ~MDF_ATTRMASK) | + x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + msr = MSR_MTRR4kBase; + for (i = 0; i < MTRR_N4K / 8; i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = + (mrd->mr_flags & ~MDF_ATTRMASK) | + x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + } + + /* Get remainder which must be variable MTRRs. */ + msr = MSR_MTRRVarBase; + for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) { + msrv = rdmsr(msr); + mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) | + x86_mtrr2mrt(msrv & MTRR_PHYSBASE_TYPE); + mrd->mr_base = msrv & mtrr_physmask; + msrv = rdmsr(msr + 1); + mrd->mr_flags = (msrv & MTRR_PHYSMASK_VALID) ? + (mrd->mr_flags | MDF_ACTIVE) : + (mrd->mr_flags & ~MDF_ACTIVE); + + /* Compute the range from the mask. Ick. */ + mrd->mr_len = (~(msrv & mtrr_physmask) & + (mtrr_physmask | 0xfff)) + 1; + if (!mrvalid(mrd->mr_base, mrd->mr_len)) + mrd->mr_flags |= MDF_BOGUS; + + /* If unclaimed and active, must be the BIOS. */ + if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0)) + strcpy(mrd->mr_owner, mem_owner_bios); + } +} + +/* + * Return the MTRR memory type matching a region's flags + */ +static int +x86_mtrrtype(int flags) +{ + int i; + + flags &= MDF_ATTRMASK; + + for (i = 0; i < MTRRTOMRTLEN; i++) { + if (x86_mtrrtomrt[i] == MDF_UNKNOWN) + continue; + if (flags == x86_mtrrtomrt[i]) + return (i); + } + return (-1); +} + +static int +x86_mrt2mtrr(int flags, int oldval) +{ + int val; + + if ((val = x86_mtrrtype(flags)) == -1) + return (oldval & 0xff); + return (val & 0xff); +} + +/* + * Update running CPU(s) MTRRs to match the ranges in the descriptor + * list. + * + * Must be called with interrupts enabled. + */ +static void +x86_mrstore(struct mem_range_softc *sc) +{ + + smp_rendezvous(NULL, x86_mrstoreone, NULL, sc); +} + +/* + * Update the current CPU's MTRRs with those represented in the + * descriptor list. Note that we do this wholesale rather than just + * stuffing one entry; this is simpler (but slower, of course). 
+ */ +static void +x86_mrstoreone(void *arg) +{ + struct mem_range_softc *sc = arg; + struct mem_range_desc *mrd; + u_int64_t omsrv, msrv; + int i, j, msr; + u_long cr0, cr4; + + mrd = sc->mr_desc; + + critical_enter(); + + /* Disable PGE. */ + cr4 = rcr4(); + load_cr4(cr4 & ~CR4_PGE); + + /* Disable caches (CD = 1, NW = 0). */ + cr0 = rcr0(); + load_cr0((cr0 & ~CR0_NW) | CR0_CD); + + /* Flushes caches and TLBs. */ + wbinvd(); + invltlb(); + + /* Disable MTRRs (E = 0). */ + wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~MTRR_DEF_ENABLE); + + /* Set fixed-range MTRRs. */ + if (sc->mr_cap & MR686_FIXMTRR) { + msr = MSR_MTRR64kBase; + for (i = 0; i < MTRR_N64K / 8; i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= x86_mrt2mtrr((mrd + j)->mr_flags, + omsrv >> (j * 8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + msr = MSR_MTRR16kBase; + for (i = 0; i < MTRR_N16K / 8; i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= x86_mrt2mtrr((mrd + j)->mr_flags, + omsrv >> (j * 8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + msr = MSR_MTRR4kBase; + for (i = 0; i < MTRR_N4K / 8; i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= x86_mrt2mtrr((mrd + j)->mr_flags, + omsrv >> (j * 8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + } + + /* Set remainder which must be variable MTRRs. */ + msr = MSR_MTRRVarBase; + for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) { + /* base/type register */ + omsrv = rdmsr(msr); + if (mrd->mr_flags & MDF_ACTIVE) { + msrv = mrd->mr_base & mtrr_physmask; + msrv |= x86_mrt2mtrr(mrd->mr_flags, omsrv); + } else { + msrv = 0; + } + wrmsr(msr, msrv); + + /* mask/active register */ + if (mrd->mr_flags & MDF_ACTIVE) { + msrv = MTRR_PHYSMASK_VALID | + rounddown2(mtrr_physmask, mrd->mr_len); + } else { + msrv = 0; + } + wrmsr(msr + 1, msrv); + } + + /* Flush caches and TLBs. */ + wbinvd(); + invltlb(); + + /* Enable MTRRs. */ + wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) | MTRR_DEF_ENABLE); + + /* Restore caches and PGE. */ + load_cr0(cr0); + load_cr4(cr4); + + critical_exit(); +} + +/* + * Hunt for the fixed MTRR referencing (addr) + */ +static struct mem_range_desc * +x86_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr) +{ + struct mem_range_desc *mrd; + int i; + + for (i = 0, mrd = sc->mr_desc; i < MTRR_N64K + MTRR_N16K + MTRR_N4K; + i++, mrd++) + if (addr >= mrd->mr_base && + addr < mrd->mr_base + mrd->mr_len) + return (mrd); + return (NULL); +} + +/* + * Try to satisfy the given range request by manipulating the fixed + * MTRRs that cover low memory. + * + * Note that we try to be generous here; we'll bloat the range out to + * the next higher/lower boundary to avoid the consumer having to know + * too much about the mechanisms here. + * + * XXX note that this will have to be updated when we start supporting + * "busy" ranges. + */ +static int +x86_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg) +{ + struct mem_range_desc *first_md, *last_md, *curr_md; + + /* Range check. */ + if ((first_md = x86_mtrrfixsearch(sc, mrd->mr_base)) == NULL || + (last_md = x86_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1)) + == NULL) + return (EINVAL); + + /* Check that we aren't doing something risky. 
*/ + if ((mrd->mr_flags & MDF_FORCE) == 0) { + for (curr_md = first_md; curr_md <= last_md; curr_md++) { + if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN) + return (EACCES); + } + } + + /* Set flags, clear set-by-firmware flag. */ + for (curr_md = first_md; curr_md <= last_md; curr_md++) { + curr_md->mr_flags = mrcopyflags(curr_md->mr_flags & + ~MDF_FIRMWARE, mrd->mr_flags); + bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner)); + } + + return (0); +} + +/* + * Modify/add a variable MTRR to satisfy the request. + * + * XXX needs to be updated to properly support "busy" ranges. + */ +static int +x86_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd, + int *arg) +{ + struct mem_range_desc *curr_md, *free_md; + int i; + + /* + * Scan the currently active variable descriptors, look for + * one we exactly match (straight takeover) and for possible + * accidental overlaps. + * + * Keep track of the first empty variable descriptor in case + * we can't perform a takeover. + */ + i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0; + curr_md = sc->mr_desc + i; + free_md = NULL; + for (; i < sc->mr_ndesc; i++, curr_md++) { + if (curr_md->mr_flags & MDF_ACTIVE) { + /* Exact match? */ + if (curr_md->mr_base == mrd->mr_base && + curr_md->mr_len == mrd->mr_len) { + + /* Whoops, owned by someone. */ + if (curr_md->mr_flags & MDF_BUSY) + return (EBUSY); + + /* Check that we aren't doing something risky */ + if (!(mrd->mr_flags & MDF_FORCE) && + (curr_md->mr_flags & MDF_ATTRMASK) == + MDF_UNKNOWN) + return (EACCES); + + /* Ok, just hijack this entry. */ + free_md = curr_md; + break; + } + + /* Non-exact overlap? */ + if (mroverlap(curr_md, mrd)) { + /* Between conflicting region types? */ + if (x86_mtrrconflict(curr_md->mr_flags, + mrd->mr_flags)) + return (EINVAL); + } + } else if (free_md == NULL) { + free_md = curr_md; + } + } + + /* Got somewhere to put it? */ + if (free_md == NULL) + return (ENOSPC); + + /* Set up new descriptor. */ + free_md->mr_base = mrd->mr_base; + free_md->mr_len = mrd->mr_len; + free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags); + bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner)); + return (0); +} + +/* + * Handle requests to set memory range attributes by manipulating MTRRs. + */ +static int +x86_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg) +{ + struct mem_range_desc *targ; + int error; + + switch (*arg) { + case MEMRANGE_SET_UPDATE: + /* + * Make sure that what's being asked for is even + * possible at all. + */ + if (!mrvalid(mrd->mr_base, mrd->mr_len) || + x86_mtrrtype(mrd->mr_flags) == -1) + return (EINVAL); + +#define FIXTOP \ + ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000)) + + /* Are the "low memory" conditions applicable? */ + if ((sc->mr_cap & MR686_FIXMTRR) != 0 && + mrd->mr_base + mrd->mr_len <= FIXTOP) { + if ((error = x86_mrsetlow(sc, mrd, arg)) != 0) + return (error); + } else { + /* It's time to play with variable MTRRs. */ + if ((error = x86_mrsetvariable(sc, mrd, arg)) != 0) + return (error); + } + break; + + case MEMRANGE_SET_REMOVE: + if ((targ = mem_range_match(sc, mrd)) == NULL) + return (ENOENT); + if (targ->mr_flags & MDF_FIXACTIVE) + return (EPERM); + if (targ->mr_flags & MDF_BUSY) + return (EBUSY); + targ->mr_flags &= ~MDF_ACTIVE; + targ->mr_owner[0] = 0; + break; + + default: + return (EOPNOTSUPP); + } + + x86_mr_split_dmap(sc); + + /* Update the hardware. */ + x86_mrstore(sc); + + /* Refetch to see where we're at. 
*/ + x86_mrfetch(sc); + return (0); +} + +/* + * Work out how many ranges we support, initialise storage for them, + * and fetch the initial settings. + */ +static void +x86_mrinit(struct mem_range_softc *sc) +{ + struct mem_range_desc *mrd; + int i, nmdesc; + + if (sc->mr_desc != NULL) + /* Already initialized. */ + return; + + nmdesc = 0; + mtrrcap = rdmsr(MSR_MTRRcap); + mtrrdef = rdmsr(MSR_MTRRdefType); + + /* For now, bail out if MTRRs are not enabled. */ + if (!(mtrrdef & MTRR_DEF_ENABLE)) { + if (bootverbose) + printf("CPU supports MTRRs but not enabled\n"); + return; + } + nmdesc = mtrrcap & MTRR_CAP_VCNT; + if (bootverbose) + printf("Pentium Pro MTRR support enabled\n"); + + /* + * Determine the size of the PhysMask and PhysBase fields in + * the variable range MTRRs. + */ + mtrr_physmask = (((uint64_t)1 << cpu_maxphyaddr) - 1) & + ~(uint64_t)0xfff; + + /* If fixed MTRRs supported and enabled. */ + if ((mtrrcap & MTRR_CAP_FIXED) && (mtrrdef & MTRR_DEF_FIXED_ENABLE)) { + sc->mr_cap = MR686_FIXMTRR; + nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K; + } + + sc->mr_desc = malloc(nmdesc * sizeof(struct mem_range_desc), M_MEMDESC, + M_WAITOK | M_ZERO); + sc->mr_ndesc = nmdesc; + + mrd = sc->mr_desc; + + /* Populate the fixed MTRR entries' base/length. */ + if (sc->mr_cap & MR686_FIXMTRR) { + for (i = 0; i < MTRR_N64K; i++, mrd++) { + mrd->mr_base = i * 0x10000; + mrd->mr_len = 0x10000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | + MDF_FIXACTIVE; + } + for (i = 0; i < MTRR_N16K; i++, mrd++) { + mrd->mr_base = i * 0x4000 + 0x80000; + mrd->mr_len = 0x4000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | + MDF_FIXACTIVE; + } + for (i = 0; i < MTRR_N4K; i++, mrd++) { + mrd->mr_base = i * 0x1000 + 0xc0000; + mrd->mr_len = 0x1000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | + MDF_FIXACTIVE; + } + } + + /* + * Get current settings, anything set now is considered to + * have been set by the firmware. (XXX has something already + * played here?) + */ + x86_mrfetch(sc); + mrd = sc->mr_desc; + for (i = 0; i < sc->mr_ndesc; i++, mrd++) { + if (mrd->mr_flags & MDF_ACTIVE) + mrd->mr_flags |= MDF_FIRMWARE; + } + + x86_mr_split_dmap(sc); +} + +/* + * Initialise MTRRs on an AP after the BSP has run the init code. + */ +static void +x86_mrAPinit(struct mem_range_softc *sc) +{ + + x86_mrstoreone(sc); + wrmsr(MSR_MTRRdefType, mtrrdef); +} + +/* + * Re-initialise running CPU(s) MTRRs to match the ranges in the descriptor + * list. + * + * Must be called with interrupts enabled. 
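The variable-MTRR encoding used by x86_mrfetch() and x86_mrstoreone() above is easier to follow with numbers. A small sketch, assuming a hypothetical CPU with a 36-bit MAXPHYADDR, for which x86_mrinit() computes mtrr_physmask == 0xffffff000:

	uint64_t physmask = 0xffffff000ULL;	/* address bits 12..35 */
	uint64_t len      = 0x10000000ULL;	/* a 256 MB range */

	/* What x86_mrstoreone() puts in the PhysMask MSR (valid bit aside),
	 * i.e. rounddown2(mtrr_physmask, mr_len). */
	uint64_t maskmsr = physmask & ~(len - 1);		/* 0xff0000000 */

	/* What x86_mrfetch() recovers from that: the original length. */
	uint64_t decoded = (~(maskmsr & physmask) &
	    (physmask | 0xfff)) + 1;				/* 0x10000000 */

Because the mask carries ones exactly in the address bits that must match, complementing it inside the physmask window yields len - 1, and adding one recovers the power-of-two length; this is also why mrvalid() insists on power-of-two, naturally aligned ranges.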
+ */ +static void +x86_mrreinit(struct mem_range_softc *sc) +{ + + smp_rendezvous(NULL, (void (*)(void *))x86_mrAPinit, NULL, sc); +} + +static void +x86_mem_drvinit(void *unused) +{ + + if (mtrrs_disabled) + return; + if (!(cpu_feature & CPUID_MTRR)) + return; + mem_range_softc.mr_op = &x86_mrops; + x86_mrinit(&mem_range_softc); +} +SYSINIT(x86memdev, SI_SUB_CPU, SI_ORDER_ANY, x86_mem_drvinit, NULL); Property changes on: trunk/sys/x86/x86/x86_mem.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/xen/hvm.c =================================================================== --- trunk/sys/x86/xen/hvm.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/xen/hvm.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $"); #include <sys/param.h> #include <sys/bus.h> @@ -59,34 +59,8 @@ #include <xen/interface/vcpu.h> /*--------------------------- Forward Declarations ---------------------------*/ -#ifdef SMP -static driver_filter_t xen_smp_rendezvous_action; -static driver_filter_t xen_invltlb; -static driver_filter_t xen_invlpg; -static driver_filter_t xen_invlrng; -static driver_filter_t xen_invlcache; -#ifdef __i386__ -static driver_filter_t xen_lazypmap; -#endif -static driver_filter_t xen_ipi_bitmap_handler; -static driver_filter_t xen_cpustop_handler; -static driver_filter_t xen_cpususpend_handler; -static driver_filter_t xen_cpustophard_handler; -static void xen_ipi_vectored(u_int vector, int dest); -#endif static void xen_hvm_cpu_init(void); -/*---------------------------- Extern Declarations ---------------------------*/ -#ifdef __i386__ -extern void pmap_lazyfix_action(void); -#endif -#ifdef __amd64__ -extern int pmap_pcid_enabled; -#endif - -/*---------------------------------- Macros ----------------------------------*/ -#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) - /*-------------------------------- Local Types -------------------------------*/ enum xen_hvm_init_type { XEN_HVM_INIT_COLD, @@ -94,18 +68,11 @@ XEN_HVM_INIT_RESUME }; -struct xen_ipi_handler -{ - driver_filter_t *filter; - const char *description; -}; - /*-------------------------------- Global Data -------------------------------*/ enum xen_domain_type xen_domain_type = XEN_NATIVE; #ifdef SMP struct cpu_ops xen_hvm_cpu_ops = { - .ipi_vectored = lapic_ipi_vectored, .cpu_init = xen_hvm_cpu_init, .cpu_resume = xen_hvm_cpu_init }; @@ -113,24 +80,6 @@ static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support"); -#ifdef SMP -static struct xen_ipi_handler xen_ipis[] = -{ - [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" }, - [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"}, - [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" }, - [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" }, - [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" }, -#ifdef __i386__ - [IPI_TO_IDX(IPI_LAZYPMAP)] = { xen_lazypmap, "lp" }, -#endif - [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" }, - [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" }, - [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" }, - [IPI_TO_IDX(IPI_STOP_HARD)] = { 
xen_cpustophard_handler, "sth" }, -}; -#endif - /** * If non-zero, the hypervisor has been configured to use a direct * IDT event callback for interrupt injection. @@ -140,14 +89,10 @@ /*------------------------------- Per-CPU Data -------------------------------*/ DPCPU_DEFINE(struct vcpu_info, vcpu_local_info); DPCPU_DEFINE(struct vcpu_info *, vcpu_info); -#ifdef SMP -DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); -#endif /*------------------ Hypervisor Access Shared Memory Regions -----------------*/ -/** Hypercall table accessed via HYPERVISOR_*_op() methods. */ -char *hypercall_stubs; shared_info_t *HYPERVISOR_shared_info; +start_info_t *HYPERVISOR_start_info; /*------------------------------ Sysctl tunables -----------------------------*/ @@ -156,207 +101,6 @@ TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks); TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics); -#ifdef SMP -/*---------------------------- XEN PV IPI Handlers ---------------------------*/ -/* - * This are C clones of the ASM functions found in apic_vector.s - */ -static int -xen_ipi_bitmap_handler(void *arg) -{ - struct trapframe *frame; - - frame = arg; - ipi_bitmap_handler(*frame); - return (FILTER_HANDLED); -} - -static int -xen_smp_rendezvous_action(void *arg) -{ -#ifdef COUNT_IPIS - (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - smp_rendezvous_action(); - return (FILTER_HANDLED); -} - -static int -xen_invltlb(void *arg) -{ - - invltlb_handler(); - return (FILTER_HANDLED); -} - -#ifdef __amd64__ -static int -xen_invltlb_pcid(void *arg) -{ - - invltlb_pcid_handler(); - return (FILTER_HANDLED); -} -#endif - -static int -xen_invlpg(void *arg) -{ - - invlpg_handler(); - return (FILTER_HANDLED); -} - -#ifdef __amd64__ -static int -xen_invlpg_pcid(void *arg) -{ - - invlpg_pcid_handler(); - return (FILTER_HANDLED); -} -#endif - -static int -xen_invlrng(void *arg) -{ - - invlrng_handler(); - return (FILTER_HANDLED); -} - -static int -xen_invlcache(void *arg) -{ - - invlcache_handler(); - return (FILTER_HANDLED); -} - -#ifdef __i386__ -static int -xen_lazypmap(void *arg) -{ - - pmap_lazyfix_action(); - return (FILTER_HANDLED); -} -#endif - -static int -xen_cpustop_handler(void *arg) -{ - - cpustop_handler(); - return (FILTER_HANDLED); -} - -static int -xen_cpususpend_handler(void *arg) -{ - - cpususpend_handler(); - return (FILTER_HANDLED); -} - -static int -xen_cpustophard_handler(void *arg) -{ - - ipi_nmi_handler(); - return (FILTER_HANDLED); -} - -/* Xen PV IPI sender */ -static void -xen_ipi_vectored(u_int vector, int dest) -{ - xen_intr_handle_t *ipi_handle; - int ipi_idx, to_cpu, self; - - ipi_idx = IPI_TO_IDX(vector); - if (ipi_idx > nitems(xen_ipis)) - panic("IPI out of range"); - - switch(dest) { - case APIC_IPI_DEST_SELF: - ipi_handle = DPCPU_GET(ipi_handle); - xen_intr_signal(ipi_handle[ipi_idx]); - break; - case APIC_IPI_DEST_ALL: - CPU_FOREACH(to_cpu) { - ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); - xen_intr_signal(ipi_handle[ipi_idx]); - } - break; - case APIC_IPI_DEST_OTHERS: - self = PCPU_GET(cpuid); - CPU_FOREACH(to_cpu) { - if (to_cpu != self) { - ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); - xen_intr_signal(ipi_handle[ipi_idx]); - } - } - break; - default: - to_cpu = apic_cpuid(dest); - ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); - xen_intr_signal(ipi_handle[ipi_idx]); - break; - } -} - -/*---------------------- XEN diverged cpu operations -------------------------*/ -static void -xen_cpu_ipi_init(int cpu) -{ - xen_intr_handle_t 
*ipi_handle; - const struct xen_ipi_handler *ipi; - device_t dev; - int idx, rc; - - ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); - dev = pcpu_find(cpu)->pc_device; - KASSERT((dev != NULL), ("NULL pcpu device_t")); - - for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { - - if (ipi->filter == NULL) { - ipi_handle[idx] = NULL; - continue; - } - - rc = xen_intr_alloc_and_bind_ipi(dev, cpu, ipi->filter, - INTR_TYPE_TTY, &ipi_handle[idx]); - if (rc != 0) - panic("Unable to allocate a XEN IPI port"); - xen_intr_describe(ipi_handle[idx], "%s", ipi->description); - } -} - -static void -xen_setup_cpus(void) -{ - int i; - - if (!xen_hvm_domain() || !xen_vector_callback_enabled) - return; - -#ifdef __amd64__ - if (pmap_pcid_enabled) { - xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = xen_invltlb_pcid; - xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = xen_invlpg_pcid; - } -#endif - CPU_FOREACH(i) - xen_cpu_ipi_init(i); - - /* Set the xen pv ipi ops to replace the native ones */ - cpu_ops.ipi_vectored = xen_ipi_vectored; -} -#endif - /*---------------------- XEN Hypervisor Probe and Setup ----------------------*/ static uint32_t xen_hvm_cpuid_base(void) @@ -376,16 +120,21 @@ * Allocate and fill in the hypcall page. */ static int -xen_hvm_init_hypercall_stubs(void) +xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type) { uint32_t base, regs[4]; int i; + if (xen_pv_domain()) { + /* hypercall page is already set in the PV case */ + return (0); + } + base = xen_hvm_cpuid_base(); if (base == 0) return (ENXIO); - if (hypercall_stubs == NULL) { + if (init_type == XEN_HVM_INIT_COLD) { int major, minor; do_cpuid(base + 1, regs); @@ -417,18 +166,9 @@ * Find the hypercall pages. */ do_cpuid(base + 2, regs); - - if (hypercall_stubs == NULL) { - size_t call_region_size; - call_region_size = regs[0] * PAGE_SIZE; - hypercall_stubs = malloc(call_region_size, M_XENHVM, M_NOWAIT); - if (hypercall_stubs == NULL) - panic("Unable to allocate Xen hypercall region"); - } - for (i = 0; i < regs[0]; i++) - wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i); + wrmsr(regs[1], vtophys(&hypercall_page + i * PAGE_SIZE) + i); return (0); } @@ -438,6 +178,14 @@ { struct xen_add_to_physmap xatp; + if (xen_pv_domain()) { + /* + * Already setup in the PV case, shared_info is passed inside + * of the start_info struct at start of day. + */ + return; + } + if (HYPERVISOR_shared_info == NULL) { HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT); if (HYPERVISOR_shared_info == NULL) @@ -516,6 +264,16 @@ { u_short disable_devs = 0; + if (xen_pv_domain()) { + /* + * No emulated devices in the PV case, so no need to unplug + * anything. + */ + if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0) + printf("PV devices cannot be disabled in PV guests\n"); + return; + } + if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC) return; @@ -543,7 +301,7 @@ if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND) return; - error = xen_hvm_init_hypercall_stubs(); + error = xen_hvm_init_hypercall_stubs(init_type); switch (init_type) { case XEN_HVM_INIT_COLD: @@ -550,11 +308,21 @@ if (error != 0) return; + /* + * If xen_domain_type is not set at this point + * it means we are inside a (PV)HVM guest, because + * for PVH the guest type is set much earlier + * (see hammer_time_xen). 
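A small aside on the comment above: the reasoning only works because the PV/PVH entry point records the guest type long before this code runs. Below is a hedged sketch of that first-writer-wins ordering, with made-up names standing in for xen_domain_type and hammer_time_xen():

#include <stdio.h>

enum guest_kind { GUEST_UNSET, GUEST_XEN_PV, GUEST_XEN_HVM };

static enum guest_kind guest_kind = GUEST_UNSET;

/* Analogue of the early PVH entry point: runs before generic init. */
static void
early_pv_entry(void)
{
	guest_kind = GUEST_XEN_PV;
}

/* Analogue of the hunk above: only claims HVM if nothing ran earlier. */
static void
late_hvm_init(void)
{
	if (guest_kind == GUEST_UNSET)
		guest_kind = GUEST_XEN_HVM;
}

int
main(void)
{
	(void)early_pv_entry;	/* not called: simulate a plain HVM boot */
	late_hvm_init();
	printf("detected: %s\n",
	    guest_kind == GUEST_XEN_HVM ? "HVM" : "PV/PVH");
	return (0);
}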
+ */ + if (!xen_domain()) { + xen_domain_type = XEN_HVM_DOMAIN; + vm_guest = VM_GUEST_XEN; + } + setup_xen_features(); #ifdef SMP cpu_ops = xen_hvm_cpu_ops; #endif - vm_guest = VM_GUEST_XEN; break; case XEN_HVM_INIT_RESUME: if (error != 0) @@ -569,9 +337,15 @@ } xen_vector_callback_enabled = 0; - xen_domain_type = XEN_HVM_DOMAIN; + xen_hvm_set_callback(NULL); + + /* + * On (PV)HVM domains we need to request the hypervisor to + * fill the shared info page, for PVH guest the shared_info page + * is passed inside the start_info struct and is already set, so this + * functions are no-ops. + */ xen_hvm_init_shared_info_page(); - xen_hvm_set_callback(NULL); xen_hvm_disable_emulated_devices(); } @@ -603,6 +377,9 @@ struct pcpu *pc; int i; + if (!xen_hvm_domain()) + return; + /* Set vcpu_id to acpi_id */ CPU_FOREACH(i) { pc = pcpu_find(i); @@ -645,8 +422,5 @@ } SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL); -#ifdef SMP -SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL); -#endif SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL); SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL); Added: trunk/sys/x86/xen/pv.c =================================================================== --- trunk/sys/x86/xen/pv.c (rev 0) +++ trunk/sys/x86/xen/pv.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,428 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006,2008 Kip Macy + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2013 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pv.c 344378 2019-02-20 19:19:24Z kevans $"); + +#include "opt_ddb.h" +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/boot.h> +#include <sys/ctype.h> +#include <sys/mutex.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_param.h> + +#include <machine/intr_machdep.h> +#include <x86/apicvar.h> +#include <x86/init.h> +#include <machine/pc/bios.h> +#include <machine/smp.h> +#include <machine/intr_machdep.h> +#include <machine/metadata.h> + +#include <xen/xen-os.h> +#include <xen/hypervisor.h> +#include <xen/xenstore/xenstorevar.h> +#include <xen/xen_pv.h> +#include <xen/xen_msi.h> + +#include <xen/interface/vcpu.h> + +#include <dev/xen/timer/timer.h> + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +/* Native initial function */ +extern u_int64_t hammer_time(u_int64_t, u_int64_t); +/* Xen initial function */ +uint64_t hammer_time_xen(start_info_t *, uint64_t); + +#define MAX_E820_ENTRIES 128 + +/*--------------------------- Forward Declarations ---------------------------*/ +static caddr_t xen_pv_parse_preload_data(u_int64_t); +static void xen_pv_parse_memmap(caddr_t, vm_paddr_t *, int *); + +#ifdef SMP +static int xen_pv_start_all_aps(void); +#endif + +/*---------------------------- Extern Declarations ---------------------------*/ +#ifdef SMP +/* Variables used by amd64 mp_machdep to start APs */ +extern char *doublefault_stack; +extern char *mce_stack; +extern char *nmi_stack; +#endif + +/* + * Placed by the linker at the end of the bss section, which is the last + * section loaded by Xen before loading the symtab and strtab. + */ +extern uint32_t end; + +/*-------------------------------- Global Data -------------------------------*/ +/* Xen init_ops implementation. */ +struct init_ops xen_init_ops = { + .parse_preload_data = xen_pv_parse_preload_data, + .early_clock_source_init = xen_clock_init, + .early_delay = xen_delay, + .parse_memmap = xen_pv_parse_memmap, +#ifdef SMP + .start_all_aps = xen_pv_start_all_aps, +#endif + .msi_init = xen_msi_init, +}; + +static struct bios_smap xen_smap[MAX_E820_ENTRIES]; + +/*-------------------------------- Xen PV init -------------------------------*/ +/* + * First function called by the Xen PVH boot sequence. + * + * Set some Xen global variables and prepare the environment so it is + * as similar as possible to what native FreeBSD init function expects. 
+ */ +uint64_t +hammer_time_xen(start_info_t *si, uint64_t xenstack) +{ + uint64_t physfree; + uint64_t *PT4 = (u_int64_t *)xenstack; + uint64_t *PT3 = (u_int64_t *)(xenstack + PAGE_SIZE); + uint64_t *PT2 = (u_int64_t *)(xenstack + 2 * PAGE_SIZE); + int i; + + xen_domain_type = XEN_PV_DOMAIN; + vm_guest = VM_GUEST_XEN; + + if ((si == NULL) || (xenstack == 0)) { + xc_printf("ERROR: invalid start_info or xen stack, halting\n"); + HYPERVISOR_shutdown(SHUTDOWN_crash); + } + + xc_printf("FreeBSD PVH running on %s\n", si->magic); + + /* We use 3 pages of xen stack for the boot pagetables */ + physfree = xenstack + 3 * PAGE_SIZE - KERNBASE; + + /* Setup Xen global variables */ + HYPERVISOR_start_info = si; + HYPERVISOR_shared_info = + (shared_info_t *)(si->shared_info + KERNBASE); + + /* + * Setup some misc global variables for Xen devices + * + * XXX: Devices that need these specific variables should + * be rewritten to fetch this info by themselves from the + * start_info page. + */ + xen_store = (struct xenstore_domain_interface *) + (ptoa(si->store_mfn) + KERNBASE); + console_page = (char *)(ptoa(si->console.domU.mfn) + KERNBASE); + + /* + * Use the stack Xen gives us to build the page tables + * as native FreeBSD expects to find them (created + * by the boot trampoline). + */ + for (i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); i++) { + /* + * Each slot of the level 4 pages points + * to the same level 3 page + */ + PT4[i] = ((uint64_t)&PT3[0]) - KERNBASE; + PT4[i] |= PG_V | PG_RW | PG_U; + + /* + * Each slot of the level 3 pages points + * to the same level 2 page + */ + PT3[i] = ((uint64_t)&PT2[0]) - KERNBASE; + PT3[i] |= PG_V | PG_RW | PG_U; + + /* + * The level 2 page slots are mapped with + * 2MB pages for 1GB. + */ + PT2[i] = i * (2 * 1024 * 1024); + PT2[i] |= PG_V | PG_RW | PG_PS | PG_U; + } + load_cr3(((uint64_t)&PT4[0]) - KERNBASE); + + /* Set the hooks for early functions that diverge from bare metal */ + init_ops = xen_init_ops; + apic_ops = xen_apic_ops; + + /* Now we can jump into the native init function */ + return (hammer_time(0, physfree)); +} + +/*-------------------------------- PV specific -------------------------------*/ +#ifdef SMP +static bool +start_xen_ap(int cpu) +{ + struct vcpu_guest_context *ctxt; + int ms, cpus = mp_naps; + const size_t stacksize = kstack_pages * PAGE_SIZE; + + /* allocate and set up an idle stack data page */ + bootstacks[cpu] = + (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO); + doublefault_stack = + (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); + mce_stack = + (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); + nmi_stack = + (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); + dpcpu = + (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); + + bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; + bootAP = cpu; + + ctxt = malloc(sizeof(*ctxt), M_TEMP, M_WAITOK | M_ZERO); + + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.rip = (unsigned long) init_secondary; + ctxt->user_regs.rsp = (unsigned long) bootSTK; + + /* Set the AP to use the same page tables */ + ctxt->ctrlreg[3] = KPML4phys; + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + panic("unable to initialize AP#%d", cpu); + + free(ctxt, M_TEMP); + + /* Launch the vCPU */ + if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + panic("unable to start AP#%d", cpu); + + /* Wait up to 5 seconds for it to start. 
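One detail of the page-table loop in hammer_time_xen() above is easy to miss: every L4 and L3 slot points at the same next-level table, so only the L2 index and the page offset select a physical address, and the low 1 GiB of RAM is visible at every 1 GiB-aligned virtual window (which is why KERNBASE-relative addresses resolve to low physical memory without a separate identity map). A userland-only sketch of that translation follows; the KERNBASE value is assumed to be amd64's 0xffffffff80000000 purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* With 512 slots of 2 MiB pages, translation reduces to "mod 1 GiB". */
static uint64_t
boot_va_to_pa(uint64_t va)
{
	return (va & ((1ULL << 30) - 1));
}

int
main(void)
{
	uint64_t kernbase = 0xffffffff80000000ULL;	/* assumed value */

	printf("0x%016llx -> phys 0x%09llx\n",
	    (unsigned long long)kernbase,
	    (unsigned long long)boot_va_to_pa(kernbase));
	printf("0x%016llx -> phys 0x%09llx\n",
	    (unsigned long long)(kernbase + 0x200000),
	    (unsigned long long)boot_va_to_pa(kernbase + 0x200000));
	return (0);
}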
*/ + for (ms = 0; ms < 5000; ms++) { + if (mp_naps > cpus) + return (true); + DELAY(1000); + } + + return (false); +} + +static int +xen_pv_start_all_aps(void) +{ + int cpu; + + mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + + for (cpu = 1; cpu < mp_ncpus; cpu++) { + + /* attempt to start the Application Processor */ + if (!start_xen_ap(cpu)) + panic("AP #%d failed to start!", cpu); + + CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ + } + + return (mp_naps); +} +#endif /* SMP */ + +/* + * Functions to convert the "extra" parameters passed by Xen + * into FreeBSD boot options. + */ +static void +xen_pv_set_env(void) +{ + char *cmd_line_next, *cmd_line; + size_t env_size; + + cmd_line = HYPERVISOR_start_info->cmd_line; + env_size = sizeof(HYPERVISOR_start_info->cmd_line); + + /* Skip leading spaces */ + for (; isspace(*cmd_line) && (env_size != 0); cmd_line++) + env_size--; + + /* Replace ',' with '\0' */ + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;) + ; + + init_static_kenv(cmd_line, 0); +} + +#ifdef DDB +/* + * The way Xen loads the symtab is different from the native boot loader, + * because it's tailored for NetBSD. So we have to adapt and use the same + * method as NetBSD. Portions of the code below have been picked from NetBSD: + * sys/kern/kern_ksyms.c CVS Revision 1.71. + */ +static void +xen_pv_parse_symtab(void) +{ + Elf_Ehdr *ehdr; + Elf_Shdr *shdr; + vm_offset_t sym_end; + uint32_t size; + int i, j; + + size = end; + sym_end = HYPERVISOR_start_info->mod_start != 0 ? + HYPERVISOR_start_info->mod_start : + HYPERVISOR_start_info->mfn_list; + + /* + * Make sure the size is right headed, sym_end is just a + * high boundary, but at least allows us to fail earlier. + */ + if ((vm_offset_t)&end + size > sym_end) { + xc_printf("Unable to load ELF symtab: size mismatch\n"); + return; + } + + ehdr = (Elf_Ehdr *)(&end + 1); + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) || + ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || + ehdr->e_version > 1) { + xc_printf("Unable to load ELF symtab: invalid symbol table\n"); + return; + } + + shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff); + /* Find the symbol table and the corresponding string table. */ + for (i = 1; i < ehdr->e_shnum; i++) { + if (shdr[i].sh_type != SHT_SYMTAB) + continue; + if (shdr[i].sh_offset == 0) + continue; + ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset); + ksymtab_size = shdr[i].sh_size; + j = shdr[i].sh_link; + if (shdr[j].sh_offset == 0) + continue; /* Can this happen? */ + kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset); + break; + } + + if (ksymtab == 0 || kstrtab == 0) { + xc_printf( + "Unable to load ELF symtab: could not find symtab or strtab\n"); + return; + } +} +#endif + +static caddr_t +xen_pv_parse_preload_data(u_int64_t modulep) +{ + caddr_t kmdp; + vm_ooffset_t off; + vm_paddr_t metadata; + char *envp; + + if (HYPERVISOR_start_info->mod_start != 0) { + preload_metadata = (caddr_t)(HYPERVISOR_start_info->mod_start); + + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + KASSERT(kmdp != NULL, ("unable to find kernel")); + + /* + * Xen has relocated the metadata and the modules, + * so we need to recalculate it's position. This is + * done by saving the original modulep address and + * then calculating the offset with mod_start, + * which contains the relocated modulep address. 
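The relocation described in the comment above boils down to rebasing every recorded address by one common offset, computed as (relocated base - recorded base). A minimal sketch with illustrative numbers only; the names do not correspond to kernel symbols.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t recorded_modulep = 0x00200000;	 /* address saved in metadata */
	uint64_t relocated_modulep = 0x04200000; /* where the loader put it */
	uint64_t off = relocated_modulep - recorded_modulep;
	uint64_t recorded_envp = 0x00240000;	 /* stale pointer in metadata */

	printf("offset     = 0x%llx\n", (unsigned long long)off);
	printf("fixed envp = 0x%llx\n",
	    (unsigned long long)(recorded_envp + off));
	return (0);
}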
+ */ + metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t); + off = HYPERVISOR_start_info->mod_start - metadata; + + preload_bootstrap_relocate(off); + + boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); + envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); + if (envp != NULL) + envp += off; + init_static_kenv(envp, 0); + } else { + /* Parse the extra boot information given by Xen */ + xen_pv_set_env(); + boothowto |= boot_env_to_howto(); + kmdp = NULL; + } + +#ifdef DDB + xen_pv_parse_symtab(); +#endif + return (kmdp); +} + +static void +xen_pv_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) +{ + struct xen_memory_map memmap; + u_int32_t size; + int rc; + + /* Fetch the E820 map from Xen */ + memmap.nr_entries = MAX_E820_ENTRIES; + set_xen_guest_handle(memmap.buffer, xen_smap); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) + panic("unable to fetch Xen E820 memory map"); + size = memmap.nr_entries * sizeof(xen_smap[0]); + + bios_add_smap_entries(xen_smap, size, physmap, physmap_idx); +} Property changes on: trunk/sys/x86/xen/pv.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/xen/pvcpu_enum.c =================================================================== --- trunk/sys/x86/xen/pvcpu_enum.c (rev 0) +++ trunk/sys/x86/xen/pvcpu_enum.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,267 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org> + * Copyright (c) 2013 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pvcpu_enum.c 340016 2018-11-01 18:34:26Z jhb $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/smp.h> +#include <sys/pcpu.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/intr_machdep.h> +#include <x86/apicvar.h> + +#include <machine/cpu.h> +#include <machine/smp.h> + +#include <xen/xen-os.h> +#include <xen/xen_intr.h> +#include <xen/hypervisor.h> + +#include <xen/interface/vcpu.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/aclocal.h> +#include <contrib/dev/acpica/include/actables.h> + +#include <dev/acpica/acpivar.h> + +static int xenpv_probe(void); +static int xenpv_probe_cpus(void); +static int xenpv_setup_local(void); +static int xenpv_setup_io(void); + +static ACPI_TABLE_MADT *madt; +static vm_paddr_t madt_physaddr; +static vm_offset_t madt_length; + +static struct apic_enumerator xenpv_enumerator = { + "Xen PV", + xenpv_probe, + xenpv_probe_cpus, + xenpv_setup_local, + xenpv_setup_io +}; + +/*--------------------- Helper functions to parse MADT -----------------------*/ + +/* + * Parse an interrupt source override for an ISA interrupt. + */ +static void +madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr) +{ + enum intr_trigger trig; + enum intr_polarity pol; + int ret; + + if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 && + intr->GlobalIrq == 2) { + if (bootverbose) + printf("MADT: Skipping timer override\n"); + return; + } + + madt_parse_interrupt_values(intr, &trig, &pol); + + /* Remap the IRQ if it is mapped to a different interrupt vector. */ + if (intr->SourceIrq != intr->GlobalIrq && intr->GlobalIrq > 15 && + intr->SourceIrq == AcpiGbl_FADT.SciInterrupt) + /* + * If the SCI is remapped to a non-ISA global interrupt, + * then override the vector we use to setup. + */ + acpi_OverrideInterruptLevel(intr->GlobalIrq); + + /* Register the IRQ with the polarity and trigger mode found. */ + ret = xen_register_pirq(intr->GlobalIrq, trig, pol); + if (ret != 0) + panic("Unable to register interrupt override"); +} + +/* + * Call the handler routine for each entry in the MADT table. + */ +static void +madt_walk_table(acpi_subtable_handler *handler, void *arg) +{ + + acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, + handler, arg); +} + +/* + * Parse interrupt entries. 
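The MADT walk used above (acpi_walk_subtables(), called from madt_walk_table()) relies on each entry carrying a one-byte type and a one-byte length, with the cursor advancing by that length. Below is a self-contained sketch of such a walk over fabricated bytes; type 2 is assumed here to denote the interrupt-source-override entry, matching the check in madt_parse_ints().

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Two fake entries: an 8-byte type-0 entry and a 10-byte type-2 one. */
	uint8_t table[] = {
		0, 8, 0, 0, 0, 0, 0, 0,
		2, 10, 0, 0, 0, 0, 0, 0, 0, 0,
	};
	uint8_t *p = table, *end = table + sizeof(table);
	int overrides = 0;

	while (p + 2 <= end) {
		uint8_t type = p[0], len = p[1];

		if (len < 2 || p + len > end)
			break;		/* malformed entry: stop walking */
		if (type == 2)
			overrides++;
		p += len;
	}
	printf("interrupt overrides seen: %d\n", overrides);
	return (0);
}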
+ */ +static void +madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused) +{ + + if (entry->Type == ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) + madt_parse_interrupt_override( + (ACPI_MADT_INTERRUPT_OVERRIDE *)entry); +} + +/*---------------------------- Xen PV enumerator -----------------------------*/ + +/* + * This enumerator will only be registered on PVH + */ +static int +xenpv_probe(void) +{ + return (0); +} + +/* + * Test each possible vCPU in order to find the number of vCPUs + */ +static int +xenpv_probe_cpus(void) +{ +#ifdef SMP + int i, ret; + + for (i = 0; i < MAXCPU; i++) { + ret = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (ret >= 0) + lapic_create((i * 2), (i == 0)); + } +#endif + return (0); +} + +/* + * Initialize the vCPU id of the BSP + */ +static int +xenpv_setup_local(void) +{ + PCPU_SET(vcpu_id, 0); + lapic_init(0); + return (0); +} + +/* + * On PVH guests there's no IO APIC + */ +static int +xenpv_setup_io(void) +{ + + if (xen_initial_domain()) { + /* + * NB: we could iterate over the MADT IOAPIC entries in order + * to figure out the exact number of IOAPIC interrupts, but + * this is legacy code so just keep using the previous + * behaviour and assume a maximum of 256 interrupts. + */ + num_io_irqs = max(MINIMUM_MSI_INT - 1, num_io_irqs); + + acpi_SetDefaultIntrModel(ACPI_INTR_APIC); + } + return (0); +} + +void +xenpv_register_pirqs(struct pic *pic __unused) +{ + unsigned int i; + int ret; + + /* Map MADT */ + madt_physaddr = acpi_find_table(ACPI_SIG_MADT); + madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT); + madt_length = madt->Header.Length; + + /* Try to initialize ACPI so that we can access the FADT. */ + ret = acpi_Startup(); + if (ACPI_FAILURE(ret)) { + printf("MADT: ACPI Startup failed with %s\n", + AcpiFormatException(ret)); + printf("Try disabling either ACPI or apic support.\n"); + panic("Using MADT but ACPI doesn't work"); + } + + /* Run through the table to see if there are any overrides. */ + madt_walk_table(madt_parse_ints, NULL); + + /* + * If there was not an explicit override entry for the SCI, + * force it to use level trigger and active-low polarity. 
+ */ + if (!madt_found_sci_override) { + printf( +"MADT: Forcing active-low polarity and level trigger for SCI\n"); + ret = xen_register_pirq(AcpiGbl_FADT.SciInterrupt, + INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW); + if (ret != 0) + panic("Unable to register SCI IRQ"); + } + + /* Register legacy ISA IRQs */ + for (i = 1; i < 16; i++) { + if (intr_lookup_source(i) != NULL) + continue; + ret = xen_register_pirq(i, INTR_TRIGGER_EDGE, + INTR_POLARITY_LOW); + if (ret != 0 && bootverbose) + printf("Unable to register legacy IRQ#%u: %d\n", i, + ret); + } +} + +static void +xenpv_register(void *dummy __unused) +{ + if (xen_pv_domain()) { + apic_register_enumerator(&xenpv_enumerator); + } +} +SYSINIT(xenpv_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, xenpv_register, NULL); + +/* + * Setup per-CPU vCPU IDs + */ +static void +xenpv_set_ids(void *dummy) +{ + struct pcpu *pc; + int i; + + CPU_FOREACH(i) { + pc = pcpu_find(i); + pc->pc_vcpu_id = i; + } +} +SYSINIT(xenpv_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, xenpv_set_ids, NULL); Property changes on: trunk/sys/x86/xen/pvcpu_enum.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/xen/xen_apic.c =================================================================== --- trunk/sys/x86/xen/xen_apic.c (rev 0) +++ trunk/sys/x86/xen/xen_apic.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,598 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_apic.c 334047 2018-05-22 14:36:46Z kib $"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/cpufunc.h> +#include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <x86/apicreg.h> +#include <x86/apicvar.h> + +#include <xen/xen-os.h> +#include <xen/features.h> +#include <xen/gnttab.h> +#include <xen/hypervisor.h> +#include <xen/hvm.h> +#include <xen/xen_intr.h> + +#include <xen/interface/vcpu.h> + +/*--------------------------------- Macros -----------------------------------*/ + +#define XEN_APIC_UNSUPPORTED \ + panic("%s: not available in Xen PV port.", __func__) + + +/*--------------------------- Forward Declarations ---------------------------*/ +#ifdef SMP +static driver_filter_t xen_smp_rendezvous_action; +static driver_filter_t xen_invltlb; +static driver_filter_t xen_invlpg; +static driver_filter_t xen_invlrng; +static driver_filter_t xen_invlcache; +static driver_filter_t xen_ipi_bitmap_handler; +static driver_filter_t xen_cpustop_handler; +static driver_filter_t xen_cpususpend_handler; +static driver_filter_t xen_cpustophard_handler; +#endif + +/*---------------------------------- Macros ----------------------------------*/ +#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) + +/*--------------------------------- Xen IPIs ---------------------------------*/ +#ifdef SMP +struct xen_ipi_handler +{ + driver_filter_t *filter; + const char *description; +}; + +static struct xen_ipi_handler xen_ipis[] = +{ + [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" }, + [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"}, + [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" }, + [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" }, + [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" }, + [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" }, + [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" }, + [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" }, + [IPI_TO_IDX(IPI_STOP_HARD)] = { xen_cpustophard_handler, "sth" }, +}; +#endif + +/*------------------------------- Per-CPU Data -------------------------------*/ +#ifdef SMP +DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); +#endif + +/*------------------------------- Xen PV APIC --------------------------------*/ + +static void +xen_pv_lapic_create(u_int apic_id, int boot_cpu) +{ +#ifdef SMP + cpu_add(apic_id, boot_cpu); +#endif +} + +static void +xen_pv_lapic_init(vm_paddr_t addr) +{ + +} + +static void +xen_pv_lapic_setup(int boot) +{ + +} + +static void +xen_pv_lapic_dump(const char *str) +{ + + printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str); +} + +static void +xen_pv_lapic_disable(void) +{ + +} + +static bool +xen_pv_lapic_is_x2apic(void) +{ + + return (false); +} + +static void +xen_pv_lapic_eoi(void) +{ + + XEN_APIC_UNSUPPORTED; +} + +static int +xen_pv_lapic_id(void) +{ + + return (PCPU_GET(apic_id)); +} + +static int +xen_pv_lapic_intr_pending(u_int vector) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static u_int +xen_pv_apic_cpuid(u_int apic_id) +{ +#ifdef SMP + return (apic_cpuids[apic_id]); +#else + return (0); +#endif +} + +static u_int +xen_pv_apic_alloc_vector(u_int apic_id, u_int irq) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static u_int 
+xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static void +xen_pv_apic_disable_vector(u_int apic_id, u_int vector) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_apic_enable_vector(u_int apic_id, u_int vector) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) +{ + + XEN_APIC_UNSUPPORTED; +} + +static int +xen_pv_lapic_enable_pmc(void) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static void +xen_pv_lapic_disable_pmc(void) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_lapic_reenable_pmc(void) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_lapic_enable_cmc(void) +{ + +} + +#ifdef SMP +static void +xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest) +{ + + XEN_APIC_UNSUPPORTED; +} + +static void +xen_pv_lapic_ipi_vectored(u_int vector, int dest) +{ + xen_intr_handle_t *ipi_handle; + int ipi_idx, to_cpu, self; + + ipi_idx = IPI_TO_IDX(vector); + if (ipi_idx >= nitems(xen_ipis)) + panic("IPI out of range"); + + switch(dest) { + case APIC_IPI_DEST_SELF: + ipi_handle = DPCPU_GET(ipi_handle); + xen_intr_signal(ipi_handle[ipi_idx]); + break; + case APIC_IPI_DEST_ALL: + CPU_FOREACH(to_cpu) { + ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); + xen_intr_signal(ipi_handle[ipi_idx]); + } + break; + case APIC_IPI_DEST_OTHERS: + self = PCPU_GET(cpuid); + CPU_FOREACH(to_cpu) { + if (to_cpu != self) { + ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); + xen_intr_signal(ipi_handle[ipi_idx]); + } + } + break; + default: + to_cpu = apic_cpuid(dest); + ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); + xen_intr_signal(ipi_handle[ipi_idx]); + break; + } +} + +static int +xen_pv_lapic_ipi_wait(int delay) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} +#endif /* SMP */ + +static int +xen_pv_lapic_ipi_alloc(inthand_t *ipifunc) +{ + + XEN_APIC_UNSUPPORTED; + return (-1); +} + +static void +xen_pv_lapic_ipi_free(int vector) +{ + + XEN_APIC_UNSUPPORTED; +} + +static int +xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static int +xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static int +xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +static int +xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, + enum intr_trigger trigger) +{ + + XEN_APIC_UNSUPPORTED; + return (0); +} + +/* Xen apic_ops implementation */ +struct apic_ops xen_apic_ops = { + .create = xen_pv_lapic_create, + .init = xen_pv_lapic_init, + .xapic_mode = xen_pv_lapic_disable, + .is_x2apic = xen_pv_lapic_is_x2apic, + .setup = xen_pv_lapic_setup, + .dump = xen_pv_lapic_dump, + .disable = xen_pv_lapic_disable, + .eoi = xen_pv_lapic_eoi, + .id = xen_pv_lapic_id, + .intr_pending = xen_pv_lapic_intr_pending, + .set_logical_id = xen_pv_lapic_set_logical_id, + .cpuid = xen_pv_apic_cpuid, + .alloc_vector = xen_pv_apic_alloc_vector, + .alloc_vectors = xen_pv_apic_alloc_vectors, + .enable_vector = xen_pv_apic_enable_vector, + .disable_vector = xen_pv_apic_disable_vector, + .free_vector = xen_pv_apic_free_vector, + .enable_pmc = xen_pv_lapic_enable_pmc, + .disable_pmc = xen_pv_lapic_disable_pmc, + .reenable_pmc = xen_pv_lapic_reenable_pmc, + .enable_cmc = 
xen_pv_lapic_enable_cmc, +#ifdef SMP + .ipi_raw = xen_pv_lapic_ipi_raw, + .ipi_vectored = xen_pv_lapic_ipi_vectored, + .ipi_wait = xen_pv_lapic_ipi_wait, +#endif + .ipi_alloc = xen_pv_lapic_ipi_alloc, + .ipi_free = xen_pv_lapic_ipi_free, + .set_lvt_mask = xen_pv_lapic_set_lvt_mask, + .set_lvt_mode = xen_pv_lapic_set_lvt_mode, + .set_lvt_polarity = xen_pv_lapic_set_lvt_polarity, + .set_lvt_triggermode = xen_pv_lapic_set_lvt_triggermode, +}; + +#ifdef SMP +/*---------------------------- XEN PV IPI Handlers ---------------------------*/ +/* + * These are C clones of the ASM functions found in apic_vector. + */ +static int +xen_ipi_bitmap_handler(void *arg) +{ + struct trapframe *frame; + + frame = arg; + ipi_bitmap_handler(*frame); + return (FILTER_HANDLED); +} + +static int +xen_smp_rendezvous_action(void *arg) +{ +#ifdef COUNT_IPIS + (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + smp_rendezvous_action(); + return (FILTER_HANDLED); +} + +static int +xen_invltlb(void *arg) +{ + + invltlb_handler(); + return (FILTER_HANDLED); +} + +#ifdef __amd64__ +static int +xen_invltlb_invpcid(void *arg) +{ + + invltlb_invpcid_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invltlb_pcid(void *arg) +{ + + invltlb_pcid_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invltlb_invpcid_pti(void *arg) +{ + + invltlb_invpcid_pti_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlpg_invpcid_handler(void *arg) +{ + + invlpg_invpcid_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlpg_pcid_handler(void *arg) +{ + + invlpg_pcid_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlrng_invpcid_handler(void *arg) +{ + + invlrng_invpcid_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlrng_pcid_handler(void *arg) +{ + + invlrng_pcid_handler(); + return (FILTER_HANDLED); +} +#endif + +static int +xen_invlpg(void *arg) +{ + + invlpg_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlrng(void *arg) +{ + + invlrng_handler(); + return (FILTER_HANDLED); +} + +static int +xen_invlcache(void *arg) +{ + + invlcache_handler(); + return (FILTER_HANDLED); +} + +static int +xen_cpustop_handler(void *arg) +{ + + cpustop_handler(); + return (FILTER_HANDLED); +} + +static int +xen_cpususpend_handler(void *arg) +{ + + cpususpend_handler(); + return (FILTER_HANDLED); +} + +static int +xen_cpustophard_handler(void *arg) +{ + + ipi_nmi_handler(); + return (FILTER_HANDLED); +} + +/*----------------------------- XEN PV IPI setup -----------------------------*/ +/* + * Those functions are provided outside of the Xen PV APIC implementation + * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC, + * because on PVHVM there's an emulated LAPIC provided by Xen. + */ +static void +xen_cpu_ipi_init(int cpu) +{ + xen_intr_handle_t *ipi_handle; + const struct xen_ipi_handler *ipi; + int idx, rc; + + ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); + + for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { + + if (ipi->filter == NULL) { + ipi_handle[idx] = NULL; + continue; + } + + rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter, + INTR_TYPE_TTY, &ipi_handle[idx]); + if (rc != 0) + panic("Unable to allocate a XEN IPI port"); + xen_intr_describe(ipi_handle[idx], "%s", ipi->description); + } +} + +static void +xen_setup_cpus(void) +{ + int i; + + if (!xen_vector_callback_enabled) + return; + +#ifdef __amd64__ + if (pmap_pcid_enabled) { + if (pti) + xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = + invpcid_works ? 
xen_invltlb_invpcid_pti : + xen_invltlb_pcid; + else + xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = + invpcid_works ? xen_invltlb_invpcid : + xen_invltlb_pcid; + xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ? + xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler; + xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ? + xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler; + } +#endif + CPU_FOREACH(i) + xen_cpu_ipi_init(i); + + /* Set the xen pv ipi ops to replace the native ones */ + if (xen_hvm_domain()) + apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored; +} + +/* We need to setup IPIs before APs are started */ +SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL); +#endif /* SMP */ Property changes on: trunk/sys/x86/xen/xen_apic.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/xen/xen_intr.c =================================================================== --- trunk/sys/x86/xen/xen_intr.c 2020-02-08 19:29:01 UTC (rev 12309) +++ trunk/sys/x86/xen/xen_intr.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -2,7 +2,7 @@ /****************************************************************************** * xen_intr.c * - * Xen event and interrupt services for x86 PV and HVM guests. + * Xen event and interrupt services for x86 HVM guests. * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation <xiaofeng.ling at intel.com> @@ -31,8 +31,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/xen/xen_intr.c 291647 2015-12-02 12:58:20Z royger $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_intr.c 342656 2018-12-31 22:09:08Z jhb $"); +#include "opt_ddb.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> @@ -49,22 +51,30 @@ #include <vm/pmap.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> +#include <x86/apicreg.h> #include <machine/smp.h> #include <machine/stdarg.h> #include <machine/xen/synch_bitops.h> #include <machine/xen/xen-os.h> -#include <machine/xen/xenvar.h> +#include <xen/xen-os.h> #include <xen/hypervisor.h> #include <xen/xen_intr.h> #include <xen/evtchn/evtchnvar.h> #include <dev/xen/xenpci/xenpcivar.h> +#include <dev/pci/pcivar.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services"); +static u_int first_evtchn_irq; + /** * Per-cpu event channel processing state. */ @@ -96,7 +106,7 @@ * Start the scan at port 0 by initializing the last scanned * location as the highest numbered event channel port. 
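The initializer just below this comment sets both "last processed" indices to LONG_BIT - 1, and the scan resumes at (last + 1) modulo the word width, so the very first pass starts at bit 0 and later passes pick up just after wherever the previous one stopped; the apparent intent is to avoid always servicing low-numbered ports first. A small sketch of that wrap-around, using a locally defined word width rather than the system LONG_BIT:

#include <limits.h>
#include <stdio.h>

#define WORD_BITS	(sizeof(long) * CHAR_BIT)

static unsigned int
next_start(unsigned int last)
{
	return ((last + 1) % WORD_BITS);
}

int
main(void)
{
	printf("initial last=%zu -> first scan starts at bit %u\n",
	    WORD_BITS - 1, next_start(WORD_BITS - 1));
	printf("after finishing at bit 5 -> next scan starts at bit %u\n",
	    next_start(5));
	return (0);
}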
*/ -DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = { +static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = { .last_processed_l1i = LONG_BIT - 1, .last_processed_l2i = LONG_BIT - 1 }; @@ -103,8 +113,12 @@ DPCPU_DECLARE(struct vcpu_info *, vcpu_info); -#define is_valid_evtchn(x) ((x) != 0) +#define XEN_EEXIST 17 /* Xen "already exists" error */ +#define XEN_ALLOCATE_VECTOR 0 /* Allocate a vector for this event channel */ +#define XEN_INVALID_EVTCHN 0 /* Invalid event channel */ +#define is_valid_evtchn(x) ((x) != XEN_INVALID_EVTCHN) + struct xenisrc { struct intsrc xi_intsrc; enum evtchn_type xi_type; @@ -113,13 +127,13 @@ evtchn_port_t xi_port; int xi_pirq; int xi_virq; + void *xi_cookie; u_int xi_close:1; /* close on unbind? */ - u_int xi_needs_eoi:1; - u_int xi_shared:1; /* Shared with other domains. */ + u_int xi_activehi:1; + u_int xi_edgetrigger:1; + u_int xi_masked:1; }; -#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) - static void xen_intr_suspend(struct pic *); static void xen_intr_resume(struct pic *, bool suspend_cancelled); static void xen_intr_enable_source(struct intsrc *isrc); @@ -137,6 +151,9 @@ static void xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_pirq_eoi_source(struct intsrc *isrc); static void xen_intr_pirq_enable_intr(struct intsrc *isrc); +static void xen_intr_pirq_disable_intr(struct intsrc *isrc); +static int xen_intr_pirq_config_intr(struct intsrc *isrc, + enum intr_trigger trig, enum intr_polarity pol); /** * PIC interface for all event channel port types except physical IRQs. @@ -160,22 +177,25 @@ * physical interrupt sources. */ struct pic xen_intr_pirq_pic = { +#ifdef __amd64__ + .pic_register_sources = xenpv_register_pirqs, +#endif .pic_enable_source = xen_intr_pirq_enable_source, .pic_disable_source = xen_intr_pirq_disable_source, .pic_eoi_source = xen_intr_pirq_eoi_source, .pic_enable_intr = xen_intr_pirq_enable_intr, - .pic_disable_intr = xen_intr_disable_intr, + .pic_disable_intr = xen_intr_pirq_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, - .pic_suspend = xen_intr_suspend, - .pic_resume = xen_intr_resume, - .pic_config_intr = xen_intr_config_intr, + .pic_config_intr = xen_intr_pirq_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; -static struct mtx xen_intr_isrc_lock; -static int xen_intr_isrc_count; -static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS]; +static struct mtx xen_intr_isrc_lock; +static u_int xen_intr_auto_vector_count; +static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS]; +static u_long *xen_intr_pirq_eoi_map; +static boolean_t xen_intr_pirq_eoi_map_enabled; /*------------------------- Private Functions --------------------------------*/ /** @@ -197,7 +217,7 @@ struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); - clear_bit(port, pcpu->evtchn_enabled); + xen_clear_bit(port, pcpu->evtchn_enabled); } /** @@ -219,7 +239,7 @@ struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); - set_bit(port, pcpu->evtchn_enabled); + xen_set_bit(port, pcpu->evtchn_enabled); } /** @@ -257,11 +277,11 @@ KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held")); - for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx ++) { + for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) { struct xenisrc *isrc; u_int vector; - vector = FIRST_EVTCHN_INT + isrc_idx; + vector = first_evtchn_irq + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if 
(isrc != NULL && isrc->xi_type == EVTCHN_TYPE_UNBOUND) { @@ -283,15 +303,14 @@ * object or NULL. */ static struct xenisrc * -xen_intr_alloc_isrc(enum evtchn_type type) +xen_intr_alloc_isrc(enum evtchn_type type, int vector) { static int warned; struct xenisrc *isrc; - int vector; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held")); - if (xen_intr_isrc_count > NR_EVENT_CHANNELS) { + if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) { if (!warned) { warned = 1; printf("xen_intr_alloc: Event channels exhausted.\n"); @@ -298,12 +317,19 @@ } return (NULL); } - vector = FIRST_EVTCHN_INT + xen_intr_isrc_count; - xen_intr_isrc_count++; + if (type != EVTCHN_TYPE_PIRQ) { + vector = first_evtchn_irq + xen_intr_auto_vector_count; + xen_intr_auto_vector_count++; + } + + KASSERT((intr_lookup_source(vector) == NULL), + ("Trying to use an already allocated vector")); + mtx_unlock(&xen_intr_isrc_lock); isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO); - isrc->xi_intsrc.is_pic = &xen_intr_pic; + isrc->xi_intsrc.is_pic = + (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic; isrc->xi_vector = vector; isrc->xi_type = type; intr_register_source(&isrc->xi_intsrc); @@ -345,6 +371,7 @@ isrc->xi_cpu = 0; isrc->xi_type = EVTCHN_TYPE_UNBOUND; isrc->xi_port = 0; + isrc->xi_cookie = NULL; mtx_unlock(&xen_intr_isrc_lock); return (0); } @@ -372,7 +399,7 @@ */ static int xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port, - enum evtchn_type type, device_t intr_owner, driver_filter_t filter, + enum evtchn_type type, const char *intr_owner, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { @@ -381,8 +408,8 @@ *isrcp = NULL; if (port_handlep == NULL) { - device_printf(intr_owner, - "xen_intr_bind_isrc: Bad event handle\n"); + printf("%s: xen_intr_bind_isrc: Bad event handle\n", + intr_owner); return (EINVAL); } @@ -389,7 +416,7 @@ mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_find_unused_isrc(type); if (isrc == NULL) { - isrc = xen_intr_alloc_isrc(type); + isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR); if (isrc == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (ENOSPC); @@ -399,17 +426,37 @@ xen_intr_port_to_isrc[local_port] = isrc; mtx_unlock(&xen_intr_isrc_lock); - error = intr_add_handler(device_get_nameunit(intr_owner), - isrc->xi_vector, filter, handler, arg, - flags|INTR_EXCL, port_handlep); + /* Assign the opaque handler (the event channel port) */ + *port_handlep = &isrc->xi_vector; + +#ifdef SMP + if (type == EVTCHN_TYPE_PORT) { + /* + * By default all interrupts are assigned to vCPU#0 + * unless specified otherwise, so shuffle them to balance + * the interrupt load. + */ + xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu()); + } +#endif + + if (filter == NULL && handler == NULL) { + /* + * No filter/handler provided, leave the event channel + * masked and without a valid handler, the caller is + * in charge of setting that up. 
+ */ + *isrcp = isrc; + return (0); + } + + error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags, + *port_handlep); if (error != 0) { - device_printf(intr_owner, - "xen_intr_bind_irq: intr_add_handler failed\n"); xen_intr_release_isrc(isrc); return (error); } *isrcp = isrc; - evtchn_unmask_port(local_port); return (0); } @@ -426,13 +473,17 @@ static struct xenisrc * xen_intr_isrc(xen_intr_handle_t handle) { - struct intr_handler *ih; + int vector; - ih = handle; - if (ih == NULL || ih->ih_event == NULL) + if (handle == NULL) return (NULL); - return (ih->ih_event->ie_source); + vector = *(int *)handle; + KASSERT(vector >= first_evtchn_irq && + vector < (first_evtchn_irq + xen_intr_auto_vector_count), + ("Xen interrupt vector is out of range")); + + return ((struct xenisrc *)intr_lookup_source(vector)); } /** @@ -451,6 +502,11 @@ xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh, u_int idx) { + + CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0])); + CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0])); + CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending)); + CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled)); return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx] & pcpu->evtchn_enabled[idx]); @@ -570,8 +626,10 @@ static int xen_intr_init(void *dummy __unused) { + shared_info_t *s = HYPERVISOR_shared_info; struct xen_intr_pcpu_data *pcpu; - int i; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + int i, rc; if (!xen_domain()) return (0); @@ -579,25 +637,65 @@ mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF); /* - * Register interrupt count manually as we aren't - * guaranteed to see a call to xen_intr_assign_cpu() - * before our first interrupt. Also set the per-cpu - * mask of CPU#0 to enable all, since by default - * all event channels are bound to CPU#0. + * Set the per-cpu mask of CPU#0 to enable all, since by default all + * event channels are bound to CPU#0. */ CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, - sizeof(pcpu->evtchn_enabled)); - xen_intr_intrcnt_add(i); + sizeof(pcpu->evtchn_enabled)); } + for (i = 0; i < nitems(s->evtchn_mask); i++) + atomic_store_rel_long(&s->evtchn_mask[i], ~0); + + /* Try to register PIRQ EOI map */ + xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO); + eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map)); + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + if (rc != 0 && bootverbose) + printf("Xen interrupts: unable to register PIRQ EOI map\n"); + else + xen_intr_pirq_eoi_map_enabled = true; + intr_register_pic(&xen_intr_pic); + if (xen_pv_domain() && xen_initial_domain()) + intr_register_pic(&xen_intr_pirq_pic); + if (bootverbose) + printf("Xen interrupt system initialized\n"); + return (0); } -SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intr_init, NULL); +SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); +static void +xen_intrcnt_init(void *dummy __unused) +{ + unsigned int i; + + if (!xen_domain()) + return; + + /* + * Register interrupt count manually as we aren't guaranteed to see a + * call to xen_intr_assign_cpu() before our first interrupt. 
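A note on xen_intr_alloc_irqs(), which appears a little further below: it reserves NR_EVENT_CHANNELS vector numbers starting at num_io_irqs and guards the addition with a "base > UINT_MAX - count" test, since the sum itself could wrap. A standalone sketch of that overflow-check idiom, with illustrative names and counts:

#include <limits.h>
#include <stdio.h>

/* Reserve a block of IDs, refusing if the addition would overflow. */
static int
reserve_block(unsigned int *next_id, unsigned int count, unsigned int *first)
{
	if (*next_id > UINT_MAX - count)
		return (-1);
	*first = *next_id;
	*next_id += count;
	return (0);
}

int
main(void)
{
	unsigned int next_id = 24;	/* IDs already handed out */
	unsigned int first;

	if (reserve_block(&next_id, 4096, &first) == 0)
		printf("reserved 4096 ids at %u; next free id is %u\n",
		    first, next_id);
	return (0);
}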
+ */ + CPU_FOREACH(i) + xen_intr_intrcnt_add(i); +} +SYSINIT(xen_intrcnt_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intrcnt_init, NULL); + +void +xen_intr_alloc_irqs(void) +{ + + if (num_io_irqs > UINT_MAX - NR_EVENT_CHANNELS) + panic("IRQ allocation overflow (num_msi_irqs too high?)"); + first_evtchn_irq = num_io_irqs; + num_io_irqs += NR_EVENT_CHANNELS; +} + /*--------------------------- Common PIC Functions ---------------------------*/ /** * Prepare this PIC for system suspension. @@ -685,8 +783,8 @@ struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); - memset(pcpu->evtchn_enabled, - i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); + memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, + sizeof(pcpu->evtchn_enabled)); } /* Mask all event channels. */ @@ -697,10 +795,10 @@ memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc)); /* Free unused isrcs and rebind VIRQs and IPIs */ - for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx++) { + for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) { u_int vector; - vector = FIRST_EVTCHN_INT + isrc_idx; + vector = first_evtchn_irq + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL) { isrc->xi_port = 0; @@ -712,7 +810,6 @@ xen_rebind_virq(isrc); break; default: - isrc->xi_cpu = 0; break; } } @@ -798,16 +895,13 @@ struct evtchn_bind_vcpu bind_vcpu; struct xenisrc *isrc; u_int to_cpu, vcpu_id; - int error; + int error, masked; -#ifdef XENHVM if (xen_vector_callback_enabled == 0) return (EOPNOTSUPP); -#endif to_cpu = apic_cpuid(apic_id); vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id; - xen_intr_intrcnt_add(to_cpu); mtx_lock(&xen_intr_isrc_lock); isrc = (struct xenisrc *)base_isrc; @@ -816,6 +910,11 @@ return (EINVAL); } + /* + * Mask the event channel while binding it to prevent interrupt + * delivery with an inconsistent state in isrc->xi_cpu. + */ + masked = evtchn_test_and_set_mask(isrc->xi_port); if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) || (isrc->xi_type == EVTCHN_TYPE_IPI)) { /* @@ -826,18 +925,12 @@ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); - mtx_unlock(&xen_intr_isrc_lock); - return (0); + goto out; } bind_vcpu.port = isrc->xi_port; bind_vcpu.vcpu = vcpu_id; - /* - * Allow interrupts to be fielded on the new VCPU before - * we ask the hypervisor to deliver them there. - */ - evtchn_cpu_unmask_port(to_cpu, isrc->xi_port); error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu); if (isrc->xi_cpu != to_cpu) { if (error == 0) { @@ -844,11 +937,13 @@ /* Commit to new binding by removing the old one. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; - } else { - /* Roll-back to previous binding. */ - evtchn_cpu_mask_port(to_cpu, isrc->xi_port); + evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); } } + +out: + if (masked == 0) + evtchn_unmask_port(isrc->xi_port); mtx_unlock(&xen_intr_isrc_lock); return (0); #else @@ -865,8 +960,21 @@ * acknowledgements. */ static void -xen_intr_disable_source(struct intsrc *isrc, int eoi) +xen_intr_disable_source(struct intsrc *base_isrc, int eoi) { + struct xenisrc *isrc; + + isrc = (struct xenisrc *)base_isrc; + + /* + * NB: checking if the event channel is already masked is + * needed because the event channel user-space device + * masks event channels on it's filter as part of it's + * normal operation, and those shouldn't be automatically + * unmasked by the generic interrupt code. 
The event channel + * device will unmask them when needed. + */ + isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port); } /* @@ -875,8 +983,14 @@ * \param isrc The interrupt source to unmask (if necessary). */ static void -xen_intr_enable_source(struct intsrc *isrc) +xen_intr_enable_source(struct intsrc *base_isrc) { + struct xenisrc *isrc; + + isrc = (struct xenisrc *)base_isrc; + + if (isrc->xi_masked == 0) + evtchn_unmask_port(isrc->xi_port); } /* @@ -885,7 +999,7 @@ * \param isrc The interrupt source to EOI. */ static void -xen_intr_eoi_source(struct intsrc *isrc) +xen_intr_eoi_source(struct intsrc *base_isrc) { } @@ -916,7 +1030,11 @@ struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; - evtchn_mask_port(isrc->xi_port); + + if (isrc->xi_edgetrigger == 0) + evtchn_mask_port(isrc->xi_port); + if (eoi == PIC_EOI) + xen_intr_pirq_eoi_source(base_isrc); } /* @@ -930,7 +1048,9 @@ struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; - evtchn_unmask_port(isrc->xi_port); + + if (isrc->xi_edgetrigger == 0) + evtchn_unmask_port(isrc->xi_port); } /* @@ -942,13 +1062,17 @@ xen_intr_pirq_eoi_source(struct intsrc *base_isrc) { struct xenisrc *isrc; + int error; - /* XXX Use shared page of flags for this. */ isrc = (struct xenisrc *)base_isrc; - if (isrc->xi_needs_eoi != 0) { + + if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) { struct physdev_eoi eoi = { .irq = isrc->xi_pirq }; - (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + if (error != 0) + panic("Unable to EOI PIRQ#%d: %d\n", + isrc->xi_pirq, error); } } @@ -958,10 +1082,118 @@ * \param isrc The interrupt source to enable. */ static void -xen_intr_pirq_enable_intr(struct intsrc *isrc) +xen_intr_pirq_enable_intr(struct intsrc *base_isrc) { + struct xenisrc *isrc; + struct evtchn_bind_pirq bind_pirq; + struct physdev_irq_status_query irq_status; + int error; + + isrc = (struct xenisrc *)base_isrc; + + if (!xen_intr_pirq_eoi_map_enabled) { + irq_status.irq = isrc->xi_pirq; + error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, + &irq_status); + if (error) + panic("unable to get status of IRQ#%d", isrc->xi_pirq); + + if (irq_status.flags & XENIRQSTAT_needs_eoi) { + /* + * Since the dynamic PIRQ EOI map is not available + * mark the PIRQ as needing EOI unconditionally. + */ + xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map); + } + } + + bind_pirq.pirq = isrc->xi_pirq; + bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE; + error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); + if (error) + panic("unable to bind IRQ#%d", isrc->xi_pirq); + + isrc->xi_port = bind_pirq.port; + + mtx_lock(&xen_intr_isrc_lock); + KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL), + ("trying to override an already setup event channel port")); + xen_intr_port_to_isrc[bind_pirq.port] = isrc; + mtx_unlock(&xen_intr_isrc_lock); + + evtchn_unmask_port(isrc->xi_port); } +/* + * Disable an interrupt source. + * + * \param isrc The interrupt source to disable. 
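The dynamic PIRQ EOI handling above revolves around a single shared bitmap: when PHYSDEVOP_pirq_eoi_gmfn_v2 is accepted at initialization, the hypervisor keeps xen_intr_pirq_eoi_map up to date with the set of PIRQs that currently require an explicit EOI, and the EOI path only issues the hypercall when the corresponding bit is set. Isolated from the surrounding driver state, the check is (sketch only, error handling omitted):

    /* Consult the shared EOI bitmap before issuing an explicit EOI. */
    if (xen_test_bit(pirq, xen_intr_pirq_eoi_map)) {
            struct physdev_eoi eoi = { .irq = pirq };

            HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
    }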
+ */ +static void +xen_intr_pirq_disable_intr(struct intsrc *base_isrc) +{ + struct xenisrc *isrc; + struct evtchn_close close; + int error; + + isrc = (struct xenisrc *)base_isrc; + + evtchn_mask_port(isrc->xi_port); + + close.port = isrc->xi_port; + error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); + if (error) + panic("unable to close event channel %d IRQ#%d", + isrc->xi_port, isrc->xi_pirq); + + mtx_lock(&xen_intr_isrc_lock); + xen_intr_port_to_isrc[isrc->xi_port] = NULL; + mtx_unlock(&xen_intr_isrc_lock); + + isrc->xi_port = 0; +} + +/** + * Perform configuration of an interrupt source. + * + * \param isrc The interrupt source to configure. + * \param trig Edge or level. + * \param pol Active high or low. + * + * \returns 0 if no events are pending, otherwise non-zero. + */ +static int +xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig, + enum intr_polarity pol) +{ + struct xenisrc *isrc = (struct xenisrc *)base_isrc; + struct physdev_setup_gsi setup_gsi; + int error; + + KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), + ("%s: Conforming trigger or polarity\n", __func__)); + + setup_gsi.gsi = isrc->xi_pirq; + setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1; + setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1; + + error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (error == -XEN_EEXIST) { + if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) || + (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH))) + panic("unable to reconfigure interrupt IRQ#%d", + isrc->xi_pirq); + error = 0; + } + if (error) + panic("unable to configure IRQ#%d\n", isrc->xi_pirq); + + isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; + isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; + + return (0); +} + /*--------------------------- Public Functions -------------------------------*/ /*------- API comments for these methods can be found in xen/xenintr.h -------*/ int @@ -972,8 +1204,9 @@ struct xenisrc *isrc; int error; - error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, dev, - filter, handler, arg, flags, port_handlep); + error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, + device_get_nameunit(dev), filter, handler, arg, flags, + port_handlep); if (error != 0) return (error); @@ -1007,8 +1240,8 @@ } error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT, - dev, filter, handler, arg, flags, - port_handlep); + device_get_nameunit(dev), filter, handler, arg, flags, + port_handlep); if (error != 0) { evtchn_close_t close = { .port = alloc_unbound.port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) @@ -1042,8 +1275,8 @@ } error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port, - EVTCHN_TYPE_PORT, dev, filter, handler, - arg, flags, port_handlep); + EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg, + flags, port_handlep); if (error) { evtchn_close_t close = { .port = bind_interdomain.local_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) @@ -1069,9 +1302,6 @@ struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id }; int error; - /* Ensure the target CPU is ready to handle evtchn interrupts. 
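The public bind functions above all funnel into xen_intr_bind_isrc() and hand the caller an opaque xen_intr_handle_t, which is later used to name and to tear down the binding. A hedged usage sketch from a driver's perspective (my_filter, my_attach, sc and local_port are hypothetical, and the exact prototype of xen_intr_bind_local_port should be taken from xen/xenintr.h):

    static int
    my_filter(void *arg)
    {
            /* Runs in interrupt filter context; acknowledge the event here. */
            return (FILTER_HANDLED);
    }

    static int
    my_attach(device_t dev, evtchn_port_t local_port, void *sc)
    {
            xen_intr_handle_t handle;
            int error;

            error = xen_intr_bind_local_port(dev, local_port, my_filter,
                NULL, sc, INTR_TYPE_MISC | INTR_MPSAFE, &handle);
            if (error != 0)
                    return (error);
            xen_intr_describe(handle, "%s", device_get_nameunit(dev));
            /* ... on detach: */
            xen_intr_unbind(&handle);
            return (0);
    }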
*/ - xen_intr_intrcnt_add(cpu); - isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) { @@ -1082,8 +1312,9 @@ return (-error); } - error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, dev, - filter, handler, arg, flags, port_handlep); + error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, + device_get_nameunit(dev), filter, handler, arg, flags, + port_handlep); #ifdef SMP if (error == 0) @@ -1122,19 +1353,17 @@ } int -xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu, - driver_filter_t filter, enum intr_type flags, - xen_intr_handle_t *port_handlep) +xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter, + enum intr_type flags, xen_intr_handle_t *port_handlep) { #ifdef SMP int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; struct xenisrc *isrc; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; + /* Same size as the one used by intr_handler->ih_name. */ + char name[MAXCOMLEN + 1]; int error; - /* Ensure the target CPU is ready to handle evtchn interrupts. */ - xen_intr_intrcnt_add(cpu); - isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) { @@ -1145,12 +1374,10 @@ return (-error); } + snprintf(name, sizeof(name), "cpu%u", cpu); + error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI, - dev, filter, NULL, NULL, flags, - port_handlep); - if (error == 0) - error = intr_event_bind(isrc->xi_intsrc.is_event, cpu); - + name, filter, NULL, NULL, flags, port_handlep); if (error != 0) { evtchn_close_t close = { .port = bind_ipi.port }; @@ -1182,6 +1409,101 @@ } int +xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol) +{ + struct physdev_map_pirq map_pirq; + struct xenisrc *isrc; + int error; + + if (vector == 0) + return (EINVAL); + + if (bootverbose) + printf("xen: register IRQ#%d\n", vector); + + map_pirq.domid = DOMID_SELF; + map_pirq.type = MAP_PIRQ_TYPE_GSI; + map_pirq.index = vector; + map_pirq.pirq = vector; + + error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq); + if (error) { + printf("xen: unable to map IRQ#%d\n", vector); + return (error); + } + + mtx_lock(&xen_intr_isrc_lock); + isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector); + mtx_unlock(&xen_intr_isrc_lock); + KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt")); + isrc->xi_pirq = vector; + isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; + isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; + + return (0); +} + +int +xen_register_msi(device_t dev, int vector, int count) +{ + struct physdev_map_pirq msi_irq; + struct xenisrc *isrc; + int ret; + + memset(&msi_irq, 0, sizeof(msi_irq)); + msi_irq.domid = DOMID_SELF; + msi_irq.type = count == 1 ? 
+ MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI; + msi_irq.index = -1; + msi_irq.pirq = -1; + msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16); + msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev); + msi_irq.entry_nr = count; + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq); + if (ret != 0) + return (ret); + if (count != msi_irq.entry_nr) { + panic("unable to setup all requested MSI vectors " + "(expected %d got %d)", count, msi_irq.entry_nr); + } + + mtx_lock(&xen_intr_isrc_lock); + for (int i = 0; i < count; i++) { + isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i); + KASSERT(isrc != NULL, + ("xen: unable to allocate isrc for interrupt")); + isrc->xi_pirq = msi_irq.pirq + i; + /* MSI interrupts are always edge triggered */ + isrc->xi_edgetrigger = 1; + } + mtx_unlock(&xen_intr_isrc_lock); + + return (0); +} + +int +xen_release_msi(int vector) +{ + struct physdev_unmap_pirq unmap; + struct xenisrc *isrc; + int ret; + + isrc = (struct xenisrc *)intr_lookup_source(vector); + if (isrc == NULL) + return (ENXIO); + + unmap.pirq = isrc->xi_pirq; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap); + if (ret != 0) + return (ret); + + xen_intr_release_isrc(isrc); + + return (0); +} + +int xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...) { char descr[MAXCOMLEN + 1]; @@ -1195,22 +1517,24 @@ va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); - return (intr_describe(isrc->xi_vector, port_handle, descr)); + return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr)); } void xen_intr_unbind(xen_intr_handle_t *port_handlep) { - struct intr_handler *handler; struct xenisrc *isrc; - handler = *port_handlep; + KASSERT(port_handlep != NULL, + ("NULL xen_intr_handle_t passed to xen_intr_unbind")); + + isrc = xen_intr_isrc(*port_handlep); *port_handlep = NULL; - isrc = xen_intr_isrc(handler); if (isrc == NULL) return; - intr_remove_handler(handler); + if (isrc->xi_cookie != NULL) + intr_remove_handler(isrc->xi_cookie); xen_intr_release_isrc(isrc); } @@ -1240,3 +1564,96 @@ return (isrc->xi_port); } + +int +xen_intr_add_handler(const char *name, driver_filter_t filter, + driver_intr_t handler, void *arg, enum intr_type flags, + xen_intr_handle_t handle) +{ + struct xenisrc *isrc; + int error; + + isrc = xen_intr_isrc(handle); + if (isrc == NULL || isrc->xi_cookie != NULL) + return (EINVAL); + + error = intr_add_handler(name, isrc->xi_vector,filter, handler, arg, + flags|INTR_EXCL, &isrc->xi_cookie); + if (error != 0) { + printf( + "%s: xen_intr_add_handler: intr_add_handler failed: %d\n", + name, error); + } + + return (error); +} + +#ifdef DDB +static const char * +xen_intr_print_type(enum evtchn_type type) +{ + static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = { + [EVTCHN_TYPE_UNBOUND] = "UNBOUND", + [EVTCHN_TYPE_PIRQ] = "PIRQ", + [EVTCHN_TYPE_VIRQ] = "VIRQ", + [EVTCHN_TYPE_IPI] = "IPI", + [EVTCHN_TYPE_PORT] = "PORT", + }; + + if (type >= EVTCHN_TYPE_COUNT) + return ("UNKNOWN"); + + return (evtchn_type_to_string[type]); +} + +static void +xen_intr_dump_port(struct xenisrc *isrc) +{ + struct xen_intr_pcpu_data *pcpu; + shared_info_t *s = HYPERVISOR_shared_info; + int i; + + db_printf("Port %d Type: %s\n", + isrc->xi_port, xen_intr_print_type(isrc->xi_type)); + if (isrc->xi_type == EVTCHN_TYPE_PIRQ) { + db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d " + "NeedsEOI: %d\n", + isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger, + !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)); + } + if 
(isrc->xi_type == EVTCHN_TYPE_VIRQ) + db_printf("\tVirq: %d\n", isrc->xi_virq); + + db_printf("\tMasked: %d Pending: %d\n", + !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]), + !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0])); + + db_printf("\tPer-CPU Masks: "); + CPU_FOREACH(i) { + pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); + db_printf("cpu#%d: %d ", i, + !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled)); + } + db_printf("\n"); +} + +DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn) +{ + int i; + + if (!xen_domain()) { + db_printf("Only available on Xen guests\n"); + return; + } + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + struct xenisrc *isrc; + + isrc = xen_intr_port_to_isrc[i]; + if (isrc == NULL) + continue; + + xen_intr_dump_port(isrc); + } +} +#endif /* DDB */ Added: trunk/sys/x86/xen/xen_msi.c =================================================================== --- trunk/sys/x86/xen/xen_msi.c (rev 0) +++ trunk/sys/x86/xen/xen_msi.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,134 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_msi.c 344912 2019-03-08 01:04:19Z jhb $"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <x86/apicreg.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <x86/apicvar.h> +#include <machine/specialreg.h> +#include <dev/pci/pcivar.h> + +#include <xen/xen-os.h> +#include <xen/xen_intr.h> +#include <xen/xen_msi.h> + +static struct mtx msi_lock; +static u_int msi_last_irq; + +void +xen_msi_init(void) +{ + + MPASS(num_io_irqs > 0); + first_msi_irq = min(MINIMUM_MSI_INT, num_io_irqs); + if (num_msi_irqs > UINT_MAX - first_msi_irq) + panic("num_msi_irqs too high"); + num_io_irqs = first_msi_irq + num_msi_irqs; + + mtx_init(&msi_lock, "msi", NULL, MTX_DEF); +} + +/* + * Try to allocate 'count' interrupt sources with contiguous IDT values. 
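To make the IRQ-range bookkeeping in xen_msi_init() above concrete with purely hypothetical numbers: if num_io_irqs is 256 when xen_msi_init() runs and MINIMUM_MSI_INT is larger than that, first_msi_irq becomes 256; with num_msi_irqs set to 512, the global range grows to num_io_irqs = 256 + 512 = 768, and the allocator below then hands out 256, 257, ... as contiguous MSI IRQ numbers until the 512-entry budget is exhausted.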
+ */ +int +xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs) +{ + int i, ret = 0; + + mtx_lock(&msi_lock); + + /* If we would exceed the max, give up. */ + if (msi_last_irq + count > num_msi_irqs) { + mtx_unlock(&msi_lock); + return (ENXIO); + } + + /* Allocate MSI vectors */ + for (i = 0; i < count; i++) + irqs[i] = first_msi_irq + msi_last_irq++; + + mtx_unlock(&msi_lock); + + ret = xen_register_msi(dev, irqs[0], count); + if (ret != 0) + return (ret); + + for (i = 0; i < count; i++) + nexus_add_irq(irqs[i]); + + return (0); +} + +int +xen_msi_release(int *irqs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = xen_release_msi(irqs[i]); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +xen_msi_map(int irq, uint64_t *addr, uint32_t *data) +{ + + return (0); +} + +int +xen_msix_alloc(device_t dev, int *irq) +{ + + return (ENXIO); +} + +int +xen_msix_release(int irq) +{ + + return (ENOENT); +} Property changes on: trunk/sys/x86/xen/xen_msi.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/xen/xen_nexus.c =================================================================== --- trunk/sys/x86/xen/xen_nexus.c (rev 0) +++ trunk/sys/x86/xen/xen_nexus.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,168 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2013 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_nexus.c 340016 2018-11-01 18:34:26Z jhb $"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <contrib/dev/acpica/include/acpi.h> + +#include <dev/acpica/acpivar.h> + +#include <x86/init.h> +#include <machine/nexusvar.h> +#include <machine/intr_machdep.h> + +#include <xen/xen-os.h> +#include <xen/xen_intr.h> +#include <xen/xen_msi.h> + +#include "pcib_if.h" + +/* + * Xen nexus(4) driver. 
+ */ +static int +nexus_xen_probe(device_t dev) +{ + + if (!xen_pv_domain()) + return (ENXIO); + + return (BUS_PROBE_SPECIFIC); +} + +static int +nexus_xen_attach(device_t dev) +{ + int error; + device_t acpi_dev = NULL; + + nexus_init_resources(); + bus_generic_probe(dev); + + if (xen_initial_domain()) { + /* Disable some ACPI devices that are not usable by Dom0 */ + acpi_cpu_disabled = true; + acpi_hpet_disabled = true; + acpi_timer_disabled = true; + + acpi_dev = BUS_ADD_CHILD(dev, 10, "acpi", 0); + if (acpi_dev == NULL) + panic("Unable to add ACPI bus to Xen Dom0"); + } + + error = bus_generic_attach(dev); + if (xen_initial_domain() && (error == 0)) + acpi_install_wakeup_handler(device_get_softc(acpi_dev)); + + return (error); +} + +static int +nexus_xen_config_intr(device_t dev, int irq, enum intr_trigger trig, + enum intr_polarity pol) +{ + int ret; + + /* + * ISA and PCI intline IRQs are not preregistered on Xen, so + * intercept calls to configure those and register them on the fly. + */ + if ((irq < first_msi_irq) && (intr_lookup_source(irq) == NULL)) { + ret = xen_register_pirq(irq, trig, pol); + if (ret != 0) + return (ret); + nexus_add_irq(irq); + } + return (intr_config_intr(irq, trig, pol)); +} + +static int +nexus_xen_alloc_msix(device_t pcib, device_t dev, int *irq) +{ + + return (xen_msix_alloc(dev, irq)); +} + +static int +nexus_xen_release_msix(device_t pcib, device_t dev, int irq) +{ + + return (xen_msix_release(irq)); +} + +static int +nexus_xen_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) +{ + + return (xen_msi_alloc(dev, count, maxcount, irqs)); +} + +static int +nexus_xen_release_msi(device_t pcib, device_t dev, int count, int *irqs) +{ + + return (xen_msi_release(irqs, count)); +} + +static int +nexus_xen_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data) +{ + + return (xen_msi_map(irq, addr, data)); +} + +static device_method_t nexus_xen_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, nexus_xen_probe), + DEVMETHOD(device_attach, nexus_xen_attach), + + /* INTR */ + DEVMETHOD(bus_config_intr, nexus_xen_config_intr), + + /* MSI */ + DEVMETHOD(pcib_alloc_msi, nexus_xen_alloc_msi), + DEVMETHOD(pcib_release_msi, nexus_xen_release_msi), + DEVMETHOD(pcib_alloc_msix, nexus_xen_alloc_msix), + DEVMETHOD(pcib_release_msix, nexus_xen_release_msix), + DEVMETHOD(pcib_map_msi, nexus_xen_map_msi), + + { 0, 0 } +}; + +DEFINE_CLASS_1(nexus, nexus_xen_driver, nexus_xen_methods, 1, nexus_driver); +static devclass_t nexus_devclass; + +DRIVER_MODULE(nexus_xen, root, nexus_xen_driver, nexus_devclass, 0, 0); Property changes on: trunk/sys/x86/xen/xen_nexus.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/xen/xen_pci_bus.c =================================================================== --- trunk/sys/x86/xen/xen_pci_bus.c (rev 0) +++ trunk/sys/x86/xen/xen_pci_bus.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,91 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_pci_bus.c 275649 2014-12-09 18:03:25Z royger $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> + +#include <sys/pciio.h> +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> +#include <dev/pci/pci_private.h> + +#include <xen/xen-os.h> +#include <xen/hypervisor.h> +#include <xen/xen_pci.h> + +#include "pcib_if.h" +#include "pci_if.h" + +void +xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address, + uint16_t data) +{ + struct pci_devinfo *dinfo = device_get_ivars(child); + struct pcicfg_msi *msi = &dinfo->cfg.msi; + + /* Enable MSI in the control register. */ + msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE; + pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL, + msi->msi_ctrl, 2); +} + +void +xen_pci_disable_msi_method(device_t dev, device_t child) +{ + struct pci_devinfo *dinfo = device_get_ivars(child); + struct pcicfg_msi *msi = &dinfo->cfg.msi; + + msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE; + pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL, + msi->msi_ctrl, 2); +} + +void +xen_pci_child_added_method(device_t dev, device_t child) +{ + struct pci_devinfo *dinfo; + struct physdev_pci_device_add add_pci; + int error; + + dinfo = device_get_ivars(child); + KASSERT((dinfo != NULL), + ("xen_pci_add_child_method called with NULL dinfo")); + + bzero(&add_pci, sizeof(add_pci)); + add_pci.seg = dinfo->cfg.domain; + add_pci.bus = dinfo->cfg.bus; + add_pci.devfn = (dinfo->cfg.slot << 3) | dinfo->cfg.func; + error = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add_pci); + if (error) + panic("unable to add device bus %u devfn %u error: %d\n", + add_pci.bus, add_pci.devfn, error); +} Property changes on: trunk/sys/x86/xen/xen_pci_bus.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/xen/xenpv.c =================================================================== --- trunk/sys/x86/xen/xenpv.c (rev 0) +++ trunk/sys/x86/xen/xenpv.c 2020-02-08 19:32:41 UTC (rev 12310) @@ -0,0 +1,203 @@ +/* $MidnightBSD$ */ +/* + * Copyright (c) 2014 Roger Pau Monn? 
<roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xenpv.c 331017 2018-03-15 19:08:33Z kevans $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/limits.h> +#include <sys/vmmeter.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_param.h> +#include <vm/vm_phys.h> + +#include <xen/xen-os.h> +#include <xen/gnttab.h> + +#include "xenmem_if.h" + +/* + * Allocate unused physical memory above 4GB in order to map memory + * from foreign domains. We use memory starting at 4GB in order to + * prevent clashes with MMIO/ACPI regions. + * + * Since this is not possible on i386 just use any available memory + * chunk and hope we don't clash with anything else. + */ +#ifdef __amd64__ +#define LOW_MEM_LIMIT 0x100000000ul +#else +#define LOW_MEM_LIMIT 0 +#endif + +static devclass_t xenpv_devclass; + +static void +xenpv_identify(driver_t *driver, device_t parent) +{ + if (!xen_domain()) + return; + + /* Make sure there's only one xenpv device. */ + if (devclass_get_device(xenpv_devclass, 0)) + return; + + /* + * The xenpv bus should be the last to attach in order + * to properly detect if an ISA bus has already been added. + */ + if (BUS_ADD_CHILD(parent, UINT_MAX, "xenpv", 0) == NULL) + panic("Unable to attach xenpv bus."); +} + +static int +xenpv_probe(device_t dev) +{ + + device_set_desc(dev, "Xen PV bus"); + return (BUS_PROBE_NOWILDCARD); +} + +static int +xenpv_attach(device_t dev) +{ + device_t child; + + /* + * Let our child drivers identify any child devices that they + * can find. Once that is done attach any devices that we + * found. 
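Taking the nexus methods, xen_msi.c and xen_pci_bus.c together, a PCI driver's MSI request reaches the hypervisor roughly along the following path (a sketch of the call chain, not a literal backtrace), with xen_pci_enable_msi_method() finally setting PCIM_MSICTRL_MSI_ENABLE once the vectors are programmed:

    pci_alloc_msi(dev, &count)
      -> PCIB_ALLOC_MSI(...)           /* resolved by the Xen nexus */
        -> nexus_xen_alloc_msi()
          -> xen_msi_alloc()           /* reserves contiguous IRQ numbers */
            -> xen_register_msi()      /* PHYSDEVOP_map_pirq hypercall */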
+ */ + bus_generic_probe(dev); + bus_generic_attach(dev); + + if (!devclass_get_device(devclass_find("isa"), 0)) { + child = BUS_ADD_CHILD(dev, 0, "isa", 0); + if (child == NULL) + panic("Failed to attach ISA bus."); + device_probe_and_attach(child); + } + + return (0); +} + +static struct resource * +xenpv_alloc_physmem(device_t dev, device_t child, int *res_id, size_t size) +{ + struct resource *res; + vm_paddr_t phys_addr; + int error; + + res = bus_alloc_resource(child, SYS_RES_MEMORY, res_id, LOW_MEM_LIMIT, + ~0, size, RF_ACTIVE); + if (res == NULL) + return (NULL); + + phys_addr = rman_get_start(res); + error = vm_phys_fictitious_reg_range(phys_addr, phys_addr + size, + VM_MEMATTR_DEFAULT); + if (error) { + bus_release_resource(child, SYS_RES_MEMORY, *res_id, res); + return (NULL); + } + + return (res); +} + +static int +xenpv_free_physmem(device_t dev, device_t child, int res_id, struct resource *res) +{ + vm_paddr_t phys_addr; + size_t size; + + phys_addr = rman_get_start(res); + size = rman_get_size(res); + + vm_phys_fictitious_unreg_range(phys_addr, phys_addr + size); + return (bus_release_resource(child, SYS_RES_MEMORY, res_id, res)); +} + +static device_method_t xenpv_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xenpv_identify), + DEVMETHOD(device_probe, xenpv_probe), + DEVMETHOD(device_attach, xenpv_attach), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + + /* Interface to allocate memory for foreign mappings */ + DEVMETHOD(xenmem_alloc, xenpv_alloc_physmem), + DEVMETHOD(xenmem_free, xenpv_free_physmem), + + DEVMETHOD_END +}; + +static driver_t xenpv_driver = { + "xenpv", + xenpv_methods, + 0, +}; + +DRIVER_MODULE(xenpv, nexus, xenpv_driver, xenpv_devclass, 0, 0); + +struct resource * +xenmem_alloc(device_t dev, int *res_id, size_t size) +{ + device_t parent; + + parent = device_get_parent(dev); + if (parent == NULL) + return (NULL); + return (XENMEM_ALLOC(parent, dev, res_id, size)); +} + +int +xenmem_free(device_t dev, int res_id, struct resource *res) +{ + device_t parent; + + parent = device_get_parent(dev); + if (parent == NULL) + return (ENXIO); + return (XENMEM_FREE(parent, dev, res_id, res)); +} Property changes on: trunk/sys/x86/xen/xenpv.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:33:28 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:33:28 -0500 (EST) Subject: [Midnightbsd-cvs] src [12311] trunk/sys/x86/include: sync with FreeBSD 11-stable Message-ID: <202002081933.018JXShK061961@stargazer.midnightbsd.org> Revision: 12311 http://svnweb.midnightbsd.org/src/?rev=12311 Author: laffer1 Date: 2020-02-08 14:33:27 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/x86/include/_align.h trunk/sys/x86/include/_inttypes.h trunk/sys/x86/include/_limits.h 
trunk/sys/x86/include/_stdint.h trunk/sys/x86/include/_types.h trunk/sys/x86/include/acpica_machdep.h trunk/sys/x86/include/apicreg.h trunk/sys/x86/include/apm_bios.h trunk/sys/x86/include/bus.h trunk/sys/x86/include/busdma_impl.h trunk/sys/x86/include/elf.h trunk/sys/x86/include/endian.h trunk/sys/x86/include/fdt.h trunk/sys/x86/include/float.h trunk/sys/x86/include/fpu.h trunk/sys/x86/include/frame.h trunk/sys/x86/include/legacyvar.h trunk/sys/x86/include/mca.h trunk/sys/x86/include/metadata.h trunk/sys/x86/include/mptable.h trunk/sys/x86/include/ofw_machdep.h trunk/sys/x86/include/pci_cfgreg.h trunk/sys/x86/include/psl.h trunk/sys/x86/include/ptrace.h trunk/sys/x86/include/reg.h trunk/sys/x86/include/segments.h trunk/sys/x86/include/setjmp.h trunk/sys/x86/include/sigframe.h trunk/sys/x86/include/signal.h trunk/sys/x86/include/specialreg.h trunk/sys/x86/include/stdarg.h trunk/sys/x86/include/sysarch.h trunk/sys/x86/include/trap.h trunk/sys/x86/include/ucontext.h trunk/sys/x86/include/vdso.h trunk/sys/x86/include/vmware.h Added Paths: ----------- trunk/sys/x86/include/apicvar.h trunk/sys/x86/include/cputypes.h trunk/sys/x86/include/dump.h trunk/sys/x86/include/ifunc.h trunk/sys/x86/include/init.h trunk/sys/x86/include/intr_machdep.h trunk/sys/x86/include/pvclock.h trunk/sys/x86/include/stack.h trunk/sys/x86/include/ucode.h trunk/sys/x86/include/x86_smp.h trunk/sys/x86/include/x86_var.h trunk/sys/x86/include/xen/ trunk/sys/x86/include/xen/xen-os.h Modified: trunk/sys/x86/include/_align.h =================================================================== --- trunk/sys/x86/include/_align.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/_align.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * from: @(#)param.h 5.8 (Berkeley) 6/28/91 - * $FreeBSD: stable/10/sys/x86/include/_align.h 215856 2010-11-26 10:59:20Z tijl $ + * $FreeBSD: stable/11/sys/x86/include/_align.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _X86_INCLUDE__ALIGN_H_ @@ -47,7 +47,7 @@ * for all data types (int, long, ...). The result is unsigned int * and must be cast to any desired pointer type. */ -#define _ALIGNBYTES (sizeof(register_t) - 1) -#define _ALIGN(p) (((uintptr_t)(p) + _ALIGNBYTES) & ~_ALIGNBYTES) +#define _ALIGNBYTES (sizeof(__register_t) - 1) +#define _ALIGN(p) (((__uintptr_t)(p) + _ALIGNBYTES) & ~_ALIGNBYTES) #endif /* !_X86_INCLUDE__ALIGN_H_ */ Modified: trunk/sys/x86/include/_inttypes.h =================================================================== --- trunk/sys/x86/include/_inttypes.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/_inttypes.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -28,7 +28,7 @@ * POSSIBILITY OF SUCH DAMAGE. * * From: $NetBSD: int_fmtio.h,v 1.2 2001/04/26 16:25:21 kleink Exp $ - * $FreeBSD: stable/10/sys/x86/include/_inttypes.h 217157 2011-01-08 18:09:48Z tijl $ + * $FreeBSD: stable/11/sys/x86/include/_inttypes.h 217157 2011-01-08 18:09:48Z tijl $ */ #ifndef _MACHINE_INTTYPES_H_ Modified: trunk/sys/x86/include/_limits.h =================================================================== --- trunk/sys/x86/include/_limits.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/_limits.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
* * @(#)limits.h 8.3 (Berkeley) 1/4/94 - * $FreeBSD: stable/10/sys/x86/include/_limits.h 235939 2012-05-24 21:44:46Z obrien $ + * $FreeBSD: stable/11/sys/x86/include/_limits.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE__LIMITS_H_ Modified: trunk/sys/x86/include/_stdint.h =================================================================== --- trunk/sys/x86/include/_stdint.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/_stdint.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -35,12 +35,14 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/_stdint.h 237517 2012-06-24 04:15:58Z andrew $ + * $FreeBSD: stable/11/sys/x86/include/_stdint.h 301030 2016-05-31 08:38:24Z ed $ */ #ifndef _MACHINE__STDINT_H_ #define _MACHINE__STDINT_H_ +#include <machine/_limits.h> + #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) #define INT8_C(c) (c) @@ -168,8 +170,8 @@ #define PTRDIFF_MAX INT64_MAX /* Limits of sig_atomic_t. */ -#define SIG_ATOMIC_MIN LONG_MIN -#define SIG_ATOMIC_MAX LONG_MAX +#define SIG_ATOMIC_MIN __LONG_MIN +#define SIG_ATOMIC_MAX __LONG_MAX /* Limit of size_t. */ #define SIZE_MAX UINT64_MAX Modified: trunk/sys/x86/include/_types.h =================================================================== --- trunk/sys/x86/include/_types.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/_types.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -34,7 +34,7 @@ * * From: @(#)ansi.h 8.2 (Berkeley) 1/4/94 * From: @(#)types.h 8.3 (Berkeley) 1/5/94 - * $FreeBSD: stable/10/sys/x86/include/_types.h 287139 2015-08-25 19:18:38Z jkim $ + * $FreeBSD: stable/11/sys/x86/include/_types.h 332135 2018-04-06 19:17:59Z kevans $ */ #ifndef _MACHINE__TYPES_H_ @@ -44,6 +44,8 @@ #error this file needs sys/cdefs.h as a prerequisite #endif +#include <machine/_limits.h> + #define __NO_STRICT_ALIGNMENT /* @@ -77,15 +79,19 @@ #ifdef __LP64__ typedef __int32_t __clock_t; /* clock()... */ typedef __int64_t __critical_t; +#ifndef _STANDALONE typedef double __double_t; typedef float __float_t; +#endif typedef __int64_t __intfptr_t; typedef __int64_t __intptr_t; #else typedef unsigned long __clock_t; typedef __int32_t __critical_t; +#ifndef _STANDALONE typedef long double __double_t; typedef long double __float_t; +#endif typedef __int32_t __intfptr_t; typedef __int32_t __intptr_t; #endif @@ -141,8 +147,6 @@ #endif typedef __uint32_t __vm_size_t; #endif -typedef __int64_t __vm_ooffset_t; -typedef __uint64_t __vm_pindex_t; typedef int ___wchar_t; #define __WCHAR_MIN __INT_MIN /* min value for a wchar_t */ Modified: trunk/sys/x86/include/acpica_machdep.h =================================================================== --- trunk/sys/x86/include/acpica_machdep.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/acpica_machdep.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/x86/include/acpica_machdep.h 259073 2013-12-07 18:23:29Z peter $ + * $FreeBSD: stable/11/sys/x86/include/acpica_machdep.h 298094 2016-04-16 03:44:50Z gjb $ */ /****************************************************************************** @@ -70,12 +70,20 @@ (Acq) = acpi_release_global_lock(&((GLptr)->GlobalLock)); \ } while (0) +enum intr_trigger; +enum intr_polarity; + void acpi_SetDefaultIntrModel(int model); void acpi_cpu_c1(void); +void acpi_cpu_idle_mwait(uint32_t mwait_hint); void *acpi_map_table(vm_paddr_t pa, const char *sig); void acpi_unmap_table(void *table); vm_paddr_t acpi_find_table(const char *sig); +void madt_parse_interrupt_values(void *entry, + enum intr_trigger *trig, enum intr_polarity *pol); +extern int madt_found_sci_override; + #endif /* _KERNEL */ #endif /* __ACPICA_MACHDEP_H__ */ Modified: trunk/sys/x86/include/apicreg.h =================================================================== --- trunk/sys/x86/include/apicreg.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/apicreg.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/apicreg.h 262141 2014-02-18 01:15:32Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/apicreg.h 323608 2017-09-15 09:00:45Z kib $ */ #ifndef _X86_APICREG_H_ @@ -194,6 +194,81 @@ typedef struct LAPIC lapic_t; +enum LAPIC_REGISTERS { + LAPIC_ID = 0x2, + LAPIC_VERSION = 0x3, + LAPIC_TPR = 0x8, + LAPIC_APR = 0x9, + LAPIC_PPR = 0xa, + LAPIC_EOI = 0xb, + LAPIC_LDR = 0xd, + LAPIC_DFR = 0xe, /* Not in x2APIC */ + LAPIC_SVR = 0xf, + LAPIC_ISR0 = 0x10, + LAPIC_ISR1 = 0x11, + LAPIC_ISR2 = 0x12, + LAPIC_ISR3 = 0x13, + LAPIC_ISR4 = 0x14, + LAPIC_ISR5 = 0x15, + LAPIC_ISR6 = 0x16, + LAPIC_ISR7 = 0x17, + LAPIC_TMR0 = 0x18, + LAPIC_TMR1 = 0x19, + LAPIC_TMR2 = 0x1a, + LAPIC_TMR3 = 0x1b, + LAPIC_TMR4 = 0x1c, + LAPIC_TMR5 = 0x1d, + LAPIC_TMR6 = 0x1e, + LAPIC_TMR7 = 0x1f, + LAPIC_IRR0 = 0x20, + LAPIC_IRR1 = 0x21, + LAPIC_IRR2 = 0x22, + LAPIC_IRR3 = 0x23, + LAPIC_IRR4 = 0x24, + LAPIC_IRR5 = 0x25, + LAPIC_IRR6 = 0x26, + LAPIC_IRR7 = 0x27, + LAPIC_ESR = 0x28, + LAPIC_LVT_CMCI = 0x2f, + LAPIC_ICR_LO = 0x30, + LAPIC_ICR_HI = 0x31, /* Not in x2APIC */ + LAPIC_LVT_TIMER = 0x32, + LAPIC_LVT_THERMAL = 0x33, + LAPIC_LVT_PCINT = 0x34, + LAPIC_LVT_LINT0 = 0x35, + LAPIC_LVT_LINT1 = 0x36, + LAPIC_LVT_ERROR = 0x37, + LAPIC_ICR_TIMER = 0x38, + LAPIC_CCR_TIMER = 0x39, + LAPIC_DCR_TIMER = 0x3e, + LAPIC_SELF_IPI = 0x3f, /* Only in x2APIC */ + LAPIC_EXT_FEATURES = 0x40, /* AMD */ + LAPIC_EXT_CTRL = 0x41, /* AMD */ + LAPIC_EXT_SEOI = 0x42, /* AMD */ + LAPIC_EXT_IER0 = 0x48, /* AMD */ + LAPIC_EXT_IER1 = 0x49, /* AMD */ + LAPIC_EXT_IER2 = 0x4a, /* AMD */ + LAPIC_EXT_IER3 = 0x4b, /* AMD */ + LAPIC_EXT_IER4 = 0x4c, /* AMD */ + LAPIC_EXT_IER5 = 0x4d, /* AMD */ + LAPIC_EXT_IER6 = 0x4e, /* AMD */ + LAPIC_EXT_IER7 = 0x4f, /* AMD */ + LAPIC_EXT_LVT0 = 0x50, /* AMD */ + LAPIC_EXT_LVT1 = 0x51, /* AMD */ + LAPIC_EXT_LVT2 = 0x52, /* AMD */ + LAPIC_EXT_LVT3 = 0x53, /* AMD */ +}; + +#define LAPIC_MEM_MUL 0x10 + +/* + * Although some registers are available on AMD processors only, + * it's not a big waste to reserve them on all platforms. + * However, we need to watch out for this space being assigned for + * non-APIC purposes in the future processor models. 
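The enum LAPIC_REGISTERS values above are register indices rather than byte offsets: in xAPIC (memory-mapped) mode each index is scaled by LAPIC_MEM_MUL (0x10) to obtain the MMIO offset, while in x2APIC mode the same index is added to the architectural MSR base 0x800. For example, LAPIC_EOI (0xb) is reached at MMIO offset 0xb0 or MSR 0x80b, and LAPIC_ICR_LO (0x30) at offset 0x300 or MSR 0x830, where the x2APIC ICR is a single 64-bit register and LAPIC_ICR_HI is unused, as its comment notes.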
+ */ +#define LAPIC_MEM_REGION ((LAPIC_EXT_LVT3 + 1) * LAPIC_MEM_MUL) + /****************************************************************************** * I/O APIC structure */ @@ -236,6 +311,7 @@ #define APIC_VER_MAXLVT 0x00ff0000 #define MAXLVTSHIFT 16 #define APIC_VER_EOI_SUPPRESSION 0x01000000 +#define APIC_VER_AMD_EXT_SPACE 0x80000000 /* fields in LDR */ #define APIC_LDR_RESERVED 0x00ffffff @@ -340,11 +416,12 @@ #define APIC_LVTT_VECTOR 0x000000ff #define APIC_LVTT_DS 0x00001000 #define APIC_LVTT_M 0x00010000 -#define APIC_LVTT_TM 0x00020000 +#define APIC_LVTT_TM 0x00060000 # define APIC_LVTT_TM_ONE_SHOT 0x00000000 # define APIC_LVTT_TM_PERIODIC 0x00020000 +# define APIC_LVTT_TM_TSCDLT 0x00040000 +# define APIC_LVTT_TM_RSRV 0x00060000 - /* APIC timer current count */ #define APIC_TIMER_MAX_COUNT 0xffffffff @@ -358,6 +435,13 @@ #define APIC_TDCR_128 0x0a #define APIC_TDCR_1 0x0b +/* Constants related to AMD Extended APIC Features Register */ +#define APIC_EXTF_ELVT_MASK 0x00ff0000 +#define APIC_EXTF_ELVT_SHIFT 16 +#define APIC_EXTF_EXTID_CAP 0x00000004 +#define APIC_EXTF_SEIO_CAP 0x00000002 +#define APIC_EXTF_IER_CAP 0x00000001 + /* LVT table indices */ #define APIC_LVT_LINT0 0 #define APIC_LVT_LINT1 1 @@ -368,6 +452,13 @@ #define APIC_LVT_CMCI 6 #define APIC_LVT_MAX APIC_LVT_CMCI +/* AMD extended LVT constants, seem to be assigned by fiat */ +#define APIC_ELVT_IBS 0 /* Instruction based sampling */ +#define APIC_ELVT_MCA 1 /* MCE thresholding */ +#define APIC_ELVT_DEI 2 /* Deferred error interrupt */ +#define APIC_ELVT_SBI 3 /* Sideband interface */ +#define APIC_ELVT_MAX APIC_ELVT_SBI + /****************************************************************************** * I/O APIC defines */ @@ -379,6 +470,8 @@ #define IOAPIC_WINDOW 0x10 #define IOAPIC_EOIR 0x40 +#define IOAPIC_WND_SIZE 0x50 + /* indexes into IO APIC */ #define IOAPIC_ID 0x00 #define IOAPIC_VER 0x01 Added: trunk/sys/x86/include/apicvar.h =================================================================== --- trunk/sys/x86/include/apicvar.h (rev 0) +++ trunk/sys/x86/include/apicvar.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,487 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: stable/11/sys/x86/include/apicvar.h 346817 2019-04-28 13:21:01Z dchagin $ + */ + +#ifndef _X86_APICVAR_H_ +#define _X86_APICVAR_H_ + +/* + * Local && I/O APIC variable definitions. + */ + +/* + * Layout of local APIC interrupt vectors: + * + * 0xff (255) +-------------+ + * | | 15 (Spurious / IPIs / Local Interrupts) + * 0xf0 (240) +-------------+ + * | | 14 (I/O Interrupts / Timer) + * 0xe0 (224) +-------------+ + * | | 13 (I/O Interrupts) + * 0xd0 (208) +-------------+ + * | | 12 (I/O Interrupts) + * 0xc0 (192) +-------------+ + * | | 11 (I/O Interrupts) + * 0xb0 (176) +-------------+ + * | | 10 (I/O Interrupts) + * 0xa0 (160) +-------------+ + * | | 9 (I/O Interrupts) + * 0x90 (144) +-------------+ + * | | 8 (I/O Interrupts / System Calls) + * 0x80 (128) +-------------+ + * | | 7 (I/O Interrupts) + * 0x70 (112) +-------------+ + * | | 6 (I/O Interrupts) + * 0x60 (96) +-------------+ + * | | 5 (I/O Interrupts) + * 0x50 (80) +-------------+ + * | | 4 (I/O Interrupts) + * 0x40 (64) +-------------+ + * | | 3 (I/O Interrupts) + * 0x30 (48) +-------------+ + * | | 2 (ATPIC Interrupts) + * 0x20 (32) +-------------+ + * | | 1 (Exceptions, traps, faults, etc.) + * 0x10 (16) +-------------+ + * | | 0 (Exceptions, traps, faults, etc.) + * 0x00 (0) +-------------+ + * + * Note: 0x80 needs to be handled specially and not allocated to an + * I/O device! + */ + +#define MAX_APIC_ID 0xfe +#define APIC_ID_ALL 0xff + +/* I/O Interrupts are used for external devices such as ISA, PCI, etc. */ +#define APIC_IO_INTS (IDT_IO_INTS + 16) +#define APIC_NUM_IOINTS 191 + +/* The timer interrupt is used for clock handling and drives hardclock, etc. */ +#define APIC_TIMER_INT (APIC_IO_INTS + APIC_NUM_IOINTS) + +/* + ********************* !!! WARNING !!! ****************************** + * Each local apic has an interrupt receive fifo that is two entries deep + * for each interrupt priority class (higher 4 bits of interrupt vector). + * Once the fifo is full the APIC can no longer receive interrupts for this + * class and sending IPIs from other CPUs will be blocked. + * To avoid deadlocks there should be no more than two IPI interrupts + * pending at the same time. + * Currently this is guaranteed by dividing the IPIs in two groups that have + * each at most one IPI interrupt pending. The first group is protected by the + * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user + * at a time) The second group uses a single interrupt and a bitmap to avoid + * redundant IPI interrupts. + */ + +/* Interrupts for local APIC LVT entries other than the timer. */ +#define APIC_LOCAL_INTS 240 +#define APIC_ERROR_INT APIC_LOCAL_INTS +#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) +#define APIC_CMC_INT (APIC_LOCAL_INTS + 2) +#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) + +#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ +#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ +#define IPI_INVLPG (APIC_IPI_INTS + 2) +#define IPI_INVLRNG (APIC_IPI_INTS + 3) +#define IPI_INVLCACHE (APIC_IPI_INTS + 4) +/* Vector to handle bitmap based IPIs */ +#define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 5) + +/* IPIs handled by IPI_BITMAP_VECTOR */ +#define IPI_AST 0 /* Generate software trap. */ +#define IPI_PREEMPT 1 +#define IPI_HARDCLOCK 2 +#define IPI_BITMAP_LAST IPI_HARDCLOCK +#define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST) + +#define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */ +#define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. 
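As the vector layout above shows, the infrequent IPIs each own an IDT vector while the frequent ones (AST, preempt, hardclock) share IPI_BITMAP_VECTOR and are distinguished by a bitmap checked in the handler; IPI_IS_BITMAPED() is how callers tell the two classes apart. For example, IPI_IS_BITMAPED(IPI_PREEMPT) is true because IPI_PREEMPT (1) is no greater than IPI_BITMAP_LAST (2), whereas IPI_IS_BITMAPED(IPI_RENDEZVOUS) is false, IPI_RENDEZVOUS being a real vector number well above that range.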
*/ +#define IPI_DYN_FIRST (APIC_IPI_INTS + 8) +#define IPI_DYN_LAST (253) /* IPIs allocated at runtime */ + +/* + * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since + * it is delivered using an NMI anyways. + */ +#define IPI_NMI_FIRST 254 +#define IPI_TRACE 254 /* Interrupt for tracing. */ +#define IPI_STOP_HARD 255 /* Stop CPU with a NMI. */ + +/* + * The spurious interrupt can share the priority class with the IPIs since + * it is not a normal interrupt. (Does not use the APIC's interrupt fifo) + */ +#define APIC_SPURIOUS_INT 255 + +#ifndef LOCORE + +#define APIC_IPI_DEST_SELF -1 +#define APIC_IPI_DEST_ALL -2 +#define APIC_IPI_DEST_OTHERS -3 + +#define APIC_BUS_UNKNOWN -1 +#define APIC_BUS_ISA 0 +#define APIC_BUS_EISA 1 +#define APIC_BUS_PCI 2 +#define APIC_BUS_MAX APIC_BUS_PCI + +#define IRQ_EXTINT -1 +#define IRQ_NMI -2 +#define IRQ_SMI -3 +#define IRQ_DISABLED -4 + +/* + * An APIC enumerator is a pseudo bus driver that enumerates APIC's including + * CPU's and I/O APIC's. + */ +struct apic_enumerator { + const char *apic_name; + int (*apic_probe)(void); + int (*apic_probe_cpus)(void); + int (*apic_setup_local)(void); + int (*apic_setup_io)(void); + SLIST_ENTRY(apic_enumerator) apic_next; +}; + +inthand_t + IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), + IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), + IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), + IDTVEC(spuriousint), IDTVEC(timerint), + IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti), + IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti), + IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti), + IDTVEC(spuriousint_pti), IDTVEC(timerint_pti); + +extern vm_paddr_t lapic_paddr; +extern int apic_cpuids[]; + +void apic_register_enumerator(struct apic_enumerator *enumerator); +void *ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase); +int ioapic_disable_pin(void *cookie, u_int pin); +int ioapic_get_vector(void *cookie, u_int pin); +void ioapic_register(void *cookie); +int ioapic_remap_vector(void *cookie, u_int pin, int vector); +int ioapic_set_bus(void *cookie, u_int pin, int bus_type); +int ioapic_set_extint(void *cookie, u_int pin); +int ioapic_set_nmi(void *cookie, u_int pin); +int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol); +int ioapic_set_triggermode(void *cookie, u_int pin, + enum intr_trigger trigger); +int ioapic_set_smi(void *cookie, u_int pin); + +/* + * Struct containing pointers to APIC functions whose + * implementation is run time selectable. 
+ */ +struct apic_ops { + void (*create)(u_int, int); + void (*init)(vm_paddr_t); + void (*xapic_mode)(void); + bool (*is_x2apic)(void); + void (*setup)(int); + void (*dump)(const char *); + void (*disable)(void); + void (*eoi)(void); + int (*id)(void); + int (*intr_pending)(u_int); + void (*set_logical_id)(u_int, u_int, u_int); + u_int (*cpuid)(u_int); + + /* Vectors */ + u_int (*alloc_vector)(u_int, u_int); + u_int (*alloc_vectors)(u_int, u_int *, u_int, u_int); + void (*enable_vector)(u_int, u_int); + void (*disable_vector)(u_int, u_int); + void (*free_vector)(u_int, u_int, u_int); + + + /* PMC */ + int (*enable_pmc)(void); + void (*disable_pmc)(void); + void (*reenable_pmc)(void); + + /* CMC */ + void (*enable_cmc)(void); + + /* AMD ELVT */ + int (*enable_mca_elvt)(void); + + /* IPI */ + void (*ipi_raw)(register_t, u_int); + void (*ipi_vectored)(u_int, int); + int (*ipi_wait)(int); + int (*ipi_alloc)(inthand_t *ipifunc); + void (*ipi_free)(int vector); + + /* LVT */ + int (*set_lvt_mask)(u_int, u_int, u_char); + int (*set_lvt_mode)(u_int, u_int, u_int32_t); + int (*set_lvt_polarity)(u_int, u_int, enum intr_polarity); + int (*set_lvt_triggermode)(u_int, u_int, enum intr_trigger); +}; + +extern struct apic_ops apic_ops; + +static inline void +lapic_create(u_int apic_id, int boot_cpu) +{ + + apic_ops.create(apic_id, boot_cpu); +} + +static inline void +lapic_init(vm_paddr_t addr) +{ + + apic_ops.init(addr); +} + +static inline void +lapic_xapic_mode(void) +{ + + apic_ops.xapic_mode(); +} + +static inline bool +lapic_is_x2apic(void) +{ + + return (apic_ops.is_x2apic()); +} + +static inline void +lapic_setup(int boot) +{ + + apic_ops.setup(boot); +} + +static inline void +lapic_dump(const char *str) +{ + + apic_ops.dump(str); +} + +static inline void +lapic_disable(void) +{ + + apic_ops.disable(); +} + +static inline void +lapic_eoi(void) +{ + + apic_ops.eoi(); +} + +static inline int +lapic_id(void) +{ + + return (apic_ops.id()); +} + +static inline int +lapic_intr_pending(u_int vector) +{ + + return (apic_ops.intr_pending(vector)); +} + +/* XXX: UNUSED */ +static inline void +lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) +{ + + apic_ops.set_logical_id(apic_id, cluster, cluster_id); +} + +static inline u_int +apic_cpuid(u_int apic_id) +{ + + return (apic_ops.cpuid(apic_id)); +} + +static inline u_int +apic_alloc_vector(u_int apic_id, u_int irq) +{ + + return (apic_ops.alloc_vector(apic_id, irq)); +} + +static inline u_int +apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) +{ + + return (apic_ops.alloc_vectors(apic_id, irqs, count, align)); +} + +static inline void +apic_enable_vector(u_int apic_id, u_int vector) +{ + + apic_ops.enable_vector(apic_id, vector); +} + +static inline void +apic_disable_vector(u_int apic_id, u_int vector) +{ + + apic_ops.disable_vector(apic_id, vector); +} + +static inline void +apic_free_vector(u_int apic_id, u_int vector, u_int irq) +{ + + apic_ops.free_vector(apic_id, vector, irq); +} + +static inline int +lapic_enable_pmc(void) +{ + + return (apic_ops.enable_pmc()); +} + +static inline void +lapic_disable_pmc(void) +{ + + apic_ops.disable_pmc(); +} + +static inline void +lapic_reenable_pmc(void) +{ + + apic_ops.reenable_pmc(); +} + +static inline void +lapic_enable_cmc(void) +{ + + apic_ops.enable_cmc(); +} + +static inline int +lapic_enable_mca_elvt(void) +{ + + return (apic_ops.enable_mca_elvt()); +} + +static inline void +lapic_ipi_raw(register_t icrlo, u_int dest) +{ + + apic_ops.ipi_raw(icrlo, dest); +} + 
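The apic_ops hooks above (and the inline wrappers built on them) make the runtime-selected APIC backend transparent to callers. As a hedged sketch of how the dynamic-IPI pair might be used, assuming a negative return from lapic_ipi_alloc() means no free vector (the IDT entry point name is hypothetical, and a real caller must not release the vector while the IPI may still be in flight):

    inthand_t IDTVEC(example_ipi);          /* hypothetical IDT entry point */

    static void
    example_broadcast_ipi(void)
    {
            int vector;

            vector = lapic_ipi_alloc(IDTVEC(example_ipi));
            if (vector < 0)
                    return;                 /* no free dynamic vector */
            lapic_ipi_vectored(vector, APIC_IPI_DEST_OTHERS);
            lapic_ipi_free(vector);
    }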
+static inline void +lapic_ipi_vectored(u_int vector, int dest) +{ + + apic_ops.ipi_vectored(vector, dest); +} + +static inline int +lapic_ipi_wait(int delay) +{ + + return (apic_ops.ipi_wait(delay)); +} + +static inline int +lapic_ipi_alloc(inthand_t *ipifunc) +{ + + return (apic_ops.ipi_alloc(ipifunc)); +} + +static inline void +lapic_ipi_free(int vector) +{ + + return (apic_ops.ipi_free(vector)); +} + +static inline int +lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) +{ + + return (apic_ops.set_lvt_mask(apic_id, lvt, masked)); +} + +static inline int +lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) +{ + + return (apic_ops.set_lvt_mode(apic_id, lvt, mode)); +} + +static inline int +lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) +{ + + return (apic_ops.set_lvt_polarity(apic_id, lvt, pol)); +} + +static inline int +lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) +{ + + return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger)); +} + +void lapic_handle_cmc(void); +void lapic_handle_error(void); +void lapic_handle_intr(int vector, struct trapframe *frame); +void lapic_handle_timer(struct trapframe *frame); + +int ioapic_get_rid(u_int apic_id, uint16_t *ridp); + +extern int x2apic_mode; +extern int lapic_eoi_suppression; + +#ifdef _SYS_SYSCTL_H_ +SYSCTL_DECL(_hw_apic); +#endif + +#endif /* !LOCORE */ +#endif /* _X86_APICVAR_H_ */ Property changes on: trunk/sys/x86/include/apicvar.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/apm_bios.h =================================================================== --- trunk/sys/x86/include/apm_bios.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/apm_bios.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -13,7 +13,7 @@ * * Aug, 1994 Implemented on FreeBSD 1.1.5.1R (Toshiba AVS001WD) * - * $FreeBSD: stable/10/sys/x86/include/apm_bios.h 215140 2010-11-11 19:36:21Z jkim $ + * $FreeBSD: stable/11/sys/x86/include/apm_bios.h 215140 2010-11-11 19:36:21Z jkim $ */ #ifndef _X86_APM_BIOS_H_ Modified: trunk/sys/x86/include/bus.h =================================================================== --- trunk/sys/x86/include/bus.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/bus.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -29,7 +29,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/bus.h 287126 2015-08-25 14:39:40Z marcel $ + * $FreeBSD: stable/11/sys/x86/include/bus.h 286667 2015-08-12 15:26:32Z marcel $ */ /* $NetBSD: bus.h,v 1.12 1997/10/01 08:25:15 fvdl Exp $ */ Modified: trunk/sys/x86/include/busdma_impl.h =================================================================== --- trunk/sys/x86/include/busdma_impl.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/busdma_impl.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/x86/include/busdma_impl.h 259512 2013-12-17 13:49:35Z kib $ + * $FreeBSD: stable/11/sys/x86/include/busdma_impl.h 257308 2013-10-29 07:25:54Z kib $ */ #ifndef __X86_BUSDMA_IMPL_H Added: trunk/sys/x86/include/cputypes.h =================================================================== --- trunk/sys/x86/include/cputypes.h (rev 0) +++ trunk/sys/x86/include/cputypes.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,50 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1993 Christopher G. Demetriou + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/cputypes.h 308433 2016-11-08 06:13:22Z jhb $ + */ + +#ifndef _X86_CPUTYPES_H_ +#define _X86_CPUTYPES_H_ + +/* + * Vendors of processor. + */ +#define CPU_VENDOR_NSC 0x100b /* NSC */ +#define CPU_VENDOR_IBM 0x1014 /* IBM */ +#define CPU_VENDOR_AMD 0x1022 /* AMD */ +#define CPU_VENDOR_SIS 0x1039 /* SiS */ +#define CPU_VENDOR_UMC 0x1060 /* UMC */ +#define CPU_VENDOR_NEXGEN 0x1074 /* Nexgen */ +#define CPU_VENDOR_CYRIX 0x1078 /* Cyrix */ +#define CPU_VENDOR_IDT 0x111d /* Centaur/IDT/VIA */ +#define CPU_VENDOR_TRANSMETA 0x1279 /* Transmeta */ +#define CPU_VENDOR_INTEL 0x8086 /* Intel */ +#define CPU_VENDOR_RISE 0xdead2bad /* Rise */ +#define CPU_VENDOR_CENTAUR CPU_VENDOR_IDT + +#endif /* !_X86_CPUTYPES_H_ */ Property changes on: trunk/sys/x86/include/cputypes.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/include/dump.h =================================================================== --- trunk/sys/x86/include/dump.h (rev 0) +++ trunk/sys/x86/include/dump.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,88 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2014 EMC Corp. + * Author: Conrad Meyer <conrad.meyer at isilon.com> + * All rights reserved. 
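[Editorial aside, illustrative only] The new cputypes.h above encodes CPU vendors as small integers (PCI vendor IDs where one exists), so identification code can compare a single value instead of the 12-byte CPUID vendor string. A minimal, hypothetical use, assuming the cpu_vendor_id global maintained by the x86 identification code:

	#include <machine/cputypes.h>

	extern u_int cpu_vendor_id;	/* normally provided by <machine/md_var.h> */

	static int
	cpu_is_amd(void)
	{
		/* cpu_vendor_id is assumed to hold one of the CPU_VENDOR_* values. */
		return (cpu_vendor_id == CPU_VENDOR_AMD);
	}
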
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/dump.h 276772 2015-01-07 01:01:39Z markj $ + */ + +#ifndef _MACHINE_DUMP_H_ +#define _MACHINE_DUMP_H_ + +#ifdef __amd64__ +#define KERNELDUMP_ARCH_VERSION KERNELDUMP_AMD64_VERSION +#define EM_VALUE EM_X86_64 +#else +#define KERNELDUMP_ARCH_VERSION KERNELDUMP_I386_VERSION +#define EM_VALUE EM_386 +#endif + +/* 20 phys_avail entry pairs correspond to 10 pa's */ +#define DUMPSYS_MD_PA_NPAIRS 10 +#define DUMPSYS_NUM_AUX_HDRS 0 + +static inline void +dumpsys_pa_init(void) +{ + + dumpsys_gen_pa_init(); +} + +static inline struct dump_pa * +dumpsys_pa_next(struct dump_pa *p) +{ + + return (dumpsys_gen_pa_next(p)); +} + +static inline void +dumpsys_wbinv_all(void) +{ + + dumpsys_gen_wbinv_all(); +} + +static inline void +dumpsys_unmap_chunk(vm_paddr_t pa, size_t s, void *va) +{ + + dumpsys_gen_unmap_chunk(pa, s, va); +} + +static inline int +dumpsys_write_aux_headers(struct dumperinfo *di) +{ + + return (dumpsys_gen_write_aux_headers(di)); +} + +static inline int +dumpsys(struct dumperinfo *di) +{ + + return (dumpsys_generic(di)); +} + +#endif /* !_MACHINE_DUMP_H_ */ Property changes on: trunk/sys/x86/include/dump.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/elf.h =================================================================== --- trunk/sys/x86/include/elf.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/elf.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/elf.h 247047 2013-02-20 17:39:52Z kib $ + * $FreeBSD: stable/11/sys/x86/include/elf.h 325810 2017-11-14 16:03:07Z jhb $ */ #ifndef _MACHINE_ELF_H_ @@ -101,8 +101,11 @@ #define AT_PAGESIZESLEN 21 /* Number of pagesizes. */ #define AT_TIMEKEEP 22 /* Pointer to timehands. */ #define AT_STACKPROT 23 /* Initial stack protection. 
*/ +#define AT_EHDRFLAGS 24 /* e_flags field from elf hdr */ +#define AT_HWCAP 25 /* CPU feature flags. */ +#define AT_HWCAP2 26 /* CPU feature flags 2. */ -#define AT_COUNT 24 /* Count of defined aux entry types. */ +#define AT_COUNT 27 /* Count of defined aux entry types. */ /* * Relocation types. @@ -186,8 +189,11 @@ #define AT_PAGESIZESLEN 21 /* Number of pagesizes. */ #define AT_TIMEKEEP 22 /* Pointer to timehands. */ #define AT_STACKPROT 23 /* Initial stack protection. */ +#define AT_EHDRFLAGS 24 /* e_flags field from elf hdr */ +#define AT_HWCAP 25 /* CPU feature flags. */ +#define AT_HWCAP2 26 /* CPU feature flags 2. */ -#define AT_COUNT 24 /* Count of defined aux entry types. */ +#define AT_COUNT 27 /* Count of defined aux entry types. */ /* * Relocation types. Modified: trunk/sys/x86/include/endian.h =================================================================== --- trunk/sys/x86/include/endian.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/endian.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)endian.h 7.8 (Berkeley) 4/3/91 - * $FreeBSD: stable/10/sys/x86/include/endian.h 233684 2012-03-29 23:31:48Z dim $ + * $FreeBSD: stable/11/sys/x86/include/endian.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_ENDIAN_H_ Modified: trunk/sys/x86/include/fdt.h =================================================================== --- trunk/sys/x86/include/fdt.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/fdt.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/fdt.h 266084 2014-05-14 19:18:58Z ian $ + * $FreeBSD: stable/11/sys/x86/include/fdt.h 260327 2014-01-05 18:46:58Z nwhitehorn $ */ #ifndef _MACHINE_FDT_H_ Modified: trunk/sys/x86/include/float.h =================================================================== --- trunk/sys/x86/include/float.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/float.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -11,7 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * from: @(#)float.h 7.1 (Berkeley) 5/8/90 - * $FreeBSD: stable/10/sys/x86/include/float.h 235939 2012-05-24 21:44:46Z obrien $ + * $FreeBSD: stable/11/sys/x86/include/float.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_FLOAT_H_ Modified: trunk/sys/x86/include/fpu.h =================================================================== --- trunk/sys/x86/include/fpu.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/fpu.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * from: @(#)npx.h 5.3 (Berkeley) 1/18/91 - * $FreeBSD: stable/10/sys/x86/include/fpu.h 279211 2015-02-23 18:38:41Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/fpu.h 274817 2014-11-21 20:53:17Z jhb $ */ /* Modified: trunk/sys/x86/include/frame.h =================================================================== --- trunk/sys/x86/include/frame.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/frame.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -32,7 +32,7 @@ * SUCH DAMAGE. * * from: @(#)frame.h 5.2 (Berkeley) 1/18/91 - * $FreeBSD: stable/10/sys/x86/include/frame.h 247047 2013-02-20 17:39:52Z kib $ + * $FreeBSD: stable/11/sys/x86/include/frame.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_FRAME_H_ @@ -65,7 +65,7 @@ int tf_eip; int tf_cs; int tf_eflags; - /* below only when crossing rings (e.g. user to kernel) */ + /* below only when crossing rings (user to kernel) */ int tf_esp; int tf_ss; }; @@ -90,15 +90,24 @@ int tf_eip; int tf_cs; int tf_eflags; - /* below only when crossing rings (e.g. user to kernel) */ + /* below only when crossing rings (user (including vm86) to kernel) */ int tf_esp; int tf_ss; - /* below only when switching out of VM86 mode */ + /* below only when crossing from vm86 mode to kernel */ int tf_vm86_es; int tf_vm86_ds; int tf_vm86_fs; int tf_vm86_gs; }; + +/* + * This alias for the MI TRAPF_USERMODE() should be used when we don't + * care about user mode itself, but need to know if a frame has stack + * registers. The difference is only logical, but on i386 the logic + * for using TRAPF_USERMODE() is complicated by sometimes treating vm86 + * bioscall mode (which is a special ring 3 user mode) as kernel mode. + */ +#define TF_HAS_STACKREGS(tf) TRAPF_USERMODE(tf) #endif /* __i386__ */ #ifdef __amd64__ @@ -137,6 +146,7 @@ register_t tf_rip; register_t tf_cs; register_t tf_rflags; + /* the amd64 frame always has the stack registers */ register_t tf_rsp; register_t tf_ss; }; Added: trunk/sys/x86/include/ifunc.h =================================================================== --- trunk/sys/x86/include/ifunc.h (rev 0) +++ trunk/sys/x86/include/ifunc.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,51 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015-2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib at FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
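[Editorial aside, illustrative only] The frame.h comments above tighten the rule that tf_esp/tf_ss are present only when the trap crossed rings, and TF_HAS_STACKREGS() names that test. A hypothetical i386 helper showing the intended guard:

	/* Return the interrupted user stack pointer, or 0 for a kernel frame. */
	static __inline int
	frame_user_esp(struct trapframe *tf)
	{
		return (TF_HAS_STACKREGS(tf) ? tf->tf_esp : 0);
	}
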
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/ifunc.h 339217 2018-10-07 00:40:56Z kib $ + */ + +#ifndef __X86_IFUNC_H +#define __X86_IFUNC_H + +#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(void))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver(void))args + +#define DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(uint32_t, uint32_t, \ + uint32_t, uint32_t))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver( \ + uint32_t cpu_feature __unused, \ + uint32_t cpu_feature2 __unused, \ + uint32_t cpu_stdext_feature __unused, \ + uint32_t cpu_stdext_feature2 __unused))args + +#endif Property changes on: trunk/sys/x86/include/ifunc.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/include/init.h =================================================================== --- trunk/sys/x86/include/init.h (rev 0) +++ trunk/sys/x86/include/init.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,59 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2013 Roger Pau Monn? <roger.pau at citrix.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
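[Editorial aside, illustrative only] The DEFINE_IFUNC/DEFINE_UIFUNC macros above emit an ELF ifunc: the resolver runs once when the object is relocated and returns the function the symbol should bind to, and the DEFINE_UIFUNC variant hands the resolver the four cached CPUID feature words so it needs no other state. A hypothetical example of the intended pattern (the copy_chunk_* names are invented; their bodies are assumed elsewhere):

	static size_t	copy_chunk_scalar(void *, const void *, size_t);
	static size_t	copy_chunk_avx512(void *, const void *, size_t);

	DEFINE_UIFUNC(static, size_t, copy_chunk,
	    (void *, const void *, size_t), static)
	{
		/* cpu_stdext_feature is the resolver's CPUID.(7,0):%ebx argument. */
		return ((cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 ?
		    copy_chunk_avx512 : copy_chunk_scalar);
	}
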
+ * + * $FreeBSD: stable/11/sys/x86/include/init.h 272310 2014-09-30 16:46:45Z royger $ + */ + +#ifndef __X86_INIT_H__ +#define __X86_INIT_H__ +/* + * Struct containing pointers to init functions whose + * implementation is run time selectable. Selection can be made, + * for example, based on detection of a BIOS variant or + * hypervisor environment. + */ +struct init_ops { + caddr_t (*parse_preload_data)(u_int64_t); + void (*early_clock_source_init)(void); + void (*early_delay)(int); + void (*parse_memmap)(caddr_t, vm_paddr_t *, int *); + u_int (*mp_bootaddress)(u_int); + int (*start_all_aps)(void); + void (*msi_init)(void); +}; + +extern struct init_ops init_ops; + +/* Knob to disable acpi_cpu devices */ +extern bool acpi_cpu_disabled; + +/* Knob to disable acpi_hpet device */ +extern bool acpi_hpet_disabled; + +/* Knob to disable acpi_timer device */ +extern bool acpi_timer_disabled; + +#endif /* __X86_INIT_H__ */ Property changes on: trunk/sys/x86/include/init.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/include/intr_machdep.h =================================================================== --- trunk/sys/x86/include/intr_machdep.h (rev 0) +++ trunk/sys/x86/include/intr_machdep.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,177 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/intr_machdep.h 342656 2018-12-31 22:09:08Z jhb $ + */ + +#ifndef __X86_INTR_MACHDEP_H__ +#define __X86_INTR_MACHDEP_H__ + +#ifdef _KERNEL + +/* + * Values used in determining the allocation of IRQ values among + * different types of I/O interrupts. These values are used as + * indices into a interrupt source array to map I/O interrupts to a + * device interrupt source whether it be a pin on an interrupt + * controller or an MSI interrupt. The 16 ISA IRQs are assigned fixed + * IDT vectors, but all other device interrupts allocate IDT vectors + * on demand. 
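[Editorial aside, illustrative only] struct init_ops above is the hook that lets a platform (legacy BIOS boot, EFI, or a hypervisor port) substitute its own early-boot routines before the machine-dependent startup path calls through the table. A sketch with hypothetical hyper_* functions:

	static caddr_t	hyper_parse_preload_data(u_int64_t);
	static void	hyper_early_delay(int);
	static int	hyper_start_all_aps(void);
	/* ...remaining handlers declared the same way... */

	static struct init_ops hyper_init_ops = {
		.parse_preload_data	= hyper_parse_preload_data,
		.early_delay		= hyper_early_delay,
		.start_all_aps		= hyper_start_all_aps,
		/* a real table fills in every member; NULL entries would fault */
	};

	static void
	hyper_platform_init(void)
	{
		/* Once the platform is recognized, replace the native defaults. */
		init_ops = hyper_init_ops;
	}
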
Currently we have 191 IDT vectors available for device + * interrupts on each CPU. On many systems with I/O APICs, a lot of + * the IRQs are not used, so the total number of IRQ values reserved + * can exceed the number of available IDT slots. + * + * The first 16 IRQs (0 - 15) are reserved for ISA IRQs. Interrupt + * pins on I/O APICs for non-ISA interrupts use IRQ values starting at + * IRQ 17. This layout matches the GSI numbering used by ACPI so that + * IRQ values returned by ACPI methods such as _CRS can be used + * directly by the ACPI bus driver. + * + * MSI interrupts allocate a block of interrupts starting at either + * the end of the I/O APIC range or 256, whichever is higher. When + * running under the Xen Hypervisor, an additional range of IRQ values + * are available for binding to event channel events. We use 256 as + * the minimum IRQ value for MSI interrupts to attempt to leave 255 + * unused since 255 is used in PCI to indicate an invalid INTx IRQ. + */ +#define MINIMUM_MSI_INT 256 + +extern u_int first_msi_irq; +extern u_int num_io_irqs; +extern u_int num_msi_irqs; + +/* + * Default base address for MSI messages on x86 platforms. + */ +#define MSI_INTEL_ADDR_BASE 0xfee00000 + +#ifndef LOCORE + +typedef void inthand_t(void); + +#define IDTVEC(name) __CONCAT(X,name) + +struct intsrc; + +/* + * Methods that a PIC provides to mask/unmask a given interrupt source, + * "turn on" the interrupt on the CPU side by setting up an IDT entry, and + * return the vector associated with this source. + */ +struct pic { + void (*pic_register_sources)(struct pic *); + void (*pic_enable_source)(struct intsrc *); + void (*pic_disable_source)(struct intsrc *, int); + void (*pic_eoi_source)(struct intsrc *); + void (*pic_enable_intr)(struct intsrc *); + void (*pic_disable_intr)(struct intsrc *); + int (*pic_vector)(struct intsrc *); + int (*pic_source_pending)(struct intsrc *); + void (*pic_suspend)(struct pic *); + void (*pic_resume)(struct pic *, bool suspend_cancelled); + int (*pic_config_intr)(struct intsrc *, enum intr_trigger, + enum intr_polarity); + int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); + void (*pic_reprogram_pin)(struct intsrc *); + TAILQ_ENTRY(pic) pics; +}; + +/* Flags for pic_disable_source() */ +enum { + PIC_EOI, + PIC_NO_EOI, +}; + +/* + * An interrupt source. The upper-layer code uses the PIC methods to + * control a given source. The lower-layer PIC drivers can store additional + * private data in a given interrupt source such as an interrupt pin number + * or an I/O APIC pointer. + */ +struct intsrc { + struct pic *is_pic; + struct intr_event *is_event; + u_long *is_count; + u_long *is_straycount; + u_int is_index; + u_int is_handlers; +}; + +struct trapframe; + +#ifdef SMP +extern cpuset_t intr_cpus; +#endif +extern struct mtx icu_lock; +extern int elcr_found; +#ifdef SMP +extern int msix_disable_migration; +#endif + +#ifndef DEV_ATPIC +void atpic_reset(void); +#endif +/* XXX: The elcr_* prototypes probably belong somewhere else. 
*/ +int elcr_probe(void); +enum intr_trigger elcr_read_trigger(u_int irq); +void elcr_resume(void); +void elcr_write_trigger(u_int irq, enum intr_trigger trigger); +#ifdef SMP +void intr_add_cpu(u_int cpu); +#endif +int intr_add_handler(const char *name, int vector, driver_filter_t filter, + driver_intr_t handler, void *arg, enum intr_type flags, + void **cookiep); +#ifdef SMP +int intr_bind(u_int vector, u_char cpu); +#endif +int intr_config_intr(int vector, enum intr_trigger trig, + enum intr_polarity pol); +int intr_describe(u_int vector, void *ih, const char *descr); +void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); +u_int intr_next_cpu(void); +struct intsrc *intr_lookup_source(int vector); +int intr_register_pic(struct pic *pic); +int intr_register_source(struct intsrc *isrc); +int intr_remove_handler(void *cookie); +void intr_resume(bool suspend_cancelled); +void intr_suspend(void); +void intr_reprogram(void); +void intrcnt_add(const char *name, u_long **countp); +void nexus_add_irq(u_long irq); +int msi_alloc(device_t dev, int count, int maxcount, int *irqs); +void msi_init(void); +int msi_map(int irq, uint64_t *addr, uint32_t *data); +int msi_release(int *irqs, int count); +int msix_alloc(device_t dev, int *irq); +int msix_release(int irq); +#ifdef XENHVM +void xen_intr_alloc_irqs(void); +#endif + +#endif /* !LOCORE */ +#endif /* _KERNEL */ +#endif /* !__X86_INTR_MACHDEP_H__ */ Property changes on: trunk/sys/x86/include/intr_machdep.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/legacyvar.h =================================================================== --- trunk/sys/x86/include/legacyvar.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/legacyvar.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/legacyvar.h 280970 2015-04-01 21:48:54Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/legacyvar.h 294883 2016-01-27 02:23:54Z jhibbits $ */ #ifndef _X86_LEGACYVAR_H_ @@ -57,9 +57,10 @@ int legacy_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t value); struct resource *legacy_pcib_alloc_resource(device_t dev, device_t child, - int type, int *rid, u_long start, u_long end, u_long count, u_int flags); + int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, + u_int flags); int legacy_pcib_adjust_resource(device_t dev, device_t child, int type, - struct resource *r, u_long start, u_long end); + struct resource *r, rman_res_t start, rman_res_t end); int legacy_pcib_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r); int legacy_pcib_alloc_msi(device_t pcib, device_t dev, int count, Modified: trunk/sys/x86/include/mca.h =================================================================== --- trunk/sys/x86/include/mca.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/mca.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -25,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
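[Editorial aside, illustrative only] struct pic and struct intsrc above define the contract between the MI interrupt code and an interrupt-controller driver: the driver supplies method pointers, embeds a struct intsrc per pin so it can recover its own per-pin data by containment, and registers both. A rough, hypothetical driver skeleton:

	#define	MYPIC_NPINS	24

	static void	mypic_enable_source(struct intsrc *);
	static void	mypic_disable_source(struct intsrc *, int);
	static void	mypic_eoi_source(struct intsrc *);
	static void	mypic_enable_intr(struct intsrc *);
	static void	mypic_disable_intr(struct intsrc *);
	static int	mypic_vector(struct intsrc *);
	static int	mypic_source_pending(struct intsrc *);

	static struct pic mypic_pic = {
		.pic_enable_source	= mypic_enable_source,
		.pic_disable_source	= mypic_disable_source,
		.pic_eoi_source		= mypic_eoi_source,
		.pic_enable_intr	= mypic_enable_intr,
		.pic_disable_intr	= mypic_disable_intr,
		.pic_vector		= mypic_vector,
		.pic_source_pending	= mypic_source_pending,
	};

	static struct mypic_intsrc {
		struct intsrc	mi_intsrc;	/* first, so a struct intsrc * casts back */
		u_int		mi_pin;
	} mypic_pins[MYPIC_NPINS];

	static void
	mypic_register(void)
	{
		int i;

		intr_register_pic(&mypic_pic);
		for (i = 0; i < MYPIC_NPINS; i++) {
			mypic_pins[i].mi_intsrc.is_pic = &mypic_pic;
			mypic_pins[i].mi_pin = i;
			intr_register_source(&mypic_pins[i].mi_intsrc);
		}
	}
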
* - * $FreeBSD: stable/10/sys/x86/include/mca.h 283927 2015-06-02 19:20:39Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/mca.h 281887 2015-04-23 14:22:20Z jhb $ */ #ifndef __X86_MCA_H__ Modified: trunk/sys/x86/include/metadata.h =================================================================== --- trunk/sys/x86/include/metadata.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/metadata.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/metadata.h 294274 2016-01-18 15:52:07Z emaste $ + * $FreeBSD: stable/11/sys/x86/include/metadata.h 293343 2016-01-07 19:47:26Z emaste $ */ #ifndef _MACHINE_METADATA_H_ Modified: trunk/sys/x86/include/mptable.h =================================================================== --- trunk/sys/x86/include/mptable.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/mptable.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/mptable.h 259837 2013-12-24 19:10:56Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/mptable.h 259228 2013-12-11 21:19:04Z jhb $ */ #ifndef __MACHINE_MPTABLE_H__ @@ -31,10 +31,13 @@ enum busTypes { NOBUS = 0, + CBUS = 1, + CBUSII = 2, EISA = 3, ISA = 6, MCA = 9, PCI = 13, + XPRESS = 18, MAX_BUSTYPE = 18, UNKNOWN_BUSTYPE = 0xff }; @@ -41,17 +44,17 @@ /* MP Floating Pointer Structure */ typedef struct MPFPS { - char signature[4]; - u_int32_t pap; - u_char length; - u_char spec_rev; - u_char checksum; - u_char config_type; - u_char mpfb2; - u_char mpfb3; - u_char mpfb4; - u_char mpfb5; -} *mpfps_t; + uint8_t signature[4]; + uint32_t pap; + uint8_t length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t config_type; + uint8_t mpfb2; + uint8_t mpfb3; + uint8_t mpfb4; + uint8_t mpfb5; +} __packed *mpfps_t; #define MPFB2_IMCR_PRESENT 0x80 #define MPFB2_MUL_CLK_SRCS 0x40 @@ -58,20 +61,20 @@ /* MP Configuration Table Header */ typedef struct MPCTH { - char signature[4]; - u_short base_table_length; - u_char spec_rev; - u_char checksum; - u_char oem_id[8]; - u_char product_id[12]; - u_int32_t oem_table_pointer; - u_short oem_table_size; - u_short entry_count; - u_int32_t apic_address; - u_short extended_table_length; - u_char extended_table_checksum; - u_char reserved; -} *mpcth_t; + uint8_t signature[4]; + uint16_t base_table_length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t oem_id[8]; + uint8_t product_id[12]; + uint32_t oem_table_pointer; + uint16_t oem_table_size; + uint16_t entry_count; + uint32_t apic_address; + uint16_t extended_table_length; + uint8_t extended_table_checksum; + uint8_t reserved; +} __packed *mpcth_t; /* Base table entries */ @@ -82,44 +85,44 @@ #define MPCT_ENTRY_LOCAL_INT 4 typedef struct PROCENTRY { - u_char type; - u_char apic_id; - u_char apic_version; - u_char cpu_flags; - u_int32_t cpu_signature; - u_int32_t feature_flags; - u_int32_t reserved1; - u_int32_t reserved2; -} *proc_entry_ptr; + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t cpu_flags; + uint32_t cpu_signature; + uint32_t feature_flags; + uint32_t reserved1; + uint32_t reserved2; +} __packed *proc_entry_ptr; #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 typedef struct BUSENTRY { - u_char type; - u_char bus_id; - char bus_type[6]; -} *bus_entry_ptr; + uint8_t type; + uint8_t bus_id; + uint8_t bus_type[6]; +} __packed 
*bus_entry_ptr; typedef struct IOAPICENTRY { - u_char type; - u_char apic_id; - u_char apic_version; - u_char apic_flags; - u_int32_t apic_address; -} *io_apic_entry_ptr; + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t apic_flags; + uint32_t apic_address; +} __packed *io_apic_entry_ptr; #define IOAPICENTRY_FLAG_EN 0x01 typedef struct INTENTRY { - u_char type; - u_char int_type; - u_short int_flags; - u_char src_bus_id; - u_char src_bus_irq; - u_char dst_apic_id; - u_char dst_apic_int; -} *int_entry_ptr; + uint8_t type; + uint8_t int_type; + uint16_t int_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_int; +} __packed *int_entry_ptr; #define INTENTRY_TYPE_INT 0 #define INTENTRY_TYPE_NMI 1 @@ -138,9 +141,9 @@ /* Extended table entries */ typedef struct EXTENTRY { - u_char type; - u_char length; -} *ext_entry_ptr; + uint8_t type; + uint8_t length; +} __packed *ext_entry_ptr; #define MPCT_EXTENTRY_SAS 0x80 #define MPCT_EXTENTRY_BHD 0x81 @@ -147,13 +150,13 @@ #define MPCT_EXTENTRY_CBASM 0x82 typedef struct SASENTRY { - u_char type; - u_char length; - u_char bus_id; - u_char address_type; + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_type; uint64_t address_base; uint64_t address_length; -} __attribute__((__packed__)) *sas_entry_ptr; +} __packed *sas_entry_ptr; #define SASENTRY_TYPE_IO 0 #define SASENTRY_TYPE_MEMORY 1 @@ -160,23 +163,23 @@ #define SASENTRY_TYPE_PREFETCH 2 typedef struct BHDENTRY { - u_char type; - u_char length; - u_char bus_id; - u_char bus_info; - u_char parent_bus; - u_char reserved[3]; -} *bhd_entry_ptr; + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t bus_info; + uint8_t parent_bus; + uint8_t reserved[3]; +} __packed *bhd_entry_ptr; #define BHDENTRY_INFO_SUBTRACTIVE_DECODE 0x1 typedef struct CBASMENTRY { - u_char type; - u_char length; - u_char bus_id; - u_char address_mod; - u_int predefined_range; -} *cbasm_entry_ptr; + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_mod; + uint32_t predefined_range; +} __packed *cbasm_entry_ptr; #define CBASMENTRY_ADDRESS_MOD_ADD 0x0 #define CBASMENTRY_ADDRESS_MOD_SUBTRACT 0x1 @@ -184,13 +187,6 @@ #define CBASMENTRY_RANGE_ISA_IO 0 #define CBASMENTRY_RANGE_VGA_IO 1 -/* descriptions of MP table entries */ -typedef struct BASETABLE_ENTRY { - u_char type; - u_char length; - char name[16]; -} basetable_entry; - #ifdef _KERNEL struct mptable_hostb_softc { #ifdef NEW_PCIB Modified: trunk/sys/x86/include/ofw_machdep.h =================================================================== --- trunk/sys/x86/include/ofw_machdep.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/ofw_machdep.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,13 +24,13 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
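[Editorial aside, illustrative only] The MP table structures above were converted to fixed-width, __packed types, which makes it safe to walk the configuration table as a raw byte stream regardless of compiler padding. An illustrative walk over the base table (mapping and checksum verification are assumed to have been done already):

	static void
	mptable_walk(mpcth_t cth)
	{
		uint8_t *entry;
		u_int i;

		entry = (uint8_t *)(cth + 1);	/* entries follow the header */
		for (i = 0; i < cth->entry_count; i++) {
			switch (*entry) {
			case MPCT_ENTRY_PROCESSOR:
				/* proc_entry_ptr, 20 bytes */
				entry += sizeof(struct PROCENTRY);
				break;
			default:
				/* bus, I/O APIC and interrupt entries are 8 bytes */
				entry += 8;
				break;
			}
		}
	}
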
* - * $FreeBSD: stable/10/sys/x86/include/ofw_machdep.h 266084 2014-05-14 19:18:58Z ian $ + * $FreeBSD: stable/11/sys/x86/include/ofw_machdep.h 287260 2015-08-28 15:41:09Z imp $ */ #ifndef _MACHINE_OFW_MACHDEP_H_ #define _MACHINE_OFW_MACHDEP_H_ -#include <x86/bus.h> +#include <machine/bus.h> #include <vm/vm.h> typedef uint32_t cell_t; Modified: trunk/sys/x86/include/pci_cfgreg.h =================================================================== --- trunk/sys/x86/include/pci_cfgreg.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/pci_cfgreg.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/pci_cfgreg.h 223440 2011-06-22 21:04:13Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/pci_cfgreg.h 294883 2016-01-27 02:23:54Z jhibbits $ * */ @@ -47,7 +47,7 @@ #define CONF2_ENABLE_CHK 0x0e #define CONF2_ENABLE_RES 0x0e -u_long hostb_alloc_start(int type, u_long start, u_long end, u_long count); +rman_res_t hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count); int pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus); int pci_cfgregopen(void); u_int32_t pci_cfgregread(int bus, int slot, int func, int reg, int bytes); Modified: trunk/sys/x86/include/psl.h =================================================================== --- trunk/sys/x86/include/psl.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/psl.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)psl.h 5.2 (Berkeley) 1/18/91 - * $FreeBSD: stable/10/sys/x86/include/psl.h 258559 2013-11-25 15:58:48Z emaste $ + * $FreeBSD: stable/11/sys/x86/include/psl.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_PSL_H_ Modified: trunk/sys/x86/include/ptrace.h =================================================================== --- trunk/sys/x86/include/ptrace.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/ptrace.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)ptrace.h 8.1 (Berkeley) 6/11/93 - * $FreeBSD: stable/10/sys/x86/include/ptrace.h 286311 2015-08-05 08:17:10Z kib $ + * $FreeBSD: stable/11/sys/x86/include/ptrace.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_PTRACE_H_ Added: trunk/sys/x86/include/pvclock.h =================================================================== --- trunk/sys/x86/include/pvclock.h (rev 0) +++ trunk/sys/x86/include/pvclock.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,60 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2014, Bryan Venteicher <bryanv at FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/pvclock.h 278184 2015-02-04 08:33:04Z bryanv $ + */ + +#ifndef X86_PVCLOCK +#define X86_PVCLOCK + +struct pvclock_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + uint8_t flags; + uint8_t pad[2]; +}; + +#define PVCLOCK_FLAG_TSC_STABLE 0x01 +#define PVCLOCK_FLAG_GUEST_PASUED 0x02 + +struct pvclock_wall_clock { + uint32_t version; + uint32_t sec; + uint32_t nsec; +}; + +void pvclock_resume(void); +uint64_t pvclock_get_last_cycles(void); +uint64_t pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti); +uint64_t pvclock_get_timecount(struct pvclock_vcpu_time_info *ti); +void pvclock_get_wallclock(struct pvclock_wall_clock *wc, + struct timespec *ts); + +#endif Property changes on: trunk/sys/x86/include/pvclock.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/reg.h =================================================================== --- trunk/sys/x86/include/reg.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/reg.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -32,7 +32,7 @@ * SUCH DAMAGE. * * from: @(#)reg.h 5.5 (Berkeley) 1/18/91 - * $FreeBSD: stable/10/sys/x86/include/reg.h 283910 2015-06-02 14:54:53Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/reg.h 338691 2018-09-14 23:21:52Z jhb $ */ #ifndef _MACHINE_REG_H_ @@ -205,6 +205,14 @@ /* Index 8-15: reserved */ }; +#define DBREG_DR6_RESERVED1 0xffff0ff0 +#define DBREG_DR6_BMASK 0x000f +#define DBREG_DR6_B(i) (1 << (i)) +#define DBREG_DR6_BD 0x2000 +#define DBREG_DR6_BS 0x4000 +#define DBREG_DR6_BT 0x8000 + +#define DBREG_DR7_RESERVED1 0x0400 #define DBREG_DR7_LOCAL_ENABLE 0x01 #define DBREG_DR7_GLOBAL_ENABLE 0x02 #define DBREG_DR7_LEN_1 0x00 /* 1 byte length */ @@ -235,6 +243,8 @@ #undef __dbreg64 #ifdef _KERNEL +struct thread; + /* * XXX these interfaces are MI, so they should be declared in a MI place. */ Modified: trunk/sys/x86/include/segments.h =================================================================== --- trunk/sys/x86/include/segments.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/segments.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -32,7 +32,7 @@ * SUCH DAMAGE. 
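[Editorial aside, illustrative only] The new pvclock.h above is the guest side of the KVM/Xen paravirtual clock: the hypervisor keeps a pvclock_vcpu_time_info record up to date and the helpers convert the guest TSC into a nanosecond count. A hypothetical timecounter built on it could look like this, assuming hyper_time_info points at the shared record for the boot vCPU:

	#include <sys/timetc.h>
	#include <machine/pvclock.h>

	static struct pvclock_vcpu_time_info *hyper_time_info;

	static u_int
	hyper_get_timecount(struct timecounter *tc)
	{
		return ((u_int)pvclock_get_timecount(hyper_time_info));
	}

	static struct timecounter hyper_timecounter = {
		.tc_get_timecount	= hyper_get_timecount,
		.tc_counter_mask	= ~0u,
		.tc_frequency		= 1000000000ULL,	/* pvclock counts nanoseconds */
		.tc_name		= "HYPERCLOCK",
		.tc_quality		= 950,
	};

It would then be registered with tc_init(&hyper_timecounter) once the shared record is mapped.
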
* * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 - * $FreeBSD: stable/10/sys/x86/include/segments.h 255040 2013-08-29 19:52:18Z gibbs $ + * $FreeBSD: stable/11/sys/x86/include/segments.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _X86_SEGMENTS_H_ @@ -47,11 +47,7 @@ */ #define SEL_RPL_MASK 3 /* requester priv level */ #define ISPL(s) ((s)&3) /* priority level of a selector */ -#ifdef XEN -#define SEL_KPL 1 /* kernel priority level */ -#else #define SEL_KPL 0 /* kernel priority level */ -#endif #define SEL_UPL 3 /* user priority level */ #define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ #define SEL_LDT 4 /* local descriptor table */ @@ -220,7 +216,7 @@ #define IDT_DTRACE_RET 0x92 /* DTrace pid provider Interrupt Vector */ #define IDT_EVTCHN 0x93 /* Xen HVM Event Channel Interrupt Vector */ -#if defined(__i386__) || defined(__ia64__) +#if defined(__i386__) /* * Entries in the Global Descriptor Table (GDT) * Note that each 4 entries share a single 32 byte L1 cache line. @@ -245,11 +241,7 @@ #define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ #define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ #define GNDIS_SEL 18 /* For the NDIS layer */ -#ifdef XEN -#define NGDT 9 -#else #define NGDT 19 -#endif /* * Entries in the Local Descriptor Table (LDT) @@ -265,7 +257,7 @@ #define LBSDICALLS_SEL 16 /* BSDI system call gate */ #define NLDT (LBSDICALLS_SEL + 1) -#else /* !__i386__ && !__ia64__ */ +#else /* !__i386__ */ /* * Entries in the Global Descriptor Table (GDT) */ @@ -283,6 +275,6 @@ #define GUSERLDT_SEL 11 /* LDT */ /* slot 12 is second half of GUSERLDT_SEL */ #define NGDT 13 -#endif /* __i386__ || __ia64__ */ +#endif /* __i386__ */ #endif /* !_X86_SEGMENTS_H_ */ Modified: trunk/sys/x86/include/setjmp.h =================================================================== --- trunk/sys/x86/include/setjmp.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/setjmp.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/setjmp.h 232275 2012-02-28 22:17:52Z tijl $ + * $FreeBSD: stable/11/sys/x86/include/setjmp.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_SETJMP_H_ Modified: trunk/sys/x86/include/sigframe.h =================================================================== --- trunk/sys/x86/include/sigframe.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/sigframe.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -26,7 +26,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/sigframe.h 247047 2013-02-20 17:39:52Z kib $ + * $FreeBSD: stable/11/sys/x86/include/sigframe.h 247047 2013-02-20 17:39:52Z kib $ */ #ifndef _X86_SIGFRAME_H_ Modified: trunk/sys/x86/include/signal.h =================================================================== --- trunk/sys/x86/include/signal.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/signal.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -29,7 +29,7 @@ * SUCH DAMAGE. 
* * @(#)signal.h 8.1 (Berkeley) 6/11/93 - * $FreeBSD: stable/10/sys/x86/include/signal.h 247047 2013-02-20 17:39:52Z kib $ + * $FreeBSD: stable/11/sys/x86/include/signal.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _X86_SIGNAL_H Modified: trunk/sys/x86/include/specialreg.h =================================================================== --- trunk/sys/x86/include/specialreg.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/specialreg.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 - * $FreeBSD: stable/10/sys/x86/include/specialreg.h 315928 2017-03-25 05:09:03Z grehan $ + * $FreeBSD: stable/11/sys/x86/include/specialreg.h 354658 2019-11-12 19:35:46Z scottl $ */ #ifndef _MACHINE_SPECIALREG_H_ @@ -54,6 +54,7 @@ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 +#define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers @@ -74,6 +75,7 @@ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ +#define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. @@ -322,6 +324,13 @@ #define AMDPM_CPB 0x00000200 /* + * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) + */ +#define AMDFEID_CLZERO 0x00000001 +#define AMDFEID_IRPERF 0x00000002 +#define AMDFEID_XSAVEERPTR 0x00000004 + +/* * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff @@ -348,15 +357,21 @@ #define CPUID_STDEXT_MPX 0x00004000 #define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_AVX512IFMA 0x00200000 +#define CPUID_STDEXT_PCOMMIT 0x00400000 #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 +#define CPUID_STDEXT_AVX512BW 0x40000000 +#define CPUID_STDEXT_AVX512VL 0x80000000 /* * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info @@ -365,10 +380,42 @@ #define CPUID_STDEXT2_UMIP 0x00000004 #define CPUID_STDEXT2_PKU 0x00000008 #define CPUID_STDEXT2_OSPKE 0x00000010 +#define CPUID_STDEXT2_WAITPKG 0x00000020 +#define CPUID_STDEXT2_GFNI 0x00000100 #define CPUID_STDEXT2_RDPID 0x00400000 +#define CPUID_STDEXT2_CLDEMOTE 0x02000000 +#define CPUID_STDEXT2_MOVDIRI 0x08000000 +#define CPUID_STDEXT2_MOVDIRI64B 0x10000000 #define CPUID_STDEXT2_SGXLC 0x40000000 /* + * CPUID instruction 7 Structured Extended Features, leaf 0 edx info + */ +#define CPUID_STDEXT3_MD_CLEAR 0x00000400 +#define CPUID_STDEXT3_TSXFA 0x00002000 +#define CPUID_STDEXT3_IBPB 0x04000000 +#define CPUID_STDEXT3_STIBP 0x08000000 +#define CPUID_STDEXT3_L1D_FLUSH 0x10000000 +#define CPUID_STDEXT3_ARCH_CAP 0x20000000 +#define CPUID_STDEXT3_CORE_CAP 0x40000000 +#define CPUID_STDEXT3_SSBD 0x80000000 + +/* MSR IA32_ARCH_CAP(ABILITIES) bits */ +#define IA32_ARCH_CAP_RDCL_NO 0x00000001 +#define IA32_ARCH_CAP_IBRS_ALL 0x00000002 +#define IA32_ARCH_CAP_RSBA 0x00000004 +#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 +#define IA32_ARCH_CAP_SSB_NO 0x00000010 +#define IA32_ARCH_CAP_MDS_NO 
0x00000020 +#define IA32_ARCH_CAP_IF_PSCHANGE_MC_NO 0x00000040 +#define IA32_ARCH_CAP_TSX_CTRL 0x00000080 +#define IA32_ARCH_CAP_TAA_NO 0x00000100 + +/* MSR IA32_TSX_CTRL bits */ +#define IA32_TSX_CTRL_RTM_DISABLE 0x00000001 +#define IA32_TSX_CTRL_TSX_CPUID_CLEAR 0x00000002 + +/* * CPUID manufacturers identifiers */ #define AMD_VENDOR_ID "AuthenticAMD" @@ -396,6 +443,8 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_SPEC_CTRL 0x048 +#define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 @@ -408,6 +457,9 @@ #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe +#define MSR_IA32_ARCH_CAP 0x10a +#define MSR_IA32_FLUSH_CMD 0x10b +#define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 @@ -414,6 +466,7 @@ #define MSR_BBL_CR_TRIG 0x11a #define MSR_BBL_CR_BUSY 0x11b #define MSR_BBL_CR_CTL3 0x11e +#define MSR_IA32_TSX_CTRL 0x122 #define MSR_SYSENTER_CS_MSR 0x174 #define MSR_SYSENTER_ESP_MSR 0x175 #define MSR_SYSENTER_EIP_MSR 0x176 @@ -467,6 +520,7 @@ #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 +#define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ /* * VMX MSRs @@ -488,8 +542,10 @@ #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* - * X2APIC MSRs + * X2APIC MSRs. + * Writes are not serializing. */ +#define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 @@ -548,6 +604,21 @@ #define IA32_MISC_EN_XDD 0x0000000400000000ULL /* + * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' + * document 336996-001 Speculative Execution Side Channel Mitigations. + */ +/* MSR IA32_SPEC_CTRL */ +#define IA32_SPEC_CTRL_IBRS 0x00000001 +#define IA32_SPEC_CTRL_STIBP 0x00000002 +#define IA32_SPEC_CTRL_SSBD 0x00000004 + +/* MSR IA32_PRED_CMD */ +#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL + +/* MSR IA32_FLUSH_CMD */ +#define IA32_FLUSH_CMD_L1D 0x00000001 + +/* * PAT modes. */ #define PAT_UNCACHEABLE 0x00 @@ -697,6 +768,22 @@ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 +#define MC_AMDNB_BANK 4 +#define MC_MISC_AMDNB_VAL 0x8000000000000000 /* Counter presence valid */ +#define MC_MISC_AMDNB_CNTP 0x4000000000000000 /* Counter present */ +#define MC_MISC_AMDNB_LOCK 0x2000000000000000 /* Register locked */ +#define MC_MISC_AMDNB_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ +#define MC_MISC_AMDNB_LVT_SHIFT 52 +#define MC_MISC_AMDNB_CNTEN 0x0008000000000000 /* Counter enabled */ +#define MC_MISC_AMDNB_INT_MASK 0x0006000000000000 /* Interrupt type */ +#define MC_MISC_AMDNB_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ +#define MC_MISC_AMDNB_INT_SMI 0x0004000000000000 /* SMI */ +#define MC_MISC_AMDNB_OVERFLOW 0x0001000000000000 /* Counter overflow */ +#define MC_MISC_AMDNB_CNT_MASK 0x00000fff00000000 /* Counter value */ +#define MC_MISC_AMDNB_CNT_SHIFT 32 +#define MC_MISC_AMDNB_CNT_MAX 0xfff +#define MC_MISC_AMDNB_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ +#define MC_MISC_AMDNB_PTR_SHIFT 24 /* * The following four 3-byte registers control the non-cacheable regions. 
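[Editorial aside, illustrative only] The IA32_SPEC_CTRL, IA32_PRED_CMD and IA32_FLUSH_CMD definitions above are the MSR interface behind the Spectre/MDS mitigations: setting a bit enables the corresponding hardware control on the current CPU. Purely illustrative (real mitigation code also checks the CPUID_STDEXT3_* capability bits, honors tunables, and reapplies the value on every CPU and across resume):

	static void
	spec_ctrl_enable(void)
	{
		uint64_t v;

		v = rdmsr(MSR_IA32_SPEC_CTRL);
		wrmsr(MSR_IA32_SPEC_CTRL,
		    v | IA32_SPEC_CTRL_IBRS | IA32_SPEC_CTRL_SSBD);
	}
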
@@ -800,6 +887,7 @@ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 @@ -817,6 +905,8 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ @@ -823,12 +913,12 @@ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ +#define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ #define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ +#define MSR_LS_CFG 0xc0011020 #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ -#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ -#define MSR_MC0_CTL_MASK 0xc0010044 -#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ -#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ Added: trunk/sys/x86/include/stack.h =================================================================== --- trunk/sys/x86/include/stack.h (rev 0) +++ trunk/sys/x86/include/stack.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,62 @@ +/* $MidnightBSD$ */ +/*- + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution at CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + * + * $FreeBSD: stable/11/sys/x86/include/stack.h 287645 2015-09-11 03:54:37Z markj $ + */ + +#ifndef _X86_STACK_H +#define _X86_STACK_H + +/* + * Stack trace. 
+ */ + +#ifdef __i386__ +struct i386_frame { + struct i386_frame *f_frame; + u_int f_retaddr; + u_int f_arg0; +}; +#endif + +#ifdef __amd64__ +struct amd64_frame { + struct amd64_frame *f_frame; + u_long f_retaddr; +}; + +struct i386_frame { + uint32_t f_frame; + uint32_t f_retaddr; + uint32_t f_arg0; +}; +#endif /* __amd64__ */ + +#ifdef _KERNEL +int stack_nmi_handler(struct trapframe *); +#endif + +#endif /* !_X86_STACK_H */ Property changes on: trunk/sys/x86/include/stack.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/stdarg.h =================================================================== --- trunk/sys/x86/include/stdarg.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/stdarg.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/stdarg.h 256105 2013-10-07 10:01:23Z phk $ + * $FreeBSD: stable/11/sys/x86/include/stdarg.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_STDARG_H_ Modified: trunk/sys/x86/include/sysarch.h =================================================================== --- trunk/sys/x86/include/sysarch.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/sysarch.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/sysarch.h 233209 2012-03-19 21:57:31Z tijl $ + * $FreeBSD: stable/11/sys/x86/include/sysarch.h 331722 2018-03-29 02:50:57Z eadler $ */ /* Modified: trunk/sys/x86/include/trap.h =================================================================== --- trunk/sys/x86/include/trap.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/trap.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)trap.h 5.4 (Berkeley) 5/9/91 - * $FreeBSD: stable/10/sys/x86/include/trap.h 262042 2014-02-17 12:57:13Z avg $ + * $FreeBSD: stable/11/sys/x86/include/trap.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _MACHINE_TRAP_H_ Added: trunk/sys/x86/include/ucode.h =================================================================== --- trunk/sys/x86/include/ucode.h (rev 0) +++ trunk/sys/x86/include/ucode.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,69 @@ +/* $MidnightBSD$ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
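[Editorial aside, illustrative only] The stack.h frame records above are exactly what a frame-pointer unwinder needs: each frame stores the caller's frame pointer followed by the return address. A hypothetical amd64 walk, assuming the kernel keeps %rbp chains:

	static void
	print_backtrace(struct amd64_frame *fp, int maxdepth)
	{
		while (fp != NULL && maxdepth-- > 0) {
			printf("  return address %#lx\n", fp->f_retaddr);
			fp = fp->f_frame;
		}
	}
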
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/ucode.h 347700 2019-05-16 14:42:16Z markj $ + */ + +#ifndef _MACHINE_UCODE_H_ +#define _MACHINE_UCODE_H_ + +struct ucode_intel_header { + uint32_t header_version; + int32_t update_revision; + uint32_t dat; + uint32_t processor_signature; + uint32_t checksum; + uint32_t loader_revision; + uint32_t processor_flags; +#define UCODE_INTEL_DEFAULT_DATA_SIZE 2000 + uint32_t data_size; + uint32_t total_size; + uint32_t reserved[3]; +}; + +struct ucode_intel_extsig_table { + uint32_t signature_count; + uint32_t signature_table_checksum; + uint32_t reserved[3]; + struct ucode_intel_extsig { + uint32_t processor_signature; + uint32_t processor_flags; + uint32_t checksum; + } entries[0]; +}; + +int ucode_intel_load(void *data, bool unsafe, + uint64_t *nrevp, uint64_t *orevp); +size_t ucode_load_bsp(uintptr_t free); +void ucode_load_ap(int cpu); +void ucode_reload(void); +void * ucode_update(void *data); + +#endif /* _MACHINE_UCODE_H_ */ Property changes on: trunk/sys/x86/include/ucode.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/x86/include/ucontext.h =================================================================== --- trunk/sys/x86/include/ucontext.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/ucontext.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -27,7 +27,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/ucontext.h 247047 2013-02-20 17:39:52Z kib $ + * $FreeBSD: stable/11/sys/x86/include/ucontext.h 295561 2016-02-12 07:38:19Z kib $ */ #ifndef _X86_UCONTEXT_H_ @@ -163,4 +163,9 @@ } mcontext_t; #endif /* __amd64__ */ +#ifdef __LINT__ +typedef struct __mcontext { +} mcontext_t; +#endif /* __LINT__ */ + #endif /* !_X86_UCONTEXT_H_ */ Modified: trunk/sys/x86/include/vdso.h =================================================================== --- trunk/sys/x86/include/vdso.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/vdso.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -1,8 +1,12 @@ /* $MidnightBSD$ */ /*- * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>. + * Copyright 2016 The FreeBSD Foundation. * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. 
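[Editorial aside, illustrative only] ucode.h above describes the Intel microcode update container and the kernel's loader entry points; a caller hands ucode_intel_load() an update image and receives the old and new microcode revisions. A hypothetical call exercising only the declared interface (the blob source and the meaning of the bool flag are taken as given):

	static int
	apply_microcode(void *ucode_blob)
	{
		uint64_t new_rev, old_rev;
		int error;

		error = ucode_intel_load(ucode_blob, false, &new_rev, &old_rev);
		if (error == 0)
			printf("microcode: rev %#jx -> %#jx\n",
			    (uintmax_t)old_rev, (uintmax_t)new_rev);
		return (error);
	}
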
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -23,7 +27,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/vdso.h 237433 2012-06-22 07:06:40Z kib $ + * $FreeBSD: stable/11/sys/x86/include/vdso.h 311376 2017-01-05 07:42:08Z sephe $ */ #ifndef _X86_VDSO_H @@ -31,8 +35,13 @@ #define VDSO_TIMEHANDS_MD \ uint32_t th_x86_shift; \ - uint32_t th_res[7]; + uint32_t th_x86_hpet_idx; \ + uint32_t th_res[6]; +#define VDSO_TH_ALGO_X86_TSC VDSO_TH_ALGO_1 +#define VDSO_TH_ALGO_X86_HPET VDSO_TH_ALGO_2 +#define VDSO_TH_ALGO_X86_HVTSC VDSO_TH_ALGO_3 /* Hyper-V ref. TSC */ + #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 Modified: trunk/sys/x86/include/vmware.h =================================================================== --- trunk/sys/x86/include/vmware.h 2020-02-08 19:32:41 UTC (rev 12310) +++ trunk/sys/x86/include/vmware.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/x86/include/vmware.h 278522 2015-02-10 16:34:42Z jhb $ + * $FreeBSD: stable/11/sys/x86/include/vmware.h 278749 2015-02-14 09:00:12Z kib $ */ #ifndef _X86_VMWARE_H_ @@ -32,9 +32,14 @@ #define VMW_HVMAGIC 0x564d5868 #define VMW_HVPORT 0x5658 + #define VMW_HVCMD_GETVERSION 10 #define VMW_HVCMD_GETHZ 45 +#define VMW_HVCMD_GETVCPU_INFO 68 +#define VMW_VCPUINFO_LEGACY_X2APIC (1 << 3) +#define VMW_VCPUINFO_VCPU_RESERVED (1 << 31) + static __inline void vmware_hvcall(u_int cmd, u_int *p) { Added: trunk/sys/x86/include/x86_smp.h =================================================================== --- trunk/sys/x86/include/x86_smp.h (rev 0) +++ trunk/sys/x86/include/x86_smp.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,108 @@ +/* $MidnightBSD$ */ +/*- + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk at FreeBSD.org> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. 
Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD: stable/11/sys/x86/include/x86_smp.h 329462 2018-02-17 18:00:01Z kib $ + * + */ + +#ifndef _X86_X86_SMP_H_ +#define _X86_X86_SMP_H_ + +#include <sys/bus.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <x86/apicvar.h> +#include <machine/pcb.h> + +struct pmap; + +/* global data in mp_x86.c */ +extern int mp_naps; +extern int boot_cpu_id; +extern struct pcb stoppcbs[]; +extern int cpu_apic_ids[]; +extern int bootAP; +extern void *dpcpu; +extern char *bootSTK; +extern void *bootstacks[]; +extern volatile u_int cpu_ipi_pending[]; +extern volatile int aps_ready; +extern struct mtx ap_boot_mtx; +extern int cpu_logical; +extern int cpu_cores; +extern volatile uint32_t smp_tlb_generation; +extern struct pmap *smp_tlb_pmap; +extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2; +extern u_int xhits_gbl[]; +extern u_int xhits_pg[]; +extern u_int xhits_rng[]; +extern u_int ipi_global; +extern u_int ipi_page; +extern u_int ipi_range; +extern u_int ipi_range_size; + +extern int nmi_kdb_lock; +extern int nmi_is_broadcast; + +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; + int cpu_hyperthread:1; +}; +extern struct cpu_info cpu_info[]; + +#ifdef COUNT_IPIS +extern u_long *ipi_invltlb_counts[MAXCPU]; +extern u_long *ipi_invlrng_counts[MAXCPU]; +extern u_long *ipi_invlpg_counts[MAXCPU]; +extern u_long *ipi_invlcache_counts[MAXCPU]; +extern u_long *ipi_rendezvous_counts[MAXCPU]; +#endif + +/* IPI handlers */ +inthand_t + IDTVEC(invltlb), /* TLB shootdowns - global */ + IDTVEC(invlpg), /* TLB shootdowns - 1 page */ + IDTVEC(invlrng), /* TLB shootdowns - page range */ + IDTVEC(invlcache), /* Write back and invalidate cache */ + IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ + IDTVEC(cpustop), /* CPU stops & waits to be restarted */ + IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ + IDTVEC(rendezvous); /* handle CPU rendezvous */ + +/* functions in x86_mp.c */ +void assign_cpu_ids(void); +void cpu_add(u_int apic_id, char boot_cpu); +void cpustop_handler(void); +void cpususpend_handler(void); +void init_secondary_tail(void); +void invltlb_handler(void); +void invlpg_handler(void); +void invlrng_handler(void); +void invlcache_handler(void); +void init_secondary(void); +void ipi_startup(int apic_id, int vector); +void ipi_all_but_self(u_int ipi); +void ipi_bitmap_handler(struct trapframe frame); +void ipi_cpu(int cpu, u_int ipi); +int ipi_nmi_handler(void); +void ipi_selected(cpuset_t cpus, u_int ipi); +u_int mp_bootaddress(u_int); +void set_interrupt_apic_ids(void); +void smp_cache_flush(void); +void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap); +void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, + vm_offset_t endva, struct pmap *pmap); +void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap); +void mem_range_AP_init(void); +void topo_probe(void); +void ipi_send_cpu(int cpu, u_int ipi); + +#endif Property changes on: trunk/sys/x86/include/x86_smp.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/include/x86_var.h =================================================================== --- 
trunk/sys/x86/include/x86_var.h (rev 0) +++ trunk/sys/x86/include/x86_var.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,161 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1995 Bruce D. Evans. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/x86/include/x86_var.h 355094 2019-11-25 16:46:41Z kib $ + */ + +#ifndef _X86_X86_VAR_H_ +#define _X86_X86_VAR_H_ + +/* + * Miscellaneous machine-dependent declarations. + */ + +extern long Maxmem; +extern u_int basemem; +extern int busdma_swi_pending; +extern u_int cpu_exthigh; +extern u_int cpu_feature; +extern u_int cpu_feature2; +extern u_int amd_feature; +extern u_int amd_feature2; +extern u_int amd_pminfo; +extern u_int amd_extended_feature_extensions; +extern u_int via_feature_rng; +extern u_int via_feature_xcrypt; +extern u_int cpu_clflush_line_size; +extern u_int cpu_stdext_feature; +extern u_int cpu_stdext_feature2; +extern u_int cpu_stdext_feature3; +extern uint64_t cpu_ia32_arch_caps; +extern u_int cpu_fxsr; +extern u_int cpu_high; +extern u_int cpu_id; +extern u_int cpu_max_ext_state_size; +extern u_int cpu_mxcsr_mask; +extern u_int cpu_procinfo; +extern u_int cpu_procinfo2; +extern char cpu_vendor[]; +extern u_int cpu_vendor_id; +extern u_int cpu_mon_mwait_flags; +extern u_int cpu_mon_min_size; +extern u_int cpu_mon_max_size; +extern u_int cpu_maxphyaddr; +extern char ctx_switch_xsave[]; +extern u_int hv_high; +extern char hv_vendor[]; +extern char kstack[]; +extern char sigcode[]; +extern int szsigcode; +extern int vm_page_dump_size; +extern int workaround_erratum383; +extern int _udatasel; +extern int _ucodesel; +extern int _ucode32sel; +extern int _ufssel; +extern int _ugssel; +extern int use_xsave; +extern uint64_t xsave_mask; +extern int pti; +extern int hw_ibrs_active; +extern int hw_mds_disable; +extern int hw_ssb_active; +extern int x86_taa_enable; + +struct pcb; +struct thread; +struct reg; +struct fpreg; +struct dbreg; +struct dumperinfo; +struct trapframe; + +/* + * The interface type of the interrupt handler entry point cannot be + * expressed in C. 
Use simplest non-variadic function type as an + * approximation. + */ +typedef void alias_for_inthand_t(void); + +/* + * Returns the maximum physical address that can be used with the + * current system. + */ +static __inline vm_paddr_t +cpu_getmaxphyaddr(void) +{ +#if defined(__i386__) && !defined(PAE) + return (0xffffffff); +#else + return ((1ULL << cpu_maxphyaddr) - 1); +#endif +} + +void *alloc_fpusave(int flags); +void busdma_swi(void); +bool cpu_mwait_usable(void); +void cpu_probe_amdc1e(void); +void cpu_setregs(void); +void dump_add_page(vm_paddr_t); +void dump_drop_page(vm_paddr_t); +void finishidentcpu(void); +void identify_cpu1(void); +void identify_cpu2(void); +void identify_hypervisor(void); +void initializecpu(void); +void initializecpucache(void); +bool fix_cpuid(void); +void fillw(int /*u_short*/ pat, void *base, size_t cnt); +int is_physical_memory(vm_paddr_t addr); +int isa_nmi(int cd); +void handle_ibrs_entry(void); +void handle_ibrs_exit(void); +void hw_ibrs_recalculate(void); +void hw_mds_recalculate(void); +void hw_ssb_recalculate(bool all_cpus); +void x86_taa_recalculate(void); +void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); +void nmi_call_kdb_smp(u_int type, struct trapframe *frame); +void nmi_handle_intr(u_int type, struct trapframe *frame); +void pagecopy(void *from, void *to); +void printcpuinfo(void); +int pti_get_default(void); +int user_dbreg_trap(register_t dr6); +int minidumpsys(struct dumperinfo *); +struct pcb *get_pcb_td(struct thread *td); + +#define MSR_OP_ANDNOT 0x00000001 +#define MSR_OP_OR 0x00000002 +#define MSR_OP_WRITE 0x00000003 +#define MSR_OP_LOCAL 0x10000000 +#define MSR_OP_SCHED 0x20000000 +#define MSR_OP_RENDEZVOUS 0x30000000 +void x86_msr_op(u_int msr, u_int op, uint64_t arg1); + +#endif Property changes on: trunk/sys/x86/include/x86_var.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/x86/include/xen/xen-os.h =================================================================== --- trunk/sys/x86/include/xen/xen-os.h (rev 0) +++ trunk/sys/x86/include/xen/xen-os.h 2020-02-08 19:33:27 UTC (rev 12311) @@ -0,0 +1,39 @@ +/* $MidnightBSD$ */ +/***************************************************************************** + * x86/xen/xen-os.h + * + * Random collection of macros and definition + * + * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team) + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
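
The cpu_getmaxphyaddr() inline added in x86_var.h above turns
cpu_maxphyaddr (the MAXPHYADDR value the kernel records from CPUID)
into the highest usable physical address, with a fixed 4 GB - 1 answer
for non-PAE i386. Below is a minimal userland sketch of the same
computation, reading CPUID leaf 0x80000008 directly through the
GCC/Clang <cpuid.h> helper; it is illustrative and independent of the
kernel variable.

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        unsigned int eax, ebx, ecx, edx, maxphyaddr;
        uint64_t maxpa;

        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                return (1);
        maxphyaddr = eax & 0xff;                /* CPUID.80000008H:EAX[7:0] */
        maxpa = (1ULL << maxphyaddr) - 1;       /* 2^MAXPHYADDR - 1 */
        printf("MAXPHYADDR %u -> max physical address %#llx\n",
            maxphyaddr, (unsigned long long)maxpa);
        return (0);
}
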
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * $FreeBSD: stable/11/sys/x86/include/xen/xen-os.h 289686 2015-10-21 10:44:07Z royger $ + */ + +#ifndef _MACHINE_X86_XEN_XEN_OS_H_ +#define _MACHINE_X86_XEN_XEN_OS_H_ + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef __ASSEMBLY__ + +#endif /* !__ASSEMBLY__ */ + +#endif /* _MACHINE_X86_XEN_XEN_OS_H_ */ Property changes on: trunk/sys/x86/include/xen/xen-os.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:34:35 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:34:35 -0500 (EST) Subject: [Midnightbsd-cvs] src [12312] trunk/sys/x86: sync with FreeBSD 11-stable Message-ID: <202002081934.018JYZuq062031@stargazer.midnightbsd.org> Revision: 12312 http://svnweb.midnightbsd.org/src/?rev=12312 Author: laffer1 Date: 2020-02-08 14:34:34 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/x86/acpica/OsdEnvironment.c trunk/sys/x86/acpica/acpi_apm.c trunk/sys/x86/acpica/acpi_wakeup.c trunk/sys/x86/acpica/madt.c trunk/sys/x86/acpica/srat.c trunk/sys/x86/bios/smbios.c trunk/sys/x86/bios/vpd.c trunk/sys/x86/cpufreq/est.c trunk/sys/x86/cpufreq/hwpstate.c trunk/sys/x86/cpufreq/p4tcc.c trunk/sys/x86/cpufreq/powernow.c trunk/sys/x86/cpufreq/smist.c Modified: trunk/sys/x86/acpica/OsdEnvironment.c =================================================================== --- trunk/sys/x86/acpica/OsdEnvironment.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/acpica/OsdEnvironment.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -27,10 +27,11 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/OsdEnvironment.c 281687 2015-04-18 08:01:12Z jkim $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/OsdEnvironment.c 316303 2017-03-30 20:18:34Z jkim $"); #include <sys/types.h> #include <sys/bus.h> +#include <sys/kernel.h> #include <sys/sysctl.h> #include <contrib/dev/acpica/include/acpi.h> @@ -61,6 +62,16 @@ { long acpi_root; + if (TUNABLE_ULONG_FETCH("acpi.rsdp", &acpi_root)) + return (acpi_root); + + /* + * The hints mechanism is unreliable (it fails if anybody ever + * compiled in hints to the kernel). It has been replaced + * by the tunable method, but is used here as a fallback to + * retain maximum compatibility between old loaders and new + * kernels. It can be removed after 11.0R. 
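
The OsdEnvironment.c change above lets the loader hand the ACPI root
pointer to the kernel through the acpi.rsdp tunable, keeping the old
hints lookup (the code that follows) only as a compatibility fallback.
For reference, the value is expected to be the physical address of the
Root System Description Pointer; below is a minimal, illustrative
sketch of the ACPI 1.0 RSDP layout and its 20-byte checksum rule as
described in the ACPI specification. Neither the struct nor the helper
is part of this commit.

#include <stdint.h>
#include <string.h>

/* ACPI 1.0 root pointer layout (what acpi.rsdp points at). */
struct acpi_rsdp {
        char     signature[8];          /* "RSD PTR " */
        uint8_t  checksum;              /* first 20 bytes sum to zero */
        char     oem_id[6];
        uint8_t  revision;              /* >= 2 adds the XSDT fields */
        uint32_t rsdt_address;
} __attribute__((packed));

static int
rsdp_looks_valid(const struct acpi_rsdp *r)
{
        const uint8_t *b = (const uint8_t *)r;
        uint8_t sum = 0;
        int i;

        if (memcmp(r->signature, "RSD PTR ", 8) != 0)
                return (0);
        for (i = 0; i < 20; i++)
                sum += b[i];
        return (sum == 0);
}
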
+ */ if (resource_long_value("acpi", 0, "rsdp", &acpi_root) == 0) return (acpi_root); Modified: trunk/sys/x86/acpica/acpi_apm.c =================================================================== --- trunk/sys/x86/acpica/acpi_apm.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/acpica/acpi_apm.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_apm.c 228283 2011-12-05 16:08:18Z ed $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/acpi_apm.c 228283 2011-12-05 16:08:18Z ed $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/x86/acpica/acpi_wakeup.c =================================================================== --- trunk/sys/x86/acpica/acpi_wakeup.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/acpica/acpi_wakeup.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -29,10 +29,12 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_wakeup.c 331910 2018-04-03 07:52:06Z avg $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/acpi_wakeup.c 347700 2019-05-16 14:42:16Z markj $"); -#ifdef __i386__ -#include "opt_npx.h" +#if defined(__amd64__) +#define DEV_APIC +#else +#include "opt_apic.h" #endif #include <sys/param.h> @@ -43,6 +45,7 @@ #include <sys/memrange.h> #include <sys/smp.h> #include <sys/systm.h> +#include <sys/cons.h> #include <vm/vm.h> #include <vm/pmap.h> @@ -50,14 +53,17 @@ #include <machine/clock.h> #include <machine/cpu.h> #include <machine/intr_machdep.h> +#include <machine/md_var.h> #include <x86/mca.h> #include <machine/pcb.h> -#include <machine/pmap.h> #include <machine/specialreg.h> -#include <machine/md_var.h> +#include <x86/ucode.h> +#ifdef DEV_APIC +#include <x86/apicreg.h> +#include <x86/apicvar.h> +#endif #ifdef SMP -#include <x86/apicreg.h> #include <machine/smp.h> #include <machine/vmparam.h> #endif @@ -74,6 +80,7 @@ extern int acpi_resume_beep; extern int acpi_reset_video; +extern int acpi_susp_bounce; #ifdef SMP extern struct susppcb **susppcbs; @@ -82,7 +89,7 @@ static struct susppcb **susppcbs; #endif -static void *acpi_alloc_wakeup_handler(void); +static void *acpi_alloc_wakeup_handler(void **); static void acpi_stop_beep(void *); #ifdef SMP @@ -91,18 +98,14 @@ #endif #ifdef __amd64__ -#define ACPI_PAGETABLES 3 +#define ACPI_WAKEPAGES 4 #else -#define ACPI_PAGETABLES 0 +#define ACPI_WAKEPAGES 1 #endif -#define WAKECODE_VADDR(sc) \ - ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE)) -#define WAKECODE_PADDR(sc) \ - ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE)) #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ - addr = (type *)(WAKECODE_VADDR(sc) + offset); \ + addr = (type *)(sc->acpi_wakeaddr + (offset)); \ *addr = val; \ } while (0) @@ -119,7 +122,7 @@ acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { struct pcb *pcb; - int vector = (WAKECODE_PADDR(sc) >> 12) & 0xff; + int vector = (sc->acpi_wakephys >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; @@ -162,7 +165,7 @@ /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; - *((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4; + *((volatile u_short *)WARMBOOT_SEG) = sc->acpi_wakephys >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ @@ -191,6 +194,10 @@ { ACPI_STATUS status; struct pcb *pcb; +#ifdef __amd64__ + struct pcpu *pc; + int i; +#endif if (sc->acpi_wakeaddr == 0ul) return (-1); /* couldn't alloc wake memory */ @@ -203,7 +210,7 @@ if (acpi_resume_beep != 0) 
timer_spkr_acquire(); - AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc), 0); + AcpiSetFirmwareWakingVector(sc->acpi_wakephys, 0); intr_suspend(); @@ -211,7 +218,7 @@ if (savectx(pcb)) { #ifdef __amd64__ fpususpend(susppcbs[0]->sp_fpususpend); -#elif defined(DEV_NPX) +#else npxsuspend(susppcbs[0]->sp_fpususpend); #endif #ifdef SMP @@ -220,11 +227,23 @@ return (0); /* couldn't sleep */ } #endif +#ifdef __amd64__ + hw_ibrs_active = 0; + hw_ssb_active = 0; + cpu_stdext_feature3 = 0; + CPU_FOREACH(i) { + pc = pcpu_find(i); + pc->pc_ibpb_set = 0; + } +#endif WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); -#ifndef __amd64__ +#ifdef __amd64__ + WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER) & + ~(EFER_LMA)); +#else WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4); #endif WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); @@ -243,12 +262,21 @@ return (0); /* couldn't sleep */ } + if (acpi_susp_bounce) + resumectx(pcb); + for (;;) ia32_pause(); } else { + /* + * Re-initialize console hardware as soon as possibe. + * No console output (e.g. printf) is allowed before + * this point. + */ + cnresume(); #ifdef __amd64__ fpuresume(susppcbs[0]->sp_fpususpend); -#elif defined(DEV_NPX) +#else npxresume(susppcbs[0]->sp_fpususpend); #endif } @@ -267,10 +295,14 @@ if (!intr_enabled) { /* Wakeup MD procedures in interrupt disabled context */ if (sleep_result == 1) { + ucode_reload(); pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); +#ifdef DEV_APIC + lapic_xapic_mode(); +#endif #ifdef SMP if (!CPU_EMPTY(&suspcpus)) acpi_wakeup_cpus(sc); @@ -300,11 +332,12 @@ } static void * -acpi_alloc_wakeup_handler(void) +acpi_alloc_wakeup_handler(void *wakepages[ACPI_WAKEPAGES]) { - void *wakeaddr; int i; + memset(wakepages, 0, ACPI_WAKEPAGES * sizeof(*wakepages)); + /* * Specify the region for our wakeup code. We want it in the low 1 MB * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA @@ -312,18 +345,18 @@ * and ROM area (0xa0000 and above). The temporary page tables must be * page-aligned. 
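
WAKECODE_FIXUP(), used above, patches fields of the real-mode wakeup
trampoline in place after it has been copied to its low-memory home
(the macro now offsets from sc->acpi_wakeaddr because the trampoline
and the temporary page tables live on separate pages). A trivial
userland analog of that copy-then-patch pattern, with made-up blob and
offset parameters, just to show the idea:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Copy a code/data blob and patch a 32-bit field at a known offset. */
static void *
copy_and_patch(const void *blob, size_t len, size_t offset, uint32_t val)
{
        char *copy;

        copy = malloc(len);
        if (copy == NULL)
                return (NULL);
        memcpy(copy, blob, len);
        /* Equivalent of: *(uint32_t *)(copy + offset) = val; */
        memcpy(copy + offset, &val, sizeof(val));
        return (copy);
}
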
*/ - wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF, - M_WAITOK, 0x500, 0xa0000, PAGE_SIZE, 0ul); - if (wakeaddr == NULL) { - printf("%s: can't alloc wake memory\n", __func__); - return (NULL); + for (i = 0; i < ACPI_WAKEPAGES; i++) { + wakepages[i] = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT, + 0x500, 0xa0000, PAGE_SIZE, 0ul); + if (wakepages[i] == NULL) { + printf("%s: can't alloc wake memory\n", __func__); + goto freepages; + } } if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, EVENTHANDLER_PRI_LAST) == NULL) { printf("%s: can't register event handler\n", __func__); - contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE, - M_DEVBUF); - return (NULL); + goto freepages; } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { @@ -331,15 +364,23 @@ susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); } - return (wakeaddr); + return (wakepages); + +freepages: + for (i = 0; i < ACPI_WAKEPAGES; i++) + if (wakepages[i] != NULL) + contigfree(wakepages[i], PAGE_SIZE, M_DEVBUF); + return (NULL); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { - static void *wakeaddr = NULL; + static void *wakeaddr; + void *wakepages[ACPI_WAKEPAGES]; #ifdef __amd64__ uint64_t *pt4, *pt3, *pt2; + vm_paddr_t pt4pa, pt3pa, pt2pa; int i; #endif @@ -346,24 +387,33 @@ if (wakeaddr != NULL) return; - wakeaddr = acpi_alloc_wakeup_handler(); - if (wakeaddr == NULL) + if (acpi_alloc_wakeup_handler(wakepages) == NULL) return; + wakeaddr = wakepages[0]; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); - bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode)); +#ifdef __amd64__ + pt4 = wakepages[1]; + pt3 = wakepages[2]; + pt2 = wakepages[3]; + pt4pa = vtophys(pt4); + pt3pa = vtophys(pt3); + pt2pa = vtophys(pt2); +#endif + bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode)); + /* Patch GDT base address, ljmp targets. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, - WAKECODE_PADDR(sc) + bootgdt); + sc->acpi_wakephys + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, - WAKECODE_PADDR(sc) + wakeup_32); + sc->acpi_wakephys + wakeup_32); #ifdef __amd64__ WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, - WAKECODE_PADDR(sc) + wakeup_64); - WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys); + sc->acpi_wakephys + wakeup_64); + WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa); #endif /* Save pointers to some global data. */ @@ -375,12 +425,7 @@ WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir)); #endif -#else - /* Build temporary page tables below realmode code. */ - pt4 = wakeaddr; - pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t); - pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t); - +#else /* __amd64__ */ /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* @@ -387,7 +432,7 @@ * Each slot of the level 4 pages points * to the same level 3 page */ - pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE); + pt4[i] = (uint64_t)pt3pa; pt4[i] |= PG_V | PG_RW | PG_U; /* @@ -394,7 +439,7 @@ * Each slot of the level 3 pages points * to the same level 2 page */ - pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE)); + pt3[i] = (uint64_t)pt2pa; pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ @@ -401,7 +446,7 @@ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } -#endif +#endif /* !__amd64__ */ if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n", Modified: trunk/sys/x86/acpica/madt.c =================================================================== --- trunk/sys/x86/acpica/madt.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/acpica/madt.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -26,12 +26,13 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/madt.c 288461 2015-10-01 20:54:19Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/madt.c 340016 2018-11-01 18:34:26Z jhb $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/malloc.h> #include <sys/smp.h> #include <vm/vm.h> @@ -39,7 +40,9 @@ #include <x86/apicreg.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> +#include <machine/md_var.h> +#include <x86/vmware.h> #include <contrib/dev/acpica/include/acpi.h> #include <contrib/dev/acpica/include/aclocal.h> @@ -59,7 +62,7 @@ u_int la_acpi_id; } lapics[MAX_APIC_ID + 1]; -static int madt_found_sci_override; +int madt_found_sci_override; static ACPI_TABLE_MADT *madt; static vm_paddr_t madt_physaddr; static vm_offset_t madt_length; @@ -104,7 +107,7 @@ madt_physaddr = acpi_find_table(ACPI_SIG_MADT); if (madt_physaddr == 0) return (ENXIO); - return (0); + return (-50); } /* @@ -129,8 +132,86 @@ static int madt_setup_local(void) { + ACPI_TABLE_DMAR *dmartbl; + vm_paddr_t dmartbl_physaddr; + const char *reason; + char *hw_vendor; + u_int p[4]; + int user_x2apic; + bool bios_x2apic; madt = pmap_mapbios(madt_physaddr, madt_length); + if ((cpu_feature2 & CPUID2_X2APIC) != 0) { + reason = NULL; + + /* + * Automatically detect several configurations where + * x2APIC mode is known to cause troubles. User can + * override the setting with hw.x2apic_enable tunable. + */ + dmartbl_physaddr = acpi_find_table(ACPI_SIG_DMAR); + if (dmartbl_physaddr != 0) { + dmartbl = acpi_map_table(dmartbl_physaddr, + ACPI_SIG_DMAR); + if ((dmartbl->Flags & ACPI_DMAR_X2APIC_OPT_OUT) != 0) + reason = "by DMAR table"; + acpi_unmap_table(dmartbl); + } + if (vm_guest == VM_GUEST_VMWARE) { + vmware_hvcall(VMW_HVCMD_GETVCPU_INFO, p); + if ((p[0] & VMW_VCPUINFO_VCPU_RESERVED) != 0 || + (p[0] & VMW_VCPUINFO_LEGACY_X2APIC) == 0) + reason = + "inside VMWare without intr redirection"; + } else if (vm_guest == VM_GUEST_XEN) { + reason = "due to running under XEN"; + } else if (vm_guest == VM_GUEST_NO && + CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) == 0x2a) { + hw_vendor = kern_getenv("smbios.planar.maker"); + /* + * It seems that some Lenovo and ASUS + * SandyBridge-based notebook BIOSes have a + * bug which prevents booting AP in x2APIC + * mode. Since the only way to detect mobile + * CPU is to check northbridge pci id, which + * cannot be done that early, disable x2APIC + * for all Lenovo and ASUS SandyBridge + * machines. 
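
The madt_setup_local() additions in this hunk decide whether to run the
local APIC in x2APIC mode: known-problematic configurations (DMAR
opt-out, VMware without interrupt redirection, Xen, the SandyBridge
Lenovo/ASUS BIOS quirk) suppress it, a BIOS that already switched to
x2APIC overrides the quirk list, and the hw.x2apic_enable tunable
handled just below can override everything except turning off a
BIOS-enabled x2APIC. A condensed, illustrative model of that policy as
a pure function (not the kernel code):

#include <stdbool.h>

/*
 * have_quirk:  any of the DMAR/VMware/Xen/SandyBridge cases hit.
 * bios_x2apic: firmware already switched the LAPIC to x2APIC mode.
 * user_set/user_val: state of the hw.x2apic_enable tunable.
 */
static bool
want_x2apic(bool have_quirk, bool bios_x2apic, bool user_set, bool user_val)
{
        bool mode;

        /* A BIOS that already enabled x2APIC overrides the quirk list. */
        if (bios_x2apic)
                have_quirk = false;

        mode = !have_quirk;

        /* The tunable wins, but cannot turn off BIOS-enabled x2APIC. */
        if (user_set && user_val != mode) {
                if (!(bios_x2apic && !user_val))
                        mode = user_val;
        }
        return (mode);
}

For example, want_x2apic(true, false, true, true) returns true: the
tunable re-enables x2APIC despite a detected quirk, matching the code's
behaviour of letting the user override the automatic detection.
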
+ */ + if (hw_vendor != NULL) { + if (!strcmp(hw_vendor, "LENOVO") || + !strcmp(hw_vendor, + "ASUSTeK Computer Inc.")) { + reason = + "for a suspected SandyBridge BIOS bug"; + } + freeenv(hw_vendor); + } + } + bios_x2apic = lapic_is_x2apic(); + if (reason != NULL && bios_x2apic) { + if (bootverbose) + printf("x2APIC should be disabled %s but " + "already enabled by BIOS; enabling.\n", + reason); + reason = NULL; + } + if (reason == NULL) + x2apic_mode = 1; + else if (bootverbose) + printf("x2APIC available but disabled %s\n", reason); + user_x2apic = x2apic_mode; + TUNABLE_INT_FETCH("hw.x2apic_enable", &user_x2apic); + if (user_x2apic != x2apic_mode) { + if (bios_x2apic && !user_x2apic) + printf("x2APIC disabled by tunable and " + "enabled by BIOS; ignoring tunable."); + else + x2apic_mode = user_x2apic; + } + } + lapic_init(madt->Address); printf("ACPI APIC Table: <%.*s %.*s>\n", (int)sizeof(madt->Header.OemId), madt->Header.OemId, @@ -290,10 +371,6 @@ apic->Id); if (ioapics[apic->Id].io_apic != NULL) panic("%s: Double APIC ID %u", __func__, apic->Id); - if (apic->GlobalIrqBase >= FIRST_MSI_INT) { - printf("MADT: Ignoring bogus I/O APIC ID %u", apic->Id); - break; - } ioapics[apic->Id].io_apic = ioapic_create(apic->Address, apic->Id, apic->GlobalIrqBase); ioapics[apic->Id].io_vector = apic->GlobalIrqBase; @@ -396,41 +473,27 @@ return (0); } -/* - * Parse an interrupt source override for an ISA interrupt. - */ -static void -madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr) +void +madt_parse_interrupt_values(void *entry, + enum intr_trigger *trig, enum intr_polarity *pol) { - void *new_ioapic, *old_ioapic; - u_int new_pin, old_pin; - enum intr_trigger trig; - enum intr_polarity pol; + ACPI_MADT_INTERRUPT_OVERRIDE *intr; char buf[64]; - if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 && - intr->GlobalIrq == 2) { - if (bootverbose) - printf("MADT: Skipping timer override\n"); - return; - } + intr = entry; + if (bootverbose) printf("MADT: Interrupt override: source %u, irq %u\n", intr->SourceIrq, intr->GlobalIrq); KASSERT(intr->Bus == 0, ("bus for interrupt overrides must be zero")); - if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) { - printf("MADT: Could not find APIC for vector %u (IRQ %u)\n", - intr->GlobalIrq, intr->SourceIrq); - return; - } /* * Lookup the appropriate trigger and polarity modes for this * entry. */ - trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq); - pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq); - + *trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq); + *pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq); + /* * If the SCI is identity mapped but has edge trigger and * active-hi polarity or the force_sci_lo tunable is set, @@ -440,30 +503,57 @@ madt_found_sci_override = 1; if (getenv_string("hw.acpi.sci.trigger", buf, sizeof(buf))) { if (tolower(buf[0]) == 'e') - trig = INTR_TRIGGER_EDGE; + *trig = INTR_TRIGGER_EDGE; else if (tolower(buf[0]) == 'l') - trig = INTR_TRIGGER_LEVEL; + *trig = INTR_TRIGGER_LEVEL; else panic( "Invalid trigger %s: must be 'edge' or 'level'", buf); printf("MADT: Forcing SCI to %s trigger\n", - trig == INTR_TRIGGER_EDGE ? "edge" : "level"); + *trig == INTR_TRIGGER_EDGE ? 
"edge" : "level"); } if (getenv_string("hw.acpi.sci.polarity", buf, sizeof(buf))) { if (tolower(buf[0]) == 'h') - pol = INTR_POLARITY_HIGH; + *pol = INTR_POLARITY_HIGH; else if (tolower(buf[0]) == 'l') - pol = INTR_POLARITY_LOW; + *pol = INTR_POLARITY_LOW; else panic( "Invalid polarity %s: must be 'high' or 'low'", buf); printf("MADT: Forcing SCI to active %s polarity\n", - pol == INTR_POLARITY_HIGH ? "high" : "low"); + *pol == INTR_POLARITY_HIGH ? "high" : "low"); } } +} +/* + * Parse an interrupt source override for an ISA interrupt. + */ +static void +madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr) +{ + void *new_ioapic, *old_ioapic; + u_int new_pin, old_pin; + enum intr_trigger trig; + enum intr_polarity pol; + + if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 && + intr->GlobalIrq == 2) { + if (bootverbose) + printf("MADT: Skipping timer override\n"); + return; + } + + if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) { + printf("MADT: Could not find APIC for vector %u (IRQ %u)\n", + intr->GlobalIrq, intr->SourceIrq); + return; + } + + madt_parse_interrupt_values(intr, &trig, &pol); + /* Remap the IRQ if it is mapped to a different interrupt vector. */ if (intr->SourceIrq != intr->GlobalIrq) { /* @@ -510,7 +600,7 @@ if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS)) ioapic_set_triggermode(ioapic, pin, interrupt_trigger(nmi->IntiFlags, 0)); - if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS)) + if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS)) ioapic_set_polarity(ioapic, pin, interrupt_polarity(nmi->IntiFlags, 0)); } Modified: trunk/sys/x86/acpica/srat.c =================================================================== --- trunk/sys/x86/acpica/srat.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/acpica/srat.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -27,8 +27,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/srat.c 299485 2016-05-11 22:06:28Z vangyzen $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/srat.c 322996 2017-08-29 07:01:15Z mav $"); +#include "opt_vm.h" + #include <sys/param.h> #include <sys/bus.h> #include <sys/kernel.h> @@ -47,7 +49,7 @@ #include <contrib/dev/acpica/include/actables.h> #include <machine/intr_machdep.h> -#include <machine/apicvar.h> +#include <x86/apicvar.h> #include <dev/acpica/acpivar.h> @@ -64,11 +66,102 @@ static ACPI_TABLE_SRAT *srat; static vm_paddr_t srat_physaddr; -static int vm_domains[VM_PHYSSEG_MAX]; +static int domain_pxm[MAXMEMDOM]; +static int ndomain; +static ACPI_TABLE_SLIT *slit; +static vm_paddr_t slit_physaddr; +static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; + static void srat_walk_table(acpi_subtable_handler *handler, void *arg); /* + * SLIT parsing. + */ + +static void +slit_parse_table(ACPI_TABLE_SLIT *s) +{ + int i, j; + int i_domain, j_domain; + int offset = 0; + uint8_t e; + + /* + * This maps the SLIT data into the VM-domain centric view. + * There may be sparse entries in the PXM namespace, so + * remap them to a VM-domain ID and if it doesn't exist, + * skip it. + * + * It should result in a packed 2d array of VM-domain + * locality information entries. 
+ */ + + if (bootverbose) + printf("SLIT.Localities: %d\n", (int) s->LocalityCount); + for (i = 0; i < s->LocalityCount; i++) { + i_domain = acpi_map_pxm_to_vm_domainid(i); + if (i_domain < 0) + continue; + + if (bootverbose) + printf("%d: ", i); + for (j = 0; j < s->LocalityCount; j++) { + j_domain = acpi_map_pxm_to_vm_domainid(j); + if (j_domain < 0) + continue; + e = s->Entry[i * s->LocalityCount + j]; + if (bootverbose) + printf("%d ", (int) e); + /* 255 == "no locality information" */ + if (e == 255) + vm_locality_table[offset] = -1; + else + vm_locality_table[offset] = e; + offset++; + } + if (bootverbose) + printf("\n"); + } +} + +/* + * Look for an ACPI System Locality Distance Information Table ("SLIT") + */ +static int +parse_slit(void) +{ + + if (resource_disabled("slit", 0)) { + return (-1); + } + + slit_physaddr = acpi_find_table(ACPI_SIG_SLIT); + if (slit_physaddr == 0) { + return (-1); + } + + /* + * Make a pass over the table to populate the cpus[] and + * mem_info[] tables. + */ + slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT); + slit_parse_table(slit); + acpi_unmap_table(slit); + slit = NULL; + +#ifdef VM_NUMA_ALLOC + /* Tell the VM about it! */ + mem_locality = vm_locality_table; +#endif + return (0); +} + +/* + * SRAT parsing. + */ + +/* * Returns true if a memory range overlaps with at least one range in * phys_avail[]. */ @@ -78,7 +171,7 @@ int i; for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) { - if (phys_avail[i + 1] < start) + if (phys_avail[i + 1] <= start) continue; if (phys_avail[i] < end) return (1); @@ -110,6 +203,12 @@ "enabled" : "disabled"); if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) break; + if (cpu->ApicId > MAX_APIC_ID) { + printf("SRAT: Ignoring local APIC ID %u (too high)\n", + cpu->ApicId); + break; + } + if (cpus[cpu->ApicId].enabled) { printf("SRAT: Duplicate local APIC ID %u\n", cpu->ApicId); @@ -128,6 +227,12 @@ "enabled" : "disabled"); if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED)) break; + if (x2apic->ApicId > MAX_APIC_ID) { + printf("SRAT: Ignoring local APIC ID %u (too high)\n", + x2apic->ApicId); + break; + } + KASSERT(!cpus[x2apic->ApicId].enabled, ("Duplicate local APIC ID %u", x2apic->ApicId)); cpus[x2apic->ApicId].domain = x2apic->ProximityDomain; @@ -137,7 +242,7 @@ mem = (ACPI_SRAT_MEM_AFFINITY *)entry; if (bootverbose) printf( - "SRAT: Found memory domain %d addr %jx len %jx: %s\n", + "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n", mem->ProximityDomain, (uintmax_t)mem->BaseAddress, (uintmax_t)mem->Length, (mem->Flags & ACPI_SRAT_MEM_ENABLED) ? @@ -146,7 +251,7 @@ break; if (!overlaps_phys_avail(mem->BaseAddress, mem->BaseAddress + mem->Length)) { - printf("SRAT: Ignoring memory at addr %jx\n", + printf("SRAT: Ignoring memory at addr 0x%jx\n", (uintmax_t)mem->BaseAddress); break; } @@ -243,7 +348,7 @@ address = mem_info[i].end + 1; } } - printf("SRAT: No memory region found for %jx - %jx\n", + printf("SRAT: No memory region found for 0x%jx - 0x%jx\n", (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]); return (ENXIO); } @@ -258,48 +363,47 @@ int i, j, slot; /* Enumerate all the domains. */ - vm_ndomains = 0; + ndomain = 0; for (i = 0; i < num_mem; i++) { /* See if this domain is already known. 
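
One small fix earlier in this hunk is easy to miss: overlaps_phys_avail()
now skips a phys_avail[] segment that ends exactly at the start of the
queried range (the '<' became '<='), i.e. the segments are treated as
half-open intervals. The generic form of that overlap test, as an
illustrative helper that is not part of the commit:

#include <stdbool.h>
#include <stdint.h>

/* True iff [a_start, a_end) and [b_start, b_end) share any byte. */
static bool
ranges_overlap(uint64_t a_start, uint64_t a_end,
    uint64_t b_start, uint64_t b_end)
{
        return (a_start < b_end && b_start < a_end);
}
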
*/ - for (j = 0; j < vm_ndomains; j++) { - if (vm_domains[j] >= mem_info[i].domain) + for (j = 0; j < ndomain; j++) { + if (domain_pxm[j] >= mem_info[i].domain) break; } - if (j < vm_ndomains && vm_domains[j] == mem_info[i].domain) + if (j < ndomain && domain_pxm[j] == mem_info[i].domain) continue; - /* Insert the new domain at slot 'j'. */ - slot = j; - for (j = vm_ndomains; j > slot; j--) - vm_domains[j] = vm_domains[j - 1]; - vm_domains[slot] = mem_info[i].domain; - vm_ndomains++; - if (vm_ndomains > MAXMEMDOM) { - vm_ndomains = 1; + if (ndomain >= MAXMEMDOM) { + ndomain = 1; printf("SRAT: Too many memory domains\n"); return (EFBIG); } + + /* Insert the new domain at slot 'j'. */ + slot = j; + for (j = ndomain; j > slot; j--) + domain_pxm[j] = domain_pxm[j - 1]; + domain_pxm[slot] = mem_info[i].domain; + ndomain++; } - /* Renumber each domain to its index in the sorted 'domains' list. */ - for (i = 0; i < vm_ndomains; i++) { + /* Renumber each domain to its index in the sorted 'domain_pxm' list. */ + for (i = 0; i < ndomain; i++) { /* * If the domain is already the right value, no need * to renumber. */ - if (vm_domains[i] == i) + if (domain_pxm[i] == i) continue; /* Walk the cpu[] and mem_info[] arrays to renumber. */ for (j = 0; j < num_mem; j++) - if (mem_info[j].domain == vm_domains[i]) + if (mem_info[j].domain == domain_pxm[i]) mem_info[j].domain = i; for (j = 0; j <= MAX_APIC_ID; j++) - if (cpus[j].enabled && cpus[j].domain == vm_domains[i]) + if (cpus[j].enabled && cpus[j].domain == domain_pxm[i]) cpus[j].domain = i; } - KASSERT(vm_ndomains > 0, - ("renumber_domains: invalid final vm_ndomains setup")); return (0); } @@ -307,17 +411,17 @@ /* * Look for an ACPI System Resource Affinity Table ("SRAT") */ -static void -parse_srat(void *dummy) +static int +parse_srat(void) { int error; if (resource_disabled("srat", 0)) - return; + return (-1); srat_physaddr = acpi_find_table(ACPI_SIG_SRAT); if (srat_physaddr == 0) - return; + return (-1); /* * Make a pass over the table to populate the cpus[] and @@ -331,15 +435,44 @@ if (error || check_domains() != 0 || check_phys_avail() != 0 || renumber_domains() != 0) { srat_physaddr = 0; - return; + return (-1); } +#ifdef VM_NUMA_ALLOC /* Point vm_phys at our memory affinity table. */ + vm_ndomains = ndomain; mem_affinity = mem_info; +#endif + + return (0); } -SYSINIT(parse_srat, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_srat, NULL); static void +init_mem_locality(void) +{ + int i; + + /* + * For now, assume -1 == "no locality information for + * this pairing. + */ + for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++) + vm_locality_table[i] = -1; +} + +static void +parse_acpi_tables(void *dummy) +{ + + if (parse_srat() < 0) + return; + init_mem_locality(); + (void) parse_slit(); +} +SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables, + NULL); + +static void srat_walk_table(acpi_subtable_handler *handler, void *arg) { @@ -348,7 +481,7 @@ } /* - * Setup per-CPU ACPI IDs. + * Setup per-CPU domain IDs. 
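
renumber_domains() above keeps domain_pxm[] as a sorted, de-duplicated
list of the proximity domains seen in the SRAT and then refers to each
domain by its index in that list, which is what
acpi_map_pxm_to_vm_domainid() returns. A compact userland model of the
same bookkeeping, with hypothetical helper names; the -1 "table full"
result mirrors the kernel's EFBIG path:

/* Insert pxm into a sorted table; returns the new count or -1 if full. */
static int
pxm_table_insert(int *domain_pxm, int ndomain, int maxdom, int pxm)
{
        int i, j;

        for (i = 0; i < ndomain; i++) {
                if (domain_pxm[i] == pxm)
                        return (ndomain);       /* already known */
                if (domain_pxm[i] > pxm)
                        break;                  /* insert at slot i */
        }
        if (ndomain >= maxdom)
                return (-1);
        for (j = ndomain; j > i; j--)
                domain_pxm[j] = domain_pxm[j - 1];
        domain_pxm[i] = pxm;
        return (ndomain + 1);
}

/* Dense domain ID for a PXM, or -1 if the PXM was never seen. */
static int
pxm_to_domainid(const int *domain_pxm, int ndomain, int pxm)
{
        int i;

        for (i = 0; i < ndomain; i++)
                if (domain_pxm[i] == pxm)
                        return (i);
        return (-1);
}
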
*/ static void srat_set_cpus(void *dummy) @@ -369,6 +502,7 @@ panic("SRAT: CPU with APIC ID %u is not known", pc->pc_apic_id); pc->pc_domain = cpu->domain; + CPU_SET(i, &cpuset_domain[cpu->domain]); if (bootverbose) printf("SRAT: CPU %u has memory domain %d\n", i, cpu->domain); @@ -386,8 +520,8 @@ { int i; - for (i = 0; i < vm_ndomains; i++) { - if (vm_domains[i] == pxm) + for (i = 0; i < ndomain; i++) { + if (domain_pxm[i] == pxm) return (i); } @@ -394,4 +528,13 @@ return (-1); } +#else /* MAXMEMDOM == 1 */ + +int +acpi_map_pxm_to_vm_domainid(int pxm) +{ + + return (-1); +} + #endif /* MAXMEMDOM > 1 */ Modified: trunk/sys/x86/bios/smbios.c =================================================================== --- trunk/sys/x86/bios/smbios.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/bios/smbios.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/bios/smbios.c 241073 2012-09-30 15:42:20Z kevlo $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/bios/smbios.c 241073 2012-09-30 15:42:20Z kevlo $"); #include <sys/param.h> #include <sys/systm.h> Modified: trunk/sys/x86/bios/vpd.c =================================================================== --- trunk/sys/x86/bios/vpd.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/bios/vpd.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/bios/vpd.c 227309 2011-11-07 15:43:11Z ed $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/bios/vpd.c 227309 2011-11-07 15:43:11Z ed $"); /* * VPD decoder for IBM systems (Thinkpads) Modified: trunk/sys/x86/cpufreq/est.c =================================================================== --- trunk/sys/x86/cpufreq/est.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/cpufreq/est.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/est.c 260473 2014-01-09 10:44:27Z mav $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/est.c 259197 2013-12-10 20:25:43Z mav $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/x86/cpufreq/hwpstate.c =================================================================== --- trunk/sys/x86/cpufreq/hwpstate.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/cpufreq/hwpstate.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -45,7 +45,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/hwpstate.c 326638 2017-12-06 21:40:24Z jkim $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/hwpstate.c 326637 2017-12-06 21:39:01Z jkim $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/x86/cpufreq/p4tcc.c =================================================================== --- trunk/sys/x86/cpufreq/p4tcc.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/cpufreq/p4tcc.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -38,7 +38,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/p4tcc.c 250487 2013-05-10 22:43:27Z hiren $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/p4tcc.c 250487 2013-05-10 22:43:27Z hiren $"); #include <sys/param.h> #include <sys/systm.h> Modified: trunk/sys/x86/cpufreq/powernow.c =================================================================== --- trunk/sys/x86/cpufreq/powernow.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/cpufreq/powernow.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/powernow.c 
305615 2016-09-08 15:06:28Z pfg $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/powernow.c 305614 2016-09-08 15:05:25Z pfg $"); #include <sys/param.h> #include <sys/bus.h> Modified: trunk/sys/x86/cpufreq/smist.c =================================================================== --- trunk/sys/x86/cpufreq/smist.c 2020-02-08 19:33:27 UTC (rev 12311) +++ trunk/sys/x86/cpufreq/smist.c 2020-02-08 19:34:34 UTC (rev 12312) @@ -37,7 +37,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/smist.c 187597 2009-01-22 20:29:07Z jkim $"); +__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/smist.c 297793 2016-04-10 23:07:00Z pfg $"); #include <sys/param.h> #include <sys/bus.h> @@ -224,7 +224,7 @@ bus_dma_tag_destroy(tag); device_printf(dev, "can't load mem\n"); return (ENXIO); - }; + } DPRINT(dev, "taking ownership over BIOS return %d\n", cb_data.result); bus_dmamap_unload(tag, map); bus_dmamem_free(tag, cb_data.buf, map); From laffer1 at midnightbsd.org Sat Feb 8 14:35:05 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:35:05 -0500 (EST) Subject: [Midnightbsd-cvs] src [12313] trunk/sys/xdr: sync with FreeBSD 11-stable Message-ID: <202002081935.018JZ5Fa062440@stargazer.midnightbsd.org> Revision: 12313 http://svnweb.midnightbsd.org/src/?rev=12313 Author: laffer1 Date: 2020-02-08 14:35:04 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/xdr/xdr.c trunk/sys/xdr/xdr_array.c trunk/sys/xdr/xdr_mbuf.c trunk/sys/xdr/xdr_mem.c trunk/sys/xdr/xdr_reference.c trunk/sys/xdr/xdr_sizeof.c Modified: trunk/sys/xdr/xdr.c =================================================================== --- trunk/sys/xdr/xdr.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /* $NetBSD: xdr.c,v 1.22 2000/07/06 03:10:35 christos Exp $ */ /* @@ -34,7 +35,7 @@ static char *sccsid = "@(#)xdr.c 2.1 88/07/29 4.0 RPCSRC"; #endif #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr.c 319614 2017-06-06 07:21:33Z delphij $"); /* * xdr.c, Generic XDR routines implementation. Modified: trunk/sys/xdr/xdr_array.c =================================================================== --- trunk/sys/xdr/xdr_array.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr_array.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /* $NetBSD: xdr_array.c,v 1.12 2000/01/22 22:19:18 mycroft Exp $ */ /* @@ -34,7 +35,7 @@ static char *sccsid = "@(#)xdr_array.c 2.1 88/07/29 4.0 RPCSRC"; #endif #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_array.c 177633 2008-03-26 15:23:12Z dfr $"); /* * xdr_array.c, Generic XDR routines impelmentation. 
Modified: trunk/sys/xdr/xdr_mbuf.c =================================================================== --- trunk/sys/xdr/xdr_mbuf.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr_mbuf.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson <dfr at rabson.org> @@ -26,7 +27,7 @@ */ #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_mbuf.c 248318 2013-03-15 10:21:18Z glebius $"); #include <sys/param.h> #include <sys/systm.h> Modified: trunk/sys/xdr/xdr_mem.c =================================================================== --- trunk/sys/xdr/xdr_mem.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr_mem.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /* $NetBSD: xdr_mem.c,v 1.15 2000/01/22 22:19:18 mycroft Exp $ */ /* @@ -34,7 +35,7 @@ static char *sccsid = "@(#)xdr_mem.c 2.1 88/07/29 4.0 RPCSRC"; #endif #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_mem.c 297975 2016-04-14 17:06:37Z pfg $"); /* * xdr_mem.h, XDR implementation using memory buffers. Modified: trunk/sys/xdr/xdr_reference.c =================================================================== --- trunk/sys/xdr/xdr_reference.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr_reference.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /* $NetBSD: xdr_reference.c,v 1.13 2000/01/22 22:19:18 mycroft Exp $ */ /* @@ -34,7 +35,7 @@ static char *sccsid = "@(#)xdr_reference.c 2.1 88/07/29 4.0 RPCSRC"; #endif #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_reference.c 177633 2008-03-26 15:23:12Z dfr $"); /* * xdr_reference.c, Generic XDR routines impelmentation. Modified: trunk/sys/xdr/xdr_sizeof.c =================================================================== --- trunk/sys/xdr/xdr_sizeof.c 2020-02-08 19:34:34 UTC (rev 12312) +++ trunk/sys/xdr/xdr_sizeof.c 2020-02-08 19:35:04 UTC (rev 12313) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /* * Sun RPC is a product of Sun Microsystems, Inc. 
and is provided for * unrestricted use provided that this legend is included on all tape @@ -36,7 +37,7 @@ */ #include <sys/cdefs.h> -__MBSDID("$MidnightBSD$"); +__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_sizeof.c 177633 2008-03-26 15:23:12Z dfr $"); #include <sys/param.h> #include <sys/systm.h> From laffer1 at midnightbsd.org Sat Feb 8 14:35:49 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:35:49 -0500 (EST) Subject: [Midnightbsd-cvs] src [12314] trunk/sys/vm: sync with FreeBSD 11-stable Message-ID: <202002081935.018JZnSh062792@stargazer.midnightbsd.org> Revision: 12314 http://svnweb.midnightbsd.org/src/?rev=12314 Author: laffer1 Date: 2020-02-08 14:35:48 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/vm/_vm_radix.h trunk/sys/vm/default_pager.c trunk/sys/vm/device_pager.c trunk/sys/vm/memguard.c trunk/sys/vm/memguard.h trunk/sys/vm/phys_pager.c trunk/sys/vm/pmap.h trunk/sys/vm/redzone.c trunk/sys/vm/redzone.h trunk/sys/vm/sg_pager.c trunk/sys/vm/swap_pager.c trunk/sys/vm/swap_pager.h trunk/sys/vm/uma.h trunk/sys/vm/uma_core.c trunk/sys/vm/uma_dbg.c trunk/sys/vm/uma_dbg.h trunk/sys/vm/uma_int.h trunk/sys/vm/vm.h trunk/sys/vm/vm_extern.h trunk/sys/vm/vm_fault.c trunk/sys/vm/vm_glue.c trunk/sys/vm/vm_init.c trunk/sys/vm/vm_kern.c trunk/sys/vm/vm_kern.h trunk/sys/vm/vm_map.c trunk/sys/vm/vm_map.h trunk/sys/vm/vm_meter.c trunk/sys/vm/vm_mmap.c trunk/sys/vm/vm_object.c trunk/sys/vm/vm_object.h trunk/sys/vm/vm_page.c trunk/sys/vm/vm_page.h trunk/sys/vm/vm_pageout.c trunk/sys/vm/vm_pageout.h trunk/sys/vm/vm_pager.c trunk/sys/vm/vm_pager.h trunk/sys/vm/vm_param.h trunk/sys/vm/vm_phys.c trunk/sys/vm/vm_phys.h trunk/sys/vm/vm_radix.c trunk/sys/vm/vm_radix.h trunk/sys/vm/vm_reserv.c trunk/sys/vm/vm_reserv.h trunk/sys/vm/vm_unix.c trunk/sys/vm/vm_zeroidle.c trunk/sys/vm/vnode_pager.c trunk/sys/vm/vnode_pager.h Added Paths: ----------- trunk/sys/vm/vm_domain.c trunk/sys/vm/vm_domain.h trunk/sys/vm/vm_swapout.c trunk/sys/vm/vm_swapout_dummy.c Modified: trunk/sys/vm/_vm_radix.h =================================================================== --- trunk/sys/vm/_vm_radix.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/_vm_radix.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $ + * $FreeBSD: stable/11/sys/vm/_vm_radix.h 321513 2017-07-26 06:52:45Z kib $ */ #ifndef __VM_RADIX_H_ @@ -37,20 +37,6 @@ */ struct vm_radix { uintptr_t rt_root; - uint8_t rt_flags; }; -#define RT_INSERT_INPROG 0x01 -#define RT_TRIE_MODIFIED 0x02 - -#ifdef _KERNEL - -static __inline boolean_t -vm_radix_is_empty(struct vm_radix *rtree) -{ - - return (rtree->rt_root == 0); -} - -#endif /* _KERNEL */ #endif /* !__VM_RADIX_H_ */ Modified: trunk/sys/vm/default_pager.c =================================================================== --- trunk/sys/vm/default_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/default_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -28,18 +28,10 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * The default pager is responsible for supplying backing store to unbacked - * storage. The backing store is usually swap so we just fall through to - * the swap routines. 
However, since swap metadata has not been assigned, - * the swap routines assign and manage the swap backing store through the - * vm_page->swapblk field. The object is only converted when the page is - * physically freed after having been cleaned and even then vm_page->swapblk - * is maintained whenever a resident page also has swap backing store. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/default_pager.c 315473 2017-03-18 05:38:10Z alc $"); #include <sys/param.h> #include <sys/systm.h> @@ -54,14 +46,16 @@ #include <vm/vm_pager.h> #include <vm/swap_pager.h> -static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t, - vm_ooffset_t, struct ucred *); -static void default_pager_dealloc(vm_object_t); -static int default_pager_getpages(vm_object_t, vm_page_t *, int, int); -static void default_pager_putpages(vm_object_t, vm_page_t *, int, - boolean_t, int *); -static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *, - int *); +static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t, + vm_ooffset_t, struct ucred *); +static void default_pager_dealloc(vm_object_t); +static int default_pager_getpages(vm_object_t, vm_page_t *, int, + int *, int *); +static void default_pager_putpages(vm_object_t, vm_page_t *, int, + boolean_t, int *); +static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *, + int *); + /* * pagerops for OBJT_DEFAULT - "default pager". * @@ -84,7 +78,7 @@ }; /* - * no_pager_alloc just returns an initialized object. + * Return an initialized object. */ static vm_object_t default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, @@ -102,51 +96,41 @@ object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(round_page(offset + size))); if (cred != NULL) { - VM_OBJECT_WLOCK(object); object->cred = cred; object->charge = size; - VM_OBJECT_WUNLOCK(object); } return (object); } /* - * deallocate resources associated with default objects. The default objects - * have no special resources allocated to them, but the vm_page's being used - * in this object might. Still, we do not have to do anything - we will free - * the swapblk in the underlying vm_page's when we free the vm_page or - * garbage collect the vm_page cache list. + * Deallocate resources associated with the object. */ static void -default_pager_dealloc(object) - vm_object_t object; +default_pager_dealloc(vm_object_t object) { - /* - * OBJT_DEFAULT objects have no special resources allocated to them. - */ + + /* Reserved swap is released by vm_object_destroy(). */ object->type = OBJT_DEAD; } /* - * Load pages from backing store. Since OBJT_DEFAULT is converted to - * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never - * see a vm_page with assigned swap here. + * Load pages from backing store. */ static int -default_pager_getpages(object, m, count, reqpage) - vm_object_t object; - vm_page_t *m; - int count; - int reqpage; +default_pager_getpages(vm_object_t object, vm_page_t *m, int count, + int *rbehind, int *rahead) { - return VM_PAGER_FAIL; + + /* + * Since an OBJT_DEFAULT object is converted to OBJT_SWAP by the first + * call to the putpages method, this function will never be called on + * a vm_page with assigned swap. + */ + return (VM_PAGER_FAIL); } /* - * Store pages to backing store. We should assign swap and initiate - * I/O. We do not actually convert the object to OBJT_SWAP here. 
The - * object will be converted when the written-out vm_page_t is moved from the - * cache to the free list. + * Store pages to backing store. */ static void default_pager_putpages(vm_object_t object, vm_page_t *m, int count, @@ -153,28 +137,20 @@ int flags, int *rtvals) { + /* The swap pager will convert the object to OBJT_SWAP. */ swappagerops.pgo_putpages(object, m, count, flags, rtvals); } /* - * Tell us whether the backing store for the requested (object,index) is - * synchronized. i.e. tell us whether we can throw the page away and - * reload it later. So, for example, if we are in the process of writing - * the page to its backing store, or if no backing store has been assigned, - * it is not yet synchronized. - * - * It is possible to have fully-synchronized swap assigned without the - * object having been converted. We just call swap_pager_haspage() to - * deal with it since it must already deal with it plus deal with swap - * meta-data structures. + * Tell us whether the requested (object,index) is available from the object's + * backing store. */ static boolean_t -default_pager_haspage(object, pindex, before, after) - vm_object_t object; - vm_pindex_t pindex; - int *before; - int *after; +default_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, + int *after) { - return FALSE; + + /* An OBJT_DEFAULT object has no backing store. */ + return (FALSE); } Modified: trunk/sys/vm/device_pager.c =================================================================== --- trunk/sys/vm/device_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/device_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/device_pager.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <sys/systm.h> @@ -47,6 +47,7 @@ #include <sys/mman.h> #include <sys/rwlock.h> #include <sys/sx.h> +#include <sys/vmmeter.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -60,10 +61,12 @@ static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dev_pager_dealloc(vm_object_t); -static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int); +static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dev_pager_free_page(vm_object_t object, vm_page_t m); +static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; @@ -85,6 +88,7 @@ .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, + .pgo_populate = dev_pager_populate, }; static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, @@ -128,6 +132,8 @@ if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE) return (NULL); + KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL, + ("populate on unmanaged device pager")); /* * Offset should be page aligned. @@ -135,8 +141,18 @@ if (foff & PAGE_MASK) return (NULL); + /* + * Treat the mmap(2) file offset as an unsigned value for a + * device mapping. This, in effect, allows a user to pass all + * possible off_t values as the mapping cookie to the driver. 
At + * this point, we know that both foff and size are a multiple + * of the page size. Do a check to avoid wrap. + */ size = round_page(size); - pindex = OFF_TO_IDX(foff + size); + pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size); + if (pindex > OBJ_MAX_SIZE || pindex < UOFF_TO_IDX(foff) || + pindex < UOFF_TO_IDX(size)) + return (NULL); if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0) return (NULL); @@ -169,6 +185,11 @@ */ if (pindex > object->size) object->size = pindex; + KASSERT(object->type == tp, + ("Inconsistent device pager type %p %d", + object, tp)); + KASSERT(object->un_pager.devp.ops == ops, + ("Inconsistent devops %p %p", object, ops)); } else { object = object1; object1 = NULL; @@ -175,12 +196,14 @@ object->handle = handle; TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); - KASSERT(object->type == tp, - ("Inconsistent device pager type %p %d", object, tp)); + if (ops->cdev_pg_populate != NULL) + vm_object_set_flag(object, OBJ_POPULATE); } } else { if (pindex > object->size) object->size = pindex; + KASSERT(object->type == tp, + ("Inconsistent device pager type %p %d", object, tp)); } mtx_unlock(&dev_pager_mtx); if (object1 != NULL) { @@ -256,34 +279,35 @@ } static int -dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int reqpage) +dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, + int *rahead) { - int error, i; + int error; + /* Since our haspage reports zero after/before, the count is 1. */ + KASSERT(count == 1, ("%s: count %d", __func__, count)); VM_OBJECT_ASSERT_WLOCKED(object); + if (object->un_pager.devp.ops->cdev_pg_fault == NULL) + return (VM_PAGER_FAIL); error = object->un_pager.devp.ops->cdev_pg_fault(object, - IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]); + IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]); VM_OBJECT_ASSERT_WLOCKED(object); - for (i = 0; i < count; i++) { - if (i != reqpage) { - vm_page_lock(ma[i]); - vm_page_free(ma[i]); - vm_page_unlock(ma[i]); - } - } - if (error == VM_PAGER_OK) { KASSERT((object->type == OBJT_DEVICE && - (ma[reqpage]->oflags & VPO_UNMANAGED) != 0) || + (ma[0]->oflags & VPO_UNMANAGED) != 0) || (object->type == OBJT_MGTDEVICE && - (ma[reqpage]->oflags & VPO_UNMANAGED) == 0), - ("Wrong page type %p %p", ma[reqpage], object)); + (ma[0]->oflags & VPO_UNMANAGED) == 0), + ("Wrong page type %p %p", ma[0], object)); if (object->type == OBJT_DEVICE) { TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, - ma[reqpage], plinks.q); + ma[0], plinks.q); } + if (rbehind) + *rbehind = 0; + if (rahead) + *rahead = 0; } return (error); @@ -290,6 +314,18 @@ } static int +dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, + vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + + VM_OBJECT_ASSERT_WLOCKED(object); + if (object->un_pager.devp.ops->cdev_pg_populate == NULL) + return (VM_PAGER_FAIL); + return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx, + fault_type, max_prot, first, last)); +} + +static int old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { @@ -355,8 +391,7 @@ */ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); - if (vm_page_replace(page, object, (*mres)->pindex) != *mres) - panic("old_dev_pager_fault: invalid page replacement"); + vm_page_replace_checked(page, object, (*mres)->pindex, *mres); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); Modified: trunk/sys/vm/memguard.c 
=================================================================== --- trunk/sys/vm/memguard.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/memguard.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/memguard.c 331017 2018-03-15 19:08:33Z kevans $"); /* * MemGuard is a simple replacement allocator for debugging only @@ -50,6 +50,7 @@ #include <sys/malloc.h> #include <sys/sysctl.h> #include <sys/vmem.h> +#include <sys/vmmeter.h> #include <vm/vm.h> #include <vm/uma.h> @@ -68,9 +69,9 @@ * reserved for MemGuard. */ static u_int vm_memguard_divisor; -SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN, +SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_memguard_divisor, - 0, "(kmem_size/memguard_divisor) == memguard submap size"); + 0, "(kmem_size/memguard_divisor) == memguard submap size"); /* * Short description (ks_shortdesc) of memory type to monitor. @@ -131,8 +132,7 @@ #define MG_GUARD_ALLLARGE 0x002 #define MG_GUARD_NOFREE 0x004 static int memguard_options = MG_GUARD_AROUND; -TUNABLE_INT("vm.memguard.options", &memguard_options); -SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW, +SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RWTUN, &memguard_options, 0, "MemGuard options:\n" "\t0x001 - add guard pages around each allocation\n" @@ -148,8 +148,7 @@ static u_int memguard_frequency; static u_long memguard_frequency_hits; -TUNABLE_INT("vm.memguard.frequency", &memguard_frequency); -SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RW, +SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RWTUN, &memguard_frequency, 0, "Times in 100000 that MemGuard will randomly run"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD, &memguard_frequency_hits, 0, "# times MemGuard randomly chose"); @@ -165,6 +164,7 @@ u_long mem_pgs, parent_size; vm_memguard_divisor = 10; + /* CTFLAG_RDTUN doesn't work during the early boot process. */ TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor); parent_size = vm_map_max(parent_map) - vm_map_min(parent_map) + @@ -180,7 +180,7 @@ * This prevents memguard's page promotions from completely * using up memory, since most malloc(9) calls are sub-page. */ - mem_pgs = cnt.v_page_count; + mem_pgs = vm_cnt.v_page_count; memguard_physlimit = (mem_pgs / vm_memguard_divisor) * PAGE_SIZE; /* * We want as much KVA as we can take safely. Use at most our Modified: trunk/sys/vm/memguard.h =================================================================== --- trunk/sys/vm/memguard.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/memguard.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -24,7 +24,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $ + * $FreeBSD: stable/11/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $ */ #ifndef _VM_MEMGUARD_H_ Modified: trunk/sys/vm/phys_pager.c =================================================================== --- trunk/sys/vm/phys_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/phys_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -25,7 +25,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/phys_pager.c 327785 2018-01-10 20:39:26Z markj $"); #include <sys/param.h> #include <sys/systm.h> @@ -42,6 +42,7 @@ #include <vm/vm_param.h> #include <vm/vm_object.h> #include <vm/vm_page.h> +#include <vm/vm_pageout.h> #include <vm/vm_pager.h> /* list of phys pager objects */ @@ -99,6 +100,7 @@ object = object1; object1 = NULL; object->handle = handle; + vm_object_set_flag(object, OBJ_POPULATE); TAILQ_INSERT_TAIL(&phys_pager_object_list, object, pager_object_list); } @@ -110,6 +112,7 @@ vm_object_deallocate(object1); } else { object = vm_object_allocate(OBJT_PHYS, pindex); + vm_object_set_flag(object, OBJ_POPULATE); } return (object); @@ -134,7 +137,8 @@ * Fill as many pages as vm_fault has allocated for us. */ static int -phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) +phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, + int *rahead) { int i; @@ -149,35 +153,98 @@ ("phys_pager_getpages: partially valid page %p", m[i])); KASSERT(m[i]->dirty == 0, ("phys_pager_getpages: dirty page %p", m[i])); - /* The requested page must remain busy, the others not. */ - if (i == reqpage) { - vm_page_lock(m[i]); - vm_page_flash(m[i]); - vm_page_unlock(m[i]); - } else - vm_page_xunbusy(m[i]); } + if (rbehind) + *rbehind = 0; + if (rahead) + *rahead = 0; return (VM_PAGER_OK); } -static void -phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, - int *rtvals) -{ - - panic("phys_pager_putpage called"); -} - /* * Implement a pretty aggressive clustered getpages strategy. Hint that * everything in an entire 4MB window should be prefaulted at once. * - * XXX 4MB (1024 slots per page table page) is convenient for x86, + * 4MB (1024 slots per page table page) is convenient for x86, * but may not be for other arches. */ #ifndef PHYSCLUSTER #define PHYSCLUSTER 1024 #endif +static int phys_pager_cluster = PHYSCLUSTER; +SYSCTL_INT(_vm, OID_AUTO, phys_pager_cluster, CTLFLAG_RWTUN, + &phys_pager_cluster, 0, + "prefault window size for phys pager"); + +/* + * Max hint to vm_page_alloc() about the further allocation needs + * inside the phys_pager_populate() loop. The number of bits used to + * implement VM_ALLOC_COUNT() determines the hard limit on this value. + * That limit is currently 65535. 
+ */ +#define PHYSALLOC 16 + +static int +phys_pager_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first, + vm_pindex_t *last) +{ + vm_page_t m; + vm_pindex_t base, end, i; + int ahead; + + base = rounddown(pidx, phys_pager_cluster); + end = base + phys_pager_cluster - 1; + if (end >= object->size) + end = object->size - 1; + if (*first > base) + base = *first; + if (end > *last) + end = *last; + *first = base; + *last = end; + + for (i = base; i <= end; i++) { +retry: + m = vm_page_lookup(object, i); + if (m == NULL) { + ahead = MIN(end - i, PHYSALLOC); + m = vm_page_alloc(object, i, VM_ALLOC_NORMAL | + VM_ALLOC_ZERO | VM_ALLOC_WAITFAIL | + VM_ALLOC_COUNT(ahead)); + if (m == NULL) + goto retry; + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + m->valid = VM_PAGE_BITS_ALL; + } else if (vm_page_xbusied(m)) { + vm_page_lock(m); + VM_OBJECT_WUNLOCK(object); + vm_page_busy_sleep(m, "physb", true); + VM_OBJECT_WLOCK(object); + goto retry; + } else { + vm_page_xbusy(m); + if (m->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(m, TRUE); + } + + KASSERT(m->valid == VM_PAGE_BITS_ALL, + ("phys_pager_populate: partially valid page %p", m)); + KASSERT(m->dirty == 0, + ("phys_pager_populate: dirty page %p", m)); + } + return (VM_PAGER_OK); +} + +static void +phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, + int *rtvals) +{ + + panic("phys_pager_putpage called"); +} + static boolean_t phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) @@ -184,8 +251,8 @@ { vm_pindex_t base, end; - base = pindex & (~(PHYSCLUSTER - 1)); - end = base + (PHYSCLUSTER - 1); + base = rounddown(pindex, phys_pager_cluster); + end = base + phys_pager_cluster - 1; if (before != NULL) *before = pindex - base; if (after != NULL) @@ -200,4 +267,5 @@ .pgo_getpages = phys_pager_getpages, .pgo_putpages = phys_pager_putpages, .pgo_haspage = phys_pager_haspage, + .pgo_populate = phys_pager_populate, }; Modified: trunk/sys/vm/pmap.h =================================================================== --- trunk/sys/vm/pmap.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/pmap.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $ + * $FreeBSD: stable/11/sys/vm/pmap.h 331722 2018-03-29 02:50:57Z eadler $ */ /* @@ -101,10 +101,22 @@ /* * Flags for pmap_enter(). The bits in the low-order byte are reserved * for the protection code (vm_prot_t) that describes the fault type. + * Bits 24 through 31 are reserved for the pmap's internal use. */ -#define PMAP_ENTER_NOSLEEP 0x0100 -#define PMAP_ENTER_WIRED 0x0200 +#define PMAP_ENTER_NOSLEEP 0x00000100 +#define PMAP_ENTER_WIRED 0x00000200 +#define PMAP_ENTER_RESERVED 0xFF000000 +/* + * Define the maximum number of machine-dependent reference bits that are + * cleared by a call to pmap_ts_referenced(). This limit serves two purposes. + * First, it bounds the cost of reference bit maintenance on widely shared + * pages. Second, it prevents numeric overflow during maintenance of a + * widely shared page's "act_count" field. An overflow could result in the + * premature deactivation of the page. 
+ */ +#define PMAP_TS_REFERENCED_MAX 5 + void pmap_activate(struct thread *td); void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice); @@ -142,6 +154,8 @@ void pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void pmap_qenter(vm_offset_t, vm_page_t *, int); void pmap_qremove(vm_offset_t, int); +vm_offset_t pmap_quick_enter_page(vm_page_t); +void pmap_quick_remove_page(vm_offset_t); void pmap_release(pmap_t); void pmap_remove(pmap_t, vm_offset_t, vm_offset_t); void pmap_remove_all(vm_page_t m); Modified: trunk/sys/vm/redzone.c =================================================================== --- trunk/sys/vm/redzone.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/redzone.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/redzone.c 267992 2014-06-28 03:56:17Z hselasky $"); #include <sys/param.h> #include <sys/systm.h> @@ -42,8 +42,7 @@ SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem, 0, "Extra memory allocated by redzone"); static int redzone_panic = 0; -TUNABLE_INT("vm.redzone.panic", &redzone_panic); -SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0, +SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RWTUN, &redzone_panic, 0, "Panic when buffer corruption is detected"); #define REDZONE_CHSIZE (16) Modified: trunk/sys/vm/redzone.h =================================================================== --- trunk/sys/vm/redzone.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/redzone.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $ + * $FreeBSD: stable/11/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $ */ #ifndef _VM_REDZONE_H_ Modified: trunk/sys/vm/sg_pager.c =================================================================== --- trunk/sys/vm/sg_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/sg_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/sg_pager.c 331017 2018-03-15 19:08:33Z kevans $"); /* * This pager manages OBJT_SG objects. These objects are backed by @@ -39,6 +39,8 @@ #include <sys/mutex.h> #include <sys/rwlock.h> #include <sys/sglist.h> +#include <sys/vmmeter.h> + #include <vm/vm.h> #include <vm/vm_param.h> #include <vm/vm_object.h> @@ -50,7 +52,7 @@ static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void sg_pager_dealloc(vm_object_t); -static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int); +static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void sg_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *, @@ -97,8 +99,9 @@ * to map beyond that. 
*/ size = round_page(size); - pindex = OFF_TO_IDX(foff + size); - if (pindex > npages) + pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size); + if (pindex > npages || pindex < UOFF_TO_IDX(foff) || + pindex < UOFF_TO_IDX(size)) return (NULL); /* @@ -136,7 +139,8 @@ } static int -sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) +sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, + int *rahead) { struct sglist *sg; vm_page_t m_paddr, page; @@ -146,11 +150,13 @@ size_t space; int i; + /* Since our haspage reports zero after/before, the count is 1. */ + KASSERT(count == 1, ("%s: count %d", __func__, count)); VM_OBJECT_ASSERT_WLOCKED(object); sg = object->handle; memattr = object->memattr; VM_OBJECT_WUNLOCK(object); - offset = m[reqpage]->pindex; + offset = m[0]->pindex; /* * Lookup the physical address of the requested page. An initial @@ -179,7 +185,7 @@ } /* Return a fake page for the requested page. */ - KASSERT(!(m[reqpage]->flags & PG_FICTITIOUS), + KASSERT(!(m[0]->flags & PG_FICTITIOUS), ("backing page for SG is fake")); /* Construct a new fake page. */ @@ -186,19 +192,18 @@ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q); - - /* Free the original pages and insert this fake page into the object. */ - for (i = 0; i < count; i++) { - if (i == reqpage && - vm_page_replace(page, object, offset) != m[i]) - panic("sg_pager_getpages: invalid place replacement"); - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } - m[reqpage] = page; + vm_page_replace_checked(page, object, offset, m[0]); + vm_page_lock(m[0]); + vm_page_free(m[0]); + vm_page_unlock(m[0]); + m[0] = page; page->valid = VM_PAGE_BITS_ALL; + if (rbehind) + *rbehind = 0; + if (rahead) + *rahead = 0; + return (VM_PAGER_OK); } Modified: trunk/sys/vm/swap_pager.c =================================================================== --- trunk/sys/vm/swap_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/swap_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -68,7 +68,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/swap_pager.c 350355 2019-07-26 10:36:07Z kib $"); #include "opt_swap.h" #include "opt_vm.h" @@ -87,10 +87,12 @@ #include <sys/namei.h> #include <sys/vnode.h> #include <sys/malloc.h> +#include <sys/pctrie.h> #include <sys/racct.h> #include <sys/resource.h> #include <sys/resourcevar.h> #include <sys/rwlock.h> +#include <sys/sbuf.h> #include <sys/sysctl.h> #include <sys/sysproto.h> #include <sys/blist.h> @@ -120,7 +122,7 @@ * The 64-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER -#define MAX_PAGEOUT_CLUSTER 16 +#define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) @@ -127,22 +129,17 @@ #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif +#define SWAP_META_PAGES PCTRIE_COUNT + /* - * The swblock structure maps an object and a small, fixed-size range - * of page indices to disk addresses within a swap area. - * The collection of these mappings is implemented as a hash table. - * Unused disk addresses within a swap area are allocated and managed - * using a blist. + * A swblk structure maps each page index within a + * SWAP_META_PAGES-aligned and sized range to the address of an + * on-disk swap block (or SWAPBLK_NONE). The collection of these + * mappings for an entire vm object is implemented as a pc-trie. 
*/ -#define SWAP_META_PAGES (SWB_NPAGES * 2) -#define SWAP_META_MASK (SWAP_META_PAGES - 1) - -struct swblock { - struct swblock *swb_hnext; - vm_object_t swb_object; - vm_pindex_t swb_index; - int swb_count; - daddr_t swb_pages[SWAP_META_PAGES]; +struct swblk { + vm_pindex_t p; + daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); @@ -151,7 +148,7 @@ static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; -static int swdev_syscall_active = 0; /* serialize swap(on|off) */ +static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static vm_ooffset_t swap_total; SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, @@ -160,7 +157,7 @@ SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, "Amount of swap storage needed to back all allocated anonymous memory."); static int overcommit = 0; -SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, +SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; @@ -210,7 +207,7 @@ mtx_lock(&sw_dev_mtx); r = swap_reserved + incr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { - s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count; + s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count; s *= PAGE_SIZE; } else s = 0; @@ -223,16 +220,14 @@ mtx_unlock(&sw_dev_mtx); if (res) { - PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && - uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) && + uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) res = 0; else uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); - PROC_UNLOCK(curproc); if (!res) { mtx_lock(&sw_dev_mtx); swap_reserved -= incr; @@ -314,12 +309,10 @@ racct_sub_cred(cred, RACCT_SWAP, decr); } -static void swapdev_strategy(struct buf *, struct swdevt *sw); - #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ -int swap_pager_full = 2; /* swap space exhaustion (task killing) */ +static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ @@ -327,17 +320,17 @@ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ -static struct swblock **swhash; -static int swhash_mask; -static struct mtx swhash_mtx; +static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | + CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", + "Maximum running async swap ops"); +static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", + "Swap Fragmentation Info"); -static int swap_async_max = 4; /* maximum in-progress async I/O's */ static struct sx sw_alloc_sx; - -SYSCTL_INT(_vm, OID_AUTO, swap_async_max, - CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); - /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. 
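[Editor's aside, not part of the commit: the hunks above replace the old swblock
hash with a per-object pc-trie of struct swblk, each covering a
SWAP_META_PAGES-aligned run of page indices. The minimal sketch below only
illustrates how a page index resolves to an on-disk swap address under that
layout, mirroring the rdpi/modpi arithmetic used by swp_pager_meta_build()
later in this diff. The helper name, the lookup callback, and the concrete
SWAP_META_PAGES value are illustrative assumptions; in the commit the size
comes from PCTRIE_COUNT and lookups go through SWAP_PCTRIE_LOOKUP().]

#include <stddef.h>
#include <stdint.h>

#define SWAP_META_PAGES 32              /* illustrative; the commit uses PCTRIE_COUNT */
#define SWAPBLK_NONE    ((int64_t)-1)   /* "no swap assigned" sentinel */

struct swblk {
        uint64_t p;                     /* first page index covered by this block */
        int64_t  d[SWAP_META_PAGES];    /* per-page swap addresses, or SWAPBLK_NONE */
};

/*
 * Resolve an object page index to its swap address, given a lookup
 * routine over the per-object trie keyed by the aligned index (rdpi).
 */
int64_t
swblk_addr(struct swblk *(*lookup)(uint64_t rdpi), uint64_t pindex)
{
        struct swblk *sb;
        uint64_t rdpi, modpi;

        rdpi = pindex - (pindex % SWAP_META_PAGES);     /* rounddown() */
        modpi = pindex % SWAP_META_PAGES;
        sb = lookup(rdpi);
        if (sb == NULL)
                return (SWAPBLK_NONE);
        return (sb->d[modpi]);
}

[A swblk whose d[] entries are all SWAPBLK_NONE is freed back to swblk_zone,
which is why swp_pager_meta_build() below scans the array after clearing a
slot. End of aside; the commit diff resumes.]
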
@@ -348,9 +341,9 @@ #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) -static struct mtx sw_alloc_mtx; /* protect list manipulation */ static struct pagerlst swap_pager_object_list[NOBJLISTS]; -static uma_zone_t swap_zone; +static uma_zone_t swblk_zone; +static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure @@ -361,7 +354,10 @@ swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); -static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); +static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, + int *); +static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, + int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); @@ -374,6 +370,7 @@ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ + .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ @@ -391,7 +388,7 @@ static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); -static int swapongeom(struct thread *, struct vnode *); +static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); @@ -404,22 +401,28 @@ /* * Metadata functions */ -static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); -static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); +static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); +static void * +swblk_trie_alloc(struct pctrie *ptree) +{ + + return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? + M_USE_RESERVE : 0))); +} + static void -swp_pager_free_nrpage(vm_page_t m) +swblk_trie_free(struct pctrie *ptree, void *node) { - vm_page_lock(m); - if (m->wire_count == 0) - vm_page_free(m); - vm_page_unlock(m); + uma_zfree(swpctrie_zone, node); } +PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); + /* * SWP_SIZECHECK() - update swap_pager_full indication * @@ -448,33 +451,6 @@ } /* - * SWP_PAGER_HASH() - hash swap meta data - * - * This is an helper function which hashes the swapblk given - * the object and page index. It returns a pointer to a pointer - * to the object, or a pointer to a NULL pointer if it could not - * find a swapblk. 
- */ -static struct swblock ** -swp_pager_hash(vm_object_t object, vm_pindex_t index) -{ - struct swblock **pswap; - struct swblock *swap; - - index &= ~(vm_pindex_t)SWAP_META_MASK; - pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; - while ((swap = *pswap) != NULL) { - if (swap->swb_object == object && - swap->swb_index == index - ) { - break; - } - pswap = &swap->swb_hnext; - } - return (pswap); -} - -/* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run @@ -491,9 +467,9 @@ for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); - mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); + sx_init(&swdev_syscall_lock, "swsysc"); } /* @@ -539,21 +515,25 @@ mtx_unlock(&pbuf_mtx); /* - * Initialize our zone. Right now I'm just guessing on the number - * we need based on the number of pages in the system. Each swblock - * can hold 32 pages, so this is probably overkill. This reservation - * is typically limited to around 32MB by default. + * Initialize our zone, taking the user's requested size or + * estimating the number we need based on the number of pages + * in the system. */ - n = cnt.v_page_count / 2; - if (maxswzone && n > maxswzone / sizeof(struct swblock)) - n = maxswzone / sizeof(struct swblock); + n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : + vm_cnt.v_page_count / 2; + swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, + pctrie_zone_init, NULL, UMA_ALIGN_PTR, + UMA_ZONE_NOFREE | UMA_ZONE_VM); + if (swpctrie_zone == NULL) + panic("failed to create swap pctrie zone."); + swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, + NULL, NULL, _Alignof(struct swblk) - 1, + UMA_ZONE_NOFREE | UMA_ZONE_VM); + if (swblk_zone == NULL) + panic("failed to create swap blk zone."); n2 = n; - swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); - if (swap_zone == NULL) - panic("failed to create swap_zone."); do { - if (uma_zone_reserve_kva(swap_zone, n)) + if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the @@ -561,25 +541,50 @@ */ n -= ((n + 2) / 3); } while (n > 0); - if (n2 != n) - printf("Swap zone entries reduced from %lu to %lu.\n", n2, n); + + /* + * Often uma_zone_reserve_kva() cannot reserve exactly the + * requested size. Account for the difference when + * calculating swap_maxpages. + */ + n = uma_zone_get_max(swblk_zone); + + if (n < n2) + printf("Swap blk zone entries changed from %lu to %lu.\n", + n2, n); swap_maxpages = n * SWAP_META_PAGES; - swzone = n * sizeof(struct swblock); - n2 = n; + swzone = n * sizeof(struct swblk); + if (!uma_zone_reserve_kva(swpctrie_zone, n)) + printf("Cannot reserve swap pctrie zone, " + "reduce kern.maxswzone.\n"); +} +static vm_object_t +swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, + vm_ooffset_t offset) +{ + vm_object_t object; + + if (cred != NULL) { + if (!swap_reserve_by_cred(size, cred)) + return (NULL); + crhold(cred); + } + /* - * Initialize our meta-data hash table. The swapper does not need to - * be quite as efficient as the VM system, so we do not use an - * oversized hash table. 
- * - * n: size of hash table, must be power of 2 - * swhash_mask: hash table index mask + * The un_pager.swp.swp_blks trie is initialized by + * vm_object_allocate() to ensure the correct order of + * visibility to other threads. */ - for (n = 1; n < n2 / 8; n *= 2) - ; - swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); - swhash_mask = n - 1; - mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); + object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + + PAGE_MASK + size)); + + object->handle = handle; + if (cred != NULL) { + object->cred = cred; + object->charge = size; + } + return (object); } /* @@ -587,13 +592,11 @@ * its metadata structures. * * This routine is called from the mmap and fork code to create a new - * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object - * and then converting it with swp_pager_meta_build(). + * OBJT_SWAP object. * - * This routine may block in vm_object_allocate() and create a named - * object lookup race, so we must interlock. - * - * MPSAFE + * This routine must ensure that no live duplicate is created for + * the named object request, which is protected against by + * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, @@ -600,11 +603,8 @@ vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; - vm_pindex_t pindex; - pindex = OFF_TO_IDX(offset + PAGE_MASK + size); - if (handle) { - mtx_lock(&Giant); + if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() @@ -614,40 +614,16 @@ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { - if (cred != NULL) { - if (!swap_reserve_by_cred(size, cred)) { - sx_xunlock(&sw_alloc_sx); - mtx_unlock(&Giant); - return (NULL); - } - crhold(cred); + object = swap_pager_alloc_init(handle, cred, size, + offset); + if (object != NULL) { + TAILQ_INSERT_TAIL(NOBJLIST(object->handle), + object, pager_object_list); } - object = vm_object_allocate(OBJT_DEFAULT, pindex); - VM_OBJECT_WLOCK(object); - object->handle = handle; - if (cred != NULL) { - object->cred = cred; - object->charge = size; - } - swp_pager_meta_build(object, 0, SWAPBLK_NONE); - VM_OBJECT_WUNLOCK(object); } sx_xunlock(&sw_alloc_sx); - mtx_unlock(&Giant); } else { - if (cred != NULL) { - if (!swap_reserve_by_cred(size, cred)) - return (NULL); - crhold(cred); - } - object = vm_object_allocate(OBJT_DEFAULT, pindex); - VM_OBJECT_WLOCK(object); - if (cred != NULL) { - object->cred = cred; - object->charge = size; - } - swp_pager_meta_build(object, 0, SWAPBLK_NONE); - VM_OBJECT_WUNLOCK(object); + object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } @@ -666,17 +642,22 @@ swap_pager_dealloc(vm_object_t object) { + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); + /* * Remove from list right away so lookups will fail if we block for * pageout completion. 
*/ if (object->handle != NULL) { - mtx_lock(&sw_alloc_mtx); - TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); - mtx_unlock(&sw_alloc_mtx); + VM_OBJECT_WUNLOCK(object); + sx_xlock(&sw_alloc_sx); + TAILQ_REMOVE(NOBJLIST(object->handle), object, + pager_object_list); + sx_xunlock(&sw_alloc_sx); + VM_OBJECT_WLOCK(object); } - VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "swpdea"); /* @@ -763,11 +744,8 @@ mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { - bp->b_kvaalloc = bp->b_data; bp->b_data = unmapped_buf; - bp->b_kvabase = unmapped_buf; bp->b_offset = 0; - bp->b_flags |= B_UNMAPPED; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); @@ -815,6 +793,36 @@ } /* + * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats + */ +static int +sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sbuf; + struct swdevt *sp; + const char *devname; + int error; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + mtx_lock(&sw_dev_mtx); + TAILQ_FOREACH(sp, &swtailq, sw_list) { + if (vn_isdisk(sp->sw_vp, NULL)) + devname = devtoname(sp->sw_vp->v_rdev); + else + devname = "[file]"; + sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); + blist_stats(sp->sw_blist, &sbuf); + } + mtx_unlock(&sw_dev_mtx); + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + return (error); +} + +/* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * @@ -906,16 +914,19 @@ * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ - if (destroysource) { - if (srcobject->handle != NULL) { - mtx_lock(&sw_alloc_mtx); - TAILQ_REMOVE( - NOBJLIST(srcobject->handle), - srcobject, - pager_object_list - ); - mtx_unlock(&sw_alloc_mtx); - } + if (destroysource && srcobject->handle != NULL) { + vm_object_pip_add(srcobject, 1); + VM_OBJECT_WUNLOCK(srcobject); + vm_object_pip_add(dstobject, 1); + VM_OBJECT_WUNLOCK(dstobject); + sx_xlock(&sw_alloc_sx); + TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, + pager_object_list); + sx_xunlock(&sw_alloc_sx); + VM_OBJECT_WLOCK(dstobject); + vm_object_pip_wakeup(dstobject); + VM_OBJECT_WLOCK(srcobject); + vm_object_pip_wakeup(srcobject); } /* @@ -970,7 +981,7 @@ /* * Free left over swap blocks in source. * - * We have to revert the type to OBJT_DEFAULT so we do not accidently + * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { @@ -993,22 +1004,21 @@ * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing - * store exists before and after the requested page within a reasonable - * distance. We do not try to restrict it to the swap device stripe - * (that is handled in getpages/putpages). It probably isn't worth - * doing here. + * store exists before and after the requested page. */ static boolean_t -swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) +swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, + int *after) { - daddr_t blk0; + daddr_t blk, blk0; + int i; VM_OBJECT_ASSERT_LOCKED(object); + /* * do we have good backing store at the requested index ? 
*/ blk0 = swp_pager_meta_ctl(object, pindex, 0); - if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; @@ -1021,11 +1031,7 @@ * find backwards-looking contiguous good backing store */ if (before != NULL) { - int i; - - for (i = 1; i < (SWB_NPAGES/2); ++i) { - daddr_t blk; - + for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); @@ -1032,7 +1038,7 @@ if (blk != blk0 - i) break; } - *before = (i - 1); + *before = i - 1; } /* @@ -1039,16 +1045,12 @@ * find forward-looking contiguous good backing store */ if (after != NULL) { - int i; - - for (i = 1; i < (SWB_NPAGES/2); ++i) { - daddr_t blk; - + for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } - *after = (i - 1); + *after = i - 1; } return (TRUE); } @@ -1080,134 +1082,130 @@ } /* - * SWAP_PAGER_GETPAGES() - bring pages in from swap + * swap_pager_getpages() - bring pages in from swap * - * Attempt to retrieve (m, count) pages from backing store, but make - * sure we retrieve at least m[reqpage]. We try to load in as large - * a chunk surrounding m[reqpage] as is contiguous in swap and which - * belongs to the same object. + * Attempt to page in the pages in array "ma" of length "count". The + * caller may optionally specify that additional pages preceding and + * succeeding the specified range be paged in. The number of such pages + * is returned in the "rbehind" and "rahead" parameters, and they will + * be in the inactive queue upon return. * - * The code is designed for asynchronous operation and - * immediate-notification of 'reqpage' but tends not to be - * used that way. Please do not optimize-out this algorithmic - * feature, I intend to improve on it in the future. - * - * The parent has a single vm_object_pip_add() reference prior to - * calling us and we should return with the same. - * - * The parent has BUSY'd the pages. We should return with 'm' - * left busy, but the others adjusted. + * The pages in "ma" must be busied and will remain busied upon return. */ static int -swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) +swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, + int *rahead) { struct buf *bp; - vm_page_t mreq; - int i; - int j; + vm_page_t bm, mpred, msucc, p; + vm_pindex_t pindex; daddr_t blk; + int i, maxahead, maxbehind, reqcount; - mreq = m[reqpage]; + reqcount = count; - KASSERT(mreq->object == object, - ("swap_pager_getpages: object mismatch %p/%p", - object, mreq->object)); + /* + * Determine the final number of read-behind pages and + * allocate them BEFORE releasing the object lock. Otherwise, + * there can be a problematic race with vm_object_split(). + * Specifically, vm_object_split() might first transfer pages + * that precede ma[0] in the current object to a new object, + * and then this function incorrectly recreates those pages as + * read-behind pages in the current object. + */ + if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) + return (VM_PAGER_FAIL); /* - * Calculate range to retrieve. The pages have already been assigned - * their swapblks. We require a *contiguous* range but we know it to - * not span devices. If we do not supply it, bad things - * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the - * loops are set up such that the case(s) are handled implicitly. - * - * The swp_*() calls must be made with the object locked. 
+ * Clip the readahead and readbehind ranges to exclude resident pages. */ - blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); - - for (i = reqpage - 1; i >= 0; --i) { - daddr_t iblk; - - iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); - if (blk != iblk + (reqpage - i)) - break; + if (rahead != NULL) { + KASSERT(reqcount - 1 <= maxahead, + ("page count %d extends beyond swap block", reqcount)); + *rahead = imin(*rahead, maxahead - (reqcount - 1)); + pindex = ma[reqcount - 1]->pindex; + msucc = TAILQ_NEXT(ma[reqcount - 1], listq); + if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) + *rahead = msucc->pindex - pindex - 1; } - ++i; + if (rbehind != NULL) { + *rbehind = imin(*rbehind, maxbehind); + pindex = ma[0]->pindex; + mpred = TAILQ_PREV(ma[0], pglist, listq); + if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) + *rbehind = pindex - mpred->pindex - 1; + } - for (j = reqpage + 1; j < count; ++j) { - daddr_t jblk; + bm = ma[0]; + for (i = 0; i < count; i++) + ma[i]->oflags |= VPO_SWAPINPROG; - jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); - if (blk != jblk - (j - reqpage)) - break; - } - /* - * free pages outside our collection range. Note: we never free - * mreq, it must remain busy throughout. + * Allocate readahead and readbehind pages. */ - if (0 < i || j < count) { - int k; - - for (k = 0; k < i; ++k) - swp_pager_free_nrpage(m[k]); - for (k = j; k < count; ++k) - swp_pager_free_nrpage(m[k]); + if (rbehind != NULL) { + for (i = 1; i <= *rbehind; i++) { + p = vm_page_alloc(object, ma[0]->pindex - i, + VM_ALLOC_NORMAL); + if (p == NULL) + break; + p->oflags |= VPO_SWAPINPROG; + bm = p; + } + *rbehind = i - 1; } + if (rahead != NULL) { + for (i = 0; i < *rahead; i++) { + p = vm_page_alloc(object, + ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); + if (p == NULL) + break; + p->oflags |= VPO_SWAPINPROG; + } + *rahead = i; + } + if (rbehind != NULL) + count += *rbehind; + if (rahead != NULL) + count += *rahead; - /* - * Return VM_PAGER_FAIL if we have nothing to do. Return mreq - * still busy, but the others unbusied. - */ - if (blk == SWAPBLK_NONE) - return (VM_PAGER_FAIL); + vm_object_pip_add(object, count); - /* - * Getpbuf() can sleep. - */ + pindex = bm->pindex; + blk = swp_pager_meta_ctl(object, pindex, 0); + KASSERT(blk != SWAPBLK_NONE, + ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); + VM_OBJECT_WUNLOCK(object); - /* - * Get a swap buffer header to perform the IO - */ bp = getpbuf(&nsw_rcount); + /* Pages cannot leave the object while busy. */ + for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { + MPASS(p->pindex == bm->pindex + i); + bp->b_pages[i] = p; + } + bp->b_flags |= B_PAGING; - bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); - bp->b_blkno = blk - (reqpage - i); - bp->b_bcount = PAGE_SIZE * (j - i); - bp->b_bufsize = PAGE_SIZE * (j - i); - bp->b_pager.pg_reqpage = reqpage - i; + bp->b_blkno = blk; + bp->b_bcount = PAGE_SIZE * count; + bp->b_bufsize = PAGE_SIZE * count; + bp->b_npages = count; + bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; + bp->b_pgafter = rahead != NULL ? 
*rahead : 0; - VM_OBJECT_WLOCK(object); - { - int k; - - for (k = i; k < j; ++k) { - bp->b_pages[k - i] = m[k]; - m[k]->oflags |= VPO_SWAPINPROG; - } - } - bp->b_npages = j - i; - PCPU_INC(cnt.v_swapin); - PCPU_ADD(cnt.v_swappgsin, bp->b_npages); + PCPU_ADD(cnt.v_swappgsin, count); /* - * We still hold the lock on mreq, and our automatic completion routine - * does not remove it. - */ - vm_object_pip_add(object, bp->b_npages); - VM_OBJECT_WUNLOCK(object); - - /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * - * The other pages in our m[] array are also released on completion, + * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy @@ -1216,13 +1214,13 @@ swp_pager_strategy(bp); /* - * wait for the page we want to complete. VPO_SWAPINPROG is always + * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE - * is set in the meta-data. + * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); - while ((mreq->oflags & VPO_SWAPINPROG) != 0) { - mreq->oflags |= VPO_SWAPSLEEP; + while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { + ma[0]->oflags |= VPO_SWAPSLEEP; PCPU_INC(cnt.v_intrans); if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, "swread", hz * 20)) { @@ -1233,16 +1231,14 @@ } /* - * mreq is left busied after completion, but all the other pages - * are freed. If we had an unrecoverable read error the page will - * not be valid. + * If we had an unrecoverable read error pages will not be valid. */ - if (mreq->valid != VM_PAGE_BITS_ALL) { - return (VM_PAGER_ERROR); - } else { - return (VM_PAGER_OK); - } + for (i = 0; i < reqcount; i++) + if (ma[i]->valid != VM_PAGE_BITS_ALL) + return (VM_PAGER_ERROR); + return (VM_PAGER_OK); + /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark @@ -1252,6 +1248,39 @@ } /* + * swap_pager_getpages_async(): + * + * Right now this is emulation of asynchronous operation on top of + * swap_pager_getpages(). + */ +static int +swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) +{ + int r, error; + + r = swap_pager_getpages(object, ma, count, rbehind, rahead); + VM_OBJECT_WUNLOCK(object); + switch (r) { + case VM_PAGER_OK: + error = 0; + break; + case VM_PAGER_ERROR: + error = EIO; + break; + case VM_PAGER_FAIL: + error = EINVAL; + break; + default: + panic("unhandled swap_pager_getpages() error %d", r); + } + (iodone)(arg, ma, count, error); + VM_OBJECT_WLOCK(object); + + return (r); +} + +/* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. @@ -1273,17 +1302,17 @@ * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. 
*/ -void -swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, +static void +swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { int i, n; boolean_t sync; - if (count && m[0]->object != object) { + if (count && ma[0]->object != object) { panic("swap_pager_putpages: object mismatch %p/%p", object, - m[0]->object + ma[0]->object ); } @@ -1307,39 +1336,6 @@ /* * Step 2 * - * Update nsw parameters from swap_async_max sysctl values. - * Do not let the sysop crash the machine with bogus numbers. - */ - mtx_lock(&pbuf_mtx); - if (swap_async_max != nsw_wcount_async_max) { - int n; - - /* - * limit range - */ - if ((n = swap_async_max) > nswbuf / 2) - n = nswbuf / 2; - if (n < 1) - n = 1; - swap_async_max = n; - - /* - * Adjust difference ( if possible ). If the current async - * count is too low, we may not be able to make the adjustment - * at this time. - */ - n -= nsw_wcount_async_max; - if (nsw_wcount_async + n >= 0) { - nsw_wcount_async += n; - nsw_wcount_async_max += n; - wakeup(&nsw_wcount_async); - } - } - mtx_unlock(&pbuf_mtx); - - /* - * Step 3 - * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. @@ -1394,7 +1390,7 @@ VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { - vm_page_t mreq = m[i+j]; + vm_page_t mreq = ma[i+j]; swp_pager_meta_build( mreq->object, @@ -1402,8 +1398,6 @@ blk + j ); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); - rtvals[i+j] = VM_PAGER_OK; - mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } @@ -1419,6 +1413,16 @@ PCPU_ADD(cnt.v_swappgsout, bp->b_npages); /* + * We unconditionally set rtvals[] to VM_PAGER_PEND so that we + * can call the async completion routine at the end of a + * synchronous I/O operation. Otherwise, our caller would + * perform duplicate unbusy and wakeup operations on the page + * and object, respectively. + */ + for (j = 0; j < n; j++) + rtvals[i + j] = VM_PAGER_PEND; + + /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy @@ -1427,10 +1431,6 @@ bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); - - for (j = 0; j < n; ++j) - rtvals[i+j] = VM_PAGER_PEND; - /* restart outter loop */ continue; } @@ -1443,14 +1443,10 @@ swp_pager_strategy(bp); /* - * Wait for the sync I/O to complete, then update rtvals. - * We just set the rtvals[] to VM_PAGER_PEND so we can call - * our async completion routine at the end, thus avoiding a - * double-free. + * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); - for (j = 0; j < n; ++j) - rtvals[i+j] = VM_PAGER_PEND; + /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. @@ -1491,12 +1487,10 @@ /* * remove the mapping for kernel virtual */ - if ((bp->b_flags & B_UNMAPPED) != 0) { - bp->b_data = bp->b_kvaalloc; - bp->b_kvabase = bp->b_kvaalloc; - bp->b_flags &= ~B_UNMAPPED; - } else + if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); + else + bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; @@ -1529,33 +1523,11 @@ */ if (bp->b_iocmd == BIO_READ) { /* - * When reading, reqpage needs to stay - * locked for the parent, but all other - * pages can be freed. We still want to - * wakeup the parent waiting on the page, - * though. ( also: pg_reqpage can be -1 and - * not match anything ). 
- * - * We have to wake specifically requested pages - * up too because we cleared VPO_SWAPINPROG and - * someone may be waiting for that. - * * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; - if (i != bp->b_pager.pg_reqpage) - swp_pager_free_nrpage(m); - else { - vm_page_lock(m); - vm_page_flash(m); - vm_page_unlock(m); - } - /* - * If i == bp->b_pager.pg_reqpage, do not wake - * the page up. The caller needs to. - */ } else { /* * If a write error occurs, reactivate page @@ -1562,7 +1534,7 @@ * so it doesn't clog the inactive list, * then finish the I/O. */ - vm_page_dirty(m); + MPASS(m->dirty == VM_PAGE_BITS_ALL); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); @@ -1577,54 +1549,33 @@ * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. - * - * If not the requested page then deactivate it. - * - * Note that the requested page, reqpage, is left - * busied, but we still have to wake it up. The - * other pages are released (unbusied) by - * vm_page_xunbusy(). */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); - m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); - /* - * We have to wake specifically requested pages - * up too because we cleared VPO_SWAPINPROG and - * could be waiting for it in getpages. However, - * be sure to not unbusy getpages specifically - * requested page - getpages expects it to be - * left busy. - */ - if (i != bp->b_pager.pg_reqpage) { - vm_page_lock(m); - vm_page_deactivate(m); - vm_page_unlock(m); - vm_page_xunbusy(m); - } else { - vm_page_lock(m); - vm_page_flash(m); - vm_page_unlock(m); - } + m->valid = VM_PAGE_BITS_ALL; + if (i < bp->b_pgbefore || + i >= bp->b_npages - bp->b_pgafter) + vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). + * A page is only written to swap after a period of + * inactivity. Therefore, we do not expect it to be + * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); + vm_page_lock(m); + vm_page_deactivate_noreuse(m); + vm_page_unlock(m); vm_page_sunbusy(m); - if (vm_page_count_severe()) { - vm_page_lock(m); - vm_page_try_to_cache(m); - vm_page_unlock(m); - } } } @@ -1661,51 +1612,17 @@ } /* - * swap_pager_isswapped: + * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * - * Return 1 if at least one page in the given object is paged - * out to the given swap device. + * This routine dissociates the page at the given index within an object + * from its backing store, paging it in if it does not reside in memory. + * If the page is paged in, it is marked dirty and placed in the laundry + * queue. The page is marked dirty because it no longer has backing + * store. It is placed in the laundry queue because it has not been + * accessed recently. Otherwise, it would already reside in memory. * - * This routine may not sleep. 
- */ -int -swap_pager_isswapped(vm_object_t object, struct swdevt *sp) -{ - daddr_t index = 0; - int bcount; - int i; - - VM_OBJECT_ASSERT_WLOCKED(object); - if (object->type != OBJT_SWAP) - return (0); - - mtx_lock(&swhash_mtx); - for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { - struct swblock *swap; - - if ((swap = *swp_pager_hash(object, index)) != NULL) { - for (i = 0; i < SWAP_META_PAGES; ++i) { - if (swp_pager_isondev(swap->swb_pages[i], sp)) { - mtx_unlock(&swhash_mtx); - return (1); - } - } - } - index += SWAP_META_PAGES; - } - mtx_unlock(&swhash_mtx); - return (0); -} - -/* - * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in - * - * This routine dissociates the page at the given index within a - * swap block from its backing store, paging it in if necessary. - * If the page is paged in, it is placed in the inactive queue, - * since it had its backing store ripped out from under it. - * We also attempt to swap in all other pages in the swap block, - * we only guarantee that the one at the specified index is + * We also attempt to swap in all other pages in the swap block. + * However, we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we @@ -1719,7 +1636,7 @@ vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid == VM_PAGE_BITS_ALL) { - vm_object_pip_subtract(object, 1); + vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); @@ -1729,12 +1646,12 @@ return; } - if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) + if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ - vm_object_pip_subtract(object, 1); + vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); - vm_page_deactivate(m); + vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); @@ -1753,50 +1670,56 @@ static void swap_pager_swapoff(struct swdevt *sp) { - struct swblock *swap; - vm_object_t locked_obj, object; - vm_pindex_t pindex; - int i, j, retries; + struct swblk *sb; + vm_object_t object; + vm_pindex_t pi; + int i, retries; - GIANT_REQUIRED; + sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; - locked_obj = NULL; full_rescan: - mtx_lock(&swhash_mtx); - for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ -restart: - for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { - object = swap->swb_object; - pindex = swap->swb_index; - for (j = 0; j < SWAP_META_PAGES; ++j) { - if (!swp_pager_isondev(swap->swb_pages[j], sp)) + mtx_lock(&vm_object_list_mtx); + TAILQ_FOREACH(object, &vm_object_list, object_list) { + if (object->type != OBJT_SWAP) + continue; + mtx_unlock(&vm_object_list_mtx); + /* Depends on type-stability. */ + VM_OBJECT_WLOCK(object); + + /* + * Dead objects are eventually terminated on their own. + */ + if ((object->flags & OBJ_DEAD) != 0) + goto next_obj; + + /* + * Sync with fences placed after pctrie + * initialization. We must not access pctrie below + * unless we checked that our object is swap and not + * dead. 
+ */ + atomic_thread_fence_acq(); + if (object->type != OBJT_SWAP) + goto next_obj; + + for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( + &object->un_pager.swp.swp_blks, pi)) != NULL; ) { + pi = sb->p + SWAP_META_PAGES; + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] == SWAPBLK_NONE) continue; - if (locked_obj != object) { - if (locked_obj != NULL) - VM_OBJECT_WUNLOCK(locked_obj); - locked_obj = object; - if (!VM_OBJECT_TRYWLOCK(object)) { - mtx_unlock(&swhash_mtx); - /* Depends on type-stability. */ - VM_OBJECT_WLOCK(object); - mtx_lock(&swhash_mtx); - goto restart; - } - } - MPASS(locked_obj == object); - mtx_unlock(&swhash_mtx); - swp_pager_force_pagein(object, pindex + j); - mtx_lock(&swhash_mtx); - goto restart; + if (swp_pager_isondev(sb->d[i], sp)) + swp_pager_force_pagein(object, + sb->p + i); } } +next_obj: + VM_OBJECT_WUNLOCK(object); + mtx_lock(&vm_object_list_mtx); } - mtx_unlock(&swhash_mtx); - if (locked_obj != NULL) { - VM_OBJECT_WUNLOCK(locked_obj); - locked_obj = NULL; - } + mtx_unlock(&vm_object_list_mtx); + if (sp->sw_used) { /* * Objects may be locked or paging to the device being @@ -1839,94 +1762,120 @@ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { - static volatile int exhausted; - struct swblock *swap; - struct swblock **pswap; - int idx; + static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; + struct swblk *sb, *sb1; + vm_pindex_t modpi, rdpi; + int error, i; VM_OBJECT_ASSERT_WLOCKED(object); + /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { + pctrie_init(&object->un_pager.swp.swp_blks); + + /* + * Ensure that swap_pager_swapoff()'s iteration over + * object_list does not see a garbage pctrie. + */ + atomic_thread_fence_rel(); + object->type = OBJT_SWAP; - object->un_pager.swp.swp_bcount = 0; - - if (object->handle != NULL) { - mtx_lock(&sw_alloc_mtx); - TAILQ_INSERT_TAIL( - NOBJLIST(object->handle), - object, - pager_object_list - ); - mtx_unlock(&sw_alloc_mtx); - } + KASSERT(object->handle == NULL, ("default pager with handle")); } - /* - * Locate hash entry. If not found create, but if we aren't adding - * anything just return. If we run out of space in the map we wait - * and, since the hash table may have changed, retry. - */ -retry: - mtx_lock(&swhash_mtx); - pswap = swp_pager_hash(object, pindex); - - if ((swap = *pswap) == NULL) { - int i; - + rdpi = rounddown(pindex, SWAP_META_PAGES); + sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); + if (sb == NULL) { if (swapblk == SWAPBLK_NONE) - goto done; - - swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT | - (curproc == pageproc ? M_USE_RESERVE : 0)); - if (swap == NULL) { - mtx_unlock(&swhash_mtx); + return; + for (;;) { + sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == + pageproc ? 
M_USE_RESERVE : 0)); + if (sb != NULL) { + sb->p = rdpi; + for (i = 0; i < SWAP_META_PAGES; i++) + sb->d[i] = SWAPBLK_NONE; + if (atomic_cmpset_int(&swblk_zone_exhausted, + 1, 0)) + printf("swblk zone ok\n"); + break; + } VM_OBJECT_WUNLOCK(object); - if (uma_zone_exhausted(swap_zone)) { - if (atomic_cmpset_int(&exhausted, 0, 1)) - printf("swap zone exhausted, " + if (uma_zone_exhausted(swblk_zone)) { + if (atomic_cmpset_int(&swblk_zone_exhausted, + 0, 1)) + printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); - pause("swzonex", 10); + pause("swzonxb", 10); } else - VM_WAIT; + uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); - goto retry; + sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, + rdpi); + if (sb != NULL) + /* + * Somebody swapped out a nearby page, + * allocating swblk at the rdpi index, + * while we dropped the object lock. + */ + goto allocated; } + for (;;) { + error = SWAP_PCTRIE_INSERT( + &object->un_pager.swp.swp_blks, sb); + if (error == 0) { + if (atomic_cmpset_int(&swpctrie_zone_exhausted, + 1, 0)) + printf("swpctrie zone ok\n"); + break; + } + VM_OBJECT_WUNLOCK(object); + if (uma_zone_exhausted(swpctrie_zone)) { + if (atomic_cmpset_int(&swpctrie_zone_exhausted, + 0, 1)) + printf("swap pctrie zone exhausted, " + "increase kern.maxswzone\n"); + vm_pageout_oom(VM_OOM_SWAPZ); + pause("swzonxp", 10); + } else + uma_zwait(swpctrie_zone); + VM_OBJECT_WLOCK(object); + sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, + rdpi); + if (sb1 != NULL) { + uma_zfree(swblk_zone, sb); + sb = sb1; + goto allocated; + } + } + } +allocated: + MPASS(sb->p == rdpi); - if (atomic_cmpset_int(&exhausted, 1, 0)) - printf("swap zone ok\n"); + modpi = pindex % SWAP_META_PAGES; + /* Delete prior contents of metadata. */ + if (sb->d[modpi] != SWAPBLK_NONE) + swp_pager_freeswapspace(sb->d[modpi], 1); + /* Enter block into metadata. */ + sb->d[modpi] = swapblk; - swap->swb_hnext = NULL; - swap->swb_object = object; - swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; - swap->swb_count = 0; - - ++object->un_pager.swp.swp_bcount; - - for (i = 0; i < SWAP_META_PAGES; ++i) - swap->swb_pages[i] = SWAPBLK_NONE; - } - /* - * Delete prior contents of metadata + * Free the swblk if we end up with the empty page run. */ - idx = pindex & SWAP_META_MASK; - - if (swap->swb_pages[idx] != SWAPBLK_NONE) { - swp_pager_freeswapspace(swap->swb_pages[idx], 1); - --swap->swb_count; + if (swapblk == SWAPBLK_NONE) { + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + break; + } + if (i == SWAP_META_PAGES) { + SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, + rdpi); + uma_zfree(swblk_zone, sb); + } } - - /* - * Enter block into metadata - */ - swap->swb_pages[idx] = swapblk; - if (swapblk != SWAPBLK_NONE) - ++swap->swb_count; -done: - mtx_unlock(&swhash_mtx); } /* @@ -1940,41 +1889,39 @@ * with resident pages. 
*/ static void -swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) +swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { + struct swblk *sb; + vm_pindex_t last; + int i; + bool empty; - VM_OBJECT_ASSERT_LOCKED(object); - if (object->type != OBJT_SWAP) + VM_OBJECT_ASSERT_WLOCKED(object); + if (object->type != OBJT_SWAP || count == 0) return; - while (count > 0) { - struct swblock **pswap; - struct swblock *swap; - - mtx_lock(&swhash_mtx); - pswap = swp_pager_hash(object, index); - - if ((swap = *pswap) != NULL) { - daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; - - if (v != SWAPBLK_NONE) { - swp_pager_freeswapspace(v, 1); - swap->swb_pages[index & SWAP_META_MASK] = - SWAPBLK_NONE; - if (--swap->swb_count == 0) { - *pswap = swap->swb_hnext; - uma_zfree(swap_zone, swap); - --object->un_pager.swp.swp_bcount; - } - } - --count; - ++index; - } else { - int n = SWAP_META_PAGES - (index & SWAP_META_MASK); - count -= n; - index += n; + last = pindex + count - 1; + for (;;) { + sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, + rounddown(pindex, SWAP_META_PAGES)); + if (sb == NULL || sb->p > last) + break; + empty = true; + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] == SWAPBLK_NONE) + continue; + if (pindex <= sb->p + i && sb->p + i <= last) { + swp_pager_freeswapspace(sb->d[i], 1); + sb->d[i] = SWAPBLK_NONE; + } else + empty = false; } - mtx_unlock(&swhash_mtx); + pindex = sb->p + SWAP_META_PAGES; + if (empty) { + SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, + sb->p); + uma_zfree(swblk_zone, sb); + } } } @@ -1987,9 +1934,8 @@ static void swp_pager_meta_free_all(vm_object_t object) { - struct swblock **pswap, *swap; - vm_pindex_t index; - daddr_t v; + struct swblk *sb; + vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); @@ -1996,27 +1942,15 @@ if (object->type != OBJT_SWAP) return; - index = 0; - while (object->un_pager.swp.swp_bcount != 0) { - mtx_lock(&swhash_mtx); - pswap = swp_pager_hash(object, index); - if ((swap = *pswap) != NULL) { - for (i = 0; i < SWAP_META_PAGES; ++i) { - v = swap->swb_pages[i]; - if (v != SWAPBLK_NONE) { - --swap->swb_count; - swp_pager_freeswapspace(v, 1); - } - } - if (swap->swb_count != 0) - panic( - "swap_pager_meta_free_all: swb_count != 0"); - *pswap = swap->swb_hnext; - uma_zfree(swap_zone, swap); - --object->un_pager.swp.swp_bcount; + for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( + &object->un_pager.swp.swp_blks, pindex)) != NULL;) { + pindex = sb->p + SWAP_META_PAGES; + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + swp_pager_freeswapspace(sb->d[i], 1); } - mtx_unlock(&swhash_mtx); - index += SWAP_META_PAGES; + SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); + uma_zfree(swblk_zone, sb); } } @@ -2030,9 +1964,6 @@ * was invalid. This routine will automatically free any invalid * meta-data swapblks. * - * It is not possible to store invalid swapblks in the swap meta data - * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. - * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. 
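Editorial note: with the hash table gone, swp_pager_meta_free() and swp_pager_meta_free_all() above walk the pctrie in SWAP_META_PAGES-sized chunks, clear the entries that fall in the requested range, and drop a swblk as soon as every slot in it is SWAPBLK_NONE. The sketch below mirrors that walk over a plain array instead of a pctrie; META_PAGES, BLK_NONE, struct chunk and the helper names are invented stand-ins, not code from the patch.

#include <stdio.h>
#include <stdbool.h>

#define META_PAGES  8           /* stands in for SWAP_META_PAGES */
#define BLK_NONE    (~0UL)      /* stands in for SWAPBLK_NONE */
#define NCHUNKS     4

/* One metadata chunk: base page index 'p' plus META_PAGES block numbers. */
struct chunk {
    unsigned long p;
    unsigned long d[META_PAGES];
    bool present;
};

static struct chunk chunks[NCHUNKS];

/* SWAP_PCTRIE_LOOKUP_GE analogue over the toy array. */
static struct chunk *
lookup_ge(unsigned long base)
{
    struct chunk *best = NULL;

    for (int i = 0; i < NCHUNKS; i++) {
        if (!chunks[i].present || chunks[i].p < base)
            continue;
        if (best == NULL || chunks[i].p < best->p)
            best = &chunks[i];
    }
    return (best);
}

/* Mirror of the swp_pager_meta_free() walk: clear in-range slots and
 * remove a chunk once every slot in it is BLK_NONE. */
static void
range_free(unsigned long pindex, unsigned long count)
{
    unsigned long last = pindex + count - 1;
    struct chunk *sb;

    for (;;) {
        sb = lookup_ge(pindex - pindex % META_PAGES);   /* rounddown() */
        if (sb == NULL || sb->p > last)
            break;
        bool empty = true;
        for (int i = 0; i < META_PAGES; i++) {
            if (sb->d[i] == BLK_NONE)
                continue;
            if (pindex <= sb->p + i && sb->p + i <= last)
                sb->d[i] = BLK_NONE;    /* swp_pager_freeswapspace() */
            else
                empty = false;
        }
        pindex = sb->p + META_PAGES;
        if (empty)
            sb->present = false;        /* SWAP_PCTRIE_REMOVE + free */
    }
}

int
main(void)
{
    chunks[0] = (struct chunk){ .p = 0, .present = true };
    for (int i = 0; i < META_PAGES; i++)
        chunks[0].d[i] = (i % 2 == 0) ? (unsigned long)i : BLK_NONE;
    range_free(0, META_PAGES);
    printf("chunk 0 %s\n", chunks[0].present ? "kept" : "removed");
    return (0);
}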
@@ -2043,44 +1974,90 @@ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { - struct swblock **pswap; - struct swblock *swap; + struct swblk *sb; daddr_t r1; - int idx; + int i; - VM_OBJECT_ASSERT_LOCKED(object); + if ((flags & (SWM_FREE | SWM_POP)) != 0) + VM_OBJECT_ASSERT_WLOCKED(object); + else + VM_OBJECT_ASSERT_LOCKED(object); + /* - * The meta data only exists of the object is OBJT_SWAP + * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); - r1 = SWAPBLK_NONE; - mtx_lock(&swhash_mtx); - pswap = swp_pager_hash(object, pindex); + sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, + rounddown(pindex, SWAP_META_PAGES)); + if (sb == NULL) + return (SWAPBLK_NONE); + r1 = sb->d[pindex % SWAP_META_PAGES]; + if (r1 == SWAPBLK_NONE) + return (SWAPBLK_NONE); + if ((flags & (SWM_FREE | SWM_POP)) != 0) { + sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE; + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + break; + } + if (i == SWAP_META_PAGES) { + SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, + rounddown(pindex, SWAP_META_PAGES)); + uma_zfree(swblk_zone, sb); + } + } + if ((flags & SWM_FREE) != 0) { + swp_pager_freeswapspace(r1, 1); + r1 = SWAPBLK_NONE; + } + return (r1); +} - if ((swap = *pswap) != NULL) { - idx = pindex & SWAP_META_MASK; - r1 = swap->swb_pages[idx]; +/* + * Returns the least page index which is greater than or equal to the + * parameter pindex and for which there is a swap block allocated. + * Returns object's size if the object's type is not swap or if there + * are no allocated swap blocks for the object after the requested + * pindex. + */ +vm_pindex_t +swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) +{ + struct swblk *sb; + int i; - if (r1 != SWAPBLK_NONE) { - if (flags & SWM_FREE) { - swp_pager_freeswapspace(r1, 1); - r1 = SWAPBLK_NONE; - } - if (flags & (SWM_FREE|SWM_POP)) { - swap->swb_pages[idx] = SWAPBLK_NONE; - if (--swap->swb_count == 0) { - *pswap = swap->swb_hnext; - uma_zfree(swap_zone, swap); - --object->un_pager.swp.swp_bcount; - } - } + VM_OBJECT_ASSERT_LOCKED(object); + if (object->type != OBJT_SWAP) + return (object->size); + + sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, + rounddown(pindex, SWAP_META_PAGES)); + if (sb == NULL) + return (object->size); + if (sb->p < pindex) { + for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + return (sb->p + i); } + sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, + roundup(pindex, SWAP_META_PAGES)); + if (sb == NULL) + return (object->size); } - mtx_unlock(&swhash_mtx); - return (r1); + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + return (sb->p + i); + } + + /* + * We get here if a swblk is present in the trie but it + * doesn't map any blocks. + */ + MPASS(0); + return (object->size); } /* @@ -2110,16 +2087,13 @@ if (error) return (error); - mtx_lock(&Giant); - while (swdev_syscall_active) - tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); - swdev_syscall_active = 1; + sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. 
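Editorial note: the swapon() hunk above also replaces the Giant-protected swdev_syscall_active flag and its tsleep()/wakeup_one() handshake with an sx lock held exclusively for the whole syscall (the swapoff() and swapoff_all() hunks below do the same). A userland analogue of that simplification follows, with a pthread rwlock standing in for the kernel sx lock; toy_swapon() and the device name are invented for the example.

#include <pthread.h>
#include <stdio.h>

/* Userland stand-in for swdev_syscall_lock. */
static pthread_rwlock_t swdev_syscall_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Before: a flag plus sleep/wakeup under Giant serialized the syscalls.
 * After: take the lock exclusively for the whole operation, then drop it. */
static int
toy_swapon(const char *name)
{
    int error = 0;

    pthread_rwlock_wrlock(&swdev_syscall_lock);     /* sx_xlock() */
    printf("configuring %s with the syscall lock held\n", name);
    /* ... open the device, size it, hand it to the swap pager ... */
    pthread_rwlock_unlock(&swdev_syscall_lock);     /* sx_xunlock() */
    return (error);
}

int
main(void)
{
    return (toy_swapon("/dev/md0"));
}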
*/ - if (swap_zone == NULL) { + if (swblk_zone == NULL) { error = ENOMEM; goto done; } @@ -2134,7 +2108,7 @@ vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { - error = swapongeom(td, vp); + error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { @@ -2148,9 +2122,7 @@ if (error) vrele(vp); done: - swdev_syscall_active = 0; - wakeup_one(&swdev_syscall_active); - mtx_unlock(&Giant); + sx_xunlock(&swdev_syscall_lock); return (error); } @@ -2157,15 +2129,16 @@ /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning - * message and return -1; otherwise, return 0. + * message. */ -static int -swapon_check_swzone(unsigned long npages) +static void +swapon_check_swzone(void) { - unsigned long maxpages; + unsigned long maxpages, npages; + npages = swap_total / PAGE_SIZE; /* absolute maximum we can handle assuming 100% efficiency */ - maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES; + maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { @@ -2174,9 +2147,7 @@ npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); - return (-1); } - return (0); } static void @@ -2212,7 +2183,6 @@ sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; - sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; @@ -2244,7 +2214,7 @@ nswapdev++; swap_pager_avail += nblks - 2; swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; - swapon_check_swzone(swap_total / PAGE_SIZE); + swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } @@ -2280,10 +2250,7 @@ if (error) return (error); - mtx_lock(&Giant); - while (swdev_syscall_active) - tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); - swdev_syscall_active = 1; + sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); @@ -2305,9 +2272,7 @@ } error = swapoff_one(sp, td->td_ucred); done: - swdev_syscall_active = 0; - wakeup_one(&swdev_syscall_active); - mtx_unlock(&Giant); + sx_xunlock(&swdev_syscall_lock); return (error); } @@ -2319,7 +2284,7 @@ int error; #endif - mtx_assert(&Giant, MA_OWNED); + sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); @@ -2335,10 +2300,8 @@ * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ - if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail < - nblks + nswap_lowat) { + if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); - } /* * Prevent further allocations on this device. 
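Editorial note: swapoff_one() above now requires that the free page count plus the available swap cover the blocks of the device being removed plus the nswap_lowat epsilon, and the old v_cache_count term is dropped. A small arithmetic sketch of that feasibility test follows; the numbers in main() are invented.

#include <stdio.h>
#include <stdbool.h>

/* Mirror of the check in swapoff_one(): free pages plus remaining swap
 * must absorb what may have to be paged back in, plus a low-water epsilon. */
static bool
swapoff_feasible(unsigned long v_free_count, unsigned long swap_pager_avail,
    unsigned long nblks, unsigned long nswap_lowat)
{

    return (v_free_count + swap_pager_avail >= nblks + nswap_lowat);
}

int
main(void)
{
    /* Hypothetical numbers: 200k free pages and 50k free swap pages left,
     * removing a 240k-page device with a 16k-page epsilon. */
    printf("swapoff allowed: %d\n",
        swapoff_feasible(200000, 50000, 240000, 16384));
    return (0);
}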
@@ -2378,10 +2341,7 @@ const char *devname; int error; - mtx_lock(&Giant); - while (swdev_syscall_active) - tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); - swdev_syscall_active = 1; + sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { @@ -2401,9 +2361,7 @@ } mtx_unlock(&sw_dev_mtx); - swdev_syscall_active = 0; - wakeup_one(&swdev_syscall_active); - mtx_unlock(&Giant); + sx_xunlock(&swdev_syscall_lock); } void @@ -2472,19 +2430,14 @@ SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); -SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info, +SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, + sysctl_vm_swap_info, "Swap statistics by device"); /* - * vmspace_swap_count() - count the approximate swap usage in pages for a - * vmspace. - * - * The map must be locked. - * - * Swap usage is determined by taking the proportional swap used by - * VM objects backing the VM map. To make up for fractional losses, - * if the VM object has any swap use at all the associated map entries - * count for at least 1 swap page. + * Count the approximate swap usage in pages for a vmspace. The + * shadowed or not yet copied on write swap blocks are not accounted. + * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) @@ -2492,23 +2445,38 @@ vm_map_t map; vm_map_entry_t cur; vm_object_t object; - long count, n; + struct swblk *sb; + vm_pindex_t e, pi; + long count; + int i; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { - if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && - (object = cur->object.vm_object) != NULL) { - VM_OBJECT_WLOCK(object); - if (object->type == OBJT_SWAP && - object->un_pager.swp.swp_bcount != 0) { - n = (cur->end - cur->start) / PAGE_SIZE; - count += object->un_pager.swp.swp_bcount * - SWAP_META_PAGES * n / object->size + 1; + if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) + continue; + object = cur->object.vm_object; + if (object == NULL || object->type != OBJT_SWAP) + continue; + VM_OBJECT_RLOCK(object); + if (object->type != OBJT_SWAP) + goto unlock; + pi = OFF_TO_IDX(cur->offset); + e = pi + OFF_TO_IDX(cur->end - cur->start); + for (;; pi = sb->p + SWAP_META_PAGES) { + sb = SWAP_PCTRIE_LOOKUP_GE( + &object->un_pager.swp.swp_blks, pi); + if (sb == NULL || sb->p >= e) + break; + for (i = 0; i < SWAP_META_PAGES; i++) { + if (sb->p + i < e && + sb->d[i] != SWAPBLK_NONE) + count++; } - VM_OBJECT_WUNLOCK(object); } +unlock: + VM_OBJECT_RUNLOCK(object); } return (count); } @@ -2554,8 +2522,9 @@ } /* - * Remove a reference from the g_consumer. Post a close event if - * all referneces go away. + * Remove a reference from the g_consumer. Post a close event if all + * references go away, since the function might be called from the + * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) @@ -2628,7 +2597,7 @@ bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; - if ((bp->b_flags & B_UNMAPPED) != 0) { + if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; @@ -2678,22 +2647,19 @@ cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); - /* XXX: direct call when Giant untangled */ + + /* + * swapgeom_close() may be called from the biodone context, + * where we cannot perform topology changes. 
Delegate the + * work to the events thread. + */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } - -struct swh0h0 { - struct cdev *dev; - struct vnode *vp; - int error; -}; - -static void -swapongeom_ev(void *arg, int flags) +static int +swapongeom_locked(struct cdev *dev, struct vnode *vp) { - struct swh0h0 *swh; struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; @@ -2701,20 +2667,15 @@ u_long nblks; int error; - swh = arg; - swh->error = 0; - pp = g_dev_getprovider(swh->dev); - if (pp == NULL) { - swh->error = ENODEV; - return; - } + pp = g_dev_getprovider(dev); + if (pp == NULL) + return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); - swh->error = EBUSY; - return; + return (EBUSY); } } mtx_unlock(&sw_dev_mtx); @@ -2721,44 +2682,41 @@ if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); - cp->index = 1; /* Number of active I/Os, plus one for being active. */ + cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* - * XXX: Everytime you think you can improve the margin for + * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); - if (error) { + if (error != 0) { g_detach(cp); g_destroy_consumer(cp); - swh->error = error; - return; + return (error); } nblks = pp->mediasize / DEV_BSIZE; - swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, - swapgeom_close, dev2udev(swh->dev), + swaponsomething(vp, cp, nblks, swapgeom_strategy, + swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); - swh->error = 0; + return (0); } static int -swapongeom(struct thread *td, struct vnode *vp) +swapongeom(struct vnode *vp) { int error; - struct swh0h0 swh; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - - swh.dev = vp->v_rdev; - swh.vp = vp; - swh.error = 0; - /* XXX: direct call when Giant untangled */ - error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); - if (!error) - error = swh.error; + if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { + error = ENOENT; + } else { + g_topology_lock(); + error = swapongeom_locked(vp->v_rdev, vp); + g_topology_unlock(); + } VOP_UNLOCK(vp, 0); return (error); } @@ -2833,3 +2791,40 @@ NODEV, 0); return (0); } + +static int +sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) +{ + int error, new, n; + + new = nsw_wcount_async_max; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (new > nswbuf / 2 || new < 1) + return (EINVAL); + + mtx_lock(&pbuf_mtx); + while (nsw_wcount_async_max != new) { + /* + * Adjust difference. If the current async count is too low, + * we will need to sqeeze our update slowly in. Sleep with a + * higher priority than getpbuf() to finish faster. 
+ */ + n = new - nsw_wcount_async_max; + if (nsw_wcount_async + n >= 0) { + nsw_wcount_async += n; + nsw_wcount_async_max += n; + wakeup(&nsw_wcount_async); + } else { + nsw_wcount_async_max -= nsw_wcount_async; + nsw_wcount_async = 0; + msleep(&nsw_wcount_async, &pbuf_mtx, PSWP, + "swpsysctl", 0); + } + } + mtx_unlock(&pbuf_mtx); + + return (0); +} Modified: trunk/sys/vm/swap_pager.h =================================================================== --- trunk/sys/vm/swap_pager.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/swap_pager.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 - * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $ + * $FreeBSD: stable/11/sys/vm/swap_pager.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _VM_SWAP_PAGER_H_ @@ -74,15 +74,14 @@ #ifdef _KERNEL -extern int swap_pager_full; extern int swap_pager_avail; struct xswdev; int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len); void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int); +vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex); void swap_pager_freespace(vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_swap_init(void); -int swap_pager_isswapped(vm_object_t, struct swdevt *); int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_status(int *total, int *used); void swapoff_all(void); Modified: trunk/sys/vm/uma.h =================================================================== --- trunk/sys/vm/uma.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/uma.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -25,7 +25,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $ + * $FreeBSD: stable/11/sys/vm/uma.h 338389 2018-08-29 17:58:01Z markj $ * */ @@ -263,8 +263,8 @@ * information in the vm_page. */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ -#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */ -#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ +#define UMA_ZONE_NOBUCKET 0x0400 /* Do not use buckets. */ +#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets. */ #define UMA_ZONE_CACHESPREAD 0x1000 /* * Spread memory start locations across * all possible cache lines. May @@ -277,7 +277,7 @@ * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* - * Allocates mp_ncpus slabs sized to + * Allocates mp_maxid + 1 slabs sized to * sizeof(struct pcpu). */ @@ -288,7 +288,7 @@ */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ - UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU) + UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU) /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ @@ -367,6 +367,11 @@ } /* + * Wait until the specified zone can allocate an item. + */ +void uma_zwait(uma_zone_t zone); + +/* * XXX The rest of the prototypes in this header are h0h0 magic for the VM. * If you think you need to use it for a normal zone you're probably incorrect. 
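Editorial note: uma.h above gains uma_zwait(), which the uma_core.c hunk further down implements as an M_WAITOK allocation immediately followed by a free, so the caller simply sleeps until the zone can produce an item (the swp_pager_meta_build() hunk above uses it while retrying zone allocations). A hedged usage sketch follows; foo_zone and struct foo are invented, and this is an illustration rather than code from the patch.

/*
 * Retry an M_NOWAIT allocation, parking on uma_zwait() instead of
 * spinning while the zone is exhausted.
 */
static struct foo *
foo_alloc_sleepable(void)
{
    struct foo *fp;

    while ((fp = uma_zalloc(foo_zone, M_NOWAIT)) == NULL)
        uma_zwait(foo_zone);    /* sleep until the zone can allocate */
    return (fp);
}

A caller that can simply block would normally just pass M_WAITOK; the loop form matters when the caller wants to re-evaluate state between attempts, as the swap pager does after dropping and retaking the object lock.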
*/ @@ -523,6 +528,19 @@ void uma_zone_set_warning(uma_zone_t zone, const char *warning); /* + * Sets a function to run when limit is reached + * + * Arguments: + * zone The zone to which this applies + * fx The function ro run + * + * Returns: + * Nothing + */ +typedef void (*uma_maxaction_t)(uma_zone_t, int); +void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t); + +/* * Obtains the approximate current number of items allocated from a zone * * Arguments: @@ -612,21 +630,6 @@ void uma_prealloc(uma_zone_t zone, int itemcnt); /* - * Used to lookup the reference counter allocated for an item - * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, - * reference counters are allocated for items and stored in - * the underlying slab header. - * - * Arguments: - * zone The UMA_ZONE_REFCNT zone to which the item belongs. - * item The address of the item for which we want a refcnt. - * - * Returns: - * A pointer to a uint32_t reference counter. - */ -uint32_t *uma_find_refcnt(uma_zone_t zone, void *item); - -/* * Used to determine if a fixed-size zone is exhausted. * * Arguments: Modified: trunk/sys/vm/uma_core.c =================================================================== --- trunk/sys/vm/uma_core.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/uma_core.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -32,7 +32,7 @@ * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as - * effecient. A primary design goal is to return unused memory to the rest of + * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. @@ -49,7 +49,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/uma_core.c 357046 2020-01-23 14:14:38Z markj $"); /* I should really use ktr.. */ /* @@ -75,10 +75,12 @@ #include <sys/sysctl.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/random.h> #include <sys/rwlock.h> #include <sys/sbuf.h> #include <sys/sched.h> #include <sys/smp.h> +#include <sys/taskqueue.h> #include <sys/vmmeter.h> #include <vm/vm.h> @@ -112,7 +114,6 @@ /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; -static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated @@ -138,7 +139,7 @@ LIST_HEAD_INITIALIZER(uma_cachezones); /* This RW lock protects the keg list */ -static struct rwlock_padalign uma_rwlock; +static struct rwlock_padalign __exclusive_cache_line uma_rwlock; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = @@ -153,14 +154,9 @@ static int booted = 0; #define UMA_STARTUP 1 #define UMA_STARTUP2 2 +#define UMA_SHUTDOWN 3 /* - * Only mbuf clusters use ref zones. Just provide enough references - * to support the one user. New code should not use the ref facility. - */ -static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES; - -/* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. 
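Editorial note: returning to the uma_zone_set_maxaction() addition in the uma.h hunk above: the uma_core.c hunks below wrap the registered callback in a task and enqueue it on taskqueue_thread whenever an allocation finds the zone at its limit. A hedged registration sketch follows; struct foo, foo_zone and the callback body are invented, and the int argument is, per the TASK_INIT cast below, apparently the taskqueue pending count.

struct foo {
    int f_dummy;
};

static uma_zone_t foo_zone;

/* Runs from the taskqueue when the zone hits its limit. */
static void
foo_zone_full(uma_zone_t zone __unused, int pending)
{

    printf("foo zone full, %d pending notification(s)\n", pending);
}

static void
foo_zone_init(void)
{

    foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL, NULL,
        NULL, UMA_ALIGN_PTR, 0);
    uma_zone_set_max(foo_zone, 1024);
    uma_zone_set_maxaction(foo_zone, foo_zone_full);
}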
*/ @@ -248,11 +244,12 @@ static void keg_large_init(uma_keg_t keg); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); -static int hash_alloc(struct uma_hash *); +static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); +static void uma_shutdown(void); static void *zone_alloc_item(uma_zone_t, void *, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); @@ -276,6 +273,11 @@ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); +#ifdef INVARIANTS +static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); +static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); +#endif + SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, @@ -285,8 +287,7 @@ 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; -TUNABLE_INT("vm.zone_warnings", &zone_warnings); -SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0, +SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); /* @@ -433,6 +434,14 @@ printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } +static inline void +zone_maxaction(uma_zone_t zone) +{ + + if (zone->uz_maxaction.ta_func != NULL) + taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); +} + static void zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t)) { @@ -471,6 +480,7 @@ static void keg_timeout(uma_keg_t keg) { + u_int slabs; KEG_LOCK(keg); /* @@ -481,7 +491,8 @@ * may be a little aggressive. Should I allow for two collisions max? */ if (keg->uk_flags & UMA_ZONE_HASH && - keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { + (slabs = keg->uk_pages / keg->uk_ppera) > + keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; @@ -492,9 +503,8 @@ * I have to do everything in stages and check for * races. */ - newhash = keg->uk_hash; KEG_UNLOCK(keg); - ret = hash_alloc(&newhash); + ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { @@ -526,19 +536,16 @@ * hash A new hash structure with the old hash size in uh_hashsize * * Returns: - * 1 on sucess and 0 on failure. + * 1 on success and 0 on failure. */ static int -hash_alloc(struct uma_hash *hash) +hash_alloc(struct uma_hash *hash, u_int size) { - int oldsize; - int alloc; + size_t alloc; - oldsize = hash->uh_hashsize; - - /* We're just going to go to a power of two greater */ - if (oldsize) { - hash->uh_hashsize = oldsize * 2; + KASSERT(powerof2(size), ("hash size must be power of 2")); + if (size > UMA_HASH_SIZE_INIT) { + hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); @@ -575,8 +582,8 @@ hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; - int hval; - int i; + u_int hval; + u_int idx; if (!newhash->uh_slab_hash) return (0); @@ -589,10 +596,10 @@ * full rehash. 
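Editorial note: keg_timeout() above now grows the keg hash by passing an explicit size to hash_alloc(), computed as 1 << fls(slabs), and hash_alloc() asserts that the size is a power of two so the usual mask-based bucket selection keeps working. The standalone sketch below reproduces that sizing arithmetic; toy_fls() is a portable stand-in for fls(9), and the 12-bit shift in the hashing example assumes a 4 KB page (UMA_SLAB_SHIFT is PAGE_SHIFT).

#include <stdio.h>

/* fls() analogue: 1-based index of the most significant set bit, 0 for 0. */
static unsigned
toy_fls(unsigned x)
{
    unsigned bit = 0;

    while (x != 0) {
        bit++;
        x >>= 1;
    }
    return (bit);
}

int
main(void)
{
    unsigned slabs = 300;
    unsigned hashsize = 1u << toy_fls(slabs);   /* a power of two > slabs */
    unsigned hashmask = hashsize - 1;

    printf("slabs=%u hashsize=%u mask=%#x\n", slabs, hashsize, hashmask);
    /* UMA_HASH-style bucket selection: shift out the page offset, mask. */
    printf("bucket for slab data 0x12345000 -> %u\n",
        (0x12345000u >> 12) & hashmask);
    return (0);
}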
*/ - for (i = 0; i < oldhash->uh_hashsize; i++) - while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { - slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); - SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); + for (idx = 0; idx < oldhash->uh_hashsize; idx++) + while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) { + slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]); + SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); @@ -840,8 +847,7 @@ keg_drain(uma_keg_t keg) { struct slabhead freeslabs = { 0 }; - uma_slab_t slab; - uma_slab_t n; + uma_slab_t slab, tmp; /* * We don't want to take pages from statically allocated kegs at this @@ -857,15 +863,10 @@ if (keg->uk_free == 0) goto finished; - slab = LIST_FIRST(&keg->uk_free_slab); - while (slab) { - n = LIST_NEXT(slab, us_link); - - /* We have no where to free these to */ - if (slab->us_flags & UMA_SLAB_BOOT) { - slab = n; + LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) { + /* We have nowhere to free these to. */ + if (slab->us_flags & UMA_SLAB_BOOT) continue; - } LIST_REMOVE(slab, us_link); keg->uk_pages -= keg->uk_ppera; @@ -875,8 +876,6 @@ UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); - - slab = n; } finished: KEG_UNLOCK(keg); @@ -939,7 +938,6 @@ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) { - uma_slabrefcnt_t slabref; uma_alloc allocf; uma_slab_t slab; uint8_t *mem; @@ -1002,11 +1000,6 @@ #ifdef INVARIANTS BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree); #endif - if (keg->uk_flags & UMA_ZONE_REFCNT) { - slabref = (uma_slabrefcnt_t)slab; - for (i = 0; i < keg->uk_ipers; i++) - slabref->us_refcnt[i] = 0; - } if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) @@ -1135,7 +1128,9 @@ npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | - VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); + VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | + ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : + VM_ALLOC_NOWAIT)); if (p != NULL) { /* * Since the page does not belong to an object, its @@ -1145,17 +1140,12 @@ npages--; continue; } - if (wait & M_WAITOK) { - VM_WAIT; - continue; - } - /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { - vm_page_unwire(p, 0); + vm_page_unwire(p, PQ_NONE); vm_page_free(p); } return (NULL); @@ -1229,7 +1219,7 @@ u_int slabsize; if (keg->uk_flags & UMA_ZONE_PCPU) { - u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU; + u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; slabsize = sizeof(struct pcpu); keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu), @@ -1255,15 +1245,20 @@ keg->uk_rsize < sizeof(struct pcpu), ("%s: size %u too large", __func__, keg->uk_rsize)); - if (keg->uk_flags & UMA_ZONE_REFCNT) - rsize += sizeof(uint32_t); - if (keg->uk_flags & UMA_ZONE_OFFPAGE) shsize = 0; else shsize = sizeof(struct uma_slab); - keg->uk_ipers = (slabsize - shsize) / rsize; + if (rsize <= slabsize - shsize) + keg->uk_ipers = (slabsize - shsize) / rsize; + else { + /* Handle special case when we have 1 item per slab, so + * alignment requirement can be relaxed. 
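Editorial note: keg_small_init() above computes items-per-slab as (slabsize - header) / rsize and now adds a special case: when the aligned item size no longer fits beside the in-line slab header but the unaligned size still does, the keg gets exactly one item per slab. A small sketch of that computation follows; the slab and header sizes are invented examples, and the kernel KASSERTs where this sketch returns 0.

#include <stdio.h>

#define TOY_SLAB_SIZE   4096    /* assumes a 4 KB UMA_SLAB_SIZE */
#define TOY_HDR_SIZE    96      /* made-up sizeof(struct uma_slab) stand-in */

/* Mirror of the keg_small_init() item count, rsize = aligned item size,
 * size = requested (unaligned) item size. */
static unsigned
items_per_slab(unsigned rsize, unsigned size)
{

    if (rsize <= TOY_SLAB_SIZE - TOY_HDR_SIZE)
        return ((TOY_SLAB_SIZE - TOY_HDR_SIZE) / rsize);
    /* One item per slab; only the unaligned size has to fit. */
    return (size <= TOY_SLAB_SIZE - TOY_HDR_SIZE ? 1 : 0);
}

int
main(void)
{
    printf("256-byte items per slab: %u\n", items_per_slab(256, 256));
    printf("4000-byte item aligned up to 4096: %u\n",
        items_per_slab(4096, 4000));
    return (0);
}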
*/ + KASSERT(keg->uk_size <= slabsize - shsize, + ("%s: size %u greater than slab", __func__, keg->uk_size)); + keg->uk_ipers = 1; + } KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); @@ -1337,21 +1332,24 @@ keg->uk_ipers = 1; keg->uk_rsize = keg->uk_size; - /* We can't do OFFPAGE if we're internal, bail out here. */ - if (keg->uk_flags & UMA_ZFLAG_INTERNAL) - return; - /* Check whether we have enough space to not do OFFPAGE. */ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) { shsize = sizeof(struct uma_slab); - if (keg->uk_flags & UMA_ZONE_REFCNT) - shsize += keg->uk_ipers * sizeof(uint32_t); if (shsize & UMA_ALIGN_PTR) shsize = (shsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); - if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize) - keg->uk_flags |= UMA_ZONE_OFFPAGE; + if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) { + /* + * We can't do OFFPAGE if we're internal, in which case + * we need an extra page per allocation to contain the + * slab header. + */ + if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0) + keg->uk_flags |= UMA_ZONE_OFFPAGE; + else + keg->uk_ppera++; + } } if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && @@ -1433,7 +1431,7 @@ if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; - if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC) + if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZONE_VTOSLAB; if (arg->flags & UMA_ZONE_PCPU) @@ -1445,13 +1443,6 @@ if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { keg_cachespread_init(keg); - } else if (keg->uk_flags & UMA_ZONE_REFCNT) { - if (keg->uk_size > - (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - - sizeof(uint32_t))) - keg_large_init(keg); - else - keg_small_init(keg); } else { if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) keg_large_init(keg); @@ -1459,15 +1450,8 @@ keg_small_init(keg); } - if (keg->uk_flags & UMA_ZONE_OFFPAGE) { - if (keg->uk_flags & UMA_ZONE_REFCNT) { - if (keg->uk_ipers > uma_max_ipers_ref) - panic("Too many ref items per zone: %d > %d\n", - keg->uk_ipers, uma_max_ipers_ref); - keg->uk_slabzone = slabrefzone; - } else - keg->uk_slabzone = slabzone; - } + if (keg->uk_flags & UMA_ZONE_OFFPAGE) + keg->uk_slabzone = slabzone; /* * If we haven't booted yet we need allocations to go through the @@ -1504,10 +1488,6 @@ /* Size of the slab struct and free list */ totsize = sizeof(struct uma_slab); - /* Size of the reference counts. */ - if (keg->uk_flags & UMA_ZONE_REFCNT) - totsize += keg->uk_ipers * sizeof(uint32_t); - if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); @@ -1521,8 +1501,6 @@ * sure here anyway. 
*/ totsize = keg->uk_pgoff + sizeof(struct uma_slab); - if (keg->uk_flags & UMA_ZONE_REFCNT) - totsize += keg->uk_ipers * sizeof(uint32_t); if (totsize > PAGE_SIZE * keg->uk_ppera) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, @@ -1532,7 +1510,7 @@ } if (keg->uk_flags & UMA_ZONE_HASH) - hash_alloc(&keg->uk_hash); + hash_alloc(&keg->uk_hash, 0); #ifdef UMA_DEBUG printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n", @@ -1667,10 +1645,15 @@ } out: - if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0) + KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != + (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), + ("Invalid zone flag combination")); + if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) + zone->uz_count = BUCKET_MAX; + else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) + zone->uz_count = 0; + else zone->uz_count = bucket_select(zone->uz_size); - else - zone->uz_count = BUCKET_MAX; zone->uz_count_min = zone->uz_count; return (0); @@ -1785,7 +1768,6 @@ { struct uma_zctor_args args; uma_slab_t slab; - u_int slabsize; int i; #ifdef UMA_DEBUG @@ -1835,9 +1817,6 @@ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG - printf("Initializing pcpu cache locks.\n"); -#endif -#ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif @@ -1847,18 +1826,6 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); - /* - * We also create a zone for the bigger slabs with reference - * counts in them, to accomodate UMA_ZONE_REFCNT zones. - */ - slabsize = sizeof(struct uma_slab_refcnt); - slabsize += uma_max_ipers_ref * sizeof(uint32_t); - slabrefzone = uma_zcreate("UMA RCntSlabs", - slabsize, - NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, - UMA_ZFLAG_INTERNAL); - hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, @@ -1885,10 +1852,6 @@ #endif } -/* - * Initialize our callout handle - * - */ static void uma_startup3(void) @@ -1901,8 +1864,18 @@ #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif + + EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, + EVENTHANDLER_PRI_FIRST); } +static void +uma_shutdown(void) +{ + + booted = UMA_SHUTDOWN; +} + static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) @@ -1948,6 +1921,20 @@ args.dtor = dtor; args.uminit = uminit; args.fini = fini; +#ifdef INVARIANTS + /* + * If a zone is being created with an empty constructor and + * destructor, pass UMA constructor/destructor which checks for + * memory use after free. + */ + if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) && + ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) { + args.ctor = trash_ctor; + args.dtor = trash_dtor; + args.uminit = trash_init; + args.fini = trash_fini; + } +#endif args.align = align; args.flags = flags; args.keg = NULL; @@ -2070,15 +2057,8 @@ error = EINVAL; goto out; } + /* - * Both must either be refcnt, or not be refcnt. - */ - if ((zone->uz_flags & UMA_ZONE_REFCNT) != - (master->uz_flags & UMA_ZONE_REFCNT)) { - error = EINVAL; - goto out; - } - /* * The underlying object must be the same size. rsize * may be different. */ @@ -2114,11 +2094,28 @@ uma_zdestroy(uma_zone_t zone) { + /* + * Large slabs are expensive to reclaim, so don't bother doing + * unnecessary work if we're shutting down. 
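Editorial note: the uma_zcreate() hunk above makes INVARIANTS kernels install trash_ctor()/trash_dtor() automatically for zones that supply no constructor or destructor of their own, so freed items are poisoned and re-checked on the next allocation (the trash routines themselves appear in the uma_dbg.c hunks near the end of this diff). The standalone sketch below shows the poison-and-verify idea with the same 0xdeadc0de junk value; everything else about it is invented for illustration.

#include <stdint.h>
#include <stdio.h>

#define JUNK 0xdeadc0deU    /* same poison value uma_dbg.c uses */

/* trash_dtor() analogue: fill freed memory with a known pattern. */
static void
toy_trash_dtor(void *mem, size_t size)
{
    uint32_t *p = mem;

    for (size_t cnt = size / sizeof(JUNK); cnt > 0; cnt--, p++)
        *p = JUNK;
}

/* trash_ctor() analogue: on reallocation, verify the pattern survived. */
static int
toy_trash_ctor(void *mem, size_t size)
{
    uint32_t *p = mem;

    for (size_t cnt = size / sizeof(JUNK); cnt > 0; cnt--, p++) {
        if (*p != JUNK) {
            printf("memory modified after free %p(%zu) val=%x @ %p\n",
                mem, size, *p, (void *)p);
            return (-1);
        }
    }
    return (0);
}

int
main(void)
{
    uint32_t item[8];

    toy_trash_dtor(item, sizeof(item));
    item[3] = 0;                        /* simulate a use-after-free write */
    return (toy_trash_ctor(item, sizeof(item)) == 0 ? 0 : 1);
}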
+ */ + if (booted == UMA_SHUTDOWN && + zone->uz_fini == NULL && + zone->uz_release == (uma_release)zone_release) + return; sx_slock(&uma_drain_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_sunlock(&uma_drain_lock); } +void +uma_zwait(uma_zone_t zone) +{ + void *item; + + item = uma_zalloc_arg(zone, NULL, M_WAITOK); + uma_zfree(zone, item); +} + /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) @@ -2129,6 +2126,9 @@ int lockfail; int cpu; + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); @@ -2140,20 +2140,17 @@ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zalloc_arg: called with spinlock or critical section held")); + #ifdef DEBUG_MEMGUARD if (memguard_cmp_zone(zone)) { item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { - /* - * Avoid conflict with the use-after-free - * protecting infrastructure from INVARIANTS. - */ if (zone->uz_init != NULL && - zone->uz_init != mtrash_init && zone->uz_init(item, zone->uz_size, flags) != 0) return (NULL); if (zone->uz_ctor != NULL && - zone->uz_ctor != mtrash_ctor && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { zone->uz_fini(item, zone->uz_size); @@ -2289,7 +2286,7 @@ /* * Now lets just fill a bucket and put it on the free list. If that - * works we'll restart the allocation from the begining and it + * works we'll restart the allocation from the beginning and it * will use the just filled bucket. */ bucket = zone_alloc_bucket(zone, udata, flags); @@ -2370,6 +2367,7 @@ if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) { zone->uz_flags |= UMA_ZFLAG_FULL; zone_log_warning(zone); + zone_maxaction(zone); } if (flags & M_NOWAIT) break; @@ -2489,6 +2487,7 @@ zone->uz_flags |= UMA_ZFLAG_FULL; zone->uz_sleeps++; zone_log_warning(zone); + zone_maxaction(zone); msleep(zone, zone->uz_lockptr, PVM, "zonelimit", hz/100); zone->uz_flags &= ~UMA_ZFLAG_FULL; @@ -2668,6 +2667,9 @@ int lockfail; int cpu; + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif @@ -2674,14 +2676,17 @@ CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zfree_arg: called with spinlock or critical section held")); + /* uma_zfree(..., NULL) does nothing, to match free(9). 
*/ if (item == NULL) return; #ifdef DEBUG_MEMGUARD if (is_memguard_addr(item)) { - if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor) + if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); - if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini) + if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return; @@ -2988,6 +2993,16 @@ } /* See uma.h */ +void +uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) +{ + + ZONE_LOCK(zone); + TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); + ZONE_UNLOCK(zone); +} + +/* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { @@ -3176,26 +3191,6 @@ } /* See uma.h */ -uint32_t * -uma_find_refcnt(uma_zone_t zone, void *item) -{ - uma_slabrefcnt_t slabref; - uma_slab_t slab; - uma_keg_t keg; - uint32_t *refcnt; - int idx; - - slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); - slabref = (uma_slabrefcnt_t)slab; - keg = slab->us_keg; - KASSERT(keg->uk_flags & UMA_ZONE_REFCNT, - ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); - idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; - refcnt = &slabref->us_refcnt[idx]; - return refcnt; -} - -/* See uma.h */ static void uma_reclaim_locked(bool kmem_danger) { @@ -3216,7 +3211,6 @@ * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); - zone_drain(slabrefzone); bucket_zone_drain(); } @@ -3309,9 +3303,10 @@ static void uma_zero_item(void *item, uma_zone_t zone) { + int i; if (zone->uz_flags & UMA_ZONE_PCPU) { - for (int i = 0; i < mp_ncpus; i++) + CPU_FOREACH(i) bzero(zpcpu_get_cpu(item, i), zone->uz_size); } else bzero(item, zone->uz_size); @@ -3447,7 +3442,7 @@ { struct uma_stream_header ush; struct uma_type_header uth; - struct uma_percpu_stat ups; + struct uma_percpu_stat *ups; uma_bucket_t bucket; struct sbuf sbuf; uma_cache_t cache; @@ -3461,6 +3456,8 @@ if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); + ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); count = 0; rw_rlock(&uma_rwlock); @@ -3509,7 +3506,6 @@ uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; uth.uth_sleeps = z->uz_sleeps; - (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the @@ -3518,30 +3514,31 @@ * accept the possible race associated with bucket * exchange during monitoring. 
*/ - for (i = 0; i < (mp_maxid + 1); i++) { - bzero(&ups, sizeof(ups)); - if (kz->uk_flags & UMA_ZFLAG_INTERNAL) - goto skip; - if (CPU_ABSENT(i)) - goto skip; + for (i = 0; i < mp_maxid + 1; i++) { + bzero(&ups[i], sizeof(*ups)); + if (kz->uk_flags & UMA_ZFLAG_INTERNAL || + CPU_ABSENT(i)) + continue; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) - ups.ups_cache_free += + ups[i].ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) - ups.ups_cache_free += + ups[i].ups_cache_free += cache->uc_freebucket->ub_cnt; - ups.ups_allocs = cache->uc_allocs; - ups.ups_frees = cache->uc_frees; -skip: - (void)sbuf_bcat(&sbuf, &ups, sizeof(ups)); + ups[i].ups_allocs = cache->uc_allocs; + ups[i].ups_frees = cache->uc_frees; } ZONE_UNLOCK(z); + (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); + for (i = 0; i < mp_maxid + 1; i++) + (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); + free(ups, M_TEMP); return (error); } @@ -3549,16 +3546,13 @@ sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; - int error, max, old; + int error, max; - old = max = uma_zone_get_max(zone); + max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); - if (max < old) - return (EINVAL); - uma_zone_set_max(zone, max); return (0); @@ -3574,6 +3568,102 @@ return (sysctl_handle_int(oidp, &cur, 0, req)); } +#ifdef INVARIANTS +static uma_slab_t +uma_dbg_getslab(uma_zone_t zone, void *item) +{ + uma_slab_t slab; + uma_keg_t keg; + uint8_t *mem; + + mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); + if (zone->uz_flags & UMA_ZONE_VTOSLAB) { + slab = vtoslab((vm_offset_t)mem); + } else { + /* + * It is safe to return the slab here even though the + * zone is unlocked because the item's allocation state + * essentially holds a reference. + */ + ZONE_LOCK(zone); + keg = LIST_FIRST(&zone->uz_kegs)->kl_keg; + if (keg->uk_flags & UMA_ZONE_HASH) + slab = hash_sfind(&keg->uk_hash, mem); + else + slab = (uma_slab_t)(mem + keg->uk_pgoff); + ZONE_UNLOCK(zone); + } + + return (slab); +} + +/* + * Set up the slab's freei data such that uma_dbg_free can function. + * + */ +static void +uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) +{ + uma_keg_t keg; + int freei; + + if (zone_first_keg(zone) == NULL) + return; + if (slab == NULL) { + slab = uma_dbg_getslab(zone, item); + if (slab == NULL) + panic("uma: item %p did not belong to zone %s\n", + item, zone->uz_name); + } + keg = slab->us_keg; + freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; + + if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) + panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n", + item, zone, zone->uz_name, slab, freei); + BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); + + return; +} + +/* + * Verifies freed addresses. Checks for alignment, valid slab membership + * and duplicate frees. 
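Editorial note: uma_dbg_alloc() and uma_dbg_free() above track each slab item in the us_debugfree bitset and panic on a duplicate allocation, a duplicate free, or a misaligned/out-of-range free. The userland sketch below captures the bit-per-slot bookkeeping with a plain mask; SLOTS and the helper names are invented.

#include <stdio.h>
#include <stdlib.h>

#define SLOTS 32    /* stands in for the keg's items-per-slab */

/* Toy slab: one bit per item, set while the item is allocated. */
static unsigned long debugbits;

static void
dbg_alloc(int freei)
{

    if (debugbits & (1UL << freei)) {
        fprintf(stderr, "duplicate alloc of slot %d\n", freei);
        abort();
    }
    debugbits |= 1UL << freei;
}

static void
dbg_free(int freei)
{

    if (freei < 0 || freei >= SLOTS) {
        fprintf(stderr, "invalid free of slot %d\n", freei);
        abort();
    }
    if ((debugbits & (1UL << freei)) == 0) {
        fprintf(stderr, "duplicate free of slot %d\n", freei);
        abort();
    }
    debugbits &= ~(1UL << freei);
}

int
main(void)
{

    dbg_alloc(3);
    dbg_free(3);
    dbg_free(3);    /* caught: the bit is already clear */
    return (0);
}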
+ * + */ +static void +uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) +{ + uma_keg_t keg; + int freei; + + if (zone_first_keg(zone) == NULL) + return; + if (slab == NULL) { + slab = uma_dbg_getslab(zone, item); + if (slab == NULL) + panic("uma: Freed item %p did not belong to zone %s\n", + item, zone->uz_name); + } + keg = slab->us_keg; + freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; + + if (freei >= keg->uk_ipers) + panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n", + item, zone, zone->uz_name, slab, freei); + + if (((freei * keg->uk_rsize) + slab->us_data) != item) + panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n", + item, zone, zone->uz_name, slab, freei); + + if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) + panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n", + item, zone, zone->uz_name, slab, freei); + + BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); +} +#endif /* INVARIANTS */ + #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { @@ -3631,4 +3721,4 @@ return; } } -#endif +#endif /* DDB */ Modified: trunk/sys/vm/uma_dbg.c =================================================================== --- trunk/sys/vm/uma_dbg.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/uma_dbg.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -32,8 +32,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/uma_dbg.c 301176 2016-06-01 22:31:35Z markj $"); +#include "opt_vm.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/bitset.h> @@ -50,6 +52,7 @@ #include <vm/uma.h> #include <vm/uma_int.h> #include <vm/uma_dbg.h> +#include <vm/memguard.h> static const uint32_t uma_junk = 0xdeadc0de; @@ -58,7 +61,6 @@ * prior to subsequent reallocation. 
* * Complies with standard ctor arg/return - * */ int trash_ctor(void *mem, int size, void *arg, int flags) @@ -66,12 +68,22 @@ int cnt; uint32_t *p; +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(mem)) + return (0); +#endif + cnt = size / sizeof(uma_junk); for (p = mem; cnt > 0; cnt--, p++) if (*p != uma_junk) { +#ifdef INVARIANTS + panic("Memory modified after free %p(%d) val=%x @ %p\n", + mem, size, *p, p); +#else printf("Memory modified after free %p(%d) val=%x @ %p\n", mem, size, *p, p); +#endif return (0); } return (0); @@ -89,6 +101,11 @@ int cnt; uint32_t *p; +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(mem)) + return; +#endif + cnt = size / sizeof(uma_junk); for (p = mem; cnt > 0; cnt--, p++) @@ -127,6 +144,11 @@ uint32_t *p = mem; int cnt; +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(mem)) + return (0); +#endif + size -= sizeof(struct malloc_type *); ksp = (struct malloc_type **)mem; ksp += size / sizeof(struct malloc_type *); @@ -154,6 +176,11 @@ int cnt; uint32_t *p; +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(mem)) + return; +#endif + size -= sizeof(struct malloc_type *); cnt = size / sizeof(uma_junk); @@ -172,6 +199,11 @@ { struct malloc_type **ksp; +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(mem)) + return (0); +#endif + mtrash_dtor(mem, size, NULL); ksp = (struct malloc_type **)mem; @@ -192,100 +224,3 @@ { (void)mtrash_ctor(mem, size, NULL, 0); } - -#ifdef INVARIANTS -static uma_slab_t -uma_dbg_getslab(uma_zone_t zone, void *item) -{ - uma_slab_t slab; - uma_keg_t keg; - uint8_t *mem; - - mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); - if (zone->uz_flags & UMA_ZONE_VTOSLAB) { - slab = vtoslab((vm_offset_t)mem); - } else { - /* - * It is safe to return the slab here even though the - * zone is unlocked because the item's allocation state - * essentially holds a reference. - */ - ZONE_LOCK(zone); - keg = LIST_FIRST(&zone->uz_kegs)->kl_keg; - if (keg->uk_flags & UMA_ZONE_HASH) - slab = hash_sfind(&keg->uk_hash, mem); - else - slab = (uma_slab_t)(mem + keg->uk_pgoff); - ZONE_UNLOCK(zone); - } - - return (slab); -} - -/* - * Set up the slab's freei data such that uma_dbg_free can function. - * - */ -void -uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) -{ - uma_keg_t keg; - int freei; - - if (zone_first_keg(zone) == NULL) - return; - if (slab == NULL) { - slab = uma_dbg_getslab(zone, item); - if (slab == NULL) - panic("uma: item %p did not belong to zone %s\n", - item, zone->uz_name); - } - keg = slab->us_keg; - freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; - - if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) - panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n", - item, zone, zone->uz_name, slab, freei); - BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); - - return; -} - -/* - * Verifies freed addresses. Checks for alignment, valid slab membership - * and duplicate frees. 
- * - */ -void -uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) -{ - uma_keg_t keg; - int freei; - - if (zone_first_keg(zone) == NULL) - return; - if (slab == NULL) { - slab = uma_dbg_getslab(zone, item); - if (slab == NULL) - panic("uma: Freed item %p did not belong to zone %s\n", - item, zone->uz_name); - } - keg = slab->us_keg; - freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; - - if (freei >= keg->uk_ipers) - panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n", - item, zone, zone->uz_name, slab, freei); - - if (((freei * keg->uk_rsize) + slab->us_data) != item) - panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n", - item, zone, zone->uz_name, slab, freei); - - if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) - panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n", - item, zone, zone->uz_name, slab, freei); - - BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); -} - -#endif /* INVARIANTS */ Modified: trunk/sys/vm/uma_dbg.h =================================================================== --- trunk/sys/vm/uma_dbg.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/uma_dbg.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -25,7 +25,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $ + * $FreeBSD: stable/11/sys/vm/uma_dbg.h 295221 2016-02-03 22:02:36Z glebius $ * */ @@ -50,7 +50,4 @@ int mtrash_init(void *mem, int size, int flags); void mtrash_fini(void *mem, int size); -void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); -void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); - #endif /* VM_UMA_DBG_H */ Modified: trunk/sys/vm/uma_int.h =================================================================== --- trunk/sys/vm/uma_int.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/uma_int.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -25,10 +25,13 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $ + * $FreeBSD: stable/11/sys/vm/uma_int.h 344363 2019-02-20 14:12:25Z pfg $ * */ +#include <sys/_bitset.h> +#include <sys/_task.h> + /* * This file includes definitions, structures, prototypes, and inlines that * should not be used outside of the actual implementation of UMA. 
@@ -109,6 +112,8 @@ #define UMA_SLAB_SHIFT PAGE_SHIFT /* Number of bits PAGE_MASK */ #define UMA_BOOT_PAGES 64 /* Pages allocated for startup */ +#define UMA_BOOT_PAGES_ZONES 32 /* Multiplier for pages to reserve */ + /* if uma_zone > PAGE_SIZE */ /* Max waste percentage before going to off page slab management */ #define UMA_MAX_WASTE 10 @@ -140,8 +145,8 @@ struct uma_hash { struct slabhead *uh_slab_hash; /* Hash table for slabs */ - int uh_hashsize; /* Current size of the hash table */ - int uh_hashmask; /* Mask used during hashing */ + u_int uh_hashsize; /* Current size of the hash table */ + u_int uh_hashmask; /* Mask used during hashing */ }; /* @@ -207,7 +212,7 @@ vm_offset_t uk_kva; /* Zone base KVA */ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */ - uint16_t uk_pgoff; /* Offset to uma_slab struct */ + uint32_t uk_pgoff; /* Offset to uma_slab struct */ uint16_t uk_ppera; /* pages per allocation from backend */ uint16_t uk_ipers; /* Items per slab */ uint32_t uk_flags; /* Internal flags */ @@ -248,17 +253,7 @@ #define us_link us_type._us_link #define us_size us_type._us_size -/* - * The slab structure for UMA_ZONE_REFCNT zones for whose items we - * maintain reference counters in the slab for. - */ -struct uma_slab_refcnt { - struct uma_slab us_head; /* slab header data */ - uint32_t us_refcnt[0]; /* Actually larger. */ -}; - typedef struct uma_slab * uma_slab_t; -typedef struct uma_slab_refcnt * uma_slabrefcnt_t; typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int); struct uma_klink { @@ -303,10 +298,12 @@ uint16_t uz_count; /* Amount of items in full bucket */ uint16_t uz_count_min; /* Minimal amount of items there */ - /* The next three fields are used to print a rate-limited warnings. */ + /* The next two fields are used to print a rate-limited warnings. */ const char *uz_warning; /* Warning to print on failure */ struct timeval uz_ratecheck; /* Warnings rate-limiting */ + struct task uz_maxaction; /* Task to run when at limit */ + /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. @@ -390,7 +387,7 @@ hash_sfind(struct uma_hash *hash, uint8_t *data) { uma_slab_t slab; - int hval; + u_int hval; hval = UMA_HASH(hash, data); @@ -421,7 +418,7 @@ /* * The following two functions may be defined by architecture specific code - * if they can provide more effecient allocation functions. This is useful + * if they can provide more efficient allocation functions. This is useful * for using direct mapped addresses. */ void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, Modified: trunk/sys/vm/vm.h =================================================================== --- trunk/sys/vm/vm.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -56,7 +56,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $ + * $FreeBSD: stable/11/sys/vm/vm.h 331921 2018-04-03 09:38:53Z kib $ */ #ifndef VM_H @@ -79,7 +79,9 @@ #define VM_PROT_WRITE ((vm_prot_t) 0x02) #define VM_PROT_EXECUTE ((vm_prot_t) 0x04) #define VM_PROT_COPY ((vm_prot_t) 0x08) /* copy-on-read */ -#define VM_PROT_FAULT_LOOKUP ((vm_prot_t) 0x010) +#define VM_PROT_PRIV_FLAG ((vm_prot_t) 0x10) +#define VM_PROT_FAULT_LOOKUP VM_PROT_PRIV_FLAG +#define VM_PROT_QUICK_NOFAULT VM_PROT_PRIV_FLAG /* same to save bits */ #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) @@ -112,8 +114,9 @@ typedef int boolean_t; /* - * The exact set of memory attributes is machine dependent. However, every - * machine is required to define VM_MEMATTR_DEFAULT. + * The exact set of memory attributes is machine dependent. However, + * every machine is required to define VM_MEMATTR_DEFAULT and + * VM_MEMATTR_UNCACHEABLE. */ typedef char vm_memattr_t; /* memory attribute codes */ Added: trunk/sys/vm/vm_domain.c =================================================================== --- trunk/sys/vm/vm_domain.c (rev 0) +++ trunk/sys/vm/vm_domain.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -0,0 +1,401 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_domain.c 312714 2017-01-24 19:39:24Z mjg $"); + +#include "opt_vm.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#ifdef VM_NUMA_ALLOC +#include <sys/proc.h> +#endif +#include <sys/queue.h> +#include <sys/rwlock.h> +#include <sys/sbuf.h> +#include <sys/sysctl.h> +#include <sys/tree.h> +#include <sys/vmmeter.h> +#include <sys/seq.h> + +#include <ddb/ddb.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_phys.h> + +#include <vm/vm_domain.h> + +#ifdef VM_NUMA_ALLOC +static __inline int +vm_domain_rr_selectdomain(int skip_domain) +{ + struct thread *td; + + td = curthread; + + td->td_dom_rr_idx++; + td->td_dom_rr_idx %= vm_ndomains; + + /* + * If skip_domain is provided then skip over that + * domain. This is intended for round robin variants + * which first try a fixed domain. + */ + if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) { + td->td_dom_rr_idx++; + td->td_dom_rr_idx %= vm_ndomains; + } + return (td->td_dom_rr_idx); +} +#endif + +/* + * This implements a very simple set of VM domain memory allocation + * policies and iterators. + */ + +/* + * A VM domain policy represents a desired VM domain policy. + * Iterators implement searching through VM domains in a specific + * order. + */ + +/* + * When setting a policy, the caller must establish their own + * exclusive write protection for the contents of the domain + * policy. + */ +int +vm_domain_policy_init(struct vm_domain_policy *vp) +{ + + bzero(vp, sizeof(*vp)); + vp->p.policy = VM_POLICY_NONE; + vp->p.domain = -1; + return (0); +} + +int +vm_domain_policy_set(struct vm_domain_policy *vp, + vm_domain_policy_type_t vt, int domain) +{ + + seq_write_begin(&vp->seq); + vp->p.policy = vt; + vp->p.domain = domain; + seq_write_end(&vp->seq); + return (0); +} + +/* + * Take a local copy of a policy. + * + * The destination policy isn't write-barriered; this is used + * for doing local copies into something that isn't shared. + */ +void +vm_domain_policy_localcopy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src) +{ + seq_t seq; + + for (;;) { + seq = seq_read(&src->seq); + *dst = *src; + if (seq_consistent(&src->seq, seq)) + return; + } +} + +/* + * Take a write-barrier copy of a policy. + * + * The destination policy is write -barriered; this is used + * for doing copies into policies that may be read by other + * threads. 
+ */ +void +vm_domain_policy_copy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src) +{ + seq_t seq; + struct vm_domain_policy d; + + for (;;) { + seq = seq_read(&src->seq); + d = *src; + if (seq_consistent(&src->seq, seq)) { + seq_write_begin(&dst->seq); + dst->p.domain = d.p.domain; + dst->p.policy = d.p.policy; + seq_write_end(&dst->seq); + return; + } + } +} + +int +vm_domain_policy_validate(const struct vm_domain_policy *vp) +{ + + switch (vp->p.policy) { + case VM_POLICY_NONE: + case VM_POLICY_ROUND_ROBIN: + case VM_POLICY_FIRST_TOUCH: + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + if (vp->p.domain == -1) + return (0); + return (-1); + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: +#ifdef VM_NUMA_ALLOC + if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains) + return (0); +#else + if (vp->p.domain == 0) + return (0); +#endif + return (-1); + default: + return (-1); + } + return (-1); +} + +int +vm_domain_policy_cleanup(struct vm_domain_policy *vp) +{ + + /* For now, empty */ + return (0); +} + +int +vm_domain_iterator_init(struct vm_domain_iterator *vi) +{ + + /* Nothing to do for now */ + return (0); +} + +/* + * Manually setup an iterator with the given details. + */ +int +vm_domain_iterator_set(struct vm_domain_iterator *vi, + vm_domain_policy_type_t vt, int domain) +{ + +#ifdef VM_NUMA_ALLOC + switch (vt) { + case VM_POLICY_FIXED_DOMAIN: + vi->policy = VM_POLICY_FIXED_DOMAIN; + vi->domain = domain; + vi->n = 1; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN; + vi->domain = domain; + vi->n = vm_ndomains; + break; + case VM_POLICY_FIRST_TOUCH: + vi->policy = VM_POLICY_FIRST_TOUCH; + vi->domain = PCPU_GET(domain); + vi->n = 1; + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN; + vi->domain = PCPU_GET(domain); + vi->n = vm_ndomains; + break; + case VM_POLICY_ROUND_ROBIN: + default: + vi->policy = VM_POLICY_ROUND_ROBIN; + vi->domain = -1; + vi->n = vm_ndomains; + break; + } +#else + vi->domain = 0; + vi->n = 1; +#endif + return (0); +} + +/* + * Setup an iterator based on the given policy. + */ +static inline void +_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt) +{ + +#ifdef VM_NUMA_ALLOC + /* + * Initialise the iterator. + * + * For first-touch, the initial domain is set + * via the current thread CPU domain. + * + * For fixed-domain, it's assumed that the + * caller has initialised the specific domain + * it is after. + */ + switch (vt->p.policy) { + case VM_POLICY_FIXED_DOMAIN: + vi->policy = vt->p.policy; + vi->domain = vt->p.domain; + vi->n = 1; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + vi->policy = vt->p.policy; + vi->domain = vt->p.domain; + vi->n = vm_ndomains; + break; + case VM_POLICY_FIRST_TOUCH: + vi->policy = vt->p.policy; + vi->domain = PCPU_GET(domain); + vi->n = 1; + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + vi->policy = vt->p.policy; + vi->domain = PCPU_GET(domain); + vi->n = vm_ndomains; + break; + case VM_POLICY_ROUND_ROBIN: + default: + /* + * Default to round-robin policy. 
+ */ + vi->policy = VM_POLICY_ROUND_ROBIN; + vi->domain = -1; + vi->n = vm_ndomains; + break; + } +#else + vi->domain = 0; + vi->n = 1; +#endif +} + +void +vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt) +{ + seq_t seq; + struct vm_domain_policy vt_lcl; + + for (;;) { + seq = seq_read(&vt->seq); + vt_lcl = *vt; + if (seq_consistent(&vt->seq, seq)) { + _vm_domain_iterator_set_policy(vi, &vt_lcl); + return; + } + } +} + +/* + * Return the next VM domain to use. + * + * Returns 0 w/ domain set to the next domain to use, or + * -1 to indicate no more domains are available. + */ +int +vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain) +{ + + /* General catch-all */ + if (vi->n <= 0) + return (-1); + +#ifdef VM_NUMA_ALLOC + switch (vi->policy) { + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIRST_TOUCH: + *domain = vi->domain; + vi->n--; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + /* + * XXX TODO: skip over the rr'ed domain + * if it equals the one we started with. + */ + if (vi->n == vm_ndomains) + *domain = vi->domain; + else + *domain = vm_domain_rr_selectdomain(vi->domain); + vi->n--; + break; + case VM_POLICY_ROUND_ROBIN: + default: + *domain = vm_domain_rr_selectdomain(-1); + vi->n--; + break; + } +#else + *domain = 0; + vi->n--; +#endif + + return (0); +} + +/* + * Returns 1 if the iteration is done, or 0 if it has not. + + * This can only be called after at least one loop through + * the iterator. Ie, it's designed to be used as a tail + * check of a loop, not the head check of a loop. + */ +int +vm_domain_iterator_isdone(struct vm_domain_iterator *vi) +{ + + return (vi->n <= 0); +} + +int +vm_domain_iterator_cleanup(struct vm_domain_iterator *vi) +{ + + return (0); +} Property changes on: trunk/sys/vm/vm_domain.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/vm/vm_domain.h =================================================================== --- trunk/sys/vm/vm_domain.h (rev 0) +++ trunk/sys/vm/vm_domain.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -0,0 +1,67 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. + * + * $FreeBSD: stable/11/sys/vm/vm_domain.h 285387 2015-07-11 15:21:37Z adrian $ + */ +#ifndef __VM_DOMAIN_H__ +#define __VM_DOMAIN_H__ + +#include <sys/_vm_domain.h> + +struct vm_domain_iterator { + vm_domain_policy_type_t policy; + int domain; + int n; +}; + +/* + * TODO: check to see if these should just become inline functions + * at some point. + */ +extern int vm_domain_policy_init(struct vm_domain_policy *vp); +extern int vm_domain_policy_set(struct vm_domain_policy *vp, + vm_domain_policy_type_t vt, int domain); +extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp); +extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src); +extern void vm_domain_policy_copy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src); +extern int vm_domain_policy_validate(const struct vm_domain_policy *vp); + +extern int vm_domain_iterator_init(struct vm_domain_iterator *vi); +extern int vm_domain_iterator_set(struct vm_domain_iterator *vi, + vm_domain_policy_type_t vt, int domain); +extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt); +extern int vm_domain_iterator_run(struct vm_domain_iterator *vi, + int *domain); +extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi); +extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi); + +#endif /* __VM_DOMAIN_H__ */ Property changes on: trunk/sys/vm/vm_domain.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/vm/vm_extern.h =================================================================== --- trunk/sys/vm/vm_extern.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_extern.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 - * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_extern.h 337262 2018-08-03 15:42:39Z markj $ */ #ifndef _VM_EXTERN_H_ @@ -41,6 +41,8 @@ struct vmem; #ifdef _KERNEL +struct cdev; +struct cdevsw; /* These operate on kernel virtual addresses only. */ vm_offset_t kva_alloc(vm_size_t); @@ -64,6 +66,7 @@ void kmem_unback(vm_object_t, vm_offset_t, vm_size_t); /* Bootstrapping. 
*/ +void kmem_bootstrap_free(vm_offset_t, vm_size_t); vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, boolean_t); void kmem_init(vm_offset_t, vm_offset_t); @@ -70,7 +73,6 @@ void kmem_init_zero_region(void); void kmeminit(void); -void swapout_procs(int); int kernacc(void *, int, int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); @@ -82,10 +84,18 @@ int fault_flags, vm_page_t *m_hold); int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count); -int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); +int vm_forkproc(struct thread *, struct proc *, struct thread *, + struct vmspace *, int); void vm_waitproc(struct proc *); -int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); +int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, + objtype_t, void *, vm_ooffset_t); +int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, + vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *); int vm_mmap_to_errno(int rv); +int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, + int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *); +int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, + struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); void vm_set_page_size(void); void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t); typedef int (*pmap_pinit_t)(struct pmap *pmap); @@ -97,6 +107,7 @@ struct vmspace *vmspace_acquire_ref(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); +void vmspace_switch_aio(struct vmspace *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); @@ -104,6 +115,5 @@ void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); -int vm_mlock(struct proc *, struct ucred *, const void *, size_t); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Modified: trunk/sys/vm/vm_fault.c =================================================================== --- trunk/sys/vm/vm_fault.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_fault.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -73,7 +73,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_fault.c 345572 2019-03-27 11:03:07Z kib $"); #include "opt_ktrace.h" #include "opt_vm.h" @@ -82,7 +82,9 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> +#include <sys/mman.h> #include <sys/proc.h> +#include <sys/racct.h> #include <sys/resourcevar.h> #include <sys/rwlock.h> #include <sys/sysctl.h> @@ -107,14 +109,11 @@ #define PFBAK 4 #define PFFOR 4 -static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *); - -#define VM_FAULT_READ_BEHIND 8 +#define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) -#define VM_FAULT_NINCR (VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND) -#define VM_FAULT_SUM (VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2) -#define VM_FAULT_CACHE_BEHIND (VM_FAULT_READ_BEHIND * VM_FAULT_SUM) +#define VM_FAULT_DONTNEED_MIN 1048576 + struct faultstate { vm_page_t m; vm_object_t object; @@ -124,14 +123,15 @@ vm_pindex_t first_pindex; vm_map_t map; 
vm_map_entry_t entry; - int lookup_still_valid; int map_generation; + bool lookup_still_valid; struct vnode *vp; }; -static void vm_fault_cache_behind(const struct faultstate *fs, int distance); +static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, + int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, - int faultcount, int reqpage); + int backward, int forward, bool obj_locked); static inline void release_page(struct faultstate *fs) @@ -150,7 +150,7 @@ if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); - fs->lookup_still_valid = FALSE; + fs->lookup_still_valid = false; } } @@ -237,14 +237,15 @@ * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * - * Also tell the backing pager, if any, that it should remove - * any swap backing since the page is now dirty. + * Also, since the page is now dirty, we can possibly tell + * the pager to release any swap backing the page. Calling + * the pager requires a write lock on the object. */ if (need_dirty) vm_page_dirty(m); if (!set_wd) vm_page_unlock(m); - if (need_dirty) + else if (need_dirty) vm_pager_page_unswapped(m); } @@ -267,8 +268,12 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { - vm_page_t m; - int rv; + vm_page_t m, m_map; +#if defined(__amd64__) && VM_NRESERVLEVEL > 0 + vm_page_t m_super; + int flags; +#endif + int psind, rv; MPASS(fs->vp == NULL); m = vm_page_lookup(fs->first_object, fs->first_pindex); @@ -276,20 +281,204 @@ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) return (KERN_FAILURE); - rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | - PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0); + m_map = m; + psind = 0; +#if defined(__amd64__) && VM_NRESERVLEVEL > 0 + if ((m->flags & PG_FICTITIOUS) == 0 && + (m_super = vm_reserv_to_superpage(m)) != NULL && + rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && + roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && + (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & + (pagesizes[m_super->psind] - 1)) && + pmap_ps_enabled(fs->map->pmap)) { + flags = PS_ALL_VALID; + if ((prot & VM_PROT_WRITE) != 0) { + /* + * Create a superpage mapping allowing write access + * only if none of the constituent pages are busy and + * all of them are already dirty (except possibly for + * the page that was faulted on). + */ + flags |= PS_NONE_BUSY; + if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) + flags |= PS_ALL_DIRTY; + } + if (vm_page_ps_test(m_super, flags, m)) { + m_map = m_super; + psind = m_super->psind; + vaddr = rounddown2(vaddr, pagesizes[psind]); + /* Preset the modified bit for dirty superpages. */ + if ((flags & PS_ALL_DIRTY) != 0) + fault_type |= VM_PROT_WRITE; + } + } +#endif + rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type | + PMAP_ENTER_NOSLEEP | (wired ? 
PMAP_ENTER_WIRED : 0), psind); if (rv != KERN_SUCCESS) return (rv); vm_fault_fill_hold(m_hold, m); vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false); + if (psind == 0 && !wired) + vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true); VM_OBJECT_RUNLOCK(fs->first_object); - if (!wired) - vm_fault_prefault(fs, vaddr, 0, 0); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } +static void +vm_fault_restore_map_lock(struct faultstate *fs) +{ + + VM_OBJECT_ASSERT_WLOCKED(fs->first_object); + MPASS(fs->first_object->paging_in_progress > 0); + + if (!vm_map_trylock_read(fs->map)) { + VM_OBJECT_WUNLOCK(fs->first_object); + vm_map_lock_read(fs->map); + VM_OBJECT_WLOCK(fs->first_object); + } + fs->lookup_still_valid = true; +} + +static void +vm_fault_populate_check_page(vm_page_t m) +{ + + /* + * Check each page to ensure that the pager is obeying the + * interface: the page must be installed in the object, fully + * valid, and exclusively busied. + */ + MPASS(m != NULL); + MPASS(m->valid == VM_PAGE_BITS_ALL); + MPASS(vm_page_xbusied(m)); +} + +static void +vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, + vm_pindex_t last) +{ + vm_page_t m; + vm_pindex_t pidx; + + VM_OBJECT_ASSERT_WLOCKED(object); + MPASS(first <= last); + for (pidx = first, m = vm_page_lookup(object, pidx); + pidx <= last; pidx++, m = vm_page_next(m)) { + vm_fault_populate_check_page(m); + vm_page_lock(m); + vm_page_deactivate(m); + vm_page_unlock(m); + vm_page_xunbusy(m); + } +} + +static int +vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, + int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) +{ + vm_page_t m; + vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; + int rv; + + MPASS(fs->object == fs->first_object); + VM_OBJECT_ASSERT_WLOCKED(fs->first_object); + MPASS(fs->first_object->paging_in_progress > 0); + MPASS(fs->first_object->backing_object == NULL); + MPASS(fs->lookup_still_valid); + + pager_first = OFF_TO_IDX(fs->entry->offset); + pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; + unlock_map(fs); + unlock_vp(fs); + + /* + * Call the pager (driver) populate() method. + * + * There is no guarantee that the method will be called again + * if the current fault is for read, and a future fault is + * for write. Report the entry's maximum allowed protection + * to the driver. + */ + rv = vm_pager_populate(fs->first_object, fs->first_pindex, + fault_type, fs->entry->max_protection, &pager_first, &pager_last); + + VM_OBJECT_ASSERT_WLOCKED(fs->first_object); + if (rv == VM_PAGER_BAD) { + /* + * VM_PAGER_BAD is the backdoor for a pager to request + * normal fault handling. + */ + vm_fault_restore_map_lock(fs); + if (fs->map->timestamp != fs->map_generation) + return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ + return (KERN_NOT_RECEIVER); + } + if (rv != VM_PAGER_OK) + return (KERN_FAILURE); /* AKA SIGSEGV */ + + /* Ensure that the driver is obeying the interface. */ + MPASS(pager_first <= pager_last); + MPASS(fs->first_pindex <= pager_last); + MPASS(fs->first_pindex >= pager_first); + MPASS(pager_last < fs->first_object->size); + + vm_fault_restore_map_lock(fs); + if (fs->map->timestamp != fs->map_generation) { + vm_fault_populate_cleanup(fs->first_object, pager_first, + pager_last); + return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ + } + + /* + * The map is unchanged after our last unlock. Process the fault. 
+ * + * The range [pager_first, pager_last] that is given to the + * pager is only a hint. The pager may populate any range + * within the object that includes the requested page index. + * In case the pager expanded the range, clip it to fit into + * the map entry. + */ + map_first = OFF_TO_IDX(fs->entry->offset); + if (map_first > pager_first) { + vm_fault_populate_cleanup(fs->first_object, pager_first, + map_first - 1); + pager_first = map_first; + } + map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; + if (map_last < pager_last) { + vm_fault_populate_cleanup(fs->first_object, map_last + 1, + pager_last); + pager_last = map_last; + } + for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); + pidx <= pager_last; pidx++, m = vm_page_next(m)) { + vm_fault_populate_check_page(m); + vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, + true); + VM_OBJECT_WUNLOCK(fs->first_object); + pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) - + fs->entry->offset, m, prot, fault_type | (wired ? + PMAP_ENTER_WIRED : 0), 0); + VM_OBJECT_WLOCK(fs->first_object); + if (pidx == fs->first_pindex) + vm_fault_fill_hold(m_hold, m); + vm_page_lock(m); + if ((fault_flags & VM_FAULT_WIRE) != 0) { + KASSERT(wired, ("VM_FAULT_WIRE && !wired")); + vm_page_wire(m); + } else { + vm_page_activate(m); + } + vm_page_unlock(m); + vm_page_xunbusy(m); + } + curthread->td_ru.ru_majflt++; + return (KERN_SUCCESS); +} + /* * vm_fault: * @@ -334,21 +523,23 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { - vm_prot_t prot; - long ahead, behind; - int alloc_req, era, faultcount, nera, reqpage, result; - boolean_t dead, is_first_object_locked, wired; - vm_object_t next_object; - vm_page_t marray[VM_FAULT_READ_MAX]; - int hardfault; struct faultstate fs; struct vnode *vp; - int locked, error; + vm_object_t next_object, retry_object; + vm_offset_t e_end, e_start; + vm_pindex_t retry_pindex; + vm_prot_t prot, retry_prot; + int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; + int locked, nera, result, rv; + u_char behavior; + boolean_t wired; /* Passed by reference. */ + bool dead, hardfault, is_first_object_locked; - hardfault = 0; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; - faultcount = reqpage = 0; + faultcount = 0; + nera = -1; + hardfault = false; RetryFault:; @@ -415,10 +606,10 @@ (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) { - result = vm_fault_soft_fast(&fs, vaddr, prot, - fault_type, fault_flags, wired, m_hold); - if (result == KERN_SUCCESS) - return (result); + rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type, + fault_flags, wired, m_hold); + if (rv == KERN_SUCCESS) + return (rv); } if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); @@ -435,13 +626,12 @@ * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. - * truncation operations) during I/O. This must be done after - * obtaining the vnode lock in order to avoid possible deadlocks. + * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); - fs.lookup_still_valid = TRUE; + fs.lookup_still_valid = true; fs.first_m = NULL; @@ -534,11 +724,13 @@ goto readrest; break; } + KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m)); /* - * Page is not resident. 
If this is the search termination - * or the pager might contain the page, allocate a new page. - * Default objects are zero-fill, there is no real pager. + * Page is not resident. If the pager might contain the page + * or this is the beginning of the search, allocate a new + * page. (Default objects are zero-fill, so there is no real + * pager for them.) */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { @@ -547,6 +739,30 @@ return (KERN_PROTECTION_FAILURE); } + if (fs.object == fs.first_object && + (fs.first_object->flags & OBJ_POPULATE) != 0 && + fs.first_object->shadow_count == 0) { + rv = vm_fault_populate(&fs, vaddr, prot, + fault_type, fault_flags, wired, m_hold); + switch (rv) { + case KERN_SUCCESS: + case KERN_FAILURE: + unlock_and_deallocate(&fs); + return (rv); + case KERN_RESOURCE_SHORTAGE: + unlock_and_deallocate(&fs); + goto RetryFault; + case KERN_NOT_RECEIVER: + /* + * Pager's populate() method + * returned VM_PAGER_BAD. + */ + break; + default: + panic("inconsistent return codes"); + } + } + /* * Allocate a new page for this object/offset pair. * @@ -555,14 +771,10 @@ * there, and allocation can fail, causing * restart and new reading of the p_flag. */ - fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 - if ((fs.object->flags & OBJ_COLORED) == 0) { - fs.object->flags |= OBJ_COLORED; - fs.object->pg_color = atop(vaddr) - - fs.pindex; - } + vm_object_color(fs.object, atop(vaddr) - + fs.pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; @@ -576,80 +788,113 @@ unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; - } else if (fs.m->valid == VM_PAGE_BITS_ALL) - break; + } } readrest: /* - * We have found a valid page or we have allocated a new page. - * The page thus may not be valid or may not be entirely - * valid. + * At this point, we have either allocated a new page or found + * an existing page that is only partially valid. * - * Attempt to fault-in the page if there is a chance that the - * pager has it, and potentially fault in additional pages - * at the same time. For default objects simply provide - * zero-filled pages. + * We hold a reference on the current object and the page is + * exclusive busied. */ - if (fs.object->type != OBJT_DEFAULT) { - int rv; - u_char behavior = vm_map_entry_behavior(fs.entry); - if (behavior == MAP_ENTRY_BEHAV_RANDOM || - P_KILLED(curproc)) { - behind = 0; - ahead = 0; + /* + * If the pager for the current object might have the page, + * then determine the number of additional pages to read and + * potentially reprioritize previously read pages for earlier + * reclamation. These operations should only be performed + * once per page fault. Even if the current pager doesn't + * have the page, the number of additional pages to read will + * apply to subsequent objects in the shadow chain. 
+ */ + if (fs.object->type != OBJT_DEFAULT && nera == -1 && + !P_KILLED(curproc)) { + KASSERT(fs.lookup_still_valid, ("map unlocked")); + era = fs.entry->read_ahead; + behavior = vm_map_entry_behavior(fs.entry); + if (behavior == MAP_ENTRY_BEHAV_RANDOM) { + nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { - behind = 0; - ahead = atop(fs.entry->end - vaddr) - 1; - if (ahead > VM_FAULT_READ_AHEAD_MAX) - ahead = VM_FAULT_READ_AHEAD_MAX; - if (fs.pindex == fs.entry->next_read) - vm_fault_cache_behind(&fs, - VM_FAULT_READ_MAX); - } else { + nera = VM_FAULT_READ_AHEAD_MAX; + if (vaddr == fs.entry->next_read) + vm_fault_dontneed(&fs, vaddr, nera); + } else if (vaddr == fs.entry->next_read) { /* - * If this is a sequential page fault, then - * arithmetically increase the number of pages - * in the read-ahead window. Otherwise, reset - * the read-ahead window to its smallest size. + * This is a sequential fault. Arithmetically + * increase the requested number of pages in + * the read-ahead window. The requested + * number of pages is "# of sequential faults + * x (read ahead min + 1) + read ahead min" */ - behind = atop(vaddr - fs.entry->start); - if (behind > VM_FAULT_READ_BEHIND) - behind = VM_FAULT_READ_BEHIND; - ahead = atop(fs.entry->end - vaddr) - 1; - era = fs.entry->read_ahead; - if (fs.pindex == fs.entry->next_read) { - nera = era + behind; + nera = VM_FAULT_READ_AHEAD_MIN; + if (era > 0) { + nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; - behind = 0; - if (ahead > nera) - ahead = nera; - if (era == VM_FAULT_READ_AHEAD_MAX) - vm_fault_cache_behind(&fs, - VM_FAULT_CACHE_BEHIND); - } else if (ahead > VM_FAULT_READ_AHEAD_MIN) - ahead = VM_FAULT_READ_AHEAD_MIN; - if (era != ahead) - fs.entry->read_ahead = ahead; + } + if (era == VM_FAULT_READ_AHEAD_MAX) + vm_fault_dontneed(&fs, vaddr, nera); + } else { + /* + * This is a non-sequential fault. + */ + nera = 0; } + if (era != nera) { + /* + * A read lock on the map suffices to update + * the read ahead count safely. + */ + fs.entry->read_ahead = nera; + } /* - * Call the pager to retrieve the data, if any, after - * releasing the lock on the map. We hold a ref on - * fs.object and the pages are exclusive busied. + * Prepare for unlocking the map. Save the map + * entry's start and end addresses, which are used to + * optimize the size of the pager operation below. + * Even if the map entry's addresses change after + * unlocking the map, using the saved addresses is + * safe. */ + e_start = fs.entry->start; + e_end = fs.entry->end; + } + + /* + * Call the pager to retrieve the page if there is a chance + * that the pager has it, and potentially retrieve additional + * pages at the same time. + */ + if (fs.object->type != OBJT_DEFAULT) { + /* + * Release the map lock before locking the vnode or + * sleeping in the pager. (If the current object has + * a shadow, then an earlier iteration of this loop + * may have already unlocked the map.) + */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE && (vp = fs.object->handle) != fs.vp) { + /* + * Perform an unlock in case the desired vnode + * changed while the map was unlocked during a + * retry. + */ unlock_vp(&fs); + locked = VOP_ISLOCKED(vp); - if (locked != LK_EXCLUSIVE) locked = LK_SHARED; - /* Do not sleep for vnode lock while fs.m is busy */ + + /* + * We must not sleep acquiring the vnode lock + * while we have the page exclusive busied or + * the object's paging-in-progress count + * incremented. Otherwise, we could deadlock. 
+ */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { @@ -670,88 +915,85 @@ ("vm_fault: vnode-backed object mapped by system map")); /* - * now we find out if any other pages should be paged - * in at this time this routine checks to see if the - * pages surrounding this fault reside in the same - * object as the page for this fault. If they do, - * then they are faulted in also into the object. The - * array "marray" returned contains an array of - * vm_page_t structs where one of them is the - * vm_page_t passed to the routine. The reqpage - * return value is the index into the marray for the - * vm_page_t passed to the routine. - * - * fs.m plus the additional pages are exclusive busied. + * Page in the requested page and hint the pager, + * that it may bring up surrounding pages. */ - faultcount = vm_fault_additional_pages( - fs.m, behind, ahead, marray, &reqpage); - - rv = faultcount ? - vm_pager_get_pages(fs.object, marray, faultcount, - reqpage) : VM_PAGER_FAIL; - + if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || + P_KILLED(curproc)) { + behind = 0; + ahead = 0; + } else { + /* Is this a sequential fault? */ + if (nera > 0) { + behind = 0; + ahead = nera; + } else { + /* + * Request a cluster of pages that is + * aligned to a VM_FAULT_READ_DEFAULT + * page offset boundary within the + * object. Alignment to a page offset + * boundary is more likely to coincide + * with the underlying file system + * block than alignment to a virtual + * address boundary. + */ + cluster_offset = fs.pindex % + VM_FAULT_READ_DEFAULT; + behind = ulmin(cluster_offset, + atop(vaddr - e_start)); + ahead = VM_FAULT_READ_DEFAULT - 1 - + cluster_offset; + } + ahead = ulmin(ahead, atop(e_end - vaddr) - 1); + } + rv = vm_pager_get_pages(fs.object, &fs.m, 1, + &behind, &ahead); if (rv == VM_PAGER_OK) { - /* - * Found the page. Leave it busy while we play - * with it. - */ - - /* - * Relookup in case pager changed page. Pager - * is responsible for disposition of old page - * if moved. - */ - fs.m = vm_page_lookup(fs.object, fs.pindex); - if (!fs.m) { - unlock_and_deallocate(&fs); - goto RetryFault; - } - - hardfault++; + faultcount = behind + 1 + ahead; + hardfault = true; break; /* break to PAGE HAS BEEN FOUND */ } - /* - * Remove the bogus page (which does not exist at this - * object/offset); before doing so, we must get back - * our object lock to preserve our invariant. - * - * Also wake up any other process that may want to bring - * in this page. - * - * If this is the top-level object, we must leave the - * busy page to prevent another process from rushing - * past us, and inserting the page in that object at - * the same time that we are. - */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); + /* - * Data outside the range of the pager or an I/O error + * If an I/O error occurred or the requested page was + * outside the range of the pager, clean up and return + * an error. */ - /* - * XXX - the check for kernel_map is a kludge to work - * around having the machine panic on a kernel space - * fault w/ I/O error. - */ - if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || - (rv == VM_PAGER_BAD)) { + if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { vm_page_lock(fs.m); - vm_page_free(fs.m); + if (fs.m->wire_count == 0) + vm_page_free(fs.m); + else + vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); - return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); + return (rv == VM_PAGER_ERROR ? KERN_FAILURE : + KERN_PROTECTION_FAILURE); } + + /* + * The requested page does not exist at this object/ + * offset. Remove the invalid page from the object, + * waking up anyone waiting for it, and continue on to + * the next object. However, if this is the top-level + * object, we must leave the busy page in place to + * prevent another process from rushing past us, and + * inserting the page in that object at the same time + * that we are. + */ if (fs.object != fs.first_object) { vm_page_lock(fs.m); - vm_page_free(fs.m); + if (fs.m->wire_count == 0) + vm_page_free(fs.m); + else + vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; - /* - * XXX - we cannot just fall out at this - * point, m has been freed and is invalid! - */ } } @@ -766,7 +1008,6 @@ * Move on to the next object. Lock the next object before * unlocking the current one. */ - fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* @@ -804,6 +1045,8 @@ vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); + fs.pindex += + OFF_TO_IDX(fs.object->backing_object_offset); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } @@ -836,7 +1079,7 @@ * dirty in the first object so that it will go out * to swap when needed. */ - is_first_object_locked = FALSE; + is_first_object_locked = false; if ( /* * Only one shadow object @@ -860,22 +1103,15 @@ * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { - /* - * get rid of the unnecessary page - */ + vm_page_lock(fs.m); + vm_page_remove(fs.m); + vm_page_unlock(fs.m); vm_page_lock(fs.first_m); + vm_page_replace_checked(fs.m, fs.first_object, + fs.first_pindex, fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); - /* - * grab the page and put it into the - * process'es object. The page is - * automatically made dirty. - */ - if (vm_page_rename(fs.m, fs.first_object, - fs.first_pindex)) { - unlock_and_deallocate(&fs); - goto RetryFault; - } + vm_page_dirty(fs.m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. @@ -884,6 +1120,10 @@ fs.object, OFF_TO_IDX( fs.first_object->backing_object_offset)); #endif + /* + * Removing the page from the backing object + * unbusied it. + */ vm_page_xbusy(fs.m); fs.first_m = fs.m; fs.m = NULL; @@ -905,7 +1145,7 @@ vm_page_unlock(fs.first_m); vm_page_lock(fs.m); - vm_page_unwire(fs.m, FALSE); + vm_page_unwire(fs.m, PQ_INACTIVE); vm_page_unlock(fs.m); } /* @@ -939,16 +1179,12 @@ * lookup. */ if (!fs.lookup_still_valid) { - vm_object_t retry_object; - vm_pindex_t retry_pindex; - vm_prot_t retry_prot; - if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } - fs.lookup_still_valid = TRUE; + fs.lookup_still_valid = true; if (fs.map->timestamp != fs.map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); @@ -986,20 +1222,23 @@ * write-enabled after all. */ prot &= retry_prot; + fault_type &= retry_prot; + if (prot == 0) { + release_page(&fs); + unlock_and_deallocate(&fs); + goto RetryFault; + } } } + /* - * If the page was filled by a pager, update the map entry's - * last read offset. Since the pager does not return the - * actual set of pages that it read, this update is based on - * the requested set. Typically, the requested and actual - * sets are the same. 
- * - * XXX The following assignment modifies the map - * without holding a write lock on it. + * If the page was filled by a pager, save the virtual address that + * should be faulted on next under a sequential access pattern to the + * map entry. A read lock on the map suffices to update this address + * safely. */ if (hardfault) - fs.entry->next_read = fs.pindex + faultcount - reqpage; + fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true); vm_page_assert_xbusied(fs.m); @@ -1022,7 +1261,9 @@ fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 && wired == 0) - vm_fault_prefault(&fs, vaddr, faultcount, reqpage); + vm_fault_prefault(&fs, vaddr, + faultcount > 0 ? behind : PFBAK, + faultcount > 0 ? ahead : PFFOR, false); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); @@ -1049,6 +1290,21 @@ if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; +#ifdef RACCT + if (racct_enable && fs.object->type == OBJT_VNODE) { + PROC_LOCK(curproc); + if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { + racct_add_force(curproc, RACCT_WRITEBPS, + PAGE_SIZE + behind * PAGE_SIZE); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + } else { + racct_add_force(curproc, RACCT_READBPS, + PAGE_SIZE + ahead * PAGE_SIZE); + racct_add_force(curproc, RACCT_READIOPS, 1); + } + PROC_UNLOCK(curproc); + } +#endif } else curthread->td_ru.ru_minflt++; @@ -1056,15 +1312,26 @@ } /* - * Speed up the reclamation of up to "distance" pages that precede the - * faulting pindex within the first object of the shadow chain. + * Speed up the reclamation of pages that precede the faulting pindex within + * the first object of the shadow chain. Essentially, perform the equivalent + * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes + * the faulting pindex by the cluster size when the pages read by vm_fault() + * cross a cluster-size boundary. The cluster size is the greater of the + * smallest superpage size and VM_FAULT_DONTNEED_MIN. + * + * When "fs->first_object" is a shadow object, the pages in the backing object + * that precede the faulting pindex are deactivated by vm_fault(). So, this + * function must only be concerned with pages in the first object. */ static void -vm_fault_cache_behind(const struct faultstate *fs, int distance) +vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { + vm_map_entry_t entry; vm_object_t first_object, object; - vm_page_t m, m_prev; - vm_pindex_t pindex; + vm_offset_t end, start; + vm_page_t m, m_next; + vm_pindex_t pend, pstart; + vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); @@ -1076,32 +1343,44 @@ VM_OBJECT_WLOCK(object); } } - /* Neither fictitious nor unmanaged pages can be cached. */ + /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { - if (fs->first_pindex < distance) - pindex = 0; - else - pindex = fs->first_pindex - distance; - if (pindex < OFF_TO_IDX(fs->entry->offset)) - pindex = OFF_TO_IDX(fs->entry->offset); - m = first_object != object ? 
fs->first_m : fs->m; - vm_page_assert_xbusied(m); - m_prev = vm_page_prev(m); - while ((m = m_prev) != NULL && m->pindex >= pindex && - m->valid == VM_PAGE_BITS_ALL) { - m_prev = vm_page_prev(m); - if (vm_page_busied(m)) - continue; - vm_page_lock(m); - if (m->hold_count == 0 && m->wire_count == 0) { - pmap_remove_all(m); - vm_page_aflag_clear(m, PGA_REFERENCED); - if (m->dirty != 0) - vm_page_deactivate(m); - else - vm_page_cache(m); + size = VM_FAULT_DONTNEED_MIN; + if (MAXPAGESIZES > 1 && size < pagesizes[1]) + size = pagesizes[1]; + end = rounddown2(vaddr, size); + if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && + (entry = fs->entry)->start < end) { + if (end - entry->start < size) + start = entry->start; + else + start = end - size; + pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); + pstart = OFF_TO_IDX(entry->offset) + atop(start - + entry->start); + m_next = vm_page_find_least(first_object, pstart); + pend = OFF_TO_IDX(entry->offset) + atop(end - + entry->start); + while ((m = m_next) != NULL && m->pindex < pend) { + m_next = TAILQ_NEXT(m, listq); + if (m->valid != VM_PAGE_BITS_ALL || + vm_page_busied(m)) + continue; + + /* + * Don't clear PGA_REFERENCED, since it would + * likely represent a reference by a different + * process. + * + * Typically, at this point, prefetched pages + * are still in the inactive queue. Only + * pages that triggered page faults are in the + * active queue. + */ + vm_page_lock(m); + vm_page_deactivate(m); + vm_page_unlock(m); } - vm_page_unlock(m); } } if (first_object != object) @@ -1116,7 +1395,7 @@ */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, - int faultcount, int reqpage) + int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; @@ -1124,19 +1403,12 @@ vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; - int backward, forward, i; + int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; - if (faultcount > 0) { - backward = reqpage; - forward = faultcount - reqpage - 1; - } else { - backward = PFBAK; - forward = PFFOR; - } entry = fs->entry; if (addra < backward * PAGE_SIZE) { @@ -1169,7 +1441,8 @@ pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; - VM_OBJECT_RLOCK(lobject); + if (!obj_locked) + VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { @@ -1177,17 +1450,20 @@ 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); - VM_OBJECT_RUNLOCK(lobject); + if (!obj_locked || lobject != entry->object.vm_object) + VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { - VM_OBJECT_RUNLOCK(lobject); + if (!obj_locked || lobject != entry->object.vm_object) + VM_OBJECT_RUNLOCK(lobject); break; } if (m->valid == VM_PAGE_BITS_ALL && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); - VM_OBJECT_RUNLOCK(lobject); + if (!obj_locked || lobject != entry->object.vm_object) + VM_OBJECT_RUNLOCK(lobject); } } @@ -1252,7 +1528,18 @@ * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. + * + * If vm_fault_disable_pagefaults() was called, + * i.e., TDP_NOFAULTING is set, we must not sleep nor + * acquire MD VM locks, which means we must not call + * vm_fault_hold(). 
Some (out of tree) callers mark + * too wide a code area with vm_fault_disable_pagefaults() + * already, use the VM_PROT_QUICK_NOFAULT flag to request + * the proper behaviour explicitly. */ + if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && + (curthread->td_pflags & TDP_NOFAULTING) != 0) + goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) @@ -1315,11 +1602,12 @@ * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, - OFF_TO_IDX(dst_entry->end - dst_entry->start)); + atop(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif + dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); @@ -1328,7 +1616,6 @@ if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; - dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, @@ -1336,7 +1623,9 @@ dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; - } else if (dst_object->cred == NULL) { + } else if ((dst_object->type == OBJT_DEFAULT || + dst_object->type == OBJT_SWAP) && + dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; @@ -1361,7 +1650,7 @@ * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. - * Since the destination object does share any backing storage + * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ @@ -1417,15 +1706,19 @@ } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); - dst_m->valid = VM_PAGE_BITS_ALL; - dst_m->dirty = VM_PAGE_BITS_ALL; + dst_m->dirty = dst_m->valid = src_m->valid; } else { dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; + if (dst_m->pindex >= dst_object->size) + /* + * We are upgrading. Index can occur + * out of bounds if the object type is + * vnode and the file was truncated. + */ + break; vm_page_xbusy(dst_m); - KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, - ("invalid dst page %p", dst_m)); } VM_OBJECT_WUNLOCK(dst_object); @@ -1433,9 +1726,18 @@ * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. + * + * The page can be invalid if the user called + * msync(MS_INVALIDATE) or truncated the backing vnode + * or shared memory object. In this case, do not + * insert it into pmap, but still do the copy so that + * all copies of the wired map entry have similar + * backing pages. */ - pmap_enter(dst_map->pmap, vaddr, dst_m, prot, - access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); + if (dst_m->valid == VM_PAGE_BITS_ALL) { + pmap_enter(dst_map->pmap, vaddr, dst_m, prot, + access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); + } /* * Mark it no longer busy, and put it on the active list. 
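
For context on the VM_PROT_QUICK_NOFAULT / TDP_NOFAULTING hunk above: the intent is that a caller which has already disabled page faults can ask vm_fault_quick_hold_pages() to succeed only when every requested page is resident and mapped, instead of sleeping in vm_fault_hold(). A minimal sketch of that calling pattern follows. Only vm_fault_disable_pagefaults(), vm_fault_enable_pagefaults(), vm_fault_quick_hold_pages(), vm_page_unhold_pages() and the VM_PROT_QUICK_NOFAULT flag come from the tree; the helper name, the EFAULT fallback and the header list are illustrative assumptions, not part of this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

/*
 * Hypothetical helper: hold up to "maxpages" user pages backing
 * [uva, uva + len) without ever entering the full fault handler.
 */
static int
hold_resident_pages(vm_map_t map, vm_offset_t uva, vm_size_t len,
    vm_page_t *ma, int maxpages)
{
	int count, save;

	save = vm_fault_disable_pagefaults();	/* sets TDP_NOFAULTING */
	count = vm_fault_quick_hold_pages(map, uva, len,
	    VM_PROT_READ | VM_PROT_QUICK_NOFAULT, ma, maxpages);
	vm_fault_enable_pagefaults(save);
	if (count == -1)
		return (EFAULT);	/* some page was not resident */

	/* ... access the held pages ... */

	vm_page_unhold_pages(ma, count);
	return (0);
}

A caller that cannot tolerate the failure would re-enable page faults and retry through the ordinary vm_fault_hold() path from a context where sleeping is allowed.
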
@@ -1445,7 +1747,7 @@ if (upgrade) { if (src_m != dst_m) { vm_page_lock(src_m); - vm_page_unwire(src_m, 0); + vm_page_unwire(src_m, PQ_INACTIVE); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); @@ -1468,134 +1770,7 @@ } } - /* - * This routine checks around the requested page for other pages that - * might be able to be faulted in. This routine brackets the viable - * pages for the pages to be paged in. - * - * Inputs: - * m, rbehind, rahead - * - * Outputs: - * marray (array of vm_page_t), reqpage (index of requested page) - * - * Return value: - * number of pages in marray - */ -static int -vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) - vm_page_t m; - int rbehind; - int rahead; - vm_page_t *marray; - int *reqpage; -{ - int i,j; - vm_object_t object; - vm_pindex_t pindex, startpindex, endpindex, tpindex; - vm_page_t rtm; - int cbehind, cahead; - - VM_OBJECT_ASSERT_WLOCKED(m->object); - - object = m->object; - pindex = m->pindex; - cbehind = cahead = 0; - - /* - * if the requested page is not available, then give up now - */ - if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { - return 0; - } - - if ((cbehind == 0) && (cahead == 0)) { - *reqpage = 0; - marray[0] = m; - return 1; - } - - if (rahead > cahead) { - rahead = cahead; - } - - if (rbehind > cbehind) { - rbehind = cbehind; - } - - /* - * scan backward for the read behind pages -- in memory - */ - if (pindex > 0) { - if (rbehind > pindex) { - rbehind = pindex; - startpindex = 0; - } else { - startpindex = pindex - rbehind; - } - - if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL && - rtm->pindex >= startpindex) - startpindex = rtm->pindex + 1; - - /* tpindex is unsigned; beware of numeric underflow. */ - for (i = 0, tpindex = pindex - 1; tpindex >= startpindex && - tpindex < pindex; i++, tpindex--) { - - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | - VM_ALLOC_IFNOTCACHED); - if (rtm == NULL) { - /* - * Shift the allocated pages to the - * beginning of the array. - */ - for (j = 0; j < i; j++) { - marray[j] = marray[j + tpindex + 1 - - startpindex]; - } - break; - } - - marray[tpindex - startpindex] = rtm; - } - } else { - startpindex = 0; - i = 0; - } - - marray[i] = m; - /* page offset of the required page */ - *reqpage = i; - - tpindex = pindex + 1; - i++; - - /* - * scan forward for the read ahead pages - */ - endpindex = tpindex + rahead; - if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex) - endpindex = rtm->pindex; - if (endpindex > object->size) - endpindex = object->size; - - for (; tpindex < endpindex; i++, tpindex++) { - - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | - VM_ALLOC_IFNOTCACHED); - if (rtm == NULL) { - break; - } - - marray[i] = rtm; - } - - /* return number of pages */ - return i; -} - -/* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. 
Enable machine-dependent handling of Modified: trunk/sys/vm/vm_glue.c =================================================================== --- trunk/sys/vm/vm_glue.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_glue.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_glue.c 341467 2018-12-04 15:04:48Z emaste $"); #include "opt_vm.h" #include "opt_kstack_pages.h" @@ -102,13 +102,6 @@ #include <machine/cpu.h> -#ifndef NO_SWAPPING -static int swapout(struct proc *); -static void swapclear(struct proc *); -static void vm_thread_swapin(struct thread *td); -static void vm_thread_swapout(struct thread *td); -#endif - /* * MPSAFE * @@ -119,9 +112,7 @@ * space. */ int -kernacc(addr, len, rw) - void *addr; - int len, rw; +kernacc(void *addr, int len, int rw) { boolean_t rv; vm_offset_t saddr, eaddr; @@ -130,7 +121,7 @@ KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); - if ((vm_offset_t)addr + len > kernel_map->max_offset || + if ((vm_offset_t)addr + len > vm_map_max(kernel_map) || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); @@ -150,12 +141,10 @@ * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be - * used in conjuction with this call. + * used in conjunction with this call. */ int -useracc(addr, len, rw) - void *addr; - int len, rw; +useracc(void *addr, int len, int rw) { boolean_t rv; vm_prot_t prot; @@ -201,16 +190,21 @@ * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ - if (npages + cnt.v_wire_count > vm_page_max_wired) + if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); + if (error == KERN_SUCCESS) { + curthread->td_vslock_sz += len; + return (0); + } + /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ - return (error == KERN_SUCCESS ? 0 : EFAULT); + return (EFAULT); } void @@ -218,6 +212,8 @@ { /* Rely on the parameter sanity checks performed by vslock(). 
*/ + MPASS(curthread->td_vslock_sz >= len); + curthread->td_vslock_sz -= len; (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); @@ -231,19 +227,16 @@ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { - vm_page_t m, ma[1]; + vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_WLOCK(object); pindex = OFF_TO_IDX(offset); - m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); + m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { - ma[0] = m; - rv = vm_pager_get_pages(object, ma, 1, 0); - m = vm_page_lookup(object, pindex); - if (m == NULL) - goto out; + vm_page_xbusy(m); + rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); @@ -251,8 +244,8 @@ m = NULL; goto out; } + vm_page_xunbusy(m); } - vm_page_xunbusy(m); vm_page_lock(m); vm_page_hold(m); vm_page_activate(m); @@ -312,10 +305,6 @@ SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0, ""); -#ifndef KSTACK_MAX_PAGES -#define KSTACK_MAX_PAGES 32 -#endif - /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and @@ -326,17 +315,17 @@ { vm_object_t ksobj; vm_offset_t ks; - vm_page_t m, ma[KSTACK_MAX_PAGES]; + vm_page_t ma[KSTACK_MAX_PAGES]; struct kstack_cache_entry *ks_ce; int i; /* Bounds check */ if (pages <= 1) - pages = KSTACK_PAGES; + pages = kstack_pages; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; - if (pages == KSTACK_PAGES) { + if (pages == kstack_pages) { mtx_lock(&kstack_cache_mtx); if (kstack_cache != NULL) { ks_ce = kstack_cache; @@ -345,7 +334,7 @@ td->td_kstack_obj = ks_ce->ksobj; td->td_kstack = (vm_offset_t)ks_ce; - td->td_kstack_pages = KSTACK_PAGES; + td->td_kstack_pages = kstack_pages; return (1); } mtx_unlock(&kstack_cache_mtx); @@ -395,15 +384,10 @@ * page of stack. */ VM_OBJECT_WLOCK(ksobj); - for (i = 0; i < pages; i++) { - /* - * Get a kernel stack page. - */ - m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED); - ma[i] = m; - m->valid = VM_PAGE_BITS_ALL; - } + (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | + VM_ALLOC_WIRED, ma, pages); + for (i = 0; i < pages; i++) + ma[i]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(ks, ma, pages); return (1); @@ -423,7 +407,7 @@ if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_NONE); vm_page_free(m); vm_page_unlock(m); } @@ -449,7 +433,7 @@ ks = td->td_kstack; td->td_kstack = 0; td->td_kstack_pages = 0; - if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) { + if (pages == kstack_pages && kstacks <= kstack_cache_size) { ks_ce = (struct kstack_cache_entry *)ks; ks_ce->ksobj = ksobj; mtx_lock(&kstack_cache_mtx); @@ -476,7 +460,7 @@ ks_ce = ks_ce->next_ks_entry; vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1, - KSTACK_PAGES); + kstack_pages); } } @@ -536,78 +520,7 @@ } #endif /* KSTACK_USAGE_PROF */ -#ifndef NO_SWAPPING /* - * Allow a thread's kernel stack to be paged out. 
- */ -static void -vm_thread_swapout(struct thread *td) -{ - vm_object_t ksobj; - vm_page_t m; - int i, pages; - - cpu_thread_swapout(td); - pages = td->td_kstack_pages; - ksobj = td->td_kstack_obj; - pmap_qremove(td->td_kstack, pages); - VM_OBJECT_WLOCK(ksobj); - for (i = 0; i < pages; i++) { - m = vm_page_lookup(ksobj, i); - if (m == NULL) - panic("vm_thread_swapout: kstack already missing?"); - vm_page_dirty(m); - vm_page_lock(m); - vm_page_unwire(m, 0); - vm_page_unlock(m); - } - VM_OBJECT_WUNLOCK(ksobj); -} - -/* - * Bring the kernel stack for a specified thread back in. - */ -static void -vm_thread_swapin(struct thread *td) -{ - vm_object_t ksobj; - vm_page_t ma[KSTACK_MAX_PAGES]; - int i, j, k, pages, rv; - - pages = td->td_kstack_pages; - ksobj = td->td_kstack_obj; - VM_OBJECT_WLOCK(ksobj); - for (i = 0; i < pages; i++) - ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | - VM_ALLOC_WIRED); - for (i = 0; i < pages; i++) { - if (ma[i]->valid != VM_PAGE_BITS_ALL) { - vm_page_assert_xbusied(ma[i]); - vm_object_pip_add(ksobj, 1); - for (j = i + 1; j < pages; j++) { - if (ma[j]->valid != VM_PAGE_BITS_ALL) - vm_page_assert_xbusied(ma[j]); - if (ma[j]->valid == VM_PAGE_BITS_ALL) - break; - } - rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0); - if (rv != VM_PAGER_OK) - panic("vm_thread_swapin: cannot get kstack for proc: %d", - td->td_proc->p_pid); - vm_object_pip_wakeup(ksobj); - for (k = i; k < j; k++) - ma[k] = vm_page_lookup(ksobj, k); - vm_page_xunbusy(ma[i]); - } else if (vm_page_xbusied(ma[i])) - vm_page_xunbusy(ma[i]); - } - VM_OBJECT_WUNLOCK(ksobj); - pmap_qenter(td->td_kstack, ma, pages); - cpu_thread_swapin(td); -} -#endif /* !NO_SWAPPING */ - -/* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the @@ -616,12 +529,8 @@ * to user mode to avoid stack copying and relocation problems. */ int -vm_forkproc(td, p2, td2, vm2, flags) - struct thread *td; - struct proc *p2; - struct thread *td2; - struct vmspace *vm2; - int flags; +vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2, + struct vmspace *vm2, int flags) { struct proc *p1 = td->td_proc; int error; @@ -667,7 +576,7 @@ } /* - * Called after process has been wait(2)'ed apon and is being reaped. + * Called after process has been wait(2)'ed upon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ @@ -680,414 +589,8 @@ } void -faultin(p) - struct proc *p; -{ -#ifdef NO_SWAPPING - - PROC_LOCK_ASSERT(p, MA_OWNED); - if ((p->p_flag & P_INMEM) == 0) - panic("faultin: proc swapped out with NO_SWAPPING!"); -#else /* !NO_SWAPPING */ - struct thread *td; - - PROC_LOCK_ASSERT(p, MA_OWNED); - /* - * If another process is swapping in this process, - * just wait until it finishes. - */ - if (p->p_flag & P_SWAPPINGIN) { - while (p->p_flag & P_SWAPPINGIN) - msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); - return; - } - if ((p->p_flag & P_INMEM) == 0) { - /* - * Don't let another thread swap process p out while we are - * busy swapping it in. - */ - ++p->p_lock; - p->p_flag |= P_SWAPPINGIN; - PROC_UNLOCK(p); - - /* - * We hold no lock here because the list of threads - * can not change while all threads in the process are - * swapped out. - */ - FOREACH_THREAD_IN_PROC(p, td) - vm_thread_swapin(td); - PROC_LOCK(p); - swapclear(p); - p->p_swtick = ticks; - - wakeup(&p->p_flag); - - /* Allow other threads to swap p out now. 
*/ - --p->p_lock; - } -#endif /* NO_SWAPPING */ -} - -/* - * This swapin algorithm attempts to swap-in processes only if there - * is enough space for them. Of course, if a process waits for a long - * time, it will be swapped in anyway. - */ -void -swapper(void) -{ - struct proc *p; - struct thread *td; - struct proc *pp; - int slptime; - int swtime; - int ppri; - int pri; - -loop: - if (vm_page_count_min()) { - VM_WAIT; - goto loop; - } - - pp = NULL; - ppri = INT_MIN; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state == PRS_NEW || - p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) { - PROC_UNLOCK(p); - continue; - } - swtime = (ticks - p->p_swtick) / hz; - FOREACH_THREAD_IN_PROC(p, td) { - /* - * An otherwise runnable thread of a process - * swapped out has only the TDI_SWAPPED bit set. - * - */ - thread_lock(td); - if (td->td_inhibitors == TDI_SWAPPED) { - slptime = (ticks - td->td_slptick) / hz; - pri = swtime + slptime; - if ((td->td_flags & TDF_SWAPINREQ) == 0) - pri -= p->p_nice * 8; - /* - * if this thread is higher priority - * and there is enough space, then select - * this process instead of the previous - * selection. - */ - if (pri > ppri) { - pp = p; - ppri = pri; - } - } - thread_unlock(td); - } - PROC_UNLOCK(p); - } - sx_sunlock(&allproc_lock); - - /* - * Nothing to do, back to sleep. - */ - if ((p = pp) == NULL) { - tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); - goto loop; - } - PROC_LOCK(p); - - /* - * Another process may be bringing or may have already - * brought this process in while we traverse all threads. - * Or, this process may even be being swapped out again. - */ - if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) { - PROC_UNLOCK(p); - goto loop; - } - - /* - * We would like to bring someone in. (only if there is space). - * [What checks the space? ] - */ - faultin(p); - PROC_UNLOCK(p); - goto loop; -} - -void kick_proc0(void) { wakeup(&proc0); } - -#ifndef NO_SWAPPING - -/* - * Swap_idle_threshold1 is the guaranteed swapped in time for a process - */ -static int swap_idle_threshold1 = 2; -SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, - &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); - -/* - * Swap_idle_threshold2 is the time that a process can be idle before - * it will be swapped out, if idle swapping is enabled. - */ -static int swap_idle_threshold2 = 10; -SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, - &swap_idle_threshold2, 0, "Time before a process will be swapped out"); - -/* - * First, if any processes have been sleeping or stopped for at least - * "swap_idle_threshold1" seconds, they are swapped out. If, however, - * no such processes exist, then the longest-sleeping or stopped - * process is swapped out. Finally, and only as a last resort, if - * there are no sleeping or stopped processes, the longest-resident - * process is swapped out. - */ -void -swapout_procs(action) -int action; -{ - struct proc *p; - struct thread *td; - int didswap = 0; - -retry: - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - struct vmspace *vm; - int minslptime = 100000; - int slptime; - - /* - * Watch out for a process in - * creation. It may have no - * address space or lock yet. - */ - if (p->p_state == PRS_NEW) - continue; - /* - * An aio daemon switches its - * address space while running. - * Perform a quick check whether - * a process has P_SYSTEM. 
- */ - if ((p->p_flag & P_SYSTEM) != 0) - continue; - /* - * Do not swapout a process that - * is waiting for VM data - * structures as there is a possible - * deadlock. Test this first as - * this may block. - * - * Lock the map until swapout - * finishes, or a thread of this - * process may attempt to alter - * the map. - */ - vm = vmspace_acquire_ref(p); - if (vm == NULL) - continue; - if (!vm_map_trylock(&vm->vm_map)) - goto nextproc1; - - PROC_LOCK(p); - if (p->p_lock != 0 || - (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) - ) != 0) { - goto nextproc; - } - /* - * only aiod changes vmspace, however it will be - * skipped because of the if statement above checking - * for P_SYSTEM - */ - if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM) - goto nextproc; - - switch (p->p_state) { - default: - /* Don't swap out processes in any sort - * of 'special' state. */ - break; - - case PRS_NORMAL: - /* - * do not swapout a realtime process - * Check all the thread groups.. - */ - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (PRI_IS_REALTIME(td->td_pri_class)) { - thread_unlock(td); - goto nextproc; - } - slptime = (ticks - td->td_slptick) / hz; - /* - * Guarantee swap_idle_threshold1 - * time in memory. - */ - if (slptime < swap_idle_threshold1) { - thread_unlock(td); - goto nextproc; - } - - /* - * Do not swapout a process if it is - * waiting on a critical event of some - * kind or there is a thread whose - * pageable memory may be accessed. - * - * This could be refined to support - * swapping out a thread. - */ - if (!thread_safetoswapout(td)) { - thread_unlock(td); - goto nextproc; - } - /* - * If the system is under memory stress, - * or if we are swapping - * idle processes >= swap_idle_threshold2, - * then swap the process out. - */ - if (((action & VM_SWAP_NORMAL) == 0) && - (((action & VM_SWAP_IDLE) == 0) || - (slptime < swap_idle_threshold2))) { - thread_unlock(td); - goto nextproc; - } - - if (minslptime > slptime) - minslptime = slptime; - thread_unlock(td); - } - - /* - * If the pageout daemon didn't free enough pages, - * or if this process is idle and the system is - * configured to swap proactively, swap it out. - */ - if ((action & VM_SWAP_NORMAL) || - ((action & VM_SWAP_IDLE) && - (minslptime > swap_idle_threshold2))) { - if (swapout(p) == 0) - didswap++; - PROC_UNLOCK(p); - vm_map_unlock(&vm->vm_map); - vmspace_free(vm); - sx_sunlock(&allproc_lock); - goto retry; - } - } -nextproc: - PROC_UNLOCK(p); - vm_map_unlock(&vm->vm_map); -nextproc1: - vmspace_free(vm); - continue; - } - sx_sunlock(&allproc_lock); - /* - * If we swapped something out, and another process needed memory, - * then wakeup the sched process. - */ - if (didswap) - wakeup(&proc0); -} - -static void -swapclear(p) - struct proc *p; -{ - struct thread *td; - - PROC_LOCK_ASSERT(p, MA_OWNED); - - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - td->td_flags |= TDF_INMEM; - td->td_flags &= ~TDF_SWAPINREQ; - TD_CLR_SWAPPED(td); - if (TD_CAN_RUN(td)) - if (setrunnable(td)) { -#ifdef INVARIANTS - /* - * XXX: We just cleared TDI_SWAPPED - * above and set TDF_INMEM, so this - * should never happen. 
- */ - panic("not waking up swapper"); -#endif - } - thread_unlock(td); - } - p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT); - p->p_flag |= P_INMEM; -} - -static int -swapout(p) - struct proc *p; -{ - struct thread *td; - - PROC_LOCK_ASSERT(p, MA_OWNED); -#if defined(SWAP_DEBUG) - printf("swapping out %d\n", p->p_pid); -#endif - - /* - * The states of this process and its threads may have changed - * by now. Assuming that there is only one pageout daemon thread, - * this process should still be in memory. - */ - KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM, - ("swapout: lost a swapout race?")); - - /* - * remember the process resident count - */ - p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); - /* - * Check and mark all threads before we proceed. - */ - p->p_flag &= ~P_INMEM; - p->p_flag |= P_SWAPPINGOUT; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (!thread_safetoswapout(td)) { - thread_unlock(td); - swapclear(p); - return (EBUSY); - } - td->td_flags &= ~TDF_INMEM; - TD_SET_SWAPPED(td); - thread_unlock(td); - } - td = FIRST_THREAD_IN_PROC(p); - ++td->td_ru.ru_nswap; - PROC_UNLOCK(p); - - /* - * This list is stable because all threads are now prevented from - * running. The list is only modified in the context of a running - * thread in this process. - */ - FOREACH_THREAD_IN_PROC(p, td) - vm_thread_swapout(td); - - PROC_LOCK(p); - p->p_flag &= ~P_SWAPPINGOUT; - p->p_swtick = ticks; - return (0); -} -#endif /* !NO_SWAPPING */ Modified: trunk/sys/vm/vm_init.c =================================================================== --- trunk/sys/vm/vm_init.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_init.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -64,7 +64,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_init.c 338484 2018-09-05 21:28:33Z kib $"); #include <sys/param.h> #include <sys/kernel.h> @@ -75,6 +75,7 @@ #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/selinfo.h> +#include <sys/smp.h> #include <sys/pipe.h> #include <sys/bio.h> #include <sys/buf.h> @@ -91,11 +92,6 @@ long physmem; -static int exec_map_entries = 16; -TUNABLE_INT("vm.exec_map_entries", &exec_map_entries); -SYSCTL_INT(_vm, OID_AUTO, exec_map_entries, CTLFLAG_RD, &exec_map_entries, 0, - "Maximum number of simultaneous execs"); - /* * System initialization */ @@ -197,8 +193,8 @@ * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. */ - physmem_est = lmin(physmem, btoc(kernel_map->max_offset - - kernel_map->min_offset)); + physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) - + vm_map_min(kernel_map))); v = kern_vfs_bio_buffer_alloc(v, physmem_est); @@ -231,12 +227,15 @@ /* * Allocate the buffer arena. + * + * Enable the quantum cache if we have more than 4 cpus. This + * avoids lock contention at the expense of some fragmentation. */ size = (long)nbuf * BKVASIZE; kmi->buffer_sva = firstaddr; kmi->buffer_eva = kmi->buffer_sva + size; vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size, - PAGE_SIZE, 0, 0); + PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0); firstaddr += size; /* @@ -259,10 +258,19 @@ panic("Clean map calculation incorrect"); /* - * Allocate the pageable submaps. + * Allocate the pageable submaps. We may cache an exec map entry per + * CPU, so we therefore need to reserve space for at least ncpu+1 + * entries to avoid deadlock. 
The exec map is also used by some image + * activators, so we leave a fixed number of pages for their use. */ +#ifdef __LP64__ + exec_map_entries = 8 * mp_ncpus; +#else + exec_map_entries = 2 * mp_ncpus + 4; +#endif + exec_map_entry_size = round_page(PATH_MAX + ARG_MAX); exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, - exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE); + exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE); pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva, FALSE); } Modified: trunk/sys/vm/vm_kern.c =================================================================== --- trunk/sys/vm/vm_kern.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_kern.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -64,7 +64,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_kern.c 340660 2018-11-20 01:12:21Z markj $"); #include <sys/param.h> #include <sys/systm.h> @@ -85,6 +85,8 @@ #include <vm/vm_object.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> #include <vm/vm_extern.h> #include <vm/uma.h> @@ -98,6 +100,9 @@ /* NB: Used by kernel debuggers. */ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; +u_int exec_map_entry_size; +u_int exec_map_entries; + SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); @@ -160,8 +165,7 @@ vm_paddr_t high, vm_memattr_t memattr) { vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object; - vm_offset_t addr, i; - vm_ooffset_t offset; + vm_offset_t addr, i, offset; vm_page_t m; int pflags, tries; @@ -170,16 +174,21 @@ return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; + pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); + pflags |= VM_ALLOC_NOWAIT; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { tries = 0; retry: - m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i), + m = vm_page_alloc_contig(object, atop(offset + i), pflags, 1, low, high, PAGE_SIZE, 0, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { - vm_pageout_grow_cache(tries, low, high); + if (!vm_page_reclaim_contig(pflags, 1, + low, high, PAGE_SIZE, 0) && + (flags & M_WAITOK) != 0) + VM_WAIT; VM_OBJECT_WLOCK(object); tries++; goto retry; @@ -212,9 +221,9 @@ vm_memattr_t memattr) { vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object; - vm_offset_t addr, tmp; - vm_ooffset_t offset; + vm_offset_t addr, offset, tmp; vm_page_t end_m, m; + u_long npages; int pflags, tries; size = round_page(size); @@ -222,15 +231,20 @@ return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; + pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); + pflags |= VM_ALLOC_NOWAIT; + npages = atop(size); VM_OBJECT_WLOCK(object); tries = 0; retry: - m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags, - atop(size), low, high, alignment, boundary, memattr); + m = vm_page_alloc_contig(object, atop(offset), pflags, + npages, low, high, alignment, boundary, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 
1 : 3)) { - vm_pageout_grow_cache(tries, low, high); + if (!vm_page_reclaim_contig(pflags, npages, low, high, + alignment, boundary) && (flags & M_WAITOK) != 0) + VM_WAIT; VM_OBJECT_WLOCK(object); tries++; goto retry; @@ -238,7 +252,7 @@ vmem_free(vmem, addr, size); return (0); } - end_m = m + atop(size); + end_m = m + npages; tmp = addr; for (; m < end_m; m++) { if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) @@ -322,7 +336,7 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t offset, i; - vm_page_t m; + vm_page_t m, mpred; int pflags; KASSERT(object == kmem_object || object == kernel_object, @@ -330,11 +344,17 @@ offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; + pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); + if (flags & M_WAITOK) + pflags |= VM_ALLOC_WAITFAIL; + i = 0; VM_OBJECT_WLOCK(object); - for (i = 0; i < size; i += PAGE_SIZE) { retry: - m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags); + mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i)); + for (; i < size; i += PAGE_SIZE, mpred = m) { + m = vm_page_alloc_after(object, atop(offset + i), pflags, + mpred); /* * Ran out of space, free everything up and return. Don't need @@ -342,12 +362,9 @@ * aren't on any queues. */ if (m == NULL) { + if ((flags & M_NOWAIT) == 0) + goto retry; VM_OBJECT_WUNLOCK(object); - if ((flags & M_NOWAIT) == 0) { - VM_WAIT; - VM_OBJECT_WLOCK(object); - goto retry; - } kmem_unback(object, addr, i); return (KERN_NO_SPACE); } @@ -376,8 +393,8 @@ void kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { - vm_page_t m; - vm_offset_t i, offset; + vm_page_t m, next; + vm_offset_t end, offset; KASSERT(object == kmem_object || object == kernel_object, ("kmem_unback: only supports kernel objects.")); @@ -384,10 +401,12 @@ pmap_remove(kernel_pmap, addr, addr + size); offset = addr - VM_MIN_KERNEL_ADDRESS; + end = offset + size; VM_OBJECT_WLOCK(object); - for (i = 0; i < size; i += PAGE_SIZE) { - m = vm_page_lookup(object, OFF_TO_IDX(offset + i)); - vm_page_unwire(m, 0); + for (m = vm_page_lookup(object, atop(offset)); offset < end; + offset += PAGE_SIZE, m = next) { + next = vm_page_next(m); + vm_page_unwire(m, PQ_NONE); vm_page_free(m); } VM_OBJECT_WUNLOCK(object); @@ -443,8 +462,8 @@ map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } - vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, - VM_PROT_ALL, MAP_ACC_CHARGED); + vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW, + MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } @@ -520,6 +539,43 @@ vm_map_unlock(m); } +/* + * kmem_bootstrap_free: + * + * Free pages backing preloaded data (e.g., kernel modules) to the + * system. Currently only supported on platforms that create a + * vm_phys segment for preloaded data. 
+ */ +void +kmem_bootstrap_free(vm_offset_t start, vm_size_t size) +{ +#if defined(__i386__) || defined(__amd64__) + struct vm_domain *vmd; + vm_offset_t end, va; + vm_paddr_t pa; + vm_page_t m; + + end = trunc_page(start + size); + start = round_page(start); + + for (va = start; va < end; va += PAGE_SIZE) { + pa = pmap_kextract(va); + m = PHYS_TO_VM_PAGE(pa); + + vmd = vm_phys_domain(m); + mtx_lock(&vm_page_queue_free_mtx); + vm_phys_free_pages(m, 0); + vmd->vmd_page_count++; + vm_phys_freecnt_adj(m, 1); + mtx_unlock(&vm_page_queue_free_mtx); + + vm_cnt.v_page_count++; + } + pmap_remove(kernel_pmap, start, end); + (void)vmem_add(kernel_arena, start, end - start, M_WAITOK); +#endif +} + #ifdef DIAGNOSTIC /* * Allow userspace to directly trigger the VM drain routine for testing Modified: trunk/sys/vm/vm_kern.h =================================================================== --- trunk/sys/vm/vm_kern.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_kern.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,11 +58,11 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $ + * $FreeBSD: stable/11/sys/vm/vm_kern.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _VM_VM_KERN_H_ -#define _VM_VM_KERN_H_ 1 +#define _VM_VM_KERN_H_ /* Kernel memory management definitions. */ extern vm_map_t kernel_map; @@ -75,5 +75,7 @@ extern struct vmem *memguard_arena; extern vm_offset_t swapbkva; extern u_long vm_kmem_size; +extern u_int exec_map_entries; +extern u_int exec_map_entry_size; -#endif /* _VM_VM_KERN_H_ */ +#endif /* _VM_VM_KERN_H_ */ Modified: trunk/sys/vm/vm_map.c =================================================================== --- trunk/sys/vm/vm_map.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_map.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -64,7 +64,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 355049 2019-11-24 06:54:17Z dougm $"); #include <sys/param.h> #include <sys/systm.h> @@ -136,6 +136,8 @@ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry); +static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, + vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); @@ -277,12 +279,7 @@ struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); - KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); - - if (pinit == NULL) - pinit = &pmap_pinit; - if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); @@ -333,8 +330,8 @@ * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. 
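
kmem_bootstrap_free(), added in the vm_kern.c hunk above, hands the physical pages behind a preloaded image back to the free lists and returns the KVA to kernel_arena. A hedged sketch of a caller that is finished with a preloaded module image; it assumes the stock preload_fetch_addr()/preload_fetch_size() accessors, and 'mod' is a placeholder for the preload metadata pointer:

        static void
        release_preloaded(caddr_t mod)
        {
                vm_offset_t va;
                vm_size_t sz;

                va = (vm_offset_t)preload_fetch_addr(mod);
                sz = preload_fetch_size(mod);
                if (va != 0 && sz != 0)
                        /* pages go back to vm_phys, the VA range to kernel_arena */
                        kmem_bootstrap_free(va, sz);
        }
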
*/ - (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset, - vm->vm_map.max_offset); + (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), + vm_map_max(&vm->vm_map)); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; @@ -346,7 +343,7 @@ { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, - "vmspace_free() called with non-sleepable lock held"); + "vmspace_free() called"); if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); @@ -452,7 +449,48 @@ return (vm); } +/* + * Switch between vmspaces in an AIO kernel process. + * + * The new vmspace is either the vmspace of a user process obtained + * from an active AIO request or the initial vmspace of the AIO kernel + * process (when it is idling). Because user processes will block to + * drain any active AIO requests before proceeding in exit() or + * execve(), the reference count for vmspaces from AIO requests can + * never be 0. Similarly, AIO kernel processes hold an extra + * reference on their initial vmspace for the life of the process. As + * a result, the 'newvm' vmspace always has a non-zero reference + * count. This permits an additional reference on 'newvm' to be + * acquired via a simple atomic increment rather than the loop in + * vmspace_acquire_ref() above. + */ void +vmspace_switch_aio(struct vmspace *newvm) +{ + struct vmspace *oldvm; + + /* XXX: Need some way to assert that this is an aio daemon. */ + + KASSERT(newvm->vm_refcnt > 0, + ("vmspace_switch_aio: newvm unreferenced")); + + oldvm = curproc->p_vmspace; + if (oldvm == newvm) + return; + + /* + * Point to the new address space and refer to it. + */ + curproc->p_vmspace = newvm; + atomic_add_int(&newvm->vm_refcnt, 1); + + /* Activate the new mapping. */ + pmap_activate(curthread); + + vmspace_free(oldvm); +} + +void _vm_map_lock(vm_map_t map, const char *file, int line) { @@ -748,8 +786,8 @@ map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; - map->min_offset = min; - map->max_offset = max; + map->header.end = min; + map->header.start = max; map->flags = 0; map->root = NULL; map->timestamp = 0; @@ -952,12 +990,10 @@ "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); VM_MAP_ASSERT_LOCKED(map); - KASSERT(after_where == &map->header || - after_where->end <= entry->start, + KASSERT(after_where->end <= entry->start, ("vm_map_entry_link: prev end %jx new start %jx overlap", (uintmax_t)after_where->end, (uintmax_t)entry->start)); - KASSERT(after_where->next == &map->header || - entry->end <= after_where->next->start, + KASSERT(entry->end <= after_where->next->start, ("vm_map_entry_link: new end %jx next start %jx overlap", (uintmax_t)entry->end, (uintmax_t)after_where->next->start)); @@ -979,8 +1015,7 @@ entry->right = map->root; entry->left = NULL; } - entry->adj_free = (entry->next == &map->header ? map->max_offset : - entry->next->start) - entry->end; + entry->adj_free = entry->next->start - entry->end; vm_map_entry_set_max_free(entry); map->root = entry; } @@ -999,8 +1034,7 @@ else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; - root->adj_free = (entry->next == &map->header ? map->max_offset : - entry->next->start) - root->end; + root->adj_free = entry->next->start - root->end; vm_map_entry_set_max_free(root); } map->root = root; @@ -1036,8 +1070,7 @@ if (entry != map->root) map->root = vm_map_entry_splay(entry->start, map->root); - entry->adj_free = (entry->next == &map->header ? 
map->max_offset : - entry->next->start) - entry->end; + entry->adj_free = entry->next->start - entry->end; vm_map_entry_set_max_free(entry); } @@ -1152,7 +1185,8 @@ /* * Check that the start and end points are not bogus. */ - if (start < map->min_offset || end > map->max_offset || start >= end) + if (start < vm_map_min(map) || end > vm_map_max(map) || + start >= end) return (KERN_INVALID_ADDRESS); /* @@ -1167,7 +1201,7 @@ /* * Assert that the next entry doesn't overlap the end point. */ - if (prev_entry->next != &map->header && prev_entry->next->start < end) + if (prev_entry->next->start < end) return (KERN_NO_SPACE); if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || @@ -1295,7 +1329,7 @@ new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; - new_entry->next_read = OFF_TO_IDX(offset); + new_entry->next_read = start; KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); @@ -1352,9 +1386,8 @@ * Request must fit within min/max VM address and must avoid * address wrap. */ - if (start < map->min_offset) - start = map->min_offset; - if (start + length > map->max_offset || start + length < start) + start = MAX(start, vm_map_min(map)); + if (start + length > vm_map_max(map) || start + length < start) return (1); /* Empty tree means wide open address space. */ @@ -1456,6 +1489,8 @@ KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); + MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE && + (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; @@ -1496,6 +1531,14 @@ } start = *addr; + } else if ((cow & MAP_REMAP) != 0) { + if (start < vm_map_min(map) || + start + length > vm_map_max(map) || + start + length <= length) { + result = KERN_INVALID_ADDRESS; + break; + } + vm_map_delete(map, start, start + length); } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, @@ -1549,7 +1592,7 @@ * * The map must be locked. * - * This routine guarentees that the passed entry remains valid (though + * This routine guarantees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ @@ -1655,6 +1698,8 @@ vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); + KASSERT(entry->end > start && entry->start < start, + ("_vm_map_clip_start: invalid clip of entry %p", entry)); /* * Split off the front portion -- note that we must insert the new @@ -1740,6 +1785,8 @@ vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); + KASSERT(entry->start < end && entry->end > end, + ("_vm_map_clip_end: invalid clip of entry %p", entry)); /* * If there is no object backing this entry, we might as well create @@ -1856,11 +1903,9 @@ * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within - * the specified address range are mapped. Because these mappings are - * being created speculatively, cached pages are not reactivated and - * mapped. + * the specified address range are mapped. 
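
vmspace_switch_aio(), added in the vm_map.c hunk above, lets an AIO kernel process temporarily adopt a user process's address space without the vmspace_acquire_ref() loop. A minimal sketch of the daemon pattern that comment describes, relying on the non-zero reference counts it guarantees; the function and parameter names are invented for illustration:

        static void
        aio_service_one(struct vmspace *uservm)  /* uservm taken from a queued job */
        {
                struct vmspace *daemonvm;

                daemonvm = curproc->p_vmspace;   /* the daemon's own, idling vmspace */
                vmspace_switch_aio(uservm);      /* user mappings become active */
                /* ... copyin()/copyout() against the request's user buffers ... */
                vmspace_switch_aio(daemonvm);    /* back to the daemon's vmspace */
        }
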
*/ -void +static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { @@ -1910,7 +1955,7 @@ * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && - cnt.v_free_count < cnt.v_free_reserved) || + vm_cnt.v_free_count < vm_cnt.v_free_reserved) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; @@ -1926,7 +1971,7 @@ (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && - vm_page_ps_is_valid(p)) { + vm_page_ps_test(p, PS_ALL_VALID, NULL)) { p += mask; threshold += mask; } @@ -1955,7 +2000,7 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { - vm_map_entry_t current, entry; + vm_map_entry_t current, entry, in_tran; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; @@ -1963,8 +2008,18 @@ if (start == end) return (KERN_SUCCESS); +again: + in_tran = NULL; vm_map_lock(map); + /* + * Ensure that we are not concurrently wiring pages. vm_map_wire() may + * need to fault pages into the map and will drop the map lock while + * doing so, and the VM object may end up in an inconsistent state if we + * update the protection on the map entry in between faults. + */ + vm_map_wait_busy(map); + VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { @@ -1976,8 +2031,7 @@ /* * Make a first pass to check for protection violations. */ - for (current = entry; current != &map->header && current->start < end; - current = current->next) { + for (current = entry; current->start < end; current = current->next) { if ((current->eflags & MAP_ENTRY_GUARD) != 0) continue; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { @@ -1988,15 +2042,29 @@ vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } + if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0) + in_tran = current; } /* + * Postpone the operation until all in-transition map entries have + * stabilized. An in-transition entry might already have its pages + * wired and wired_count incremented, but not yet have its + * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call + * vm_fault_copy_entry() in the final loop below. + */ + if (in_tran != NULL) { + in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; + vm_map_unlock_and_wait(map, 0); + goto again; + } + + /* * Do an accounting pass for private read-only mappings that * now will do cow due to allowed write (e.g. debugger sets * breakpoint on text segment) */ - for (current = entry; current != &map->header && current->start < end; - current = current->next) { + for (current = entry; current->start < end; current = current->next) { vm_map_clip_end(map, current, end); @@ -2050,8 +2118,7 @@ * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ - for (current = entry; current != &map->header && current->start < end; - current = current->next) { + for (current = entry; current->start < end; current = current->next) { if ((current->eflags & MAP_ENTRY_GUARD) != 0) continue; @@ -2160,10 +2227,8 @@ * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. 
*/ - for (current = entry; - (current != &map->header) && (current->start < end); - current = current->next - ) { + for (current = entry; current->start < end; + current = current->next) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; @@ -2207,15 +2272,25 @@ * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ - for (current = entry; - (current != &map->header) && (current->start < end); - current = current->next - ) { + for (current = entry; current->start < end; + current = current->next) { vm_offset_t useEnd, useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; + /* + * MADV_FREE would otherwise rewind time to + * the creation of the shadow object. Because + * we hold the VM map read-locked, neither the + * entry's object nor the presence of a + * backing object can change. + */ + if (behav == MADV_FREE && + current->object.vm_object != NULL && + current->object.vm_object->backing_object != NULL) + continue; + pstart = OFF_TO_IDX(current->offset); pend = pstart + atop(current->end - current->start); useStart = current->start; @@ -2306,7 +2381,7 @@ vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; - while ((entry != &map->header) && (entry->start < end)) { + while (entry->start < end) { vm_map_clip_end(map, entry, end); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || new_inheritance != VM_INHERIT_ZERO) @@ -2348,7 +2423,7 @@ } last_timestamp = map->timestamp; entry = first_entry; - while (entry != &map->header && entry->start < end) { + while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. @@ -2411,8 +2486,7 @@ * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && - (entry->end < end && (entry->next == &map->header || - entry->next->start > entry->end))) { + (entry->end < end && entry->next->start > entry->end)) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; @@ -2438,8 +2512,7 @@ else KASSERT(result, ("vm_map_unwire: lookup failed")); } - for (entry = first_entry; entry != &map->header && entry->start < end; - entry = entry->next) { + for (entry = first_entry; entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped @@ -2553,7 +2626,7 @@ } last_timestamp = map->timestamp; entry = first_entry; - while (entry != &map->header && entry->start < end) { + while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. @@ -2690,8 +2763,7 @@ */ next_entry: if ((flags & VM_MAP_WIRE_HOLESOK) == 0 && - entry->end < end && (entry->next == &map->header || - entry->next->start > entry->end)) { + entry->end < end && entry->next->start > entry->end) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; @@ -2708,8 +2780,7 @@ else KASSERT(result, ("vm_map_wire: lookup failed")); } - for (entry = first_entry; entry != &map->header && entry->start < end; - entry = entry->next) { + for (entry = first_entry; entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped @@ -2813,15 +2884,13 @@ /* * Make a first pass to check for user-wired memory and holes. 
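
The MADV_FREE change in the vm_map_madvise() hunk above now skips entries whose object has a backing object, since discarding the top-level pages would expose older data from the shadow chain. The userland call itself is unchanged; a hedged sketch with len as a placeholder:

        #include <sys/mman.h>

        void *buf;

        buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        /* ... use buf ... */
        (void)madvise(buf, len, MADV_FREE);  /* contents may be discarded rather than swapped */
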
*/ - for (current = entry; current != &map->header && current->start < end; - current = current->next) { + for (current = entry; current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && - (current->next == &map->header || - current->end != current->next->start)) { + current->end != current->next->start) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } @@ -2835,7 +2904,7 @@ * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ - for (current = entry; current != &map->header && current->start < end;) { + for (current = entry; current->start < end;) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { @@ -2912,7 +2981,7 @@ { vm_object_t object; vm_pindex_t offidxstart, offidxend, count, size1; - vm_ooffset_t size; + vm_size_t size; vm_map_entry_unlink(map, entry); object = entry->object.vm_object; @@ -2938,7 +3007,7 @@ KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); - count = OFF_TO_IDX(size); + count = atop(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_WLOCK(object); @@ -3012,7 +3081,7 @@ /* * Step through all entries in this region */ - while ((entry != &map->header) && (entry->start < end)) { + while (entry->start < end) { vm_map_entry_t next; /* @@ -3058,11 +3127,17 @@ * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ - if (entry->wired_count != 0) { + if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); - } - pmap_remove(map->pmap, entry->start, entry->end); + /* + * Remove mappings for the pages, but only if the + * mappings could exist. For instance, it does not + * make sense to call pmap_remove() for guard entries. + */ + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || + entry->object.vm_object != NULL) + pmap_remove(map->pmap, entry->start, entry->end); /* * Delete the entry only after removing all pmap @@ -3120,8 +3195,6 @@ entry = tmp_entry; while (start < end) { - if (entry == &map->header) - return (FALSE); /* * No holes allowed! */ @@ -3325,7 +3398,8 @@ old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ - vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL); + vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), + pmap_pinit); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; @@ -3529,9 +3603,7 @@ growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? max_ssize : growsize; vm_map_lock(map); - PROC_LOCK(curproc); - vmemlim = lim_cur(curproc, RLIMIT_VMEM); - PROC_UNLOCK(curproc); + vmemlim = lim_cur(curthread, RLIMIT_VMEM); /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; @@ -3572,7 +3644,8 @@ addrbos + max_ssize > vm_map_max(map) || addrbos + max_ssize <= addrbos) return (KERN_INVALID_ADDRESS); - sgp = (vm_size_t)stack_guard_page * PAGE_SIZE; + sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 : + (vm_size_t)stack_guard_page * PAGE_SIZE; if (sgp >= max_ssize) return (KERN_INVALID_ARGUMENT); @@ -3585,10 +3658,9 @@ return (KERN_NO_SPACE); /* - * If we can't accomodate max_ssize in the current mapping, no go. 
+ * If we can't accommodate max_ssize in the current mapping, no go. */ - if ((prev_entry->next != &map->header) && - (prev_entry->next->start < addrbos + max_ssize)) + if (prev_entry->next->start < addrbos + max_ssize) return (KERN_NO_SPACE); /* @@ -3624,11 +3696,25 @@ KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, ("new entry lacks MAP_ENTRY_GROWS_UP")); + if (gap_bot == gap_top) + return (KERN_SUCCESS); rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); - if (rv != KERN_SUCCESS) + if (rv == KERN_SUCCESS) { + /* + * Gap can never successfully handle a fault, so + * read-ahead logic is never used for it. Re-use + * next_read of the gap entry to store + * stack_guard_page for vm_map_growstack(). + */ + if (orient == MAP_STACK_GROWS_DOWN) + new_entry->prev->next_read = sgp; + else + new_entry->next->next_read = sgp; + } else { (void)vm_map_delete(map, bot, top); + } return (rv); } @@ -3663,17 +3749,15 @@ * debugger or AIO daemon. The reason is that the wrong * resource limits are applied. */ - if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL) + if (p != initproc && (map != &p->p_vmspace->vm_map || + p->p_textvp == NULL)) return (KERN_FAILURE); MPASS(!map->system_map); - guard = stack_guard_page * PAGE_SIZE; - PROC_LOCK(p); - lmemlim = lim_cur(p, RLIMIT_MEMLOCK); - stacklim = lim_cur(p, RLIMIT_STACK); - vmemlim = lim_cur(p, RLIMIT_VMEM); - PROC_UNLOCK(p); + lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); + stacklim = lim_cur(curthread, RLIMIT_STACK); + vmemlim = lim_cur(curthread, RLIMIT_VMEM); retry: /* If addr is not in a hole for a stack grow area, no need to grow. */ if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) @@ -3697,6 +3781,8 @@ } else { return (KERN_FAILURE); } + guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 : + gap_entry->next_read; max_grow = gap_entry->end - gap_entry->start; if (guard > max_grow) return (KERN_NO_SPACE); @@ -3844,9 +3930,7 @@ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { vm_map_unlock(map); vm_map_wire(map, grow_start, grow_start + grow_amount, - (p->p_flag & P_SYSTEM) - ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES - : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); vm_map_lock_read(map); } else vm_map_lock_downgrade(map); @@ -3883,7 +3967,7 @@ KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); - newvmspace = vmspace_alloc(minuser, maxuser, NULL); + newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; @@ -4125,7 +4209,7 @@ * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ - *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); + *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; @@ -4206,7 +4290,7 @@ * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. 
*/ - *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); + *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; @@ -4228,6 +4312,27 @@ vm_map_unlock_read(map); } +vm_offset_t +vm_map_max_KBI(const struct vm_map *map) +{ + + return (vm_map_max(map)); +} + +vm_offset_t +vm_map_min_KBI(const struct vm_map *map) +{ + + return (vm_map_min(map)); +} + +pmap_t +vm_map_pmap_KBI(vm_map_t map) +{ + + return (map->pmap); +} + #include "opt_ddb.h" #ifdef DDB #include <sys/kernel.h> Modified: trunk/sys/vm/vm_map.h =================================================================== --- trunk/sys/vm/vm_map.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_map.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_map.h 343426 2019-01-25 11:46:07Z kib $ */ /* @@ -105,6 +105,7 @@ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ vm_offset_t pad0; + vm_offset_t next_read; /* vaddr of the next sequential read */ vm_size_t adj_free; /* amount of adjacent free space */ vm_size_t max_free; /* max free space in subtree */ union vm_map_object object; /* object I point to */ @@ -115,7 +116,6 @@ vm_inherit_t inheritance; /* inheritance */ uint8_t read_ahead; /* pages in the read-ahead window */ int wired_count; /* can be paged if = 0 */ - vm_pindex_t next_read; /* index of the next sequential read */ struct ucred *cred; /* tmp storage for creator ref */ struct thread *wiring_thread; }; @@ -173,15 +173,26 @@ * A map is a set of map entries. These map entries are * organized both as a binary search tree and as a doubly-linked * list. Both structures are ordered based upon the start and - * end addresses contained within each map entry. Sleator and - * Tarjan's top-down splay algorithm is employed to control - * height imbalance in the binary search tree. + * end addresses contained within each map entry. * - * List of locks + * Counterintuitively, the map's min offset value is stored in + * map->header.end, and its max offset value is stored in + * map->header.start. + * + * The list header has max start value and min end value to act + * as sentinels for sequential search of the doubly-linked list. + * Sleator and Tarjan's top-down splay algorithm is employed to + * control height imbalance in the binary search tree. 
+ * + * List of locks * (c) const until freed */ struct vm_map { struct vm_map_entry header; /* List of entries */ +/* + map min_offset header.end (c) + map max_offset header.start (c) +*/ struct sx lock; /* Lock for map data */ struct mtx system_mtx; int nentries; /* Number of entries */ @@ -192,8 +203,6 @@ vm_flags_t flags; /* flags for this vm_map */ vm_map_entry_t root; /* Root of a binary search tree */ pmap_t pmap; /* (c) Physical map */ -#define min_offset header.start /* (c) */ -#define max_offset header.end /* (c) */ int busy; }; @@ -204,16 +213,23 @@ #define MAP_BUSY_WAKEUP 0x02 #ifdef _KERNEL +#ifdef KLD_MODULE +#define vm_map_max(map) vm_map_max_KBI((map)) +#define vm_map_min(map) vm_map_min_KBI((map)) +#define vm_map_pmap(map) vm_map_pmap_KBI((map)) +#else static __inline vm_offset_t vm_map_max(const struct vm_map *map) { - return (map->max_offset); + + return (map->header.start); } static __inline vm_offset_t vm_map_min(const struct vm_map *map) { - return (map->min_offset); + + return (map->header.end); } static __inline pmap_t @@ -227,6 +243,7 @@ { map->flags = (map->flags | set) & ~clear; } +#endif /* KLD_MODULE */ #endif /* _KERNEL */ /* @@ -287,6 +304,9 @@ void vm_map_busy(vm_map_t map); void vm_map_unbusy(vm_map_t map); void vm_map_wait_busy(vm_map_t map); +vm_offset_t vm_map_max_KBI(const struct vm_map *map); +vm_offset_t vm_map_min_KBI(const struct vm_map *map); +pmap_t vm_map_pmap_KBI(vm_map_t map); #define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE) @@ -306,9 +326,8 @@ #endif /* _KERNEL */ -/* XXX: number of kernel maps and entries to statically allocate */ +/* XXX: number of kernel maps to statically allocate */ #define MAX_KMAP 10 -#define MAX_KMAPENT 128 /* * Copy-on-write flags for vm_map operations @@ -324,6 +343,7 @@ #define MAP_DISABLE_COREDUMP 0x0100 #define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */ #define MAP_VN_WRITECOUNT 0x0400 +#define MAP_REMAP 0x0800 #define MAP_STACK_GROWS_DOWN 0x1000 #define MAP_STACK_GROWS_UP 0x2000 #define MAP_ACC_CHARGED 0x4000 @@ -389,15 +409,13 @@ vm_pindex_t *, vm_prot_t *, boolean_t *); void vm_map_lookup_done (vm_map_t, vm_map_entry_t); boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *); -void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, - vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t); int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t); +void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry); void vm_map_startup (void); int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t); int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int); -void vm_map_simplify_entry (vm_map_t, vm_map_entry_t); int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); Modified: trunk/sys/vm/vm_meter.c =================================================================== --- trunk/sys/vm/vm_meter.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_meter.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_meter.c 331722 2018-03-29 02:50:57Z eadler $"); 
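
The vm_map.h hunk above retires the min_offset/max_offset macros: the map's minimum address now lives in header.end and its maximum in header.start, and KLD modules reach them through the new *_KBI functions. Callers are expected to go through the accessors; a small before/after sketch with map and size as placeholders:

        /* before (no longer compiles against the new header): */
        size = map->max_offset - map->min_offset;

        /* after; expands to vm_map_max_KBI()/vm_map_min_KBI() for KLD modules: */
        size = vm_map_max(map) - vm_map_min(map);
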
#include <sys/param.h> #include <sys/systm.h> @@ -54,24 +54,20 @@ #include <vm/vm_object.h> #include <sys/sysctl.h> -struct vmmeter cnt; +struct vmmeter vm_cnt; SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, - CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold"); + CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, - CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages"); + CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, - CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock"); + CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, - CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive"); -SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min, - CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue"); -SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max, - CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue"); + CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, - CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); + CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, - CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point"); + CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) @@ -140,7 +136,7 @@ else total.t_sl++; if (td->td_wchan == - &cnt.v_free_count) + &vm_cnt.v_free_count) total.t_pw++; } break; @@ -209,13 +205,13 @@ } } mtx_unlock(&vm_object_list_mtx); - total.t_free = cnt.v_free_count + cnt.v_cache_count; + total.t_free = vm_cnt.v_free_count; return (sysctl_handle_opaque(oidp, &total, sizeof(total), req)); } /* - * vcnt() - accumulate statistics from all cpus and the global cnt - * structure. + * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt + * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache @@ -222,23 +218,31 @@ * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. 
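
In the vm_meter.c hunk that follows, the old vcnt() sysctl handler becomes a reusable accessor, vm_meter_cnt(), which sums a given vmmeter field over the global structure and every CPU's pcpu copy. A hedged sketch of an in-kernel consumer, using the existing v_vm_faults field:

        u_int faults;

        faults = vm_meter_cnt(offsetof(struct vmmeter, v_vm_faults));
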
- * - * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ -static int -vcnt(SYSCTL_HANDLER_ARGS) +u_int +vm_meter_cnt(size_t offset) { - int count = *(int *)arg1; - int offset = (char *)arg1 - (char *)&cnt; + struct pcpu *pcpu; + u_int count; int i; + count = *(u_int *)((char *)&vm_cnt + offset); CPU_FOREACH(i) { - struct pcpu *pcpu = pcpu_find(i); - count += *(int *)((char *)&pcpu->pc_cnt + offset); + pcpu = pcpu_find(i); + count += *(u_int *)((char *)&pcpu->pc_cnt + offset); } - return (SYSCTL_OUT(req, &count, sizeof(int))); + return (count); } +static int +cnt_sysctl(SYSCTL_HANDLER_ARGS) +{ + u_int count; + + count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt); + return (SYSCTL_OUT(req, &count, sizeof(count))); +} + SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); @@ -251,8 +255,8 @@ #define VM_STATS(parent, var, descr) \ SYSCTL_PROC(parent, OID_AUTO, var, \ - CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \ - "IU", descr) + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \ + cnt_sysctl, "IU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) @@ -276,9 +280,10 @@ VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); -VM_STATS_VM(v_reactivated, "Pages reactivated from free list"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_tcached, "Total pages cached"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); @@ -293,9 +298,8 @@ VM_STATS_VM(v_active_count, "Active pages"); VM_STATS_VM(v_inactive_target, "Desired inactive pages"); VM_STATS_VM(v_inactive_count, "Inactive pages"); +VM_STATS_VM(v_laundry_count, "Pages eligible for laundering"); VM_STATS_VM(v_cache_count, "Pages on cache queue"); -VM_STATS_VM(v_cache_min, "Min pages on cache queue"); -VM_STATS_VM(v_cache_max, "Max pages on cached queue"); VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_VM(v_forks, "Number of fork() calls"); Modified: trunk/sys/vm/vm_mmap.c =================================================================== --- trunk/sys/vm/vm_mmap.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_mmap.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -42,10 +42,11 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_mmap.c 356634 2020-01-11 15:06:06Z kevans $"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" +#include "opt_vm.h" #include <sys/param.h> #include <sys/systm.h> @@ -74,6 +75,7 @@ #include <sys/sysent.h> #include <sys/vmmeter.h> +#include <security/audit/audit.h> #include <security/mac/mac_framework.h> #include <vm/vm.h> @@ -93,21 +95,16 @@ #endif int old_mlock = 0; -SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0, +SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); -TUNABLE_INT("vm.old_mlock", &old_mlock); +static int 
mincore_mapped = 1; +SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0, + "mincore reports mappings, not residency"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif -static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); -static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct cdev *, vm_ooffset_t *, vm_object_t *); -static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct shmfd *, vm_ooffset_t, vm_object_t *); - #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; @@ -177,34 +174,48 @@ #endif int -sys_mmap(td, uap) - struct thread *td; - struct mmap_args *uap; +sys_mmap(struct thread *td, struct mmap_args *uap) { -#ifdef HWPMC_HOOKS - struct pmckern_map_in pkm; -#endif + + return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, + uap->flags, uap->fd, uap->pos)); +} + +int +kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags, + int fd, off_t pos) +{ + + return (kern_mmap_fpcheck(td, addr0, size, prot, flags, fd, pos, NULL)); +} + +/* + * When mmap'ing a file, check_fp_fn may be used for the caller to do any + * last-minute validation based on the referenced file in a non-racy way. + */ +int +kern_mmap_fpcheck(struct thread *td, uintptr_t addr0, size_t size, int prot, + int flags, int fd, off_t pos, mmap_check_fp_fn check_fp_fn) +{ + struct vmspace *vms; struct file *fp; - struct vnode *vp; vm_offset_t addr; - vm_size_t size, pageoff; - vm_prot_t cap_maxprot, prot, maxprot; - void *handle; - objtype_t handle_type; - int align, error, flags; - off_t pos; - struct vmspace *vms = td->td_proc->p_vmspace; + vm_size_t pageoff; + vm_prot_t cap_maxprot; + int align, error; cap_rights_t rights; - addr = (vm_offset_t) uap->addr; - size = uap->len; - prot = uap->prot & VM_PROT_ALL; - flags = uap->flags; - pos = uap->pos; - + vms = td->td_proc->p_vmspace; fp = NULL; + AUDIT_ARG_FD(fd); + addr = addr0; /* + * Ignore old flags that used to be defined but did not do anything. + */ + flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); + + /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and @@ -214,8 +225,8 @@ * pos. 
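
The vm.mincore_mapped knob near the top of this vm_mmap.c hunk changes what mincore(2) reports: with the default of 1 it describes mappings rather than residency. Userland usage is unchanged; a hedged sketch with addr and npages as placeholders:

        #include <sys/mman.h>
        #include <unistd.h>

        char vec[npages];

        if (mincore(addr, npages * (size_t)getpagesize(), vec) == 0 &&
            (vec[0] & MINCORE_INCORE) != 0) {
                /* with vm.mincore_mapped=1 this means "mapped", not "resident" */
        }
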
*/ if (!SV_CURPROC_FLAG(SV_AOUT)) { - if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || - ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) + if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || + ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) @@ -223,15 +234,28 @@ } if (flags & MAP_STACK) { - if ((uap->fd != -1) || + if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } + if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | + MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | + MAP_PREFAULT_READ | MAP_GUARD | +#ifdef MAP_32BIT + MAP_32BIT | +#endif + MAP_ALIGNMENT_MASK)) != 0) + return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); - if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 || + if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) + return (EINVAL); + if (prot != PROT_NONE && + (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) + return (EINVAL); + if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT | MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0)) return (EINVAL); @@ -295,28 +319,32 @@ * There should really be a pmap call to determine a reasonable * location. */ - PROC_LOCK(td->td_proc); if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + - lim_max(td->td_proc, RLIMIT_DATA)))) + lim_max(td, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + - lim_max(td->td_proc, RLIMIT_DATA)); - PROC_UNLOCK(td->td_proc); + lim_max(td, RLIMIT_DATA)); } - if ((flags & MAP_GUARD) != 0) { - handle = NULL; - handle_type = OBJT_DEFAULT; - maxprot = VM_PROT_NONE; - cap_maxprot = VM_PROT_NONE; + if (size == 0) { + /* + * Return success without mapping anything for old + * binaries that request a page-aligned mapping of + * length 0. For modern binaries, this function + * returns an error earlier. + */ + error = 0; + } else if ((flags & MAP_GUARD) != 0) { + error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, + VM_PROT_NONE, flags, NULL, pos, FALSE, td); } else if ((flags & MAP_ANON) != 0) { /* * Mapping blank space is trivial. + * + * This relies on VM_PROT_* matching PROT_*. */ - handle = NULL; - handle_type = OBJT_DEFAULT; - maxprot = VM_PROT_ALL; - cap_maxprot = VM_PROT_ALL; + error = vm_mmap_object(&vms->vm_map, &addr, size, prot, + VM_PROT_ALL, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the @@ -333,94 +361,24 @@ } if (prot & PROT_EXEC) cap_rights_set(&rights, CAP_MMAP_X); - error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); + error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; - if (fp->f_type == DTYPE_SHM) { - handle = fp->f_data; - handle_type = OBJT_SWAP; - maxprot = VM_PROT_NONE; - - /* FREAD should always be set. 
*/ - if (fp->f_flag & FREAD) - maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; - if (fp->f_flag & FWRITE) - maxprot |= VM_PROT_WRITE; - goto map; - } - if (fp->f_type != DTYPE_VNODE) { - error = ENODEV; + if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && + td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) { + error = EINVAL; goto done; } -#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ - defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) - /* - * POSIX shared-memory objects are defined to have - * kernel persistence, and are not defined to support - * read(2)/write(2) -- or even open(2). Thus, we can - * use MAP_ASYNC to trade on-disk coherence for speed. - * The shm_open(3) library routine turns on the FPOSIXSHM - * flag to request this behavior. - */ - if (fp->f_flag & FPOSIXSHM) - flags |= MAP_NOSYNC; -#endif - vp = fp->f_vnode; - /* - * Ensure that file and memory protections are - * compatible. Note that we only worry about - * writability if mapping is shared; in this case, - * current and max prot are dictated by the open file. - * XXX use the vnode instead? Problem is: what - * credentials do we use for determination? What if - * proc does a setuid? - */ - if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) - maxprot = VM_PROT_NONE; - else - maxprot = VM_PROT_EXECUTE; - if (fp->f_flag & FREAD) { - maxprot |= VM_PROT_READ; - } else if (prot & PROT_READ) { - error = EACCES; - goto done; - } - /* - * If we are sharing potential changes (either via - * MAP_SHARED or via the implicit sharing of character - * device mappings), and we are trying to get write - * permission although we opened it without asking - * for it, bail out. - */ - if ((flags & MAP_SHARED) != 0) { - if ((fp->f_flag & FWRITE) != 0) { - maxprot |= VM_PROT_WRITE; - } else if ((prot & PROT_WRITE) != 0) { - error = EACCES; + if (check_fp_fn != NULL) { + error = check_fp_fn(fp, prot, cap_maxprot, flags); + if (error != 0) goto done; - } - } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { - maxprot |= VM_PROT_WRITE; - cap_maxprot |= VM_PROT_WRITE; } - handle = (void *)vp; - handle_type = OBJT_VNODE; + /* This relies on VM_PROT_* matching PROT_*. 
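	 * One way to make that assumption explicit, shown here only as a
	 * sketch and not necessarily present in the tree, is a
	 * compile-time check:
	 *
	 *	CTASSERT(PROT_READ == VM_PROT_READ &&
	 *	    PROT_WRITE == VM_PROT_WRITE &&
	 *	    PROT_EXEC == VM_PROT_EXECUTE);
	 *
	 * so any divergence between the two sets of flag definitions is
	 * caught at build time.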
*/ + error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, + cap_maxprot, flags, pos, td); } -map: - td->td_fpop = fp; - maxprot &= cap_maxprot; - error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, - flags, handle_type, handle, pos); - td->td_fpop = NULL; -#ifdef HWPMC_HOOKS - /* inform hwpmc(4) if an executable is being mapped */ - if (error == 0 && handle_type == OBJT_VNODE && - (prot & PROT_EXEC)) { - pkm.pm_file = handle; - pkm.pm_address = (uintptr_t) addr; - PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); - } -#endif + if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: @@ -430,19 +388,15 @@ return (error); } +#if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { - struct mmap_args oargs; - oargs.addr = uap->addr; - oargs.len = uap->len; - oargs.prot = uap->prot; - oargs.flags = uap->flags; - oargs.fd = uap->fd; - oargs.pos = uap->pos; - return (sys_mmap(td, &oargs)); + return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, + uap->flags, uap->fd, uap->pos)); } +#endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ @@ -456,11 +410,8 @@ }; #endif int -ommap(td, uap) - struct thread *td; - struct ommap_args *uap; +ommap(struct thread *td, struct ommap_args *uap) { - struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, @@ -471,6 +422,7 @@ PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; + int flags, prot; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 @@ -477,30 +429,27 @@ #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 - nargs.addr = uap->addr; - nargs.len = uap->len; - nargs.prot = cvtbsdprot[uap->prot & 0x7]; + prot = cvtbsdprot[uap->prot & 0x7]; #ifdef COMPAT_FREEBSD32 -#if defined(__amd64__) || defined(__ia64__) +#if defined(__amd64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && - nargs.prot != 0) - nargs.prot |= PROT_EXEC; + prot != 0) + prot |= PROT_EXEC; #endif #endif - nargs.flags = 0; + flags = 0; if (uap->flags & OMAP_ANON) - nargs.flags |= MAP_ANON; + flags |= MAP_ANON; if (uap->flags & OMAP_COPY) - nargs.flags |= MAP_COPY; + flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) - nargs.flags |= MAP_SHARED; + flags |= MAP_SHARED; else - nargs.flags |= MAP_PRIVATE; + flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) - nargs.flags |= MAP_FIXED; - nargs.fd = uap->fd; - nargs.pos = uap->pos; - return (sys_mmap(td, &nargs)); + flags |= MAP_FIXED; + return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, + uap->fd, uap->pos)); } #endif /* COMPAT_43 */ @@ -513,20 +462,21 @@ }; #endif int -sys_msync(td, uap) - struct thread *td; - struct msync_args *uap; +sys_msync(struct thread *td, struct msync_args *uap) { + + return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); +} + +int +kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) +{ vm_offset_t addr; - vm_size_t size, pageoff; - int flags; + vm_size_t pageoff; vm_map_t map; int rv; - addr = (vm_offset_t) uap->addr; - size = uap->len; - flags = uap->flags; - + addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; @@ -565,23 +515,28 @@ }; #endif int -sys_munmap(td, uap) - struct thread *td; - struct munmap_args *uap; +sys_munmap(struct thread *td, struct munmap_args *uap) { + + return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); +} + +int +kern_munmap(struct thread *td, uintptr_t addr0, size_t size) +{ #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; + bool pmc_handled; #endif vm_offset_t addr; - vm_size_t size, 
pageoff; + vm_size_t pageoff; vm_map_t map; - addr = (vm_offset_t) uap->addr; - size = uap->len; if (size == 0) return (EINVAL); + addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; @@ -597,20 +552,23 @@ return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS - /* - * Inform hwpmc if the address range being unmapped contains - * an executable region. - */ - pkm.pm_address = (uintptr_t) NULL; - if (vm_map_lookup_entry(map, addr, &entry)) { - for (; - entry != &map->header && entry->start < addr + size; - entry = entry->next) { - if (vm_map_check_protection(map, entry->start, - entry->end, VM_PROT_EXECUTE) == TRUE) { - pkm.pm_address = (uintptr_t) addr; - pkm.pm_size = (size_t) size; - break; + pmc_handled = false; + if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { + pmc_handled = true; + /* + * Inform hwpmc if the address range being unmapped contains + * an executable region. + */ + pkm.pm_address = (uintptr_t) NULL; + if (vm_map_lookup_entry(map, addr, &entry)) { + for (; entry->start < addr + size; + entry = entry->next) { + if (vm_map_check_protection(map, entry->start, + entry->end, VM_PROT_EXECUTE) == TRUE) { + pkm.pm_address = (uintptr_t) addr; + pkm.pm_size = (size_t) size; + break; + } } } } @@ -618,14 +576,16 @@ vm_map_delete(map, addr, addr + size); #ifdef HWPMC_HOOKS - /* downgrade the lock to prevent a LOR with the pmc-sx lock */ - vm_map_lock_downgrade(map); - if (pkm.pm_address != (uintptr_t) NULL) - PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); - vm_map_unlock_read(map); -#else - vm_map_unlock(map); + if (__predict_false(pmc_handled)) { + /* downgrade the lock to prevent a LOR with the pmc-sx lock */ + vm_map_lock_downgrade(map); + if (pkm.pm_address != (uintptr_t) NULL) + PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); + vm_map_unlock_read(map); + } else #endif + vm_map_unlock(map); + /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } @@ -638,22 +598,30 @@ }; #endif int -sys_mprotect(td, uap) - struct thread *td; - struct mprotect_args *uap; +sys_mprotect(struct thread *td, struct mprotect_args *uap) { + + return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); +} + +int +kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) +{ vm_offset_t addr; - vm_size_t size, pageoff; - vm_prot_t prot; + vm_size_t pageoff; - addr = (vm_offset_t) uap->addr; - size = uap->len; - prot = uap->prot & VM_PROT_ALL; - + addr = addr0; + prot = (prot & VM_PROT_ALL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); +#ifdef COMPAT_FREEBSD32 + if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { + if (((addr + size) & 0xffffffff) < addr) + return (EINVAL); + } else +#endif if (addr + size < addr) return (EINVAL); @@ -715,8 +683,15 @@ int sys_madvise(struct thread *td, struct madvise_args *uap) { - vm_offset_t start, end; + + return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); +} + +int +kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) +{ vm_map_t map; + vm_offset_t addr, end, start; int flags; /* @@ -723,7 +698,7 @@ * Check for our special case, advising the swap pager we are * "immortal." */ - if (uap->behav == MADV_PROTECT) { + if (behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); @@ -732,7 +707,7 @@ /* * Check for illegal behavior */ - if (uap->behav < 0 || uap->behav > MADV_CORE) + if (behav < 0 || behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. 
Watch out for address wrap... Note @@ -739,10 +714,10 @@ * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; - if ((vm_offset_t)uap->addr < vm_map_min(map) || - (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) + addr = addr0; + if (addr < vm_map_min(map) || addr + len > vm_map_max(map)) return (EINVAL); - if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) + if ((addr + len) < addr) return (EINVAL); /* @@ -749,10 +724,10 @@ * Since this routine is only advisory, we default to conservative * behavior. */ - start = trunc_page((vm_offset_t) uap->addr); - end = round_page((vm_offset_t) uap->addr + uap->len); + start = trunc_page(addr); + end = round_page(addr + len); - if (vm_map_madvise(map, start, end, uap->behav)) + if (vm_map_madvise(map, start, end, behav)) return (EINVAL); return (0); } @@ -768,11 +743,17 @@ int sys_mincore(struct thread *td, struct mincore_args *uap) { + + return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); +} + +int +kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) +{ vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; - char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; @@ -789,17 +770,12 @@ * Make sure that the addresses presented are valid for user * mode. */ - first_addr = addr = trunc_page((vm_offset_t) uap->addr); - end = addr + (vm_size_t)round_page(uap->len); + first_addr = addr = trunc_page(addr0); + end = addr + (vm_size_t)round_page(len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); - /* - * Address of byte vector - */ - vec = uap->vec; - pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); @@ -817,16 +793,12 @@ * up the pages elsewhere. */ lastvecindex = -1; - for (current = entry; - (current != &map->header) && (current->start < end); - current = current->next) { + for (current = entry; current->start < end; current = current->next) { /* * check for contiguity */ - if (current->end < end && - (entry->next == &map->header || - current->next->start > current->end)) { + if (current->end < end && current->next->start > current->end) { vm_map_unlock_read(map); return (ENOMEM); } @@ -862,8 +834,17 @@ retry: m = NULL; mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); - if (locked_pa != 0) { + if (mincore_mapped) { /* + * We only care about this pmap's + * mapping of the page, if any. + */ + if (locked_pa != 0) { + vm_page_unlock(PHYS_TO_VM_PAGE( + locked_pa)); + } + } else if (locked_pa != 0) { + /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that @@ -905,9 +886,6 @@ pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); - if (m == NULL && - vm_page_is_cached(object, pindex)) - mincoreinfo = MINCORE_INCORE; if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) @@ -945,7 +923,7 @@ /* * calculate index into user supplied byte vector */ - vecindex = OFF_TO_IDX(addr - first_addr); + vecindex = atop(addr - first_addr); /* * If we have skipped map entries, we need to make sure that @@ -991,7 +969,7 @@ /* * Zero the last entries in the byte vector. 
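	 *
	 * The vector holds one status byte per page of the request, which
	 * is why indices are computed with atop() on byte offsets from the
	 * start of the range.  A userland caller sizes it the same way;
	 * a rough sketch (addr and len assumed, needs <sys/mman.h>,
	 * <stdlib.h> and <unistd.h>):
	 *
	 *	size_t pgsz = (size_t)getpagesize();
	 *	char *vec = malloc((len + pgsz - 1) / pgsz);
	 *	int first_resident = vec != NULL &&
	 *	    mincore(addr, len, vec) == 0 &&
	 *	    (vec[0] & MINCORE_INCORE) != 0;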
*/ - vecindex = OFF_TO_IDX(end - first_addr); + vecindex = atop(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); @@ -1023,11 +1001,12 @@ sys_mlock(struct thread *td, struct mlock_args *uap) { - return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); + return (kern_mlock(td->td_proc, td->td_ucred, + __DECONST(uintptr_t, uap->addr), uap->len)); } int -vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) +kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; @@ -1038,7 +1017,7 @@ error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); if (error) return (error); - addr = (vm_offset_t)addr0; + addr = addr0; size = len; last = addr + size; start = trunc_page(addr); @@ -1051,12 +1030,12 @@ map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); - if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { + if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); - if (npages + cnt.v_wire_count > vm_page_max_wired) + if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef RACCT if (racct_enable) { @@ -1106,7 +1085,7 @@ */ if (!old_mlock && uap->how & MCL_CURRENT) { PROC_LOCK(td->td_proc); - if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { + if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } @@ -1195,12 +1174,16 @@ }; #endif int -sys_munlock(td, uap) - struct thread *td; - struct munlock_args *uap; +sys_munlock(struct thread *td, struct munlock_args *uap) { + + return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); +} + +int +kern_munlock(struct thread *td, uintptr_t addr0, size_t size) +{ vm_offset_t addr, end, last, start; - vm_size_t size; #ifdef RACCT vm_map_t map; #endif @@ -1209,8 +1192,7 @@ error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); - addr = (vm_offset_t)uap->addr; - size = uap->len; + addr = addr0; last = addr + size; start = trunc_page(addr); end = round_page(last); @@ -1235,9 +1217,6 @@ * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. - * - * For VCHR vnodes, the vnode lock is held over the call to - * vm_mmap_cdev() to keep vp->v_rdev valid. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, @@ -1247,7 +1226,7 @@ { struct vattr va; vm_object_t obj; - vm_offset_t foff; + vm_ooffset_t foff; struct ucred *cred; int error, flags, locktype; @@ -1258,6 +1237,7 @@ locktype = LK_SHARED; if ((error = vget(vp, locktype, td)) != 0) return (error); + AUDIT_ARG_VNODE1(vp); foff = *foffp; flags = *flagsp; obj = vp->v_object; @@ -1284,12 +1264,6 @@ *writecounted = TRUE; vnode_pager_update_writecount(obj, 0, objsize); } - } else if (vp->v_type == VCHR) { - error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, - vp->v_rdev, foffp, objp); - if (error == 0) - goto mark_atime; - goto done; } else { error = EINVAL; goto done; @@ -1297,13 +1271,14 @@ if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC - error = mac_vnode_check_mmap(cred, vp, prot, flags); + /* This relies on VM_PROT_* matching PROT_*. 
*/ + error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { - if (prot & PROT_WRITE) { + if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } @@ -1318,22 +1293,26 @@ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; - if (obj->type == OBJT_VNODE) + if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); - else { + if (obj == NULL) { + error = ENOMEM; + goto done; + } + } else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); - vm_object_reference(obj); + VM_OBJECT_WLOCK(obj); + vm_object_reference_locked(obj); +#if VM_NRESERVLEVEL > 0 + vm_object_color(obj, 0); +#endif + VM_OBJECT_WUNLOCK(obj); } - if (obj == NULL) { - error = ENOMEM; - goto done; - } *objp = obj; *flagsp = flags; -mark_atime: vfs_mark_atime(vp, cred); done: @@ -1352,21 +1331,18 @@ * operations on cdevs. */ int -vm_mmap_cdev(struct thread *td, vm_size_t objsize, - vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, - struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) +vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, + vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, + vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; - struct cdevsw *dsw; - int error, flags, ref; + int error, flags; flags = *flagsp; - dsw = dev_refthread(cdev, &ref); - if (dsw == NULL) - return (ENXIO); if (dsw->d_flags & D_MMAP_ANON) { - dev_relthread(cdev, ref); + *objp = NULL; + *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); @@ -1375,24 +1351,18 @@ * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && - (prot & PROT_WRITE) != 0) { - dev_relthread(cdev, ref); + (prot & VM_PROT_WRITE) != 0) return (EACCES); - } - if (flags & (MAP_PRIVATE|MAP_COPY)) { - dev_relthread(cdev, ref); + if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); - } /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX - error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); - if (error != 0) { - dev_relthread(cdev, ref); + error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); + if (error != 0) return (error); - } #endif /* * First, try d_mmap_single(). If that is not implemented @@ -1404,7 +1374,6 @@ * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); - dev_relthread(cdev, ref); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, @@ -1417,65 +1386,96 @@ } /* - * vm_mmap_shm() + * vm_mmap() * - * MPSAFE - * - * Helper function for vm_mmap. Perform sanity check specific for mmap - * operations on shm file descriptors. + * Internal version of mmap used by exec, sys5 shared memory, and + * various device drivers. Handle is either a vnode pointer, a + * character device, or NULL for MAP_ANON. 
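 *
 * A hypothetical in-kernel caller mapping a vnode read-only into a
 * process map might look roughly like this (sketch only; "p", "vp",
 * "size" and "error" are assumed to be provided by the caller):
 *
 *	vm_offset_t addr = 0;
 *
 *	error = vm_mmap(&p->p_vmspace->vm_map, &addr, round_page(size),
 *	    VM_PROT_READ, VM_PROT_ALL, MAP_SHARED, OBJT_VNODE, vp, 0);
 *
 * On success the chosen address is returned through "addr".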
*/ int -vm_mmap_shm(struct thread *td, vm_size_t objsize, - vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, - struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) +vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, + vm_prot_t maxprot, int flags, + objtype_t handle_type, void *handle, + vm_ooffset_t foff) { + vm_object_t object; + struct thread *td = curthread; int error; + boolean_t writecounted; - if ((*flagsp & MAP_SHARED) != 0 && - (*maxprotp & VM_PROT_WRITE) == 0 && - (prot & PROT_WRITE) != 0) - return (EACCES); -#ifdef MAC - error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); - if (error != 0) - return (error); -#endif - error = shm_mmap(shmfd, objsize, foff, objp); + if (size == 0) + return (EINVAL); + + size = round_page(size); + object = NULL; + writecounted = FALSE; + + /* + * Lookup/allocate object. + */ + switch (handle_type) { + case OBJT_DEVICE: { + struct cdevsw *dsw; + struct cdev *cdev; + int ref; + + cdev = handle; + dsw = dev_refthread(cdev, &ref); + if (dsw == NULL) + return (ENXIO); + error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, + dsw, &foff, &object); + dev_relthread(cdev, ref); + break; + } + case OBJT_VNODE: + error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, + handle, &foff, &object, &writecounted); + break; + case OBJT_DEFAULT: + if (handle == NULL) { + error = 0; + break; + } + /* FALLTHROUGH */ + default: + error = EINVAL; + break; + } if (error) return (error); - return (0); + + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, writecounted, td); + if (error != 0 && object != NULL) { + /* + * If this mapping was accounted for in the vnode's + * writecount, then undo that now. + */ + if (writecounted) + vnode_pager_release_writecount(object, 0, size); + vm_object_deallocate(object); + } + return (error); } /* - * vm_mmap() - * - * MPSAFE - * - * Internal version of mmap. Currently used by mmap, exec, and sys5 - * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. + * Internal version of mmap that maps a specific VM object into an + * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. */ int -vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, - vm_prot_t maxprot, int flags, - objtype_t handle_type, void *handle, - vm_ooffset_t foff) +vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, + vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, + boolean_t writecounted, struct thread *td) { boolean_t curmap, fitit; vm_offset_t max_addr; - vm_object_t object = NULL; - struct thread *td = curthread; int docow, error, findspace, rv; - boolean_t writecounted; - if (size == 0) - return (0); - - size = round_page(size); - curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { PROC_LOCK(td->td_proc); - if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { + if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } @@ -1485,7 +1485,7 @@ } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > - lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { + lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); @@ -1505,11 +1505,11 @@ /* * We currently can only deal with page aligned file offsets. 
- * The check is here rather than in the syscall because the - * kernel calls this function internally for other mmaping - * operations (such as in exec) and non-aligned offsets will - * cause pmap inconsistencies...so we want to be sure to - * disallow this in all cases. + * The mmap() system call already enforces this by subtracting + * the page offset from the file offset, but checking here + * catches errors in device drivers (e.g. d_single_mmap() + * callbacks) and other internal mapping requests (such as in + * exec). */ if (foff & PAGE_MASK) return (EINVAL); @@ -1522,44 +1522,11 @@ return (EINVAL); fitit = FALSE; } - writecounted = FALSE; - /* - * Lookup/allocate object. - */ - switch (handle_type) { - case OBJT_DEVICE: - error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, - handle, &foff, &object); - break; - case OBJT_VNODE: - error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, - handle, &foff, &object, &writecounted); - break; - case OBJT_SWAP: - error = vm_mmap_shm(td, size, prot, &maxprot, &flags, - handle, foff, &object); - break; - case OBJT_DEFAULT: - if (handle == NULL) { - error = 0; - break; - } - /* FALLTHROUGH */ - default: - error = EINVAL; - break; - } - if (error) - return (error); if (flags & MAP_ANON) { - object = NULL; + if (object != NULL || foff != 0) + return (EINVAL); docow = 0; - /* - * Unnamed anonymous regions always start at 0. - */ - if (handle == 0) - foff = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else @@ -1600,15 +1567,9 @@ max_addr = MAP_32BIT_MAX_ADDR; #endif if (curmap) { - vm_offset_t min_addr; - - PROC_LOCK(td->td_proc); - min_addr = round_page((vm_offset_t)td->td_proc-> - p_vmspace->vm_daddr + lim_max(td->td_proc, - RLIMIT_DATA)); - PROC_UNLOCK(td->td_proc); rv = vm_map_find_min(map, object, foff, addr, size, - min_addr, max_addr, + round_page((vm_offset_t)td->td_proc->p_vmspace-> + vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr, findspace, prot, maxprot, docow); } else { rv = vm_map_find(map, object, foff, addr, size, @@ -1629,19 +1590,6 @@ VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); } - } else { - /* - * If this mapping was accounted for in the vnode's - * writecount, then undo that now. - */ - if (writecounted) - vnode_pager_release_writecount(object, 0, size); - /* - * Lose the object reference. Will destroy the - * object if it's an unnamed anonymous mapping - * or named anonymous without other references. 
- */ - vm_object_deallocate(object); } return (vm_mmap_to_errno(rv)); } Modified: trunk/sys/vm/vm_object.c =================================================================== --- trunk/sys/vm/vm_object.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_object.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -64,7 +64,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_object.c 352331 2019-09-14 13:35:48Z kib $"); #include "opt_vm.h" @@ -74,6 +74,7 @@ #include <sys/mman.h> #include <sys/mount.h> #include <sys/kernel.h> +#include <sys/pctrie.h> #include <sys/sysctl.h> #include <sys/mutex.h> #include <sys/proc.h> /* for curproc, pageproc */ @@ -179,9 +180,6 @@ ("object %p has reservations", object)); #endif - KASSERT(vm_object_cache_is_empty(object), - ("object %p has cached pages", - object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); @@ -203,19 +201,16 @@ vm_object_t object; object = (vm_object_t)mem; - bzero(&object->lock, sizeof(object->lock)); - rw_init_flags(&object->lock, "vm object", RW_DUPOK); + rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; - object->rtree.rt_root = 0; - object->rtree.rt_flags = 0; + vm_radix_init(&object->rtree); object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; - object->cache.rt_root = 0; - object->cache.rt_flags = 0; + object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); @@ -231,6 +226,16 @@ LIST_INIT(&object->shadow_head); object->type = type; + if (type == OBJT_SWAP) + pctrie_init(&object->un_pager.swp.swp_blks); + + /* + * Ensure that swap_pager_swapoff() iteration over object_list + * sees up to date type and pctrie head if it observed + * non-dead object. 
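	 *
	 * The release fence orders the stores to "type" and to the pctrie
	 * head above it.  A reader that finds the object non-dead and
	 * pairs the check with an acquire fence, roughly
	 *
	 *	if (object->type != OBJT_DEAD) {
	 *		atomic_thread_fence_acq();
	 *		... type and swp_blks may be used safely here ...
	 *	}
	 *
	 * (a sketch, not the literal swap_pager_swapoff() code), is then
	 * guaranteed to observe the initialized values.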
+ */ + atomic_thread_fence_rel(); + switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); @@ -266,6 +271,7 @@ #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif + umtx_shm_object_init(object); } /* @@ -280,8 +286,8 @@ mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); - _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), - kernel_object); + _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - + VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); @@ -288,8 +294,8 @@ #endif rw_init(&kmem_object->lock, "kmem vm object"); - _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), - kmem_object); + _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - + VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); @@ -308,7 +314,7 @@ #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - vm_radix_init(); + vm_radix_zinit(); } void @@ -472,11 +478,14 @@ KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { - vprint("vm_object_vndeallocate", vp); + vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif + if (!umtx_shm_vnobj_persistent && object->ref_count == 1) + umtx_shm_object_terminated(object); + /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from @@ -649,6 +658,7 @@ return; } doterm: + umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, @@ -697,6 +707,89 @@ } /* + * vm_object_terminate_pages removes any remaining pageable pages + * from the object and resets the object to an empty state. + */ +static void +vm_object_terminate_pages(vm_object_t object) +{ + vm_page_t p, p_next; + struct mtx *mtx, *mtx1; + struct vm_pagequeue *pq, *pq1; + + VM_OBJECT_ASSERT_WLOCKED(object); + + mtx = NULL; + pq = NULL; + + /* + * Free any remaining pageable pages. This also removes them from the + * paging queues. However, don't free wired pages, just remove them + * from the object. Rather than incrementally removing each page from + * the object, the page and object are reset to any empty state. + */ + TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { + vm_page_assert_unbusied(p); + if ((object->flags & OBJ_UNMANAGED) == 0) { + /* + * vm_page_free_prep() only needs the page + * lock for managed pages. 
+ */ + mtx1 = vm_page_lockptr(p); + if (mtx1 != mtx) { + if (mtx != NULL) + mtx_unlock(mtx); + if (pq != NULL) { + vm_pagequeue_unlock(pq); + pq = NULL; + } + mtx = mtx1; + mtx_lock(mtx); + } + } + p->object = NULL; + if (p->wire_count != 0) + goto unlist; + PCPU_INC(cnt.v_pfree); + p->flags &= ~PG_ZERO; + if (p->queue != PQ_NONE) { + KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: " + "page %p is not queued", p)); + pq1 = vm_page_pagequeue(p); + if (pq != pq1) { + if (pq != NULL) + vm_pagequeue_unlock(pq); + pq = pq1; + vm_pagequeue_lock(pq); + } + } + if (vm_page_free_prep(p, true)) + continue; +unlist: + TAILQ_REMOVE(&object->memq, p, listq); + } + if (pq != NULL) + vm_pagequeue_unlock(pq); + if (mtx != NULL) + mtx_unlock(mtx); + + vm_page_free_phys_pglist(&object->memq); + + /* + * If the object contained any pages, then reset it to an empty state. + * None of the object's fields, including "resident_page_count", were + * modified by the preceding loop. + */ + if (object->resident_page_count != 0) { + vm_radix_reclaim_allnodes(&object->rtree); + TAILQ_INIT(&object->memq); + object->resident_page_count = 0; + if (object->type == OBJT_VNODE) + vdrop(object->handle); + } +} + +/* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * @@ -706,7 +799,6 @@ void vm_object_terminate(vm_object_t object) { - vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); @@ -749,48 +841,13 @@ ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); - /* - * Free any remaining pageable pages. This also removes them from the - * paging queues. However, don't free wired pages, just remove them - * from the object. Rather than incrementally removing each page from - * the object, the page and object are reset to any empty state. - */ - TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { - vm_page_assert_unbusied(p); - vm_page_lock(p); - /* - * Optimize the page's removal from the object by resetting - * its "object" field. Specifically, if the page is not - * wired, then the effect of this assignment is that - * vm_page_free()'s call to vm_page_remove() will return - * immediately without modifying the page or the object. - */ - p->object = NULL; - if (p->wire_count == 0) { - vm_page_free(p); - PCPU_INC(cnt.v_pfree); - } - vm_page_unlock(p); - } - /* - * If the object contained any pages, then reset it to an empty state. - * None of the object's fields, including "resident_page_count", were - * modified by the preceding loop. - */ - if (object->resident_page_count != 0) { - vm_radix_reclaim_allnodes(&object->rtree); - TAILQ_INIT(&object->memq); - object->resident_page_count = 0; - if (object->type == OBJT_VNODE) - vdrop(object->handle); - } + if ((object->flags & OBJ_PG_DTOR) == 0) + vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, @@ -1027,13 +1084,13 @@ * I/O. 
*/ if (object->type == OBJT_VNODE && - (object->flags & OBJ_MIGHTBEDIRTY) != 0) { - vp = object->handle; + (object->flags & OBJ_MIGHTBEDIRTY) != 0 && + ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && - OFF_TO_IDX(size) == object->size) { + atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in @@ -1080,6 +1137,33 @@ } /* + * Determine whether the given advice can be applied to the object. Advice is + * not applied to unmanaged pages since they never belong to page queues, and + * since MADV_FREE is destructive, it can apply only to anonymous pages that + * have been mapped at most once. + */ +static bool +vm_object_advice_applies(vm_object_t object, int advice) +{ + + if ((object->flags & OBJ_UNMANAGED) != 0) + return (false); + if (advice != MADV_FREE) + return (true); + return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) && + (object->flags & OBJ_ONEMAPPING) != 0); +} + +static void +vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, + vm_size_t size) +{ + + if (advice == MADV_FREE && object->type == OBJT_SWAP) + swap_pager_freespace(object, pindex, size); +} + +/* * vm_object_madvise: * * Implements the madvise function at the object/page level. @@ -1102,103 +1186,109 @@ */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, - int advise) + int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; - vm_page_t m; + vm_page_t m, tm; if (object == NULL) return; + +relookup: VM_OBJECT_WLOCK(object); - /* - * Locate and adjust resident pages - */ - for (; pindex < end; pindex += 1) { -relookup: + if (!vm_object_advice_applies(object, advice)) { + VM_OBJECT_WUNLOCK(object); + return; + } + for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; - tpindex = pindex; -shadowlookup: + /* - * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages - * and those pages must be OBJ_ONEMAPPING. + * If the next page isn't resident in the top-level object, we + * need to search the shadow chain. When applying MADV_FREE, we + * take care to release any swap space used to store + * non-resident pages. */ - if (advise == MADV_FREE) { - if ((tobject->type != OBJT_DEFAULT && - tobject->type != OBJT_SWAP) || - (tobject->flags & OBJ_ONEMAPPING) == 0) { - goto unlock_tobject; - } - } else if ((tobject->flags & OBJ_UNMANAGED) != 0) - goto unlock_tobject; - m = vm_page_lookup(tobject, tpindex); - if (m == NULL && advise == MADV_WILLNEED) { + if (m == NULL || pindex < m->pindex) { /* - * If the page is cached, reactivate it. + * Optimize a common case: if the top-level object has + * no backing object, we can skip over the non-resident + * range in constant time. */ - m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | - VM_ALLOC_NOBUSY); + if (object->backing_object == NULL) { + tpindex = (m != NULL && m->pindex < end) ? + m->pindex : end; + vm_object_madvise_freespace(object, advice, + pindex, tpindex - pindex); + if ((pindex = tpindex) == end) + break; + goto next_page; + } + + tpindex = pindex; + do { + vm_object_madvise_freespace(tobject, advice, + tpindex, 1); + /* + * Prepare to search the next object in the + * chain. 
+ */ + backing_object = tobject->backing_object; + if (backing_object == NULL) + goto next_pindex; + VM_OBJECT_WLOCK(backing_object); + tpindex += + OFF_TO_IDX(tobject->backing_object_offset); + if (tobject != object) + VM_OBJECT_WUNLOCK(tobject); + tobject = backing_object; + if (!vm_object_advice_applies(tobject, advice)) + goto next_pindex; + } while ((tm = vm_page_lookup(tobject, tpindex)) == + NULL); + } else { +next_page: + tm = m; + m = TAILQ_NEXT(m, listq); } - if (m == NULL) { - /* - * There may be swap even if there is no backing page - */ - if (advise == MADV_FREE && tobject->type == OBJT_SWAP) - swap_pager_freespace(tobject, tpindex, 1); - /* - * next object - */ - backing_object = tobject->backing_object; - if (backing_object == NULL) - goto unlock_tobject; - VM_OBJECT_WLOCK(backing_object); - tpindex += OFF_TO_IDX(tobject->backing_object_offset); - if (tobject != object) - VM_OBJECT_WUNLOCK(tobject); - tobject = backing_object; - goto shadowlookup; - } else if (m->valid != VM_PAGE_BITS_ALL) - goto unlock_tobject; + /* * If the page is not in a normal state, skip it. */ - vm_page_lock(m); - if (m->hold_count != 0 || m->wire_count != 0) { - vm_page_unlock(m); - goto unlock_tobject; + if (tm->valid != VM_PAGE_BITS_ALL) + goto next_pindex; + vm_page_lock(tm); + if (tm->hold_count != 0 || tm->wire_count != 0) { + vm_page_unlock(tm); + goto next_pindex; } - KASSERT((m->flags & PG_FICTITIOUS) == 0, - ("vm_object_madvise: page %p is fictitious", m)); - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("vm_object_madvise: page %p is not managed", m)); - if (vm_page_busied(m)) { - if (advise == MADV_WILLNEED) { + KASSERT((tm->flags & PG_FICTITIOUS) == 0, + ("vm_object_madvise: page %p is fictitious", tm)); + KASSERT((tm->oflags & VPO_UNMANAGED) == 0, + ("vm_object_madvise: page %p is not managed", tm)); + if (vm_page_busied(tm)) { + if (object != tobject) + VM_OBJECT_WUNLOCK(tobject); + VM_OBJECT_WUNLOCK(object); + if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less - * likely to reclaim it. + * likely to reclaim it. */ - vm_page_aflag_set(m, PGA_REFERENCED); + vm_page_aflag_set(tm, PGA_REFERENCED); } - if (object != tobject) - VM_OBJECT_WUNLOCK(object); - VM_OBJECT_WUNLOCK(tobject); - vm_page_busy_sleep(m, "madvpo", false); - VM_OBJECT_WLOCK(object); + vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } - if (advise == MADV_WILLNEED) { - vm_page_activate(m); - } else { - vm_page_advise(m, advise); - } - vm_page_unlock(m); - if (advise == MADV_FREE && tobject->type == OBJT_SWAP) - swap_pager_freespace(tobject, tpindex, 1); -unlock_tobject: + vm_page_advise(tm, advice); + vm_page_unlock(tm); + vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); +next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); - } + } VM_OBJECT_WUNLOCK(object); } @@ -1368,11 +1458,11 @@ goto retry; } - /* vm_page_rename() will handle dirty and cache. */ + /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); - VM_WAIT; + vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; @@ -1403,19 +1493,6 @@ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); - - /* - * Transfer any cached pages from orig_object to new_object. 
- * If swap_pager_copy() found swapped out pages within the - * specified range of orig_object, then it changed - * new_object's type to OBJT_SWAP when it transferred those - * pages to new_object. Otherwise, new_object's type - * should still be OBJT_DEFAULT and orig_object should not - * contain any cached pages within the specified range. - */ - if (__predict_false(!vm_object_cache_is_empty(orig_object))) - vm_page_cache_transfer(orig_object, offidxstart, - new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); @@ -1425,12 +1502,11 @@ VM_OBJECT_WLOCK(new_object); } -#define OBSC_TEST_ALL_SHADOWED 0x0001 #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t -vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, +vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; @@ -1448,8 +1524,9 @@ vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); + /* The page is only NULL when rename fails. */ if (p == NULL) - VM_WAIT; + vm_radix_wait(); else vm_page_busy_sleep(p, "vmocol", false); VM_OBJECT_WLOCK(object); @@ -1458,192 +1535,195 @@ } static bool -vm_object_backing_scan(vm_object_t object, int op) +vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; - vm_page_t next, p, pp; - vm_pindex_t backing_offset_index, new_pindex; + vm_page_t p, pp; + vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; - backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* - * Initial conditions + * Initial conditions: + * + * We do not want to have to test for the existence of swap + * pages in the backing object. XXX but with the new swapper this + * would be pretty easy to do. */ - if (op & OBSC_TEST_ALL_SHADOWED) { + if (backing_object->type != OBJT_DEFAULT && + backing_object->type != OBJT_SWAP) + return (false); + + pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); + p = vm_page_find_least(backing_object, pi); + ps = swap_pager_find_least(backing_object, pi); + + /* + * Only check pages inside the parent object's range and + * inside the parent object's mapping of the backing object. + */ + for (;; pi++) { + if (p != NULL && p->pindex < pi) + p = TAILQ_NEXT(p, listq); + if (ps < pi) + ps = swap_pager_find_least(backing_object, pi); + if (p == NULL && ps >= backing_object->size) + break; + else if (p == NULL) + pi = ps; + else + pi = MIN(p->pindex, ps); + + new_pindex = pi - backing_offset_index; + if (new_pindex >= object->size) + break; + /* - * We do not want to have to test for the existence of cache - * or swap pages in the backing object. XXX but with the - * new swapper this would be pretty easy to do. + * See if the parent has the page or if the parent's object + * pager has the page. If the parent has the page but the page + * is not valid, the parent's object pager must have the page. * - * XXX what about anonymous MAP_SHARED memory that hasn't - * been ZFOD faulted yet? If we do not test for this, the - * shadow test may succeed! XXX + * If this fails, the parent does not completely shadow the + * object and we might as well give up now. 
*/ - if (backing_object->type != OBJT_DEFAULT) { + pp = vm_page_lookup(object, new_pindex); + if ((pp == NULL || pp->valid == 0) && + !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); - } } - if (op & OBSC_COLLAPSE_WAIT) { + return (true); +} + +static bool +vm_object_collapse_scan(vm_object_t object, int op) +{ + vm_object_t backing_object; + vm_page_t next, p, pp; + vm_pindex_t backing_offset_index, new_pindex; + + VM_OBJECT_ASSERT_WLOCKED(object); + VM_OBJECT_ASSERT_WLOCKED(object->backing_object); + + backing_object = object->backing_object; + backing_offset_index = OFF_TO_IDX(object->backing_object_offset); + + /* + * Initial conditions + */ + if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); - } /* * Our scan */ - p = TAILQ_FIRST(&backing_object->memq); - while (p) { + for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; - if (op & OBSC_TEST_ALL_SHADOWED) { - /* - * Ignore pages outside the parent object's range - * and outside the parent object's mapping of the - * backing object. - * - * Note that we do not busy the backing object's - * page. - */ - if (p->pindex < backing_offset_index || - new_pindex >= object->size) { - p = next; - continue; - } - /* - * See if the parent has the page or if the parent's - * object pager has the page. If the parent has the - * page but the page is not valid, the parent's - * object pager must have the page. - * - * If this fails, the parent does not completely shadow - * the object and we might as well give up now. - */ - - pp = vm_page_lookup(object, new_pindex); - if ((pp == NULL || pp->valid == 0) && - !vm_pager_has_page(object, new_pindex, NULL, NULL)) - return (false); - } - /* * Check for busy page */ - if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) { - if (vm_page_busied(p)) { - p = vm_object_backing_scan_wait(object, p, - next, op); - continue; - } + if (vm_page_busied(p)) { + next = vm_object_collapse_scan_wait(object, p, next, op); + continue; + } - KASSERT(p->object == backing_object, - ("vm_object_backing_scan: object mismatch")); + KASSERT(p->object == backing_object, + ("vm_object_collapse_scan: object mismatch")); - if (p->pindex < backing_offset_index || - new_pindex >= object->size) { - if (backing_object->type == OBJT_SWAP) - swap_pager_freespace(backing_object, - p->pindex, 1); + if (p->pindex < backing_offset_index || + new_pindex >= object->size) { + if (backing_object->type == OBJT_SWAP) + swap_pager_freespace(backing_object, p->pindex, + 1); - /* - * Page is out of the parent object's range, we - * can simply destroy it. - */ - vm_page_lock(p); - KASSERT(!pmap_page_is_mapped(p), - ("freeing mapped page %p", p)); - if (p->wire_count == 0) - vm_page_free(p); - else - vm_page_remove(p); - vm_page_unlock(p); - p = next; - continue; - } + /* + * Page is out of the parent object's range, we can + * simply destroy it. + */ + vm_page_lock(p); + KASSERT(!pmap_page_is_mapped(p), + ("freeing mapped page %p", p)); + if (p->wire_count == 0) + vm_page_free(p); + else + vm_page_remove(p); + vm_page_unlock(p); + continue; + } - pp = vm_page_lookup(object, new_pindex); - if (pp != NULL && vm_page_busied(pp)) { - /* - * The page in the parent is busy and - * possibly not (yet) valid. Until - * its state is finalized by the busy - * bit owner, we can't tell whether it - * shadows the original page. 
- * Therefore, we must either skip it - * and the original (backing_object) - * page or wait for its state to be - * finalized. - * - * This is due to a race with vm_fault() - * where we must unbusy the original - * (backing_obj) page before we can - * (re)lock the parent. Hence we can - * get here. - */ - p = vm_object_backing_scan_wait(object, pp, - next, op); - continue; - } + pp = vm_page_lookup(object, new_pindex); + if (pp != NULL && vm_page_busied(pp)) { + /* + * The page in the parent is busy and possibly not + * (yet) valid. Until its state is finalized by the + * busy bit owner, we can't tell whether it shadows the + * original page. Therefore, we must either skip it + * and the original (backing_object) page or wait for + * its state to be finalized. + * + * This is due to a race with vm_fault() where we must + * unbusy the original (backing_obj) page before we can + * (re)lock the parent. Hence we can get here. + */ + next = vm_object_collapse_scan_wait(object, pp, next, + op); + continue; + } - KASSERT(pp == NULL || pp->valid != 0, - ("unbusy invalid page %p", pp)); + KASSERT(pp == NULL || pp->valid != 0, + ("unbusy invalid page %p", pp)); - if (pp != NULL || vm_pager_has_page(object, - new_pindex, NULL, NULL)) { - /* - * The page already exists in the - * parent OR swap exists for this - * location in the parent. Leave the - * parent's page alone. Destroy the - * original page from the backing - * object. - */ - if (backing_object->type == OBJT_SWAP) - swap_pager_freespace(backing_object, - p->pindex, 1); - vm_page_lock(p); - KASSERT(!pmap_page_is_mapped(p), - ("freeing mapped page %p", p)); - if (p->wire_count == 0) - vm_page_free(p); - else - vm_page_remove(p); - vm_page_unlock(p); - p = next; - continue; - } - + if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, + NULL)) { /* - * Page does not exist in parent, rename the - * page from the backing object to the main object. - * - * If the page was mapped to a process, it can remain - * mapped through the rename. - * vm_page_rename() will handle dirty and cache. + * The page already exists in the parent OR swap exists + * for this location in the parent. Leave the parent's + * page alone. Destroy the original page from the + * backing object. */ - if (vm_page_rename(p, object, new_pindex)) { - p = vm_object_backing_scan_wait(object, NULL, - next, op); - continue; - } - - /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) - swap_pager_freespace(backing_object, - new_pindex + backing_offset_index, 1); + swap_pager_freespace(backing_object, p->pindex, + 1); + vm_page_lock(p); + KASSERT(!pmap_page_is_mapped(p), + ("freeing mapped page %p", p)); + if (p->wire_count == 0) + vm_page_free(p); + else + vm_page_remove(p); + vm_page_unlock(p); + continue; + } + /* + * Page does not exist in parent, rename the page from the + * backing object to the main object. + * + * If the page was mapped to a process, it can remain mapped + * through the rename. vm_page_rename() will dirty the page. + */ + if (vm_page_rename(p, object, new_pindex)) { + next = vm_object_collapse_scan_wait(object, NULL, next, + op); + continue; + } + + /* Use the old pindex to free the right page. */ + if (backing_object->type == OBJT_SWAP) + swap_pager_freespace(backing_object, + new_pindex + backing_offset_index, 1); + #if VM_NRESERVLEVEL > 0 - /* - * Rename the reservation. - */ - vm_reserv_rename(p, object, backing_object, - backing_offset_index); + /* + * Rename the reservation. 
+ */ + vm_reserv_rename(p, object, backing_object, + backing_offset_index); #endif - } - p = next; } return (true); } @@ -1665,7 +1745,7 @@ if (backing_object->ref_count != 1) return; - vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT); + vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* @@ -1698,8 +1778,8 @@ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && - backing_object->type != OBJT_SWAP) || - (backing_object->flags & OBJ_DEAD) || + backing_object->type != OBJT_SWAP) || + (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || @@ -1722,7 +1802,7 @@ * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. - * vm_object_backing_scan fails the shadowing test in this + * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { @@ -1731,9 +1811,9 @@ /* * If there is exactly one reference to the backing - * object, we can collapse it into the parent. + * object, we can collapse it into the parent. */ - vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); + vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* @@ -1759,13 +1839,6 @@ backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); - - /* - * Free any cached pages from backing_object. - */ - if (__predict_false( - !vm_object_cache_is_empty(backing_object))) - vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. @@ -1814,8 +1887,7 @@ * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && - !vm_object_backing_scan(object, - OBSC_TEST_ALL_SHADOWED)) { + !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } @@ -1889,6 +1961,8 @@ int options) { vm_page_t p, next; + struct mtx *mtx; + struct pglist pgl; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || @@ -1895,10 +1969,12 @@ (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) - goto skipmemq; + return; vm_object_pip_add(object, 1); + TAILQ_INIT(&pgl); again: p = vm_page_find_least(object, start); + mtx = NULL; /* * Here, the variable "p" is either (1) the page with the least pindex @@ -1915,7 +1991,7 @@ * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. 
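	 *
	 * For example, a caller that only wants to free the clean pages in
	 * a byte range, leaving dirty and wired pages in place, could pass
	 * (sketch; "off" and "len" assumed from the caller):
	 *
	 *	vm_object_page_remove(object, OFF_TO_IDX(off),
	 *	    OFF_TO_IDX(off + len), OBJPR_CLEANONLY);
	 *
	 * while passing 0 for the options removes, or for wired pages
	 * invalidates, everything in the range.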
*/ - vm_page_lock(p); + vm_page_change_lock(p, &mtx); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax", true); @@ -1923,13 +1999,14 @@ goto again; } if (p->wire_count != 0) { - if ((options & OBJPR_NOTMAPPED) == 0) + if ((options & OBJPR_NOTMAPPED) == 0 && + object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } - goto next; + continue; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); @@ -1940,33 +2017,34 @@ KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { - if ((options & OBJPR_NOTMAPPED) == 0) + if ((options & OBJPR_NOTMAPPED) == 0 && + object->ref_count != 0) pmap_remove_write(p); - if (p->dirty) - goto next; + if (p->dirty != 0) + continue; } - if ((options & OBJPR_NOTMAPPED) == 0) + if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); - vm_page_free(p); -next: - vm_page_unlock(p); + p->flags &= ~PG_ZERO; + if (vm_page_free_prep(p, false)) + TAILQ_INSERT_TAIL(&pgl, p, listq); } + if (mtx != NULL) + mtx_unlock(mtx); + vm_page_free_phys_pglist(&pgl); vm_object_pip_wakeup(object); -skipmemq: - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, start, end); } /* - * vm_object_page_cache: + * vm_object_page_noreuse: * - * For the given object, attempt to move the specified clean - * pages to the cache queue. If a page is wired for any reason, - * then it will not be changed. Pages are specified by the given - * range ["start", "end"). As a special case, if "end" is zero, - * then the range extends from "start" to the end of the object. - * Any mappings to the specified pages are removed before the - * pages are moved to the cache queue. + * For the given object, attempt to move the specified pages to + * the head of the inactive queue. This bypasses regular LRU + * operation and allows the pages to be reused quickly under memory + * pressure. If a page is wired for any reason, then it will not + * be queued. Pages are specified by the range ["start", "end"). + * As a special case, if "end" is zero, then the range extends from + * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. @@ -1974,14 +2052,14 @@ * The object must be locked. */ void -vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end) +vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { - struct mtx *mtx, *new_mtx; + struct mtx *mtx; vm_page_t p, next; - VM_OBJECT_ASSERT_WLOCKED(object); + VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, - ("vm_object_page_cache: illegal object %p", object)); + ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); @@ -1993,18 +2071,8 @@ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); - - /* - * Avoid releasing and reacquiring the same page lock. 
- */ - new_mtx = vm_page_lockptr(p); - if (mtx != new_mtx) { - if (mtx != NULL) - mtx_unlock(mtx); - mtx = new_mtx; - mtx_lock(mtx); - } - vm_page_try_to_cache(p); + vm_page_change_lock(p, &mtx); + vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); @@ -2023,7 +2091,7 @@ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { - vm_page_t m, ma[1]; + vm_page_t m; vm_pindex_t pindex; int rv; @@ -2031,11 +2099,7 @@ for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { - ma[0] = m; - rv = vm_pager_get_pages(object, ma, 1, 0); - m = vm_page_lookup(object, pindex); - if (m == NULL) - break; + rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); @@ -2090,7 +2154,7 @@ VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || - (prev_object->flags & OBJ_TMPFS_NODE) != 0) { + (prev_object->flags & OBJ_NOSPLIT) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } @@ -2127,7 +2191,7 @@ /* * If prev_object was charged, then this mapping, - * althought not charged now, may become writable + * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation @@ -2205,7 +2269,7 @@ vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { - vm_object_t tobject; + vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; @@ -2219,6 +2283,7 @@ return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); +again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); @@ -2252,6 +2317,16 @@ m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); + if (vm_page_xbusied(tm)) { + for (tobject = object; locked_depth >= 1; + locked_depth--) { + t1object = tobject->backing_object; + VM_OBJECT_RUNLOCK(tobject); + tobject = t1object; + } + vm_page_busy_sleep(tm, "unwbo", true); + goto again; + } vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: @@ -2258,10 +2333,10 @@ pindex++; } /* Release the accumulated object locks. */ - for (depth = 0; depth < locked_depth; depth++) { - tobject = object->backing_object; - VM_OBJECT_RUNLOCK(object); - object = tobject; + for (tobject = object; locked_depth >= 1; locked_depth--) { + t1object = tobject->backing_object; + VM_OBJECT_RUNLOCK(tobject); + tobject = t1object; } } @@ -2340,9 +2415,9 @@ * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->queue == PQ_ACTIVE) + if (vm_page_active(m)) kvo->kvo_active++; - else if (m->queue == PQ_INACTIVE) + else if (vm_page_inactive(m)) kvo->kvo_inactive++; } Modified: trunk/sys/vm/vm_object.h =================================================================== --- trunk/sys/vm/vm_object.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_object.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_object.h 331722 2018-03-29 02:50:57Z eadler $ */ /* @@ -71,6 +71,7 @@ #include <sys/queue.h> #include <sys/_lock.h> #include <sys/_mutex.h> +#include <sys/_pctrie.h> #include <sys/_rwlock.h> #include <vm/_vm_radix.h> @@ -80,17 +81,6 @@ * * vm_object_t Virtual memory object. * - * The root of cached pages pool is protected by both the per-object lock - * and the free pages queue mutex. - * On insert in the cache radix trie, the per-object lock is expected - * to be already held and the free pages queue mutex will be - * acquired during the operation too. - * On remove and lookup from the cache radix trie, only the free - * pages queue mutex is expected to be locked. - * These rules allow for reliably checking for the presence of cached - * pages with only the per-object lock held, thereby reducing contention - * for the free pages queue mutex. - * * List of locks * (c) const until freed * (o) per-object lock @@ -98,12 +88,17 @@ * */ +#ifndef VM_PAGE_HAVE_PGLIST +TAILQ_HEAD(pglist, vm_page); +#define VM_PAGE_HAVE_PGLIST +#endif + struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ - TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ + struct pglist memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ @@ -119,7 +114,6 @@ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ - struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* @@ -164,17 +158,17 @@ * the handle changed and hash-chain * invalid. * - * swp_bcount - number of swap 'swblock' metablocks, each - * contains up to 16 swapblk assignments. - * see vm/swap_pager.h + * swp_blks - pc-trie of the allocated swap blocks. 
+ * */ struct { void *swp_tmpfs; - int swp_bcount; + struct pctrie swp_blks; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; + void *umtx_data; }; /* @@ -182,10 +176,13 @@ */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ -#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ +#define OBJ_POPULATE 0x0004 /* pager implements populate() */ +#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ -#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ -#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ +#define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ +#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ +#define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */ +#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ @@ -193,14 +190,29 @@ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ +/* + * Helpers to perform conversion between vm_object page indexes and offsets. + * IDX_TO_OFF() converts an index into an offset. + * OFF_TO_IDX() converts an offset into an index. Since offsets are signed + * by default, the sign propagation in OFF_TO_IDX(), when applied to + * negative offsets, is intentional and returns a vm_object page index + * that cannot be created by a userspace mapping. + * UOFF_TO_IDX() treats the offset as an unsigned value and converts it + * into an index accordingly. Use it only when the full range of offset + * values are allowed. Currently, this only applies to device mappings. + * OBJ_MAX_SIZE specifies the maximum page index corresponding to the + * maximum unsigned offset. + */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) +#define UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT) +#define OBJ_MAX_SIZE (UOFF_TO_IDX(UINT64_MAX) + 1) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ -#define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */ +#define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). @@ -243,6 +255,8 @@ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) +#define VM_OBJECT_WOWNED(object) \ + rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) @@ -256,6 +270,30 @@ object->flags |= bits; } +/* + * Conditionally set the object's color, which (1) enables the allocation + * of physical memory reservations for anonymous objects and larger-than- + * superpage-sized named objects and (2) determines the first page offset + * within the object at which a reservation may be allocated. In other + * words, the color determines the alignment of the object with respect + * to the largest superpage boundary. When mapping named objects, like + * files or POSIX shared memory objects, the color should be set to zero + * before a virtual address is selected for the mapping. In contrast, + * for anonymous objects, the color may be set after the virtual address + * is selected. + * + * The object must be locked. 
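To make the IDX_TO_OFF()/OFF_TO_IDX()/UOFF_TO_IDX() comment above concrete, here is a minimal userspace sketch of the same arithmetic. It assumes 4 KB pages and substitutes plain stdint types for the kernel's vm_ooffset_t/vm_pindex_t, so it is an illustration rather than the in-tree definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12                      /* 4 KB pages assumed */
typedef int64_t  ooffset_t;                     /* stand-in for vm_ooffset_t (signed) */
typedef uint64_t pindex_t;                      /* stand-in for vm_pindex_t (unsigned) */

#define IDX_TO_OFF(idx)  (((ooffset_t)(idx)) << PAGE_SHIFT)
#define OFF_TO_IDX(off)  ((pindex_t)(((ooffset_t)(off)) >> PAGE_SHIFT))
#define UOFF_TO_IDX(off) (((pindex_t)(off)) >> PAGE_SHIFT)

int
main(void)
{
        ooffset_t off = -4096;

        /* A page index of 3 corresponds to byte offset 0x3000. */
        printf("IDX_TO_OFF(3)      = %#jx\n", (uintmax_t)IDX_TO_OFF(3));

        /*
         * OFF_TO_IDX() propagates the sign of a negative offset (relying
         * on an arithmetic right shift), so the result is an index that
         * no userspace mapping can create.
         */
        printf("OFF_TO_IDX(-4096)  = %#jx\n", (uintmax_t)OFF_TO_IDX(off));

        /* UOFF_TO_IDX() treats the same bits as an unsigned offset. */
        printf("UOFF_TO_IDX(-4096) = %#jx\n", (uintmax_t)UOFF_TO_IDX(off));
        return (0);
}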
+ */ +static __inline void +vm_object_color(vm_object_t object, u_short color) +{ + + if ((object->flags & OBJ_COLORED) == 0) { + object->pg_color = color; + object->flags |= OBJ_COLORED; + } +} + void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); @@ -263,13 +301,10 @@ void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); -static __inline boolean_t -vm_object_cache_is_empty(vm_object_t object) -{ +void umtx_shm_object_init(vm_object_t object); +void umtx_shm_object_terminated(vm_object_t object); +extern int umtx_shm_vnobj_persistent; - return (vm_radix_is_empty(&object->cache)); -} - vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); @@ -280,10 +315,10 @@ void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); -void vm_object_page_cache(vm_object_t object, vm_pindex_t start, - vm_pindex_t end); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); +void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, + vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); Modified: trunk/sys/vm/vm_page.c =================================================================== --- trunk/sys/vm/vm_page.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_page.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -83,7 +83,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 342797 2019-01-06 00:38:28Z kib $"); #include "opt_vm.h" @@ -92,6 +92,7 @@ #include <sys/lock.h> #include <sys/kernel.h> #include <sys/limits.h> +#include <sys/linker.h> #include <sys/malloc.h> #include <sys/mman.h> #include <sys/msgbuf.h> @@ -98,6 +99,8 @@ #include <sys/mutex.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/sbuf.h> +#include <sys/smp.h> #include <sys/sysctl.h> #include <sys/vmmeter.h> #include <sys/vnode.h> @@ -125,9 +128,9 @@ */ struct vm_domain vm_dom[MAXMEMDOM]; -struct mtx_padalign vm_page_queue_free_mtx; +struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx; -struct mtx_padalign pa_lock[PA_LOCK_COUNT]; +struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; vm_page_t vm_page_array; long vm_page_array_size; @@ -135,25 +138,37 @@ int vm_page_zero_count; static int boot_pages = UMA_BOOT_PAGES; -TUNABLE_INT("vm.boot_pages", &boot_pages); -SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, - "number of pages allocated for bootstrapping the VM system"); +SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &boot_pages, 0, + "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); +static TAILQ_HEAD(, vm_page) blacklist_head; +static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); + +/* Is the page 
daemon waiting for free pages? */ +static int vm_pageout_pages_needed; + static uma_zone_t fakepg_zone; -static struct vnode *vm_page_alloc_init(vm_page_t m); -static void vm_page_cache_turn_free(vm_page_t m); +static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); -static void vm_page_enqueue(int queue, vm_page_t m); +static void vm_page_enqueue(uint8_t queue, vm_page_t m); +static void vm_page_free_phys(vm_page_t m); +static void vm_page_free_wakeup(void); static void vm_page_init_fakepg(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); +static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, + vm_paddr_t high); +static int vm_page_alloc_fail(vm_object_t object, int req); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL); @@ -162,7 +177,7 @@ { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ @@ -210,43 +225,171 @@ void vm_set_page_size(void) { - if (cnt.v_page_size == 0) - cnt.v_page_size = PAGE_SIZE; - if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) + if (vm_cnt.v_page_size == 0) + vm_cnt.v_page_size = PAGE_SIZE; + if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* - * vm_page_blacklist_lookup: + * vm_page_blacklist_next: * - * See if a physical address in this page has been listed - * in the blacklist tunable. Entries in the tunable are - * separated by spaces or commas. If an invalid integer is - * encountered then the rest of the string is skipped. + * Find the next entry in the provided string of blacklist + * addresses. Entries are separated by space, comma, or newline. + * If an invalid integer is encountered then the rest of the + * string is skipped. Updates the list pointer to the next + * character, or NULL if the string is exhausted or invalid. */ -static int -vm_page_blacklist_lookup(char *list, vm_paddr_t pa) +static vm_paddr_t +vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; - for (pos = list; *pos != '\0'; pos = cp) { + if (list == NULL || *list == NULL) + return (0); + if (**list =='\0') { + *list = NULL; + return (0); + } + + /* + * If there's no end pointer then the buffer is coming from + * the kenv and we know it's null-terminated. 
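As a rough, userspace-only sketch of the parsing behaviour vm_page_blacklist_next() describes above: strtoull stands in for the kernel's strtoq, trunc_page() is redefined locally for 4 KB pages, and the helper name is invented for illustration.

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_MASK       0xfffULL                /* 4 KB pages assumed */
#define trunc_page(pa)  ((pa) & ~PAGE_MASK)

/*
 * Return the next blacklisted physical address from a NUL-terminated
 * string.  Entries are separated by space, comma, or newline; on
 * garbage the rest of the string is skipped, and *list is advanced
 * (or set to NULL when exhausted), mirroring the logic above.
 */
static uint64_t
blacklist_next(char **list)
{
        uint64_t bad;
        char *cp;

        if (*list == NULL || **list == '\0') {
                *list = NULL;
                return (0);
        }
        bad = strtoull(*list, &cp, 0);
        if (*cp != '\0' && *cp != ' ' && *cp != ',' && *cp != '\n') {
                *list = NULL;                   /* garbage: skip the rest */
                return (0);
        }
        *list = (*cp == '\0') ? NULL : cp + 1;
        return (trunc_page(bad));
}

int
main(void)
{
        char buf[] = "0x12345678,0x2000000 0x30001234";
        char *next = buf;
        uint64_t pa;

        while (next != NULL)
                if ((pa = blacklist_next(&next)) != 0)
                        printf("blacklist page at %#" PRIx64 "\n", pa);
        return (0);
}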
+ */ + if (end == NULL) + end = *list + strlen(*list); + + /* Ensure that strtoq() won't walk off the end */ + if (*end != '\0') { + if (*end == '\n' || *end == ' ' || *end == ',') + *end = '\0'; + else { + printf("Blacklist not terminated, skipping\n"); + *list = NULL; + return (0); + } + } + + for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); - if (*cp != '\0') { - if (*cp == ' ' || *cp == ',') { - cp++; - if (cp == pos) + if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { + if (bad == 0) { + if (++cp < end) continue; - } else - break; - } - if (pa == trunc_page(bad)) - return (1); + else + break; + } + } else + break; + if (*cp == '\0' || ++cp >= end) + *list = NULL; + else + *list = cp; + return (trunc_page(bad)); } + printf("Garbage in RAM blacklist, skipping\n"); + *list = NULL; return (0); } +bool +vm_page_blacklist_add(vm_paddr_t pa, bool verbose) +{ + vm_page_t m; + int ret; + + m = vm_phys_paddr_to_vm_page(pa); + if (m == NULL) + return (true); /* page does not exist, no failure */ + + mtx_lock(&vm_page_queue_free_mtx); + ret = vm_phys_unfree_page(m); + if (ret != 0) + vm_phys_freecnt_adj(m, -1); + mtx_unlock(&vm_page_queue_free_mtx); + if (ret != 0) { + TAILQ_INSERT_TAIL(&blacklist_head, m, listq); + if (verbose) + printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); + } + return (ret); +} + +/* + * vm_page_blacklist_check: + * + * Iterate through the provided string of blacklist addresses, pulling + * each entry out of the physical allocator free list and putting it + * onto a list for reporting via the vm.page_blacklist sysctl. + */ static void +vm_page_blacklist_check(char *list, char *end) +{ + vm_paddr_t pa; + char *next; + + next = list; + while (next != NULL) { + if ((pa = vm_page_blacklist_next(&next, end)) == 0) + continue; + vm_page_blacklist_add(pa, bootverbose); + } +} + +/* + * vm_page_blacklist_load: + * + * Search for a special module named "ram_blacklist". It'll be a + * plain text file provided by the user via the loader directive + * of the same name. + */ +static void +vm_page_blacklist_load(char **list, char **end) +{ + void *mod; + u_char *ptr; + u_int len; + + mod = NULL; + ptr = NULL; + + mod = preload_search_by_type("ram_blacklist"); + if (mod != NULL) { + ptr = preload_fetch_addr(mod); + len = preload_fetch_size(mod); + } + *list = ptr; + if (ptr != NULL) + *end = ptr + len; + else + *end = NULL; + return; +} + +static int +sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) +{ + vm_page_t m; + struct sbuf sbuf; + int error, first; + + first = 1; + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + TAILQ_FOREACH(m, &blacklist_head, listq) { + sbuf_printf(&sbuf, "%s%#jx", first ? 
"" : ",", + (uintmax_t)m->phys_addr); + first = 0; + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + return (error); +} + +static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; @@ -255,16 +398,19 @@ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = - &cnt.v_inactive_count; + &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = - &cnt.v_active_count; + &vm_cnt.v_active_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = + "vm laundry pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = + &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; - vmd->vmd_pass = 0; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); @@ -274,6 +420,29 @@ } /* + * Initialize a physical page in preparation for adding it to the free + * lists. + */ +static void +vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) +{ + + m->object = NULL; + m->wire_count = 0; + m->busy_lock = VPB_UNBUSIED; + m->hold_count = 0; + m->flags = 0; + m->phys_addr = pa; + m->queue = PQ_NONE; + m->psind = 0; + m->segind = segind; + m->order = VM_NFREEORDER; + m->pool = VM_FREEPOOL_DEFAULT; + m->valid = m->dirty = 0; + pmap_page_init(m); +} + +/* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for @@ -284,19 +453,16 @@ vm_offset_t vm_page_startup(vm_offset_t vaddr) { + struct vm_domain *vmd; + struct vm_phys_seg *seg; + vm_page_t m; + char *list, *listend; vm_offset_t mapped; - vm_paddr_t high_avail, low_avail, page_range, size; - vm_paddr_t new_end; - int i; - vm_paddr_t pa; - vm_paddr_t last_pa; - char *list; + vm_paddr_t end, high_avail, low_avail, new_end, page_range, size; + vm_paddr_t biggestsize, last_pa, pa; + u_long pagecount; + int biggestone, i, pages_per_zone, segind; - /* the biggest memory array is the second group of pages */ - vm_paddr_t end; - vm_paddr_t biggestsize; - int biggestone; - biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); @@ -305,15 +471,6 @@ phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } - -#ifdef XEN - /* - * There is no obvious reason why i386 PV Xen needs vm_page structs - * created for these pseudo-physical addresses. XXX - */ - vm_phys_add_seg(0, phys_avail[0]); -#endif - for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { @@ -334,9 +491,27 @@ vm_page_domain_init(&vm_dom[i]); /* + * Almost all of the pages needed for bootstrapping UMA are used + * for zone structures, so if the number of CPUs results in those + * structures taking more than one page each, we set aside more pages + * in proportion to the zone structure size. + */ + pages_per_zone = howmany(sizeof(struct uma_zone) + + sizeof(struct uma_cache) * (mp_maxid + 1) + + roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE); + if (pages_per_zone > 1) { + /* Reserve more pages so that we don't run out. */ + boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; + } + + /* * Allocate memory for use when boot strapping the kernel memory * allocator. + * + * CTFLAG_RDTUN doesn't work during the early boot process, so we must + * manually fetch the value. 
*/ + TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, @@ -344,8 +519,8 @@ bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); -#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \ - defined(__mips__) +#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ + defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. @@ -367,8 +542,10 @@ vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); +#else + (void)last_pa; #endif -#if defined(__amd64__) || defined(__mips__) +#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically @@ -471,7 +648,9 @@ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); - vm_page_array = (vm_page_t) mapped; + vm_page_array = (vm_page_t)mapped; + vm_page_array_size = page_range; + #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's @@ -481,13 +660,13 @@ high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif -#if defined(__amd64__) || defined(__mips__) +#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); -#endif +#endif phys_avail[biggestone + 1] = new_end; /* @@ -498,38 +677,60 @@ vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* - * Clear all of the page structures - */ - bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); - for (i = 0; i < page_range; i++) - vm_page_array[i].order = VM_NFREEORDER; - vm_page_array_size = page_range; - - /* * Initialize the physical memory allocator. */ vm_phys_init(); /* - * Add every available physical page that is not blacklisted to - * the free lists. + * Initialize the page structures and add every available page to the + * physical memory allocator's free lists. */ - cnt.v_page_count = 0; - cnt.v_free_count = 0; - list = getenv("vm.blacklist"); - for (i = 0; phys_avail[i + 1] != 0; i += 2) { - pa = phys_avail[i]; - last_pa = phys_avail[i + 1]; - while (pa < last_pa) { - if (list != NULL && - vm_page_blacklist_lookup(list, pa)) - printf("Skipping page with pa 0x%jx\n", - (uintmax_t)pa); - else - vm_phys_add_page(pa); - pa += PAGE_SIZE; + vm_cnt.v_page_count = 0; + vm_cnt.v_free_count = 0; + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + for (m = seg->first_page, pa = seg->start; pa < seg->end; + m++, pa += PAGE_SIZE) + vm_page_init_page(m, pa, segind); + + /* + * Add the segment to the free lists only if it is covered by + * one of the ranges in phys_avail. Because we've added the + * ranges to the vm_phys_segs array, we can assume that each + * segment is either entirely contained in one of the ranges, + * or doesn't overlap any of them. 
+ */ + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + if (seg->start < phys_avail[i] || + seg->end > phys_avail[i + 1]) + continue; + + m = seg->first_page; + pagecount = (u_long)atop(seg->end - seg->start); + + mtx_lock(&vm_page_queue_free_mtx); + vm_phys_free_contig(m, pagecount); + vm_phys_freecnt_adj(m, (int)pagecount); + mtx_unlock(&vm_page_queue_free_mtx); + vm_cnt.v_page_count += (u_int)pagecount; + + vmd = &vm_dom[seg->domain]; + vmd->vmd_page_count += (u_int)pagecount; + vmd->vmd_segs |= 1UL << m->segind; + break; } } + + /* + * Remove blacklisted pages from the physical memory allocator. + */ + TAILQ_INIT(&blacklist_head); + vm_page_blacklist_load(&list, &listend); + vm_page_blacklist_check(list, listend); + + list = kern_getenv("vm.blacklist"); + vm_page_blacklist_check(list, NULL); + freeenv(list); #if VM_NRESERVLEVEL > 0 /* @@ -603,6 +804,7 @@ { u_int x; + vm_page_lock_assert(m, MA_NOTOWNED); vm_page_assert_sbusied(m); for (;;) { @@ -683,6 +885,41 @@ } } +static void +vm_page_xunbusy_locked(vm_page_t m) +{ + + vm_page_assert_xbusied(m); + vm_page_assert_locked(m); + + atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); + /* There is a waiter, do wakeup() instead of vm_page_flash(). */ + wakeup(m); +} + +void +vm_page_xunbusy_maybelocked(vm_page_t m) +{ + bool lockacq; + + vm_page_assert_xbusied(m); + + /* + * Fast path for unbusy. If it succeeds, we know that there + * are no waiters, so we do not need a wakeup. + */ + if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, + VPB_UNBUSIED)) + return; + + lockacq = !mtx_owned(vm_page_lockptr(m)); + if (lockacq) + vm_page_lock(m); + vm_page_xunbusy_locked(m); + if (lockacq) + vm_page_unlock(m); +} + /* * vm_page_xunbusy_hard: * @@ -696,8 +933,7 @@ vm_page_assert_xbusied(m); vm_page_lock(m); - atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); - wakeup(m); + vm_page_xunbusy_locked(m); vm_page_unlock(m); } @@ -728,6 +964,23 @@ } /* + * Avoid releasing and reacquiring the same page lock. + */ +void +vm_page_change_lock(vm_page_t m, struct mtx **mtx) +{ + struct mtx *mtx1; + + mtx1 = vm_page_lockptr(m); + if (*mtx == mtx1) + return; + if (*mtx != NULL) + mtx_unlock(*mtx); + *mtx = mtx1; + mtx_lock(mtx1); +} + +/* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary @@ -756,24 +1009,15 @@ * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. - */ + */ void vm_page_unhold_pages(vm_page_t *ma, int count) { - struct mtx *mtx, *new_mtx; + struct mtx *mtx; mtx = NULL; for (; count != 0; count--) { - /* - * Avoid releasing and reacquiring the same page lock. - */ - new_mtx = vm_page_lockptr(*ma); - if (mtx != new_mtx) { - if (mtx != NULL) - mtx_unlock(mtx); - mtx = new_mtx; - mtx_lock(mtx); - } + vm_page_change_lock(*ma, &mtx); vm_page_unhold(*ma); ma++; } @@ -905,39 +1149,29 @@ } /* - * Unbusy and handle the page queueing for a page from the VOP_GETPAGES() - * array which is not the request page. + * Unbusy and handle the page queueing for a page from a getpages request that + * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { - if (m->valid != 0) { - /* - * Since the page is not the requested page, whether - * it should be activated or deactivated is not - * obvious. Empirical results have shown that - * deactivating the page is usually the best choice, - * unless the page is wanted by another thread. 
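The vm_page_change_lock() helper added above is a small lock hand-off idiom: keep the current lock if the next page maps to the same one, otherwise drop it and take the new one. Below is a userspace analogue with pthread mutexes; the bucket-lock scheme and names are invented purely for illustration.

#include <pthread.h>
#include <stdio.h>

#define NLOCKS  4
static pthread_mutex_t bucket_lock[NLOCKS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Pretend each item hashes to one of a small set of locks. */
static pthread_mutex_t *
item_lockptr(int item)
{
        return (&bucket_lock[item % NLOCKS]);
}

/*
 * Switch *mtx to the lock protecting "item", but only drop and
 * reacquire when the lock actually changes, as vm_page_change_lock()
 * does for page locks in the diff above.
 */
static void
item_change_lock(int item, pthread_mutex_t **mtx)
{
        pthread_mutex_t *mtx1 = item_lockptr(item);

        if (*mtx == mtx1)
                return;
        if (*mtx != NULL)
                pthread_mutex_unlock(*mtx);
        *mtx = mtx1;
        pthread_mutex_lock(mtx1);
}

int
main(void)
{
        pthread_mutex_t *mtx = NULL;
        int items[] = { 0, 4, 8, 3, 7 };        /* 0, 4, 8 share a lock */

        for (int i = 0; i < 5; i++) {
                item_change_lock(items[i], &mtx);
                printf("item %d under lock %d\n", items[i], items[i] % NLOCKS);
        }
        if (mtx != NULL)
                pthread_mutex_unlock(mtx);
        return (0);
}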
- */ - vm_page_lock(m); - if ((m->busy_lock & VPB_BIT_WAITERS) != 0) - vm_page_activate(m); - else - vm_page_deactivate(m); - vm_page_unlock(m); - vm_page_xunbusy(m); - } else { - /* - * Free the completely invalid page. Such page state - * occurs due to the short read operation which did - * not covered our page at all, or in case when a read - * error happens. - */ - vm_page_lock(m); - vm_page_free(m); - vm_page_unlock(m); - } + /* We shouldn't put invalid pages on queues. */ + KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); + + /* + * Since the page is not the actually needed one, whether it should + * be activated or deactivated is not obvious. Empirical results + * have shown that deactivating the page is usually the best choice, + * unless the page is wanted by another thread. + */ + vm_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + vm_page_unlock(m); + vm_page_xunbusy(m); } /* @@ -991,11 +1225,7 @@ vm_page_dirty_KBI(vm_page_t m) { - /* These assertions refer to this operation by its public name. */ - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_dirty: page in cache!")); - KASSERT(!VM_PAGE_IS_FREE(m), - ("vm_page_dirty: page is free!")); + /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; @@ -1119,9 +1349,8 @@ /* * vm_page_remove: * - * Removes the given mem entry from the object/offset-page - * table and the object page list, but do not invalidate/terminate - * the backing store. + * Removes the specified page from its containing object, but does not + * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ @@ -1129,30 +1358,21 @@ vm_page_remove(vm_page_t m) { vm_object_t object; - boolean_t lockacq; + vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_lock_assert(m, MA_OWNED); + vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); - if (vm_page_xbusied(m)) { - lockacq = FALSE; - if ((m->oflags & VPO_UNMANAGED) != 0 && - !mtx_owned(vm_page_lockptr(m))) { - lockacq = TRUE; - vm_page_lock(m); - } - vm_page_flash(m); - atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); - if (lockacq) - vm_page_unlock(m); - } + if (vm_page_xbusied(m)) + vm_page_xunbusy_maybelocked(m); + mrem = vm_radix_remove(&object->rtree, m->pindex); + KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. 
*/ - vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); /* @@ -1215,7 +1435,7 @@ { vm_page_t next; - VM_OBJECT_ASSERT_WLOCKED(m->object); + VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) @@ -1235,7 +1455,7 @@ { vm_page_t prev; - VM_OBJECT_ASSERT_WLOCKED(m->object); + VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) @@ -1253,9 +1473,13 @@ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { - vm_page_t mold, mpred; + vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(mnew->object == NULL, + ("vm_page_replace: page %p already in object", mnew)); + KASSERT(mnew->queue == PQ_NONE, + ("vm_page_replace: new page %p is on a paging queue", mnew)); /* * This function mostly follows vm_page_insert() and @@ -1262,31 +1486,24 @@ * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ - mpred = vm_radix_lookup(&object->rtree, pindex); - KASSERT(mpred != NULL, - ("vm_page_replace: replacing page not present with pindex")); - mpred = TAILQ_PREV(mpred, respgs, listq); - if (mpred != NULL) - KASSERT(mpred->pindex < pindex, - ("vm_page_insert_after: mpred doesn't precede pindex")); mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, - ("vm_page_replace: mold is on a paging queue")); + ("vm_page_replace: old page %p is on a paging queue", mold)); - /* Detach the old page from the resident tailq. */ + /* Keep the resident page list in sorted order. */ + TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; - vm_page_xunbusy(mold); + vm_page_xunbusy_maybelocked(mold); - /* Insert the new page in the resident tailq. */ - if (mpred != NULL) - TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq); - else - TAILQ_INSERT_HEAD(&object->memq, mnew, listq); + /* + * The object's resident_page_count does not change because we have + * swapped one page for another, but OBJ_MIGHTBEDIRTY. + */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); @@ -1306,9 +1523,7 @@ * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating - * swap. If the page is on the cache, we have to deactivate it - * or vm_page_dirty() will panic. Dirty pages are not allowed - * on the cache. + * swap. * * The objects must be locked. */ @@ -1354,142 +1569,6 @@ } /* - * Convert all of the given object's cached pages that have a - * pindex within the given range into free pages. If the value - * zero is given for "end", then the range's upper bound is - * infinity. If the given object is backed by a vnode and it - * transitions from having one or more cached pages to none, the - * vnode's hold count is reduced. 
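The reworked vm_page_replace() above keeps the resident list sorted by inserting the new page directly after the old one and only then removing the old page. The same two-step swap, in a self-contained sys/queue.h sketch with invented types and names:

#include <stdio.h>
#include <sys/queue.h>

struct page {
        int pindex;
        TAILQ_ENTRY(page) listq;
};
TAILQ_HEAD(pglist, page);

/* Swap mnew in for mold without disturbing the list's ordering. */
static void
replace_page(struct pglist *memq, struct page *mnew, struct page *mold)
{
        TAILQ_INSERT_AFTER(memq, mold, mnew, listq);
        TAILQ_REMOVE(memq, mold, listq);
}

int
main(void)
{
        struct pglist memq = TAILQ_HEAD_INITIALIZER(memq);
        struct page pages[3] = {
                { .pindex = 1 }, { .pindex = 2 }, { .pindex = 3 }
        };
        struct page repl = { .pindex = 2 };
        struct page *p;
        int i;

        for (i = 0; i < 3; i++)
                TAILQ_INSERT_TAIL(&memq, &pages[i], listq);
        replace_page(&memq, &repl, &pages[1]);  /* replace the old pindex 2 */
        TAILQ_FOREACH(p, &memq, listq)
                printf("%d ", p->pindex);       /* prints: 1 2 3 */
        printf("\n");
        return (0);
}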
- */ -void -vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) -{ - vm_page_t m; - boolean_t empty; - - mtx_lock(&vm_page_queue_free_mtx); - if (__predict_false(vm_radix_is_empty(&object->cache))) { - mtx_unlock(&vm_page_queue_free_mtx); - return; - } - while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { - if (end != 0 && m->pindex >= end) - break; - vm_radix_remove(&object->cache, m->pindex); - vm_page_cache_turn_free(m); - } - empty = vm_radix_is_empty(&object->cache); - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && empty) - vdrop(object->handle); -} - -/* - * Returns the cached page that is associated with the given - * object and offset. If, however, none exists, returns NULL. - * - * The free page queue must be locked. - */ -static inline vm_page_t -vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - return (vm_radix_lookup(&object->cache, pindex)); -} - -/* - * Remove the given cached page from its containing object's - * collection of cached pages. - * - * The free page queue must be locked. - */ -static void -vm_page_cache_remove(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_page_cache_remove: page %p is not cached", m)); - vm_radix_remove(&m->object->cache, m->pindex); - m->object = NULL; - cnt.v_cache_count--; -} - -/* - * Transfer all of the cached pages with offset greater than or - * equal to 'offidxstart' from the original object's cache to the - * new object's cache. However, any cached pages with offset - * greater than or equal to the new object's size are kept in the - * original object. Initially, the new object's cache must be - * empty. Offset 'offidxstart' in the original object must - * correspond to offset zero in the new object. - * - * The new object must be locked. - */ -void -vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, - vm_object_t new_object) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages - * requires the object to be locked. In contrast, removal does - * not. - */ - VM_OBJECT_ASSERT_WLOCKED(new_object); - KASSERT(vm_radix_is_empty(&new_object->cache), - ("vm_page_cache_transfer: object %p has cached pages", - new_object)); - mtx_lock(&vm_page_queue_free_mtx); - while ((m = vm_radix_lookup_ge(&orig_object->cache, - offidxstart)) != NULL) { - /* - * Transfer all of the pages with offset greater than or - * equal to 'offidxstart' from the original object's - * cache to the new object's cache. - */ - if ((m->pindex - offidxstart) >= new_object->size) - break; - vm_radix_remove(&orig_object->cache, m->pindex); - /* Update the page's object and offset. */ - m->object = new_object; - m->pindex -= offidxstart; - if (vm_radix_insert(&new_object->cache, m)) - vm_page_cache_turn_free(m); - } - mtx_unlock(&vm_page_queue_free_mtx); -} - -/* - * Returns TRUE if a cached page is associated with the given object and - * offset, and FALSE otherwise. - * - * The object must be locked. - */ -boolean_t -vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages requires the - * object to be locked. Therefore, if the object is locked and the - * object's collection is empty, there is no need to acquire the free - * page queues lock in order to prove that the specified page doesn't - * exist. 
- */ - VM_OBJECT_ASSERT_WLOCKED(object); - if (__predict_true(vm_object_cache_is_empty(object))) - return (FALSE); - mtx_lock(&vm_page_queue_free_mtx); - m = vm_page_cache_lookup(object, pindex); - mtx_unlock(&vm_page_queue_free_mtx); - return (m != NULL); -} - -/* * vm_page_alloc: * * Allocate and return a page that is associated with the specified @@ -1505,13 +1584,10 @@ * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate - * VM_ALLOC_IFCACHED return page only if it is cached - * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page - * is cached * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and - * should not be exclusive busy + * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page @@ -1521,21 +1597,41 @@ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { - struct vnode *vp = NULL; - vm_object_t m_object; - vm_page_t m, mpred; + + return (vm_page_alloc_after(object, pindex, req, object != NULL ? + vm_radix_lookup_le(&object->rtree, pindex) : NULL)); +} + +/* + * Allocate a page in the specified object with the given page index. To + * optimize insertion of the page into the object, the caller must also specifiy + * the resident page in the object with largest index smaller than the given + * page index, or NULL if no such page exists. + */ +vm_page_t +vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, + vm_page_t mpred) +{ + vm_page_t m; int flags, req_class; + u_int free_count; - mpred = 0; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, - req)); + ("inconsistent object(%p)/req(%x)", object, req)); + KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, + ("Can't sleep and retry object insertion.")); + KASSERT(mpred == NULL || mpred->pindex < pindex, + ("mpred %p doesn't precede pindex 0x%jx", mpred, + (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); + if (__predict_false((req & VM_ALLOC_IFCACHED) != 0)) + return (NULL); + req_class = req & VM_ALLOC_CLASS_MASK; /* @@ -1544,52 +1640,29 @@ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; - if (object != NULL) { - mpred = vm_radix_lookup_le(&object->rtree, pindex); - KASSERT(mpred == NULL || mpred->pindex != pindex, - ("vm_page_alloc: pindex already allocated")); - } - /* - * The page allocation request can came from consumers which already - * hold the free page queue mutex, like vm_page_insert() in - * vm_page_cache(). + * Allocate a page if the number of free pages exceeds the minimum + * for the request class. 
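The check that follows this comment gates allocations on per-class free-page thresholds; here is a compact standalone restatement of that predicate, with counter names shortened and threshold values invented for the demo.

#include <stdbool.h>
#include <stdio.h>

enum req_class { ALLOC_NORMAL, ALLOC_SYSTEM, ALLOC_INTERRUPT };

/*
 * Mirror of the test in vm_page_alloc() above: normal allocations must
 * leave free_reserved pages untouched, system allocations may dig down
 * to interrupt_free_min, and interrupt-time allocations may take the
 * very last free page.
 */
static bool
can_allocate(enum req_class class, unsigned free_count,
    unsigned free_reserved, unsigned interrupt_free_min)
{
        return (free_count > free_reserved ||
            (class == ALLOC_SYSTEM && free_count > interrupt_free_min) ||
            (class == ALLOC_INTERRUPT && free_count > 0));
}

int
main(void)
{
        /* Illustrative thresholds, not real tuning values. */
        unsigned reserved = 1000, intr_min = 200;

        printf("normal, 500 free:    %d\n",
            can_allocate(ALLOC_NORMAL, 500, reserved, intr_min));
        printf("system, 500 free:    %d\n",
            can_allocate(ALLOC_SYSTEM, 500, reserved, intr_min));
        printf("interrupt, 1 free:   %d\n",
            can_allocate(ALLOC_INTERRUPT, 1, reserved, intr_min));
        return (0);
}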
*/ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); - if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || +again: + mtx_lock(&vm_page_queue_free_mtx); + if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && - cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || + vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && - cnt.v_free_count + cnt.v_cache_count > 0)) { + vm_cnt.v_free_count > 0)) { /* - * Allocate from the free queue if the number of free pages - * exceeds the minimum for the request class. + * Can we allocate the page from a reservation? */ - if (object != NULL && - (m = vm_page_cache_lookup(object, pindex)) != NULL) { - if ((req & VM_ALLOC_IFNOTCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); - } - if (vm_phys_unfree_page(m)) - vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); #if VM_NRESERVLEVEL > 0 - else if (!vm_reserv_reactivate_page(m)) -#else - else -#endif - panic("vm_page_alloc: cache page %p is missing" - " from the free queue", m); - } else if ((req & VM_ALLOC_IFCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); -#if VM_NRESERVLEVEL > 0 - } else if (object == NULL || (object->flags & (OBJ_COLORED | + if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = - vm_reserv_alloc_page(object, pindex, mpred)) == NULL) { -#else - } else { + vm_reserv_alloc_page(object, pindex, mpred)) == NULL) #endif + { + /* + * If not, allocate it from the free page queues. + */ m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 @@ -1604,10 +1677,8 @@ /* * Not allocatable, give up. */ - mtx_unlock(&vm_page_queue_free_mtx); - atomic_add_int(&vm_pageout_deficit, - max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); - pagedaemon_wakeup(); + if (vm_page_alloc_fail(object, req)) + goto again; return (NULL); } @@ -1614,52 +1685,23 @@ /* * At this point we had better have found a good page. */ - KASSERT(m != NULL, ("vm_page_alloc: missing page")); - KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); - KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); - KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); - KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m)); - KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); - KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc: page %p has unexpected memattr %d", m, - pmap_page_get_memattr(m))); - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc: cached page %p is PG_ZERO", m)); - KASSERT(m->valid != 0, - ("vm_page_alloc: cached page %p is invalid", m)); - if (m->object == object && m->pindex == pindex) - cnt.v_reactivated++; - else - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - vp = m_object->handle; - } else { - KASSERT(VM_PAGE_IS_FREE(m), - ("vm_page_alloc: page %p is not free", m)); - KASSERT(m->valid == 0, - ("vm_page_alloc: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - } + KASSERT(m != NULL, ("missing page")); + free_count = vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; + mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* - * Only the PG_ZERO flag is inherited. 
The PG_CACHED or PG_FREE flag - * must be cleared before the free page queues lock is released. + * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; - if (m->flags & PG_ZERO) { - vm_page_zero_count--; - if (req & VM_ALLOC_ZERO) - flags = PG_ZERO; - } - if (req & VM_ALLOC_NODUMP) + if ((req & VM_ALLOC_ZERO) != 0) + flags = PG_ZERO; + flags &= m->flags; + if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; - mtx_unlock(&vm_page_queue_free_mtx); m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; @@ -1673,7 +1715,7 @@ * The page lock is not required for wiring a page until that * page is inserted into the object. */ - atomic_add_int(&cnt.v_wire_count, 1); + atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; @@ -1680,18 +1722,21 @@ if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { - /* See the comment below about hold count. */ - if (vp != NULL) - vdrop(vp); pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { - atomic_subtract_int(&cnt.v_wire_count, 1); + atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } - m->object = NULL; + KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); + if (req & VM_ALLOC_WAITFAIL) { + VM_OBJECT_WUNLOCK(object); + vm_radix_wait(); + VM_OBJECT_WLOCK(object); + } return (NULL); } @@ -1703,34 +1748,15 @@ m->pindex = pindex; /* - * The following call to vdrop() must come after the above call - * to vm_page_insert() in case both affect the same object and - * vnode. Otherwise, the affected vnode's hold count could - * temporarily become zero. - */ - if (vp != NULL) - vdrop(vp); - - /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ - if (vm_paging_needed()) + if (vm_paging_needed(free_count)) pagedaemon_wakeup(); return (m); } -static void -vm_page_alloc_contig_vdrop(struct spglist *lst) -{ - - while (!SLIST_EMPTY(lst)) { - vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv); - SLIST_REMOVE_HEAD(lst, plinks.s.ss); - } -} - /* * vm_page_alloc_contig: * @@ -1752,6 +1778,8 @@ * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * + * The specified object may not contain fictitious pages. + * * The caller must always specify an allocation class. 
* * allocation classes: @@ -1763,7 +1791,7 @@ * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and - * should not be exclusive busy + * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page @@ -1775,22 +1803,23 @@ u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { - struct vnode *drop; - struct spglist deferred_vdrop_list; - vm_page_t m, m_tmp, m_ret; - u_int flags, oflags; + vm_page_t m, m_ret, mpred; + u_int busy_lock, flags, oflags; int req_class; + mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, + ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); + KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, + ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(object->type == OBJT_PHYS, - ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", + KASSERT((object->flags & OBJ_FICTITIOUS) == 0, + ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); @@ -1802,40 +1831,48 @@ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; - SLIST_INIT(&deferred_vdrop_list); + if (object != NULL) { + mpred = vm_radix_lookup_le(&object->rtree, pindex); + KASSERT(mpred == NULL || mpred->pindex != pindex, + ("vm_page_alloc_contig: pindex already allocated")); + } + + /* + * Can we allocate the pages without the number of free pages falling + * below the lower bound for the allocation class? + */ +again: mtx_lock(&vm_page_queue_free_mtx); - if (cnt.v_free_count + cnt.v_cache_count >= npages + - cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && - cnt.v_free_count + cnt.v_cache_count >= npages + - cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && - cnt.v_free_count + cnt.v_cache_count >= npages)) { + if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved || + (req_class == VM_ALLOC_SYSTEM && + vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) || + (req_class == VM_ALLOC_INTERRUPT && + vm_cnt.v_free_count >= npages)) { + /* + * Can we allocate the pages from a reservation? + */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, - low, high, alignment, boundary)) == NULL) + low, high, alignment, boundary, mpred)) == NULL) #endif + /* + * If not, allocate them from the free page queues. + */ m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { - mtx_unlock(&vm_page_queue_free_mtx); - atomic_add_int(&vm_pageout_deficit, npages); - pagedaemon_wakeup(); + if (vm_page_alloc_fail(object, req)) + goto again; return (NULL); } - if (m_ret != NULL) - for (m = m_ret; m < &m_ret[npages]; m++) { - drop = vm_page_alloc_init(m); - if (drop != NULL) { - /* - * Enqueue the vnode for deferred vdrop(). 
- */ - m->plinks.s.pv = drop; - SLIST_INSERT_HEAD(&deferred_vdrop_list, m, - plinks.s.ss); - } - } - else { + if (m_ret != NULL) { + vm_phys_freecnt_adj(m_ret, -npages); + for (m = m_ret; m < &m_ret[npages]; m++) + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; + } else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) @@ -1845,6 +1882,8 @@ mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); + for (m = m_ret; m < &m_ret[npages]; m++) + vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. @@ -1854,9 +1893,15 @@ flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; + oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? + VPO_UNMANAGED : 0; + busy_lock = VPB_UNBUSIED; + if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) + busy_lock = VPB_SINGLE_EXCLUSIVER; + if ((req & VM_ALLOC_SBUSY) != 0) + busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) - atomic_add_int(&cnt.v_wire_count, npages); - oflags = VPO_UNMANAGED; + atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) @@ -1865,39 +1910,37 @@ for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; - m->busy_lock = VPB_UNBUSIED; - if (object != NULL) { - if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) - m->busy_lock = VPB_SINGLE_EXCLUSIVER; - if ((req & VM_ALLOC_SBUSY) != 0) - m->busy_lock = VPB_SHARERS_WORD(1); - } + m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; - /* Unmanaged pages don't use "act_count". */ + m->act_count = 0; m->oflags = oflags; if (object != NULL) { - if (vm_page_insert(m, object, pindex)) { - vm_page_alloc_contig_vdrop( - &deferred_vdrop_list); - if (vm_paging_needed()) - pagedaemon_wakeup(); + if (vm_page_insert_after(m, object, pindex, mpred)) { + pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) - atomic_subtract_int(&cnt.v_wire_count, - npages); - for (m_tmp = m, m = m_ret; - m < &m_ret[npages]; m++) { - if ((req & VM_ALLOC_WIRED) != 0) + atomic_subtract_int( + &vm_cnt.v_wire_count, npages); + KASSERT(m->object == NULL, + ("page %p has object", m)); + mpred = m; + for (m = m_ret; m < &m_ret[npages]; m++) { + if (m <= mpred && + (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; - if (m >= m_tmp) { - m->object = NULL; - m->oflags |= VPO_UNMANAGED; - } + m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); } + if (req & VM_ALLOC_WAITFAIL) { + VM_OBJECT_WUNLOCK(object); + vm_radix_wait(); + VM_OBJECT_WLOCK(object); + } return (NULL); } + mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) @@ -1904,63 +1947,29 @@ pmap_page_set_memattr(m, memattr); pindex++; } - vm_page_alloc_contig_vdrop(&deferred_vdrop_list); - if (vm_paging_needed()) + if (vm_paging_needed(vm_cnt.v_free_count)) pagedaemon_wakeup(); return (m_ret); } /* - * Initialize a page that has been freshly dequeued from a freelist. - * The caller has to drop the vnode returned, if it is not NULL. - * - * This function may only be used to initialize unmanaged pages. - * - * To be called with vm_page_queue_free_mtx held. + * Check a page that has been freshly dequeued from a freelist. 
*/ -static struct vnode * -vm_page_alloc_init(vm_page_t m) +static void +vm_page_alloc_check(vm_page_t m) { - struct vnode *drop; - vm_object_t m_object; + KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc_init: page %p has unexpected queue %d", - m, m->queue)); - KASSERT(m->wire_count == 0, - ("vm_page_alloc_init: page %p is wired", m)); - KASSERT(m->hold_count == 0, - ("vm_page_alloc_init: page %p is held", m)); - KASSERT(!vm_page_busied(m), - ("vm_page_alloc_init: page %p is busy", m)); - KASSERT(m->dirty == 0, - ("vm_page_alloc_init: page %p is dirty", m)); + ("page %p has unexpected queue %d", m, m->queue)); + KASSERT(m->wire_count == 0, ("page %p is wired", m)); + KASSERT(m->hold_count == 0, ("page %p is held", m)); + KASSERT(!vm_page_busied(m), ("page %p is busy", m)); + KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc_init: page %p has unexpected memattr %d", + ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - drop = NULL; - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - drop = m_object->handle; - } else { - KASSERT(VM_PAGE_IS_FREE(m), - ("vm_page_alloc_init: page %p is not free", m)); - KASSERT(m->valid == 0, - ("vm_page_alloc_init: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - if ((m->flags & PG_ZERO) != 0) - vm_page_zero_count--; - } - /* Don't clear the PG_ZERO flag; we'll need it later. */ - m->flags &= PG_ZERO; - return (drop); + KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* @@ -1986,9 +1995,8 @@ vm_page_t vm_page_alloc_freelist(int flind, int req) { - struct vnode *drop; vm_page_t m; - u_int flags; + u_int flags, free_count; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; @@ -2002,18 +2010,17 @@ /* * Do not allocate reserved pages unless the req has asked for it. */ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); - if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || +again: + mtx_lock(&vm_page_queue_free_mtx); + if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && - cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || + vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && - cnt.v_free_count + cnt.v_cache_count > 0)) + vm_cnt.v_free_count > 0)) { m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); - else { - mtx_unlock(&vm_page_queue_free_mtx); - atomic_add_int(&vm_pageout_deficit, - max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); - pagedaemon_wakeup(); + } else { + if (vm_page_alloc_fail(NULL, req)) + goto again; return (NULL); } if (m == NULL) { @@ -2020,8 +2027,11 @@ mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } - drop = vm_page_alloc_init(m); + free_count = vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. @@ -2036,44 +2046,602 @@ * The page lock is not required for wiring a page that does * not belong to an object. 
*/ - atomic_add_int(&cnt.v_wire_count, 1); + atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; - if (drop != NULL) - vdrop(drop); - if (vm_paging_needed()) + if (vm_paging_needed(free_count)) pagedaemon_wakeup(); return (m); } +#define VPSC_ANY 0 /* No restrictions. */ +#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ +#define VPSC_NOSUPER 2 /* Skip superpages. */ + /* + * vm_page_scan_contig: + * + * Scan vm_page_array[] between the specified entries "m_start" and + * "m_end" for a run of contiguous physical pages that satisfy the + * specified conditions, and return the lowest page in the run. The + * specified "alignment" determines the alignment of the lowest physical + * page in the run. If the specified "boundary" is non-zero, then the + * run of physical pages cannot span a physical address that is a + * multiple of "boundary". + * + * "m_end" is never dereferenced, so it need not point to a vm_page + * structure within vm_page_array[]. + * + * "npages" must be greater than zero. "m_start" and "m_end" must not + * span a hole (or discontiguity) in the physical address space. Both + * "alignment" and "boundary" must be a power of two. + */ +vm_page_t +vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, + u_long alignment, vm_paddr_t boundary, int options) +{ + struct mtx *m_mtx; + vm_object_t object; + vm_paddr_t pa; + vm_page_t m, m_run; +#if VM_NRESERVLEVEL > 0 + int level; +#endif + int m_inc, order, run_ext, run_len; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + m_run = NULL; + run_len = 0; + m_mtx = NULL; + for (m = m_start; m < m_end && run_len < npages; m += m_inc) { + KASSERT((m->flags & PG_MARKER) == 0, + ("page %p is PG_MARKER", m)); + KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1, + ("fictitious page %p has invalid wire count", m)); + + /* + * If the current page would be the start of a run, check its + * physical address against the end, alignment, and boundary + * conditions. If it doesn't satisfy these conditions, either + * terminate the scan or advance to the next page that + * satisfies the failed condition. + */ + if (run_len == 0) { + KASSERT(m_run == NULL, ("m_run != NULL")); + if (m + npages > m_end) + break; + pa = VM_PAGE_TO_PHYS(m); + if ((pa & (alignment - 1)) != 0) { + m_inc = atop(roundup2(pa, alignment) - pa); + continue; + } + if (rounddown2(pa ^ (pa + ptoa(npages) - 1), + boundary) != 0) { + m_inc = atop(roundup2(pa, boundary) - pa); + continue; + } + } else + KASSERT(m_run != NULL, ("m_run == NULL")); + + vm_page_change_lock(m, &m_mtx); + m_inc = 1; +retry: + if (m->wire_count != 0 || m->hold_count != 0) + run_ext = 0; +#if VM_NRESERVLEVEL > 0 + else if ((level = vm_reserv_level(m)) >= 0 && + (options & VPSC_NORESERV) != 0) { + run_ext = 0; + /* Advance to the end of the reservation. */ + pa = VM_PAGE_TO_PHYS(m); + m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - + pa); + } +#endif + else if ((object = m->object) != NULL) { + /* + * The page is considered eligible for relocation if + * and only if it could be laundered or reclaimed by + * the page daemon. + */ + if (!VM_OBJECT_TRYRLOCK(object)) { + mtx_unlock(m_mtx); + VM_OBJECT_RLOCK(object); + mtx_lock(m_mtx); + if (m->object != object) { + /* + * The page may have been freed. 
+ */ + VM_OBJECT_RUNLOCK(object); + goto retry; + } else if (m->wire_count != 0 || + m->hold_count != 0) { + run_ext = 0; + goto unlock; + } + } + KASSERT((m->flags & PG_UNHOLDFREE) == 0, + ("page %p is PG_UNHOLDFREE", m)); + /* Don't care: PG_NODUMP, PG_ZERO. */ + if (object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + object->type != OBJT_VNODE) { + run_ext = 0; +#if VM_NRESERVLEVEL > 0 + } else if ((options & VPSC_NOSUPER) != 0 && + (level = vm_reserv_level_iffullpop(m)) >= 0) { + run_ext = 0; + /* Advance to the end of the superpage. */ + pa = VM_PAGE_TO_PHYS(m); + m_inc = atop(roundup2(pa + 1, + vm_reserv_size(level)) - pa); +#endif + } else if (object->memattr == VM_MEMATTR_DEFAULT && + m->queue != PQ_NONE && !vm_page_busied(m)) { + /* + * The page is allocated but eligible for + * relocation. Extend the current run by one + * page. + */ + KASSERT(pmap_page_get_memattr(m) == + VM_MEMATTR_DEFAULT, + ("page %p has an unexpected memattr", m)); + KASSERT((m->oflags & (VPO_SWAPINPROG | + VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, + ("page %p has unexpected oflags", m)); + /* Don't care: VPO_NOSYNC. */ + run_ext = 1; + } else + run_ext = 0; +unlock: + VM_OBJECT_RUNLOCK(object); +#if VM_NRESERVLEVEL > 0 + } else if (level >= 0) { + /* + * The page is reserved but not yet allocated. In + * other words, it is still free. Extend the current + * run by one page. + */ + run_ext = 1; +#endif + } else if ((order = m->order) < VM_NFREEORDER) { + /* + * The page is enqueued in the physical memory + * allocator's free page queues. Moreover, it is the + * first page in a power-of-two-sized run of + * contiguous free pages. Add these pages to the end + * of the current run, and jump ahead. + */ + run_ext = 1 << order; + m_inc = 1 << order; + } else { + /* + * Skip the page for one of the following reasons: (1) + * It is enqueued in the physical memory allocator's + * free page queues. However, it is not the first + * page in a run of contiguous free pages. (This case + * rarely occurs because the scan is performed in + * ascending order.) (2) It is not reserved, and it is + * transitioning from free to allocated. (Conversely, + * the transition from allocated to free for managed + * pages is blocked by the page lock.) (3) It is + * allocated but not contained by an object and not + * wired, e.g., allocated by Xen's balloon driver. + */ + run_ext = 0; + } + + /* + * Extend or reset the current run of pages. + */ + if (run_ext > 0) { + if (run_len == 0) + m_run = m; + run_len += run_ext; + } else { + if (run_len > 0) { + m_run = NULL; + run_len = 0; + } + } + } + if (m_mtx != NULL) + mtx_unlock(m_mtx); + if (run_len >= npages) + return (m_run); + return (NULL); +} + +/* + * vm_page_reclaim_run: + * + * Try to relocate each of the allocated virtual pages within the + * specified run of physical pages to a new physical address. Free the + * physical pages underlying the relocated virtual pages. A virtual page + * is relocatable if and only if it could be laundered or reclaimed by + * the page daemon. Whenever possible, a virtual page is relocated to a + * physical address above "high". + * + * Returns 0 if every physical page within the run was already free or + * just freed by a successful relocation. Otherwise, returns a non-zero + * value indicating why the last attempt to relocate a virtual page was + * unsuccessful. + * + * "req_class" must be an allocation class. 
+ */ +static int +vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, + vm_paddr_t high) +{ + struct mtx *m_mtx; + struct spglist free; + vm_object_t object; + vm_paddr_t pa; + vm_page_t m, m_end, m_new; + int error, order, req; + + KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, + ("req_class is not an allocation class")); + SLIST_INIT(&free); + error = 0; + m = m_run; + m_end = m_run + npages; + m_mtx = NULL; + for (; error == 0 && m < m_end; m++) { + KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, + ("page %p is PG_FICTITIOUS or PG_MARKER", m)); + + /* + * Avoid releasing and reacquiring the same page lock. + */ + vm_page_change_lock(m, &m_mtx); +retry: + if (m->wire_count != 0 || m->hold_count != 0) + error = EBUSY; + else if ((object = m->object) != NULL) { + /* + * The page is relocated if and only if it could be + * laundered or reclaimed by the page daemon. + */ + if (!VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(m_mtx); + VM_OBJECT_WLOCK(object); + mtx_lock(m_mtx); + if (m->object != object) { + /* + * The page may have been freed. + */ + VM_OBJECT_WUNLOCK(object); + goto retry; + } else if (m->wire_count != 0 || + m->hold_count != 0) { + error = EBUSY; + goto unlock; + } + } + KASSERT((m->flags & PG_UNHOLDFREE) == 0, + ("page %p is PG_UNHOLDFREE", m)); + /* Don't care: PG_NODUMP, PG_ZERO. */ + if (object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + object->type != OBJT_VNODE) + error = EINVAL; + else if (object->memattr != VM_MEMATTR_DEFAULT) + error = EINVAL; + else if (m->queue != PQ_NONE && !vm_page_busied(m)) { + KASSERT(pmap_page_get_memattr(m) == + VM_MEMATTR_DEFAULT, + ("page %p has an unexpected memattr", m)); + KASSERT((m->oflags & (VPO_SWAPINPROG | + VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, + ("page %p has unexpected oflags", m)); + /* Don't care: VPO_NOSYNC. */ + if (m->valid != 0) { + /* + * First, try to allocate a new page + * that is above "high". Failing + * that, try to allocate a new page + * that is below "m_run". Allocate + * the new page between the end of + * "m_run" and "high" only as a last + * resort. + */ + req = req_class | VM_ALLOC_NOOBJ; + if ((m->flags & PG_NODUMP) != 0) + req |= VM_ALLOC_NODUMP; + if (trunc_page(high) != + ~(vm_paddr_t)PAGE_MASK) { + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + round_page(high), + ~(vm_paddr_t)0, + PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } else + m_new = NULL; + if (m_new == NULL) { + pa = VM_PAGE_TO_PHYS(m_run); + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + 0, pa - 1, PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } + if (m_new == NULL) { + pa += ptoa(npages); + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + pa, high, PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } + if (m_new == NULL) { + error = ENOMEM; + goto unlock; + } + KASSERT(m_new->wire_count == 0, + ("page %p is wired", m_new)); + + /* + * Replace "m" with the new page. For + * vm_page_replace(), "m" must be busy + * and dequeued. Finally, change "m" + * as if vm_page_free() was called. + */ + if (object->ref_count != 0) + pmap_remove_all(m); + m_new->aflags = m->aflags; + KASSERT(m_new->oflags == VPO_UNMANAGED, + ("page %p is managed", m_new)); + m_new->oflags = m->oflags & VPO_NOSYNC; + pmap_copy_page(m, m_new); + m_new->valid = m->valid; + m_new->dirty = m->dirty; + m->flags &= ~PG_ZERO; + vm_page_xbusy(m); + vm_page_remque(m); + vm_page_replace_checked(m_new, object, + m->pindex, m); + m->valid = 0; + vm_page_undirty(m); + + /* + * The new page must be deactivated + * before the object is unlocked. 
+ */ + vm_page_change_lock(m_new, &m_mtx); + vm_page_deactivate(m_new); + } else { + m->flags &= ~PG_ZERO; + vm_page_remque(m); + vm_page_remove(m); + KASSERT(m->dirty == 0, + ("page %p is dirty", m)); + } + SLIST_INSERT_HEAD(&free, m, plinks.s.ss); + } else + error = EBUSY; +unlock: + VM_OBJECT_WUNLOCK(object); + } else { + mtx_lock(&vm_page_queue_free_mtx); + order = m->order; + if (order < VM_NFREEORDER) { + /* + * The page is enqueued in the physical memory + * allocator's free page queues. Moreover, it + * is the first page in a power-of-two-sized + * run of contiguous free pages. Jump ahead + * to the last page within that run, and + * continue from there. + */ + m += (1 << order) - 1; + } +#if VM_NRESERVLEVEL > 0 + else if (vm_reserv_is_page_free(m)) + order = 0; +#endif + mtx_unlock(&vm_page_queue_free_mtx); + if (order == VM_NFREEORDER) + error = EINVAL; + } + } + if (m_mtx != NULL) + mtx_unlock(m_mtx); + if ((m = SLIST_FIRST(&free)) != NULL) { + mtx_lock(&vm_page_queue_free_mtx); + do { + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + vm_page_free_phys(m); + } while ((m = SLIST_FIRST(&free)) != NULL); + vm_page_zero_idle_wakeup(); + vm_page_free_wakeup(); + mtx_unlock(&vm_page_queue_free_mtx); + } + return (error); +} + +#define NRUNS 16 + +CTASSERT(powerof2(NRUNS)); + +#define RUN_INDEX(count) ((count) & (NRUNS - 1)) + +#define MIN_RECLAIM 8 + +/* + * vm_page_reclaim_contig: + * + * Reclaim allocated, contiguous physical memory satisfying the specified + * conditions by relocating the virtual pages using that physical memory. + * Returns true if reclamation is successful and false otherwise. Since + * relocation requires the allocation of physical pages, reclamation may + * fail due to a shortage of free pages. When reclamation fails, callers + * are expected to perform VM_WAIT before retrying a failed allocation + * operation, e.g., vm_page_alloc_contig(). + * + * The caller must always specify an allocation class through "req". + * + * allocation classes: + * VM_ALLOC_NORMAL normal process request + * VM_ALLOC_SYSTEM system *really* needs a page + * VM_ALLOC_INTERRUPT interrupt time request + * + * The optional allocation flags are ignored. + * + * "npages" must be greater than zero. Both "alignment" and "boundary" + * must be a power of two. + */ +bool +vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary) +{ + vm_paddr_t curr_low; + vm_page_t m_run, m_runs[NRUNS]; + u_long count, reclaimed; + int error, i, options, req_class; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + req_class = req & VM_ALLOC_CLASS_MASK; + + /* + * The page daemon is allowed to dig deeper into the free page list. + */ + if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) + req_class = VM_ALLOC_SYSTEM; + + /* + * Return if the number of free pages cannot satisfy the requested + * allocation. + */ + count = vm_cnt.v_free_count; + if (count < npages + vm_cnt.v_free_reserved || (count < npages + + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || + (count < npages && req_class == VM_ALLOC_INTERRUPT)) + return (false); + + /* + * Scan up to three times, relaxing the restrictions ("options") on + * the reclamation of reservations and superpages each time. 
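The NRUNS/RUN_INDEX() bookkeeping introduced above works because NRUNS is a power of two: masking an ever-increasing counter with NRUNS - 1 turns m_runs[] into a ring that silently overwrites the oldest entries, so an ascending scan leaves only the highest NRUNS runs behind, and walking the counter back down replays them highest-first. A standalone sketch of just that behaviour (illustrative only):

#include <stdio.h>

#define	NRUNS			16
#define	RUN_INDEX(count)	((count) & (NRUNS - 1))

int
main(void)
{
	unsigned long count, i, runs[NRUNS];

	/*
	 * Pretend the scan found 40 runs at ascending "addresses" 0..39.
	 * The ring index keeps only the last (highest) 16 of them, with no
	 * shifting and no bounds test.
	 */
	count = 0;
	for (i = 0; i < 40; i++) {
		runs[RUN_INDEX(count)] = i;
		count++;
	}

	/* Replay in LIFO (descending) order, as the reclaim loop does. */
	for (i = 0; count > 0 && i < NRUNS; i++) {
		count--;
		printf("%lu ", runs[RUN_INDEX(count)]);	/* 39 38 ... 24 */
	}
	printf("\n");
	return (0);
}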
+ */ + for (options = VPSC_NORESERV;;) { + /* + * Find the highest runs that satisfy the given constraints + * and restrictions, and record them in "m_runs". + */ + curr_low = low; + count = 0; + for (;;) { + m_run = vm_phys_scan_contig(npages, curr_low, high, + alignment, boundary, options); + if (m_run == NULL) + break; + curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); + m_runs[RUN_INDEX(count)] = m_run; + count++; + } + + /* + * Reclaim the highest runs in LIFO (descending) order until + * the number of reclaimed pages, "reclaimed", is at least + * MIN_RECLAIM. Reset "reclaimed" each time because each + * reclamation is idempotent, and runs will (likely) recur + * from one scan to the next as restrictions are relaxed. + */ + reclaimed = 0; + for (i = 0; count > 0 && i < NRUNS; i++) { + count--; + m_run = m_runs[RUN_INDEX(count)]; + error = vm_page_reclaim_run(req_class, npages, m_run, + high); + if (error == 0) { + reclaimed += npages; + if (reclaimed >= MIN_RECLAIM) + return (true); + } + } + + /* + * Either relax the restrictions on the next scan or return if + * the last scan had no restrictions. + */ + if (options == VPSC_NORESERV) + options = VPSC_NOSUPER; + else if (options == VPSC_NOSUPER) + options = VPSC_ANY; + else if (options == VPSC_ANY) + return (reclaimed != 0); + } +} + +/* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ -void -vm_wait(void) +static void +_vm_wait(void) { - mtx_lock(&vm_page_queue_free_mtx); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { - if (!vm_pages_needed) { - vm_pages_needed = 1; - wakeup(&vm_pages_needed); - } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, - "vmwait", 0); + if (pageproc == NULL) + panic("vm_wait in early boot"); + pagedaemon_wait(PVM, "vmwait"); } } +void +vm_wait(void) +{ + + mtx_lock(&vm_page_queue_free_mtx); + _vm_wait(); +} + /* + * vm_page_alloc_fail: + * + * Called when a page allocation function fails. Informs the + * pagedaemon and performs the requested wait. Requires the + * page_queue_free and object lock on entry. Returns with the + * object lock held and free lock released. Returns an error when + * retry is necessary. + * + */ +static int +vm_page_alloc_fail(vm_object_t object, int req) +{ + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + + atomic_add_int(&vm_pageout_deficit, + max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); + if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { + if (object != NULL) + VM_OBJECT_WUNLOCK(object); + _vm_wait(); + if (object != NULL) + VM_OBJECT_WLOCK(object); + if (req & VM_ALLOC_WAITOK) + return (EAGAIN); + } else { + mtx_unlock(&vm_page_queue_free_mtx); + pagedaemon_wakeup(); + } + return (0); +} + +/* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. 
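vm_page_alloc_fail() above centralizes what happens on an allocation shortfall: it records the deficit and either sleeps (VM_ALLOC_WAITOK, whose EAGAIN return asks the caller to retry, or VM_ALLOC_WAITFAIL, which does not) or simply wakes the pagedaemon. Seen from a vm_page_alloc() caller the resulting policies look roughly like the sketch below; the helper name is hypothetical, kernel context is assumed, and the usual object locking rules still apply.

/* Hypothetical caller; "object" is write-locked as before. */
static vm_page_t
alloc_one_page(vm_object_t object, vm_pindex_t pindex, bool canwait)
{
	vm_page_t m;

	if (canwait) {
		/*
		 * Page shortages are handled inside the allocator by
		 * sleeping and retrying.
		 */
		m = vm_page_alloc(object, pindex,
		    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK);
	} else {
		/* Never sleep; NULL simply means "no page right now". */
		m = vm_page_alloc(object, pindex,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT);
	}
	return (m);	/* Check for NULL in every case. */
}

VM_ALLOC_WAITFAIL sits between the two: sleep once for memory, then return NULL so the caller can revalidate its own state before retrying, which is how the reworked vm_page_grab() uses it internally when VM_ALLOC_NOWAIT is not given.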
@@ -2088,12 +2656,7 @@ { mtx_lock(&vm_page_queue_free_mtx); - if (!vm_pages_needed) { - vm_pages_needed = 1; - wakeup(&vm_pages_needed); - } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, - "pfault", 0); + pagedaemon_wait(PUSER, "pfault"); } struct vm_pagequeue * @@ -2100,7 +2663,10 @@ vm_page_pagequeue(vm_page_t m) { - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + if (vm_page_in_laundry(m)) + return (&vm_dom[0].vmd_pagequeues[m->queue]); + else + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* @@ -2115,9 +2681,9 @@ { struct vm_pagequeue *pq; - vm_page_lock_assert(m, MA_OWNED); - KASSERT(m->queue != PQ_NONE, - ("vm_page_dequeue: page %p is not queued", m)); + vm_page_assert_locked(m); + KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", + m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; @@ -2154,12 +2720,18 @@ * The page must be locked. */ static void -vm_page_enqueue(int queue, vm_page_t m) +vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + KASSERT(queue < PQ_COUNT, + ("vm_page_enqueue: invalid queue %u request for page %p", + queue, m)); + if (queue == PQ_LAUNDRY) + pq = &vm_dom[0].vmd_pagequeues[queue]; + else + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); @@ -2243,13 +2815,12 @@ /* * vm_page_free_wakeup: * - * Helper routine for vm_page_free_toq() and vm_page_cache(). This - * routine is called when a page has been added to the cache or free - * queues. + * Helper routine for vm_page_free_toq(). This routine is called + * when a page is added to the free queues. * * The page queues must be locked. */ -static inline void +static void vm_page_free_wakeup(void) { @@ -2259,7 +2830,7 @@ * some free. */ if (vm_pageout_pages_needed && - cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { + vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } @@ -2269,45 +2840,36 @@ * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { - vm_pages_needed = 0; - wakeup(&cnt.v_free_count); + vm_pages_needed = false; + wakeup(&vm_cnt.v_free_count); } } /* - * Turn a cached page into a free page, by changing its attributes. - * Keep the statistics up-to-date. + * vm_page_free_prep: * - * The free page queue must be locked. - */ -static void -vm_page_cache_turn_free(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - - m->object = NULL; - m->valid = 0; - /* Clear PG_CACHED and set PG_FREE. */ - m->flags ^= PG_CACHED | PG_FREE; - KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, - ("vm_page_cache_free: page %p has inconsistent flags", m)); - cnt.v_cache_count--; - vm_phys_freecnt_adj(m, 1); -} - -/* - * vm_page_free_toq: + * Prepares the given page to be put on the free list, + * disassociating it from any VM object. The caller may return + * the page to the free list only if this function returns true. * - * Returns the given page to the free list, - * disassociating it with any VM object. - * - * The object must be locked. The page must be locked if it is managed. + * The object must be locked. The page must be locked if it is + * managed. For a queued managed page, the pagequeue_locked + * argument specifies whether the page queue is already locked. 
*/ -void -vm_page_free_toq(vm_page_t m) +bool +vm_page_free_prep(vm_page_t m, bool pagequeue_locked) { +#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) + if ((m->flags & PG_ZERO) != 0) { + uint64_t *p; + int i; + p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) + KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", + m, i, (uintmax_t)*p)); + } +#endif if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), @@ -2317,9 +2879,7 @@ ("vm_page_free_toq: unmanaged page %p is queued", m)); PCPU_INC(cnt.v_tfree); - if (VM_PAGE_IS_FREE(m)) - panic("vm_page_free: freeing free page %p", m); - else if (vm_page_sbusied(m)) + if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* @@ -2328,7 +2888,12 @@ * callback routine until after we've put the page on the * appropriate free queue. */ - vm_page_remque(m); + if (m->queue != PQ_NONE) { + if (pagequeue_locked) + vm_page_dequeue_locked(m); + else + vm_page_dequeue(m); + } vm_page_remove(m); /* @@ -2335,9 +2900,8 @@ * If fictitious remove object association and * return, otherwise delay object association removal. */ - if ((m->flags & PG_FICTITIOUS) != 0) { - return; - } + if ((m->flags & PG_FICTITIOUS) != 0) + return (false); m->valid = 0; vm_page_undirty(m); @@ -2349,36 +2913,75 @@ KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; - } else { - /* - * Restore the default memory attribute to the page. - */ - if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) - pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); + return (false); + } - /* - * Insert the page into the physical memory allocator's - * cache/free page queues. - */ - mtx_lock(&vm_page_queue_free_mtx); - m->flags |= PG_FREE; - vm_phys_freecnt_adj(m, 1); + /* + * Restore the default memory attribute to the page. + */ + if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) + pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); + + return (true); +} + +/* + * Insert the page into the physical memory allocator's free page + * queues. This is the last step to free a page. + */ +static void +vm_page_free_phys(vm_page_t m) +{ + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + + vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 - if (!vm_reserv_free_page(m)) -#else - if (TRUE) + if (!vm_reserv_free_page(m)) #endif vm_phys_free_pages(m, 0); - if ((m->flags & PG_ZERO) != 0) - ++vm_page_zero_count; - else - vm_page_zero_idle_wakeup(); - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); - } + if ((m->flags & PG_ZERO) != 0) + ++vm_page_zero_count; + else + vm_page_zero_idle_wakeup(); } +void +vm_page_free_phys_pglist(struct pglist *tq) +{ + vm_page_t m; + + if (TAILQ_EMPTY(tq)) + return; + mtx_lock(&vm_page_queue_free_mtx); + TAILQ_FOREACH(m, tq, listq) + vm_page_free_phys(m); + vm_page_free_wakeup(); + mtx_unlock(&vm_page_queue_free_mtx); +} + /* + * vm_page_free_toq: + * + * Returns the given page to the free list, disassociating it + * from any VM object. + * + * The object must be locked. The page must be locked if it is + * managed. 
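Splitting vm_page_free_toq() into vm_page_free_prep() plus vm_page_free_phys() is what makes batched freeing via vm_page_free_phys_pglist() possible: pages are prepared under their object and page locks, collected through the "listq" linkage that vm_page_free_phys_pglist() iterates (free for reuse once a page has left its object), and handed to the free queues under a single acquisition of vm_page_queue_free_mtx. A hypothetical caller, sketched under the assumption that the pages are already unmapped, unwired, unheld and belong to the write-locked object:

static void
free_page_batch(vm_object_t object, vm_page_t *ma, int npages)
{
	struct pglist tq;
	vm_page_t m;
	int i;

	VM_OBJECT_ASSERT_WLOCKED(object);
	TAILQ_INIT(&tq);
	for (i = 0; i < npages; i++) {
		m = ma[i];
		vm_page_lock(m);
		if (vm_page_free_prep(m, false))
			TAILQ_INSERT_TAIL(&tq, m, listq);
		vm_page_unlock(m);
	}
	/* One trip through vm_page_queue_free_mtx for the whole batch. */
	vm_page_free_phys_pglist(&tq);
}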
+ */ +void +vm_page_free_toq(vm_page_t m) +{ + + if (!vm_page_free_prep(m, false)) + return; + mtx_lock(&vm_page_queue_free_mtx); + vm_page_free_phys(m); + vm_page_free_wakeup(); + mtx_unlock(&vm_page_queue_free_mtx); +} + +/* * vm_page_wire: * * Mark this page as wired down by yet @@ -2410,7 +3013,7 @@ m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); - atomic_add_int(&cnt.v_wire_count, 1); + atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); @@ -2419,41 +3022,43 @@ /* * vm_page_unwire: * - * Release one wiring of the specified page, potentially enabling it to be - * paged again. If paging is enabled, then the value of the parameter - * "activate" determines to which queue the page is added. If "activate" is - * non-zero, then the page is added to the active queue. Otherwise, it is - * added to the inactive queue. + * Release one wiring of the specified page, potentially allowing it to be + * paged out. Returns TRUE if the number of wirings transitions to zero and + * FALSE otherwise. * - * However, unless the page belongs to an object, it is not enqueued because - * it cannot be paged out. + * Only managed pages belonging to an object can be paged out. If the number + * of wirings transitions to zero and the page is eligible for page out, then + * the page is added to the specified paging queue (unless PQ_NONE is + * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ -void -vm_page_unwire(vm_page_t m, int activate) +boolean_t +vm_page_unwire(vm_page_t m, uint8_t queue) { + KASSERT(queue < PQ_COUNT || queue == PQ_NONE, + ("vm_page_unwire: invalid queue %u request for page %p", + queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_lock_assert(m, MA_OWNED); + vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); - return; + return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { - atomic_subtract_int(&cnt.v_wire_count, 1); - if ((m->oflags & VPO_UNMANAGED) != 0 || - m->object == NULL) - return; - if (!activate) - m->flags &= ~PG_WINATCFLS; - vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m); - } + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + if ((m->oflags & VPO_UNMANAGED) == 0 && + m->object != NULL && queue != PQ_NONE) + vm_page_enqueue(queue, m); + return (TRUE); + } else + return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } @@ -2461,25 +3066,16 @@ /* * Move the specified page to the inactive queue. * - * Many pages placed on the inactive queue should actually go - * into the cache, but it is difficult to figure out which. What - * we do instead, if the inactive target is well met, is to put - * clean pages at the head of the inactive queue instead of the tail. - * This will cause them to be moved to the cache more quickly and - * if not actively re-referenced, reclaimed more quickly. If we just - * stick these pages at the end of the inactive queue, heavy filesystem - * meta-data accesses can cause an unnecessary paging load on memory bound - * processes. This optimization causes one-time-use metadata to be - * reused more quickly. + * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive + * queue. 
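The reworked vm_page_unwire() above makes the caller name the destination queue and reports whether the last wiring was just dropped, replacing the old "activate" flag. A hypothetical I/O-completion helper (kernel context assumed; the queue choice is only an example, and PQ_NONE can be passed when the page should not be re-enqueued):

static bool
unwire_after_io(vm_page_t m, bool was_read)
{
	bool last;

	vm_page_lock(m);
	/*
	 * Read-ahead pages go back to the active queue, write-behind
	 * pages to the inactive queue.
	 */
	last = vm_page_unwire(m, was_read ? PQ_ACTIVE : PQ_INACTIVE);
	vm_page_unlock(m);
	return (last);	/* True when no wirings remain. */
}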
However, setting "noreuse" to TRUE will accelerate the specified + * page's reclamation, but it will not unmap the page from any address space. + * This is implemented by inserting the page near the head of the inactive + * queue, using a marker page to guide FIFO insertion ordering. * - * Normally athead is 0 resulting in LRU operation. athead is set - * to 1 if we want this page to be 'as if it were placed in the cache', - * except without unmapping it from the process address space. - * * The page must be locked. */ static inline void -_vm_page_deactivate(vm_page_t m, int athead) +_vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; @@ -2490,7 +3086,7 @@ * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ - if ((queue = m->queue) == PQ_INACTIVE && !athead) + if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; @@ -2501,12 +3097,12 @@ } else { if (queue != PQ_NONE) vm_page_dequeue(m); - m->flags &= ~PG_WINATCFLS; vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; - if (athead) - TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q); + if (noreuse) + TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, + m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); @@ -2523,165 +3119,73 @@ vm_page_deactivate(vm_page_t m) { - _vm_page_deactivate(m, 0); + _vm_page_deactivate(m, FALSE); } /* - * vm_page_try_to_cache: + * Move the specified page to the inactive queue with the expectation + * that it is unlikely to be reused. * - * Returns 0 on failure, 1 on success + * The page must be locked. */ -int -vm_page_try_to_cache(vm_page_t m) +void +vm_page_deactivate_noreuse(vm_page_t m) { - vm_page_lock_assert(m, MA_OWNED); - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (m->dirty || m->hold_count || m->wire_count || - (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) - return (0); - pmap_remove_all(m); - if (m->dirty) - return (0); - vm_page_cache(m); - return (1); + _vm_page_deactivate(m, TRUE); } /* - * vm_page_try_to_free() + * vm_page_launder * - * Attempt to free the page. If we cannot free it, we do nothing. - * 1 is returned on success, 0 on failure. + * Put a page in the laundry. */ -int -vm_page_try_to_free(vm_page_t m) +void +vm_page_launder(vm_page_t m) { + int queue; - vm_page_lock_assert(m, MA_OWNED); - if (m->object != NULL) - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (m->dirty || m->hold_count || m->wire_count || - (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) - return (0); - pmap_remove_all(m); - if (m->dirty) - return (0); - vm_page_free(m); - return (1); + vm_page_assert_locked(m); + if ((queue = m->queue) != PQ_LAUNDRY) { + if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { + if (queue != PQ_NONE) + vm_page_dequeue(m); + vm_page_enqueue(PQ_LAUNDRY, m); + } else + KASSERT(queue == PQ_NONE, + ("wired page %p is queued", m)); + } } /* - * vm_page_cache + * vm_page_try_to_free() * - * Put the specified page onto the page cache queue (if appropriate). - * - * The object and page must be locked. + * Attempt to free the page. If we cannot free it, we do nothing. + * true is returned on success, false on failure. 
*/ -void -vm_page_cache(vm_page_t m) +bool +vm_page_try_to_free(vm_page_t m) { - vm_object_t object; - boolean_t cache_was_empty; - vm_page_lock_assert(m, MA_OWNED); - object = m->object; - VM_OBJECT_ASSERT_WLOCKED(object); - if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || - m->hold_count || m->wire_count) - panic("vm_page_cache: attempting to cache busy page"); - KASSERT(!pmap_page_is_mapped(m), - ("vm_page_cache: page %p is mapped", m)); - KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); - if (m->valid == 0 || object->type == OBJT_DEFAULT || - (object->type == OBJT_SWAP && - !vm_pager_has_page(object, m->pindex, NULL, NULL))) { - /* - * Hypothesis: A cache-elgible page belonging to a - * default object or swap object but without a backing - * store must be zero filled. - */ - vm_page_free(m); - return; + vm_page_assert_locked(m); + if (m->object != NULL) + VM_OBJECT_ASSERT_WLOCKED(m->object); + if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 || + (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) + return (false); + if (m->object != NULL && m->object->ref_count != 0) { + pmap_remove_all(m); + if (m->dirty != 0) + return (false); } - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_cache: page %p is already cached", m)); - - /* - * Remove the page from the paging queues. - */ - vm_page_remque(m); - - /* - * Remove the page from the object's collection of resident - * pages. - */ - vm_radix_remove(&object->rtree, m->pindex); - TAILQ_REMOVE(&object->memq, m, listq); - object->resident_page_count--; - - /* - * Restore the default memory attribute to the page. - */ - if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) - pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); - - /* - * Insert the page into the object's collection of cached pages - * and the physical memory allocator's cache/free page queues. - */ - m->flags &= ~PG_ZERO; - mtx_lock(&vm_page_queue_free_mtx); - cache_was_empty = vm_radix_is_empty(&object->cache); - if (vm_radix_insert(&object->cache, m)) { - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && - object->resident_page_count == 0) - vdrop(object->handle); - m->object = NULL; - vm_page_free(m); - return; - } - - /* - * The above call to vm_radix_insert() could reclaim the one pre- - * existing cached page from this object, resulting in a call to - * vdrop(). - */ - if (!cache_was_empty) - cache_was_empty = vm_radix_is_singleton(&object->cache); - - m->flags |= PG_CACHED; - cnt.v_cache_count++; - PCPU_INC(cnt.v_tcached); -#if VM_NRESERVLEVEL > 0 - if (!vm_reserv_free_page(m)) { -#else - if (TRUE) { -#endif - vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); - vm_phys_free_pages(m, 0); - } - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); - - /* - * Increment the vnode's hold count if this is the object's only - * cached page. Decrement the vnode's hold count if this was - * the object's only resident page. - */ - if (object->type == OBJT_VNODE) { - if (cache_was_empty && object->resident_page_count != 0) - vhold(object->handle); - else if (!cache_was_empty && object->resident_page_count == 0) - vdrop(object->handle); - } + vm_page_free(m); + return (true); } /* * vm_page_advise * - * Deactivate or do nothing, as appropriate. This routine is used - * by madvise() and vop_stdadvise(). + * Apply the specified advice to the given page. * * The object and page must be locked. */ @@ -2694,20 +3198,16 @@ if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed - * up by the system. 
However, such pages are often reused - * quickly by malloc() so we do not do anything that would - * cause a page fault if we can help it. - * - * Specifically, we do not try to actually free the page now - * nor do we try to put it in the cache (which would cause a - * page fault on reuse). - * - * But we do make the page as freeable as we can without - * actually taking the step of unmapping it. + * without first paging it out. MADV_FREE pages are often + * quickly reused by malloc(3), so we do not do anything that + * would result in a page fault on a later access. */ vm_page_undirty(m); - else if (advice != MADV_DONTNEED) + else if (advice != MADV_DONTNEED) { + if (advice == MADV_WILLNEED) + vm_page_activate(m); return; + } /* * Clear any references to the page. Otherwise, the page daemon will @@ -2719,11 +3219,15 @@ vm_page_dirty(m); /* - * Place clean pages at the head of the inactive queue rather than the - * tail, thus defeating the queue's LRU operation and ensuring that the - * page will be reused quickly. + * Place clean pages near the head of the inactive queue rather than + * the tail, thus defeating the queue's LRU operation and ensuring that + * the page will be reused quickly. Dirty pages not already in the + * laundry are moved there. */ - _vm_page_deactivate(m, m->dirty == 0); + if (m->dirty == 0) + vm_page_deactivate_noreuse(m); + else + vm_page_launder(m); } /* @@ -2742,16 +3246,23 @@ { vm_page_t m; int sleep; + int pflags; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); + pflags = allocflags & + ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); + if ((allocflags & VM_ALLOC_NOWAIT) == 0) + pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { + if ((allocflags & VM_ALLOC_NOWAIT) != 0) + return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less @@ -2778,14 +3289,12 @@ return (m); } } - m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY); + m = vm_page_alloc(object, pindex, pflags); if (m == NULL) { - VM_OBJECT_WUNLOCK(object); - VM_WAIT; - VM_OBJECT_WLOCK(object); + if ((allocflags & VM_ALLOC_NOWAIT) != 0) + return (NULL); goto retrylookup; - } else if (m->valid != 0) - return (m); + } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); @@ -2792,6 +3301,114 @@ } /* + * Return the specified range of pages from the given object. For each + * page offset within the range, if a page already exists within the object + * at that offset and it is busy, then wait for it to change state. If, + * instead, the page doesn't exist, then allocate it. + * + * The caller must always specify an allocation class. + * + * allocation classes: + * VM_ALLOC_NORMAL normal process request + * VM_ALLOC_SYSTEM system *really* needs the pages + * + * The caller must always specify that the pages are to be busied and/or + * wired. + * + * optional allocation flags: + * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages + * VM_ALLOC_NOBUSY do not exclusive busy the page + * VM_ALLOC_NOWAIT do not sleep + * VM_ALLOC_SBUSY set page to sbusy state + * VM_ALLOC_WIRED wire the pages + * VM_ALLOC_ZERO zero and validate any invalid pages + * + * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. 
Otherwise, it + * may return a partial prefix of the requested range. + */ +int +vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, + vm_page_t *ma, int count) +{ + vm_page_t m, mpred; + int pflags; + int i; + bool sleep; + + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, + ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); + KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || + (allocflags & VM_ALLOC_WIRED) != 0, + ("vm_page_grab_pages: the pages must be busied or wired")); + KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || + (allocflags & VM_ALLOC_IGN_SBUSY) != 0, + ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch")); + if (count == 0) + return (0); + pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | + VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY); + if ((allocflags & VM_ALLOC_NOWAIT) == 0) + pflags |= VM_ALLOC_WAITFAIL; + i = 0; +retrylookup: + m = vm_radix_lookup_le(&object->rtree, pindex + i); + if (m == NULL || m->pindex != pindex + i) { + mpred = m; + m = NULL; + } else + mpred = TAILQ_PREV(m, pglist, listq); + for (; i < count; i++) { + if (m != NULL) { + sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? + vm_page_xbusied(m) : vm_page_busied(m); + if (sleep) { + if ((allocflags & VM_ALLOC_NOWAIT) != 0) + break; + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_aflag_set(m, PGA_REFERENCED); + vm_page_lock(m); + VM_OBJECT_WUNLOCK(object); + vm_page_busy_sleep(m, "grbmaw", (allocflags & + VM_ALLOC_IGN_SBUSY) != 0); + VM_OBJECT_WLOCK(object); + goto retrylookup; + } + if ((allocflags & VM_ALLOC_WIRED) != 0) { + vm_page_lock(m); + vm_page_wire(m); + vm_page_unlock(m); + } + if ((allocflags & (VM_ALLOC_NOBUSY | + VM_ALLOC_SBUSY)) == 0) + vm_page_xbusy(m); + if ((allocflags & VM_ALLOC_SBUSY) != 0) + vm_page_sbusy(m); + } else { + m = vm_page_alloc_after(object, pindex + i, + pflags | VM_ALLOC_COUNT(count - i), mpred); + if (m == NULL) { + if ((allocflags & VM_ALLOC_NOWAIT) != 0) + break; + goto retrylookup; + } + } + if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + m->valid = VM_PAGE_BITS_ALL; + } + ma[i] = mpred = m; + m = vm_page_next(m); + } + return (i); +} + +/* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. @@ -2841,17 +3458,17 @@ * bit is clear, we have to zero out a portion of the * first block. */ - if ((frag = base & ~(DEV_BSIZE - 1)) != base && + if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* - * If the ending offset is not DEV_BSIZE aligned and the + * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; - if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && + if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); @@ -2858,7 +3475,7 @@ /* * Assert that no previously invalid block that is now being validated - * is already dirty. + * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); @@ -2948,17 +3565,17 @@ * bit is clear, we have to zero out a portion of the * first block. 
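vm_page_grab_pages() above provides a batched counterpart to vm_page_grab(). A hypothetical caller that fills a buffer with exclusive-busied pages and tolerates a short return (names and flags are illustrative; the returned pages must later be released with vm_page_xunbusy()):

static int
grab_run(vm_object_t object, vm_pindex_t pindex, vm_page_t *ma, int count)
{
	int got;

	VM_OBJECT_WLOCK(object);
	got = vm_page_grab_pages(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_NOWAIT, ma, count);
	VM_OBJECT_WUNLOCK(object);

	/* With VM_ALLOC_NOWAIT this may be any prefix of "count". */
	return (got);
}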
*/ - if ((frag = base & ~(DEV_BSIZE - 1)) != base && + if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* - * If the ending offset is not DEV_BSIZE aligned and the + * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; - if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && + if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); @@ -3050,12 +3667,12 @@ /* * vm_page_zero_invalid() * - * The kernel assumes that the invalid portions of a page contain + * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * - * Pages are most often semi-valid when the end of a file is mapped + * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void @@ -3072,10 +3689,10 @@ * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { - if (i == (PAGE_SIZE / DEV_BSIZE) || + if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { - pmap_zero_page_area(m, + pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; @@ -3109,16 +3726,19 @@ } /* - * vm_page_ps_is_valid: - * - * Returns TRUE if the entire (super)page is valid and FALSE otherwise. + * Returns true if all of the specified predicates are true for the entire + * (super)page and false otherwise. */ -boolean_t -vm_page_ps_is_valid(vm_page_t m) +bool +vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { + vm_object_t object; int i, npages; - VM_OBJECT_ASSERT_LOCKED(m->object); + object = m->object; + if (skip_m != NULL && skip_m->object != object) + return (false); + VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* @@ -3127,10 +3747,28 @@ * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { - if (m[i].valid != VM_PAGE_BITS_ALL) - return (FALSE); + /* Always test object consistency, including "skip_m". */ + if (m[i].object != object) + return (false); + if (&m[i] == skip_m) + continue; + if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) + return (false); + if ((flags & PS_ALL_DIRTY) != 0) { + /* + * Calling vm_page_test_dirty() or pmap_is_modified() + * might stop this case from spuriously returning + * "false". However, that would require a write lock + * on the object containing "m[i]". 
+ */ + if (m[i].dirty != VM_PAGE_BITS_ALL) + return (false); + } + if ((flags & PS_ALL_VALID) != 0 && + m[i].valid != VM_PAGE_BITS_ALL) + return (false); } - return (TRUE); + return (true); } /* @@ -3224,16 +3862,16 @@ DB_SHOW_COMMAND(page, vm_page_print_page_info) { - db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); - db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); - db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); - db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); - db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); - db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); - db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); - db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); - db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); - db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); + + db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); + db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); + db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); + db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); + db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); + db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); + db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); + db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); + db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) @@ -3240,17 +3878,16 @@ { int dom; - db_printf("pq_free %d pq_cache %d\n", - cnt.v_free_count, cnt.v_cache_count); + db_printf("pq_free %d\n", vm_cnt.v_free_count); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( - "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n", + "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, - vm_dom[dom].vmd_pass); + vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt); } } @@ -3257,7 +3894,7 @@ DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; - boolean_t phys; + boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); @@ -3265,7 +3902,10 @@ } phys = strchr(modif, 'p') != NULL; - if (phys) + virt = strchr(modif, 'v') != NULL; + if (virt) + m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); + else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; Modified: trunk/sys/vm/vm_page.h =================================================================== --- trunk/sys/vm/vm_page.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_page.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_page.h 332505 2018-04-14 17:41:54Z kib $ */ /* @@ -142,7 +142,7 @@ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page */ - struct md_page md; /* machine dependant stuff */ + struct md_page md; /* machine dependent stuff */ u_int wire_count; /* wired down maps refs (P) */ volatile u_int busy_lock; /* busy owners lock */ uint16_t hold_count; /* page hold count (P) */ @@ -150,6 +150,7 @@ uint8_t aflags; /* access is atomic */ uint8_t oflags; /* page VPO_* flags (O) */ uint8_t queue; /* page queue index (P,Q) */ + int8_t psind; /* pagesizes[] index (O) */ int8_t segind; uint8_t order; /* index of the buddy queue */ uint8_t pool; @@ -158,7 +159,6 @@ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */ - int8_t psind; /* pagesizes[] index (O) */ }; /* @@ -207,9 +207,13 @@ #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 -#define PQ_COUNT 2 +#define PQ_LAUNDRY 2 +#define PQ_COUNT 3 +#ifndef VM_PAGE_HAVE_PGLIST TAILQ_HEAD(pglist, vm_page); +#define VM_PAGE_HAVE_PGLIST +#endif SLIST_HEAD(spglist, vm_page); struct vm_pagequeue { @@ -227,10 +231,11 @@ u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; - int vmd_pass; /* local pagedaemon pass */ int vmd_oom_seq; int vmd_last_active_scan; + struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ + struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; extern struct vm_domain vm_dom[MAXMEMDOM]; @@ -237,6 +242,7 @@ #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) +#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL @@ -324,12 +330,9 @@ * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. */ -#define PG_CACHED 0x0001 /* page is cached */ -#define PG_FREE 0x0002 /* page is free */ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ -#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ @@ -353,19 +356,16 @@ * free * Available for allocation now. * - * cache - * Almost available for allocation. Still associated with - * an object, but clean and immediately freeable. - * - * The following lists are LRU sorted: - * * inactive * Low activity, candidates for reclamation. + * This list is approximately LRU ordered. + * + * laundry * This is the list of pages that should be * paged out next. * * active - * Pages that are "active" i.e. they have been + * Pages that are "active", i.e., they have been * recently referenced. 
* */ @@ -376,28 +376,51 @@ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ -#define VM_PAGE_IS_FREE(m) (((m)->flags & PG_FREE) != 0) - #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) +/* + * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory + * page to which the given physical address belongs. The correct vm_page_t + * object is returned for addresses that are not page-aligned. + */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); -/* page allocation classes: */ +/* + * Page allocation parameters for vm_page for the functions + * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and + * vm_page_alloc_freelist(). Some functions support only a subset + * of the flags, and ignore others, see the flags legend. + * + * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*() + * and the vm_page_grab*() functions. See these functions for details. + * + * Bits 0 - 1 define class. + * Bits 2 - 15 dedicated for flags. + * Legend: + * (a) - vm_page_alloc() supports the flag. + * (c) - vm_page_alloc_contig() supports the flag. + * (f) - vm_page_alloc_freelist() supports the flag. + * (g) - vm_page_grab() supports the flag. + * (p) - vm_page_grab_pages() supports the flag. + * Bits above 15 define the count of additional pages that the caller + * intends to allocate. + */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 -/* page allocation flags: */ -#define VM_ALLOC_WIRED 0x0020 /* non pageable */ -#define VM_ALLOC_ZERO 0x0040 /* Try to obtain a zeroed page */ -#define VM_ALLOC_NOOBJ 0x0100 /* No associated object */ -#define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */ -#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */ -#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */ -#define VM_ALLOC_IGN_SBUSY 0x1000 /* vm_page_grab() only */ -#define VM_ALLOC_NODUMP 0x2000 /* don't include in dump */ -#define VM_ALLOC_SBUSY 0x4000 /* Shared busy the page */ - +#define VM_ALLOC_WAITOK 0x0008 /* (acf) Sleep and retry */ +#define VM_ALLOC_WAITFAIL 0x0010 /* (acf) Sleep and return error */ +#define VM_ALLOC_WIRED 0x0020 /* (acfgp) Allocate a wired page */ +#define VM_ALLOC_ZERO 0x0040 /* (acfgp) Allocate a prezeroed page */ +#define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ +#define VM_ALLOC_NOBUSY 0x0200 /* (acgp) Do not excl busy the page */ +#define VM_ALLOC_IFCACHED 0x0400 +#define VM_ALLOC_IFNOTCACHED 0x0800 +#define VM_ALLOC_IGN_SBUSY 0x1000 /* (gp) Ignore shared busy flag */ +#define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ +#define VM_ALLOC_SBUSY 0x4000 /* (acgp) Shared busy the page */ +#define VM_ALLOC_NOWAIT 0x8000 /* (acfgp) Do not sleep */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) @@ -416,10 +439,26 @@ pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; + if ((malloc_flags & M_NOWAIT)) + pflags |= VM_ALLOC_NOWAIT; + if ((malloc_flags & M_WAITOK)) + pflags |= VM_ALLOC_WAITOK; return (pflags); } #endif +/* + * Predicates supported by vm_page_ps_test(): + * + * PS_ALL_DIRTY is true only if the entire (super)page is dirty. + * However, it can be spuriously false when the (super)page has become + * dirty in the pmap but that information has not been propagated to the + * machine-independent layer. 
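The PS_* predicates defined just below generalize the old all-valid test, so a consumer can ask several questions about a superpage in one call. A hypothetical check, sketched under the assumption that "m" is the first constituent page of a fully populated superpage and that its object is locked:

static bool
superpage_is_mappable(vm_page_t m)
{

	if (m->psind == 0)
		return (false);
	/* Every base page valid, none of them busied. */
	return (vm_page_ps_test(m, PS_ALL_VALID | PS_NONE_BUSY, NULL));
}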
+ */ +#define PS_ALL_DIRTY 0x1 +#define PS_ALL_VALID 0x2 +#define PS_NONE_BUSY 0x4 + void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); void vm_page_flash(vm_page_t m); @@ -430,33 +469,38 @@ void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); -vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); +vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int); +vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); +bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose); +void vm_page_change_lock(vm_page_t m, struct mtx **mtx); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); -void vm_page_cache(vm_page_t); -void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); -void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); -int vm_page_try_to_cache (vm_page_t); -int vm_page_try_to_free (vm_page_t); +int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, + vm_page_t *ma, int count); void vm_page_deactivate (vm_page_t); +void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); +void vm_page_free_phys_pglist(struct pglist *tq); +bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); -boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); +void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); struct vm_pagequeue *vm_page_pagequeue(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); -boolean_t vm_page_ps_is_valid(vm_page_t m); +bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); +bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, + vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); void vm_page_remove (vm_page_t); int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); @@ -465,16 +509,20 @@ void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); int vm_page_sbusied(vm_page_t m); +vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, + vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); +bool vm_page_try_to_free(vm_page_t m); int vm_page_trysbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); -void vm_page_unwire (vm_page_t, int); +boolean_t vm_page_unwire(vm_page_t m, uint8_t queue); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire (vm_page_t); void vm_page_xunbusy_hard(vm_page_t m); +void vm_page_xunbusy_maybelocked(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void 
vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); @@ -497,17 +545,17 @@ #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ - (void *)m, __FILE__, __LINE__)); + (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT(!vm_page_busied(m), \ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \ - (void *)m, __FILE__, __LINE__)); + (m), __FILE__, __LINE__)) #define vm_page_assert_xbusied(m) \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ - (void *)m, __FILE__, __LINE__)); + (m), __FILE__, __LINE__)) #define vm_page_busied(m) \ ((m)->busy_lock != VPB_UNBUSIED) @@ -514,22 +562,24 @@ #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ - panic("%s: page %p failed shared busing", __func__, m); \ + panic("%s: page %p failed shared busying", __func__, \ + (m)); \ } while (0) #define vm_page_tryxbusy(m) \ - (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, \ + (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \ VPB_SINGLE_EXCLUSIVER)) #define vm_page_xbusied(m) \ - ((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) + (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ - panic("%s: page %p failed exclusive busing", __func__, \ - m); \ + panic("%s: page %p failed exclusive busying", __func__, \ + (m)); \ } while (0) +/* Note: page m's lock must not be owned by the caller. */ #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \ @@ -660,5 +710,41 @@ m->dirty = 0; } +static inline void +vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, + vm_page_t mold) +{ + vm_page_t mret; + + mret = vm_page_replace(mnew, object, pindex); + KASSERT(mret == mold, + ("invalid page replacement, mold=%p, mret=%p", mold, mret)); + + /* Unused if !INVARIANTS. 
*/ + (void)mold; + (void)mret; +} + +static inline bool +vm_page_active(vm_page_t m) +{ + + return (m->queue == PQ_ACTIVE); +} + +static inline bool +vm_page_inactive(vm_page_t m) +{ + + return (m->queue == PQ_INACTIVE); +} + +static inline bool +vm_page_in_laundry(vm_page_t m) +{ + + return (m->queue == PQ_LAUNDRY); +} + #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Modified: trunk/sys/vm/vm_pageout.c =================================================================== --- trunk/sys/vm/vm_pageout.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_pageout.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -74,10 +74,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pageout.c 331722 2018-03-29 02:50:57Z eadler $"); #include "opt_vm.h" -#include "opt_kdtrace.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -120,8 +120,9 @@ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); -static int vm_pageout_clean(vm_page_t); -static void vm_pageout_scan(struct vm_domain *vmd, int pass); +static int vm_pageout_clean(vm_page_t m, int *numpagedout); +static int vm_pageout_cluster(vm_page_t m); +static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); @@ -139,82 +140,49 @@ &page_kp); SDT_PROVIDER_DEFINE(vm); -SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); -#if !defined(NO_SWAPPING) -/* the kernel process "vm_daemon"*/ -static void vm_daemon(void); -static struct proc *vmproc; +/* Pagedaemon activity rates, in subdivisions of one second. */ +#define VM_LAUNDER_RATE 10 +#define VM_INACT_SCAN_RATE 2 -static struct kproc_desc vm_kp = { - "vmdaemon", - vm_daemon, - &vmproc -}; -SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); -#endif - - -int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ -int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ -int vm_pageout_wakeup_thresh; +u_int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; +bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ +bool vm_pages_needed; /* Are threads waiting for free pages? */ -#if !defined(NO_SWAPPING) -static int vm_pageout_req_swapout; /* XXX */ -static int vm_daemon_needed; -static struct mtx vm_daemon_mtx; -/* Allow for use by vm_pageout before vm_daemon is initialized. */ -MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); -#endif -static int vm_max_launder = 32; +/* Pending request for dirty page laundering. 
*/ +static enum { + VM_LAUNDRY_IDLE, + VM_LAUNDRY_BACKGROUND, + VM_LAUNDRY_SHORTFALL +} vm_laundry_request = VM_LAUNDRY_IDLE; + static int vm_pageout_update_period; -static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; -#if defined(NO_SWAPPING) -static int vm_swap_enabled = 0; -static int vm_swap_idle_enabled = 0; -#else -static int vm_swap_enabled = 1; -static int vm_swap_idle_enabled = 0; -#endif +static int vm_panic_on_oom = 0; +SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, + CTLFLAG_RWTUN, &vm_panic_on_oom, 0, + "panic on out of memory instead of killing the largest process"); + SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, - CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, + CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); -SYSCTL_INT(_vm, OID_AUTO, max_launder, - CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); - SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, - CTLFLAG_RW, &vm_pageout_update_period, 0, + CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); -SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, +SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); -#if defined(NO_SWAPPING) -SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, - CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); -SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, - CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); -#else -SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, - CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); -SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, - CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); -#endif - -SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, - CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); - SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, - CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); + CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, @@ -221,24 +189,39 @@ CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, - CTLFLAG_RW, &vm_pageout_oom_seq, 0, + CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); -#define VM_PAGEOUT_PAGE_COUNT 16 -int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; +static int act_scan_laundry_weight = 3; +SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, + &act_scan_laundry_weight, 0, + "weight given to clean vs. 
dirty pages in active queue scans"); +static u_int vm_background_launder_target; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN, + &vm_background_launder_target, 0, + "background laundering target, in pages"); + +static u_int vm_background_launder_rate = 4096; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, + &vm_background_launder_rate, 0, + "background laundering rate, in kilobytes per second"); + +static u_int vm_background_launder_max = 20 * 1024; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, + &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); + +int vm_pageout_page_count = 32; + int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); +static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); -static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t, - vm_paddr_t); -#if !defined(NO_SWAPPING) -static void vm_pageout_map_deactivate_pages(vm_map_t, long); -static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); -static void vm_req_vmdaemon(int req); -#endif +static int vm_pageout_launder(struct vm_domain *vmd, int launder, + bool in_shortfall); +static void vm_pageout_laundry_worker(void *arg); static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* @@ -352,41 +335,30 @@ } /* - * vm_pageout_clean: - * - * Clean the page and remove it from the laundry. - * - * We set the busy bit to cause potential page faults on this page to - * block. Note the careful timing, however, the busy bit isn't set till - * late and we cannot do anything that will mess with the page. + * Scan for pages at adjacent offsets within the given page's object that are + * eligible for laundering, form a cluster of these pages and the given page, + * and launder that cluster. */ static int -vm_pageout_clean(vm_page_t m) +vm_pageout_cluster(vm_page_t m) { vm_object_t object; - vm_page_t mc[2*vm_pageout_page_count], pb, ps; - int pageout_count; - int ib, is, page_base; - vm_pindex_t pindex = m->pindex; + vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; + vm_pindex_t pindex; + int ib, is, page_base, pageout_count; - vm_page_lock_assert(m, MA_OWNED); + vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); + pindex = m->pindex; /* - * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP - * with the new swapper, but we could have serious problems paging - * out other object types if there is insufficient memory. - * - * Unfortunately, checking free memory here is far too late, so the - * check has been moved up a procedural level. + * We can't clean the page if it is busy or held. */ + vm_page_assert_unbusied(m); + KASSERT(m->hold_count == 0, ("page %p is held", m)); - /* - * Can't clean the page if it's busy or held. - */ - vm_page_assert_unbusied(m); - KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m)); + pmap_remove_write(m); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; @@ -396,33 +368,23 @@ is = 1; /* - * Scan object for clusterable pages. + * We can cluster only if the page is not clean, busy, or held, and + * the page is in the laundry queue. * - * We can cluster ONLY if: ->> the page is NOT - * clean, wired, busy, held, or mapped into a - * buffer, and one of the following: - * 1) The page is inactive, or a seldom used - * active page. 
- * -or- - * 2) we force the issue. - * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file - * due to flushing pages out of order and not trying - * align the clusters (which leave sporatic out-of-order + * due to flushing pages out of order and not trying to + * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: - while (ib && pageout_count < vm_pageout_page_count) { - vm_page_t p; - + while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } - if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; @@ -433,28 +395,27 @@ break; } vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } + pmap_remove_write(p); vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; + /* - * alignment boundry, stop here and switch directions. Do - * not clear ib. + * We are at an alignment boundary. Stop here, and switch + * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } - while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { - vm_page_t p; - if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); @@ -461,11 +422,12 @@ if (p->dirty == 0) break; vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } + pmap_remove_write(p); vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; @@ -474,17 +436,14 @@ /* * If we exhausted our forward scan, continue with the reverse scan - * when possible, even past a page boundry. This catches boundry - * conditions. + * when possible, even past an alignment boundary. This catches + * boundary conditions. */ - if (ib && pageout_count < vm_pageout_page_count) + if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; - /* - * we allow reads during pageouts... - */ - return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, - NULL)); + return (vm_pageout_flush(&mc[page_base], pageout_count, + VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* @@ -513,8 +472,8 @@ VM_OBJECT_ASSERT_WLOCKED(object); /* - * Initiate I/O. Bump the vm_page_t->busy counter and - * mark the pages read-only. + * Initiate I/O. Mark the pages busy and verify that they're valid + * and read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. @@ -526,8 +485,9 @@ KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); + KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, + ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_sbusy(mc[i]); - pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); @@ -544,23 +504,33 @@ ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); + /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* - * Page outside of range of object. Right now we - * essentially lose the changes by pretending it - * worked. + * The page is outside the object's range. 
We pretend + * that the page out worked and clean the page, so the + * changes will be lost if the page is reclaimed by + * the page daemon. */ vm_page_undirty(mt); + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* - * If page couldn't be paged out, then reactivate the - * page so it doesn't clog the inactive list. (We - * will try paging out it again later). + * If the page couldn't be paged out, then reactivate + * it so that it doesn't clog the laundry and inactive + * queues. (We will try paging it out again later). */ vm_page_lock(mt); vm_page_activate(mt); @@ -583,11 +553,6 @@ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); - if (vm_page_count_severe()) { - vm_page_lock(mt); - vm_page_try_to_cache(mt); - vm_page_unlock(mt); - } } } if (prunlen != NULL) @@ -595,24 +560,172 @@ return (numpagedout); } -static boolean_t -vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low, - vm_paddr_t high) +/* + * Attempt to acquire all of the necessary locks to launder a page and + * then call through the clustering layer to PUTPAGES. Wait a short + * time for a vnode lock. + * + * Requires the page and object lock on entry, releases both before return. + * Returns 0 on success and an errno otherwise. + */ +static int +vm_pageout_clean(vm_page_t m, int *numpagedout) { + struct vnode *vp; struct mount *mp; - struct vnode *vp; vm_object_t object; - vm_paddr_t pa; - vm_page_t m, m_tmp, next; - int lockmode; + vm_pindex_t pindex; + int error, lockmode; + vm_page_assert_locked(m); + object = m->object; + VM_OBJECT_ASSERT_WLOCKED(object); + error = 0; + vp = NULL; + mp = NULL; + + /* + * The object is already known NOT to be dead. It + * is possible for the vget() to block the whole + * pageout daemon, but the new low-memory handling + * code should prevent it. + * + * We can't wait forever for the vnode lock, we might + * deadlock due to a vn_read() getting stuck in + * vm_wait while holding this vnode. We skip the + * vnode if we can't get it in a reasonable amount + * of time. + */ + if (object->type == OBJT_VNODE) { + vm_page_unlock(m); + vp = object->handle; + if (vp->v_type == VREG && + vn_start_write(vp, &mp, V_NOWAIT) != 0) { + mp = NULL; + error = EDEADLK; + goto unlock_all; + } + KASSERT(mp != NULL, + ("vp %p with NULL v_mount", vp)); + vm_object_reference_locked(object); + pindex = m->pindex; + VM_OBJECT_WUNLOCK(object); + lockmode = MNT_SHARED_WRITES(vp->v_mount) ? + LK_SHARED : LK_EXCLUSIVE; + if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { + vp = NULL; + error = EDEADLK; + goto unlock_mp; + } + VM_OBJECT_WLOCK(object); + + /* + * Ensure that the object and vnode were not disassociated + * while locks were dropped. + */ + if (vp->v_object != object) { + error = ENOENT; + goto unlock_all; + } + vm_page_lock(m); + + /* + * While the object and page were unlocked, the page + * may have been: + * (1) moved to a different queue, + * (2) reallocated to a different object, + * (3) reallocated to a different offset, or + * (4) cleaned. + */ + if (!vm_page_in_laundry(m) || m->object != object || + m->pindex != pindex || m->dirty == 0) { + vm_page_unlock(m); + error = ENXIO; + goto unlock_all; + } + + /* + * The page may have been busied or held while the object + * and page locks were released. 
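
For reference, the rewritten vm_pageout_clean() reports its outcome through an errno-style return value:

	0       - a cluster was formed and handed to vm_pageout_flush()
	EDEADLK - the vnode lock or vn_start_write() could not be obtained
	          without risking a deadlock
	ENOENT  - the object and vnode were disassociated while the locks
	          were dropped
	ENXIO   - the page was requeued, reallocated or cleaned in the
	          meantime
	EBUSY   - the page became busy or held
	EIO     - the clustered write did not page out any pages

Its only caller, vm_pageout_launder(), treats EDEADLK specially (bumping pageout_lock_miss and the skipped-vnode count); the other errors simply mean the page is skipped for this pass.
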
+ */ + if (vm_page_busied(m) || m->hold_count != 0) { + vm_page_unlock(m); + error = EBUSY; + goto unlock_all; + } + } + + /* + * If a page is dirty, then it is either being washed + * (but not yet cleaned) or it is still in the + * laundry. If it is still in the laundry, then we + * start the cleaning operation. + */ + if ((*numpagedout = vm_pageout_cluster(m)) == 0) + error = EIO; + +unlock_all: + VM_OBJECT_WUNLOCK(object); + +unlock_mp: + vm_page_lock_assert(m, MA_NOTOWNED); + if (mp != NULL) { + if (vp != NULL) + vput(vp); + vm_object_deallocate(object); + vn_finished_write(mp); + } + + return (error); +} + +/* + * Attempt to launder the specified number of pages. + * + * Returns the number of pages successfully laundered. + */ +static int +vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) +{ + struct vm_pagequeue *pq; + vm_object_t object; + vm_page_t m, next; + int act_delta, error, maxscan, numpagedout, starting_target; + int vnodes_skipped; + bool pageout_ok, queue_locked; + + starting_target = launder; + vnodes_skipped = 0; + + /* + * Scan the laundry queue for pages eligible to be laundered. We stop + * once the target number of dirty pages have been laundered, or once + * we've reached the end of the queue. A single iteration of this loop + * may cause more than one page to be laundered because of clustering. + * + * maxscan ensures that we don't re-examine requeued pages. Any + * additional pages written as part of a cluster are subtracted from + * maxscan since they must be taken from the laundry queue. + */ + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + maxscan = pq->pq_cnt; + vm_pagequeue_lock(pq); - TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) { + queue_locked = true; + for (m = TAILQ_FIRST(&pq->pq_pl); + m != NULL && maxscan-- > 0 && launder > 0; + m = next) { + vm_pagequeue_assert_locked(pq); + KASSERT(queue_locked, ("unlocked laundry queue")); + KASSERT(vm_page_in_laundry(m), + ("page %p has an inconsistent queue", m)); + next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; - pa = VM_PAGE_TO_PHYS(m); - if (pa < low || pa + PAGE_SIZE > high) - continue; + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { vm_page_unlock(m); continue; @@ -621,326 +734,341 @@ if ((!VM_OBJECT_TRYWLOCK(object) && (!vm_pageout_fallback_object_lock(m, &next) || m->hold_count != 0)) || vm_page_busied(m)) { + VM_OBJECT_WUNLOCK(object); vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); continue; } - vm_page_test_dirty(m); - if (m->dirty == 0 && object->ref_count != 0) - pmap_remove_all(m); - if (m->dirty != 0) { - vm_page_unlock(m); - if (tries == 0 || (object->flags & OBJ_DEAD) != 0) { - VM_OBJECT_WUNLOCK(object); - continue; - } - if (object->type == OBJT_VNODE) { - vm_pagequeue_unlock(pq); - vp = object->handle; - vm_object_reference_locked(object); - VM_OBJECT_WUNLOCK(object); - (void)vn_start_write(vp, &mp, V_WAIT); - lockmode = MNT_SHARED_WRITES(vp->v_mount) ? 
- LK_SHARED : LK_EXCLUSIVE; - vn_lock(vp, lockmode | LK_RETRY); - VM_OBJECT_WLOCK(object); - vm_object_page_clean(object, 0, 0, OBJPC_SYNC); - VM_OBJECT_WUNLOCK(object); - VOP_UNLOCK(vp, 0); - vm_object_deallocate(object); - vn_finished_write(mp); - return (TRUE); - } else if (object->type == OBJT_SWAP || - object->type == OBJT_DEFAULT) { - vm_pagequeue_unlock(pq); - m_tmp = m; - vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC, - 0, NULL, NULL); - VM_OBJECT_WUNLOCK(object); - return (TRUE); - } - } else { - /* - * Dequeue here to prevent lock recursion in - * vm_page_cache(). - */ - vm_page_dequeue_locked(m); - vm_page_cache(m); - vm_page_unlock(m); + + /* + * Unlock the laundry queue, invalidating the 'next' pointer. + * Use a marker to remember our place in the laundry queue. + */ + TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, + plinks.q); + vm_pagequeue_unlock(pq); + queue_locked = false; + + /* + * Invalid pages can be easily freed. They cannot be + * mapped; vm_page_free() asserts this. + */ + if (m->valid == 0) + goto free_page; + + /* + * If the page has been referenced and the object is not dead, + * reactivate or requeue the page depending on whether the + * object is mapped. + */ + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta = 1; + } else + act_delta = 0; + if (object->ref_count != 0) + act_delta += pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); } - VM_OBJECT_WUNLOCK(object); - } - vm_pagequeue_unlock(pq); - return (FALSE); -} + if (act_delta != 0) { + if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); + vm_page_activate(m); -/* - * Increase the number of cached pages. The specified value, "tries", - * determines which categories of pages are cached: - * - * 0: All clean, inactive pages within the specified physical address range - * are cached. Will not sleep. - * 1: The vm_lowmem handlers are called. All inactive pages within - * the specified physical address range are cached. May sleep. - * 2: The vm_lowmem handlers are called. All inactive and active pages - * within the specified physical address range are cached. May sleep. - */ -void -vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) -{ - int actl, actmax, inactl, inactmax, dom, initial_dom; - static int start_dom = 0; + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; - if (tries > 0) { + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + goto drop_page; + } else if ((object->flags & OBJ_DEAD) == 0) + goto requeue_page; + } + /* - * Decrease registered cache sizes. The vm_lowmem handlers - * may acquire locks and/or sleep, so they can only be invoked - * when "tries" is greater than zero. + * If the page appears to be clean at the machine-independent + * layer, then remove all of its mappings from the pmap in + * anticipation of freeing it. If, however, any of the page's + * mappings allow write access, then the page may still be + * modified until the last of those mappings are removed. 
*/ - SDT_PROBE0(vm, , , vm__lowmem_cache); - EVENTHANDLER_INVOKE(vm_lowmem, 0); + if (object->ref_count != 0) { + vm_page_test_dirty(m); + if (m->dirty == 0) + pmap_remove_all(m); + } /* - * We do this explicitly after the caches have been drained - * above. + * Clean pages are freed, and dirty pages are paged out unless + * they belong to a dead object. Requeueing dirty pages from + * dead objects is pointless, as they are being paged out and + * freed by the thread that destroyed the object. */ - uma_reclaim(); + if (m->dirty == 0) { +free_page: + vm_page_free(m); + PCPU_INC(cnt.v_dfree); + } else if ((object->flags & OBJ_DEAD) == 0) { + if (object->type != OBJT_SWAP && + object->type != OBJT_DEFAULT) + pageout_ok = true; + else if (disable_swap_pageouts) + pageout_ok = false; + else + pageout_ok = true; + if (!pageout_ok) { +requeue_page: + vm_pagequeue_lock(pq); + queue_locked = true; + vm_page_requeue_locked(m); + goto drop_page; + } + + /* + * Form a cluster with adjacent, dirty pages from the + * same object, and page out that entire cluster. + * + * The adjacent, dirty pages must also be in the + * laundry. However, their mappings are not checked + * for new references. Consequently, a recently + * referenced page may be paged out. However, that + * page will not be prematurely reclaimed. After page + * out, the page will be placed in the inactive queue, + * where any new references will be detected and the + * page reactivated. + */ + error = vm_pageout_clean(m, &numpagedout); + if (error == 0) { + launder -= numpagedout; + maxscan -= numpagedout - 1; + } else if (error == EDEADLK) { + pageout_lock_miss++; + vnodes_skipped++; + } + goto relock_queue; + } +drop_page: + vm_page_unlock(m); + VM_OBJECT_WUNLOCK(object); +relock_queue: + if (!queue_locked) { + vm_pagequeue_lock(pq); + queue_locked = true; + } + next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); } + vm_pagequeue_unlock(pq); /* - * Make the next scan start on the next domain. + * Wakeup the sync daemon if we skipped a vnode in a writeable object + * and we didn't launder enough pages. */ - initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; + if (vnodes_skipped > 0 && launder > 0) + (void)speedup_syncer(); - inactl = 0; - inactmax = cnt.v_inactive_count; - actl = 0; - actmax = tries < 2 ? 0 : cnt.v_active_count; - dom = initial_dom; - - /* - * Scan domains in round-robin order, first inactive queues, - * then active. Since domain usually owns large physically - * contiguous chunk of memory, it makes sense to completely - * exhaust one domain before switching to next, while growing - * the pool of contiguous physical pages. - * - * Do not even start launder a domain which cannot contain - * the specified address range, as indicated by segments - * constituting the domain. 
- */ -again_inact: - if (inactl < inactmax) { - if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, - low, high) && - vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE], - tries, low, high)) { - inactl++; - goto again_inact; - } - if (++dom == vm_ndomains) - dom = 0; - if (dom != initial_dom) - goto again_inact; - } -again_act: - if (actl < actmax) { - if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, - low, high) && - vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE], - tries, low, high)) { - actl++; - goto again_act; - } - if (++dom == vm_ndomains) - dom = 0; - if (dom != initial_dom) - goto again_act; - } + return (starting_target - launder); } -#if !defined(NO_SWAPPING) /* - * vm_pageout_object_deactivate_pages - * - * Deactivate enough pages to satisfy the inactive target - * requirements. - * - * The object and map must be locked. + * Compute the integer square root. */ -static void -vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, - long desired) +static u_int +isqrt(u_int num) { - vm_object_t backing_object, object; - vm_page_t p; - int act_delta, remove_mode; + u_int bit, root, tmp; - VM_OBJECT_ASSERT_LOCKED(first_object); - if ((first_object->flags & OBJ_FICTITIOUS) != 0) - return; - for (object = first_object;; object = backing_object) { - if (pmap_resident_count(pmap) <= desired) - goto unlock_return; - VM_OBJECT_ASSERT_LOCKED(object); - if ((object->flags & OBJ_UNMANAGED) != 0 || - object->paging_in_progress != 0) - goto unlock_return; - - remove_mode = 0; - if (object->shadow_count > 1) - remove_mode = 1; - /* - * Scan the object's entire memory queue. - */ - TAILQ_FOREACH(p, &object->memq, listq) { - if (pmap_resident_count(pmap) <= desired) - goto unlock_return; - if (vm_page_busied(p)) - continue; - PCPU_INC(cnt.v_pdpages); - vm_page_lock(p); - if (p->wire_count != 0 || p->hold_count != 0 || - !pmap_page_exists_quick(pmap, p)) { - vm_page_unlock(p); - continue; - } - act_delta = pmap_ts_referenced(p); - if ((p->aflags & PGA_REFERENCED) != 0) { - if (act_delta == 0) - act_delta = 1; - vm_page_aflag_clear(p, PGA_REFERENCED); - } - if (p->queue != PQ_ACTIVE && act_delta != 0) { - vm_page_activate(p); - p->act_count += act_delta; - } else if (p->queue == PQ_ACTIVE) { - if (act_delta == 0) { - p->act_count -= min(p->act_count, - ACT_DECLINE); - if (!remove_mode && p->act_count == 0) { - pmap_remove_all(p); - vm_page_deactivate(p); - } else - vm_page_requeue(p); - } else { - vm_page_activate(p); - if (p->act_count < ACT_MAX - - ACT_ADVANCE) - p->act_count += ACT_ADVANCE; - vm_page_requeue(p); - } - } else if (p->queue == PQ_INACTIVE) - pmap_remove_all(p); - vm_page_unlock(p); + bit = 1u << ((NBBY * sizeof(u_int)) - 2); + while (bit > num) + bit >>= 2; + root = 0; + while (bit != 0) { + tmp = root + bit; + root >>= 1; + if (num >= tmp) { + num -= tmp; + root += bit; } - if ((backing_object = object->backing_object) == NULL) - goto unlock_return; - VM_OBJECT_RLOCK(backing_object); - if (object != first_object) - VM_OBJECT_RUNLOCK(object); + bit >>= 2; } -unlock_return: - if (object != first_object) - VM_OBJECT_RUNLOCK(object); + return (root); } /* - * deactivate some number of pages in a map, try to do it fairly, but - * that is really hard to do. + * Perform the work of the laundry thread: periodically wake up and determine + * whether any pages need to be laundered. If so, determine the number of pages + * that need to be laundered, and launder them. 
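
The isqrt() helper above is the classic bit-pair (shift and subtract) integer square root; it exists so that the background-laundering trigger below, ndirty * isqrt(wakeups - last_launder) >= nclean, can be evaluated in integer arithmetic. A standalone userland copy (illustrative only, not part of this commit) behaves as expected:

	#include <stdio.h>

	/* Same shape as the kernel helper: binary digit-pair square root. */
	static unsigned int
	isqrt_demo(unsigned int num)
	{
		unsigned int bit, root, tmp;

		bit = 1u << (sizeof(num) * 8 - 2);
		while (bit > num)
			bit >>= 2;
		root = 0;
		while (bit != 0) {
			tmp = root + bit;
			root >>= 1;
			if (num >= tmp) {
				num -= tmp;
				root += bit;
			}
			bit >>= 2;
		}
		return (root);
	}

	int
	main(void)
	{
		/* With 32-bit unsigned int this prints: 0 3 4 10 65535 */
		printf("%u %u %u %u %u\n", isqrt_demo(0), isqrt_demo(15),
		    isqrt_demo(16), isqrt_demo(100), isqrt_demo(4294967295u));
		return (0);
	}

In the laundry thread this makes the dirty/clean ratio needed to start a background run shrink as page daemon wakeups accumulate: one wakeup since the last laundering requires ndirty >= nclean, four wakeups require ndirty >= nclean / 2, one hundred wakeups require ndirty >= nclean / 10, and so on.
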
*/ static void -vm_pageout_map_deactivate_pages(map, desired) - vm_map_t map; - long desired; +vm_pageout_laundry_worker(void *arg) { - vm_map_entry_t tmpe; - vm_object_t obj, bigobj; - int nothingwired; + struct vm_domain *domain; + struct vm_pagequeue *pq; + uint64_t nclean, ndirty; + u_int last_launder, wakeups; + int domidx, last_target, launder, shortfall, shortfall_cycle, target; + bool in_shortfall; - if (!vm_map_trylock(map)) - return; + domidx = (uintptr_t)arg; + domain = &vm_dom[domidx]; + pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; + KASSERT(domain->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); - bigobj = NULL; - nothingwired = TRUE; + shortfall = 0; + in_shortfall = false; + shortfall_cycle = 0; + target = 0; + last_launder = 0; /* - * first, search out the biggest object, and try to free pages from - * that. + * The pageout laundry worker is never done, so loop forever. */ - tmpe = map->header.next; - while (tmpe != &map->header) { - if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { - obj = tmpe->object.vm_object; - if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { - if (obj->shadow_count <= 1 && - (bigobj == NULL || - bigobj->resident_page_count < obj->resident_page_count)) { - if (bigobj != NULL) - VM_OBJECT_RUNLOCK(bigobj); - bigobj = obj; - } else - VM_OBJECT_RUNLOCK(obj); - } + for (;;) { + KASSERT(target >= 0, ("negative target %d", target)); + KASSERT(shortfall_cycle >= 0, + ("negative cycle %d", shortfall_cycle)); + launder = 0; + wakeups = VM_METER_PCPU_CNT(v_pdwakeups); + + /* + * First determine whether we need to launder pages to meet a + * shortage of free pages. + */ + if (shortfall > 0) { + in_shortfall = true; + shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; + target = shortfall; + } else if (!in_shortfall) + goto trybackground; + else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { + /* + * We recently entered shortfall and began laundering + * pages. If we have completed that laundering run + * (and we are no longer in shortfall) or we have met + * our laundry target through other activity, then we + * can stop laundering pages. + */ + in_shortfall = false; + target = 0; + goto trybackground; } - if (tmpe->wired_count > 0) - nothingwired = FALSE; - tmpe = tmpe->next; - } + last_launder = wakeups; + launder = target / shortfall_cycle--; + goto dolaundry; - if (bigobj != NULL) { - vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); - VM_OBJECT_RUNLOCK(bigobj); - } - /* - * Next, hunt around for other pages to deactivate. We actually - * do this search sort of wrong -- .text first is not the best idea. - */ - tmpe = map->header.next; - while (tmpe != &map->header) { - if (pmap_resident_count(vm_map_pmap(map)) <= desired) - break; - if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { - obj = tmpe->object.vm_object; - if (obj != NULL) { - VM_OBJECT_RLOCK(obj); - vm_pageout_object_deactivate_pages(map->pmap, obj, desired); - VM_OBJECT_RUNLOCK(obj); + /* + * There's no immediate need to launder any pages; see if we + * meet the conditions to perform background laundering: + * + * 1. The ratio of dirty to clean inactive pages exceeds the + * background laundering threshold and the pagedaemon has + * been woken up to reclaim pages since our last + * laundering, or + * 2. we haven't yet reached the target of the current + * background laundering run. + * + * The background laundering threshold is not a constant. 
+ * Instead, it is a slowly growing function of the number of + * page daemon wakeups since the last laundering. Thus, as the + * ratio of dirty to clean inactive pages grows, the amount of + * memory pressure required to trigger laundering decreases. + */ +trybackground: + nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; + ndirty = vm_cnt.v_laundry_count; + if (target == 0 && wakeups != last_launder && + ndirty * isqrt(wakeups - last_launder) >= nclean) { + target = vm_background_launder_target; + } + + /* + * We have a non-zero background laundering target. If we've + * laundered up to our maximum without observing a page daemon + * wakeup, just stop. This is a safety belt that ensures we + * don't launder an excessive amount if memory pressure is low + * and the ratio of dirty to clean pages is large. Otherwise, + * proceed at the background laundering rate. + */ + if (target > 0) { + if (wakeups != last_launder) { + last_launder = wakeups; + last_target = target; + } else if (last_target - target >= + vm_background_launder_max * PAGE_SIZE / 1024) { + target = 0; } + launder = vm_background_launder_rate * PAGE_SIZE / 1024; + launder /= VM_LAUNDER_RATE; + if (launder > target) + launder = target; } - tmpe = tmpe->next; - } -#ifdef __ia64__ - /* - * Remove all non-wired, managed mappings if a process is swapped out. - * This will free page table pages. - */ - if (desired == 0) - pmap_remove_pages(map->pmap); -#else - /* - * Remove all mappings if a process is swapped out, this will free page - * table pages. - */ - if (desired == 0 && nothingwired) { - pmap_remove(vm_map_pmap(map), vm_map_min(map), - vm_map_max(map)); +dolaundry: + if (launder > 0) { + /* + * Because of I/O clustering, the number of laundered + * pages could exceed "target" by the maximum size of + * a cluster minus one. + */ + target -= min(vm_pageout_launder(domain, launder, + in_shortfall), target); + pause("laundp", hz / VM_LAUNDER_RATE); + } + + /* + * If we're not currently laundering pages and the page daemon + * hasn't posted a new request, sleep until the page daemon + * kicks us. + */ + vm_pagequeue_lock(pq); + if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) + (void)mtx_sleep(&vm_laundry_request, + vm_pagequeue_lockptr(pq), PVM, "launds", 0); + + /* + * If the pagedaemon has indicated that it's in shortfall, start + * a shortfall laundering unless we're already in the middle of + * one. This may preempt a background laundering. + */ + if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && + (!in_shortfall || shortfall_cycle == 0)) { + shortfall = vm_laundry_target() + vm_pageout_deficit; + target = 0; + } else + shortfall = 0; + + if (target == 0) + vm_laundry_request = VM_LAUNDRY_IDLE; + vm_pagequeue_unlock(pq); } -#endif - - vm_map_unlock(map); } -#endif /* !defined(NO_SWAPPING) */ /* * vm_pageout_scan does the dirty work for the pageout daemon. * - * pass 0 - Update active LRU/deactivate pages - * pass 1 - Move inactive to cache or free - * pass 2 - Launder dirty pages + * pass == 0: Update active LRU/deactivate pages + * pass >= 1: Free inactive pages + * + * Returns true if pass was zero or enough pages were freed by the inactive + * queue scan to meet the target. 
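
The new boolean return value is consumed by vm_pageout_worker() near the end of this diff. Simplified (the free-page wakeup and the exact sleep logic are elided), each iteration of the worker looks like:

	if (vm_pageout_wanted && target_met)
		vm_pageout_wanted = false;	/* the last scan met its target */
	if (vm_pageout_wanted) {
		/* The previous scan fell short; run another reclamation pass. */
		if (pass >= 1)
			pause("pwait", hz / VM_INACT_SCAN_RATE);
		pass++;
	} else {
		/* Sleep; resume with pass 0 or 1 depending on how we wake. */
		...
	}
	target_met = vm_pageout_scan(domain, pass);

so a pass-0 scan only updates the active LRU, while repeated shortfalls keep the daemon in back-to-back reclamation passes until the target is met.
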
*/ -static void +static bool vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; @@ -947,10 +1075,8 @@ struct vm_pagequeue *pq; vm_object_t object; long min_scan; - int act_delta, addl_page_shortage, deficit, maxscan, page_shortage; - int vnodes_skipped = 0; - int maxlaunder, scan_tick, scanned, starting_page_shortage; - int lockmode; + int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; + int page_shortage, scan_tick, scanned, starting_page_shortage; boolean_t queue_locked; /* @@ -981,8 +1107,9 @@ addl_page_shortage = 0; /* - * Calculate the number of pages we want to either free or move - * to the cache. + * Calculate the number of pages that we want to free. This number + * can be negative if many pages are freed between the wakeup call to + * the page daemon and this calculation. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); @@ -992,27 +1119,11 @@ starting_page_shortage = page_shortage; /* - * maxlaunder limits the number of dirty pages we flush per scan. - * For most systems a smaller value (16 or 32) is more robust under - * extreme memory and disk pressure because any unnecessary writes - * to disk can result in extreme performance degredation. However, - * systems with excessive dirty pages (especially when MAP_NOSYNC is - * used) will die horribly with limited laundering. If the pageout - * daemon cannot clean enough pages in the first pass, we let it go - * all out in succeeding passes. + * Start scanning the inactive queue for pages that we can free. The + * scan will stop when we reach the target or we have scanned the + * entire queue. (Note that m->act_count is not used to make + * decisions for the inactive queue, only for the active queue.) */ - if ((maxlaunder = vm_max_launder) <= 1) - maxlaunder = 1; - if (pass > 1) - maxlaunder = 10000; - - /* - * Start scanning the inactive queue for pages we can move to the - * cache or free. The scan will stop when the target is reached or - * we have scanned the entire inactive queue. Note that m->act_count - * is not used to form decisions for the inactive queue, only for the - * active queue. - */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); @@ -1022,7 +1133,7 @@ m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); - KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); + KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); @@ -1044,55 +1155,76 @@ * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ - if (!vm_pageout_page_lock(m, &next)) { - vm_page_unlock(m); - continue; + if (!vm_pageout_page_lock(m, &next)) + goto unlock_page; + else if (m->hold_count != 0) { + /* + * Held pages are essentially stuck in the + * queue. So, they ought to be discounted + * from the inactive count. See the + * calculation of inactq_shortage before the + * loop over the active queue below. 
+ */ + addl_page_shortage++; + goto unlock_page; } object = m->object; - if (!VM_OBJECT_TRYWLOCK(object) && - !vm_pageout_fallback_object_lock(m, &next)) { - vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - continue; + if (!VM_OBJECT_TRYWLOCK(object)) { + if (!vm_pageout_fallback_object_lock(m, &next)) + goto unlock_object; + else if (m->hold_count != 0) { + addl_page_shortage++; + goto unlock_object; + } } - - /* - * Don't mess with busy pages, keep them at at the - * front of the queue, most likely they are being - * paged out. Increment addl_page_shortage for busy - * pages, because they may leave the inactive queue - * shortly after page scan is finished. - */ if (vm_page_busied(m)) { + /* + * Don't mess with busy pages. Leave them at + * the front of the queue. Most likely, they + * are being paged out and will leave the + * queue shortly after the scan finishes. So, + * they ought to be discounted from the + * inactive count. + */ + addl_page_shortage++; +unlock_object: + VM_OBJECT_WUNLOCK(object); +unlock_page: vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - addl_page_shortage++; continue; } + KASSERT(m->hold_count == 0, ("Held page %p", m)); /* - * We unlock the inactive page queue, invalidating the - * 'next' pointer. Use our marker to remember our - * place. + * Dequeue the inactive page and unlock the inactive page + * queue, invalidating the 'next' pointer. Dequeueing the + * page here avoids a later reacquisition (and release) of + * the inactive page queue lock when vm_page_activate(), + * vm_page_free(), or vm_page_launder() is called. Use a + * marker to remember our place in the inactive queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); + vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; /* - * We bump the activation count if the page has been - * referenced while in the inactive queue. This makes - * it less likely that the page will be added back to the - * inactive queue prematurely again. Here we check the - * page tables (or emulated bits, if any), given the upper - * level VM system not knowing anything about existing - * references. + * Invalid pages can be easily freed. They cannot be + * mapped, vm_page_free() asserts this. */ - act_delta = 0; + if (m->valid == 0) + goto free_page; + + /* + * If the page has been referenced and the object is not dead, + * reactivate or requeue the page depending on whether the + * object is mapped. + */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; - } + } else + act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { @@ -1099,47 +1231,36 @@ KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } - - /* - * If the upper level VM system knows about any page - * references, we reactivate the page or requeue it. - */ if (act_delta != 0) { - if (object->ref_count) { + if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); vm_page_activate(m); + + /* + * Increase the activation count if the page + * was referenced while in the inactive queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. 
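
Two details are worth noting about this reactivation path: the page is re-activated with vm_page_activate() and then given extra act_count credit (act_delta + ACT_ADVANCE), so a page that was referenced while sitting in the inactive queue survives longer in the active queue before it can decay back, and each such rescue is counted via PCPU_INC(cnt.v_reactivated), the same accounting the new laundry scan uses earlier in this diff.
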
+ */ m->act_count += act_delta + ACT_ADVANCE; - } else { + goto drop_page; + } else if ((object->flags & OBJ_DEAD) == 0) { vm_pagequeue_lock(pq); queue_locked = TRUE; - vm_page_requeue_locked(m); + m->queue = PQ_INACTIVE; + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_pagequeue_cnt_inc(pq); + goto drop_page; } - VM_OBJECT_WUNLOCK(object); - vm_page_unlock(m); - goto relock_queue; } - if (m->hold_count != 0) { - vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - - /* - * Held pages are essentially stuck in the - * queue. So, they ought to be discounted - * from the inactive count. See the - * calculation of the page_shortage for the - * loop over the active queue below. - */ - addl_page_shortage++; - goto relock_queue; - } - /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in - * anticipation of placing it onto the cache queue. If, - * however, any of the page's mappings allow write access, - * then the page may still be modified until the last of those - * mappings are removed. + * anticipation of freeing it. If, however, any of the page's + * mappings allow write access, then the page may still be + * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); @@ -1147,199 +1268,23 @@ pmap_remove_all(m); } - if (m->valid == 0) { - /* - * Invalid pages can be easily freed - */ + /* + * Clean pages can be freed, but dirty pages must be sent back + * to the laundry, unless they belong to a dead object. + * Requeueing dirty pages from dead objects is pointless, as + * they are being paged out and freed by the thread that + * destroyed the object. + */ + if (m->dirty == 0) { +free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; - } else if (m->dirty == 0) { - /* - * Clean pages can be placed onto the cache queue. - * This effectively frees them. - */ - vm_page_cache(m); - --page_shortage; - } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { - /* - * Dirty pages need to be paged out, but flushing - * a page is extremely expensive verses freeing - * a clean page. Rather then artificially limiting - * the number of pages we can flush, we instead give - * dirty pages extra priority on the inactive queue - * by forcing them to be cycled through the queue - * twice before being flushed, after which the - * (now clean) page will cycle through once more - * before being freed. This significantly extends - * the thrash point for a heavily loaded machine. - */ - m->flags |= PG_WINATCFLS; - vm_pagequeue_lock(pq); - queue_locked = TRUE; - vm_page_requeue_locked(m); - } else if (maxlaunder > 0) { - /* - * We always want to try to flush some dirty pages if - * we encounter them, to keep the system stable. - * Normally this number is small, but under extreme - * pressure where there are insufficient clean pages - * on the inactive queue, we may have to go all out. - */ - int swap_pageouts_ok; - struct vnode *vp = NULL; - struct mount *mp = NULL; - - if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { - swap_pageouts_ok = 1; - } else { - swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); - swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && - vm_page_count_min()); - - } - - /* - * We don't bother paging objects that are "dead". - * Those objects are in a "rundown" state. 
- */ - if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { - vm_pagequeue_lock(pq); - vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - queue_locked = TRUE; - vm_page_requeue_locked(m); - goto relock_queue; - } - - /* - * The object is already known NOT to be dead. It - * is possible for the vget() to block the whole - * pageout daemon, but the new low-memory handling - * code should prevent it. - * - * The previous code skipped locked vnodes and, worse, - * reordered pages in the queue. This results in - * completely non-deterministic operation and, on a - * busy system, can lead to extremely non-optimal - * pageouts. For example, it can cause clean pages - * to be freed and dirty pages to be moved to the end - * of the queue. Since dirty pages are also moved to - * the end of the queue once-cleaned, this gives - * way too large a weighting to defering the freeing - * of dirty pages. - * - * We can't wait forever for the vnode lock, we might - * deadlock due to a vn_read() getting stuck in - * vm_wait while holding this vnode. We skip the - * vnode if we can't get it in a reasonable amount - * of time. - */ - if (object->type == OBJT_VNODE) { - vm_page_unlock(m); - vp = object->handle; - if (vp->v_type == VREG && - vn_start_write(vp, &mp, V_NOWAIT) != 0) { - mp = NULL; - ++pageout_lock_miss; - if (object->flags & OBJ_MIGHTBEDIRTY) - vnodes_skipped++; - goto unlock_and_continue; - } - KASSERT(mp != NULL, - ("vp %p with NULL v_mount", vp)); - vm_object_reference_locked(object); - VM_OBJECT_WUNLOCK(object); - lockmode = MNT_SHARED_WRITES(vp->v_mount) ? - LK_SHARED : LK_EXCLUSIVE; - if (vget(vp, lockmode | LK_TIMELOCK, - curthread)) { - VM_OBJECT_WLOCK(object); - ++pageout_lock_miss; - if (object->flags & OBJ_MIGHTBEDIRTY) - vnodes_skipped++; - vp = NULL; - goto unlock_and_continue; - } - VM_OBJECT_WLOCK(object); - vm_page_lock(m); - vm_pagequeue_lock(pq); - queue_locked = TRUE; - /* - * The page might have been moved to another - * queue during potential blocking in vget() - * above. The page might have been freed and - * reused for another vnode. - */ - if (m->queue != PQ_INACTIVE || - m->object != object || - TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) { - vm_page_unlock(m); - if (object->flags & OBJ_MIGHTBEDIRTY) - vnodes_skipped++; - goto unlock_and_continue; - } - - /* - * The page may have been busied during the - * blocking in vget(). We don't move the - * page back onto the end of the queue so that - * statistics are more correct if we don't. - */ - if (vm_page_busied(m)) { - vm_page_unlock(m); - addl_page_shortage++; - goto unlock_and_continue; - } - - /* - * If the page has become held it might - * be undergoing I/O, so skip it - */ - if (m->hold_count != 0) { - vm_page_unlock(m); - addl_page_shortage++; - if (object->flags & OBJ_MIGHTBEDIRTY) - vnodes_skipped++; - goto unlock_and_continue; - } - vm_pagequeue_unlock(pq); - queue_locked = FALSE; - } - - /* - * If a page is dirty, then it is either being washed - * (but not yet cleaned) or it is still in the - * laundry. If it is still in the laundry, then we - * start the cleaning operation. - * - * decrement page_shortage on success to account for - * the (future) cleaned page. Otherwise we could wind - * up laundering or cleaning too many pages. 
- */ - if (vm_pageout_clean(m) != 0) { - --page_shortage; - --maxlaunder; - } -unlock_and_continue: - vm_page_lock_assert(m, MA_NOTOWNED); - VM_OBJECT_WUNLOCK(object); - if (mp != NULL) { - if (queue_locked) { - vm_pagequeue_unlock(pq); - queue_locked = FALSE; - } - if (vp != NULL) - vput(vp); - vm_object_deallocate(object); - vn_finished_write(mp); - } - vm_page_lock_assert(m, MA_NOTOWNED); - goto relock_queue; - } + } else if ((object->flags & OBJ_DEAD) == 0) + vm_page_launder(m); +drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); -relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; @@ -1349,22 +1294,30 @@ } vm_pagequeue_unlock(pq); -#if !defined(NO_SWAPPING) /* - * Wakeup the swapout daemon if we didn't cache or free the targeted - * number of pages. + * Wake up the laundry thread so that it can perform any needed + * laundering. If we didn't meet our target, we're in shortfall and + * need to launder more aggressively. */ - if (vm_swap_enabled && page_shortage > 0) - vm_req_vmdaemon(VM_SWAP_NORMAL); -#endif + if (vm_laundry_request == VM_LAUNDRY_IDLE && + starting_page_shortage > 0) { + pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; + vm_pagequeue_lock(pq); + if (page_shortage > 0) { + vm_laundry_request = VM_LAUNDRY_SHORTFALL; + PCPU_INC(cnt.v_pdshortfalls); + } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) + vm_laundry_request = VM_LAUNDRY_BACKGROUND; + wakeup(&vm_laundry_request); + vm_pagequeue_unlock(pq); + } /* - * Wakeup the sync daemon if we skipped a vnode in a writeable object - * and we didn't cache or free enough pages. + * Wakeup the swapout daemon if we didn't free the targeted number of + * pages. */ - if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target - - cnt.v_free_min) - (void)speedup_syncer(); + if (page_shortage > 0) + vm_swapout_run(); /* * If the inactive queue scan fails repeatedly to meet its @@ -1374,10 +1327,20 @@ /* * Compute the number of pages we want to try to move from the - * active queue to the inactive queue. + * active queue to either the inactive or laundry queue. + * + * When scanning active pages, we make clean pages count more heavily + * towards the page shortage than dirty pages. This is because dirty + * pages must be laundered before they can be reused and thus have less + * utility when attempting to quickly alleviate a shortage. However, + * this weighting also causes the scan to deactivate dirty pages more + * more aggressively, improving the effectiveness of clustering and + * ensuring that they can eventually be reused. */ - page_shortage = cnt.v_inactive_target - cnt.v_inactive_count + + inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + + vm_cnt.v_laundry_count / act_scan_laundry_weight) + vm_paging_target() + deficit + addl_page_shortage; + inactq_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); @@ -1394,7 +1357,7 @@ min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; - if (min_scan > 0 || (page_shortage > 0 && maxscan > 0)) + if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* @@ -1403,7 +1366,7 @@ * candidates. Held pages may be deactivated. 
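
The inactq_shortage computed just above is expressed in weighted pages: the raw shortage is multiplied by act_scan_laundry_weight (default 3), and in the active-queue loop below each clean page that is deactivated subtracts act_scan_laundry_weight from it while each dirty page moved to the laundry subtracts only 1. As a worked example with the default weight, a raw shortage of 100 pages becomes inactq_shortage = 300, which the scan can satisfy with 100 clean deactivations, or 300 dirty pages sent to the laundry, or any mix in between. This reflects the point made in the comment above: a dirty page must still be laundered before it can relieve the shortage, so it is given less credit.
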
*/ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < - min_scan || (page_shortage > 0 && scanned < maxscan)); m = next, + min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); @@ -1428,11 +1391,12 @@ /* * Check to see "how much" the page has been used. */ - act_delta = 0; - if (m->aflags & PGA_REFERENCED) { + if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); - act_delta += 1; - } + act_delta = 1; + } else + act_delta = 0; + /* * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to @@ -1452,41 +1416,60 @@ /* * Advance or decay the act_count based on recent usage. */ - if (act_delta) { + if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; - } else { + } else m->act_count -= min(m->act_count, ACT_DECLINE); - act_delta = m->act_count; - } /* - * Move this page to the tail of the active or inactive + * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ - if (act_delta == 0) { + if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); - vm_page_deactivate(m); - page_shortage--; + + /* + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. During a page shortage, the inactive queue + * is necessarily small, so we may move dirty pages + * directly to the laundry queue. + */ + if (inactq_shortage <= 0) + vm_page_deactivate(m); + else { + /* + * Calling vm_page_test_dirty() here would + * require acquisition of the object's write + * lock. However, during a page shortage, + * directing dirty pages into the laundry + * queue is only an optimization and not a + * requirement. Therefore, we simply rely on + * the opportunistic updates to the page's + * dirty field by the pmap. + */ + if (m->dirty == 0) { + vm_page_deactivate(m); + inactq_shortage -= + act_scan_laundry_weight; + } else { + vm_page_launder(m); + inactq_shortage--; + } + } } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); -#if !defined(NO_SWAPPING) - /* - * Idle process swapout -- run once per second. 
- */ - if (vm_swap_idle_enabled) { - static long lsec; - if (time_second != lsec) { - vm_req_vmdaemon(VM_SWAP_IDLE); - lsec = time_second; - } - } -#endif + if (pass > 0) + vm_swapout_run_idle(); + return (page_shortage <= 0); } static int vm_pageout_oom_vote; @@ -1668,19 +1651,21 @@ PROC_UNLOCK(p); continue; } - _PHOLD(p); + _PHOLD_LITE(p); + PROC_UNLOCK(p); + sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { - _PRELE(p); - PROC_UNLOCK(p); vmspace_free(vm); + sx_slock(&allproc_lock); + PRELE(p); continue; } - PROC_UNLOCK(p); size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); + sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, @@ -1697,12 +1682,14 @@ } sx_sunlock(&allproc_lock); if (bigproc != NULL) { + if (vm_panic_on_oom != 0) + panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); - wakeup(&cnt.v_free_count); + wakeup(&vm_cnt.v_free_count); } } @@ -1710,10 +1697,13 @@ vm_pageout_worker(void *arg) { struct vm_domain *domain; - int domidx; + int domidx, pass; + bool target_met; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; + pass = 0; + target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to @@ -1724,54 +1714,80 @@ KASSERT(domain->vmd_segs != 0, ("domain without segments")); domain->vmd_last_active_scan = ticks; vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); + vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); + TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, + &domain->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { + mtx_lock(&vm_page_queue_free_mtx); + /* - * If we have enough free memory, wakeup waiters. Do - * not clear vm_pages_needed until we reach our target, - * otherwise we may be woken up over and over again and - * waste a lot of cpu. + * Generally, after a level >= 1 scan, if there are enough + * free pages to wakeup the waiters, then they are already + * awake. A call to vm_page_free() during the scan awakened + * them. However, in the following case, this wakeup serves + * to bound the amount of time that a thread might wait. + * Suppose a thread's call to vm_page_alloc() fails, but + * before that thread calls VM_WAIT, enough pages are freed by + * other threads to alleviate the free page shortage. The + * thread will, nonetheless, wait until another page is freed + * or this wakeup is performed. */ - mtx_lock(&vm_page_queue_free_mtx); if (vm_pages_needed && !vm_page_count_min()) { - if (!vm_paging_needed()) - vm_pages_needed = 0; - wakeup(&cnt.v_free_count); + vm_pages_needed = false; + wakeup(&vm_cnt.v_free_count); } - if (vm_pages_needed) { + + /* + * Do not clear vm_pageout_wanted until we reach our free page + * target. Otherwise, we may be awakened over and over again, + * wasting CPU time. + */ + if (vm_pageout_wanted && target_met) + vm_pageout_wanted = false; + + /* + * Might the page daemon receive a wakeup call? + */ + if (vm_pageout_wanted) { /* - * We're still not done. Either vm_pages_needed was - * set by another thread during the previous scan - * (typically, this happens during a level 0 scan) or - * vm_pages_needed was already set and the scan failed - * to free enough pages. 
If we haven't yet performed - * a level >= 2 scan (unlimited dirty cleaning), then - * upgrade the level and scan again now. Otherwise, - * sleep a bit and try again later. While sleeping, - * vm_pages_needed can be cleared. + * No. Either vm_pageout_wanted was set by another + * thread during the previous scan, which must have + * been a level 0 scan, or vm_pageout_wanted was + * already set and the scan failed to free enough + * pages. If we haven't yet performed a level >= 1 + * (page reclamation) scan, then increase the level + * and scan again now. Otherwise, sleep a bit and + * try again later. */ - if (domain->vmd_pass > 1) - msleep(&vm_pages_needed, - &vm_page_queue_free_mtx, PVM, "psleep", - hz / 2); + mtx_unlock(&vm_page_queue_free_mtx); + if (pass >= 1) + pause("pwait", hz / VM_INACT_SCAN_RATE); + pass++; } else { /* - * Good enough, sleep until required to refresh - * stats. + * Yes. If threads are still sleeping in VM_WAIT + * then we immediately start a new scan. Otherwise, + * sleep until the next wakeup or until pages need to + * have their reference stats updated. */ - msleep(&vm_pages_needed, &vm_page_queue_free_mtx, - PVM, "psleep", hz); + if (vm_pages_needed) { + mtx_unlock(&vm_page_queue_free_mtx); + if (pass == 0) + pass++; + } else if (mtx_sleep(&vm_pageout_wanted, + &vm_page_queue_free_mtx, PDROP | PVM, "psleep", + hz) == 0) { + PCPU_INC(cnt.v_pdwakeups); + pass = 1; + } else + pass = 0; } - if (vm_pages_needed) { - cnt.v_pdwakeups++; - domain->vmd_pass++; - } else - domain->vmd_pass = 0; - mtx_unlock(&vm_page_queue_free_mtx); - vm_pageout_scan(domain, domain->vmd_pass); + + target_met = vm_pageout_scan(domain, pass); } } @@ -1784,8 +1800,8 @@ /* * Initialize some paging parameters. */ - cnt.v_interrupt_free_min = 2; - if (cnt.v_page_count < 2000) + vm_cnt.v_interrupt_free_min = 2; + if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* @@ -1793,27 +1809,27 @@ * swap pager structures plus enough for any pv_entry structs * when paging. */ - if (cnt.v_page_count > 1024) - cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; + if (vm_cnt.v_page_count > 1024) + vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; else - cnt.v_free_min = 4; - cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + - cnt.v_interrupt_free_min; - cnt.v_free_reserved = vm_pageout_page_count + - cnt.v_pageout_free_min + (cnt.v_page_count / 768); - cnt.v_free_severe = cnt.v_free_min / 2; - cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; - cnt.v_free_min += cnt.v_free_reserved; - cnt.v_free_severe += cnt.v_free_reserved; - cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; - if (cnt.v_inactive_target > cnt.v_free_count / 3) - cnt.v_inactive_target = cnt.v_free_count / 3; + vm_cnt.v_free_min = 4; + vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + + vm_cnt.v_interrupt_free_min; + vm_cnt.v_free_reserved = vm_pageout_page_count + + vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); + vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; + vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; + vm_cnt.v_free_min += vm_cnt.v_free_reserved; + vm_cnt.v_free_severe += vm_cnt.v_free_reserved; + vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; + if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) + vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. 
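
For a concrete sense of scale (the numbers are illustrative, not taken from this commit): on a machine where v_free_min works out to 12000 pages, the threshold below becomes (12000 / 10) * 11 = 13200 pages, so the page daemon is woken while roughly 10% of the minimum still remains free. The new laundry thread sizes its background target from the same tunables, as (v_free_target - v_free_min) / 10 in vm_pageout_init() just below.
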
*/ - vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11; + vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each @@ -1825,7 +1841,15 @@ /* XXX does not really belong here */ if (vm_page_max_wired == 0) - vm_page_max_wired = cnt.v_free_count / 3; + vm_page_max_wired = vm_cnt.v_free_count / 3; + + /* + * Target amount of memory to move out of the laundry queue during a + * background laundering. This is proportional to the amount of system + * memory. + */ + vm_background_launder_target = (vm_cnt.v_free_target - + vm_cnt.v_free_min) / 10; } /* @@ -1835,12 +1859,17 @@ vm_pageout(void) { int error; -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC int i; #endif swap_pager_swap_init(); -#if MAXMEMDOM > 1 + snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0"); + error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, + 0, 0, "laundry: dom0"); + if (error != 0) + panic("starting laundry for domain 0, error %d", error); +#ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); @@ -1858,175 +1887,42 @@ } /* - * Unless the free page queue lock is held by the caller, this function - * should be regarded as advisory. Specifically, the caller should - * not msleep() on &cnt.v_free_count following this function unless - * the free page queue lock is held until the msleep() is performed. + * Perform an advisory wakeup of the page daemon. */ void pagedaemon_wakeup(void) { - if (!vm_pages_needed && curthread->td_proc != pageproc) { - vm_pages_needed = 1; - wakeup(&vm_pages_needed); - } -} + mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED); -#if !defined(NO_SWAPPING) -static void -vm_req_vmdaemon(int req) -{ - static int lastrun = 0; - - mtx_lock(&vm_daemon_mtx); - vm_pageout_req_swapout |= req; - if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { - wakeup(&vm_daemon_needed); - lastrun = ticks; + if (!vm_pageout_wanted && curthread->td_proc != pageproc) { + vm_pageout_wanted = true; + wakeup(&vm_pageout_wanted); } - mtx_unlock(&vm_daemon_mtx); } -static void -vm_daemon(void) +/* + * Wake up the page daemon and wait for it to reclaim free pages. + * + * This function returns with the free queues mutex unlocked. + */ +void +pagedaemon_wait(int pri, const char *wmesg) { - struct rlimit rsslim; - struct proc *p; - struct thread *td; - struct vmspace *vm; - int breakout, swapout_flags, tryagain, attempts; -#ifdef RACCT - uint64_t rsize, ravailable; -#endif - while (TRUE) { - mtx_lock(&vm_daemon_mtx); - msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", -#ifdef RACCT - racct_enable ? hz : 0 -#else - 0 -#endif - ); - swapout_flags = vm_pageout_req_swapout; - vm_pageout_req_swapout = 0; - mtx_unlock(&vm_daemon_mtx); - if (swapout_flags) - swapout_procs(swapout_flags); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - /* - * scan the processes for exceeding their rlimits or if - * process is swapped out -- deactivate pages - */ - tryagain = 0; - attempts = 0; -again: - attempts++; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - vm_pindex_t limit, size; - - /* - * if this is a system process or if we have already - * looked at this process, skip it. - */ - PROC_LOCK(p); - if (p->p_state != PRS_NORMAL || - p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { - PROC_UNLOCK(p); - continue; - } - /* - * if the process is in a non-running type state, - * don't touch it. 
- */ - breakout = 0; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (!TD_ON_RUNQ(td) && - !TD_IS_RUNNING(td) && - !TD_IS_SLEEPING(td) && - !TD_IS_SUSPENDED(td)) { - thread_unlock(td); - breakout = 1; - break; - } - thread_unlock(td); - } - if (breakout) { - PROC_UNLOCK(p); - continue; - } - /* - * get a limit - */ - lim_rlimit(p, RLIMIT_RSS, &rsslim); - limit = OFF_TO_IDX( - qmin(rsslim.rlim_cur, rsslim.rlim_max)); - - /* - * let processes that are swapped out really be - * swapped out set the limit to nothing (will force a - * swap-out.) - */ - if ((p->p_flag & P_INMEM) == 0) - limit = 0; /* XXX */ - vm = vmspace_acquire_ref(p); - PROC_UNLOCK(p); - if (vm == NULL) - continue; - - size = vmspace_resident_count(vm); - if (size >= limit) { - vm_pageout_map_deactivate_pages( - &vm->vm_map, limit); - size = vmspace_resident_count(vm); - } -#ifdef RACCT - if (racct_enable) { - rsize = IDX_TO_OFF(size); - PROC_LOCK(p); - if (p->p_state == PRS_NORMAL) - racct_set(p, RACCT_RSS, rsize); - ravailable = racct_get_available(p, RACCT_RSS); - PROC_UNLOCK(p); - if (rsize > ravailable) { - /* - * Don't be overly aggressive; this - * might be an innocent process, - * and the limit could've been exceeded - * by some memory hog. Don't try - * to deactivate more than 1/4th - * of process' resident set size. - */ - if (attempts <= 8) { - if (ravailable < rsize - - (rsize / 4)) { - ravailable = rsize - - (rsize / 4); - } - } - vm_pageout_map_deactivate_pages( - &vm->vm_map, - OFF_TO_IDX(ravailable)); - /* Update RSS usage after paging out. */ - size = vmspace_resident_count(vm); - rsize = IDX_TO_OFF(size); - PROC_LOCK(p); - if (p->p_state == PRS_NORMAL) - racct_set(p, RACCT_RSS, rsize); - PROC_UNLOCK(p); - if (rsize > ravailable) - tryagain = 1; - } - } -#endif - vmspace_free(vm); - } - sx_sunlock(&allproc_lock); - if (tryagain != 0 && attempts <= 10) - goto again; + /* + * vm_pageout_wanted may have been set by an advisory wakeup, but if the + * page daemon is running on a CPU, the wakeup will have been lost. + * Thus, deliver a potentially spurious wakeup to ensure that the page + * daemon has been notified of the shortage. + */ + if (!vm_pageout_wanted || !vm_pages_needed) { + vm_pageout_wanted = true; + wakeup(&vm_pageout_wanted); } + vm_pages_needed = true; + msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri, + wmesg, 0); } -#endif /* !defined(NO_SWAPPING) */ Modified: trunk/sys/vm/vm_pageout.h =================================================================== --- trunk/sys/vm/vm_pageout.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_pageout.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,12 +58,14 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $ + * $FreeBSD: stable/11/sys/vm/vm_pageout.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ +#ifdef _KERNEL + /* * Header file for pageout daemon. */ @@ -73,17 +75,11 @@ */ extern int vm_page_max_wired; -extern int vm_pages_needed; /* should be some "event" structure */ -extern int vm_pageout_pages_needed; extern int vm_pageout_deficit; extern int vm_pageout_page_count; +extern bool vm_pageout_wanted; +extern bool vm_pages_needed; -/* - * Swap out requests - */ -#define VM_SWAP_NORMAL 1 -#define VM_SWAP_IDLE 2 - #define VM_OOM_MEM 1 #define VM_OOM_SWAPZ 2 @@ -101,15 +97,17 @@ * Signal pageout-daemon and wait for it. 
*/ -extern void pagedaemon_wakeup(void); +void pagedaemon_wait(int pri, const char *wmesg); +void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() -extern void vm_wait(void); -extern void vm_waitpfault(void); +void vm_wait(void); +void vm_waitpfault(void); -#ifdef _KERNEL int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *); -void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t); void vm_pageout_oom(int shortage); -#endif + +void vm_swapout_run(void); +void vm_swapout_run_idle(void); +#endif /* _KERNEL */ #endif /* _VM_VM_PAGEOUT_H_ */ Modified: trunk/sys/vm/vm_pager.c =================================================================== --- trunk/sys/vm/vm_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -65,7 +65,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pager.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <sys/systm.h> @@ -87,7 +87,9 @@ int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ -static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int); +struct buf *swbuf; + +static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); @@ -95,13 +97,11 @@ static void dead_pager_dealloc(vm_object_t); static int -dead_pager_getpages(obj, ma, count, req) - vm_object_t obj; - vm_page_t *ma; - int count; - int req; +dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind, + int *rahead) { - return VM_PAGER_FAIL; + + return (VM_PAGER_FAIL); } static vm_object_t @@ -158,8 +158,6 @@ &mgtdevicepagerops, /* OBJT_MGTDEVICE */ }; -static const int npagers = sizeof(pagertab) / sizeof(pagertab[0]); - /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO. @@ -168,7 +166,7 @@ * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ -struct mtx_padalign pbuf_mtx; +struct mtx_padalign __exclusive_cache_line pbuf_mtx; static TAILQ_HEAD(swqueue, buf) bswlist; static int bswneeded; vm_offset_t swapbkva; /* swap buffers kva */ @@ -182,7 +180,7 @@ /* * Initialize known pagers */ - for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) + for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++) if ((*pgops)->pgo_init != NULL) (*(*pgops)->pgo_init)(); } @@ -208,6 +206,7 @@ cluster_pbuf_freecnt = nswbuf / 2; vnode_pbuf_freecnt = nswbuf / 2 + 1; + vnode_async_pbuf_freecnt = nswbuf / 2; } /* @@ -241,8 +240,80 @@ (*pagertab[object->type]->pgo_dealloc) (object); } +static void +vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count) +{ +#ifdef INVARIANTS + + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(count > 0, ("%s: 0 count", __func__)); + /* + * All pages must be busied, not mapped, not fully valid, + * not dirty and belong to the proper object. 
+ */ + for (int i = 0 ; i < count; i++) { + vm_page_assert_xbusied(m[i]); + KASSERT(!pmap_page_is_mapped(m[i]), + ("%s: page %p is mapped", __func__, m[i])); + KASSERT(m[i]->valid != VM_PAGE_BITS_ALL, + ("%s: request for a valid page %p", __func__, m[i])); + KASSERT(m[i]->dirty == 0, + ("%s: page %p is dirty", __func__, m[i])); + KASSERT(m[i]->object == object, + ("%s: wrong object %p/%p", __func__, object, m[i]->object)); + } +#endif +} + /* - * vm_pager_get_pages() - inline, see vm/vm_pager.h + * Page in the pages for the object using its associated pager. + * The requested page must be fully valid on successful return. + */ +int +vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind, + int *rahead) +{ +#ifdef INVARIANTS + vm_pindex_t pindex = m[0]->pindex; +#endif + int r; + + vm_pager_assert_in(object, m, count); + + r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind, + rahead); + if (r != VM_PAGER_OK) + return (r); + + for (int i = 0; i < count; i++) { + /* + * If pager has replaced a page, assert that it had + * updated the array. + */ + KASSERT(m[i] == vm_page_lookup(object, pindex++), + ("%s: mismatch page %p pindex %ju", __func__, + m[i], (uintmax_t )pindex - 1)); + /* + * Zero out partially filled data. + */ + if (m[i]->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(m[i], TRUE); + } + return (VM_PAGER_OK); +} + +int +vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count, + int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) +{ + + vm_pager_assert_in(object, m, count); + + return ((*pagertab[object->type]->pgo_getpages_async)(object, m, + count, rbehind, rahead, iodone, arg)); +} + +/* * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h */ @@ -289,12 +360,11 @@ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ - bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva; - bp->b_data = bp->b_saveaddr; - bp->b_kvabase = bp->b_saveaddr; + bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva; + bp->b_data = bp->b_kvabase; bp->b_kvasize = MAXPHYS; + bp->b_flags = 0; bp->b_xflags = 0; - bp->b_flags = 0; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; Modified: trunk/sys/vm/vm_pager.h =================================================================== --- trunk/sys/vm/vm_pager.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_pager.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -33,7 +33,7 @@ * SUCH DAMAGE. 
* * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 - * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_pager.h 331722 2018-03-29 02:50:57Z eadler $ */ /* @@ -51,19 +51,26 @@ typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); typedef void pgo_dealloc_t(vm_object_t); -typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int); +typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *); +typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int); +typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *, + pgo_getpages_iodone_t, void *); typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *); typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *); +typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t, + vm_pindex_t *, vm_pindex_t *); typedef void pgo_pageunswapped_t(vm_page_t); struct pagerops { - pgo_init_t *pgo_init; /* Initialize pager. */ - pgo_alloc_t *pgo_alloc; /* Allocate pager. */ - pgo_dealloc_t *pgo_dealloc; /* Disassociate. */ - pgo_getpages_t *pgo_getpages; /* Get (read) page. */ - pgo_putpages_t *pgo_putpages; /* Put (write) page. */ - pgo_haspage_t *pgo_haspage; /* Does pager have page? */ - pgo_pageunswapped_t *pgo_pageunswapped; + pgo_init_t *pgo_init; /* Initialize pager. */ + pgo_alloc_t *pgo_alloc; /* Allocate pager. */ + pgo_dealloc_t *pgo_dealloc; /* Disassociate. */ + pgo_getpages_t *pgo_getpages; /* Get (read) page. */ + pgo_getpages_async_t *pgo_getpages_async; /* Get page asyncly. */ + pgo_putpages_t *pgo_putpages; /* Put (write) page. */ + pgo_haspage_t *pgo_haspage; /* Query page. */ + pgo_populate_t *pgo_populate; /* Bulk spec pagein. */ + pgo_pageunswapped_t *pgo_pageunswapped; }; extern struct pagerops defaultpagerops; @@ -92,6 +99,7 @@ #define VM_PAGER_PUT_SYNC 0x0001 #define VM_PAGER_PUT_INVAL 0x0002 +#define VM_PAGER_PUT_NOREUSE 0x0004 #define VM_PAGER_CLUSTER_OK 0x0008 #ifdef _KERNEL @@ -103,34 +111,12 @@ vm_ooffset_t, struct ucred *); void vm_pager_bufferinit(void); void vm_pager_deallocate(vm_object_t); -static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int); +int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *); +int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *, + pgo_getpages_iodone_t, void *); void vm_pager_init(void); vm_object_t vm_pager_object_lookup(struct pagerlst *, void *); -/* - * vm_page_get_pages: - * - * Retrieve pages from the VM system in order to map them into an object - * ( or into VM space somewhere ). If the pagein was successful, we - * must fully validate it. 
- */ -static __inline int -vm_pager_get_pages( - vm_object_t object, - vm_page_t *m, - int count, - int reqpage -) { - int r; - - VM_OBJECT_ASSERT_WLOCKED(object); - r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage); - if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) { - vm_page_zero_invalid(m[reqpage], TRUE); - } - return (r); -} - static __inline void vm_pager_put_pages( vm_object_t object, @@ -170,6 +156,19 @@ return (ret); } +static __inline int +vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, + vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + + MPASS((object->flags & OBJ_POPULATE) != 0); + MPASS(pidx < object->size); + MPASS(object->paging_in_progress > 0); + return ((*pagertab[object->type]->pgo_populate)(object, pidx, + fault_type, max_prot, first, last)); +} + + /* * vm_pager_page_unswapped * @@ -195,6 +194,9 @@ struct cdev_pager_ops { int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres); + int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, + vm_pindex_t *last); int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); void (*cdev_pg_dtor)(void *handle); Modified: trunk/sys/vm/vm_param.h =================================================================== --- trunk/sys/vm/vm_param.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_param.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -58,7 +58,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $ + * $FreeBSD: stable/11/sys/vm/vm_param.h 331722 2018-03-29 02:50:57Z eadler $ */ /* @@ -76,16 +76,17 @@ #define VM_TOTAL 1 /* struct vmtotal */ #define VM_METER VM_TOTAL/* deprecated, use VM_TOTAL */ #define VM_LOADAVG 2 /* struct loadavg */ -#define VM_V_FREE_MIN 3 /* cnt.v_free_min */ -#define VM_V_FREE_TARGET 4 /* cnt.v_free_target */ -#define VM_V_FREE_RESERVED 5 /* cnt.v_free_reserved */ -#define VM_V_INACTIVE_TARGET 6 /* cnt.v_inactive_target */ -#define VM_V_CACHE_MIN 7 /* cnt.v_cache_min */ -#define VM_V_CACHE_MAX 8 /* cnt.v_cache_max */ -#define VM_V_PAGEOUT_FREE_MIN 9 /* cnt.v_pageout_free_min */ +#define VM_V_FREE_MIN 3 /* vm_cnt.v_free_min */ +#define VM_V_FREE_TARGET 4 /* vm_cnt.v_free_target */ +#define VM_V_FREE_RESERVED 5 /* vm_cnt.v_free_reserved */ +#define VM_V_INACTIVE_TARGET 6 /* vm_cnt.v_inactive_target */ +#define VM_OBSOLETE_7 7 /* unused, formerly v_cache_min */ +#define VM_OBSOLETE_8 8 /* unused, formerly v_cache_max */ +#define VM_V_PAGEOUT_FREE_MIN 9 /* vm_cnt.v_pageout_free_min */ #define VM_OBSOLETE_10 10 /* pageout algorithm */ #define VM_SWAPPING_ENABLED 11 /* swapping enabled */ -#define VM_MAXID 12 /* number of valid vm ids */ +#define VM_OVERCOMMIT 12 /* vm.overcommit */ +#define VM_MAXID 13 /* number of valid vm ids */ /* * Structure for swap device statistics Modified: trunk/sys/vm/vm_phys.c =================================================================== --- trunk/sys/vm/vm_phys.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_phys.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -38,7 +38,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_phys.c 331614 2018-03-27 13:09:35Z kib $"); #include "opt_ddb.h" #include 
"opt_vm.h" @@ -49,13 +49,14 @@ #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mutex.h> -#if MAXMEMDOM > 1 #include <sys/proc.h> -#endif #include <sys/queue.h> +#include <sys/rwlock.h> #include <sys/sbuf.h> #include <sys/sysctl.h> +#include <sys/tree.h> #include <sys/vmmeter.h> +#include <sys/seq.h> #include <ddb/ddb.h> @@ -66,10 +67,15 @@ #include <vm/vm_page.h> #include <vm/vm_phys.h> +#include <vm/vm_domain.h> + _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); +#ifdef VM_NUMA_ALLOC struct mem_affinity *mem_affinity; +int *mem_locality; +#endif int vm_ndomains = 1; @@ -76,13 +82,25 @@ struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; int vm_phys_nsegs; -#define VM_PHYS_FICTITIOUS_NSEGS 8 -static struct vm_phys_fictitious_seg { +struct vm_phys_fictitious_seg; +static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, + struct vm_phys_fictitious_seg *); + +RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = + RB_INITIALIZER(_vm_phys_fictitious_tree); + +struct vm_phys_fictitious_seg { + RB_ENTRY(vm_phys_fictitious_seg) node; + /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; -} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS]; -static struct mtx vm_phys_fictitious_reg_mtx; +}; + +RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, + vm_phys_fictitious_cmp); + +static struct rwlock vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist @@ -127,21 +145,139 @@ SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); +#ifdef VM_NUMA_ALLOC +static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); +SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, + NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); +#endif + SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); +/* + * Default to first-touch + round-robin. 
+ */ +static struct mtx vm_default_policy_mtx; +MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", + MTX_DEF); +#ifdef VM_NUMA_ALLOC +static struct vm_domain_policy vm_default_policy = + VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); +#else +/* Use round-robin so the domain policy code will only try once per allocation */ +static struct vm_domain_policy vm_default_policy = + VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); +#endif + static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order); +static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, + u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); -static int vm_phys_paddr_to_segind(vm_paddr_t pa); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); +static int +sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) +{ + char policy_name[32]; + int error; + + mtx_lock(&vm_default_policy_mtx); + + /* Map policy to output string */ + switch (vm_default_policy.p.policy) { + case VM_POLICY_FIRST_TOUCH: + strcpy(policy_name, "first-touch"); + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + strcpy(policy_name, "first-touch-rr"); + break; + case VM_POLICY_ROUND_ROBIN: + default: + strcpy(policy_name, "rr"); + break; + } + mtx_unlock(&vm_default_policy_mtx); + + error = sysctl_handle_string(oidp, &policy_name[0], + sizeof(policy_name), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vm_default_policy_mtx); + /* Set: match on the subset of policies that make sense as a default */ + if (strcmp("first-touch-rr", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); + } else if (strcmp("first-touch", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_FIRST_TOUCH, 0); + } else if (strcmp("rr", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_ROUND_ROBIN, 0); + } else { + error = EINVAL; + goto finish; + } + + error = 0; +finish: + mtx_unlock(&vm_default_policy_mtx); + return (error); +} + +SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, + 0, 0, sysctl_vm_default_policy, "A", + "Default policy (rr, first-touch, first-touch-rr"); + +/* + * Red-black tree helpers for vm fictitious range management. 
+ */ +static inline int +vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, + struct vm_phys_fictitious_seg *range) +{ + + KASSERT(range->start != 0 && range->end != 0, + ("Invalid range passed on search for vm_fictitious page")); + if (p->start >= range->end) + return (1); + if (p->start < range->start) + return (-1); + + return (0); +} + +static int +vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, + struct vm_phys_fictitious_seg *p2) +{ + + /* Check if this is a search for a page */ + if (p1->end == 0) + return (vm_phys_fictitious_in_range(p1, p2)); + + KASSERT(p2->end != 0, + ("Invalid range passed as second parameter to vm fictitious comparison")); + + /* Searching to add a new range */ + if (p1->end <= p2->start) + return (-1); + if (p1->start >= p2->end) + return (1); + + panic("Trying to add overlapping vm fictitious ranges:\n" + "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, + (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); +} + static __inline int vm_rr_selectdomain(void) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC struct thread *td; td = curthread; @@ -154,6 +290,53 @@ #endif } +/* + * Initialise a VM domain iterator. + * + * Check the thread policy, then the proc policy, + * then default to the system policy. + * + * Later on the various layers will have this logic + * plumbed into them and the phys code will be explicitly + * handed a VM domain policy to use. + */ +static void +vm_policy_iterator_init(struct vm_domain_iterator *vi) +{ +#ifdef VM_NUMA_ALLOC + struct vm_domain_policy lcl; +#endif + + vm_domain_iterator_init(vi); + +#ifdef VM_NUMA_ALLOC + /* Copy out the thread policy */ + vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); + if (lcl.p.policy != VM_POLICY_NONE) { + /* Thread policy is present; use it */ + vm_domain_iterator_set_policy(vi, &lcl); + return; + } + + vm_domain_policy_localcopy(&lcl, + &curthread->td_proc->p_vm_dom_policy); + if (lcl.p.policy != VM_POLICY_NONE) { + /* Process policy is present; use it */ + vm_domain_iterator_set_policy(vi, &lcl); + return; + } +#endif + /* Use system default policy */ + vm_domain_iterator_set_policy(vi, &vm_default_policy); +} + +static void +vm_policy_iterator_finish(struct vm_domain_iterator *vi) +{ + + vm_domain_iterator_cleanup(vi); +} + boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { @@ -243,6 +426,54 @@ return (error); } +/* + * Return affinity, or -1 if there's no affinity information. + */ +int +vm_phys_mem_affinity(int f, int t) +{ + +#ifdef VM_NUMA_ALLOC + if (mem_locality == NULL) + return (-1); + if (f >= vm_ndomains || t >= vm_ndomains) + return (-1); + return (mem_locality[f * vm_ndomains + t]); +#else + return (-1); +#endif +} + +#ifdef VM_NUMA_ALLOC +/* + * Outputs the VM locality table. 
+ */ +static int +sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sbuf; + int error, i, j; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + + sbuf_printf(&sbuf, "\n"); + + for (i = 0; i < vm_ndomains; i++) { + sbuf_printf(&sbuf, "%d: ", i); + for (j = 0; j < vm_ndomains; j++) { + sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); + } + sbuf_printf(&sbuf, "\n"); + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + return (error); +} +#endif + static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { @@ -289,6 +520,7 @@ static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { +#ifdef VM_NUMA_ALLOC int i; if (mem_affinity == NULL) { @@ -313,6 +545,9 @@ mem_affinity[i].domain); start = mem_affinity[i].end; } +#else + _vm_phys_create_seg(start, end, 0); +#endif } /* @@ -473,7 +708,8 @@ } } } - mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF); + + rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* @@ -495,36 +731,6 @@ } /* - * Initialize a physical page and add it to the free lists. - */ -void -vm_phys_add_page(vm_paddr_t pa) -{ - vm_page_t m; - struct vm_domain *vmd; - - cnt.v_page_count++; - m = vm_phys_paddr_to_vm_page(pa); - m->busy_lock = VPB_UNBUSIED; - m->phys_addr = pa; - m->queue = PQ_NONE; - m->segind = vm_phys_paddr_to_segind(pa); - vmd = vm_phys_domain(m); - vmd->vmd_page_count++; - vmd->vmd_segs |= 1UL << m->segind; - m->flags = PG_FREE; - KASSERT(m->order == VM_NFREEORDER, - ("vm_phys_add_page: page %p has unexpected order %d", - m, m->order)); - m->pool = VM_FREEPOOL_DEFAULT; - pmap_page_init(m); - mtx_lock(&vm_page_queue_free_mtx); - vm_phys_freecnt_adj(m, 1); - vm_phys_free_pages(m, 0); - mtx_unlock(&vm_page_queue_free_mtx); -} - -/* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. 
* @@ -534,7 +740,8 @@ vm_phys_alloc_pages(int pool, int order) { vm_page_t m; - int dom, domain, flind; + int domain, flind; + struct vm_domain_iterator vi; KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_pages: pool %d is out of range", pool)); @@ -541,8 +748,9 @@ KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_pages: order %d is out of range", order)); - for (dom = 0; dom < vm_ndomains; dom++) { - domain = vm_rr_selectdomain(); + vm_policy_iterator_init(&vi); + + while ((vm_domain_iterator_run(&vi, &domain)) == 0) { for (flind = 0; flind < vm_nfreelists; flind++) { m = vm_phys_alloc_domain_pages(domain, flind, pool, order); @@ -550,6 +758,8 @@ return (m); } } + + vm_policy_iterator_finish(&vi); return (NULL); } @@ -564,7 +774,8 @@ vm_phys_alloc_freelist_pages(int freelist, int pool, int order) { vm_page_t m; - int dom, domain; + struct vm_domain_iterator vi; + int domain; KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", @@ -573,13 +784,17 @@ ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); - for (dom = 0; dom < vm_ndomains; dom++) { - domain = vm_rr_selectdomain(); + + vm_policy_iterator_init(&vi); + + while ((vm_domain_iterator_run(&vi, &domain)) == 0) { m = vm_phys_alloc_domain_pages(domain, vm_freelist_to_flind[freelist], pool, order); if (m != NULL) return (m); } + + vm_policy_iterator_finish(&vi); return (NULL); } @@ -643,23 +858,39 @@ vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { - struct vm_phys_fictitious_seg *seg; + struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; - int segind; m = NULL; - for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) { - seg = &vm_phys_fictitious_segs[segind]; - if (pa >= seg->start && pa < seg->end) { - m = &seg->first_page[atop(pa - seg->start)]; - KASSERT((m->flags & PG_FICTITIOUS) != 0, - ("%p not fictitious", m)); - break; - } - } + tmp.start = pa; + tmp.end = 0; + + rw_rlock(&vm_phys_fictitious_reg_lock); + seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); + rw_runlock(&vm_phys_fictitious_reg_lock); + if (seg == NULL) + return (NULL); + + m = &seg->first_page[atop(pa - seg->start)]; + KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); + return (m); } +static inline void +vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, + long page_count, vm_memattr_t memattr) +{ + long i; + + bzero(range, page_count * sizeof(*range)); + for (i = 0; i < page_count; i++) { + vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); + range[i].oflags &= ~VPO_UNMANAGED; + range[i].busy_lock = VPB_UNBUSIED; + } +} + int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) @@ -666,104 +897,145 @@ { struct vm_phys_fictitious_seg *seg; vm_page_t fp; - long i, page_count; - int segind; + long page_count; #ifdef VM_PHYSSEG_DENSE - long pi; - boolean_t malloced; + long pi, pe; + long dpage_count; #endif + KASSERT(start < end, + ("Start of segment isn't less than end (start: %jx end: %jx)", + (uintmax_t)start, (uintmax_t)end)); + page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); - if (pi >= first_page && pi < vm_page_array_size + first_page) { - if (atop(end) >= vm_page_array_size + first_page) - return (EINVAL); + pe = atop(end); + if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; - malloced = FALSE; - } else + if ((pe - first_page) > 
vm_page_array_size) { + /* + * We have a segment that starts inside + * of vm_page_array, but ends outside of it. + * + * Use vm_page_array pages for those that are + * inside of the vm_page_array range, and + * allocate the remaining ones. + */ + dpage_count = vm_page_array_size - (pi - first_page); + vm_phys_fictitious_init_range(fp, start, dpage_count, + memattr); + page_count -= dpage_count; + start += ptoa(dpage_count); + goto alloc; + } + /* + * We can allocate the full range from vm_page_array, + * so there's no need to register the range in the tree. + */ + vm_phys_fictitious_init_range(fp, start, page_count, memattr); + return (0); + } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { + /* + * We have a segment that ends inside of vm_page_array, + * but starts outside of it. + */ + fp = &vm_page_array[0]; + dpage_count = pe - first_page; + vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, + memattr); + end -= ptoa(dpage_count); + page_count -= dpage_count; + goto alloc; + } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { + /* + * Trying to register a fictitious range that expands before + * and after vm_page_array. + */ + return (EINVAL); + } else { +alloc: #endif - { fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, - M_WAITOK | M_ZERO); + M_WAITOK); #ifdef VM_PHYSSEG_DENSE - malloced = TRUE; -#endif } - for (i = 0; i < page_count; i++) { - vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr); - fp[i].oflags &= ~VPO_UNMANAGED; - fp[i].busy_lock = VPB_UNBUSIED; - } - mtx_lock(&vm_phys_fictitious_reg_mtx); - for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) { - seg = &vm_phys_fictitious_segs[segind]; - if (seg->start == 0 && seg->end == 0) { - seg->start = start; - seg->end = end; - seg->first_page = fp; - mtx_unlock(&vm_phys_fictitious_reg_mtx); - return (0); - } - } - mtx_unlock(&vm_phys_fictitious_reg_mtx); -#ifdef VM_PHYSSEG_DENSE - if (malloced) #endif - free(fp, M_FICT_PAGES); - return (EBUSY); + vm_phys_fictitious_init_range(fp, start, page_count, memattr); + + seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); + seg->start = start; + seg->end = end; + seg->first_page = fp; + + rw_wlock(&vm_phys_fictitious_reg_lock); + RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); + rw_wunlock(&vm_phys_fictitious_reg_lock); + + return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { - struct vm_phys_fictitious_seg *seg; - vm_page_t fp; - int segind; + struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE - long pi; + long pi, pe; #endif + KASSERT(start < end, + ("Start of segment isn't less than end (start: %jx end: %jx)", + (uintmax_t)start, (uintmax_t)end)); + #ifdef VM_PHYSSEG_DENSE pi = atop(start); -#endif - - mtx_lock(&vm_phys_fictitious_reg_mtx); - for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) { - seg = &vm_phys_fictitious_segs[segind]; - if (seg->start == start && seg->end == end) { - seg->start = seg->end = 0; - fp = seg->first_page; - seg->first_page = NULL; - mtx_unlock(&vm_phys_fictitious_reg_mtx); -#ifdef VM_PHYSSEG_DENSE - if (pi < first_page || atop(end) >= vm_page_array_size) -#endif - free(fp, M_FICT_PAGES); + pe = atop(end); + if (pi >= first_page && (pi - first_page) < vm_page_array_size) { + if ((pe - first_page) <= vm_page_array_size) { + /* + * This segment was allocated using vm_page_array + * only, there's nothing to do since those pages + * were never added to the tree. 
+ */ return; } + /* + * We have a segment that starts inside + * of vm_page_array, but ends outside of it. + * + * Calculate how many pages were added to the + * tree and free them. + */ + start = ptoa(first_page + vm_page_array_size); + } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { + /* + * We have a segment that ends inside of vm_page_array, + * but starts outside of it. + */ + end = ptoa(first_page); + } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { + /* Since it's not possible to register such a range, panic. */ + panic( + "Unregistering not registered fictitious range [%#jx:%#jx]", + (uintmax_t)start, (uintmax_t)end); } - mtx_unlock(&vm_phys_fictitious_reg_mtx); - KASSERT(0, ("Unregistering not registered fictitious range")); -} +#endif + tmp.start = start; + tmp.end = 0; -/* - * Find the segment containing the given physical address. - */ -static int -vm_phys_paddr_to_segind(vm_paddr_t pa) -{ - struct vm_phys_seg *seg; - int segind; - - for (segind = 0; segind < vm_phys_nsegs; segind++) { - seg = &vm_phys_segs[segind]; - if (pa >= seg->start && pa < seg->end) - return (segind); + rw_wlock(&vm_phys_fictitious_reg_lock); + seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); + if (seg->start != start || seg->end != end) { + rw_wunlock(&vm_phys_fictitious_reg_lock); + panic( + "Unregistering not registered fictitious range [%#jx:%#jx]", + (uintmax_t)start, (uintmax_t)end); } - panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" , - (uintmax_t)pa); + RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); + rw_wunlock(&vm_phys_fictitious_reg_lock); + free(seg->first_page, M_FICT_PAGES); + free(seg, M_FICT_PAGES); } /* @@ -853,6 +1125,56 @@ } /* + * Scan physical memory between the specified addresses "low" and "high" for a + * run of contiguous physical pages that satisfy the specified conditions, and + * return the lowest page in the run. The specified "alignment" determines + * the alignment of the lowest physical page in the run. If the specified + * "boundary" is non-zero, then the run of physical pages cannot span a + * physical address that is a multiple of "boundary". + * + * "npages" must be greater than zero. Both "alignment" and "boundary" must + * be a power of two. + */ +vm_page_t +vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, int options) +{ + vm_paddr_t pa_end; + vm_page_t m_end, m_run, m_start; + struct vm_phys_seg *seg; + int segind; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + if (low >= high) + return (NULL); + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + if (seg->start >= high) + break; + if (low >= seg->end) + continue; + if (low <= seg->start) + m_start = seg->first_page; + else + m_start = &seg->first_page[atop(low - seg->start)]; + if (high < seg->end) + pa_end = high; + else + pa_end = seg->end; + if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) + continue; + m_end = &seg->first_page[atop(pa_end - seg->start)]; + m_run = vm_page_scan_contig(npages, m_start, m_end, + alignment, boundary, options); + if (m_run != NULL) + return (m_run); + } + return (NULL); +} + +/* * Set the pool for a contiguous, power of two-sized set of physical pages. 
*/ void @@ -946,7 +1268,7 @@ for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) { for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { - if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { + if ((m_tmp->flags & PG_ZERO) == 0) { vm_phys_unfree_page(m_tmp); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); @@ -990,85 +1312,125 @@ vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { + vm_paddr_t pa_end, pa_start; + vm_page_t m_run; + struct vm_domain_iterator vi; + struct vm_phys_seg *seg; + int domain, segind; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + if (low >= high) + return (NULL); + vm_policy_iterator_init(&vi); +restartdom: + if (vm_domain_iterator_run(&vi, &domain) != 0) { + vm_policy_iterator_finish(&vi); + return (NULL); + } + m_run = NULL; + for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { + seg = &vm_phys_segs[segind]; + if (seg->start >= high || seg->domain != domain) + continue; + if (low >= seg->end) + break; + if (low <= seg->start) + pa_start = seg->start; + else + pa_start = low; + if (high < seg->end) + pa_end = high; + else + pa_end = seg->end; + if (pa_end - pa_start < ptoa(npages)) + continue; + m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, + alignment, boundary); + if (m_run != NULL) + break; + } + if (m_run == NULL && !vm_domain_iterator_isdone(&vi)) + goto restartdom; + vm_policy_iterator_finish(&vi); + return (m_run); +} + +/* + * Allocate a run of contiguous physical pages from the free list for the + * specified segment. + */ +static vm_page_t +vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, + vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) +{ struct vm_freelist *fl; - struct vm_phys_seg *seg; - vm_paddr_t pa, pa_last, size; + vm_paddr_t pa, pa_end, size; vm_page_t m, m_ret; u_long npages_end; - int dom, domain, flind, oind, order, pind; + int oind, order, pind; + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - size = npages << PAGE_SHIFT; - KASSERT(size != 0, - ("vm_phys_alloc_contig: size must not be 0")); - KASSERT((alignment & (alignment - 1)) == 0, - ("vm_phys_alloc_contig: alignment must be a power of 2")); - KASSERT((boundary & (boundary - 1)) == 0, - ("vm_phys_alloc_contig: boundary must be a power of 2")); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); - dom = 0; -restartdom: - domain = vm_rr_selectdomain(); - for (flind = 0; flind < vm_nfreelists; flind++) { - for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = &vm_phys_free_queues[domain][flind][pind][0]; - TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { + /* Search for a run satisfying the specified conditions. */ + size = npages << PAGE_SHIFT; + for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; + oind++) { + for (pind = 0; pind < VM_NFREEPOOL; pind++) { + fl = (*seg->free_queues)[pind]; + TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { + /* + * Is the size of this allocation request + * larger than the largest block size? 
+ */ + if (order >= VM_NFREEORDER) { /* - * A free list may contain physical pages - * from one or more segments. + * Determine if a sufficient number of + * subsequent blocks to satisfy the + * allocation request are free. */ - seg = &vm_phys_segs[m_ret->segind]; - if (seg->start > high || - low >= seg->end) + pa = VM_PAGE_TO_PHYS(m_ret); + pa_end = pa + size; + if (pa_end < pa) continue; - - /* - * Is the size of this allocation request - * larger than the largest block size? - */ - if (order >= VM_NFREEORDER) { - /* - * Determine if a sufficient number - * of subsequent blocks to satisfy - * the allocation request are free. - */ - pa = VM_PAGE_TO_PHYS(m_ret); - pa_last = pa + size; - for (;;) { - pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1); - if (pa >= pa_last) - break; - if (pa < seg->start || - pa >= seg->end) - break; - m = &seg->first_page[atop(pa - seg->start)]; - if (m->order != VM_NFREEORDER - 1) - break; - } - /* If not, continue to the next block. */ - if (pa < pa_last) - continue; + for (;;) { + pa += 1 << (PAGE_SHIFT + + VM_NFREEORDER - 1); + if (pa >= pa_end || + pa < seg->start || + pa >= seg->end) + break; + m = &seg->first_page[atop(pa - + seg->start)]; + if (m->order != VM_NFREEORDER - + 1) + break; } + /* If not, go to the next block. */ + if (pa < pa_end) + continue; + } - /* - * Determine if the blocks are within the given range, - * satisfy the given alignment, and do not cross the - * given boundary. - */ - pa = VM_PAGE_TO_PHYS(m_ret); - if (pa >= low && - pa + size <= high && - (pa & (alignment - 1)) == 0 && - ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0) - goto done; - } + /* + * Determine if the blocks are within the + * given range, satisfy the given alignment, + * and do not cross the given boundary. + */ + pa = VM_PAGE_TO_PHYS(m_ret); + pa_end = pa + size; + if (pa >= low && pa_end <= high && + (pa & (alignment - 1)) == 0 && + rounddown2(pa ^ (pa_end - 1), boundary) == 0) + goto done; } } } - if (++dom < vm_ndomains) - goto restartdom; return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { Modified: trunk/sys/vm/vm_phys.h =================================================================== --- trunk/sys/vm/vm_phys.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_phys.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -29,7 +29,7 @@ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $ + * $FreeBSD: stable/11/sys/vm/vm_phys.h 329381 2018-02-16 16:16:33Z mjg $ */ /* @@ -62,6 +62,7 @@ }; extern struct mem_affinity *mem_affinity; +extern int *mem_locality; extern int vm_ndomains; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; @@ -69,7 +70,6 @@ /* * The following functions are only to be used by the virtual memory system. 
*/ -void vm_phys_add_page(vm_paddr_t pa); void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); @@ -84,9 +84,12 @@ void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); +vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); +int vm_phys_mem_affinity(int f, int t); /* * vm_phys_domain: @@ -96,7 +99,7 @@ static inline struct vm_domain * vm_phys_domain(vm_page_t m) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC int domn, segind; /* XXXKIB try to assert that the page is managed */ @@ -110,13 +113,13 @@ #endif } -static inline void +static inline u_int vm_phys_freecnt_adj(vm_page_t m, int adj) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - cnt.v_free_count += adj; vm_phys_domain(m)->vmd_free_count += adj; + return (vm_cnt.v_free_count += adj); } #endif /* _KERNEL */ Modified: trunk/sys/vm/vm_radix.c =================================================================== --- trunk/sys/vm/vm_radix.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_radix.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -50,7 +50,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_radix.c 327785 2018-01-10 20:39:26Z markj $"); #include "opt_ddb.h" @@ -299,21 +299,19 @@ * are needed to store them. */ if (!uma_zone_reserve_kva(vm_radix_node_zone, - ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE + + ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(struct vm_radix_node)))) panic("%s: unable to reserve KVA", __func__); } -SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND, +SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD, vm_radix_reserve_kva, NULL); #endif /* * Initialize the UMA slab zone. - * Until vm_radix_prealloc() is called, the zone will be served by the - * UMA boot-time pre-allocated pool of pages. */ void -vm_radix_init(void) +vm_radix_zinit(void) { vm_radix_node_zone = uma_zcreate("RADIX NODE", @@ -342,8 +340,6 @@ index = page->pindex; -restart: - /* * The owner of record for root is not really important because it * will never be used. @@ -361,32 +357,10 @@ panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = vm_radix_keydiff(m->pindex, index); - - /* - * During node allocation the trie that is being - * walked can be modified because of recursing radix - * trie operations. - * If this is the case, the recursing functions signal - * such situation and the insert operation must - * start from scratch again. - * The freed radix node will then be in the UMA - * caches very likely to avoid the same situation - * to happen. 
- */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); vm_radix_addpage(tmp, m->pindex, clev, m); @@ -410,21 +384,9 @@ */ newind = rnode->rn_owner; clev = vm_radix_keydiff(newind, index); - - /* See the comments above. */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); slot = vm_radix_slot(newind, clev); @@ -699,10 +661,10 @@ } /* - * Remove the specified index from the tree. - * Panics if the key is not present. + * Remove the specified index from the trie, and return the value stored at + * that index. If the index is not present, return NULL. */ -void +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode, *parent; @@ -709,41 +671,27 @@ vm_page_t m; int i, slot; - /* - * Detect if a page is going to be removed from a trie which is - * already undergoing another trie operation. - * Right now this is only possible for vm_radix_remove() recursing - * into vm_radix_insert(). - * If this is the case, the caller must be notified about this - * situation. It will also takecare to update the RT_TRIE_MODIFIED - * accordingly. - * The RT_TRIE_MODIFIED bit is set here because the remove operation - * will always succeed. - */ - if ((rtree->rt_flags & RT_INSERT_INPROG) != 0) - rtree->rt_flags |= RT_TRIE_MODIFIED; - rnode = vm_radix_getroot(rtree); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); vm_radix_setroot(rtree, NULL); - return; + return (m); } parent = NULL; for (;;) { if (rnode == NULL) - panic("vm_radix_remove: impossible to locate the key"); + return (NULL); slot = vm_radix_slot(index, rnode->rn_clev); if (vm_radix_isleaf(rnode->rn_child[slot])) { m = vm_radix_topage(rnode->rn_child[slot]); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); rnode->rn_child[slot] = NULL; rnode->rn_count--; if (rnode->rn_count > 1) - break; + return (m); for (i = 0; i < VM_RADIX_COUNT; i++) if (rnode->rn_child[i] != NULL) break; @@ -760,7 +708,7 @@ rnode->rn_count--; rnode->rn_child[i] = NULL; vm_radix_node_put(rnode); - break; + return (m); } parent = rnode; rnode = rnode->rn_child[slot]; @@ -777,9 +725,6 @@ { struct vm_radix_node *root; - KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0, - ("vm_radix_reclaim_allnodes: unexpected trie recursion")); - root = vm_radix_getroot(rtree); if (root == NULL) return; @@ -831,6 +776,12 @@ panic("%s: original replacing page not found", __func__); } +void +vm_radix_wait(void) +{ + uma_zwait(vm_radix_node_zone); +} + #ifdef DDB /* * Show details about the given radix node. 
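
The vm_radix.c hunks above change the contract of vm_radix_remove(): it now returns the page that was stored at the given index, or NULL when the index is absent, where the old code panicked. A minimal caller-side sketch of that contract, assuming the usual kernel VM headers and a write-locked object (example_detach_page is an invented name, purely for illustration):

/*
 * Illustrative only: example_detach_page() is not a function in this
 * commit.  It shows a caller handling the new vm_radix_remove()
 * contract, in which a missing index is reported by a NULL return
 * instead of a panic.  Assumes the object is write-locked.
 */
static vm_page_t
example_detach_page(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(object);
	m = vm_radix_remove(&object->rtree, pindex);
	if (m == NULL)
		return (NULL);	/* pindex was not present in the trie */
	/* Caller-specific teardown of "m" would go here. */
	return (m);
}

The panic sites removed in the hunk ("invalid key found", "impossible to locate the key") become NULL returns that the caller is now expected to handle.
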
Modified: trunk/sys/vm/vm_radix.h =================================================================== --- trunk/sys/vm/vm_radix.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_radix.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $ + * $FreeBSD: stable/11/sys/vm/vm_radix.h 327785 2018-01-10 20:39:26Z markj $ */ #ifndef _VM_RADIX_H_ @@ -36,15 +36,30 @@ #ifdef _KERNEL -void vm_radix_init(void); int vm_radix_insert(struct vm_radix *rtree, vm_page_t page); +void vm_radix_wait(void); boolean_t vm_radix_is_singleton(struct vm_radix *rtree); vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index); void vm_radix_reclaim_allnodes(struct vm_radix *rtree); -void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage); +void vm_radix_zinit(void); +static __inline void +vm_radix_init(struct vm_radix *rtree) +{ + + rtree->rt_root = 0; +} + +static __inline boolean_t +vm_radix_is_empty(struct vm_radix *rtree) +{ + + return (rtree->rt_root == 0); +} + #endif /* _KERNEL */ #endif /* !_VM_RADIX_H_ */ Modified: trunk/sys/vm/vm_reserv.c =================================================================== --- trunk/sys/vm/vm_reserv.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_reserv.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -1,7 +1,7 @@ /* $MidnightBSD$ */ /*- * Copyright (c) 2002-2006 Rice University - * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu> + * Copyright (c) 2007-2011 Alan L. Cox <alc at cs.rice.edu> * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, @@ -38,7 +38,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_reserv.c 351826 2019-09-04 19:31:37Z ray $"); #include "opt_vm.h" @@ -52,6 +52,7 @@ #include <sys/sbuf.h> #include <sys/sysctl.h> #include <sys/systm.h> +#include <sys/vmmeter.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -63,7 +64,7 @@ /* * The reservation system supports the speculative allocation of large physical - * pages ("superpages"). Speculative allocation enables the fully-automatic + * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ @@ -94,6 +95,61 @@ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* + * The size of a population map entry + */ +typedef u_long popmap_t; + +/* + * The number of bits in a population map entry + */ +#define NBPOPMAP (NBBY * sizeof(popmap_t)) + +/* + * The number of population map entries in a reservation + */ +#define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) + +/* + * Clear a bit in the population map. + */ +static __inline void +popmap_clear(popmap_t popmap[], int i) +{ + + popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); +} + +/* + * Set a bit in the population map. 
+ */ +static __inline void +popmap_set(popmap_t popmap[], int i) +{ + + popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); +} + +/* + * Is a bit in the population map clear? + */ +static __inline boolean_t +popmap_is_clear(popmap_t popmap[], int i) +{ + + return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); +} + +/* + * Is a bit in the population map set? + */ +static __inline boolean_t +popmap_is_set(popmap_t popmap[], int i) +{ + + return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); +} + +/* * The reservation structure * * A reservation structure is constructed whenever a large physical page is @@ -101,11 +157,11 @@ * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the - * reservation is not fully utilized, it appears in the queue of partially- + * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * - * A partially-populated reservation can be broken and reclaimed at any time. + * A partially populated reservation can be broken and reclaimed at any time. */ struct vm_reserv { TAILQ_ENTRY(vm_reserv) partpopq; @@ -115,6 +171,7 @@ vm_page_t pages; /* first page of a superpage */ int popcnt; /* # of pages in use */ char inpartpopq; + popmap_t popmap[NPOPMAP]; /* bit vector of used pages */ }; /* @@ -141,11 +198,11 @@ static vm_reserv_t vm_reserv_array; /* - * The partially-populated reservation queue + * The partially populated reservation queue * - * This queue enables the fast recovery of an unused cached or free small page - * from a partially-populated reservation. The reservation at the head of - * this queue is the least-recently-changed, partially-populated reservation. + * This queue enables the fast recovery of an unused free small page from a + * partially populated reservation. The reservation at the head of this queue + * is the least recently changed, partially populated reservation. * * Access to this queue is synchronized by the free page queue lock. 
*/ @@ -162,26 +219,60 @@ SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD, &vm_reserv_freed, 0, "Cumulative number of freed reservations"); +static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS); + +SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, + sysctl_vm_reserv_fullpop, "I", "Current number of full reservations"); + static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, - sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues"); + sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues"); static long vm_reserv_reclaimed; SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations"); -static void vm_reserv_depopulate(vm_reserv_t rv); +static void vm_reserv_break(vm_reserv_t rv); +static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); -static void vm_reserv_populate(vm_reserv_t rv); +static void vm_reserv_populate(vm_reserv_t rv, int index); static void vm_reserv_reclaim(vm_reserv_t rv); /* - * Describes the current state of the partially-populated reservation queue. + * Returns the current number of full reservations. + * + * Since the number of full reservations is computed without acquiring the + * free page queue lock, the returned value may be inexact. */ static int +sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) +{ + vm_paddr_t paddr; + struct vm_phys_seg *seg; + vm_reserv_t rv; + int fullpop, segind; + + fullpop = 0; + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); + while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + + VM_LEVEL_0_SIZE <= seg->end) { + rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; + fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; + paddr += VM_LEVEL_0_SIZE; + } + } + return (sysctl_handle_int(oidp, &fullpop, 0, req)); +} + +/* + * Describes the current state of the partially populated reservation queue. + */ +static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; @@ -213,18 +304,21 @@ /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the - * reservation to the tail of the partially-populated reservations queue if the + * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. */ static void -vm_reserv_depopulate(vm_reserv_t rv) +vm_reserv_depopulate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); + KASSERT(popmap_is_set(rv->popmap, index), + ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, + index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); if (rv->inpartpopq) { @@ -236,6 +330,7 @@ rv)); rv->pages->psind = 0; } + popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { LIST_REMOVE(rv, objq); @@ -271,17 +366,20 @@ /* * Increases the given reservation's population count. Moves the reservation - * to the tail of the partially-populated reservation queue. + * to the tail of the partially populated reservation queue. 
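
The fullpop sysctl above walks every physical segment in superpage-sized steps; the extra "paddr + VM_LEVEL_0_SIZE > paddr" test guards against wraparound at the top of the physical address space. A small standalone sketch of that walk, with a 2MB superpage size assumed purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define LEVEL0_SIZE    ((uint64_t)2 * 1024 * 1024)         /* assumed 2MB */
#define roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1))    /* y: power of 2 */

int
main(void)
{
	uint64_t start = 0x1ff234000ULL, end = 0x200000000ULL, paddr;
	int n = 0;

	paddr = roundup2(start, LEVEL0_SIZE);
	/*
	 * The first test rejects wraparound near the top of the address
	 * space; the second keeps the whole candidate inside the segment.
	 */
	while (paddr + LEVEL0_SIZE > paddr && paddr + LEVEL0_SIZE <= end) {
		n++;
		paddr += LEVEL0_SIZE;
	}
	printf("%d aligned 2MB candidates in the segment\n", n);	/* 6 */
	return (0);
}
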
* * The free page queue must be locked. */ static void -vm_reserv_populate(vm_reserv_t rv) +vm_reserv_populate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); + KASSERT(popmap_is_clear(rv->popmap, index), + ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, + index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, @@ -290,6 +388,7 @@ TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } + popmap_set(rv->popmap, index); rv->popcnt++; if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; @@ -308,14 +407,18 @@ * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * + * The page "mpred" must immediately precede the offset "pindex" within the + * specified object. + * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, - vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) + vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, + vm_page_t mpred) { vm_paddr_t pa, size; - vm_page_t m, m_ret, mpred, msucc; + vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; @@ -352,10 +455,11 @@ /* * Look for an existing reservation. */ - mpred = vm_radix_lookup_le(&object->rtree, pindex); if (mpred != NULL) { + KASSERT(mpred->object == object, + ("vm_reserv_alloc_contig: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); + ("vm_reserv_alloc_contig: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; @@ -364,7 +468,7 @@ msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); + ("vm_reserv_alloc_contig: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; @@ -460,9 +564,13 @@ KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", rv)); + for (i = 0; i < NPOPMAP; i++) + KASSERT(rv->popmap[i] == 0, + ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", + rv)); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) - vm_reserv_populate(rv); + vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; @@ -489,15 +597,15 @@ return (NULL); /* Handle vm_page_rename(m, new_object, ...). */ for (i = 0; i < npages; i++) - if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0) + if (popmap_is_set(rv->popmap, index + i)) return (NULL); for (i = 0; i < npages; i++) - vm_reserv_populate(rv); + vm_reserv_populate(rv, index + i); return (m); } /* - * Allocates a page from an existing or newly-created reservation. + * Allocates a page from an existing or newly created reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. 
@@ -510,6 +618,7 @@ vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; + int i, index; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); @@ -598,22 +707,93 @@ ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv)); - vm_reserv_populate(rv); - return (&rv->pages[VM_RESERV_INDEX(object, pindex)]); + for (i = 0; i < NPOPMAP; i++) + KASSERT(rv->popmap[i] == 0, + ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", + rv)); + index = VM_RESERV_INDEX(object, pindex); + vm_reserv_populate(rv, index); + return (&rv->pages[index]); /* * Found a matching reservation. */ found: - m = &rv->pages[VM_RESERV_INDEX(object, pindex)]; + index = VM_RESERV_INDEX(object, pindex); + m = &rv->pages[index]; /* Handle vm_page_rename(m, new_object, ...). */ - if ((m->flags & (PG_CACHED | PG_FREE)) == 0) + if (popmap_is_set(rv->popmap, index)) return (NULL); - vm_reserv_populate(rv); + vm_reserv_populate(rv, index); return (m); } /* + * Breaks the given reservation. All free pages in the reservation + * are returned to the physical memory allocator. The reservation's + * population count and map are reset to their initial state. + * + * The given reservation must not be in the partially populated reservation + * queue. The free page queue lock must be held. + */ +static void +vm_reserv_break(vm_reserv_t rv) +{ + int begin_zeroes, hi, i, lo; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + KASSERT(rv->object != NULL, + ("vm_reserv_break: reserv %p is free", rv)); + KASSERT(!rv->inpartpopq, + ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv)); + LIST_REMOVE(rv, objq); + rv->object = NULL; + rv->pages->psind = 0; + i = hi = 0; + do { + /* Find the next 0 bit. Any previous 0 bits are < "hi". */ + lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); + if (lo == 0) { + /* Redundantly clears bits < "hi". */ + rv->popmap[i] = 0; + rv->popcnt -= NBPOPMAP - hi; + while (++i < NPOPMAP) { + lo = ffsl(~rv->popmap[i]); + if (lo == 0) { + rv->popmap[i] = 0; + rv->popcnt -= NBPOPMAP; + } else + break; + } + if (i == NPOPMAP) + break; + hi = 0; + } + KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo)); + /* Convert from ffsl() to ordinary bit numbering. */ + lo--; + if (lo > 0) { + /* Redundantly clears bits < "hi". */ + rv->popmap[i] &= ~((1UL << lo) - 1); + rv->popcnt -= lo - hi; + } + begin_zeroes = NBPOPMAP * i + lo; + /* Find the next 1 bit. */ + do + hi = ffsl(rv->popmap[i]); + while (hi == 0 && ++i < NPOPMAP); + if (i != NPOPMAP) + /* Convert from ffsl() to ordinary bit numbering. */ + hi--; + vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i + + hi - begin_zeroes); + } while (i < NPOPMAP); + KASSERT(rv->popcnt == 0, + ("vm_reserv_break: reserv %p's popcnt is corrupted", rv)); + vm_reserv_broken++; +} + +/* * Breaks all reservations belonging to the given object. 
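
The new vm_reserv_break() above frees whole runs of clear popmap bits at a time, using ffsl() on the complemented word to find the next free page and on the word itself to find the next used page. The same scanning idiom, reduced to a single word of plain userspace C (ffsl_() is a portable stand-in for the kernel's ffsl()):

#include <stdio.h>

#define NBPOPMAP  (8 * (int)sizeof(unsigned long))

static int
ffsl_(unsigned long mask)		/* 1-based index of the lowest set bit */
{
	int bit;

	if (mask == 0)
		return (0);
	for (bit = 1; (mask & 1) == 0; bit++)
		mask >>= 1;
	return (bit);
}

int
main(void)
{
	/* Bits 0-2 and 8-15 are "in use"; every other page is free. */
	unsigned long popmap = 0xff07UL;
	int hi = 0, lo;

	while (hi < NBPOPMAP) {
		/*
		 * Next clear bit at or above "hi": mask off the bits below
		 * "hi" plus the used bits, then take the complement's
		 * lowest set bit.
		 */
		lo = ffsl_(~(((1UL << hi) - 1) | popmap));
		if (lo == 0)
			break;			/* no free pages left */
		lo--;				/* ffsl() is 1-based */
		/* The next set bit at or above "lo" bounds the free run. */
		hi = ffsl_(popmap & ~((1UL << lo) - 1));
		hi = (hi == 0) ? NBPOPMAP : hi - 1;
		printf("free run: pages %d-%d\n", lo, hi - 1);
	}
	return (0);
}
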
*/ void @@ -620,7 +800,6 @@ vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; - int i; mtx_lock(&vm_page_queue_free_mtx); while ((rv = LIST_FIRST(&object->rvq)) != NULL) { @@ -630,18 +809,7 @@ TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } - LIST_REMOVE(rv, objq); - rv->object = NULL; - for (i = 0; i < VM_LEVEL_0_NPAGES; i++) { - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) - vm_phys_free_pages(&rv->pages[i], 0); - else - rv->popcnt--; - } - KASSERT(rv->popcnt == 0, - ("vm_reserv_break_all: reserv %p's popcnt is corrupted", - rv)); - vm_reserv_broken++; + vm_reserv_break(rv); } mtx_unlock(&vm_page_queue_free_mtx); } @@ -661,10 +829,7 @@ rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); - if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE) - vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages, - VM_LEVEL_0_ORDER); - vm_reserv_depopulate(rv); + vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } @@ -678,15 +843,18 @@ vm_reserv_init(void) { vm_paddr_t paddr; - int i; + struct vm_phys_seg *seg; + int segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ - for (i = 0; phys_avail[i + 1] != 0; i += 2) { - paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE); - while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) { + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); + while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); paddr += VM_LEVEL_0_SIZE; @@ -695,77 +863,50 @@ } /* - * Returns a reservation level if the given page belongs to a fully-populated - * reservation and -1 otherwise. + * Returns true if the given page belongs to a reservation and that page is + * free. Otherwise, returns false. */ +bool +vm_reserv_is_page_free(vm_page_t m) +{ + vm_reserv_t rv; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + rv = vm_reserv_from_page(m); + if (rv->object == NULL) + return (false); + return (popmap_is_clear(rv->popmap, m - rv->pages)); +} + +/* + * If the given page belongs to a reservation, returns the level of that + * reservation. Otherwise, returns -1. + */ int -vm_reserv_level_iffullpop(vm_page_t m) +vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); - return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); + return (rv->object != NULL ? 0 : -1); } /* - * Prepare for the reactivation of a cached page. - * - * First, suppose that the given page "m" was allocated individually, i.e., not - * as part of a reservation, and cached. Then, suppose a reservation - * containing "m" is allocated by the same object. Although "m" and the - * reservation belong to the same object, "m"'s pindex may not match the - * reservation's. - * - * The free page queue must be locked. + * Returns a reservation level if the given page belongs to a fully populated + * reservation and -1 otherwise. 
*/ -boolean_t -vm_reserv_reactivate_page(vm_page_t m) +int +vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; - int i, m_index; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); - if (rv->object == NULL) - return (FALSE); - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_reserv_uncache_page: page %p is not cached", m)); - if (m->object == rv->object && - m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex)) - vm_reserv_populate(rv); - else { - KASSERT(rv->inpartpopq, - ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE", - rv)); - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); - rv->inpartpopq = FALSE; - LIST_REMOVE(rv, objq); - rv->object = NULL; - /* Don't vm_phys_free_pages(m, 0). */ - m_index = m - rv->pages; - for (i = 0; i < m_index; i++) { - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) - vm_phys_free_pages(&rv->pages[i], 0); - else - rv->popcnt--; - } - for (i++; i < VM_LEVEL_0_NPAGES; i++) { - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) - vm_phys_free_pages(&rv->pages[i], 0); - else - rv->popcnt--; - } - KASSERT(rv->popcnt == 0, - ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted", - rv)); - vm_reserv_broken++; - } - return (TRUE); + return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* - * Breaks the given partially-populated reservation, releasing its cached and - * free pages to the physical memory allocator. + * Breaks the given partially populated reservation, releasing its free pages + * to the physical memory allocator. * * The free page queue lock must be held. */ @@ -772,32 +913,20 @@ static void vm_reserv_reclaim(vm_reserv_t rv) { - int i; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->inpartpopq, - ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv)); + ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; - KASSERT(rv->object != NULL, - ("vm_reserv_reclaim: reserv %p is free", rv)); - LIST_REMOVE(rv, objq); - rv->object = NULL; - for (i = 0; i < VM_LEVEL_0_NPAGES; i++) { - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) - vm_phys_free_pages(&rv->pages[i], 0); - else - rv->popcnt--; - } - KASSERT(rv->popcnt == 0, - ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv)); + vm_reserv_break(rv); vm_reserv_reclaimed++; } /* - * Breaks the reservation at the head of the partially-populated reservation - * queue, releasing its cached and free pages to the physical memory - * allocator. Returns TRUE if a reservation is broken and FALSE otherwise. + * Breaks the reservation at the head of the partially populated reservation + * queue, releasing its free pages to the physical memory allocator. Returns + * TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ @@ -815,11 +944,10 @@ } /* - * Searches the partially-populated reservation queue for the least recently - * active reservation with unused pages, i.e., cached or free, that satisfy the - * given request for contiguous physical memory. If a satisfactory reservation - * is found, it is broken. Returns TRUE if a reservation is broken and FALSE - * otherwise. + * Searches the partially populated reservation queue for the least recently + * changed reservation with free pages that satisfy the given request for + * contiguous physical memory. If a satisfactory reservation is found, it is + * broken. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. 
*/ @@ -827,9 +955,9 @@ vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { - vm_paddr_t pa, pa_length, size; + vm_paddr_t pa, size; vm_reserv_t rv; - int i; + int hi, i, lo, low_index, next_free; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (npages > VM_LEVEL_0_NPAGES - 1) @@ -838,30 +966,72 @@ TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); if (pa + PAGE_SIZE - size < low) { - /* this entire reservation is too low; go to next */ + /* This entire reservation is too low; go to next. */ continue; } - pa_length = 0; - for (i = 0; i < VM_LEVEL_0_NPAGES; i++) - if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) { - pa_length += PAGE_SIZE; - if (pa_length == PAGE_SIZE) { - pa = VM_PAGE_TO_PHYS(&rv->pages[i]); - if (pa + size > high) { - /* skip to next reservation */ - break; - } else if (pa < low || - (pa & (alignment - 1)) != 0 || - ((pa ^ (pa + size - 1)) & - ~(boundary - 1)) != 0) - pa_length = 0; + pa = VM_PAGE_TO_PHYS(&rv->pages[0]); + if (pa + size > high) { + /* This entire reservation is too high; go to next. */ + continue; + } + if (pa < low) { + /* Start the search for free pages at "low". */ + low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT; + i = low_index / NBPOPMAP; + hi = low_index % NBPOPMAP; + } else + i = hi = 0; + do { + /* Find the next free page. */ + lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); + while (lo == 0 && ++i < NPOPMAP) + lo = ffsl(~rv->popmap[i]); + if (i == NPOPMAP) + break; + /* Convert from ffsl() to ordinary bit numbering. */ + lo--; + next_free = NBPOPMAP * i + lo; + pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); + KASSERT(pa >= low, + ("vm_reserv_reclaim_contig: pa is too low")); + if (pa + size > high) { + /* The rest of this reservation is too high. */ + break; + } else if ((pa & (alignment - 1)) != 0 || + ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { + /* + * The current page doesn't meet the alignment + * and/or boundary requirements. Continue + * searching this reservation until the rest + * of its free pages are either excluded or + * exhausted. + */ + hi = lo + 1; + if (hi >= NBPOPMAP) { + hi = 0; + i++; } - if (pa_length >= size) { + continue; + } + /* Find the next used page. */ + hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); + while (hi == 0 && ++i < NPOPMAP) { + if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= + size) { vm_reserv_reclaim(rv); return (TRUE); } - } else - pa_length = 0; + hi = ffsl(rv->popmap[i]); + } + /* Convert from ffsl() to ordinary bit numbering. */ + if (i != NPOPMAP) + hi--; + if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= + size) { + vm_reserv_reclaim(rv); + return (TRUE); + } + } while (i < NPOPMAP); } return (FALSE); } @@ -892,6 +1062,23 @@ } /* + * Returns the size (in bytes) of a reservation of the specified level. + */ +int +vm_reserv_size(int level) +{ + + switch (level) { + case 0: + return (VM_LEVEL_0_SIZE); + case -1: + return (PAGE_SIZE); + default: + return (0); + } +} + +/* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ @@ -925,4 +1112,18 @@ return (new_end); } +/* + * Returns the superpage containing the given page. + */ +vm_page_t +vm_reserv_to_superpage(vm_page_t m) +{ + vm_reserv_t rv; + + VM_OBJECT_ASSERT_LOCKED(m->object); + rv = vm_reserv_from_page(m); + return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ? 
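
The reclaim path above accepts a candidate range only if its start is suitably aligned and the range does not straddle a boundary-sized window; both tests are pure bit arithmetic on the physical address. A standalone sketch with made-up addresses (alignment and boundary must be powers of two):

#include <stdint.h>
#include <stdio.h>

static int
range_ok(uint64_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
{
	if ((pa & (alignment - 1)) != 0)
		return (0);		/* start is not aligned */
	/*
	 * The first and last byte must differ only in bits below
	 * log2(boundary), i.e. lie in the same boundary-sized window.
	 * A boundary of 0 (no restriction) passes because 0 - 1 is all ones.
	 */
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (0);		/* range crosses a boundary */
	return (1);
}

int
main(void)
{
	/* 64KB at 0x1f0000 stays inside one 1MB window... */
	printf("%d\n", range_ok(0x1f0000, 0x10000, 0x10000, 0x100000)); /* 1 */
	/* ...but 128KB at 0xf0000 crosses the 1MB boundary at 0x100000. */
	printf("%d\n", range_ok(0x0f0000, 0x20000, 0x10000, 0x100000)); /* 0 */
	return (0);
}
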
+ rv->pages : NULL); +} + #endif /* VM_NRESERVLEVEL > 0 */ Modified: trunk/sys/vm/vm_reserv.h =================================================================== --- trunk/sys/vm/vm_reserv.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_reserv.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -29,7 +29,7 @@ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $ + * $FreeBSD: stable/11/sys/vm/vm_reserv.h 324399 2017-10-07 20:22:04Z alc $ */ /* @@ -48,21 +48,24 @@ */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary); + u_long alignment, vm_paddr_t boundary, vm_page_t mpred); vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); void vm_reserv_break_all(vm_object_t object); boolean_t vm_reserv_free_page(vm_page_t m); void vm_reserv_init(void); +bool vm_reserv_is_page_free(vm_page_t m); +int vm_reserv_level(vm_page_t m); int vm_reserv_level_iffullpop(vm_page_t m); -boolean_t vm_reserv_reactivate_page(vm_page_t m); boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); boolean_t vm_reserv_reclaim_inactive(void); void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset); +int vm_reserv_size(int level); vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water); +vm_page_t vm_reserv_to_superpage(vm_page_t m); #endif /* VM_NRESERVLEVEL > 0 */ #endif /* _KERNEL */ Added: trunk/sys/vm/vm_swapout.c =================================================================== --- trunk/sys/vm/vm_swapout.c (rev 0) +++ trunk/sys/vm/vm_swapout.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -0,0 +1,955 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2005 Yahoo! Technologies Norway AS + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution at CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout.c 338335 2018-08-27 09:39:34Z kib $"); + +#include "opt_kstack_pages.h" +#include "opt_kstack_max_pages.h" +#include "opt_vm.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/limits.h> +#include <sys/kernel.h> +#include <sys/eventhandler.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/_kstack_cache.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/mount.h> +#include <sys/racct.h> +#include <sys/resourcevar.h> +#include <sys/sched.h> +#include <sys/sdt.h> +#include <sys/signalvar.h> +#include <sys/smp.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/vmmeter.h> +#include <sys/rwlock.h> +#include <sys/sx.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_pageout.h> +#include <vm/vm_pager.h> +#include <vm/vm_phys.h> +#include <vm/swap_pager.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +/* the kernel process "vm_daemon" */ +static void vm_daemon(void); +static struct proc *vmproc; + +static struct kproc_desc vm_kp = { + "vmdaemon", + vm_daemon, + &vmproc +}; +SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); + +static int vm_swap_enabled = 1; +static int vm_swap_idle_enabled = 0; + +SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, + &vm_swap_enabled, 0, + "Enable entire process swapout"); +SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, + &vm_swap_idle_enabled, 0, + "Allow swapout on idle criteria"); + +/* + * Swap_idle_threshold1 is the guaranteed swapped in time for a process + */ +static int swap_idle_threshold1 = 2; +SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, + &swap_idle_threshold1, 0, + "Guaranteed swapped in time for a process"); + +/* + * Swap_idle_threshold2 is the time that a process can be idle before + * it will be swapped out, if idle swapping is enabled. + */ +static int swap_idle_threshold2 = 10; +SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, + &swap_idle_threshold2, 0, + "Time before a process will be swapped out"); + +static int vm_pageout_req_swapout; /* XXX */ +static int vm_daemon_needed; +static struct mtx vm_daemon_mtx; +/* Allow for use by vm_pageout before vm_daemon is initialized. */ +MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); + +static int swapped_cnt; +static int swap_inprogress; /* Pending swap-ins done outside swapper. */ +static int last_swapin; + +static void swapclear(struct proc *); +static int swapout(struct proc *); +static void vm_swapout_map_deactivate_pages(vm_map_t, long); +static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long); +static void swapout_procs(int action); +static void vm_req_vmdaemon(int req); +static void vm_thread_swapout(struct thread *td); + +/* + * vm_swapout_object_deactivate_pages + * + * Deactivate enough pages to satisfy the inactive target + * requirements. + * + * The object and map must be locked. 
+ */ +static void +vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, + long desired) +{ + vm_object_t backing_object, object; + vm_page_t p; + int act_delta, remove_mode; + + VM_OBJECT_ASSERT_LOCKED(first_object); + if ((first_object->flags & OBJ_FICTITIOUS) != 0) + return; + for (object = first_object;; object = backing_object) { + if (pmap_resident_count(pmap) <= desired) + goto unlock_return; + VM_OBJECT_ASSERT_LOCKED(object); + if ((object->flags & OBJ_UNMANAGED) != 0 || + object->paging_in_progress != 0) + goto unlock_return; + + remove_mode = 0; + if (object->shadow_count > 1) + remove_mode = 1; + /* + * Scan the object's entire memory queue. + */ + TAILQ_FOREACH(p, &object->memq, listq) { + if (pmap_resident_count(pmap) <= desired) + goto unlock_return; + if (should_yield()) + goto unlock_return; + if (vm_page_busied(p)) + continue; + PCPU_INC(cnt.v_pdpages); + vm_page_lock(p); + if (p->wire_count != 0 || p->hold_count != 0 || + !pmap_page_exists_quick(pmap, p)) { + vm_page_unlock(p); + continue; + } + act_delta = pmap_ts_referenced(p); + if ((p->aflags & PGA_REFERENCED) != 0) { + if (act_delta == 0) + act_delta = 1; + vm_page_aflag_clear(p, PGA_REFERENCED); + } + if (!vm_page_active(p) && act_delta != 0) { + vm_page_activate(p); + p->act_count += act_delta; + } else if (vm_page_active(p)) { + if (act_delta == 0) { + p->act_count -= min(p->act_count, + ACT_DECLINE); + if (!remove_mode && p->act_count == 0) { + pmap_remove_all(p); + vm_page_deactivate(p); + } else + vm_page_requeue(p); + } else { + vm_page_activate(p); + if (p->act_count < ACT_MAX - + ACT_ADVANCE) + p->act_count += ACT_ADVANCE; + vm_page_requeue(p); + } + } else if (vm_page_inactive(p)) + pmap_remove_all(p); + vm_page_unlock(p); + } + if ((backing_object = object->backing_object) == NULL) + goto unlock_return; + VM_OBJECT_RLOCK(backing_object); + if (object != first_object) + VM_OBJECT_RUNLOCK(object); + } +unlock_return: + if (object != first_object) + VM_OBJECT_RUNLOCK(object); +} + +/* + * deactivate some number of pages in a map, try to do it fairly, but + * that is really hard to do. + */ +static void +vm_swapout_map_deactivate_pages(vm_map_t map, long desired) +{ + vm_map_entry_t tmpe; + vm_object_t obj, bigobj; + int nothingwired; + + if (!vm_map_trylock_read(map)) + return; + + bigobj = NULL; + nothingwired = TRUE; + + /* + * first, search out the biggest object, and try to free pages from + * that. + */ + tmpe = map->header.next; + while (tmpe != &map->header) { + if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { + obj = tmpe->object.vm_object; + if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { + if (obj->shadow_count <= 1 && + (bigobj == NULL || + bigobj->resident_page_count < + obj->resident_page_count)) { + if (bigobj != NULL) + VM_OBJECT_RUNLOCK(bigobj); + bigobj = obj; + } else + VM_OBJECT_RUNLOCK(obj); + } + } + if (tmpe->wired_count > 0) + nothingwired = FALSE; + tmpe = tmpe->next; + } + + if (bigobj != NULL) { + vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired); + VM_OBJECT_RUNLOCK(bigobj); + } + /* + * Next, hunt around for other pages to deactivate. We actually + * do this search sort of wrong -- .text first is not the best idea. 
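
The deactivation scan above ages each resident page: referenced pages are (re)activated or credited with ACT_ADVANCE, while unreferenced active pages lose ACT_DECLINE and drop to the inactive queue at zero. A toy model of that state machine; the ACT_* constants live in vm_page.h outside this diff and are restated here as assumptions:

#include <stdio.h>

#define ACT_DECLINE	1		/* assumed, see vm_page.h */
#define ACT_ADVANCE	3
#define ACT_MAX		64

struct toy_page {
	int	act_count;
	int	active;			/* 1 = active queue, 0 = inactive */
};

static void
age_page(struct toy_page *p, int referenced)
{
	if (!p->active && referenced) {
		p->active = 1;				/* reactivate */
		p->act_count += 1;
	} else if (p->active) {
		if (!referenced) {
			p->act_count -= (p->act_count < ACT_DECLINE) ?
			    p->act_count : ACT_DECLINE;
			if (p->act_count == 0)
				p->active = 0;		/* deactivate */
		} else if (p->act_count < ACT_MAX - ACT_ADVANCE)
			p->act_count += ACT_ADVANCE;
	}
}

int
main(void)
{
	struct toy_page p = { 5, 1 };
	int i;

	for (i = 0; i < 10; i++)
		age_page(&p, 0);	/* ten passes with no references */
	printf("act_count=%d active=%d\n", p.act_count, p.active); /* 0 0 */
	return (0);
}
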
+ */ + tmpe = map->header.next; + while (tmpe != &map->header) { + if (pmap_resident_count(vm_map_pmap(map)) <= desired) + break; + if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { + obj = tmpe->object.vm_object; + if (obj != NULL) { + VM_OBJECT_RLOCK(obj); + vm_swapout_object_deactivate_pages(map->pmap, + obj, desired); + VM_OBJECT_RUNLOCK(obj); + } + } + tmpe = tmpe->next; + } + + /* + * Remove all mappings if a process is swapped out, this will free page + * table pages. + */ + if (desired == 0 && nothingwired) { + pmap_remove(vm_map_pmap(map), vm_map_min(map), + vm_map_max(map)); + } + + vm_map_unlock_read(map); +} + +/* + * Swap out requests + */ +#define VM_SWAP_NORMAL 1 +#define VM_SWAP_IDLE 2 + +void +vm_swapout_run(void) +{ + + if (vm_swap_enabled) + vm_req_vmdaemon(VM_SWAP_NORMAL); +} + +/* + * Idle process swapout -- run once per second when pagedaemons are + * reclaiming pages. + */ +void +vm_swapout_run_idle(void) +{ + static long lsec; + + if (!vm_swap_idle_enabled || time_second == lsec) + return; + vm_req_vmdaemon(VM_SWAP_IDLE); + lsec = time_second; +} + +static void +vm_req_vmdaemon(int req) +{ + static int lastrun = 0; + + mtx_lock(&vm_daemon_mtx); + vm_pageout_req_swapout |= req; + if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { + wakeup(&vm_daemon_needed); + lastrun = ticks; + } + mtx_unlock(&vm_daemon_mtx); +} + +static void +vm_daemon(void) +{ + struct rlimit rsslim; + struct proc *p; + struct thread *td; + struct vmspace *vm; + int breakout, swapout_flags, tryagain, attempts; +#ifdef RACCT + uint64_t rsize, ravailable; +#endif + + while (TRUE) { + mtx_lock(&vm_daemon_mtx); + msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", +#ifdef RACCT + racct_enable ? hz : 0 +#else + 0 +#endif + ); + swapout_flags = vm_pageout_req_swapout; + vm_pageout_req_swapout = 0; + mtx_unlock(&vm_daemon_mtx); + if (swapout_flags) + swapout_procs(swapout_flags); + + /* + * scan the processes for exceeding their rlimits or if + * process is swapped out -- deactivate pages + */ + tryagain = 0; + attempts = 0; +again: + attempts++; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + vm_pindex_t limit, size; + + /* + * if this is a system process or if we have already + * looked at this process, skip it. + */ + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL || + p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { + PROC_UNLOCK(p); + continue; + } + /* + * if the process is in a non-running type state, + * don't touch it. + */ + breakout = 0; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td) && + !TD_IS_SUSPENDED(td)) { + thread_unlock(td); + breakout = 1; + break; + } + thread_unlock(td); + } + if (breakout) { + PROC_UNLOCK(p); + continue; + } + /* + * get a limit + */ + lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); + limit = OFF_TO_IDX( + qmin(rsslim.rlim_cur, rsslim.rlim_max)); + + /* + * let processes that are swapped out really be + * swapped out set the limit to nothing (will force a + * swap-out.) 
+ */ + if ((p->p_flag & P_INMEM) == 0) + limit = 0; /* XXX */ + vm = vmspace_acquire_ref(p); + _PHOLD_LITE(p); + PROC_UNLOCK(p); + if (vm == NULL) { + PRELE(p); + continue; + } + sx_sunlock(&allproc_lock); + + size = vmspace_resident_count(vm); + if (size >= limit) { + vm_swapout_map_deactivate_pages( + &vm->vm_map, limit); + size = vmspace_resident_count(vm); + } +#ifdef RACCT + if (racct_enable) { + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + if (p->p_state == PRS_NORMAL) + racct_set(p, RACCT_RSS, rsize); + ravailable = racct_get_available(p, RACCT_RSS); + PROC_UNLOCK(p); + if (rsize > ravailable) { + /* + * Don't be overly aggressive; this + * might be an innocent process, + * and the limit could've been exceeded + * by some memory hog. Don't try + * to deactivate more than 1/4th + * of process' resident set size. + */ + if (attempts <= 8) { + if (ravailable < rsize - + (rsize / 4)) { + ravailable = rsize - + (rsize / 4); + } + } + vm_swapout_map_deactivate_pages( + &vm->vm_map, + OFF_TO_IDX(ravailable)); + /* Update RSS usage after paging out. */ + size = vmspace_resident_count(vm); + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + if (p->p_state == PRS_NORMAL) + racct_set(p, RACCT_RSS, rsize); + PROC_UNLOCK(p); + if (rsize > ravailable) + tryagain = 1; + } + } +#endif + vmspace_free(vm); + sx_slock(&allproc_lock); + PRELE(p); + } + sx_sunlock(&allproc_lock); + if (tryagain != 0 && attempts <= 10) { + maybe_yield(); + goto again; + } + } +} + +/* + * Allow a thread's kernel stack to be paged out. + */ +static void +vm_thread_swapout(struct thread *td) +{ + vm_object_t ksobj; + vm_page_t m; + int i, pages; + + cpu_thread_swapout(td); + pages = td->td_kstack_pages; + ksobj = td->td_kstack_obj; + pmap_qremove(td->td_kstack, pages); + VM_OBJECT_WLOCK(ksobj); + for (i = 0; i < pages; i++) { + m = vm_page_lookup(ksobj, i); + if (m == NULL) + panic("vm_thread_swapout: kstack already missing?"); + vm_page_dirty(m); + vm_page_lock(m); + vm_page_unwire(m, PQ_INACTIVE); + vm_page_unlock(m); + } + VM_OBJECT_WUNLOCK(ksobj); +} + +/* + * Bring the kernel stack for a specified thread back in. + */ +static void +vm_thread_swapin(struct thread *td, int oom_alloc) +{ + vm_object_t ksobj; + vm_page_t ma[KSTACK_MAX_PAGES]; + int a, count, i, j, pages, rv; + + pages = td->td_kstack_pages; + ksobj = td->td_kstack_obj; + VM_OBJECT_WLOCK(ksobj); + (void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma, + pages); + for (i = 0; i < pages;) { + vm_page_assert_xbusied(ma[i]); + if (ma[i]->valid == VM_PAGE_BITS_ALL) { + vm_page_xunbusy(ma[i]); + i++; + continue; + } + vm_object_pip_add(ksobj, 1); + for (j = i + 1; j < pages; j++) + if (ma[j]->valid == VM_PAGE_BITS_ALL) + break; + rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a); + KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i])); + count = min(a + 1, j - i); + rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL); + KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d", + __func__, td->td_proc->p_pid)); + vm_object_pip_wakeup(ksobj); + for (j = i; j < i + count; j++) + vm_page_xunbusy(ma[j]); + i += count; + } + VM_OBJECT_WUNLOCK(ksobj); + pmap_qenter(td->td_kstack, ma, pages); + cpu_thread_swapin(td); +} + +void +faultin(struct proc *p) +{ + struct thread *td; + int oom_alloc; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + /* + * If another process is swapping in this process, + * just wait until it finishes. 
+ */ + if (p->p_flag & P_SWAPPINGIN) { + while (p->p_flag & P_SWAPPINGIN) + msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); + return; + } + + if ((p->p_flag & P_INMEM) == 0) { + oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM : + VM_ALLOC_NORMAL; + + /* + * Don't let another thread swap process p out while we are + * busy swapping it in. + */ + ++p->p_lock; + p->p_flag |= P_SWAPPINGIN; + PROC_UNLOCK(p); + sx_xlock(&allproc_lock); + MPASS(swapped_cnt > 0); + swapped_cnt--; + if (curthread != &thread0) + swap_inprogress++; + sx_xunlock(&allproc_lock); + + /* + * We hold no lock here because the list of threads + * can not change while all threads in the process are + * swapped out. + */ + FOREACH_THREAD_IN_PROC(p, td) + vm_thread_swapin(td, oom_alloc); + + if (curthread != &thread0) { + sx_xlock(&allproc_lock); + MPASS(swap_inprogress > 0); + swap_inprogress--; + last_swapin = ticks; + sx_xunlock(&allproc_lock); + } + PROC_LOCK(p); + swapclear(p); + p->p_swtick = ticks; + + /* Allow other threads to swap p out now. */ + wakeup(&p->p_flag); + --p->p_lock; + } +} + +/* + * This swapin algorithm attempts to swap-in processes only if there + * is enough space for them. Of course, if a process waits for a long + * time, it will be swapped in anyway. + */ + +static struct proc * +swapper_selector(bool wkilled_only) +{ + struct proc *p, *res; + struct thread *td; + int ppri, pri, slptime, swtime; + + sx_assert(&allproc_lock, SA_SLOCKED); + if (swapped_cnt == 0) + return (NULL); + res = NULL; + ppri = INT_MIN; + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT | + P_SWAPPINGIN | P_INMEM)) != 0) { + PROC_UNLOCK(p); + continue; + } + if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) { + /* + * A swapped-out process might have mapped a + * large portion of the system's pages as + * anonymous memory. There is no other way to + * release the memory other than to kill the + * process, for which we need to swap it in. + */ + return (p); + } + if (wkilled_only) { + PROC_UNLOCK(p); + continue; + } + swtime = (ticks - p->p_swtick) / hz; + FOREACH_THREAD_IN_PROC(p, td) { + /* + * An otherwise runnable thread of a process + * swapped out has only the TDI_SWAPPED bit set. + */ + thread_lock(td); + if (td->td_inhibitors == TDI_SWAPPED) { + slptime = (ticks - td->td_slptick) / hz; + pri = swtime + slptime; + if ((td->td_flags & TDF_SWAPINREQ) == 0) + pri -= p->p_nice * 8; + /* + * if this thread is higher priority + * and there is enough space, then select + * this process instead of the previous + * selection. + */ + if (pri > ppri) { + res = p; + ppri = pri; + } + } + thread_unlock(td); + } + PROC_UNLOCK(p); + } + + if (res != NULL) + PROC_LOCK(res); + return (res); +} + +#define SWAPIN_INTERVAL (MAXSLP * hz / 2) + +/* + * Limit swapper to swap in one non-WKILLED process in MAXSLP/2 + * interval, assuming that there is: + * - no memory shortage; + * - no parallel swap-ins; + * - no other swap-ins in the current SWAPIN_INTERVAL. 
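
swapper_selector() above scores each swapped-out candidate as seconds swapped out plus seconds asleep, biased by the nice value unless a swap-in was explicitly requested, and picks the highest score. A standalone illustration of that scoring with made-up numbers:

#include <stdio.h>

struct candidate {
	const char	*name;
	int		swtime;		/* seconds since swap-out */
	int		slptime;	/* seconds the thread has slept */
	int		nice;		/* -20 .. 20 */
	int		swapinreq;	/* stands in for TDF_SWAPINREQ */
};

static int
swapin_pri(const struct candidate *c)
{
	int pri = c->swtime + c->slptime;

	if (!c->swapinreq)
		pri -= c->nice * 8;	/* nicer processes wait longer */
	return (pri);
}

int
main(void)
{
	struct candidate a = { "batch", 120, 90, 10, 0 };
	struct candidate b = { "shell", 100, 80,  0, 0 };

	printf("%s: %d\n", a.name, swapin_pri(&a));	/* 120 + 90 - 80 = 130 */
	printf("%s: %d\n", b.name, swapin_pri(&b));	/* 100 + 80 - 0 = 180 */
	return (0);
}
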
+ */ +static bool +swapper_wkilled_only(void) +{ + + return (vm_page_count_min() || swap_inprogress > 0 || + (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL); +} + +void +swapper(void) +{ + struct proc *p; + + for (;;) { + sx_slock(&allproc_lock); + p = swapper_selector(swapper_wkilled_only()); + sx_sunlock(&allproc_lock); + + if (p == NULL) { + tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL); + } else { + PROC_LOCK_ASSERT(p, MA_OWNED); + + /* + * Another process may be bringing or may have + * already brought this process in while we + * traverse all threads. Or, this process may + * have exited or even being swapped out + * again. + */ + if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM | + P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) { + faultin(p); + } + PROC_UNLOCK(p); + } + } +} + +/* + * First, if any processes have been sleeping or stopped for at least + * "swap_idle_threshold1" seconds, they are swapped out. If, however, + * no such processes exist, then the longest-sleeping or stopped + * process is swapped out. Finally, and only as a last resort, if + * there are no sleeping or stopped processes, the longest-resident + * process is swapped out. + */ +static void +swapout_procs(int action) +{ + struct proc *p; + struct thread *td; + int slptime; + bool didswap, doswap; + + MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0); + + didswap = false; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + /* + * Filter out not yet fully constructed processes. Do + * not swap out held processes. Avoid processes which + * are system, exiting, execing, traced, already swapped + * out or are in the process of being swapped in or out. + */ + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag & + (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE | + P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != + P_INMEM) { + PROC_UNLOCK(p); + continue; + } + + /* + * Further consideration of this process for swap out + * requires iterating over its threads. We release + * allproc_lock here so that process creation and + * destruction are not blocked while we iterate. + * + * To later reacquire allproc_lock and resume + * iteration over the allproc list, we will first have + * to release the lock on the process. We place a + * hold on the process so that it remains in the + * allproc list while it is unlocked. + */ + _PHOLD_LITE(p); + sx_sunlock(&allproc_lock); + + /* + * Do not swapout a realtime process. + * Guarantee swap_idle_threshold1 time in memory. + * If the system is under memory stress, or if we are + * swapping idle processes >= swap_idle_threshold2, + * then swap the process out. + */ + doswap = true; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + slptime = (ticks - td->td_slptick) / hz; + if (PRI_IS_REALTIME(td->td_pri_class) || + slptime < swap_idle_threshold1 || + !thread_safetoswapout(td) || + ((action & VM_SWAP_NORMAL) == 0 && + slptime < swap_idle_threshold2)) + doswap = false; + thread_unlock(td); + if (!doswap) + break; + } + if (doswap && swapout(p) == 0) + didswap = true; + + PROC_UNLOCK(p); + if (didswap) { + sx_xlock(&allproc_lock); + swapped_cnt++; + sx_downgrade(&allproc_lock); + } else + sx_slock(&allproc_lock); + PRELE(p); + } + sx_sunlock(&allproc_lock); + + /* + * If we swapped something out, and another process needed memory, + * then wakeup the sched process. 
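
swapout_procs() above only swaps out a process whose threads are all safely idle: never realtime, asleep at least swap_idle_threshold1 seconds, and, on an idle-only pass, asleep at least swap_idle_threshold2 seconds. The per-thread predicate, reduced to a standalone sketch using the default thresholds from this file (2s and 10s):

#include <stdbool.h>
#include <stdio.h>

#define VM_SWAP_NORMAL	1
#define VM_SWAP_IDLE	2

static const int swap_idle_threshold1 = 2;	/* guaranteed residency (s) */
static const int swap_idle_threshold2 = 10;	/* idle-swapout threshold (s) */

static bool
thread_swappable(int action, int slptime, bool realtime, bool safe)
{
	if (realtime || !safe)
		return (false);
	if (slptime < swap_idle_threshold1)
		return (false);
	/* On an idle-only pass, demand the longer idle time as well. */
	if ((action & VM_SWAP_NORMAL) == 0 && slptime < swap_idle_threshold2)
		return (false);
	return (true);
}

int
main(void)
{
	printf("%d\n", thread_swappable(VM_SWAP_NORMAL,  5, false, true)); /* 1 */
	printf("%d\n", thread_swappable(VM_SWAP_IDLE,    5, false, true)); /* 0 */
	printf("%d\n", thread_swappable(VM_SWAP_IDLE,   15, false, true)); /* 1 */
	return (0);
}
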
+ */ + if (didswap) + wakeup(&proc0); +} + +static void +swapclear(struct proc *p) +{ + struct thread *td; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + td->td_flags |= TDF_INMEM; + td->td_flags &= ~TDF_SWAPINREQ; + TD_CLR_SWAPPED(td); + if (TD_CAN_RUN(td)) + if (setrunnable(td)) { +#ifdef INVARIANTS + /* + * XXX: We just cleared TDI_SWAPPED + * above and set TDF_INMEM, so this + * should never happen. + */ + panic("not waking up swapper"); +#endif + } + thread_unlock(td); + } + p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT); + p->p_flag |= P_INMEM; +} + +static int +swapout(struct proc *p) +{ + struct thread *td; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + /* + * The states of this process and its threads may have changed + * by now. Assuming that there is only one pageout daemon thread, + * this process should still be in memory. + */ + KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) == + P_INMEM, ("swapout: lost a swapout race?")); + + /* + * Remember the resident count. + */ + p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); + + /* + * Check and mark all threads before we proceed. + */ + p->p_flag &= ~P_INMEM; + p->p_flag |= P_SWAPPINGOUT; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (!thread_safetoswapout(td)) { + thread_unlock(td); + swapclear(p); + return (EBUSY); + } + td->td_flags &= ~TDF_INMEM; + TD_SET_SWAPPED(td); + thread_unlock(td); + } + td = FIRST_THREAD_IN_PROC(p); + ++td->td_ru.ru_nswap; + PROC_UNLOCK(p); + + /* + * This list is stable because all threads are now prevented from + * running. The list is only modified in the context of a running + * thread in this process. + */ + FOREACH_THREAD_IN_PROC(p, td) + vm_thread_swapout(td); + + PROC_LOCK(p); + p->p_flag &= ~P_SWAPPINGOUT; + p->p_swtick = ticks; + return (0); +} Property changes on: trunk/sys/vm/vm_swapout.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/vm/vm_swapout_dummy.c =================================================================== --- trunk/sys/vm/vm_swapout_dummy.c (rev 0) +++ trunk/sys/vm/vm_swapout_dummy.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -0,0 +1,123 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2005 Yahoo! Technologies Norway AS + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution at CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout_dummy.c 325647 2017-11-10 13:17:40Z kib $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_pageout.h> + +static int vm_swap_enabled = 0; +SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, + &vm_swap_enabled, 0, + "Enable entire process swapout"); + +static int vm_swap_idle_enabled = 0; +SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, + &vm_swap_idle_enabled, 0, + "Allow swapout on idle criteria"); + +void +vm_swapout_run(void) +{ +} + +void +vm_swapout_run_idle(void) +{ +} + +void +faultin(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + if ((p->p_flag & P_INMEM) == 0) + panic("faultin: proc %p swapped out with NO_SWAPPING", p); +} + +void +swapper(void) +{ + + for (;;) + tsleep(&proc0, PVM, "swapin", MAXSLP * hz); +} Property changes on: trunk/sys/vm/vm_swapout_dummy.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Modified: trunk/sys/vm/vm_unix.c =================================================================== --- trunk/sys/vm/vm_unix.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_unix.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -44,7 +44,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_unix.c 341467 2018-12-04 15:04:48Z emaste $"); #include <sys/param.h> #include <sys/lock.h> @@ -72,9 +72,7 @@ */ /* ARGSUSED */ int -sys_obreak(td, uap) - struct thread *td; - struct obreak_args *uap; +sys_obreak(struct thread *td, struct obreak_args *uap) { struct vmspace *vm = td->td_proc->p_vmspace; vm_map_t map = &vm->vm_map; @@ -84,11 +82,9 @@ int error = 0; boolean_t do_map_wirefuture; - PROC_LOCK(td->td_proc); - datalim = lim_cur(td->td_proc, RLIMIT_DATA); - lmemlim = lim_cur(td->td_proc, RLIMIT_MEMLOCK); - vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM); - PROC_UNLOCK(td->td_proc); + datalim = lim_cur(td, RLIMIT_DATA); + lmemlim = lim_cur(td, RLIMIT_MEMLOCK); + vmemlim = lim_cur(td, RLIMIT_VMEM); do_map_wirefuture = FALSE; new = round_page((vm_offset_t)uap->nsize); @@ -167,7 +163,7 @@ #endif prot = VM_PROT_RW; #ifdef COMPAT_FREEBSD32 -#if defined(__amd64__) || defined(__ia64__) +#if defined(__amd64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32)) prot |= VM_PROT_EXECUTE; #endif @@ -248,9 +244,7 @@ */ /* ARGSUSED */ int -sys_ovadvise(td, uap) - struct thread *td; - struct ovadvise_args *uap; +sys_ovadvise(struct thread *td, struct ovadvise_args *uap) { /* START_GIANT_OPTIONAL */ /* END_GIANT_OPTIONAL */ Modified: trunk/sys/vm/vm_zeroidle.c =================================================================== --- trunk/sys/vm/vm_zeroidle.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vm_zeroidle.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vm_zeroidle.c 267992 2014-06-28 03:56:17Z hselasky $"); #include <opt_sched.h> @@ -56,10 +56,9 @@ 
#include <vm/vm_phys.h> static int idlezero_enable_default = 0; -TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default); /* Defer setting the enable flag until the kthread is running. */ static int idlezero_enable = 0; -SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, +SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RWTUN, &idlezero_enable, 0, "Allow the kernel to use idle cpu cycles to zero-out pages"); /* * Implement the pre-zeroed page mechanism. @@ -85,9 +84,9 @@ * fast sleeps. We also do not want to be continuously zeroing * pages because doing so may flush our L1 and L2 caches too much. */ - if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count)) + if (zero_state && vm_page_zero_count >= ZIDLE_LO(vm_cnt.v_free_count)) return (0); - if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) + if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count)) return (0); return (1); } @@ -99,7 +98,7 @@ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); zero_state = 0; if (vm_phys_zero_pages_idle()) { - if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) + if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count)) zero_state = 1; } } Modified: trunk/sys/vm/vnode_pager.c =================================================================== --- trunk/sys/vm/vnode_pager.c 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vnode_pager.c 2020-02-08 19:35:48 UTC (rev 12314) @@ -52,8 +52,10 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/vm/vnode_pager.c 331722 2018-03-29 02:50:57Z eadler $"); +#include "opt_vm.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -83,21 +85,27 @@ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); -static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int); +static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); +static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, + int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); +static int vnode_pager_generic_getpages_done(struct buf *); +static void vnode_pager_generic_getpages_done_async(struct buf *); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, + .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; +int vnode_async_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int @@ -157,14 +165,26 @@ return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); + umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { /* * don't double-terminate the object */ - if ((obj->flags & OBJ_DEAD) == 0) + if ((obj->flags & OBJ_DEAD) == 0) { vm_object_terminate(obj); - else + } else { + /* + * Waiters were already handled during object + * termination. The exclusive vnode lock hopefully + * prevented new waiters from referencing the dying + * object. 
+ */ + KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0, + ("OBJ_DISCONNECTWNT set obj %p flags %x", + obj, obj->flags)); + vp->v_object = NULL; VM_OBJECT_WUNLOCK(obj); + } } else { /* * Woe to the process that tries to page now :-). @@ -172,7 +192,7 @@ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } - vp->v_object = NULL; + KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } @@ -241,9 +261,12 @@ VI_UNLOCK(vp); } else { object->ref_count++; +#if VM_NRESERVLEVEL > 0 + vm_object_color(object, 0); +#endif VM_OBJECT_WUNLOCK(object); } - vref(vp); + vrefact(vp); return (object); } @@ -251,8 +274,7 @@ * The object must be locked. */ static void -vnode_pager_dealloc(object) - vm_object_t object; +vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; @@ -287,11 +309,8 @@ } static boolean_t -vnode_pager_haspage(object, pindex, before, after) - vm_object_t object; - vm_pindex_t pindex; - int *before; - int *after; +vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, + int *after) { struct vnode *vp = object->handle; daddr_t bn; @@ -338,16 +357,21 @@ *before += poff; } if (after) { - int numafter; + /* + * The BMAP vop can report a partial block in the + * 'after', but must not report blocks after EOF. + * Assert the latter, and truncate 'after' in case + * of the former. + */ + KASSERT((reqblock + *after) * pagesperblock < + roundup2(object->size, pagesperblock), + ("%s: reqblock %jd after %d size %ju", __func__, + (intmax_t )reqblock, *after, + (uintmax_t )object->size)); *after *= pagesperblock; - numafter = pagesperblock - (poff + 1); - if (IDX_TO_OFF(pindex + numafter) > - object->un_pager.vnp.vnp_size) { - numafter = - OFF_TO_IDX(object->un_pager.vnp.vnp_size) - - pindex; - } - *after += numafter; + *after += pagesperblock - (poff + 1); + if (pindex + *after >= object->size) + *after = object->size - 1 - pindex; } } else { if (before) { @@ -370,9 +394,7 @@ * operation (possibly at object termination time), so we must be careful. */ void -vnode_pager_setsize(vp, nsize) - struct vnode *vp; - vm_ooffset_t nsize; +vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; @@ -445,10 +467,6 @@ * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); - } else if ((nsize & PAGE_MASK) && - vm_page_is_cached(object, OFF_TO_IDX(nsize))) { - vm_page_cache_free(object, OFF_TO_IDX(nsize), - nobjsize); } } object->un_pager.vnp.vnp_size = nsize; @@ -497,9 +515,7 @@ * small block filesystem vnode pager input */ static int -vnode_pager_input_smlfs(object, m) - vm_object_t object; - vm_page_t m; +vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; @@ -591,9 +607,7 @@ * old style vnode pager input routine */ static int -vnode_pager_input_old(object, m) - vm_object_t object; - vm_page_t m; +vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; @@ -666,19 +680,15 @@ * backing vp's VOP_GETPAGES. 
*/ static int -vnode_pager_getpages(object, m, count, reqpage) - vm_object_t object; - vm_page_t *m; - int count; - int reqpage; +vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, + int *rahead) { + struct vnode *vp; int rtval; - struct vnode *vp; - int bytes = count * PAGE_SIZE; vp = object->handle; VM_OBJECT_WUNLOCK(object); - rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0); + rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VM_OBJECT_WLOCK(object); @@ -685,261 +695,373 @@ return rtval; } +static int +vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, + int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) +{ + struct vnode *vp; + int rtval; + + vp = object->handle; + VM_OBJECT_WUNLOCK(object); + rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); + KASSERT(rtval != EOPNOTSUPP, + ("vnode_pager: FS getpages_async not implemented\n")); + VM_OBJECT_WLOCK(object); + return (rtval); +} + /* + * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for + * local filesystems, where partially valid pages can only occur at + * the end of file. + */ +int +vnode_pager_local_getpages(struct vop_getpages_args *ap) +{ + + return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, NULL, NULL)); +} + +int +vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) +{ + + return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); +} + +/* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int -vnode_pager_generic_getpages(vp, m, bytecount, reqpage) - struct vnode *vp; - vm_page_t *m; - int bytecount; - int reqpage; +vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, + int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; - struct mount *mp; - vm_offset_t kva; - daddr_t firstaddr, reqblock; - off_t foff, nextoff, tfoff, pib; - int pbefore, pafter, i, size, bsize, first, last; - int count, error, before, after, secmask; + off_t foff; + int bsize, pagesperblock, *freecnt; + int error, before, after, rbehind, rahead, poff, i; + int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, - ("vnode_pager_generic_getpages does not support devices")); + ("%s does not support devices", __func__)); + if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; - count = bytecount / PAGE_SIZE; + foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; + pagesperblock = bsize / PAGE_SIZE; - /* get the UNDERLYING device for the file with VOP_BMAP() */ + KASSERT(foff < object->un_pager.vnp.vnp_size, + ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); + KASSERT(count <= sizeof(bp->b_pages), + ("%s: requested %d pages", __func__, count)); /* - * originally, we did not check for an error return value -- assuming - * an fs always has a bmap entry point -- that assumption is wrong!!! + * The last page has valid blocks. Invalid part can only + * exist at the end of file, and the page is made fully valid + * by zeroing in vm_pager_get_pages(). 
*/ - foff = IDX_TO_OFF(m[reqpage]->pindex); + if (m[count - 1]->valid != 0 && --count == 0) { + if (iodone != NULL) + iodone(arg, m, 1, 0); + return (VM_PAGER_OK); + } /* - * if we can't bmap, use old VOP code + * Synchronous and asynchronous paging operations use different + * free pbuf counters. This is done to avoid asynchronous requests + * to consume all pbufs. + * Allocate the pbuf at the very beginning of the function, so that + * if we are low on certain kind of pbufs don't even proceed to BMAP, + * but sleep. */ - error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo, - &reqblock, &after, &before); + freecnt = iodone != NULL ? + &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; + bp = getpbuf(freecnt); + + /* + * Get the underlying device blocks for the file with VOP_BMAP(). + * If the file system doesn't support VOP_BMAP, use old way of + * getting pages via VOP_READ. + */ + error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { + relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); - - for (i = 0; i < count; i++) - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } - PCPU_INC(cnt.v_vnodein); - PCPU_INC(cnt.v_vnodepgsin); - error = vnode_pager_input_old(object, m[reqpage]); + for (i = 0; i < count; i++) { + PCPU_INC(cnt.v_vnodein); + PCPU_INC(cnt.v_vnodepgsin); + error = vnode_pager_input_old(object, m[i]); + if (error) + break; + } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { - VM_OBJECT_WLOCK(object); - for (i = 0; i < count; i++) - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } - VM_OBJECT_WUNLOCK(object); + relpbuf(bp, freecnt); return (VM_PAGER_ERROR); + } - /* - * if the blocksize is smaller than a page size, then use - * special small filesystem code. NFS sometimes has a small - * blocksize, but it can handle large reads itself. - */ - } else if ((PAGE_SIZE / bsize) > 1 && - (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) { - VM_OBJECT_WLOCK(object); - for (i = 0; i < count; i++) - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } - VM_OBJECT_WUNLOCK(object); - PCPU_INC(cnt.v_vnodein); - PCPU_INC(cnt.v_vnodepgsin); - return (vnode_pager_input_smlfs(object, m[reqpage])); + /* + * If the file system supports BMAP, but blocksize is smaller + * than a page size, then use special small filesystem code. + */ + if (pagesperblock == 0) { + relpbuf(bp, freecnt); + for (i = 0; i < count; i++) { + PCPU_INC(cnt.v_vnodein); + PCPU_INC(cnt.v_vnodepgsin); + error = vnode_pager_input_smlfs(object, m[i]); + if (error) + break; + } + return (error); } /* - * If we have a completely valid page available to us, we can - * clean up and return. Otherwise we have to re-read the - * media. + * A sparse file can be encountered only for a single page request, + * which may not be preceded by call to vm_pager_haspage(). 
*/ - VM_OBJECT_WLOCK(object); - if (m[reqpage]->valid == VM_PAGE_BITS_ALL) { - for (i = 0; i < count; i++) - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } + if (bp->b_blkno == -1) { + KASSERT(count == 1, + ("%s: array[%d] request to a sparse file %p", __func__, + count, vp)); + relpbuf(bp, freecnt); + pmap_zero_page(m[0]); + KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", + __func__, m[0])); + VM_OBJECT_WLOCK(object); + m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); - return VM_PAGER_OK; - } else if (reqblock == -1) { - pmap_zero_page(m[reqpage]); - KASSERT(m[reqpage]->dirty == 0, - ("vnode_pager_generic_getpages: page %p is dirty", m)); - m[reqpage]->valid = VM_PAGE_BITS_ALL; - for (i = 0; i < count; i++) - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } - VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } - m[reqpage]->valid = 0; - VM_OBJECT_WUNLOCK(object); - pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize; - pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE; - pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1; - first = reqpage < pbefore ? 0 : reqpage - pbefore; - last = reqpage + pafter >= count ? count - 1 : reqpage + pafter; - if (first > 0 || last + 1 < count) { + bp->b_blkno += (foff % bsize) / DEV_BSIZE; + + /* Recalculate blocks available after/before to pages. */ + poff = (foff % bsize) / PAGE_SIZE; + before *= pagesperblock; + before += poff; + after *= pagesperblock; + after += pagesperblock - (poff + 1); + if (m[0]->pindex + after >= object->size) + after = object->size - 1 - m[0]->pindex; + KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", + __func__, count, after + 1)); + after -= count - 1; + + /* Trim requested rbehind/rahead to possible values. */ + rbehind = a_rbehind ? *a_rbehind : 0; + rahead = a_rahead ? *a_rahead : 0; + rbehind = min(rbehind, before); + rbehind = min(rbehind, m[0]->pindex); + rahead = min(rahead, after); + rahead = min(rahead, object->size - m[count - 1]->pindex); + KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages), + ("%s: behind %d ahead %d count %d", __func__, + rbehind, rahead, count)); + + /* + * Fill in the bp->b_pages[] array with requested and optional + * read behind or read ahead pages. Read behind pages are looked + * up in a backward direction, down to a first cached page. Same + * for read ahead pages, but there is no need to shift the array + * in case of encountering a cached page. + */ + i = bp->b_npages = 0; + if (rbehind) { + vm_pindex_t startpindex, tpindex; + vm_page_t p; + VM_OBJECT_WLOCK(object); - for (i = 0; i < first; i++) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); + startpindex = m[0]->pindex - rbehind; + if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && + p->pindex >= startpindex) + startpindex = p->pindex + 1; + + /* tpindex is unsigned; beware of numeric underflow. */ + for (tpindex = m[0]->pindex - 1; + tpindex >= startpindex && tpindex < m[0]->pindex; + tpindex--, i++) { + p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); + if (p == NULL) { + /* Shift the array. 
*/ + for (int j = 0; j < i; j++) + bp->b_pages[j] = bp->b_pages[j + + tpindex + 1 - startpindex]; + break; + } + bp->b_pages[tpindex - startpindex] = p; } - for (i = last + 1; i < count; i++) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); + + bp->b_pgbefore = i; + bp->b_npages += i; + bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; + } else + bp->b_pgbefore = 0; + + /* Requested pages. */ + for (int j = 0; j < count; j++, i++) + bp->b_pages[i] = m[j]; + bp->b_npages += count; + + if (rahead) { + vm_pindex_t endpindex, tpindex; + vm_page_t p; + + if (!VM_OBJECT_WOWNED(object)) + VM_OBJECT_WLOCK(object); + endpindex = m[count - 1]->pindex + rahead + 1; + if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && + p->pindex < endpindex) + endpindex = p->pindex; + if (endpindex > object->size) + endpindex = object->size; + + for (tpindex = m[count - 1]->pindex + 1; + tpindex < endpindex; i++, tpindex++) { + p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); + if (p == NULL) + break; + bp->b_pages[i] = p; } - VM_OBJECT_WUNLOCK(object); - } - /* - * here on direct device I/O - */ - firstaddr = reqblock; - firstaddr += pib / DEV_BSIZE; - firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE; + bp->b_pgafter = i - bp->b_npages; + bp->b_npages = i; + } else + bp->b_pgafter = 0; - /* - * The first and last page have been calculated now, move - * input pages to be zero based, and adjust the count. - */ - m += first; - reqpage -= first; - count = last - first + 1; + if (VM_OBJECT_WOWNED(object)) + VM_OBJECT_WUNLOCK(object); - /* - * calculate the file virtual address for the transfer - */ - foff = IDX_TO_OFF(m[0]->pindex); + /* Report back actual behind/ahead read. */ + if (a_rbehind) + *a_rbehind = bp->b_pgbefore; + if (a_rahead) + *a_rahead = bp->b_pgafter; - /* - * calculate the size of the transfer - */ - size = count * PAGE_SIZE; - KASSERT(count > 0, ("zero count")); - if ((foff + size) > object->un_pager.vnp.vnp_size) - size = object->un_pager.vnp.vnp_size - foff; - KASSERT(size > 0, ("zero size")); + KASSERT(bp->b_npages <= sizeof(bp->b_pages), + ("%s: buf %p overflowed", __func__, bp)); /* - * round up physical size for real devices. + * Recalculate first offset and bytecount with regards to read behind. + * Truncate bytecount to vnode real size and round up physical size + * for real devices. */ + foff = IDX_TO_OFF(bp->b_pages[0]->pindex); + bytecount = bp->b_npages << PAGE_SHIFT; + if ((foff + bytecount) > object->un_pager.vnp.vnp_size) + bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, - ("vnode_pager_generic_getpages: sector size %d too large", - secmask + 1)); - size = (size + secmask) & ~secmask; + ("%s: sector size %d too large", __func__, secmask + 1)); + bytecount = (bytecount + secmask) & ~secmask; - bp = getpbuf(&vnode_pbuf_freecnt); - kva = (vm_offset_t)bp->b_data; - /* - * and map the pages to be read into the kva, if the filesystem + * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. 
*/ - mp = vp->v_mount; - if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && + if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; - bp->b_kvabase = unmapped_buf; bp->b_offset = 0; - bp->b_flags |= B_UNMAPPED; - bp->b_npages = count; - for (i = 0; i < count; i++) - bp->b_pages[i] = m[i]; - } else - pmap_qenter(kva, m, count); + } else { + bp->b_data = bp->b_kvabase; + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + } - /* build a minimal buffer header */ + /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; - bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); - bp->b_blkno = firstaddr; pbgetbo(bo, bp); bp->b_vp = vp; - bp->b_bcount = size; - bp->b_bufsize = size; - bp->b_runningbufspace = bp->b_bufsize; + bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; + bp->b_iooffset = dbtob(bp->b_blkno); + atomic_add_long(&runningbufspace, bp->b_runningbufspace); - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, count); + PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); - /* do the input */ - bp->b_iooffset = dbtob(bp->b_blkno); - bstrategy(bp); + if (iodone != NULL) { /* async */ + bp->b_pgiodone = iodone; + bp->b_caller1 = arg; + bp->b_iodone = vnode_pager_generic_getpages_done_async; + bp->b_flags |= B_ASYNC; + BUF_KERNPROC(bp); + bstrategy(bp); + return (VM_PAGER_OK); + } else { + bp->b_iodone = bdone; + bstrategy(bp); + bwait(bp, PVM, "vnread"); + error = vnode_pager_generic_getpages_done(bp); + for (i = 0; i < bp->b_npages; i++) + bp->b_pages[i] = NULL; + bp->b_vp = NULL; + pbrelbo(bp); + relpbuf(bp, &vnode_pbuf_freecnt); + return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); + } +} - bwait(bp, PVM, "vnread"); +static void +vnode_pager_generic_getpages_done_async(struct buf *bp) +{ + int error; - if ((bp->b_ioflags & BIO_ERROR) != 0) - error = EIO; + error = vnode_pager_generic_getpages_done(bp); + /* Run the iodone upon the requested range. */ + bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, + bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); + for (int i = 0; i < bp->b_npages; i++) + bp->b_pages[i] = NULL; + bp->b_vp = NULL; + pbrelbo(bp); + relpbuf(bp, &vnode_async_pbuf_freecnt); +} - if (error == 0 && size != count * PAGE_SIZE) { - if ((bp->b_flags & B_UNMAPPED) != 0) { - bp->b_flags &= ~B_UNMAPPED; - pmap_qenter(kva, m, count); +static int +vnode_pager_generic_getpages_done(struct buf *bp) +{ + vm_object_t object; + off_t tfoff, nextoff; + int i, error; + + error = (bp->b_ioflags & BIO_ERROR) != 0 ? 
EIO : 0; + object = bp->b_vp->v_object; + + if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { + if (!buf_mapped(bp)) { + bp->b_data = bp->b_kvabase; + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, + bp->b_npages); } - bzero((caddr_t)kva + size, PAGE_SIZE * count - size); + bzero(bp->b_data + bp->b_bcount, + PAGE_SIZE * bp->b_npages - bp->b_bcount); } - if ((bp->b_flags & B_UNMAPPED) == 0) - pmap_qremove(kva, count); - if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) { - bp->b_data = (caddr_t)kva; - bp->b_kvabase = (caddr_t)kva; - bp->b_flags &= ~B_UNMAPPED; - for (i = 0; i < count; i++) - bp->b_pages[i] = NULL; + if (buf_mapped(bp)) { + pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); + bp->b_data = unmapped_buf; } - /* - * free the buffer header back to the swap buffer pool - */ - bp->b_vp = NULL; - pbrelbo(bp); - relpbuf(bp, &vnode_pbuf_freecnt); - VM_OBJECT_WLOCK(object); - for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) { + for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); + i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; - mt = m[i]; + mt = bp->b_pages[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* @@ -947,11 +1069,9 @@ */ mt->valid = VM_PAGE_BITS_ALL; KASSERT(mt->dirty == 0, - ("vnode_pager_generic_getpages: page %p is dirty", - mt)); + ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), - ("vnode_pager_generic_getpages: page %p is mapped", - mt)); + ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. @@ -964,18 +1084,17 @@ object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, - ("vnode_pager_generic_getpages: page %p is dirty", - mt)); + ("%s: page %p is dirty", __func__, mt)); } - - if (i != reqpage) + + if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_WUNLOCK(object); - if (error) { - printf("vnode_pager_getpages: I/O read error\n"); - } - return (error ? VM_PAGER_ERROR : VM_PAGER_OK); + if (error != 0) + printf("%s: I/O read error %d\n", __func__, error); + + return (error); } /* @@ -1006,7 +1125,7 @@ * daemon up. This should be probably be addressed XXX. */ - if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min) + if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min) flags |= VM_PAGER_PUT_SYNC; /* @@ -1014,19 +1133,36 @@ */ vp = object->handle; VM_OBJECT_WUNLOCK(object); - rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0); + rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } +static int +vn_off2bidx(vm_ooffset_t offset) +{ + return ((offset & PAGE_MASK) / DEV_BSIZE); +} + +static bool +vn_dirty_blk(vm_page_t m, vm_ooffset_t offset) +{ + + KASSERT(IDX_TO_OFF(m->pindex) <= offset && + offset < IDX_TO_OFF(m->pindex + 1), + ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex, + (uintmax_t)offset)); + return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0); +} + /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and - * clustering has already typically occured, so in general we ask the + * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. 
*/ @@ -1034,18 +1170,14 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { - int i; vm_object_t object; vm_page_t m; - int count; - - int maxsize, ncount; - vm_ooffset_t poffset; + vm_ooffset_t maxblksz, next_offset, poffset, prev_offset; struct uio auio; struct iovec aiov; - int error; - int ioflags; - int ppscheck = 0; + off_t prev_resid, wrsz; + int count, error, i, maxsize, ncount, pgoff, ppscheck; + bool in_hole; static struct timeval lastfail; static int curfail; @@ -1056,10 +1188,11 @@ rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { - printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", - (long)ma[0]->pindex, (u_long)ma[0]->dirty); + printf("vnode_pager_generic_putpages: " + "attempt to write meta-data 0x%jx(%lx)\n", + (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; - return VM_PAGER_BAD; + return (VM_PAGER_BAD); } maxsize = count * PAGE_SIZE; @@ -1069,7 +1202,7 @@ /* * If the page-aligned write is larger then the actual file we - * have to invalidate pages occuring beyond the file EOF. However, + * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which @@ -1079,14 +1212,20 @@ * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ - VM_OBJECT_WLOCK(object); + VM_OBJECT_RLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { + if (!VM_OBJECT_TRYUPGRADE(object)) { + VM_OBJECT_RUNLOCK(object); + VM_OBJECT_WLOCK(object); + if (maxsize + poffset <= object->un_pager.vnp.vnp_size) + goto downgrade; + } if (object->un_pager.vnp.vnp_size > poffset) { - int pgoff; - maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { + pgoff = roundup2(pgoff, DEV_BSIZE); + /* * If the object is locked and the following * conditions hold, then the page's dirty @@ -1097,6 +1236,7 @@ vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); + MPASS(m->dirty != 0); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } @@ -1104,64 +1244,152 @@ maxsize = 0; ncount = 0; } - if (ncount < count) { - for (i = ncount; i < count; i++) { - rtvals[i] = VM_PAGER_BAD; + for (i = ncount; i < count; i++) + rtvals[i] = VM_PAGER_BAD; +downgrade: + VM_OBJECT_LOCK_DOWNGRADE(object); + } + + auio.uio_iov = &aiov; + auio.uio_segflg = UIO_NOCOPY; + auio.uio_rw = UIO_WRITE; + auio.uio_td = NULL; + maxblksz = roundup2(poffset + maxsize, DEV_BSIZE); + + for (prev_offset = poffset; prev_offset < maxblksz;) { + /* Skip clean blocks. */ + for (in_hole = true; in_hole && prev_offset < maxblksz;) { + m = ma[OFF_TO_IDX(prev_offset - poffset)]; + for (i = vn_off2bidx(prev_offset); + i < sizeof(vm_page_bits_t) * NBBY && + prev_offset < maxblksz; i++) { + if (vn_dirty_blk(m, prev_offset)) { + in_hole = false; + break; + } + prev_offset += DEV_BSIZE; } } + if (in_hole) + goto write_done; + + /* Find longest run of dirty blocks. 
*/ + for (next_offset = prev_offset; next_offset < maxblksz;) { + m = ma[OFF_TO_IDX(next_offset - poffset)]; + for (i = vn_off2bidx(next_offset); + i < sizeof(vm_page_bits_t) * NBBY && + next_offset < maxblksz; i++) { + if (!vn_dirty_blk(m, next_offset)) + goto start_write; + next_offset += DEV_BSIZE; + } + } +start_write: + if (next_offset > poffset + maxsize) + next_offset = poffset + maxsize; + + /* + * Getting here requires finding a dirty block in the + * 'skip clean blocks' loop. + */ + MPASS(prev_offset < next_offset); + + VM_OBJECT_RUNLOCK(object); + aiov.iov_base = NULL; + auio.uio_iovcnt = 1; + auio.uio_offset = prev_offset; + prev_resid = auio.uio_resid = aiov.iov_len = next_offset - + prev_offset; + error = VOP_WRITE(vp, &auio, + vnode_pager_putpages_ioflags(flags), curthread->td_ucred); + + wrsz = prev_resid - auio.uio_resid; + if (wrsz == 0) { + if (ppsratecheck(&lastfail, &curfail, 1) != 0) { + vn_printf(vp, "vnode_pager_putpages: " + "zero-length write at %ju resid %zd\n", + auio.uio_offset, auio.uio_resid); + } + VM_OBJECT_RLOCK(object); + break; + } + + /* Adjust the starting offset for next iteration. */ + prev_offset += wrsz; + MPASS(auio.uio_offset == prev_offset); + + ppscheck = 0; + if (error != 0 && (ppscheck = ppsratecheck(&lastfail, + &curfail, 1)) != 0) + vn_printf(vp, "vnode_pager_putpages: I/O error %d\n", + error); + if (auio.uio_resid != 0 && (ppscheck != 0 || + ppsratecheck(&lastfail, &curfail, 1) != 0)) + vn_printf(vp, "vnode_pager_putpages: residual I/O %zd " + "at %ju\n", auio.uio_resid, + (uintmax_t)ma[0]->pindex); + VM_OBJECT_RLOCK(object); + if (error != 0 || auio.uio_resid != 0) + break; } - VM_OBJECT_WUNLOCK(object); +write_done: + /* Mark completely processed pages. */ + for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++) + rtvals[i] = VM_PAGER_OK; + /* Mark partial EOF page. */ + if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0) + rtvals[i++] = VM_PAGER_OK; + /* Unwritten pages in range, free bonus if the page is clean. */ + for (; i < ncount; i++) + rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR; + VM_OBJECT_RUNLOCK(object); + PCPU_ADD(cnt.v_vnodepgsout, i); + PCPU_INC(cnt.v_vnodeout); + return (rtvals[0]); +} +int +vnode_pager_putpages_ioflags(int pager_flags) +{ + int ioflags; + /* - * pageouts are already clustered, use IO_ASYNC to force a bawrite() - * rather then a bdwrite() to prevent paging I/O from saturating - * the buffer cache. Dummy-up the sequential heuristic to cause - * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set, - * the system decides how to cluster. + * Pageouts are already clustered, use IO_ASYNC to force a + * bawrite() rather then a bdwrite() to prevent paging I/O + * from saturating the buffer cache. Dummy-up the sequential + * heuristic to cause large ranges to cluster. If neither + * IO_SYNC or IO_ASYNC is set, the system decides how to + * cluster. */ ioflags = IO_VMIO; - if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) + if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0) ioflags |= IO_SYNC; - else if ((flags & VM_PAGER_CLUSTER_OK) == 0) + else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; - ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0; + ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0; + ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? 
IO_NOREUSE : 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; - - aiov.iov_base = (caddr_t) 0; - aiov.iov_len = maxsize; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = poffset; - auio.uio_segflg = UIO_NOCOPY; - auio.uio_rw = UIO_WRITE; - auio.uio_resid = maxsize; - auio.uio_td = (struct thread *) 0; - error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, ncount); - - if (error) { - if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) - printf("vnode_pager_putpages: I/O error %d\n", error); - } - if (auio.uio_resid) { - if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) - printf("vnode_pager_putpages: residual I/O %zd at %lu\n", - auio.uio_resid, (u_long)ma[0]->pindex); - } - for (i = 0; i < ncount; i++) { - rtvals[i] = VM_PAGER_OK; - } - return rtvals[0]; + return (ioflags); } +/* + * vnode_pager_undirty_pages(). + * + * A helper to mark pages as clean after pageout that was possibly + * done with a short write. The lpos argument specifies the page run + * length in bytes, and the written argument specifies how many bytes + * were actually written. eof is the offset past the last valid byte + * in the vnode using the absolute file position of the first byte in + * the run as the base from which it is computed. + */ void -vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) +vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof, + int lpos) { vm_object_t obj; - int i, pos; + int i, pos, pos_devb; - if (written == 0) + if (written == 0 && eof >= lpos) return; obj = ma[0]->object; VM_OBJECT_WLOCK(obj); @@ -1175,6 +1403,37 @@ vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } + if (eof >= lpos) /* avoid truncation */ + goto done; + for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) { + if (pos != trunc_page(pos)) { + /* + * The page contains the last valid byte in + * the vnode, mark the rest of the page as + * clean, potentially making the whole page + * clean. + */ + pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE); + vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE - + pos_devb); + + /* + * If the page was cleaned, report the pageout + * on it as successful. msync() no longer + * needs to write out the page, endlessly + * creating write requests and dirty buffers. + */ + if (ma[i]->dirty == 0) + rtvals[i] = VM_PAGER_OK; + + pos = round_page(pos); + } else { + /* vm_pageout_flush() clears dirty */ + rtvals[i] = VM_PAGER_BAD; + pos += PAGE_SIZE; + } + } +done: VM_OBJECT_WUNLOCK(obj); } Modified: trunk/sys/vm/vnode_pager.h =================================================================== --- trunk/sys/vm/vnode_pager.h 2020-02-08 19:35:04 UTC (rev 12313) +++ trunk/sys/vm/vnode_pager.h 2020-02-08 19:35:48 UTC (rev 12314) @@ -33,7 +33,7 @@ * SUCH DAMAGE. 
* * @(#)vnode_pager.h 8.1 (Berkeley) 6/11/93 - * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $ + * $FreeBSD: stable/11/sys/vm/vnode_pager.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _VNODE_PAGER_ @@ -42,14 +42,17 @@ #ifdef _KERNEL int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, - int count, int reqpage); + int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, + void *arg); int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, - int count, boolean_t sync, - int *rtvals); - + int count, int flags, int *rtvals); +int vnode_pager_local_getpages(struct vop_getpages_args *ap); +int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap); +int vnode_pager_putpages_ioflags(int pager_flags); void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); -void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written); +void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, + off_t eof, int lpos); void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); From laffer1 at midnightbsd.org Sat Feb 8 14:38:54 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:38:54 -0500 (EST) Subject: [Midnightbsd-cvs] src [12315] trunk/sys/tools/vnode_if.awk: sync with FreeBSD 11-stable Message-ID: <202002081938.018Jcseu062935@stargazer.midnightbsd.org> Revision: 12315 http://svnweb.midnightbsd.org/src/?rev=12315 Author: laffer1 Date: 2020-02-08 14:38:54 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/tools/vnode_if.awk Modified: trunk/sys/tools/vnode_if.awk =================================================================== --- trunk/sys/tools/vnode_if.awk 2020-02-08 19:35:48 UTC (rev 12314) +++ trunk/sys/tools/vnode_if.awk 2020-02-08 19:38:54 UTC (rev 12315) @@ -30,7 +30,7 @@ # # @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 -# $FreeBSD: stable/10/sys/tools/vnode_if.awk 289798 2015-10-23 07:40:43Z avg $ +# $FreeBSD: stable/11/sys/tools/vnode_if.awk 331722 2018-03-29 02:50:57Z eadler $ # $MidnightBSD$ # # Script to produce VFS front-end sugar. 
@@ -166,8 +166,6 @@ if (cfile) { printc(common_head \ - "#include \"opt_kdtrace.h\"\n" \ - "\n" \ "#include <sys/param.h>\n" \ "#include <sys/event.h>\n" \ "#include <sys/kernel.h>\n" \ From laffer1 at midnightbsd.org Sat Feb 8 14:39:08 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:39:08 -0500 (EST) Subject: [Midnightbsd-cvs] src [12316] trunk/sys/ufs: sync with FreeBSD 11-stable Message-ID: <202002081939.018Jd8ZR062993@stargazer.midnightbsd.org> Revision: 12316 http://svnweb.midnightbsd.org/src/?rev=12316 Author: laffer1 Date: 2020-02-08 14:39:08 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/ufs/ffs/ffs_alloc.c trunk/sys/ufs/ffs/ffs_balloc.c trunk/sys/ufs/ffs/ffs_extern.h trunk/sys/ufs/ffs/ffs_inode.c trunk/sys/ufs/ffs/ffs_rawread.c trunk/sys/ufs/ffs/ffs_snapshot.c trunk/sys/ufs/ffs/ffs_softdep.c trunk/sys/ufs/ffs/ffs_subr.c trunk/sys/ufs/ffs/ffs_suspend.c trunk/sys/ufs/ffs/ffs_tables.c trunk/sys/ufs/ffs/ffs_vfsops.c trunk/sys/ufs/ffs/ffs_vnops.c trunk/sys/ufs/ffs/fs.h trunk/sys/ufs/ffs/softdep.h trunk/sys/ufs/ufs/README.acls trunk/sys/ufs/ufs/README.extattr trunk/sys/ufs/ufs/acl.h trunk/sys/ufs/ufs/dinode.h trunk/sys/ufs/ufs/dir.h trunk/sys/ufs/ufs/dirhash.h trunk/sys/ufs/ufs/extattr.h trunk/sys/ufs/ufs/gjournal.h trunk/sys/ufs/ufs/inode.h trunk/sys/ufs/ufs/quota.h trunk/sys/ufs/ufs/ufs_acl.c trunk/sys/ufs/ufs/ufs_bmap.c trunk/sys/ufs/ufs/ufs_dirhash.c trunk/sys/ufs/ufs/ufs_extattr.c trunk/sys/ufs/ufs/ufs_extern.h trunk/sys/ufs/ufs/ufs_gjournal.c trunk/sys/ufs/ufs/ufs_inode.c trunk/sys/ufs/ufs/ufs_lookup.c trunk/sys/ufs/ufs/ufs_quota.c trunk/sys/ufs/ufs/ufs_vfsops.c trunk/sys/ufs/ufs/ufs_vnops.c trunk/sys/ufs/ufs/ufsmount.h Modified: trunk/sys/ufs/ffs/ffs_alloc.c =================================================================== --- trunk/sys/ufs/ffs/ffs_alloc.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_alloc.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -61,7 +61,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_alloc.c 306630 2016-10-03 10:15:16Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_alloc.c 344861 2019-03-06 23:59:56Z mckusick $"); #include "opt_quota.h" @@ -164,13 +164,13 @@ #endif *bnp = 0; - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", - devtoname(ip->i_dev), (long)fs->fs_bsize, size, + devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } @@ -261,9 +261,9 @@ int64_t delta; vp = ITOV(ip); - fs = ip->i_fs; + ump = ITOUMP(ip); + fs = ump->um_fs; bp = NULL; - ump = ip->i_ump; gbflags = (flags & BA_UNMAPPED) != 0 ? 
GB_UNMAPPED : 0; mtx_assert(UFS_MTX(ump), MA_OWNED); @@ -274,7 +274,7 @@ (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", - devtoname(ip->i_dev), (long)fs->fs_bsize, osize, + devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } @@ -289,7 +289,7 @@ } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", - devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev, + devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } @@ -384,7 +384,7 @@ break; default: printf("dev = %s, optim = %ld, fs = %s\n", - devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); + devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } @@ -392,7 +392,7 @@ if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) - ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, + ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); @@ -482,9 +482,19 @@ struct cluster_save *a_buflist; } */ *ap; { + struct ufsmount *ump; - if (doreallocblks == 0) + /* + * If the underlying device can do deletes, then skip reallocating + * the blocks of this file into contiguous sequences. Devices that + * benefit from BIO_DELETE also benefit from not moving the data. + * These devices are flash and therefore work less well with this + * optimization. Also skip if reallocblks has been disabled globally. + */ + ump = ap->a_vp->v_mount->mnt_data; + if (ump->um_candelete || doreallocblks == 0) return (ENOSPC); + /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this @@ -493,7 +503,7 @@ if (DOINGSOFTDEP(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); - if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) + if (ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); } @@ -520,8 +530,8 @@ vp = ap->a_vp; ip = VTOI(vp); - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with @@ -718,7 +728,7 @@ #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) - ffs_blkfree(ump, fs, ip->i_devvp, + ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); @@ -769,8 +779,8 @@ vp = ap->a_vp; ip = VTOI(vp); - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. 
Running with @@ -895,7 +905,7 @@ */ #ifdef DEBUG if (prtrealloc) - printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, + printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; @@ -966,7 +976,7 @@ #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) - ffs_blkfree(ump, fs, ip->i_devvp, + ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); @@ -1031,8 +1041,8 @@ *vpp = NULL; pip = VTOI(pvp); - fs = pip->i_fs; - ump = pip->i_ump; + ump = ITOUMP(pip); + fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; @@ -1079,8 +1089,8 @@ ip = VTOI(*vpp); if (ip->i_mode) { dup_alloc: - printf("mode = 0%o, inum = %lu, fs = %s\n", - ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt); + printf("mode = 0%o, inum = %ju, fs = %s\n", + ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ @@ -1093,8 +1103,8 @@ /* * Set up a new generation number for this inode. */ - if (ip->i_gen == 0 || ++ip->i_gen == 0) - ip->i_gen = arc4random() / 2 + 1; + while (ip->i_gen == 0 || ++ip->i_gen == 0) + ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); @@ -1105,10 +1115,12 @@ ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; - if (fs->fs_magic == FS_UFS2_MAGIC) + if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; - else + ip->i_flag |= IN_UFS2; + } else { (*vpp)->v_op = &ffs_vnodeops1; + } return (0); noinodes: if (reclaimed == 0) { @@ -1149,8 +1161,8 @@ u_int mincg, minndir; u_int maxcontigdirs; - mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED); - fs = pip->i_fs; + mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); + fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; @@ -1217,16 +1229,17 @@ * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion. If we get * to the final cylinder group and do not find anything, - * we start scanning backwards from our preferred cylinder - * group. The ideal would be to alternate looking forward - * and backward, but that is just too complex to code for - * the gain it would get. The most likely place where the - * backward scan would take effect is when we start near - * the end of the filesystem and do not find anything from - * where we are to the end. In that case, scanning backward - * will likely find us a suitable cylinder group much closer - * to our desired location than if we were to start scanning - * forward from the beginning of the filesystem. + * we start scanning forwards from the beginning of the + * filesystem. While it might seem sensible to start scanning + * backwards or even to alternate looking forward and backward, + * this approach fails badly when the filesystem is nearly full. + * Specifically, we first search all the areas that have no space + * and finally try the one preceding that. We repeat this on + * every request and in the case of the final block end up + * searching the entire filesystem. By jumping to the front + * of the filesystem, our future forward searches always look + * in new cylinder groups so finds every possible block after + * one pass over the filesystem. 
*/ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) @@ -1297,8 +1310,8 @@ ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); - mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); - fs = ip->i_fs; + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); + fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, @@ -1341,7 +1354,7 @@ /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not - * have a block allocated immediately preceeding us, then we need + * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { @@ -1402,8 +1415,8 @@ ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); - mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); - fs = ip->i_fs; + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); + fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, @@ -1446,7 +1459,7 @@ /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not - * have a block allocated immediately preceeding us, then we need + * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { @@ -1516,12 +1529,12 @@ ufs2_daddr_t result; u_int i, icg = cg; - mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif - fs = ip->i_fs; + fs = ITOFS(ip); /* * 1: preferred cylinder group */ @@ -1579,8 +1592,8 @@ int i, error; u_int8_t *blksfree; - ump = ip->i_ump; - fs = ip->i_fs; + ump = ITOUMP(ip); + fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); @@ -1590,8 +1603,8 @@ return (0); } UFS_UNLOCK(ump); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; @@ -1663,13 +1676,13 @@ int i, allocsiz, error, frags; u_int8_t *blksfree; - ump = ip->i_ump; - fs = ip->i_fs; + ump = ITOUMP(ip); + fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; @@ -1765,8 +1778,8 @@ u_int8_t *blksfree; int i, cgbpref; - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); @@ -1851,12 +1864,12 @@ int32_t *lp; u_int8_t *blksfree; - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); - if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, + if (bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp)) 
goto fail_lock; cgp = (struct cg *)bp->b_data; @@ -1955,13 +1968,23 @@ { struct fs *fs; - fs = ip->i_fs; - return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, + fs = ITOFS(ip); + return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* + * Synchronous inode initialization is needed only when barrier writes do not + * work as advertised, and will impose a heavy cost on file creation in a newly + * created filesystem. + */ +static int doasyncinodeinit = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, + &doasyncinodeinit, 0, + "Perform inode block initialization using asynchronous writes"); + +/* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, @@ -1987,13 +2010,13 @@ int error, start, len, i; u_int32_t old_initediblk; - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); @@ -2070,9 +2093,11 @@ bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { - dp2->di_gen = arc4random() / 2 + 1; + while (dp2->di_gen == 0) + dp2->di_gen = arc4random(); dp2++; } + /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed @@ -2082,7 +2107,10 @@ * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ - babarrierwrite(ibp); + if (doasyncinodeinit) + babarrierwrite(ibp); + else + bwrite(ibp); /* * After the inode block is written, try to update the @@ -2090,7 +2118,7 @@ * to it, then leave it unchanged as the other thread * has already set it correctly. 
*/ - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); @@ -2155,7 +2183,8 @@ cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ - dev = VTOI(devvp)->i_devvp->v_rdev; + MPASS(devvp->v_mount->mnt_data == ump); + dev = ump->um_devvp->v_rdev; cgblkno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ @@ -2386,7 +2415,7 @@ int i, error, frags, free; u_int8_t *blksfree; - fs = ip->i_fs; + fs = ITOFS(ip); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); @@ -2394,7 +2423,7 @@ } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %jd", (intmax_t)bno); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), + error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, dtog(fs, bno))), (int)fs->fs_cgsize, NOCRED, &bp); if (error) panic("ffs_checkblk: cg bread failed"); @@ -2428,6 +2457,7 @@ ino_t ino; int mode; { + struct ufsmount *ump; struct inode *ip; if (DOINGSOFTDEP(pvp)) { @@ -2435,8 +2465,8 @@ return (0); } ip = VTOI(pvp); - return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode, - NULL)); + ump = VFSTOUFS(pvp->v_mount); + return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* @@ -2463,7 +2493,8 @@ cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ - dev = VTOI(devvp)->i_devvp->v_rdev; + MPASS(devvp->v_mount->mnt_data == ump); + dev = ump->um_devvp->v_rdev; cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ @@ -2658,6 +2689,8 @@ * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. + * adjsize(inode, size) - set the size of the inode to the + * specified size. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. 
* freedirs(inode, count) - directory inodes [inode..inode + count - 1] @@ -2699,6 +2732,9 @@ static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); +static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR, + sysctl_ffs_fsck, "Set the inode size"); + static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of directories"); @@ -2756,13 +2792,12 @@ struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; - struct vnode *vp, *vpold, *dvp, *fdvp; + struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; ufs2_daddr_t blkno; long blkcnt, blksize; - struct filedesc *fdp; struct file *fp, *vfp; cap_rights_t rights; int filetype, error; @@ -2774,7 +2809,7 @@ return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); - if ((error = getvnode(td->td_proc->p_fd, cmd.handle, + if ((error = getvnode(td, cmd.handle, cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_data; @@ -2851,6 +2886,23 @@ vput(vp); break; + case FFS_SET_SIZE: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: set inode %jd size to %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, + (intmax_t)cmd.size); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) + break; + ip = VTOI(vp); + DIP_SET(ip, i_size, cmd.size); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); + vput(vp); + break; + case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ @@ -2977,12 +3029,7 @@ break; } VOP_UNLOCK(vp, 0); - fdp = td->td_proc->p_fd; - FILEDESC_XLOCK(fdp); - vpold = fdp->fd_cdir; - fdp->fd_cdir = vp; - FILEDESC_XUNLOCK(fdp); - vrele(vpold); + pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: @@ -3057,7 +3104,7 @@ break; AUDIT_ARG_VNODE1(vp); ip = VTOI(vp); - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, sizeof(struct ufs1_dinode)); else @@ -3077,7 +3124,7 @@ error = EPERM; break; } - if (VTOI(vp)->i_ump != ump) { + if (ITOUMP(VTOI(vp)) != ump) { error = EINVAL; break; } @@ -3089,7 +3136,7 @@ (intmax_t)cmd.value); } #endif /* DEBUG */ - if ((error = getvnode(td->td_proc->p_fd, cmd.value, + if ((error = getvnode(td, cmd.value, cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) break; if (vfp->f_vnode->v_type != VCHR) { @@ -3174,11 +3221,11 @@ return (EINVAL); } ip = VTOI(vp); - if (ip->i_devvp != devvp) { + if (ITODEVVP(ip) != devvp) { vput(vp); return (EINVAL); } - fs = ip->i_fs; + fs = ITOFS(ip); vput(vp); foffset_lock_uio(fp, uio, flags); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); Modified: trunk/sys/ufs/ffs/ffs_balloc.c =================================================================== --- trunk/sys/ufs/ffs/ffs_balloc.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_balloc.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -61,7 +61,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_balloc.c 304672 2016-08-23 07:55:32Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <sys/systm.h> @@ -70,6 +70,7 @@ #include <sys/lock.h> #include <sys/mount.h> #include <sys/vnode.h> +#include <sys/vmmeter.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> @@ -112,8 +113,8 @@ ip = VTOI(vp); dp = ip->i_din1; - fs = ip->i_fs; - ump = ip->i_ump; + fs = ITOFS(ip); + ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, 
startoffset) + size; reclaimed = 0; @@ -549,7 +550,7 @@ } lbns_remfree++; #endif - ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, + ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); @@ -585,8 +586,8 @@ ip = VTOI(vp); dp = ip->i_din2; - fs = ip->i_fs; - ump = ip->i_ump; + fs = ITOFS(ip); + ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; reclaimed = 0; @@ -1144,7 +1145,7 @@ } lbns_remfree++; #endif - ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, + ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); Modified: trunk/sys/ufs/ffs/ffs_extern.h =================================================================== --- trunk/sys/ufs/ffs/ffs_extern.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_extern.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)ffs_extern.h 8.6 (Berkeley) 3/30/95 - * $FreeBSD: stable/10/sys/ufs/ffs/ffs_extern.h 306175 2016-09-22 10:42:40Z kib $ + * $FreeBSD: stable/11/sys/ufs/ffs/ffs_extern.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _UFS_FFS_EXTERN_H @@ -78,7 +78,6 @@ int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); -int ffs_mountroot(void); void ffs_oldfscompat_write(struct fs *, struct ufsmount *); int ffs_own_mount(const struct mount *mp); int ffs_reallocblks(struct vop_reallocblks_args *); @@ -179,6 +178,11 @@ * deadlock when flushing snapshot inodes while holding snaplk. */ #define NO_INO_UPDT 0x00000001 +/* + * Request data sync only from ffs_syncvnode(), not touching even more + * metadata than NO_INO_UPDT. 
+ */ +#define DATA_ONLY 0x00000002 int ffs_rdonly(struct inode *); Modified: trunk/sys/ufs/ffs/ffs_inode.c =================================================================== --- trunk/sys/ufs/ffs/ffs_inode.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_inode.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -31,22 +31,24 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_inode.c 300600 2016-05-24 10:41:34Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_inode.c 349308 2019-06-23 14:49:30Z asomers $"); #include "opt_quota.h" #include <sys/param.h> #include <sys/systm.h> -#include <sys/mount.h> -#include <sys/proc.h> #include <sys/bio.h> #include <sys/buf.h> -#include <sys/vnode.h> #include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/racct.h> +#include <sys/random.h> #include <sys/resourcevar.h> #include <sys/rwlock.h> +#include <sys/stat.h> #include <sys/vmmeter.h> -#include <sys/stat.h> +#include <sys/vnode.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -91,8 +93,8 @@ if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); - fs = ip->i_fs; - if (fs->fs_ronly && ip->i_ump->um_fsckpid == 0) + fs = ITOFS(ip); + if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) return (0); /* * If we are updating a snapshot and another process is currently @@ -109,14 +111,12 @@ if (IS_SNAPSHOT(ip)) flags = GB_LOCK_NOWAIT; loop: - error = breadn_flags(ip->i_devvp, + error = breadn_flags(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp); if (error != 0) { - if (error != EBUSY) { - brelse(bp); + if (error != EBUSY) return (error); - } KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); /* * Wait for our inode block to become available. @@ -144,12 +144,17 @@ softdep_update_inodeblock(ip, bp, waitfor); else if (ip->i_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; - else + /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ + random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), 1, RANDOM_FS_ATIME); + } else { *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; + /* XXX: FIX? 
The entropy here is desirable, but the harvesting may be expensive */ + random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), 1, RANDOM_FS_ATIME); + } if (waitfor && !DOINGASYNC(vp)) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) { @@ -181,7 +186,7 @@ struct inode *ip; ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; - ufs2_daddr_t count, blocksreleased = 0, datablocks; + ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; struct bufobj *bo; struct fs *fs; struct buf *bp; @@ -189,12 +194,12 @@ int softdeptrunc, journaltrunc; int needextclean, extblocks; int offset, size, level, nblocks; - int i, error, allerror; + int i, error, allerror, indiroff; off_t osize; ip = VTOI(vp); - fs = ip->i_fs; - ump = ip->i_ump; + ump = VFSTOUFS(vp->v_mount); + fs = ump->um_fs; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); @@ -265,7 +270,7 @@ for (i = 0; i < NXADDR; i++) { if (oldblks[i] == 0) continue; - ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i], + ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, vp->v_type, NULL); } @@ -326,16 +331,57 @@ ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(vp, !DOINGASYNC(vp))); } - if (DOINGSOFTDEP(vp)) { + /* + * Lookup block number for a given offset. Zero length files + * have no blocks, so return a blkno of -1. + */ + lbn = lblkno(fs, length - 1); + if (length == 0) { + blkno = -1; + } else if (lbn < NDADDR) { + blkno = DIP(ip, i_db[lbn]); + } else { + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, + cred, BA_METAONLY, &bp); + if (error) + return (error); + indiroff = (lbn - NDADDR) % NINDIR(fs); + if (I_IS_UFS1(ip)) + blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; + else + blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; + /* + * If the block number is non-zero, then the indirect block + * must have been previously allocated and need not be written. + * If the block number is zero, then we may have allocated + * the indirect block and hence need to write it out. + */ + if (blkno != 0) + brelse(bp); + else if (DOINGSOFTDEP(vp) || DOINGASYNC(vp)) + bdwrite(bp); + else + bwrite(bp); + } + /* + * If the block number at the new end of the file is zero, + * then we must allocate it to ensure that the last block of + * the file is allocated. Soft updates does not handle this + * case, so here we have to clean up the soft updates data + * structures describing the allocation past the truncation + * point. Finding and deallocating those structures is a lot of + * work. Since partial truncation with a hole at the end occurs + * rarely, we solve the problem by syncing the file so that it + * will have no soft updates data structures left. + */ + if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + return (error); + if (blkno != 0 && DOINGSOFTDEP(vp)) { if (softdeptrunc == 0 && journaltrunc == 0) { /* - * If a file is only partially truncated, then - * we have to clean up the data structures - * describing the allocation past the truncation - * point. Finding and deallocating those structures - * is a lot of work. Since partial truncation occurs - * rarely, we solve the problem by syncing the file - * so that it will have no data structures left. + * If soft updates cannot handle this truncation, + * clean up soft dependency data structures and + * fall through to the synchronous truncation. 
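/*
 * The ffs_truncate() hunks above (continuing just below) look up the block
 * backing the new end of file: lblkno(fs, length - 1) gives its logical
 * block number and blkoff(fs, length) the offset within it.  If that block
 * is a hole (blkno == 0), the file is synced first so that no soft updates
 * structures describing the allocation past the truncation point remain,
 * as the patch's own comment explains.  The stand-alone program below only
 * illustrates the lblkno()/blkoff() arithmetic; it assumes a 32 KiB block
 * size and a hypothetical length, and redoes the shift and mask by hand
 * rather than using the real fs.h macros.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    const int bshift = 15;                      /* log2(32768), i.e. fs_bshift */
    const int64_t qbmask = (1 << bshift) - 1;   /* corresponds to fs_qbmask */
    const int64_t length = 100000;              /* hypothetical new file length */
    int64_t lbn = (length - 1) >> bshift;       /* lblkno(fs, length - 1) */
    int64_t off = length & qbmask;              /* blkoff(fs, length) */

    /* prints: last lbn 3, offset in last block 1696 */
    printf("last lbn %jd, offset in last block %jd\n",
        (intmax_t)lbn, (intmax_t)off);
    return (0);
}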
*/ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); @@ -355,15 +401,17 @@ } } /* - * Shorten the size of the file. If the file is not being - * truncated to a block boundary, the contents of the - * partial block following the end of the file must be - * zero'ed in case it ever becomes accessible again because - * of subsequent file growth. Directories however are not + * Shorten the size of the file. If the last block of the + * shortened file is unallocated, we must allocate it. + * Additionally, if the file is not being truncated to a + * block boundary, the contents of the partial block + * following the end of the file must be zero'ed in + * case it ever becomes accessible again because of + * subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. */ offset = blkoff(fs, length); - if (offset == 0) { + if (blkno != 0 && offset == 0) { ip->i_size = length; DIP_SET(ip, i_size, length); } else { @@ -387,7 +435,7 @@ ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); - if (vp->v_type != VDIR) + if (vp->v_type != VDIR && offset != 0) bzero((char *)bp->b_data + offset, (u_int)(size - offset)); /* Kirk's code has reallocbuf(bp, size, 1) here */ @@ -450,7 +498,7 @@ ip->i_size = osize; DIP_SET(ip, i_size, osize); - error = vtruncbuf(vp, cred, length, fs->fs_bsize); + error = vtruncbuf(vp, length, fs->fs_bsize); if (error && (allerror == 0)) allerror = error; @@ -470,7 +518,7 @@ blocksreleased += count; if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); - ffs_blkfree(ump, fs, ip->i_devvp, bn, + ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; @@ -491,7 +539,7 @@ continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); - ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number, + ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(bsize); } @@ -523,7 +571,7 @@ * required for the storage we're keeping. */ bn += numfrags(fs, newspace); - ffs_blkfree(ump, fs, ip->i_devvp, bn, + ffs_blkfree(ump, fs, ump->um_devvp, bn, oldspace - newspace, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(oldspace - newspace); } @@ -582,7 +630,7 @@ ufs2_daddr_t *countp; { struct buf *bp; - struct fs *fs = ip->i_fs; + struct fs *fs; struct vnode *vp; caddr_t copy = NULL; int i, nblocks, error = 0, allerror = 0; @@ -590,8 +638,10 @@ ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; ufs2_daddr_t *bap2 = NULL; -# define BAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? bap1[i] : bap2[i]) +#define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) + fs = ITOFS(ip); + /* * Calculate index in current block of last * block to be kept. 
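/*
 * The BAP() macro just above, and several later hunks in this sync, now pick
 * between the 32-bit UFS1 and 64-bit UFS2 block-pointer arrays with
 * I_IS_UFS1() instead of testing ip->i_ump->um_fstype directly.  A minimal
 * sketch of the idiom, using a hypothetical helper name:
 */
#if 0   /* illustration only */
static ufs2_daddr_t
indir_ptr(struct inode *ip, struct buf *bp, int i)
{
    /*
     * An indirect block holds ufs1_daddr_t entries on UFS1 and
     * ufs2_daddr_t entries on UFS2; widen UFS1 pointers on read.
     */
    if (I_IS_UFS1(ip))
        return (((ufs1_daddr_t *)bp->b_data)[i]);
    return (((ufs2_daddr_t *)bp->b_data)[i]);
}
#endif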
-1 indicates the entire @@ -613,6 +663,13 @@ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; /* pay for read */ bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; @@ -631,7 +688,7 @@ return (error); } - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)bp->b_data; else bap2 = (ufs2_daddr_t *)bp->b_data; @@ -639,7 +696,7 @@ copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); for (i = last + 1; i < NINDIR(fs); i++) - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) bap1[i] = 0; else bap2[i] = 0; @@ -650,7 +707,7 @@ if (error) allerror = error; } - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)copy; else bap2 = (ufs2_daddr_t *)copy; @@ -670,7 +727,7 @@ allerror = error; blocksreleased += blkcount; } - ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize, + ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } @@ -704,6 +761,6 @@ ffs_rdonly(struct inode *ip) { - return (ip->i_ump->um_fs->fs_ronly != 0); + return (ITOFS(ip)->fs_ronly != 0); } Modified: trunk/sys/ufs/ffs/ffs_rawread.c =================================================================== --- trunk/sys/ufs/ffs/ffs_rawread.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_rawread.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_rawread.c 318267 2017-05-14 12:00:00Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_rawread.c 318266 2017-05-14 11:51:30Z kib $"); #include <sys/param.h> #include <sys/systm.h> @@ -63,8 +63,7 @@ off_t offset, size_t len, struct thread *td, - struct buf *bp, - caddr_t sa); + struct buf *bp); static int ffs_rawread_main(struct vnode *vp, struct uio *uio); @@ -191,8 +190,7 @@ off_t offset, size_t len, struct thread *td, - struct buf *bp, - caddr_t sa) + struct buf *bp) { int error; u_int iolen; @@ -207,7 +205,7 @@ bsize = vp->v_mount->mnt_stat.f_iosize; ip = VTOI(vp); - dp = ip->i_devvp; + dp = ITODEVVP(ip); iolen = ((vm_offset_t) udata) & PAGE_MASK; bp->b_bcount = len; @@ -220,7 +218,6 @@ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; bp->b_data = udata; - bp->b_saveaddr = sa; blockno = offset / bsize; blockoff = (offset % bsize) / DEV_BSIZE; if ((daddr_t) blockno != blockno) { @@ -273,7 +270,6 @@ { int error, nerror; struct buf *bp, *nbp, *tbp; - caddr_t sa, nsa, tsa; u_int iolen; caddr_t udata; long resid; @@ -295,8 +291,6 @@ bp = NULL; nbp = NULL; - sa = NULL; - nsa = NULL; while (resid > 0) { @@ -303,10 +297,9 @@ if (bp == NULL) { /* Setup first read */ /* XXX: Leave some bufs for swap */ bp = getpbuf(&ffsrawbufcnt); - sa = bp->b_data; pbgetvp(vp, bp); error = ffs_rawread_readahead(vp, udata, offset, - resid, td, bp, sa); + resid, td, bp); if (error != 0) break; @@ -317,7 +310,6 @@ else nbp = NULL; if (nbp != NULL) { - nsa = nbp->b_data; pbgetvp(vp, nbp); nerror = ffs_rawread_readahead(vp, @@ -328,8 +320,7 @@ resid - bp->b_bufsize, td, - nbp, - nsa); + nbp); if (nerror) { pbrelvp(nbp); relpbuf(nbp, &ffsrawbufcnt); @@ -362,8 +353,7 @@ offset, bp->b_bufsize - iolen, td, - bp, - sa); + bp); if (error != 0) break; } else if (nbp != NULL) { /* Complete read with readahead */ @@ -372,10 +362,6 @@ bp = nbp; nbp = 
tbp; - tsa = sa; - sa = nsa; - nsa = tsa; - if (resid <= bp->b_bufsize) { /* No more readaheads */ pbrelvp(nbp); relpbuf(nbp, &ffsrawbufcnt); @@ -389,8 +375,7 @@ resid - bp->b_bufsize, td, - nbp, - nsa); + nbp); if (nerror != 0) { pbrelvp(nbp); relpbuf(nbp, &ffsrawbufcnt); @@ -401,7 +386,7 @@ break; } else if (resid > 0) { /* More to read, no readahead */ error = ffs_rawread_readahead(vp, udata, offset, - resid, td, bp, sa); + resid, td, bp); if (error != 0) break; } @@ -450,7 +435,7 @@ /* Only handle sector aligned reads */ ip = VTOI(vp); - secsize = ip->i_devvp->v_bufobj.bo_bsize; + secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; if ((uio->uio_offset & (secsize - 1)) == 0 && (uio->uio_resid & (secsize - 1)) == 0) { @@ -470,7 +455,7 @@ } partialbytes = ((unsigned int) ip->i_size) % - ip->i_fs->fs_bsize; + ITOFS(ip)->fs_bsize; blockbytes = (int) filebytes - partialbytes; if (blockbytes > 0) { skipbytes = uio->uio_resid - Modified: trunk/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- trunk/sys/ufs/ffs/ffs_snapshot.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_snapshot.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -35,7 +35,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_snapshot.c 322132 2017-08-07 02:29:09Z mckusick $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_snapshot.c 342819 2019-01-06 22:34:47Z mckusick $"); #include "opt_quota.h" @@ -301,9 +301,10 @@ return (error); } vp = nd.ni_vp; + vnode_create_vobject(nd.ni_vp, fs->fs_size, td); vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); - devvp = ip->i_devvp; + devvp = ITODEVVP(ip); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. @@ -559,7 +560,7 @@ } VI_UNLOCK(xvp); if (snapdebug) - vprint("ffs_snapshot: busy vnode", xvp); + vn_printf(xvp, "ffs_snapshot: busy vnode "); if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0); @@ -588,7 +589,7 @@ } } snaplistsize += 1; - if (xp->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else @@ -621,7 +622,7 @@ goto out1; } xp = VTOI(xvp); - if (xp->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else @@ -707,7 +708,7 @@ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; - if (xp->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else @@ -736,7 +737,7 @@ * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else @@ -888,9 +889,9 @@ int error, len, loc, indiroff; ip = VTOI(vp); - fs = ip->i_fs; - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, KERNCRED, &bp); + fs = ITOFS(ip); + error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); @@ -900,7 +901,7 @@ brelse(bp); return (EIO); } - UFS_LOCK(ip->i_ump); + UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed @@ -909,7 +910,7 @@ * fsck is slightly more consistent. 
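/*
 * Two hunks in this sync (ffs_indirtrunc() above and setup_trunc_indir()
 * later in ffs_softdep.c) add per-process resource accounting for the block
 * reads they issue.  The idiom is restated below, wrapped in a hypothetical
 * helper purely for illustration: when the RACCT framework is enabled, the
 * read is charged to the current process via racct_add_buf() alongside the
 * existing ru_inblock bump.
 */
#if 0   /* illustration only */
static void
charge_buf_read(struct buf *bp)
{
#ifdef RACCT
    if (racct_enable) {                 /* global RACCT toggle */
        PROC_LOCK(curproc);
        racct_add_buf(curproc, bp, 0);  /* 0: this is a read, not a write */
        PROC_UNLOCK(curproc);
    }
#endif
    curthread->td_ru.ru_inblock++;      /* classic rusage accounting */
}
#endif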
*/ fs->fs_cs(fs, cg) = cgp->cg_cs; - UFS_UNLOCK(ip->i_ump); + UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], @@ -953,7 +954,7 @@ } indiroff = 0; } - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; @@ -1258,7 +1259,7 @@ *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, + ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); @@ -1542,7 +1543,7 @@ *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, + ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); @@ -1566,7 +1567,7 @@ * Find snapshot in incore list. */ xp = NULL; - sn = ip->i_devvp->v_rdev->si_snapdata; + sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) @@ -1579,8 +1580,8 @@ /* * Delete snapshot inode from superblock. Keep list dense. */ - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); + fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) @@ -1612,8 +1613,8 @@ struct snapdata *sn; ip = VTOI(vp); - fs = ip->i_fs; - devvp = ip->i_devvp; + fs = ITOFS(ip); + devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so @@ -1651,7 +1652,7 @@ if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && - ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); @@ -1669,7 +1670,7 @@ else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; @@ -1676,7 +1677,7 @@ if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && - ffs_snapblkfree(fs, ip->i_devvp, dblk, + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= @@ -1691,7 +1692,7 @@ if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && - ffs_snapblkfree(fs, ip->i_devvp, dblk, + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; @@ -1786,7 +1787,7 @@ if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; @@ -1811,7 +1812,7 @@ if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; - } else if (ip->i_ump->um_fstype == UFS1) { + } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); @@ -1859,7 +1860,7 @@ } if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); - } else if (ip->i_ump->um_fstype == UFS1) { + } 
else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { @@ -1991,15 +1992,19 @@ continue; } ip = VTOI(vp); - if (!IS_SNAPSHOT(ip) || ip->i_size == + if (vp->v_type != VREG) { + reason = "non-file snapshot"; + } else if (!IS_SNAPSHOT(ip)) { + reason = "non-snapshot"; + } else if (ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { - if (!IS_SNAPSHOT(ip)) { - reason = "non-snapshot"; - } else { - reason = "old format snapshot"; - (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); - (void)ffs_syncvnode(vp, MNT_WAIT, 0); - } + reason = "old format snapshot"; + (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); + (void)ffs_syncvnode(vp, MNT_WAIT, 0); + } else { + reason = NULL; + } + if (reason != NULL) { printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); @@ -2141,7 +2146,7 @@ sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); - fs = TAILQ_FIRST(&sn->sn_head)->i_fs; + fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; @@ -2268,7 +2273,7 @@ return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); - fs = ip->i_fs; + fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; @@ -2342,7 +2347,7 @@ if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; @@ -2498,15 +2503,19 @@ { struct inode *ip = VTOI(vp); struct bio *bip; + struct fs *fs; + ip = VTOI(vp); + fs = ITOFS(ip); + bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; - bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); + bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; - g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); + g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); Modified: trunk/sys/ufs/ffs/ffs_softdep.c =================================================================== --- trunk/sys/ufs/ffs/ffs_softdep.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_softdep.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -41,7 +41,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_softdep.c 324612 2017-10-13 22:40:57Z jhb $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_softdep.c 357034 2020-01-23 06:24:11Z mckusick $"); #include "opt_ffs.h" #include "opt_quota.h" @@ -70,6 +70,7 @@ #include <sys/namei.h> #include <sys/priv.h> #include <sys/proc.h> +#include <sys/racct.h> #include <sys/rwlock.h> #include <sys/stat.h> #include <sys/sysctl.h> @@ -901,8 +902,10 @@ struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); +static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); static void schedule_cleanup(struct mount *); -static void softdep_ast_cleanup_proc(void); +static void softdep_ast_cleanup_proc(struct thread *); +static struct ufsmount *softdep_bp_to_mp(struct buf *bp); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode *); @@ -1105,7 +1108,7 @@ LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { if 
(wk->wk_type == D_JSEGDEP) jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); - if (wk->wk_type == D_FREEDEP) + else if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } @@ -1534,10 +1537,10 @@ struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); - WORKLIST_REMOVE(wk); if (ump->softdep_worklist_tail == wk) ump->softdep_worklist_tail = (struct worklist *)wk->wk_list.le_prev; + WORKLIST_REMOVE(wk); ump->softdep_on_worklist -= 1; } @@ -1835,11 +1838,11 @@ wake_worklist(wk); add_to_worklist(wk, WK_HEAD); } - LIST_REMOVE(&sentinel, wk_list); /* Sentinal could've become the tail from remove_from_worklist. */ if (ump->softdep_worklist_tail == &sentinel) ump->softdep_worklist_tail = (struct worklist *)sentinel.wk_list.le_prev; + LIST_REMOVE(&sentinel, wk_list); PRELE(curproc); return (matchcnt); } @@ -2893,7 +2896,6 @@ if (ump->softdep_journal_tail == wk) ump->softdep_journal_tail = (struct worklist *)wk->wk_list.le_prev; - WORKLIST_REMOVE(wk); ump->softdep_on_journal -= 1; } @@ -3994,7 +3996,7 @@ struct jmvref *jmvref; jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); - workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); + workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; jmvref->jm_parent = dp->i_number; jmvref->jm_ino = ino; @@ -4021,7 +4023,7 @@ struct jremref *jremref; jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); - workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); + workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp)); jremref->jr_state = ATTACHED; newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, nlink, ip->i_mode); @@ -4057,7 +4059,7 @@ struct jaddref *jaddref; jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); - workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); + workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp)); jaddref->ja_state = ATTACHED; jaddref->ja_mkdir = NULL; newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); @@ -4645,7 +4647,7 @@ KASSERT(ip->i_nlink >= ip->i_effnlink, ("inodedep_lookup_ip: bad delta")); - (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, + (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, &inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); @@ -4668,12 +4670,12 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_create called on non-softdep filesystem")); KASSERT(ip->i_nlink == 1, ("softdep_setup_create: Invalid link count.")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, @@ -4682,7 +4684,7 @@ ("softdep_setup_create: No addref structure present.")); } softdep_prelink(dvp, NULL); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4700,7 +4702,7 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_dotdot_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; @@ -4711,13 +4713,13 @@ if (DOINGSUJ(dvp)) jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = 
inodedep_lookup_ip(dp); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4735,7 +4737,7 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; @@ -4742,13 +4744,13 @@ if (DOINGSUJ(dvp)) jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, ip->i_mode); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4768,7 +4770,7 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); dotaddref = dotdotaddref = NULL; @@ -4780,7 +4782,7 @@ dp->i_effnlink - 1, dp->i_mode); dotdotaddref->ja_state |= MKDIR_PARENT; } - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, @@ -4798,7 +4800,7 @@ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); softdep_prelink(ITOV(dp), NULL); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4812,14 +4814,14 @@ { struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_rmdir called on non-softdep filesystem")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4833,14 +4835,14 @@ { struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_unlink called on non-softdep filesystem")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4856,10 +4858,10 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, ("softdep_revert_create called on non-softdep filesystem")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, @@ -4868,7 +4870,7 @@ ("softdep_revert_create: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4884,10 +4886,10 @@ struct jaddref *jaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_link called on non-softdep filesystem")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, @@ -4896,7 +4898,7 @@ ("softdep_revert_link: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } - 
FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4913,11 +4915,11 @@ struct jaddref *dotaddref; struct vnode *dvp; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, @@ -4939,7 +4941,7 @@ ("softdep_revert_mkdir: dot addref parent mismatch")); cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); } - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -4951,12 +4953,12 @@ struct inode *ip; { - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_rmdir called on non-softdep filesystem")); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ITOUMP(dp)); } /* @@ -5007,10 +5009,10 @@ struct mount *mp; struct fs *fs; - mp = UFSTOVFS(ip->i_ump); + mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inomapdep called on non-softdep filesystem")); - fs = ip->i_ump->um_fs; + fs = VFSTOUFS(mp)->um_fs; jaddref = NULL; /* @@ -5042,7 +5044,7 @@ bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ITOUMP(ip)); if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep))) panic("softdep_setup_inomapdep: dependency %p for new" "inode already exists", inodedep); @@ -5057,7 +5059,7 @@ } inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; - FREE_LOCK(ip->i_ump); + FREE_LOCK(ITOUMP(ip)); } /* @@ -5279,7 +5281,7 @@ ufs_lbn_t lbn; lbn = bp->b_lblkno; - mp = UFSTOVFS(ip->i_ump); + mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) @@ -5291,7 +5293,7 @@ "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " "off %jd newsize %ld oldsize %d", ip->i_number, newblkno, oldblkno, off, newsize, oldsize); - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ITOUMP(ip)); if (off >= NDADDR) { if (lbn > 0) panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", @@ -5363,7 +5365,7 @@ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ITOUMP(ip)); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { @@ -5377,7 +5379,7 @@ if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ITOUMP(ip)); } /* @@ -5541,10 +5543,10 @@ struct jfreefrag *jfreefrag; struct fs *fs; - fs = ip->i_fs; + fs = ITOFS(ip); jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, M_SOFTDEP_FLAGS); - workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); + workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; jfreefrag->fr_ino = ip->i_number; @@ -5567,16 +5569,18 @@ ufs_lbn_t lbn; { struct freefrag *freefrag; + struct ufsmount *ump; struct fs *fs; CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", ip->i_number, blkno, size, lbn); - fs = ip->i_fs; + ump = ITOUMP(ip); + fs = ump->um_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 
panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); - workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); + workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; @@ -5584,7 +5588,7 @@ freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; - if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { + if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) newjfreefrag(freefrag, ip, blkno, size, lbn); } else { @@ -5656,9 +5660,11 @@ struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; + struct ufsmount *ump; ufs_lbn_t lbn; - mp = UFSTOVFS(ip->i_ump); + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocext called on non-softdep filesystem")); KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR", @@ -5670,7 +5676,7 @@ else freefrag = NULL; - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, @@ -5721,7 +5727,7 @@ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { @@ -5734,7 +5740,7 @@ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } /* @@ -5779,11 +5785,11 @@ struct jnewblk *jnewblk; if (oldblkno) - freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); + freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn); else freefrag = NULL; - ACQUIRE_LOCK(ip->i_ump); - if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) + ACQUIRE_LOCK(ITOUMP(ip)); + if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) panic("new_allocindir: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("newallocindir: newblk already initialized")); @@ -5823,8 +5829,10 @@ struct allocindir *aip; struct pagedep *pagedep; struct mount *mp; + struct ufsmount *ump; - mp = UFSTOVFS(ip->i_ump); + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocindir_page called on non-softdep filesystem")); KASSERT(lbn == nbp->b_lblkno, @@ -5845,7 +5853,7 @@ pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); if (freefrag) handle_workitem_freefrag(freefrag); } @@ -5864,9 +5872,11 @@ { struct inodedep *inodedep; struct allocindir *aip; + struct ufsmount *ump; ufs_lbn_t lbn; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_allocindir_meta called on non-softdep filesystem")); CTR3(KTR_SUJ, "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", @@ -5874,12 +5884,11 @@ lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0, lbn); - inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, - &inodedep); + inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 
panic("softdep_setup_allocindir_meta: Block already existed"); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } static void @@ -5921,7 +5930,7 @@ LOCK_OWNED(ump); indirdep = NULL; newindirdep = NULL; - fs = ip->i_fs; + fs = ump->um_fs; for (;;) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) @@ -5943,7 +5952,7 @@ M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) newindirdep->ir_state |= UFS1FMT; TAILQ_INIT(&newindirdep->ir_trunc); newindirdep->ir_saveddata = NULL; @@ -5958,7 +5967,7 @@ } newindirdep->ir_freeblks = NULL; newindirdep->ir_savebp = - getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); + getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); newindirdep->ir_bp = bp; BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); @@ -5996,10 +6005,12 @@ struct allocindir *oldaip; struct freefrag *freefrag; struct mount *mp; + struct ufsmount *ump; - LOCK_OWNED(ip->i_ump); - mp = UFSTOVFS(ip->i_ump); - fs = ip->i_fs; + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + fs = ump->um_fs; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), @@ -6084,6 +6095,7 @@ int i; int needj; { + struct ufsmount *ump; ufs2_daddr_t blkno; int frags; @@ -6091,9 +6103,10 @@ if (blkno == 0) return; DIP_SET(ip, i_db[i], 0); - frags = sblksize(ip->i_fs, ip->i_size, i); - frags = numfrags(ip->i_fs, frags); - newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); + ump = ITOUMP(ip); + frags = sblksize(ump->um_fs, ip->i_size, i); + frags = numfrags(ump->um_fs, frags); + newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); } static inline void @@ -6103,6 +6116,7 @@ int i; int needj; { + struct ufsmount *ump; ufs2_daddr_t blkno; int frags; @@ -6110,9 +6124,10 @@ if (blkno == 0) return; ip->i_din2->di_extb[i] = 0; - frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); - frags = numfrags(ip->i_fs, frags); - newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); + ump = ITOUMP(ip); + frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); + frags = numfrags(ump->um_fs, frags); + newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); } static inline void @@ -6123,6 +6138,7 @@ ufs_lbn_t lbn; int needj; { + struct ufsmount *ump; ufs2_daddr_t blkno; blkno = DIP(ip, i_ib[i]); @@ -6129,7 +6145,8 @@ if (blkno == 0) return; DIP_SET(ip, i_ib[i], 0); - newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, + ump = ITOUMP(ip); + newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, 0, needj); } @@ -6152,7 +6169,7 @@ freeblks->fb_inum = ip->i_number; freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); - freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_devvp = ITODEVVP(ip); freeblks->fb_chkcnt = 0; freeblks->fb_len = 0; @@ -6207,6 +6224,7 @@ struct freework *freework; struct newblk *newblk; struct mount *mp; + struct ufsmount *ump; struct buf *bp; uint8_t *start; uint8_t *end; @@ -6220,6 +6238,7 @@ if (blkno == 0) return (0); mp = freeblks->fb_list.wk_mp; + ump = VFSTOUFS(mp); bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); @@ -6229,6 +6248,13 @@ vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if 
(racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { @@ -6237,22 +6263,21 @@ } } level = lbn_level(lbn); - lbnadd = lbn_offset(ip->i_fs, level); + lbnadd = lbn_offset(ump->um_fs, level); /* * Compute the offset of the last block we want to keep. Store * in the freework the first block we want to completely free. */ off = (lastlbn - -(lbn + level)) / lbnadd; - if (off + 1 == NINDIR(ip->i_fs)) + if (off + 1 == NINDIR(ump->um_fs)) goto nowork; - freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1, - 0); + freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0); /* * Link the freework into the indirdep. This will prevent any new * allocations from proceeding until we are finished with the * truncate and the block is written. */ - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ump); indirdep = indirdep_lookup(mp, ip, bp); if (indirdep->ir_freeblks) panic("setup_trunc_indir: indirdep already truncated."); @@ -6264,12 +6289,12 @@ * live on this newblk. */ if ((indirdep->ir_state & DEPCOMPLETE) == 0) { - newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); + newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk); LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) trunc_indirdep(indirn, freeblks, bp, off); } else trunc_indirdep(indirdep, freeblks, bp, off); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); /* * Creation is protected by the buf lock. The saveddata is only * needed if a full truncation follows a partial truncation but it @@ -6280,7 +6305,7 @@ M_SOFTDEP_FLAGS); nowork: /* Fetch the blkno of the child and the zero start offset. */ - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { blkno = ((ufs1_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; } else { @@ -6490,9 +6515,9 @@ ufs_lbn_t tmpval, lbn, lastlbn; int frags, lastoff, iboff, allocblock, needj, error, i; - fs = ip->i_fs; - ump = ip->i_ump; + ump = ITOUMP(ip); mp = UFSTOVFS(ump); + fs = ump->um_fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_journal_freeblocks called on non-softdep filesystem")); vp = ITOV(ip); @@ -6572,13 +6597,13 @@ blkno = DIP(ip, i_db[lastlbn]); if (blkno && oldfrags != frags) { oldfrags -= frags; - oldfrags = numfrags(ip->i_fs, oldfrags); - blkno += numfrags(ip->i_fs, frags); + oldfrags = numfrags(fs, oldfrags); + blkno += numfrags(fs, frags); newfreework(ump, freeblks, NULL, lastlbn, blkno, oldfrags, 0, needj); if (needj) adjust_newfreework(freeblks, - numfrags(ip->i_fs, frags)); + numfrags(fs, frags)); } else if (blkno == 0) allocblock = 1; } @@ -6595,7 +6620,7 @@ DIP_SET(ip, i_size, ip->i_size); datablocks = DIP(ip, i_blocks) - extblocks; if (length != 0) - datablocks = blkcount(ip->i_fs, datablocks, length); + datablocks = blkcount(fs, datablocks, length); freeblks->fb_len = length; } if ((flags & IO_EXT) != 0) { @@ -6622,7 +6647,7 @@ */ ufs_itimes(vp); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); - error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { brelse(bp); @@ -6762,20 +6787,22 @@ struct inode *ip; { struct jfsync *jfsync; + struct ufsmount *ump; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_journal_fsync called on non-softdep filesystem")); if ((ip->i_flag 
& IN_TRUNCATED) == 0) return; ip->i_flag &= ~IN_TRUNCATED; jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); - workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); + workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); jfsync->jfs_size = ip->i_size; jfsync->jfs_ino = ip->i_number; - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ump); add_to_journal(&jfsync->jfs_list); jwait(&jfsync->jfs_list, MNT_WAIT); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } /* @@ -6827,7 +6854,7 @@ ufs_lbn_t tmpval; ufs_lbn_t lbn; - ump = ip->i_ump; + ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_freeblocks called on non-softdep filesystem")); @@ -6834,7 +6861,14 @@ CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", ip->i_number, length); KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); - fs = ip->i_fs; + fs = ump->um_fs; + if ((error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) != 0) { + brelse(bp); + softdep_error("softdep_setup_freeblocks", error); + return; + } freeblks = newfreeblks(mp, ip); extblocks = 0; datablocks = 0; @@ -6867,16 +6901,10 @@ UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* - * Push the zero'ed inode to to its disk buffer so that we are free + * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ - if ((error = bread(ip->i_devvp, - fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), - (int)fs->fs_bsize, NOCRED, &bp)) != 0) { - brelse(bp); - softdep_error("softdep_setup_freeblocks", error); - } if (ump->um_fstype == UFS1) { dp1 = ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); @@ -6969,7 +6997,7 @@ off_t end, extend; vp = ITOV(ip); - fs = ip->i_fs; + fs = ITOFS(ip); extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); if ((flags & IO_EXT) != 0) vn_pages_remove(vp, extend, 0); @@ -7219,9 +7247,9 @@ struct worklist *wk, *wkn; struct ufsmount *ump; - if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) + ump = softdep_bp_to_mp(bp); + if (ump == NULL) goto done; - ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { switch (wk->wk_type) { @@ -7505,7 +7533,7 @@ struct freeblks *freeblks; struct ufsmount *ump; - ump = ip->i_ump; + ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_freefile called on non-softdep filesystem")); /* @@ -7516,10 +7544,10 @@ workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; - freefile->fx_devvp = ip->i_devvp; + freefile->fx_devvp = ump->um_devvp; LIST_INIT(&freefile->fx_jwork); UFS_LOCK(ump); - ip->i_fs->fs_pendinginodes += 1; + ump->um_fs->fs_pendinginodes += 1; UFS_UNLOCK(ump); /* @@ -8439,8 +8467,8 @@ struct mount *mp; int isindir; - ump = dp->i_ump; - mp = UFSTOVFS(ump); + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_add called on non-softdep filesystem")); /* @@ -8453,7 +8481,7 @@ } jaddref = NULL; mkdir1 = mkdir2 = NULL; - fs = dp->i_fs; + fs = ump->um_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); dap = malloc(sizeof(struct diradd), M_DIRADD, @@ -8606,10 +8634,12 @@ struct diradd *dap; struct direct *de; struct mount *mp; + struct ufsmount *ump; ufs_lbn_t lbn; int flags; - mp = UFSTOVFS(dp->i_ump); + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, 
("softdep_change_directoryentry_offset called on " "non-softdep filesystem")); @@ -8627,11 +8657,11 @@ dp->i_offset + (oldloc - base), dp->i_offset + (newloc - base)); } - lbn = lblkno(dp->i_fs, dp->i_offset); - offset = blkoff(dp->i_fs, dp->i_offset); + lbn = lblkno(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, dp->i_offset); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); - ACQUIRE_LOCK(dp->i_ump); + ACQUIRE_LOCK(ump); if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) goto done; dap = diradd_lookup(pagedep, oldoffset); @@ -8653,7 +8683,7 @@ add_to_journal(&jmvref->jm_list); } bcopy(oldloc, newloc, entrysize); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ump); } /* @@ -8896,9 +8926,11 @@ { struct dirrem *dirrem, *prevdirrem; struct inodedep *inodedep; + struct ufsmount *ump; int direct; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_remove called on non-softdep filesystem")); /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want @@ -8910,8 +8942,7 @@ * Add the dirrem to the inodedep's pending remove list for quick * discovery later. */ - if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, - &inodedep) == 0) + if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_remove: Lost inodedep."); KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); dirrem->dm_state |= ONDEPLIST; @@ -8931,7 +8962,7 @@ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, @@ -8938,7 +8969,7 @@ prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; direct = LIST_EMPTY(&dirrem->dm_jremrefhd); - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); if (direct) handle_workitem_remove(dirrem, 0); } @@ -8980,8 +9011,7 @@ struct diradd *dap; struct worklist *wk; - if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, - &pagedep) == 0) + if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) return (jremref); dap = diradd_lookup(pagedep, DOTDOT_OFFSET); if (dap == NULL) @@ -9013,9 +9043,10 @@ struct ufsmount *ump; struct mkdir *mkdir; struct diradd *dap; + struct mount *mp; - if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, - &inodedep) == 0) + mp = ITOVFS(ip); + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) return (jremref); dap = inodedep->id_mkdiradd; if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) @@ -9030,8 +9061,7 @@ if ((jaddref = mkdir->md_jaddref) != NULL) { mkdir->md_jaddref = NULL; jaddref->ja_state &= ~MKDIR_PARENT; - if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, - &inodedep) == 0) + if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_mkdir_dotdot: Lost parent inodedep"); if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { journal_jremref(dirrem, jremref, inodedep); @@ -9102,6 +9132,7 @@ struct jremref *dotremref; struct jremref *dotdotremref; struct vnode *dvp; + struct ufsmount *ump; /* * Whiteouts have no deletion dependencies. 
@@ -9109,6 +9140,8 @@ if (ip == NULL) panic("newdirrem: whiteout"); dvp = ITOV(dp); + ump = ITOUMP(dp); + /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and @@ -9116,11 +9149,11 @@ * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ - ACQUIRE_LOCK(ip->i_ump); - if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM)) - schedule_cleanup(ITOV(dp)->v_mount); + ACQUIRE_LOCK(ump); + if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) + schedule_cleanup(UFSTOVFS(ump)); else - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); @@ -9150,10 +9183,10 @@ jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 1); } - ACQUIRE_LOCK(ip->i_ump); - lbn = lblkno(dp->i_fs, dp->i_offset); - offset = blkoff(dp->i_fs, dp->i_offset); - pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, + ACQUIRE_LOCK(ump); + lbn = lblkno(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, dp->i_offset); + pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; dirrem->dm_offset = offset; @@ -9260,9 +9293,11 @@ struct inodedep *inodedep; struct jaddref *jaddref; struct mount *mp; + struct ufsmount *ump; - offset = blkoff(dp->i_fs, dp->i_offset); - mp = UFSTOVFS(dp->i_ump); + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); + offset = blkoff(ump->um_fs, dp->i_offset); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); @@ -9312,7 +9347,7 @@ if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } - FREE_LOCK(dp->i_ump); + FREE_LOCK(ump); return; } /* @@ -9386,7 +9421,7 @@ */ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) merge_diradd(inodedep, dap); - FREE_LOCK(dp->i_ump); + FREE_LOCK(ump); } /* @@ -9400,16 +9435,17 @@ struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; + struct ufsmount *ump; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_change_linkcnt called on non-softdep filesystem")); - ACQUIRE_LOCK(ip->i_ump); - inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, - &inodedep); + ACQUIRE_LOCK(ump); + inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } /* @@ -9741,14 +9777,20 @@ /* * Move all dependencies waiting on the remove to complete * from the dirrem to the inode inowait list to be completed - * after the inode has been updated and written to disk. Any - * marked MKDIR_PARENT are saved to be completed when the .. ref - * is removed. + * after the inode has been updated and written to disk. + * + * Any marked MKDIR_PARENT are saved to be completed when the + * dotdot ref is removed unless DIRCHG is specified. For + * directory change operations there will be no further + * directory writes and the jsegdeps need to be moved along + * with the rest to be completed when the inode is free or + * stable in the inode free list. 
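/*
 * The code below implements the distinction the rewritten comment above
 * describes: work items marked MKDIR_PARENT are deferred to the
 * dotdot-removal list only when the remove is not a directory-change
 * (DIRCHG) operation; for DIRCHG removes they stay with the rest of the
 * work, since no further directory write will follow.  In sketch form:
 */
#if 0   /* illustration only, mirrors the hunk below */
    if ((dirrem->dm_state & DIRCHG) == 0 &&
        (wk->wk_state & MKDIR_PARENT) != 0) {
        wk->wk_state &= ~MKDIR_PARENT;
        WORKLIST_INSERT(&dotdotwk, wk); /* completed when the .. ref goes */
        continue;
    }
    /* everything else moves to the inode's inowait list (see comment above) */
#endif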
*/ LIST_INIT(&dotdotwk); while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { WORKLIST_REMOVE(wk); - if (wk->wk_state & MKDIR_PARENT) { + if ((dirrem->dm_state & DIRCHG) == 0 && + wk->wk_state & MKDIR_PARENT) { wk->wk_state &= ~MKDIR_PARENT; WORKLIST_INSERT(&dotdotwk, wk); continue; @@ -9938,9 +9980,9 @@ panic("softdep_disk_io_initiation: Writing buffer with " "background write in progress: %p", bp); - if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) + ump = softdep_bp_to_mp(bp); + if (ump == NULL) return; - ump = VFSTOUFS(wk->wk_mp); marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ @@ -10181,22 +10223,22 @@ prevlbn = adp->ad_offset; if (adp->ad_offset < NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "direct pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= NDADDR && dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) - panic("%s: indirect pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "indirect pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset - NDADDR, dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs1: " + "Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10219,7 +10261,8 @@ for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10227,7 +10270,8 @@ #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } @@ -10349,18 +10393,18 @@ adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) - panic("softdep_write_inodeblock: lbn order"); + panic("initiate_write_inodeblock_ufs2: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "ext pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10381,7 +10425,8 @@ for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } @@ -10414,22 +10459,22 @@ prevlbn = adp->ad_offset; if (adp->ad_offset < NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - 
panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "direct pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= NDADDR && dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) - panic("%s indirect pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock:", + panic("initiate_write_inodeblock_ufs2: " + "indirect pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset - NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10452,7 +10497,8 @@ for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10460,7 +10506,8 @@ #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep3"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } @@ -10940,6 +10987,10 @@ struct freeblks *freeblks; struct buf *sbp; + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + return; + /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be processed. @@ -10946,6 +10997,7 @@ * But we do have to go through and roll forward any dependencies * that were rolled back before the disk write. */ + ACQUIRE_LOCK(ump); if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { @@ -10973,18 +11025,16 @@ continue; } } + FREE_LOCK(ump); return; } - if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) - return; - ump = VFSTOUFS(wk->wk_mp); LIST_INIT(&reattach); + /* - * This lock must not be released anywhere in this code segment. + * Ump SU lock must not be released anywhere in this code segment. */ sbp = NULL; owk = NULL; - ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); atomic_add_long(&dep_write[wk->wk_type], 1); @@ -11487,7 +11537,8 @@ panic("handle_written_inodeblock: bad size"); if (inodedep->id_savednlink > LINK_MAX) panic("handle_written_inodeblock: Invalid link count " - "%d for inodedep %p", inodedep->id_savednlink, inodedep); + "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink, + inodedep); if (fstype == UFS1) { if (dp1->di_nlink != inodedep->id_savednlink) { dp1->di_nlink = inodedep->id_savednlink; @@ -12104,21 +12155,22 @@ struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; + struct ufsmount *ump; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_load_inodeblock called on non-softdep filesystem")); /* * Check for alternate nlink count. 
*/ ip->i_effnlink = ip->i_nlink; - ACQUIRE_LOCK(ip->i_ump); - if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, - &inodedep) == 0) { - FREE_LOCK(ip->i_ump); + ACQUIRE_LOCK(ump); + if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) { + FREE_LOCK(ump); return; } ip->i_effnlink -= inodedep->id_nlinkdelta; - FREE_LOCK(ip->i_ump); + FREE_LOCK(ump); } /* @@ -12146,11 +12198,11 @@ struct fs *fs; int error; - ump = ip->i_ump; + ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_update_inodeblock called on non-softdep filesystem")); - fs = ip->i_fs; + fs = ump->um_fs; /* * Preserve the freelink that is on disk. clear_unlinked_inodedep() * does not have access to the in-core ip so must write directly into @@ -12315,9 +12367,9 @@ ufs_lbn_t lbn; ip = VTOI(vp); - fs = ip->i_fs; - ump = ip->i_ump; mp = vp->v_mount; + ump = VFSTOUFS(mp); + fs = ump->um_fs; if (MOUNTEDSOFTDEP(mp) == 0) return (0); ACQUIRE_LOCK(ump); @@ -12384,24 +12436,13 @@ FREE_LOCK(ump); if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ)) { - error = vfs_busy(mp, MBF_NOWAIT); - if (error != 0) { - vfs_ref(mp); - VOP_UNLOCK(vp, 0); - error = vfs_busy(mp, 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - vfs_rel(mp); - if (error != 0) - return (ENOENT); - if (vp->v_iflag & VI_DOOMED) { - vfs_unbusy(mp); - return (ENOENT); - } - } + /* + * Unmount cannot proceed after unlock because + * caller must have called vn_start_write(). + */ VOP_UNLOCK(vp, 0); error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ); - vfs_unbusy(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) @@ -12590,13 +12631,13 @@ int error; ip = VTOI(vp); - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_metadata called on non-softdep filesystem")); /* * Ensure that any direct block dependencies have been cleared, * truncations are started, and inode references are journaled. */ - ACQUIRE_LOCK(ip->i_ump); + ACQUIRE_LOCK(VFSTOUFS(vp->v_mount)); /* * Write all journal records to prevent rollbacks on devvp. */ @@ -12608,7 +12649,7 @@ * indirect blocks. */ process_truncates(vp); - FREE_LOCK(ip->i_ump); + FREE_LOCK(VFSTOUFS(vp->v_mount)); return (error); } @@ -12643,7 +12684,7 @@ return (EBUSY); return (0); } - ump = VTOI(vp)->i_ump; + ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); /* * As we hold the buffer locked, none of its dependencies @@ -13226,10 +13267,9 @@ { struct ufsmount *ump; struct mount *mp; - struct vnode *lvp, *mvp; long starttime; ufs2_daddr_t needed; - int error; + int error, failed_vnode; /* * If we are being called because of a process doing a @@ -13281,7 +13321,7 @@ * * Additionally, if we are unpriviledged and allocating space, * we need to ensure that we clean up enough blocks to get the - * needed number of blocks over the threshhold of the minimum + * needed number of blocks over the threshold of the minimum * number of blocks required to be kept free by the filesystem * (fs_minfree). */ @@ -13320,43 +13360,90 @@ * to the worklist that we can then process to reap addition * resources. We walk the vnodes associated with the mount point * until we get the needed worklist requests that we can reap. + * + * If there are several threads all needing to clean the same + * mount point, only one is allowed to walk the mount list. + * When several threads all try to walk the same mount list, + * they end up competing with each other and often end up in + * livelock. 
This approach ensures that forward progress is + * made at the cost of occational ENOSPC errors being returned + * that might otherwise have been avoided. */ + error = 1; if ((resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { - MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { - if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { - VI_UNLOCK(lvp); - continue; + ACQUIRE_LOCK(ump); + if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) { + ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE; + FREE_LOCK(ump); + failed_vnode = softdep_request_cleanup_flush(mp, ump); + ACQUIRE_LOCK(ump); + ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE; + FREE_LOCK(ump); + if (ump->softdep_on_worklist > 0) { + stat_cleanup_retries += 1; + if (!failed_vnode) + goto retry; } - if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, - curthread)) - continue; - if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ - vput(lvp); - continue; - } - (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); - vput(lvp); + } else { + FREE_LOCK(ump); + error = 0; } - lvp = ump->um_devvp; - if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { - VOP_FSYNC(lvp, MNT_NOWAIT, curthread); - VOP_UNLOCK(lvp, 0); - } - if (ump->softdep_on_worklist > 0) { - stat_cleanup_retries += 1; - goto retry; - } stat_cleanup_failures += 1; } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; UFS_LOCK(ump); - return (1); + return (error); } +/* + * Scan the vnodes for the specified mount point flushing out any + * vnodes that can be locked without waiting. Finally, try to flush + * the device associated with the mount point if it can be locked + * without waiting. + * + * We return 0 if we were able to lock every vnode in our scan. + * If we had to skip one or more vnodes, we return 1. + */ +static int +softdep_request_cleanup_flush(mp, ump) + struct mount *mp; + struct ufsmount *ump; +{ + struct thread *td; + struct vnode *lvp, *mvp; + int failed_vnode; + + failed_vnode = 0; + td = curthread; + MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { + if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { + VI_UNLOCK(lvp); + continue; + } + if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, + td) != 0) { + failed_vnode = 1; + continue; + } + if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ + vput(lvp); + continue; + } + (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); + vput(lvp); + } + lvp = ump->um_devvp; + if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { + VOP_FSYNC(lvp, MNT_NOWAIT, td); + VOP_UNLOCK(lvp, 0); + } + return (failed_vnode); +} + static bool softdep_excess_items(struct ufsmount *ump, int item) { @@ -13397,15 +13484,13 @@ } static void -softdep_ast_cleanup_proc(void) +softdep_ast_cleanup_proc(struct thread *td) { - struct thread *td; struct mount *mp; struct ufsmount *ump; int error; bool req; - td = curthread; while ((mp = td->td_su) != NULL) { td->td_su = NULL; error = vfs_busy(mp, MBF_NOWAIT); @@ -13443,6 +13528,10 @@ } vfs_unbusy(mp); } + if ((mp = td->td_su) != NULL) { + td->td_su = NULL; + vfs_rel(mp); + } } /* @@ -13688,7 +13777,7 @@ /* * Find the last inode in the block with dependencies. 
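
The hunk above replaces the open-coded vnode walk in softdep_request_cleanup() with a FLUSH_RC_ACTIVE guard so that only one thread at a time walks the mount's vnode list, trading the occasional spurious ENOSPC for freedom from livelock. The userland program below is a minimal sketch of just that guard pattern using pthreads; su_lock, flush_active and request_cleanup are illustrative stand-ins, not the kernel's API.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative stand-ins for the per-mount softdep state. */
static pthread_mutex_t su_lock = PTHREAD_MUTEX_INITIALIZER;
static bool flush_active = false;	/* models FLUSH_RC_ACTIVE */

/* Pretend to walk the vnode list and flush dirty buffers. */
static void
expensive_flush(void)
{
	usleep(100 * 1000);
}

/*
 * Models the new shape of softdep_request_cleanup(): only the first
 * caller walks the mount; concurrent callers back off instead of
 * competing.  Returns 1 if this thread flushed, 0 if it skipped.
 */
static int
request_cleanup(void)
{
	pthread_mutex_lock(&su_lock);
	if (flush_active) {
		pthread_mutex_unlock(&su_lock);
		return (0);	/* caller may have to report ENOSPC */
	}
	flush_active = true;
	pthread_mutex_unlock(&su_lock);

	expensive_flush();

	pthread_mutex_lock(&su_lock);
	flush_active = false;
	pthread_mutex_unlock(&su_lock);
	return (1);
}

static void *
worker(void *arg)
{
	printf("thread %ld: %s\n", (long)(intptr_t)arg,
	    request_cleanup() ? "flushed" : "skipped, flush already active");
	return (NULL);
}

int
main(void)
{
	pthread_t tid[4];

	for (long i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, worker, (void *)(intptr_t)i);
	for (long i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

Running it shows at most one thread doing the expensive walk while the others return immediately, which is the behaviour the comment in the diff describes.
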
*/ - firstino = inodedep->id_ino & ~(INOPB(fs) - 1); + firstino = rounddown2(inodedep->id_ino, INOPB(fs)); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; @@ -13764,12 +13853,14 @@ { struct buf *bp; struct fs *fs; + struct ufsmount *ump; int error; - KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_inode_append called on non-softdep filesystem")); - fs = ip->i_fs; - error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + fs = ump->um_fs; + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { bqrelse(bp); @@ -13797,6 +13888,58 @@ FREE_LOCK(ump); } +static struct ufsmount * +softdep_bp_to_mp(bp) + struct buf *bp; +{ + struct mount *mp; + struct vnode *vp; + + if (LIST_EMPTY(&bp->b_dep)) + return (NULL); + vp = bp->b_vp; + KASSERT(vp != NULL, + ("%s, buffer with dependencies lacks vnode", __func__)); + + /* + * The ump mount point is stable after we get a correct + * pointer, since bp is locked and this prevents unmount from + * proceeding. But to get to it, we cannot dereference bp->b_dep + * head wk_mp, because we do not yet own SU ump lock and + * workitem might be freed while dereferenced. + */ +retry: + switch (vp->v_type) { + case VCHR: + VI_LOCK(vp); + mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL; + VI_UNLOCK(vp); + if (mp == NULL) + goto retry; + break; + case VREG: + case VDIR: + case VLNK: + case VFIFO: + case VSOCK: + mp = vp->v_mount; + break; + case VBLK: + vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n"); + /* FALLTHROUGH */ + case VNON: + case VBAD: + case VMARKER: + mp = NULL; + break; + default: + vn_printf(vp, "unknown vnode type"); + mp = NULL; + break; + } + return (VFSTOUFS(mp)); +} + /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount @@ -13822,10 +13965,10 @@ struct diradd *dap; int i, retval; + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + return (0); retval = 0; - if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) - return (0); - ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { @@ -13960,7 +14103,7 @@ } out: FREE_LOCK(ump); - return retval; + return (retval); } /* @@ -13982,7 +14125,7 @@ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); /* - * Even if we sucessfully acquire bp here, we have dropped + * Even if we successfully acquire bp here, we have dropped * lock, which may violates our guarantee. */ if (error == 0) @@ -14009,11 +14152,7 @@ BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); - /* - * The lock argument must be bp->b_vp's mutex in - * this case. - */ -#ifdef DEBUG_VFS_LOCKS +#ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_BO_WLOCKED(bp->b_bufobj); #endif @@ -14170,25 +14309,14 @@ /* * Wait for pending output on a vnode to complete. - * Must be called with vnode lock and interlock locked. - * - * XXX: Should just be a call to bufobj_wwait(). 
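
The new softdep_bp_to_mp() added above derives the ufsmount from the buffer's vnode rather than dereferencing the first work item's wk_mp, which could be freed before the per-mount lock is taken. Below is a compilable toy model of that dispatch on vnode type; the structures are cut-down stand-ins, and in particular the real code reaches the device mount through vp->v_rdev->si_mountpt under the vnode interlock and retries while it is NULL.

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins; the real kernel types are much richer. */
enum vtype { VNON, VREG, VDIR, VLNK, VFIFO, VSOCK, VCHR, VBLK, VBAD };

struct mount { const char *name; };
struct vnode {
	enum vtype	 v_type;
	struct mount	*v_mount;	/* for ordinary file system vnodes */
	struct mount	*si_mountpt;	/* for device vnodes (VCHR) */
};
struct buf { struct vnode *b_vp; int b_has_deps; };

/*
 * Sketch of the idea: pick the mount point from the vnode backing the
 * buffer, depending on the vnode type, and return NULL for types that
 * cannot carry softdep work.
 */
static struct mount *
bp_to_mp(struct buf *bp)
{
	struct vnode *vp;

	if (!bp->b_has_deps)
		return (NULL);
	vp = bp->b_vp;
	switch (vp->v_type) {
	case VCHR:		/* device vnode: mount hung off the cdev */
		return (vp->si_mountpt);
	case VREG: case VDIR: case VLNK: case VFIFO: case VSOCK:
		return (vp->v_mount);
	default:		/* VBLK, VNON, VBAD, ...: no usable mount */
		return (NULL);
	}
}

int
main(void)
{
	struct mount m = { "ufs" };
	struct vnode file = { VREG, &m, NULL };
	struct buf bp = { &file, 1 };
	struct mount *mp = bp_to_mp(&bp);

	printf("mount: %s\n", mp != NULL ? mp->name : "(none)");
	return (0);
}
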
*/ static void drain_output(vp) struct vnode *vp; { - struct bufobj *bo; - bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "drain_output"); - ASSERT_BO_WLOCKED(bo); - - while (bo->bo_numoutput) { - bo->bo_flag |= BO_WWAIT; - msleep((caddr_t)&bo->bo_numoutput, - BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0); - } + (void)bufobj_wwait(&vp->v_bufobj, 0, 0); } /* @@ -14230,13 +14358,14 @@ static void inodedep_print(struct inodedep *inodedep, int verbose) { - db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd" " saveino %p\n", inodedep, inodedep->id_fs, inodedep->id_state, (intmax_t)inodedep->id_ino, (intmax_t)fsbtodb(inodedep->id_fs, ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), - inodedep->id_nlinkdelta, inodedep->id_savednlink, + (intmax_t)inodedep->id_nlinkdelta, + (intmax_t)inodedep->id_savednlink, inodedep->id_savedino1); if (verbose == 0) Modified: trunk/sys/ufs/ffs/ffs_subr.c =================================================================== --- trunk/sys/ufs/ffs/ffs_subr.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_subr.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_subr.c 207141 2010-04-24 07:05:35Z jeff $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_subr.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> @@ -56,10 +56,6 @@ #include <ufs/ffs/ffs_extern.h> #include <ufs/ffs/fs.h> -#ifdef KDB -void ffs_checkoverlap(struct buf *, struct inode *); -#endif - /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the @@ -79,7 +75,7 @@ int bsize, error; ip = VTOI(vp); - fs = ip->i_fs; + fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); @@ -107,7 +103,7 @@ ino_t ino; { - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { *ip->i_din1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = ip->i_din1->di_mode; @@ -166,37 +162,6 @@ } } -#ifdef KDB -void -ffs_checkoverlap(bp, ip) - struct buf *bp; - struct inode *ip; -{ - struct buf *ebp, *ep; - ufs2_daddr_t start, last; - struct vnode *vp; - - ebp = &buf[nbuf]; - start = bp->b_blkno; - last = start + btodb(bp->b_bcount) - 1; - for (ep = buf; ep < ebp; ep++) { - if (ep == bp || (ep->b_flags & B_INVAL) || - ep->b_vp == NULLVP) - continue; - vp = ip->i_devvp; - /* look for overlap */ - if (ep->b_bcount == 0 || ep->b_blkno > last || - ep->b_blkno + btodb(ep->b_bcount) <= start) - continue; - vprint("Disk overlap", vp); - printf("\tstart %jd, end %jd overlap start %jd, end %jd\n", - (intmax_t)start, (intmax_t)last, (intmax_t)ep->b_blkno, - (intmax_t)(ep->b_blkno + btodb(ep->b_bcount) - 1)); - panic("ffs_checkoverlap: Disk buffer overlap"); - } -} -#endif /* KDB */ - /* * block operations * Modified: trunk/sys/ufs/ffs/ffs_suspend.c =================================================================== --- trunk/sys/ufs/ffs/ffs_suspend.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_suspend.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -27,14 +27,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/ufs/ffs/ffs_suspend.c 306175 2016-09-22 10:42:40Z kib $ + * $FreeBSD: stable/11/sys/ufs/ffs/ffs_suspend.c 337483 2018-08-08 18:51:39Z kib $ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_suspend.c 306175 2016-09-22 10:42:40Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_suspend.c 337483 2018-08-08 18:51:39Z kib $"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/buf.h> #include <sys/ioccom.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -214,6 +215,31 @@ } static void +ffs_susp_unsuspend(struct mount *mp) +{ + struct ufsmount *ump; + + sx_assert(&ffs_susp_lock, SA_XLOCKED); + + /* + * XXX: The status is kept per-process; the vfs_write_resume() routine + * asserts that the resuming thread is the same one that called + * vfs_write_suspend(). The cdevpriv data, however, is attached + * to the file descriptor, e.g. is inherited during fork. Thus, + * it's possible that the resuming process will be different from + * the one that started the suspension. + * + * Work around by fooling the check in vfs_write_resume(). + */ + mp->mnt_susp_owner = curthread; + + vfs_write_resume(mp, 0); + ump = VFSTOUFS(mp); + ump->um_writesuspended = 0; + vfs_unbusy(mp); +} + +static void ffs_susp_dtor(void *data) { struct fs *fs; @@ -239,22 +265,7 @@ if (error != 0) panic("failed to unsuspend writes on %s", fs->fs_fsmnt); - /* - * XXX: The status is kept per-process; the vfs_write_resume() routine - * asserts that the resuming thread is the same one that called - * vfs_write_suspend(). The cdevpriv data, however, is attached - * to the file descriptor, e.g. is inherited during fork. Thus, - * it's possible that the resuming process will be different from - * the one that started the suspension. - * - * Work around by fooling the check in vfs_write_resume(). 
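
The XXX comment above explains why ffs_susp_unsuspend() re-stamps mnt_susp_owner: the suspension is owned by the thread that started it, but the cdevpriv that triggers the resume can be inherited across fork, so a different process may do the resuming. The toy pthread program below reproduces only that shape, an ownership assertion in the resume path satisfied by reassigning the owner first; it is an illustration of the workaround, not the kernel code path.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* Toy model of a suspension whose resume path asserts ownership. */
struct suspension {
	pthread_t	owner;		/* models mp->mnt_susp_owner */
	int		suspended;
};

static struct suspension susp;

static void
susp_suspend(struct suspension *s)
{
	s->suspended = 1;
	s->owner = pthread_self();
}

static void
susp_resume(struct suspension *s)
{
	/* The real vfs_write_resume() asserts the same ownership. */
	assert(pthread_equal(s->owner, pthread_self()));
	s->suspended = 0;
}

static void *
resumer(void *arg)
{
	(void)arg;
	/*
	 * A different thread wants to resume.  The workaround in the
	 * diff re-stamps the owner first so the ownership assertion
	 * in the resume path is satisfied.
	 */
	susp.owner = pthread_self();
	susp_resume(&susp);
	printf("resumed by a thread other than the suspender\n");
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	susp_suspend(&susp);
	pthread_create(&tid, NULL, resumer, NULL);
	pthread_join(tid, NULL);
	return (0);
}
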
- */ - mp->mnt_susp_owner = curthread; - - vfs_write_resume(mp, 0); - vfs_unbusy(mp); - ump->um_writesuspended = 0; - + ffs_susp_unsuspend(mp); sx_xunlock(&ffs_susp_lock); } @@ -294,7 +305,8 @@ break; } error = devfs_set_cdevpriv(mp, ffs_susp_dtor); - KASSERT(error == 0, ("devfs_set_cdevpriv failed")); + if (error != 0) + ffs_susp_unsuspend(mp); break; case UFSRESUME: error = devfs_get_cdevpriv((void **)&mp); Modified: trunk/sys/ufs/ffs/ffs_tables.c =================================================================== --- trunk/sys/ufs/ffs/ffs_tables.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_tables.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_tables.c 139825 2005-01-07 02:29:27Z imp $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_tables.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <ufs/ufs/dinode.h> Modified: trunk/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- trunk/sys/ufs/ffs/ffs_vfsops.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_vfsops.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_vfsops.c 309208 2016-11-27 09:14:52Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_vfsops.c 357030 2020-01-23 06:06:32Z mckusick $"); #include "opt_quota.h" #include "opt_ufs.h" @@ -55,6 +55,7 @@ #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/rwlock.h> +#include <sys/vmmeter.h> #include <security/mac/mac_framework.h> @@ -149,7 +150,7 @@ struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; - uint64_t mntorflags; + uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; @@ -240,7 +241,6 @@ if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); - DROP_GIANT(); g_topology_lock(); /* * Return to normal read-only mode. @@ -247,7 +247,6 @@ */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); - PICKUP_GIANT(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && @@ -295,7 +294,6 @@ } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); - DROP_GIANT(); g_topology_lock(); /* * Drop our write and exclusive access. @@ -302,7 +300,6 @@ */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); - PICKUP_GIANT(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; @@ -360,7 +357,6 @@ return (EPERM); } } - DROP_GIANT(); g_topology_lock(); /* * Request exclusive write access. 
@@ -367,30 +363,44 @@ */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); - PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); + error = vfs_write_suspend_umnt(mp); + if (error != 0) + return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); - mp->mnt_flag &= ~MNT_RDONLY; + saved_mnt_flag = MNT_RDONLY; + if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag & + MNT_ASYNC) != 0) + saved_mnt_flag |= MNT_ASYNC; + mp->mnt_flag &= ~saved_mnt_flag; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ - vn_finished_write(mp); + fs->fs_ronly = 1; + MNT_ILOCK(mp); + mp->mnt_flag |= saved_mnt_flag; + MNT_IUNLOCK(mp); + vfs_write_resume(mp, 0); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { - vn_finished_write(mp); + fs->fs_ronly = 1; + MNT_ILOCK(mp); + mp->mnt_flag |= saved_mnt_flag; + MNT_IUNLOCK(mp); + vfs_write_resume(mp, 0); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); - vn_finished_write(mp); + vfs_write_resume(mp, 0); } /* * Soft updates is incompatible with "async", @@ -434,7 +444,6 @@ } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); - DROP_GIANT(); g_topology_lock(); /* * Request write access. @@ -441,7 +450,6 @@ */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); - PICKUP_GIANT(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", @@ -540,7 +548,6 @@ ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; - DROP_GIANT(); g_topology_lock(); /* * Request write access. @@ -547,7 +554,6 @@ */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); - PICKUP_GIANT(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); @@ -798,11 +804,9 @@ VOP_UNLOCK(devvp, 0); return (EBUSY); } - DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 
0 : 1); g_topology_unlock(); - PICKUP_GIANT(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); @@ -849,7 +853,7 @@ goto out; } fs->fs_fmod = 0; - fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ + fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; @@ -1117,11 +1121,9 @@ if (bp) brelse(bp); if (cp != NULL) { - DROP_GIANT(); g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); - PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); @@ -1307,7 +1309,6 @@ taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); } - DROP_GIANT(); g_topology_lock(); if (ump->um_fsckpid > 0) { /* @@ -1318,7 +1319,6 @@ } g_vfs_close(ump->um_cp); g_topology_unlock(); - PICKUP_GIANT(); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); vrele(ump->um_devvp); dev_rel(ump->um_dev); @@ -1334,6 +1334,10 @@ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); + if (td->td_su == mp) { + td->td_su = NULL; + vfs_rel(mp); + } return (error); fail: @@ -1480,8 +1484,12 @@ allerror = 0; td = curthread; - if ((mp->mnt_flag & MNT_NOATIME) != 0) - goto qupdate; + if ((mp->mnt_flag & MNT_NOATIME) != 0) { +#ifdef QUOTA + qsync(mp); +#endif + goto sbupdate; + } MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); @@ -1503,6 +1511,9 @@ if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td)) != 0) continue; +#ifdef QUOTA + qsyncvp(vp); +#endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) @@ -1509,12 +1520,7 @@ allerror = error; vput(vp); } - -qupdate: -#ifdef QUOTA - qsync(mp); -#endif - +sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; @@ -1607,6 +1613,9 @@ } continue; } +#ifdef QUOTA + qsyncvp(vp); +#endif if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); @@ -1621,9 +1630,6 @@ if (allerror == 0 && count) goto loop; } -#ifdef QUOTA - qsync(mp); -#endif devvp = ump->um_devvp; bo = &devvp->v_bufobj; @@ -1687,7 +1693,6 @@ struct ufsmount *ump; struct buf *bp; struct vnode *vp; - struct cdev *dev; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); @@ -1711,7 +1716,6 @@ */ ump = VFSTOUFS(mp); - dev = ump->um_dev; fs = ump->um_fs; ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); @@ -1732,11 +1736,10 @@ vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; - ip->i_fs = fs; - ip->i_dev = dev; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; + ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; #ifdef QUOTA { int i; @@ -1773,7 +1776,7 @@ *vpp = NULL; return (error); } - if (ip->i_ump->um_fstype == UFS1) + if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); @@ -1788,10 +1791,8 @@ * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ - if (ip->i_ump->um_fstype == UFS1) - error = ufs_vinit(mp, &ffs_fifoops1, &vp); - else - error = ufs_vinit(mp, &ffs_fifoops2, &vp); + error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, + &vp); if (error) { vput(vp); *vpp = NULL; @@ -1811,7 +1812,8 @@ * already have one. This should only happen on old filesystems. 
*/ if (ip->i_gen == 0) { - ip->i_gen = arc4random() / 2 + 1; + while (ip->i_gen == 0) + ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); @@ -1843,6 +1845,7 @@ * * Have to be really careful about stale file handles: * - check that the inode number is valid + * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return @@ -1856,13 +1859,37 @@ struct vnode **vpp; { struct ufid *ufhp; + struct ufsmount *ump; struct fs *fs; + struct cg *cgp; + struct buf *bp; + ino_t ino; + u_int cg; + int error; ufhp = (struct ufid *)fhp; - fs = VFSTOUFS(mp)->um_fs; - if (ufhp->ufid_ino < ROOTINO || - ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) + ino = ufhp->ufid_ino; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (ino < ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); + /* + * Need to check if inode is initialized because UFS2 does lazy + * initialization and nfs_fhtovp can offer arbitrary inode numbers. + */ + if (fs->fs_magic != FS_UFS2_MAGIC) + return (ufs_fhtovp(mp, ufhp, flags, vpp)); + cg = ino_to_cg(fs, ino); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) + return (error); + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { + brelse(bp); + return (ESTALE); + } + brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } @@ -1950,13 +1977,13 @@ } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && - (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { + (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && - (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { + (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; @@ -2032,7 +2059,6 @@ /* * Process dependencies then return any unfinished ones. 
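
The ffs_fhtovp() change above reads the cylinder group so it can reject NFS file handles whose inode number lies beyond cg_initediblk, since UFS2 initializes inode blocks lazily and an arbitrary handle may name an inode that was never written. A minimal standalone version of that bounds check is sketched below; ROOTINO, NCG, IPG and the cg_initediblk table are invented example values, not on-disk data.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative geometry; the real values come from struct fs / struct cg. */
#define ROOTINO		2
#define NCG		4	/* number of cylinder groups */
#define IPG		1024	/* inodes per cylinder group */

/* How many inodes each cylinder group has actually initialized so far. */
static const uint32_t cg_initediblk[NCG] = { 1024, 512, 128, 0 };

/*
 * Sketch of the extra validation: a handle is stale if its inode
 * number is out of range or past the initialized region of its group.
 */
static bool
fh_ino_valid(uint32_t ino)
{
	uint32_t cg;

	if (ino < ROOTINO || ino >= NCG * IPG)
		return (false);
	cg = ino / IPG;				/* ino_to_cg() */
	return (ino < cg * IPG + cg_initediblk[cg]);
}

int
main(void)
{
	printf("ino 1500: %s\n", fh_ino_valid(1500) ? "ok" : "ESTALE");
	printf("ino 2200: %s\n", fh_ino_valid(2200) ? "ok" : "ESTALE");
	return (0);
}
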
*/ - pbrelvp(bp); if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES @@ -2045,6 +2071,7 @@ */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; + pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on @@ -2138,7 +2165,7 @@ if (newbp == NULL) goto normal_write; - KASSERT((bp->b_flags & B_UNMAPPED) == 0, ("Unmapped cg")); + KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; Modified: trunk/sys/ufs/ffs/ffs_vnops.c =================================================================== --- trunk/sys/ufs/ffs/ffs_vnops.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/ffs_vnops.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -63,7 +63,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_vnops.c 284201 2015-06-10 02:14:33Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_vnops.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <sys/bio.h> @@ -78,6 +78,7 @@ #include <sys/priv.h> #include <sys/rwlock.h> #include <sys/stat.h> +#include <sys/sysctl.h> #include <sys/vmmeter.h> #include <sys/vnode.h> @@ -103,9 +104,10 @@ #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif +static vop_fdatasync_t ffs_fdatasync; static vop_fsync_t ffs_fsync; +static vop_getpages_t ffs_getpages; static vop_lock1_t ffs_lock; -static vop_getpages_t ffs_getpages; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); @@ -120,12 +122,13 @@ static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; - /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, + .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, @@ -136,6 +139,7 @@ struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; @@ -144,7 +148,9 @@ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, + .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, @@ -161,6 +167,7 @@ struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, @@ -216,10 +223,10 @@ { struct inode *ip; struct bufobj *bo; - struct buf *bp; - struct buf *nbp; + struct buf *bp, *nbp; ufs_lbn_t lbn; - int error, wait, passes; + int error, passes; + bool still_dirty, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; @@ -238,8 +245,8 @@ */ error = 0; passes = 0; - wait = 0; /* Always do an async pass first. */ - lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); + wait = false; /* Always do an async pass first. 
*/ + lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) @@ -254,15 +261,23 @@ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; - /* Flush indirects in order. */ + /* + * Flush indirects in order, if requested. + * + * Note that if only datasync is requested, we can + * skip indirect blocks when softupdates are not + * active. Otherwise we must flush them with data, + * since dependencies prevent data block writes. + */ if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR && - lbn_level(bp->b_lblkno) >= passes) + (lbn_level(bp->b_lblkno) >= passes || + ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); - } else if (wait != 0) { + } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { @@ -330,31 +345,59 @@ * these will be done with one sync and one async pass. */ if (bo->bo_dirty.bv_cnt > 0) { - /* Write the inode after sync passes to flush deps. */ - if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { - BO_UNLOCK(bo); - ffs_update(vp, 1); - BO_LOCK(bo); + if ((flags & DATA_ONLY) == 0) { + still_dirty = true; + } else { + /* + * For data-only sync, dirty indirect buffers + * are ignored. + */ + still_dirty = false; + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + if (bp->b_lblkno > -NDADDR) { + still_dirty = true; + break; + } + } } - /* switch between sync/async. */ - wait = !wait; - if (wait == 1 || ++passes < NIADDR + 2) - goto loop; + + if (still_dirty) { + /* Write the inode after sync passes to flush deps. */ + if (wait && DOINGSOFTDEP(vp) && + (flags & NO_INO_UPDT) == 0) { + BO_UNLOCK(bo); + ffs_update(vp, 1); + BO_LOCK(bo); + } + /* switch between sync/async. */ + wait = !wait; + if (wait || ++passes < NIADDR + 2) + goto loop; #ifdef INVARIANTS - if (!vn_isdisk(vp, NULL)) - vprint("ffs_fsync: dirty", vp); + if (!vn_isdisk(vp, NULL)) + vn_printf(vp, "ffs_fsync: dirty "); #endif + } } BO_UNLOCK(bo); error = 0; - if ((flags & NO_INO_UPDT) == 0) - error = ffs_update(vp, 1); - if (DOINGSUJ(vp)) - softdep_journal_fsync(VTOI(vp)); + if ((flags & DATA_ONLY) == 0) { + if ((flags & NO_INO_UPDT) == 0) + error = ffs_update(vp, 1); + if (DOINGSUJ(vp)) + softdep_journal_fsync(VTOI(vp)); + } return (error); } static int +ffs_fdatasync(struct vop_fdatasync_args *ap) +{ + + return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); +} + +static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; @@ -477,7 +520,7 @@ if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); - fs = ip->i_fs; + fs = ITOFS(ip); if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); @@ -559,15 +602,6 @@ } /* - * If IO_DIRECT then set B_DIRECT for the buffer. This - * will cause us to attempt to release the buffer later on - * and will cause the buffer cache to attempt to free the - * underlying pages. - */ - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - - /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. 
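
The DATA_ONLY handling added above lets the new ffs_fdatasync() skip dirty indirect blocks when soft updates are not in use, because only then are indirect-block writes independent of the data blocks being synced. The small program below captures that predicate in isolation; NDADDR matches the UFS constant, and the remaining names are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define NDADDR	12	/* direct block pointers per inode */

/*
 * In FFS, buffers with b_lblkno <= -NDADDR hold indirect blocks.
 * For a data-only sync without soft updates they can be skipped;
 * everything else must still be written.
 */
static bool
must_write(long lblkno, bool data_only, bool softdep)
{
	bool is_indirect = (lblkno <= -NDADDR);

	if (data_only && !softdep && is_indirect)
		return (false);
	return (true);
}

int
main(void)
{
	long lbns[] = { 0, 7, -NDADDR, -(NDADDR + 1) };

	for (unsigned i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++)
		printf("lbn %ld: %s\n", lbns[i],
		    must_write(lbns[i], true, false) ? "write" : "skip");
	return (0);
}
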
* However, if the short read did not cause an error, @@ -581,7 +615,7 @@ xfersize = size; } - if ((bp->b_flags & B_UNMAPPED) == 0) { + if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { @@ -591,25 +625,7 @@ if (error) break; - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - /* - * If there are no dependencies, and it's VMIO, - * then we don't need the buf, mark it available - * for freeing. For non-direct VMIO reads, the VM - * has the data. - */ - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - /* - * Otherwise let whoever - * made the request take care of - * freeing it. We just queue - * it onto another list. - */ - bqrelse(bp); - } + vfs_bio_brelse(bp, ioflag); } /* @@ -618,15 +634,8 @@ * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ - if (bp != NULL) { - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - bqrelse(bp); - } - } + if (bp != NULL) + vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && @@ -700,7 +709,7 @@ KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); - fs = ip->i_fs; + fs = ITOFS(ip); if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* @@ -744,8 +753,6 @@ vnode_pager_setsize(vp, ip->i_size); break; } - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; @@ -758,7 +765,7 @@ if (size < xfersize) xfersize = size; - if ((bp->b_flags & B_UNMAPPED) == 0) { + if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { @@ -785,11 +792,9 @@ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->fs_bsize == xfersize) vfs_bio_clrbuf(bp); - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - bp->b_flags |= B_RELBUF; - } + vfs_bio_set_flags(bp, ioflag); + /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer @@ -848,48 +853,6 @@ } /* - * get page routine - */ -static int -ffs_getpages(ap) - struct vop_getpages_args *ap; -{ - int i; - vm_page_t mreq; - int pcount; - - pcount = round_page(ap->a_count) / PAGE_SIZE; - mreq = ap->a_m[ap->a_reqpage]; - - /* - * if ANY DEV_BSIZE blocks are valid on a large filesystem block, - * then the entire page is valid. Since the page may be mapped, - * user programs might reference data beyond the actual end of file - * occuring within the page. We have to zero that data. - */ - VM_OBJECT_WLOCK(mreq->object); - if (mreq->valid) { - if (mreq->valid != VM_PAGE_BITS_ALL) - vm_page_zero_invalid(mreq, TRUE); - for (i = 0; i < pcount; i++) { - if (i != ap->a_reqpage) { - vm_page_lock(ap->a_m[i]); - vm_page_free(ap->a_m[i]); - vm_page_unlock(ap->a_m[i]); - } - } - VM_OBJECT_WUNLOCK(mreq->object); - return VM_PAGER_OK; - } - VM_OBJECT_WUNLOCK(mreq->object); - - return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, - ap->a_count, - ap->a_reqpage); -} - - -/* * Extended attribute area reading. */ static int @@ -906,7 +869,7 @@ int error; ip = VTOI(vp); - fs = ip->i_fs; + fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS @@ -978,15 +941,6 @@ } /* - * If IO_DIRECT then set B_DIRECT for the buffer. 
This - * will cause us to attempt to release the buffer later on - * and will cause the buffer cache to attempt to free the - * underlying pages. - */ - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - - /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, @@ -1004,26 +958,7 @@ (int)xfersize, uio); if (error) break; - - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - /* - * If there are no dependencies, and it's VMIO, - * then we don't need the buf, mark it available - * for freeing. For non-direct VMIO reads, the VM - * has the data. - */ - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - /* - * Otherwise let whoever - * made the request take care of - * freeing it. We just queue - * it onto another list. - */ - bqrelse(bp); - } + vfs_bio_brelse(bp, ioflag); } /* @@ -1032,15 +967,8 @@ * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ - if (bp != NULL) { - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - bqrelse(bp); - } - } + if (bp != NULL) + vfs_bio_brelse(bp, ioflag); return (error); } @@ -1060,7 +988,7 @@ int blkoffset, error, flags, size, xfersize; ip = VTOI(vp); - fs = ip->i_fs; + fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS @@ -1109,8 +1037,6 @@ */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; @@ -1121,11 +1047,9 @@ error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_EMPTY(&bp->b_dep))) { - bp->b_flags |= B_RELBUF; - } + vfs_bio_set_flags(bp, ioflag); + /* * If IO_SYNC each buffer is written synchronously. 
Otherwise * if we have a severe page deficiency write the buffer @@ -1232,7 +1156,7 @@ u_char *eae; ip = VTOI(vp); - fs = ip->i_fs; + fs = ITOFS(ip); dp = ip->i_din2; easize = dp->di_extsize; if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize) @@ -1386,8 +1310,7 @@ vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; - if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC && - lbn < 0 && lbn >= -NXADDR) + if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); @@ -1463,7 +1386,7 @@ u_char *eae, *p; ip = VTOI(ap->a_vp); - fs = ip->i_fs; + fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); @@ -1666,7 +1589,7 @@ u_char *eae, *p; ip = VTOI(ap->a_vp); - fs = ip->i_fs; + fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); @@ -1786,3 +1709,38 @@ ufhp->ufid_gen = ip->i_gen; return (0); } + +SYSCTL_DECL(_vfs_ffs); +static int use_buf_pager = 0; +SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, + "Always use buffer pager instead of bmap"); + +static daddr_t +ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) +{ + + return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); +} + +static int +ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + + return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); +} + +static int +ffs_getpages(struct vop_getpages_args *ap) +{ + struct vnode *vp; + struct ufsmount *um; + + vp = ap->a_vp; + um = VFSTOUFS(vp->v_mount); + + if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) + return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, NULL, NULL)); + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); +} Modified: trunk/sys/ufs/ffs/fs.h =================================================================== --- trunk/sys/ufs/ffs/fs.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/fs.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -11,7 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)fs.h 8.13 (Berkeley) 3/21/95 - * $FreeBSD: stable/10/sys/ufs/ffs/fs.h 322860 2017-08-24 21:44:23Z mckusick $ + * $FreeBSD: stable/11/sys/ufs/ffs/fs.h 356905 2020-01-20 08:28:54Z eugen $ */ #ifndef _UFS_FFS_FS_H_ @@ -220,7 +220,8 @@ #define FFS_UNLINK 14 /* remove a name in the filesystem */ #define FFS_SET_INODE 15 /* update an on-disk inode */ #define FFS_SET_BUFOUTPUT 16 /* set buffered writing on descriptor */ -#define FFS_MAXID 16 /* number of valid ffs ids */ +#define FFS_SET_SIZE 17 /* set inode size */ +#define FFS_MAXID 17 /* number of valid ffs ids */ /* * Command structure passed in to the filesystem to adjust filesystem values. @@ -238,9 +239,7 @@ * A recovery structure placed at the end of the boot block area by newfs * that can be used by fsck to search for alternate superblocks. 
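
The replacement ffs_getpages() above can hand paging requests to vfs_bio_getpages() with two callbacks that translate a byte offset to a logical block number and report that block's size. The sketch below shows the arithmetic those callbacks perform under an assumed 32 KB block size; unlike the real blksize() it ignores fragments and takes the file size as an explicit parameter, so it is only a model of the idea.

#include <stdint.h>
#include <stdio.h>

/* Illustrative geometry; the real values come from struct fs. */
#define FS_BSIZE	32768		/* fs_bsize */
#define FS_BSHIFT	15		/* log2(fs_bsize) */

/* Which logical block holds byte 'off' (the lblkno() idea). */
static int64_t
gbp_getblkno(uint64_t off)
{
	return ((int64_t)(off >> FS_BSHIFT));
}

/* A short last block is smaller than fs_bsize (the blksize() idea). */
static int
gbp_getblksz(uint64_t filesize, int64_t lbn)
{
	uint64_t blkstart = (uint64_t)lbn << FS_BSHIFT;

	if (filesize >= blkstart + FS_BSIZE)
		return (FS_BSIZE);
	return ((int)(filesize - blkstart));
}

int
main(void)
{
	uint64_t filesize = 100000;	/* bytes */
	uint64_t off = 99000;
	int64_t lbn = gbp_getblkno(off);

	printf("offset %llu -> lbn %lld, block size %d\n",
	    (unsigned long long)off, (long long)lbn,
	    gbp_getblksz(filesize, lbn));
	return (0);
}
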
*/ -#define RESID (4096 - 20) /* disk sector size minus recovery area size */ struct fsrecovery { - char block[RESID]; /* unused part of sector */ int32_t fsr_magic; /* magic number */ int32_t fsr_fsbtodb; /* fsbtodb and dbtofsb shift constant */ int32_t fsr_sblkno; /* offset of super-block in filesys */ @@ -416,8 +415,8 @@ * flag to enforce that inconsistent filesystems be mounted read-only. * The FS_INDEXDIRS flag when set indicates that the kernel maintains * on-disk auxiliary indexes (such as B-trees) for speeding directory - * accesses. Kernels that do not support auxiliary indicies clear the - * flag to indicate that the indicies need to be rebuilt (by fsck) before + * accesses. Kernels that do not support auxiliary indices clear the + * flag to indicate that the indices need to be rebuilt (by fsck) before * they can be used. * * FS_ACLS indicates that POSIX.1e ACLs are administratively enabled Modified: trunk/sys/ufs/ffs/softdep.h =================================================================== --- trunk/sys/ufs/ffs/softdep.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ffs/softdep.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -37,7 +37,7 @@ * SUCH DAMAGE. * * @(#)softdep.h 9.7 (McKusick) 6/21/00 - * $FreeBSD: stable/10/sys/ufs/ffs/softdep.h 307534 2016-10-17 21:49:54Z mckusick $ + * $FreeBSD: stable/11/sys/ufs/ffs/softdep.h 320057 2017-06-17 17:10:50Z kib $ */ #include <sys/queue.h> @@ -133,7 +133,7 @@ #define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */ #define UFS1FMT 0x002000 /* indirdep only */ #define EXTDATA 0x004000 /* allocdirect only */ -#define ONWORKLIST 0x008000 +#define ONWORKLIST 0x008000 #define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */ #define ONDEPLIST 0x020000 /* Structure is on a dependency list. */ #define UNLINKED 0x040000 /* inodedep has been unlinked. */ @@ -1066,6 +1066,7 @@ #define FLUSH_EXIT 0x0001 /* time to exit */ #define FLUSH_CLEANUP 0x0002 /* need to clear out softdep structures */ #define FLUSH_STARTING 0x0004 /* flush thread not yet started */ +#define FLUSH_RC_ACTIVE 0x0008 /* a thread is flushing the mount point */ /* * Keep the old names from when these were in the ufsmount structure. Modified: trunk/sys/ufs/ufs/README.acls =================================================================== --- trunk/sys/ufs/ufs/README.acls 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/README.acls 2020-02-08 19:39:08 UTC (rev 12316) @@ -1,4 +1,4 @@ -$FreeBSD: stable/10/sys/ufs/ufs/README.acls 105456 2002-10-19 16:09:16Z rwatson $ +$FreeBSD: stable/11/sys/ufs/ufs/README.acls 105456 2002-10-19 16:09:16Z rwatson $ UFS Access Control Lists Copyright Modified: trunk/sys/ufs/ufs/README.extattr =================================================================== --- trunk/sys/ufs/ufs/README.extattr 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/README.extattr 2020-02-08 19:39:08 UTC (rev 12316) @@ -1,4 +1,4 @@ -$FreeBSD: stable/10/sys/ufs/ufs/README.extattr 105417 2002-10-18 21:11:36Z rwatson $ +$FreeBSD: stable/11/sys/ufs/ufs/README.extattr 105417 2002-10-18 21:11:36Z rwatson $ UFS Extended Attributes Copyright Modified: trunk/sys/ufs/ufs/acl.h =================================================================== --- trunk/sys/ufs/ufs/acl.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/acl.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/ufs/ufs/acl.h 200796 2009-12-21 19:39:10Z trasz $ + * $FreeBSD: stable/11/sys/ufs/ufs/acl.h 200796 2009-12-21 19:39:10Z trasz $ */ /* * Developed by the TrustedBSD Project. Modified: trunk/sys/ufs/ufs/dinode.h =================================================================== --- trunk/sys/ufs/ufs/dinode.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/dinode.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -63,7 +63,7 @@ * SUCH DAMAGE. * * @(#)dinode.h 8.3 (Berkeley) 1/21/94 - * $FreeBSD: stable/10/sys/ufs/ufs/dinode.h 259223 2013-12-11 19:25:17Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/dinode.h 257029 2013-10-24 00:33:29Z pfg $ */ #ifndef _UFS_UFS_DINODE_H_ Modified: trunk/sys/ufs/ufs/dir.h =================================================================== --- trunk/sys/ufs/ufs/dir.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/dir.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -16,7 +16,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)dir.h 8.2 (Berkeley) 1/21/94 - * $FreeBSD: stable/10/sys/ufs/ufs/dir.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/dir.h 347475 2019-05-10 23:46:42Z mckusick $ */ #ifndef _UFS_UFS_DIR_H_ @@ -106,13 +106,11 @@ * The DIRSIZ macro gives the minimum record length which will hold * the directory entry. This requires the amount of space in struct direct * without the d_name field, plus enough space for the name with a terminating - * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. - * - * + * null byte (dp->d_namlen + 1), rounded up to a 4 byte boundary. */ -#define DIRECTSIZ(namlen) \ - (((uintptr_t)&((struct direct *)0)->d_name + \ - ((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3) +#define DIR_ROUNDUP 4 /* Directory name roundup size */ +#define DIRECTSIZ(namlen) \ + (roundup2(__offsetof(struct direct, d_name) + (namlen) + 1, DIR_ROUNDUP)) #if (BYTE_ORDER == LITTLE_ENDIAN) #define DIRSIZ(oldfmt, dp) \ ((oldfmt) ? DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen)) Modified: trunk/sys/ufs/ufs/dirhash.h =================================================================== --- trunk/sys/ufs/ufs/dirhash.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/dirhash.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/ufs/ufs/dirhash.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/dirhash.h 298804 2016-04-29 20:43:51Z pfg $ */ #ifndef _UFS_UFS_DIRHASH_H_ @@ -61,7 +61,7 @@ * together on a TAILQ list, and hashes with higher scores filter * towards the tail (most recently used) end of the list. * - * New hash entries are given an inital score of DH_SCOREINIT and are + * New hash entries are given an initial score of DH_SCOREINIT and are * placed at the most-recently-used end of the list. This helps a lot * in the worst-case case scenario where every directory access is * to a directory that is not hashed (i.e. 
the working set of hash Modified: trunk/sys/ufs/ufs/extattr.h =================================================================== --- trunk/sys/ufs/ufs/extattr.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/extattr.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/ufs/ufs/extattr.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/extattr.h 306553 2016-10-01 09:19:43Z kib $ */ /* * Developed by the TrustedBSD Project. @@ -134,6 +134,10 @@ int uepm_flags; }; +struct vop_getextattr_args; +struct vop_deleteextattr_args; +struct vop_setextattr_args; + void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm); void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm); int ufs_extattr_start(struct mount *mp, struct thread *td); Modified: trunk/sys/ufs/ufs/gjournal.h =================================================================== --- trunk/sys/ufs/ufs/gjournal.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/gjournal.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/ufs/ufs/gjournal.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/gjournal.h 262678 2014-03-02 02:52:34Z pfg $ */ #ifndef _UFS_UFS_GJOURNAL_H_ Modified: trunk/sys/ufs/ufs/inode.h =================================================================== --- trunk/sys/ufs/ufs/inode.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/inode.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)inode.h 8.9 (Berkeley) 5/14/95 - * $FreeBSD: stable/10/sys/ufs/ufs/inode.h 283640 2015-05-28 00:11:36Z mckusick $ + * $FreeBSD: stable/11/sys/ufs/ufs/inode.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _UFS_UFS_INODE_H_ @@ -67,14 +67,25 @@ struct inode { TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */ struct vnode *i_vnode;/* Vnode associated with this inode. */ - struct ufsmount *i_ump;/* Ufsmount point associated with this inode. */ + struct ufsmount *i_ump;/* Ufsmount point associated with this inode. */ + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + union { + struct dirhash *dirhash; /* Hashing for large directories. */ + daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ + } i_un; + /* + * The real copy of the on-disk inode. + */ + union { + struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ + struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ + } dinode_u; + + ino_t i_number; /* The identity of the inode. */ u_int32_t i_flag; /* flags, see below */ - struct cdev *i_dev; /* Device associated with the inode. */ - ino_t i_number; /* The identity of the inode. */ int i_effnlink; /* i_nlink when I/O completes */ - struct fs *i_fs; /* Associated filesystem superblock. */ - struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + /* * Side effects; used during directory lookup. */ @@ -83,11 +94,6 @@ doff_t i_diroff; /* Offset in dir, where we found last entry. */ doff_t i_offset; /* Offset of free space in directory. */ - union { - struct dirhash *dirhash; /* Hashing for large directories. */ - daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ - } i_un; - int i_nextclustercg; /* last cg searched for cluster */ /* @@ -101,20 +107,13 @@ /* * Copies from the on-disk dinode itself. */ - u_int16_t i_mode; /* IFMT, permissions; see below. 
*/ - int16_t i_nlink; /* File link count. */ u_int64_t i_size; /* File byte count. */ + u_int64_t i_gen; /* Generation number. */ u_int32_t i_flags; /* Status flags (chflags). */ - u_int64_t i_gen; /* Generation number. */ u_int32_t i_uid; /* File owner. */ u_int32_t i_gid; /* File group. */ - /* - * The real copy of the on-disk inode. - */ - union { - struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ - struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ - } dinode_u; + u_int16_t i_mode; /* IFMT, permissions; see below. */ + int16_t i_nlink; /* File link count. */ }; /* * These flags are kept in i_flag. @@ -124,16 +123,16 @@ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ #define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ -#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ -#define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the +#define IN_LAZYMOD 0x0020 /* Modified, but don't write yet. */ +#define IN_LAZYACCESS 0x0040 /* Process IN_ACCESS after the suspension finished */ -#define IN_EA_LOCKED 0x0200 -#define IN_EA_LOCKWAIT 0x0400 +#define IN_EA_LOCKED 0x0080 +#define IN_EA_LOCKWAIT 0x0100 -#define IN_TRUNCATED 0x0800 /* Journaled truncation pending. */ +#define IN_TRUNCATED 0x0200 /* Journaled truncation pending. */ -#define i_devvp i_ump->um_devvp -#define i_umbufobj i_ump->um_bo +#define IN_UFS2 0x0400 /* UFS2 vs UFS1 */ + #define i_dirhash i_un.dirhash #define i_snapblklist i_un.snapblklist #define i_din1 dinode_u.din1 @@ -140,23 +139,42 @@ #define i_din2 dinode_u.din2 #ifdef _KERNEL + +#define ITOUMP(ip) ((ip)->i_ump) +#define ITODEV(ip) (ITOUMP(ip)->um_dev) +#define ITODEVVP(ip) (ITOUMP(ip)->um_devvp) +#define ITOFS(ip) (ITOUMP(ip)->um_fs) +#define ITOVFS(ip) ((ip)->i_vnode->v_mount) + +static inline _Bool +I_IS_UFS1(const struct inode *ip) +{ + + return ((ip->i_flag & IN_UFS2) == 0); +} + +static inline _Bool +I_IS_UFS2(const struct inode *ip) +{ + + return ((ip->i_flag & IN_UFS2) != 0); +} + /* * The DIP macro is used to access fields in the dinode that are * not cached in the inode itself. */ -#define DIP(ip, field) \ - (((ip)->i_ump->um_fstype == UFS1) ? \ - (ip)->i_din1->d##field : (ip)->i_din2->d##field) -#define DIP_SET(ip, field, val) do { \ - if ((ip)->i_ump->um_fstype == UFS1) \ - (ip)->i_din1->d##field = (val); \ - else \ - (ip)->i_din2->d##field = (val); \ +#define DIP(ip, field) (I_IS_UFS1(ip) ? (ip)->i_din1->d##field : \ + (ip)->i_din2->d##field) +#define DIP_SET(ip, field, val) do { \ + if (I_IS_UFS1(ip)) \ + (ip)->i_din1->d##field = (val); \ + else \ + (ip)->i_din2->d##field = (val); \ } while (0) -#define SHORTLINK(ip) \ - (((ip)->i_ump->um_fstype == UFS1) ? \ - (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db) +#define SHORTLINK(ip) (I_IS_UFS1(ip) ? \ + (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db) #define IS_SNAPSHOT(ip) ((ip)->i_flags & SF_SNAPSHOT) /* Modified: trunk/sys/ufs/ufs/quota.h =================================================================== --- trunk/sys/ufs/ufs/quota.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/quota.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
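
The inode.h rework above records the on-disk format in an IN_UFS2 flag so that I_IS_UFS1()/I_IS_UFS2() and the DIP() accessors no longer chase i_ump->um_fstype. The compilable miniature below shows the same dispatch; the structures are heavily trimmed, and the macro takes the field name directly instead of token-pasting a d prefix the way the kernel version does.

#include <stdint.h>
#include <stdio.h>

#define IN_UFS2		0x0400	/* flag value used in the new inode.h */

/* Cut-down stand-ins for the two on-disk dinode layouts. */
struct ufs1_dinode { uint32_t di_size; };
struct ufs2_dinode { uint64_t di_size; };

struct inode {
	uint32_t i_flag;
	union {
		struct ufs1_dinode *din1;
		struct ufs2_dinode *din2;
	} dinode_u;
};

#define I_IS_UFS1(ip)	(((ip)->i_flag & IN_UFS2) == 0)

/* Same shape as the kernel's DIP(): pick the field from the right dinode. */
#define DIP(ip, field)						\
	(I_IS_UFS1(ip) ? (uint64_t)(ip)->dinode_u.din1->field :	\
	    (ip)->dinode_u.din2->field)

int
main(void)
{
	struct ufs1_dinode d1 = { 1234 };
	struct ufs2_dinode d2 = { 56789 };
	struct inode ip1 = { 0, { .din1 = &d1 } };
	struct inode ip2 = { IN_UFS2, { .din2 = &d2 } };

	printf("UFS1 size %llu, UFS2 size %llu\n",
	    (unsigned long long)DIP(&ip1, di_size),
	    (unsigned long long)DIP(&ip2, di_size));
	return (0);
}

Keeping the format bit in i_flag means the accessor needs only the inode itself, which is what allows the diff to drop i_fs and i_dev from struct inode in favour of the ITOFS()/ITODEV() macros.
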
* * @(#)quota.h 8.3 (Berkeley) 8/19/94 - * $FreeBSD: stable/10/sys/ufs/ufs/quota.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/quota.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _UFS_UFS_QUOTA_H_ Modified: trunk/sys/ufs/ufs/ufs_acl.c =================================================================== --- trunk/sys/ufs/ufs/ufs_acl.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_acl.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -32,7 +32,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_acl.c 241011 2012-09-27 23:30:49Z mdf $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_acl.c 306553 2016-10-01 09:19:43Z kib $"); #include "opt_ufs.h" #include "opt_quota.h" @@ -46,6 +46,7 @@ #include <sys/acl.h> #include <sys/event.h> #include <sys/extattr.h> +#include <sys/proc.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> @@ -184,7 +185,7 @@ */ printf("ufs_getacl_nfs4(): Loaded invalid ACL (" "%d bytes), inumber %ju on %s\n", len, - (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt); + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); return (EPERM); } @@ -193,7 +194,7 @@ if (error) { printf("ufs_getacl_nfs4(): Loaded invalid ACL " "(failed acl_nfs4_check), inumber %ju on %s\n", - (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt); + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); return (EPERM); } @@ -220,7 +221,7 @@ /* * Read POSIX.1e ACL from an EA. Return error if its not found - * or if any other error has occured. + * or if any other error has occurred. */ static int ufs_get_oldacl(acl_type_t type, struct oldacl *old, struct vnode *vp, @@ -261,7 +262,7 @@ */ printf("ufs_get_oldacl(): Loaded invalid ACL " "(len = %d), inumber %ju on %s\n", len, - (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt); + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); return (EPERM); } Modified: trunk/sys/ufs/ufs/ufs_bmap.c =================================================================== --- trunk/sys/ufs/ufs/ufs_bmap.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_bmap.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_bmap.c 284021 2015-06-05 08:36:25Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_bmap.c 331722 2018-03-29 02:50:57Z eadler $"); #include <sys/param.h> #include <sys/systm.h> @@ -45,6 +45,7 @@ #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> +#include <sys/racct.h> #include <sys/resourcevar.h> #include <sys/stat.h> @@ -78,7 +79,7 @@ * to physical mapping is requested. 
*/ if (ap->a_bop != NULL) - *ap->a_bop = &VTOI(ap->a_vp)->i_devvp->v_bufobj; + *ap->a_bop = &VFSTOUFS(ap->a_vp->v_mount)->um_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); @@ -224,6 +225,13 @@ vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { @@ -232,7 +240,7 @@ } } - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { daddr = ((ufs1_daddr_t *)bp->b_data)[ap->in_off]; if (num == 1 && daddr && runp) { for (bn = ap->in_off + 1; Modified: trunk/sys/ufs/ufs/ufs_dirhash.c =================================================================== --- trunk/sys/ufs/ufs/ufs_dirhash.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_dirhash.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_dirhash.c 326846 2017-12-14 11:45:02Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_dirhash.c 326845 2017-12-14 11:41:12Z kib $"); #include "opt_ufs.h" @@ -86,10 +86,11 @@ static int ufs_dirhashlowmemcount = 0; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD, &ufs_dirhashlowmemcount, 0, "number of times low memory hook called"); -static int ufs_dirhashreclaimage = 60; -SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_reclaimage, CTLFLAG_RW, - &ufs_dirhashreclaimage, 0, - "max time in seconds of hash inactivity before deletion in low VM events"); +static int ufs_dirhashreclaimpercent = 10; +static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent, + CTLTYPE_INT | CTLFLAG_RW, 0, 0, ufsdirhash_set_reclaimpercent, "I", + "set percentage of dirhash cache to be removed in low VM events"); static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); @@ -1151,7 +1152,7 @@ doff_t blkoff, prevoff; int entrypos, i; - blkoff = offset & ~(DIRBLKSIZ - 1); /* offset of start of block */ + blkoff = rounddown2(offset, DIRBLKSIZ); /* offset of start of block */ entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ blkbuf = (char *)dirp - entrypos; prevoff = blkoff; @@ -1250,50 +1251,53 @@ ufsdirhash_lowmem() { struct dirhash *dh, *dh_temp; - int memfreed = 0; - /* - * Will free a *minimum* of 10% of the dirhash, but possibly much - * more (depending on dirhashreclaimage). System with large dirhashes - * probably also need a much larger dirhashreclaimage. - * XXX: this percentage may need to be adjusted. - */ - int memwanted = ufs_dirhashmem / 10; + int memfreed, memwanted; ufs_dirhashlowmemcount++; + memfreed = 0; + memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100; DIRHASHLIST_LOCK(); - /* - * Delete dirhashes not used for more than ufs_dirhashreclaimage - * seconds. If we can't get a lock on the dirhash, it will be skipped. + + /* + * Reclaim up to memwanted from the oldest dirhashes. This will allow + * us to make some progress when the system is running out of memory + * without compromising the dinamicity of maximum age. If the situation + * does not improve lowmem will be eventually retriggered and free some + * other entry in the cache. The entries on the head of the list should + * be the oldest. If during list traversal we can't get a lock on the + * dirhash, it will be skipped. 
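For readers unfamiliar with it, rounddown2() (a <sys/param.h> macro) used in the ufs_dirhash.c hunk above, and again later in ufs_lookup.c, is simply the old power-of-two mask idiom spelled once. A tiny sketch with an illustrative helper name:

        #include <sys/param.h>          /* rounddown2() */

        /* Align an offset down to a block boundary; blksz must be a power of two. */
        static inline int
        example_block_start(int offset, int blksz)
        {

                /* Same result as the old open-coded "offset & ~(blksz - 1)". */
                return (rounddown2(offset, blksz));
        }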
*/ TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) { - if (!sx_try_xlock(&dh->dh_lock)) - continue; - if (time_second - dh->dh_lastused > ufs_dirhashreclaimage) + if (sx_try_xlock(&dh->dh_lock)) memfreed += ufsdirhash_destroy(dh); - /* Unlock if we didn't delete the dirhash */ - else - ufsdirhash_release(dh); + if (memfreed >= memwanted) + break; } - - /* - * If not enough memory was freed, keep deleting hashes from the head - * of the dirhash list. The ones closest to the head should be the - * oldest. - */ - if (memfreed < memwanted) { - TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) { - if (!sx_try_xlock(&dh->dh_lock)) - continue; - memfreed += ufsdirhash_destroy(dh); - if (memfreed >= memwanted) - break; - } - } DIRHASHLIST_UNLOCK(); } +static int +ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS) +{ + int error, v; + v = ufs_dirhashreclaimpercent; + error = sysctl_handle_int(oidp, &v, v, req); + if (error) + return (error); + if (req->newptr == NULL) + return (error); + if (v == ufs_dirhashreclaimpercent) + return (0); + + /* Refuse invalid percentages */ + if (v < 0 || v > 100) + return (EINVAL); + ufs_dirhashreclaimpercent = v; + return (0); +} + void ufsdirhash_init() { Modified: trunk/sys/ufs/ufs/ufs_extattr.c =================================================================== --- trunk/sys/ufs/ufs/ufs_extattr.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_extattr.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -39,7 +39,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_extattr.c 302233 2016-06-27 21:44:27Z bdrewery $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_extattr.c 298463 2016-04-22 08:09:27Z ngie $"); #include "opt_ufs.h" Modified: trunk/sys/ufs/ufs/ufs_extern.h =================================================================== --- trunk/sys/ufs/ufs/ufs_extern.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_extern.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
* * @(#)ufs_extern.h 8.10 (Berkeley) 5/14/95 - * $FreeBSD: stable/10/sys/ufs/ufs/ufs_extern.h 262779 2014-03-05 04:23:19Z pfg $ + * $FreeBSD: stable/11/sys/ufs/ufs/ufs_extern.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _UFS_UFS_EXTERN_H_ Modified: trunk/sys/ufs/ufs/ufs_gjournal.c =================================================================== --- trunk/sys/ufs/ufs/ufs_gjournal.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_gjournal.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -26,12 +26,13 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_gjournal.c 306630 2016-10-03 10:15:16Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_gjournal.c 306627 2016-10-03 09:37:56Z kib $"); #include "opt_ufs.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/buf.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/lock.h> @@ -65,15 +66,15 @@ ino_t ino; ip = VTOI(vp); - ump = ip->i_ump; - fs = ip->i_fs; - devvp = ip->i_devvp; + ump = VFSTOUFS(vp->v_mount); + fs = ump->um_fs; + devvp = ump->um_devvp; ino = ip->i_number; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ - dev = VTOI(devvp)->i_devvp->v_rdev; + dev = VFSTOUFS(devvp->v_mount)->um_devvp->v_rdev; cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ Modified: trunk/sys/ufs/ufs/ufs_inode.c =================================================================== --- trunk/sys/ufs/ufs/ufs_inode.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_inode.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_inode.c 234612 2012-04-23 17:54:49Z trasz $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_inode.c 331722 2018-03-29 02:50:57Z eadler $"); #include "opt_quota.h" #include "opt_ufs.h" @@ -126,7 +126,7 @@ } } isize = ip->i_size; - if (ip->i_ump->um_fstype == UFS2) + if (I_IS_UFS2(ip)) isize += ip->i_din2->di_extsize; if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED); @@ -215,7 +215,6 @@ { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); - struct ufsmount *ump = ip->i_ump; ufs_prepare_reclaim(vp); @@ -234,6 +233,6 @@ VI_LOCK(vp); vp->v_data = 0; VI_UNLOCK(vp); - UFS_IFREE(ump, ip); + UFS_IFREE(ITOUMP(ip), ip); return (0); } Modified: trunk/sys/ufs/ufs/ufs_lookup.c =================================================================== --- trunk/sys/ufs/ufs/ufs_lookup.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_lookup.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_lookup.c 306180 2016-09-22 10:51:47Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_lookup.c 356965 2020-01-22 01:31:02Z mckusick $"); #include "opt_ufs.h" #include "opt_quota.h" @@ -565,7 +565,7 @@ * in the cache as to where the entry was found. 
*/ if ((flags & ISLASTCN) && nameiop == LOOKUP) - dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); + dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); /* * If deleting, and at end of pathname, return @@ -824,14 +824,21 @@ struct componentname *cnp; struct direct *newdirp; { + u_int namelen; -#ifdef INVARIANTS - if ((cnp->cn_flags & SAVENAME) == 0) - panic("ufs_makedirentry: missing name"); -#endif + namelen = (unsigned)cnp->cn_namelen; + KASSERT((cnp->cn_flags & SAVENAME) != 0, + ("ufs_makedirentry: missing name")); + KASSERT(namelen <= MAXNAMLEN, + ("ufs_makedirentry: name too long")); newdirp->d_ino = ip->i_number; - newdirp->d_namlen = cnp->cn_namelen; - bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1); + newdirp->d_namlen = namelen; + + /* Zero out after-name padding */ + *(u_int32_t *)(&newdirp->d_name[namelen & ~(DIR_ROUNDUP - 1)]) = 0; + + bcopy(cnp->cn_nameptr, newdirp->d_name, namelen); + if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) newdirp->d_type = IFTODT(ip->i_mode); else { @@ -1092,7 +1099,7 @@ if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - (dp->i_offset & (DIRBLKSIZ - 1)), - dp->i_offset & ~(DIRBLKSIZ - 1)); + rounddown2(dp->i_offset, DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { @@ -1125,8 +1132,9 @@ error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), cr); if (error != 0) - vn_printf(dvp, "ufs_direnter: failed to truncate " - "err %d", error); + vn_printf(dvp, + "ufs_direnter: failed to truncate, error %d\n", + error); #ifdef UFS_DIRHASH if (error == 0 && dp->i_dirhash != NULL) ufsdirhash_dirtrunc(dp, dp->i_endoff); @@ -1160,6 +1168,7 @@ struct inode *dp; struct direct *ep, *rep; struct buf *bp; + off_t offset; int error; dp = VTOI(dvp); @@ -1169,6 +1178,7 @@ */ if (ip) { ip->i_effnlink--; + ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) { softdep_setup_unlink(dp, ip); } else { @@ -1177,22 +1187,32 @@ ip->i_flag |= IN_CHANGE; } } + if (flags & DOWHITEOUT) + offset = dp->i_offset; + else + offset = dp->i_offset - dp->i_count; + if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) { + if (ip) { + ip->i_effnlink++; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(dvp)) { + softdep_change_linkcnt(ip); + } else { + ip->i_nlink++; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + } + return (error); + } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. */ - if ((error = - UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) - return (error); ep->d_ino = WINO; ep->d_type = DT_WHT; goto out; } - - if ((error = UFS_BLKATOFF(dvp, - (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) - return (error); - /* Set 'rep' to the entry being removed. */ if (dp->i_count == 0) rep = ep; @@ -1209,22 +1229,27 @@ if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino); - if (dp->i_count == 0) { + /* + * Zero out the file directory entry metadata to reduce disk + * scavenging disclosure. + */ + bzero(&rep->d_name[0], rep->d_namlen); + rep->d_namlen = 0; + rep->d_type = 0; + rep->d_ino = 0; + + if (dp->i_count != 0) { /* - * First entry in block: set d_ino to zero. - */ - ep->d_ino = 0; - } else { - /* * Collapse new free space into previous entry. 
*/ ep->d_reclen += rep->d_reclen; + rep->d_reclen = 0; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), - dp->i_offset & ~(DIRBLKSIZ - 1)); + rounddown2(dp->i_offset, DIRBLKSIZ)); #endif out: error = 0; @@ -1277,6 +1302,7 @@ * necessary. */ oip->i_effnlink--; + oip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vdp)) { softdep_setup_unlink(dp, oip); } else { @@ -1286,13 +1312,23 @@ } error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); - if (error) - return (error); - if (ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' && - ep->d_ino != oip->i_number) { + if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' && + ep->d_name[0] == '.' && ep->d_ino != oip->i_number) { brelse(bp); - return (EIDRM); + error = EIDRM; } + if (error) { + oip->i_effnlink++; + oip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vdp)) { + softdep_change_linkcnt(oip); + } else { + oip->i_nlink++; + DIP_SET(oip, i_nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + } + return (error); + } ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; @@ -1469,7 +1505,8 @@ } } KASSERT(dd_ino == VTOI(vp1)->i_number, - ("directory %d reparented\n", VTOI(vp1)->i_number)); + ("directory %ju reparented\n", + (uintmax_t)VTOI(vp1)->i_number)); if (vp != tvp) vput(vp); vp = vp1; Modified: trunk/sys/ufs/ufs/ufs_quota.c =================================================================== --- trunk/sys/ufs/ufs/ufs_quota.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_quota.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_quota.c 306178 2016-09-22 10:47:56Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_quota.c 338943 2018-09-26 14:26:29Z kib $"); #include "opt_ffs.h" @@ -233,13 +233,13 @@ /* Reset timer when crossing soft limit */ if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) - dq->dq_btime = time_second + ip->i_ump->um_btime[i]; + dq->dq_btime = time_second + ITOUMP(ip)->um_btime[i]; dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s disk quota exceeded\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[i]); } return (0); @@ -265,7 +265,7 @@ dq->dq_flags |= DQ_BLKS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s disk limit reached\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } @@ -278,7 +278,7 @@ */ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { - dq->dq_btime = time_second + ip->i_ump->um_btime[type]; + dq->dq_btime = time_second + ITOUMP(ip)->um_btime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); @@ -290,7 +290,7 @@ DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s " "disk quota exceeded for too long\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } @@ -371,13 +371,13 @@ /* Reset timer when crossing soft limit */ if (dq->dq_curinodes + change >= dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit) - dq->dq_itime = time_second + ip->i_ump->um_itime[i]; + dq->dq_itime = time_second + ITOUMP(ip)->um_itime[i]; dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s inode quota exceeded\n", - 
ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[i]); } return (0); @@ -402,7 +402,7 @@ dq->dq_flags |= DQ_INODS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s inode limit reached\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } @@ -415,7 +415,7 @@ */ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { - dq->dq_itime = time_second + ip->i_ump->um_itime[type]; + dq->dq_itime = time_second + ITOUMP(ip)->um_itime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); @@ -427,7 +427,7 @@ DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s " "inode quota exceeded for too long\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } @@ -446,10 +446,13 @@ static void chkdquot(struct inode *ip) { - struct ufsmount *ump = ip->i_ump; - struct vnode *vp = ITOV(ip); + struct ufsmount *ump; + struct vnode *vp; int i; + ump = ITOUMP(ip); + vp = ITOV(ip); + /* * Disk quotas must be turned off for system files. Currently * these are snapshots and quota files. @@ -470,7 +473,7 @@ continue; if (ip->i_dquot[i] == NODQUOT) { UFS_UNLOCK(ump); - vprint("chkdquot: missing dquot", ITOV(ip)); + vn_printf(ITOV(ip), "chkdquot: missing dquot "); panic("chkdquot: missing dquot"); } } @@ -708,6 +711,34 @@ return (error); } +static int +quotaoff_inchange1(struct thread *td, struct mount *mp, int type) +{ + int error; + bool need_resume; + + /* + * mp is already suspended on unmount. If not, suspend it, to + * avoid the situation where quotaoff operation eventually + * failing due to SU structures still keeping references on + * dquots, but vnode's references are already clean. This + * would cause quota accounting leak and asserts otherwise. + * Note that the thread has already called vn_start_write(). + */ + if (mp->mnt_susp_owner == td) { + need_resume = false; + } else { + error = vfs_write_suspend_umnt(mp); + if (error != 0) + return (error); + need_resume = true; + } + error = quotaoff1(td, mp, type); + if (need_resume) + vfs_write_resume(mp, VR_START_WRITE); + return (error); +} + /* * Turns off quotas, assumes that ump->um_qflags are already checked * and QTF_CLOSING is set to indicate operation in progress. Fixes @@ -717,10 +748,9 @@ quotaoff_inchange(struct thread *td, struct mount *mp, int type) { struct ufsmount *ump; - int i; - int error; + int error, i; - error = quotaoff1(td, mp, type); + error = quotaoff_inchange1(td, mp, type); ump = VFSTOUFS(mp); UFS_LOCK(ump); @@ -1040,11 +1070,9 @@ * Check if the mount point has any quotas. * If not, simply return. */ - UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; - UFS_UNLOCK(ump); if (i == MAXQUOTAS) return (0); /* @@ -1089,11 +1117,9 @@ * Check if the mount point has any quotas. * If not, simply return. 
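The quotaoff_inchange1() hunk above is an instance of a more general idiom: suspend filesystem writes only if the calling thread does not already own the suspension, run the operation, and resume only what was suspended here. A generic sketch under that assumption (the wrapper and callback names are hypothetical, not from the patch):

        static int
        example_run_suspended(struct thread *td, struct mount *mp,
            int (*fn)(struct thread *, struct mount *))
        {
                int error;
                bool need_resume;

                if (mp->mnt_susp_owner == td) {
                        need_resume = false;
                } else {
                        error = vfs_write_suspend_umnt(mp);
                        if (error != 0)
                                return (error);
                        need_resume = true;
                }
                error = fn(td, mp);
                if (need_resume)
                        vfs_write_resume(mp, VR_START_WRITE);
                return (error);
        }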
*/ - UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; - UFS_UNLOCK(ump); if (i == MAXQUOTAS) return (0); /* Modified: trunk/sys/ufs/ufs/ufs_vfsops.c =================================================================== --- trunk/sys/ufs/ufs/ufs_vfsops.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_vfsops.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_vfsops.c 278150 2015-02-03 11:54:33Z kib $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_vfsops.c 338943 2018-09-26 14:26:29Z kib $"); #include "opt_quota.h" #include "opt_ufs.h" @@ -93,7 +93,8 @@ void *arg; { #ifndef QUOTA - if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON) + if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON || + (cmds >> SUBCMDSHIFT) == Q_QUOTAOFF) vfs_unbusy(mp); return (EOPNOTSUPP); @@ -116,13 +117,13 @@ break; default: - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } } if ((u_int)type >= MAXQUOTAS) { - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } @@ -133,7 +134,11 @@ break; case Q_QUOTAOFF: + vfs_ref(mp); + vfs_unbusy(mp); + vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); error = quotaoff(td, mp, type); + vn_finished_write(mp); break; case Q_SETQUOTA32: Modified: trunk/sys/ufs/ufs/ufs_vnops.c =================================================================== --- trunk/sys/ufs/ufs/ufs_vnops.c 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufs_vnops.c 2020-02-08 19:39:08 UTC (rev 12316) @@ -36,7 +36,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_vnops.c 332750 2018-04-19 02:50:15Z pfg $"); +__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_vnops.c 346032 2019-04-08 15:52:13Z sjg $"); #include "opt_quota.h" #include "opt_suiddir.h" @@ -123,7 +123,6 @@ static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; -static vop_pathconf_t ufsfifo_pathconf; SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); @@ -325,9 +324,6 @@ struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; -#ifdef QUOTA - int relocked; -#endif #ifdef UFS_ACL struct acl *acl; acl_type_t type; @@ -350,32 +346,14 @@ * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient - * point to call getinoquota(). + * point to call getinoquota(). The lock mode is + * exclusive when the file is opening for write. */ - if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { - - /* - * Upgrade vnode lock, since getinoquota() - * requires exclusive lock to modify inode. - */ - relocked = 1; - vhold(vp); - vn_lock(vp, LK_UPGRADE | LK_RETRY); - VI_LOCK(vp); - if (vp->v_iflag & VI_DOOMED) { - vdropl(vp); - error = ENOENT; - goto relock; - } - vdropl(vp); - } else - relocked = 0; - error = getinoquota(ip); -relock: - if (relocked) - vn_lock(vp, LK_DOWNGRADE | LK_RETRY); - if (error != 0) - return (error); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + error = getinoquota(ip); + if (error != 0) + return (error); + } #endif break; default: @@ -385,8 +363,7 @@ /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" - * is here, because without it, * it would be impossible for the owner - * to remove the IMMUTABLE flag. + * permits the owner of the file to remove the IMMUTABLE flag. 
*/ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) @@ -458,7 +435,7 @@ VI_LOCK(vp); ufs_itimes_locked(vp); - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { @@ -469,13 +446,13 @@ /* * Copy from inode table */ - vap->va_fsid = dev2udev(ip->i_dev); + vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; - if (ip->i_ump->um_fstype == UFS1) { + if (I_IS_UFS1(ip)) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; @@ -653,8 +630,7 @@ DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } - if (vap->va_birthtime.tv_sec != VNOVAL && - ip->i_ump->um_fstype == UFS2) { + if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } @@ -951,8 +927,8 @@ struct inode *dip; dip = VTOI(dvp); - uprintf("%s: Bad link count %d on parent inode %d in file system %s\n", - funcname, dip->i_effnlink, dip->i_number, + uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", + funcname, dip->i_effnlink, (intmax_t)dip->i_number, dvp->v_mount->mnt_stat.f_mntonname); } @@ -1362,7 +1338,7 @@ * expunge the original entry's existence. */ if (tip == NULL) { - if (tdp->i_dev != fip->i_dev) + if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); if (doingdirectory && newparent) { /* @@ -1386,7 +1362,7 @@ tdp->i_endoff < tdp->i_size) endoff = tdp->i_endoff; } else { - if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev) + if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). @@ -1547,8 +1523,9 @@ error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred); if (error != 0) - vn_printf(tdvp, "ufs_rename: failed to truncate " - "err %d", error); + vn_printf(tdvp, + "ufs_rename: failed to truncate, error %d\n", + error); #ifdef UFS_DIRHASH else if (tdp->i_dirhash != NULL) ufsdirhash_dirtrunc(tdp, endoff); @@ -2240,7 +2217,7 @@ dstdp.d_fileno = dp->d_ino; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); - dstdp.d_name[dstdp.d_namlen] = '\0'; + dirent_terminate(&dstdp); if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; @@ -2323,12 +2300,9 @@ { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; - struct bufobj *bo; - struct inode *ip; ufs2_daddr_t blkno; int error; - ip = VTOI(vp); if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; @@ -2346,8 +2320,7 @@ return (0); } bp->b_iooffset = dbtob(bp->b_blkno); - bo = ip->i_umbufobj; - BO_STRATEGY(bo, bp); + BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); return (0); } @@ -2364,7 +2337,7 @@ struct inode *ip = VTOI(vp); printf("\tino %lu, on dev %s", (u_long)ip->i_number, - devtoname(ip->i_dev)); + devtoname(ITODEV(ip))); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); @@ -2414,30 +2387,6 @@ } /* - * Return POSIX pathconf information applicable to fifos. 
- */ -static int -ufsfifo_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - } */ *ap; -{ - - switch (ap->a_name) { - case _PC_ACL_EXTENDED: - case _PC_ACL_NFS4: - case _PC_ACL_PATH_MAX: - case _PC_MAC_PRESENT: - return (ufs_pathconf(ap)); - default: - return (fifo_specops.vop_pathconf(ap)); - } - /* NOTREACHED */ -} - -/* * Return POSIX pathconf information applicable to ufs filesystems. */ static int @@ -2452,17 +2401,14 @@ error = 0; switch (ap->a_name) { - case _PC_LINK_MAX: - *ap->a_retval = LINK_MAX; - break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; - case _PC_PATH_MAX: - *ap->a_retval = PATH_MAX; - break; case _PC_PIPE_BUF: - *ap->a_retval = PIPE_BUF; + if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) + *ap->a_retval = PIPE_BUF; + else + error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; @@ -2470,28 +2416,20 @@ case _PC_NO_TRUNC: *ap->a_retval = 1; break; +#ifdef UFS_ACL case _PC_ACL_EXTENDED: -#ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; -#else - *ap->a_retval = 0; -#endif break; - case _PC_ACL_NFS4: -#ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; -#else - *ap->a_retval = 0; + break; #endif - break; - case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) @@ -2502,24 +2440,17 @@ *ap->a_retval = 3; #endif break; +#ifdef MAC case _PC_MAC_PRESENT: -#ifdef MAC if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; -#else - *ap->a_retval = 0; + break; #endif - break; case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; - case _PC_ASYNC_IO: - /* _PC_ASYNC_IO should have been handled by upper layers. */ - KASSERT(0, ("_PC_ASYNC_IO should not get here")); - error = EINVAL; - break; case _PC_PRIO_IO: *ap->a_retval = 0; break; @@ -2549,7 +2480,7 @@ break; default: - error = EINVAL; + error = vop_stdpathconf(ap); break; } return (error); @@ -2571,6 +2502,11 @@ vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); + /* + * Only unallocated inodes should be of type VNON. + */ + if (ip->i_mode != 0 && vp->v_type == VNON) + return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); @@ -2822,7 +2758,7 @@ .vop_inactive = ufs_inactive, .vop_kqfilter = ufsfifo_kqfilter, .vop_markatime = ufs_markatime, - .vop_pathconf = ufsfifo_pathconf, + .vop_pathconf = ufs_pathconf, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, Modified: trunk/sys/ufs/ufs/ufsmount.h =================================================================== --- trunk/sys/ufs/ufs/ufsmount.h 2020-02-08 19:38:54 UTC (rev 12315) +++ trunk/sys/ufs/ufs/ufsmount.h 2020-02-08 19:39:08 UTC (rev 12316) @@ -28,14 +28,12 @@ * SUCH DAMAGE. * * @(#)ufsmount.h 8.6 (Berkeley) 3/30/95 - * $FreeBSD: stable/10/sys/ufs/ufs/ufsmount.h 297787 2016-04-10 16:32:21Z kib $ + * $FreeBSD: stable/11/sys/ufs/ufs/ufsmount.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _UFS_UFS_UFSMOUNT_H_ #define _UFS_UFS_UFSMOUNT_H_ -#include <sys/buf.h> /* XXX For struct workhead. 
*/ - /* * Arguments to mount UFS-based filesystems */ @@ -111,8 +109,8 @@ #define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd) #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) #define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb)) -#define UFS_RDONLY(aa) ((aa)->i_ump->um_rdonly(aa)) -#define UFS_SNAPGONE(aa) ((aa)->i_ump->um_snapgone(aa)) +#define UFS_RDONLY(aa) (ITOUMP(aa)->um_rdonly(aa)) +#define UFS_SNAPGONE(aa) (ITOUMP(aa)->um_snapgone(aa)) #define UFS_LOCK(aa) mtx_lock(&(aa)->um_lock) #define UFS_UNLOCK(aa) mtx_unlock(&(aa)->um_lock) From laffer1 at midnightbsd.org Sat Feb 8 14:40:32 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:40:32 -0500 (EST) Subject: [Midnightbsd-cvs] src [12317] trunk/sys/tools/embed_mfs.sh: sync with FreeBSD 11-stable Message-ID: <202002081940.018JeW9h063726@stargazer.midnightbsd.org> Revision: 12317 http://svnweb.midnightbsd.org/src/?rev=12317 Author: laffer1 Date: 2020-02-08 14:40:31 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/tools/embed_mfs.sh Modified: trunk/sys/tools/embed_mfs.sh =================================================================== --- trunk/sys/tools/embed_mfs.sh 2020-02-08 19:39:08 UTC (rev 12316) +++ trunk/sys/tools/embed_mfs.sh 2020-02-08 19:40:31 UTC (rev 12317) @@ -23,18 +23,62 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # -# $FreeBSD: stable/10/sys/tools/embed_mfs.sh 175984 2008-02-05 10:46:30Z raj $ +# $FreeBSD: stable/11/sys/tools/embed_mfs.sh 331691 2018-03-28 17:19:04Z emaste $ # $MidnightBSD$ # -# Embed the MFS image into the kernel body (expects space reserved via -# MD_ROOT_SIZE) +# Embed an MFS image into the kernel body or the loader body (expects space +# reserved via MD_ROOT_SIZE (kernel) or MD_IMAGE_SIZE (loader)) # -# $1: kernel filename +# $1: kernel or loader filename # $2: MFS image filename # -obs=`strings -at d $1 | grep "MFS Filesystem goes here" | awk '{print $1}'` -dd if=$2 ibs=8192 of=$1 obs=${obs} oseek=1 conv=notrunc 2> /dev/null +if [ $# -ne 2 ]; then + echo "usage: $(basename $0) target mfs_image" + exit 0 +fi +if [ ! -w "$1" ]; then + echo $1 not writable + exit 1 +fi -strings $1 | grep 'MFS Filesystem had better STOP here' > /dev/null || \ - (rm $1 && echo "MFS image too large" && false) +mfs_size=`stat -f '%z' $2 2> /dev/null` +# If we can't determine MFS image size - bail. +[ -z ${mfs_size} ] && echo "Can't determine MFS image size" && exit 1 + +err_no_mfs="Can't locate mfs section within " + +if file -b $1 | grep -q '^ELF ..-bit .SB executable'; then + + sec_info=`elfdump -c $1 2> /dev/null | grep -A 5 -E "sh_name: oldmfs$"` + # If we can't find the mfs section within the given kernel - bail. + [ -z "${sec_info}" ] && echo "${err_no_mfs} $1" && exit 1 + + sec_size=`echo "${sec_info}" | awk '/sh_size/ {print $2}' 2>/dev/null` + sec_start=`echo "${sec_info}" | \ + awk '/sh_offset/ {print $2}' 2>/dev/null` + +else + + #try to find start byte of MFS start flag otherwise - bail. + sec_start=`strings -at d $1 | grep "MFS Filesystem goes here"` || \ + { echo "${err_no_mfs} $1"; exit 1; } + sec_start=`echo ${sec_start} | awk '{print $1}'` + + #try to find start byte of MFS end flag otherwise - bail. 
+ sec_end=`strings -at d $1 | \ + grep "MFS Filesystem had better STOP here"` || \ + { echo "${err_no_mfs} $1"; exit 1; } + sec_end=`echo ${sec_end} | awk '{print $1}'` + + #calculate MFS section size + sec_size=`expr ${sec_end} - ${sec_start}` + +fi + +# If the mfs section size is smaller than the mfs image - bail. +[ ${sec_size} -lt ${mfs_size} ] && echo "MFS image too large" && exit 1 + +# Dump the mfs image into the mfs section +dd if=$2 ibs=8192 of=$1 obs=${sec_start} oseek=1 conv=notrunc 2> /dev/null && \ + echo "MFS image embedded into $1" && exit 0 From laffer1 at midnightbsd.org Sat Feb 8 14:41:46 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:41:46 -0500 (EST) Subject: [Midnightbsd-cvs] src [12318] trunk/sys/tools/fw_stub.awk: sync with FreeBSD 11-stable Message-ID: <202002081941.018JfkIk063810@stargazer.midnightbsd.org> Revision: 12318 http://svnweb.midnightbsd.org/src/?rev=12318 Author: laffer1 Date: 2020-02-08 14:41:45 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/tools/fw_stub.awk Property Changed: ---------------- trunk/sys/tools/fw_stub.awk Modified: trunk/sys/tools/fw_stub.awk =================================================================== --- trunk/sys/tools/fw_stub.awk 2020-02-08 19:40:31 UTC (rev 12317) +++ trunk/sys/tools/fw_stub.awk 2020-02-08 19:41:45 UTC (rev 12318) @@ -25,8 +25,8 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # -# $FreeBSD: src/sys/tools/fw_stub.awk,v 1.6.2.1 2009/11/02 09:47:41 fjoe Exp $ -# $MidnightBSD: src/sys/tools/fw_stub.awk,v 1.3 2012/01/11 04:11:27 laffer1 Exp $ +# $FreeBSD: stable/11/sys/tools/fw_stub.awk 289399 2015-10-16 00:38:05Z bdrewery $ +# $MidnightBSD$ # # Script to generate module .c file from a list of firmware images @@ -157,7 +157,7 @@ printc("\ TUNABLE_LONG_FETCH(\"legal." opt_l ".license_ack\", &" opt_l "_license_ack);\ if (!" opt_l "_license_ack) {\ - printf(\"" opt_m ": You need to read the LICENSE file in /usr/share/doc/legal/" opt_l "/.\\n\");\ + printf(\"" opt_m ": You need to read the LICENSE file in /usr/share/doc/legal/" opt_l ".LICENSE.\\n\");\ printf(\"" opt_m ": If you agree with the license, set legal." 
opt_l ".license_ack=1 in /boot/loader.conf.\\n\");\ return(EPERM);\ }\n"); Property changes on: trunk/sys/tools/fw_stub.awk ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:43:52 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:43:52 -0500 (EST) Subject: [Midnightbsd-cvs] src [12319] trunk/sys/tools/fdt/make_dtb.sh: sync with FreeBSD 11-stable Message-ID: <202002081943.018JhqsJ063931@stargazer.midnightbsd.org> Revision: 12319 http://svnweb.midnightbsd.org/src/?rev=12319 Author: laffer1 Date: 2020-02-08 14:43:52 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/tools/fdt/make_dtb.sh Modified: trunk/sys/tools/fdt/make_dtb.sh =================================================================== --- trunk/sys/tools/fdt/make_dtb.sh 2020-02-08 19:41:45 UTC (rev 12318) +++ trunk/sys/tools/fdt/make_dtb.sh 2020-02-08 19:43:52 UTC (rev 12319) @@ -1,6 +1,6 @@ #!/bin/sh # -# $FreeBSD: stable/10/sys/tools/fdt/make_dtb.sh 273715 2014-10-27 00:47:55Z ian $ +# $FreeBSD: stable/11/sys/tools/fdt/make_dtb.sh 318196 2017-05-11 20:30:44Z gonzo $ # $MidnightBSD$ # Script generates dtb file ($3) from dts source ($2) in build tree S ($1) @@ -21,5 +21,5 @@ dtb=${dtb_path}/`basename $d .dts`.dtb echo "converting $d -> $dtb" cpp -P -x assembler-with-cpp -I $S/gnu/dts/include -I $S/boot/fdt/dts/${MACHINE} -I $S/gnu/dts/${MACHINE} -include $d /dev/null | - dtc -O dtb -o $dtb -b 0 -p 1024 -i $S/boot/fdt/dts/${MACHINE} -i $S/gnu/dts/${MACHINE} + dtc -@ -O dtb -o $dtb -b 0 -p 1024 -i $S/boot/fdt/dts/${MACHINE} -i $S/gnu/dts/${MACHINE} done From laffer1 at midnightbsd.org Sat Feb 8 14:46:23 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:46:23 -0500 (EST) Subject: [Midnightbsd-cvs] src [12320] trunk/sys/teken: sync with FreeBSD 11-stable Message-ID: <202002081946.018JkNAu064732@stargazer.midnightbsd.org> Revision: 12320 http://svnweb.midnightbsd.org/src/?rev=12320 Author: laffer1 Date: 2020-02-08 14:46:22 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/teken/demo/Makefile trunk/sys/teken/demo/teken_demo.c trunk/sys/teken/libteken/Makefile trunk/sys/teken/libteken/teken.3 trunk/sys/teken/sequences trunk/sys/teken/stress/Makefile trunk/sys/teken/stress/teken_stress.c trunk/sys/teken/teken.c trunk/sys/teken/teken.h trunk/sys/teken/teken_scs.h trunk/sys/teken/teken_subr.h trunk/sys/teken/teken_subr_compat.h Modified: trunk/sys/teken/demo/Makefile =================================================================== --- trunk/sys/teken/demo/Makefile 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/demo/Makefile 2020-02-08 19:46:22 UTC (rev 12320) @@ -1,5 +1,5 @@ # $MidnightBSD$ -# $FreeBSD: stable/10/sys/teken/demo/Makefile 226341 2011-10-13 14:20:27Z ed $ +# $FreeBSD: stable/11/sys/teken/demo/Makefile 226341 2011-10-13 14:20:27Z ed $ PROG= teken_demo LDADD= -lncursesw -lteken -lutil Modified: trunk/sys/teken/demo/teken_demo.c =================================================================== --- trunk/sys/teken/demo/teken_demo.c 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/demo/teken_demo.c 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/teken/demo/teken_demo.c 262861 2014-03-06 18:30:56Z jhb $ + * $FreeBSD: stable/11/sys/teken/demo/teken_demo.c 286797 2015-08-15 08:29:13Z ed $ */ #include <sys/ioctl.h> @@ -38,7 +38,7 @@ #include <unistd.h> #include <ncurses.h> -#if defined(__MidnightBSD__) +#if defined(__FreeBSD__) #include <libutil.h> #elif defined(__linux__) #include <pty.h> @@ -73,7 +73,7 @@ #define NCOLS 80 #define NROWS 24 -struct pixel buffer[NCOLS][NROWS]; +static struct pixel buffer[NCOLS][NROWS]; static int ptfd; Modified: trunk/sys/teken/libteken/Makefile =================================================================== --- trunk/sys/teken/libteken/Makefile 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/libteken/Makefile 2020-02-08 19:46:22 UTC (rev 12320) @@ -1,5 +1,5 @@ # $MidnightBSD$ -# $FreeBSD: stable/10/sys/teken/libteken/Makefile 221698 2011-05-09 16:27:39Z ed $ +# $FreeBSD: stable/11/sys/teken/libteken/Makefile 221698 2011-05-09 16:27:39Z ed $ LIB= teken SHLIB_MAJOR= 0 Modified: trunk/sys/teken/libteken/teken.3 =================================================================== --- trunk/sys/teken/libteken/teken.3 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/libteken/teken.3 2020-02-08 19:46:22 UTC (rev 12320) @@ -23,9 +23,9 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.\" $FreeBSD: stable/10/sys/teken/libteken/teken.3 223574 2011-06-26 18:25:10Z ed $ +.\" $FreeBSD: stable/11/sys/teken/libteken/teken.3 330916 2018-03-14 07:47:26Z eadler $ .\" -.Dd May 9, 2011 +.Dd Mar 13, 2017 .Dt TEKEN 3 .Os .Sh NAME @@ -58,6 +58,8 @@ .Ft const char * .Fn teken_get_sequence "teken_t *t" "unsigned int id" .Ft teken_color_t +.Fn teken_256to16 "teken_color_t color" +.Ft teken_color_t .Fn teken_256to8 "teken_color_t color" .Ft void .Fn teken_get_defattr_cons25 "teken_t *t" "int *fg" "int *bg" @@ -164,10 +166,22 @@ any modern applications. .Pp The +.Fn teken_256to16 +function converts an xterm-256 256-color code to an xterm 16-color code +whose color with default palettes is as similar as possible (not very +similar). +The lower 3 bits of the result are the ANSI color and the next lowest +bit is brightness. +Other layers (hardare and software) that only support 16 colors can use +this to avoid knowing the details of 256-color codes. +.Pp +The .Fn teken_256to8 -function converts a color code to one of the 8 primary colors, allowing -the terminal to be rendered on graphics hardware that only supports 8 or -16 colors (e.g. VGA). +function is similar to +.Fn teken_256to16 +except it converts to an ANSI 8-color code. +This is more accurate than discarding the brigtness bit in the result of +.Fn teken_256to16 . .Pp The .Fn teken_get_defattr_cons25 @@ -189,7 +203,7 @@ .Sh SEE ALSO .Xr ncurses 3 , .Xr termcap 3 , -.Xr syscons 4 . +.Xr syscons 4 .Sh HISTORY The .Nm Modified: trunk/sys/teken/sequences =================================================================== --- trunk/sys/teken/sequences 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/sequences 2020-02-08 19:46:22 UTC (rev 12320) @@ -23,7 +23,7 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
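Since the teken.3 hunk above documents the new teken_256to16() alongside teken_256to8(), here is a short usage sketch (the function name is illustrative): split the 16-color result into its ANSI color and the TC_LIGHT brightness bit, the way a 16-color console backend might.

        #include <teken.h>

        static void
        example_map_color(teken_color_t c256, int *ansi, int *bright)
        {
                teken_color_t c16;

                c16 = teken_256to16(c256);
                *ansi = c16 & ~TC_LIGHT;                /* one of the 8 ANSI colors */
                *bright = (c16 & TC_LIGHT) != 0;        /* TC_LIGHT is ORed in */
        }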
# -# $FreeBSD: stable/10/sys/teken/sequences 214817 2010-11-05 00:56:21Z ed $ +# $FreeBSD: stable/11/sys/teken/sequences 214817 2010-11-05 00:56:21Z ed $ # $MidnightBSD$ # File format is as follows: Modified: trunk/sys/teken/stress/Makefile =================================================================== --- trunk/sys/teken/stress/Makefile 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/stress/Makefile 2020-02-08 19:46:22 UTC (rev 12320) @@ -1,5 +1,5 @@ # $MidnightBSD$ -# $FreeBSD: stable/10/sys/teken/stress/Makefile 221698 2011-05-09 16:27:39Z ed $ +# $FreeBSD: stable/11/sys/teken/stress/Makefile 221698 2011-05-09 16:27:39Z ed $ PROG= teken_stress LDADD= -lteken Modified: trunk/sys/teken/stress/teken_stress.c =================================================================== --- trunk/sys/teken/stress/teken_stress.c 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/stress/teken_stress.c 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/teken/stress/teken_stress.c 226100 2011-10-07 12:42:03Z ed $ + * $FreeBSD: stable/11/sys/teken/stress/teken_stress.c 226100 2011-10-07 12:42:03Z ed $ */ #include <sys/cdefs.h> Modified: trunk/sys/teken/teken.c =================================================================== --- trunk/sys/teken/teken.c 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/teken.c 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,17 +24,17 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/teken/teken.c 287776 2015-09-14 09:12:28Z ed $ + * $FreeBSD: stable/11/sys/teken/teken.c 330916 2018-03-14 07:47:26Z eadler $ */ #include <sys/cdefs.h> -#if defined(__MidnightBSD__) && defined(_KERNEL) +#if defined(__FreeBSD__) && defined(_KERNEL) #include <sys/param.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/systm.h> #define teken_assert(x) MPASS(x) -#else /* !(__MidnightBSD__ && _KERNEL) */ +#else /* !(__FreeBSD__ && _KERNEL) */ #include <sys/types.h> #include <assert.h> #include <limits.h> @@ -42,7 +42,7 @@ #include <stdio.h> #include <string.h> #define teken_assert(x) assert(x) -#endif /* __MidnightBSD__ && _KERNEL */ +#endif /* __FreeBSD__ && _KERNEL */ /* debug messages */ #define teken_printf(x,...) @@ -453,55 +453,203 @@ return (0); } +#define k TC_BLACK +#define b TC_BLUE +#define y TC_BROWN +#define c TC_CYAN +#define g TC_GREEN +#define m TC_MAGENTA +#define r TC_RED +#define w TC_WHITE +#define K (TC_BLACK | TC_LIGHT) +#define B (TC_BLUE | TC_LIGHT) +#define Y (TC_BROWN | TC_LIGHT) +#define C (TC_CYAN | TC_LIGHT) +#define G (TC_GREEN | TC_LIGHT) +#define M (TC_MAGENTA | TC_LIGHT) +#define R (TC_RED | TC_LIGHT) +#define W (TC_WHITE | TC_LIGHT) + +/** + * The xterm-256 color map has steps of 0x28 (in the range 0-0xff), except + * for the first step which is 0x5f. Scale to the range 0-6 by dividing + * by 0x28 and rounding down. The range of 0-5 cannot represent the + * larger first step. + * + * This table is generated by the follow rules: + * - if all components are equal, the result is black for (0, 0, 0) and + * (2, 2, 2), else white; otherwise: + * - subtract the smallest component from all components + * - if this gives only one nonzero component, then that is the color + * - else if one component is 2 or more larger than the other nonzero one, + * then that component gives the color + * - else there are 2 nonzero components. 
The color is that of a small + * equal mixture of these components (cyan, yellow or magenta). E.g., + * (0, 5, 6) (Turquoise2) is a much purer cyan than (0, 2, 3) + * (DeepSkyBlue4), but we map both to cyan since we can't represent + * delicate shades of either blue or cyan and blue would be worse. + * Here it is important that components of 1 never occur. Blue would + * be twice as large as green in (0, 1, 2). + */ +static const teken_color_t teken_256to8tab[] = { + /* xterm normal colors: */ + k, r, g, y, b, m, c, w, + + /* xterm bright colors: */ + k, r, g, y, b, m, c, w, + + /* Red0 submap. */ + k, b, b, b, b, b, + g, c, c, b, b, b, + g, c, c, c, b, b, + g, g, c, c, c, b, + g, g, g, c, c, c, + g, g, g, g, c, c, + + /* Red2 submap. */ + r, m, m, b, b, b, + y, k, b, b, b, b, + y, g, c, c, b, b, + g, g, c, c, c, b, + g, g, g, c, c, c, + g, g, g, g, c, c, + + /* Red3 submap. */ + r, m, m, m, b, b, + y, r, m, m, b, b, + y, y, w, b, b, b, + y, y, g, c, c, b, + g, g, g, c, c, c, + g, g, g, g, c, c, + + /* Red4 submap. */ + r, r, m, m, m, b, + r, r, m, m, m, b, + y, y, r, m, m, b, + y, y, y, w, b, b, + y, y, y, g, c, c, + g, g, g, g, c, c, + + /* Red5 submap. */ + r, r, r, m, m, m, + r, r, r, m, m, m, + r, r, r, m, m, m, + y, y, y, r, m, m, + y, y, y, y, w, b, + y, y, y, y, g, c, + + /* Red6 submap. */ + r, r, r, r, m, m, + r, r, r, r, m, m, + r, r, r, r, m, m, + r, r, r, r, m, m, + y, y, y, y, r, m, + y, y, y, y, y, w, + + /* Grey submap. */ + k, k, k, k, k, k, + k, k, k, k, k, k, + w, w, w, w, w, w, + w, w, w, w, w, w, +}; + +/* + * This table is generated from the previous one by setting TC_LIGHT for + * entries whose luminosity in the xterm256 color map is 60% or larger. + * Thus the previous table is currently not really needed. It will be + * used for different fine tuning of the tables. + */ +static const teken_color_t teken_256to16tab[] = { + /* xterm normal colors: */ + k, r, g, y, b, m, c, w, + + /* xterm bright colors: */ + K, R, G, Y, B, M, C, W, + + /* Red0 submap. */ + k, b, b, b, b, b, + g, c, c, b, b, b, + g, c, c, c, b, b, + g, g, c, c, c, b, + g, g, g, c, c, c, + g, g, g, g, c, c, + + /* Red2 submap. */ + r, m, m, b, b, b, + y, K, b, b, B, B, + y, g, c, c, B, B, + g, g, c, c, C, B, + g, G, G, C, C, C, + g, G, G, G, C, C, + + /* Red3 submap. */ + r, m, m, m, b, b, + y, r, m, m, B, B, + y, y, w, B, B, B, + y, y, G, C, C, B, + g, G, G, C, C, C, + g, G, G, G, C, C, + + /* Red4 submap. */ + r, r, m, m, m, b, + r, r, m, m, M, B, + y, y, R, M, M, B, + y, y, Y, W, B, B, + y, Y, Y, G, C, C, + g, G, G, G, C, C, + + /* Red5 submap. */ + r, r, r, m, m, m, + r, R, R, M, M, M, + r, R, R, M, M, M, + y, Y, Y, R, M, M, + y, Y, Y, Y, W, B, + y, Y, Y, Y, G, C, + + /* Red6 submap. */ + r, r, r, r, m, m, + r, R, R, R, M, M, + r, R, R, R, M, M, + r, R, R, R, M, M, + y, Y, Y, Y, R, M, + y, Y, Y, Y, Y, W, + + /* Grey submap. */ + k, k, k, k, k, k, + K, K, K, K, K, K, + w, w, w, w, w, w, + W, W, W, W, W, W, +}; + +#undef k +#undef b +#undef y +#undef c +#undef g +#undef m +#undef r +#undef w +#undef K +#undef B +#undef Y +#undef C +#undef G +#undef M +#undef R +#undef W + teken_color_t teken_256to8(teken_color_t c) { - unsigned int r, g, b; - if (c < 16) { - /* Traditional color indices. */ - return (c % 8); - } else if (c >= 244) { - /* Upper grayscale colors. */ - return (TC_WHITE); - } else if (c >= 232) { - /* Lower grayscale colors. */ - return (TC_BLACK); - } + return (teken_256to8tab[c % 256]); +} - /* Convert to RGB. 
*/ - c -= 16; - b = c % 6; - g = (c / 6) % 6; - r = c / 36; +teken_color_t +teken_256to16(teken_color_t c) +{ - if (r < g) { - /* Possibly green. */ - if (g < b) - return (TC_BLUE); - else if (g > b) - return (TC_GREEN); - else - return (TC_CYAN); - } else if (r > g) { - /* Possibly red. */ - if (r < b) - return (TC_BLUE); - else if (r > b) - return (TC_RED); - else - return (TC_MAGENTA); - } else { - /* Possibly brown. */ - if (g < b) - return (TC_BLUE); - else if (g > b) - return (TC_BROWN); - else if (r < 3) - return (TC_BLACK); - else - return (TC_WHITE); - } + return (teken_256to16tab[c % 256]); } static const char * const special_strings_cons25[] = { Modified: trunk/sys/teken/teken.h =================================================================== --- trunk/sys/teken/teken.h 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/teken.h 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/teken/teken.h 262861 2014-03-06 18:30:56Z jhb $ + * $FreeBSD: stable/11/sys/teken/teken.h 330916 2018-03-14 07:47:26Z eadler $ */ #ifndef _TEKEN_H_ @@ -57,6 +57,7 @@ #define TC_CYAN 6 #define TC_WHITE 7 #define TC_NCOLORS 8 +#define TC_LIGHT 8 /* ORed with the others. */ typedef struct { teken_unit_t tp_row; @@ -204,6 +205,7 @@ void teken_set_cons25(teken_t *); /* Color conversion. */ +teken_color_t teken_256to16(teken_color_t); teken_color_t teken_256to8(teken_color_t); #endif /* !_TEKEN_H_ */ Modified: trunk/sys/teken/teken_scs.h =================================================================== --- trunk/sys/teken/teken_scs.h 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/teken_scs.h 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/teken/teken_scs.h 203659 2010-02-08 09:16:59Z ed $ + * $FreeBSD: stable/11/sys/teken/teken_scs.h 203659 2010-02-08 09:16:59Z ed $ */ static inline teken_char_t Modified: trunk/sys/teken/teken_subr.h =================================================================== --- trunk/sys/teken/teken_subr.h 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/teken_subr.h 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/teken/teken_subr.h 287776 2015-09-14 09:12:28Z ed $ + * $FreeBSD: stable/11/sys/teken/teken_subr.h 287098 2015-08-24 07:49:27Z ed $ */ static void teken_subr_cursor_up(teken_t *, unsigned int); Modified: trunk/sys/teken/teken_subr_compat.h =================================================================== --- trunk/sys/teken/teken_subr_compat.h 2020-02-08 19:43:52 UTC (rev 12319) +++ trunk/sys/teken/teken_subr_compat.h 2020-02-08 19:46:22 UTC (rev 12320) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/teken/teken_subr_compat.h 214817 2010-11-05 00:56:21Z ed $ + * $FreeBSD: stable/11/sys/teken/teken_subr_compat.h 330916 2018-03-14 07:47:26Z eadler $ */ static void @@ -41,8 +41,8 @@ teken_subr_cons25_set_adapter_background(teken_t *t, unsigned int c) { - t->t_defattr.ta_bgcolor = cons25_colors[c % 8]; - t->t_curattr.ta_bgcolor = cons25_colors[c % 8]; + t->t_defattr.ta_bgcolor = cons25_colors[c % 8] | (c & 8); + t->t_curattr.ta_bgcolor = cons25_colors[c % 8] | (c & 8); } static void @@ -49,15 +49,8 @@ teken_subr_cons25_set_adapter_foreground(teken_t *t, unsigned int c) { - t->t_defattr.ta_fgcolor = cons25_colors[c % 8]; - t->t_curattr.ta_fgcolor = cons25_colors[c % 8]; - if (c >= 8) { - t->t_defattr.ta_format |= TF_BOLD; - t->t_curattr.ta_format |= TF_BOLD; - } else { - t->t_defattr.ta_format &= ~TF_BOLD; - t->t_curattr.ta_format &= ~TF_BOLD; - } + t->t_defattr.ta_fgcolor = cons25_colors[c % 8] | (c & 8); + t->t_curattr.ta_fgcolor = cons25_colors[c % 8] | (c & 8); } static const teken_color_t cons25_revcolors[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; From laffer1 at midnightbsd.org Sat Feb 8 14:47:27 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:47:27 -0500 (EST) Subject: [Midnightbsd-cvs] src [12321] trunk/sys/tests: sync with FreeBSD 11-stable Message-ID: <202002081947.018JlRIX064801@stargazer.midnightbsd.org> Revision: 12321 http://svnweb.midnightbsd.org/src/?rev=12321 Author: laffer1 Date: 2020-02-08 14:47:26 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Added Paths: ----------- trunk/sys/tests/ trunk/sys/tests/callout_test/ trunk/sys/tests/callout_test/callout_test.c trunk/sys/tests/callout_test.h trunk/sys/tests/framework/ trunk/sys/tests/framework/kern_testfrwk.c trunk/sys/tests/kern_testfrwk.h Added: trunk/sys/tests/callout_test/callout_test.c =================================================================== --- trunk/sys/tests/callout_test/callout_test.c (rev 0) +++ trunk/sys/tests/callout_test/callout_test.c 2020-02-08 19:47:26 UTC (rev 12321) @@ -0,0 +1,284 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 Netflix Inc. All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/tests/callout_test/callout_test.c 319168 2017-05-30 02:53:00Z ngie $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/cpuctl.h> +#include <sys/fcntl.h> +#include <sys/ioccom.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/pmckern.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <tests/kern_testfrwk.h> +#include <tests/callout_test.h> +#include <machine/cpu.h> + +MALLOC_DEFINE(M_CALLTMP, "Temp callout Memory", "CalloutTest"); + +struct callout_run { + struct mtx lock; + struct callout *co_array; + int co_test; + int co_number_callouts; + int co_return_npa; + int co_completed; + int callout_waiting; + int drain_calls; + int cnt_zero; + int cnt_one; + int index; +}; + +static struct callout_run *comaster[MAXCPU]; + +uint64_t callout_total = 0; + +static void execute_the_co_test(struct callout_run *rn); + +static void +co_saydone(void *arg) +{ + struct callout_run *rn; + + rn = (struct callout_run *)arg; + printf("The callout test is now complete for thread %d\n", + rn->index); + printf("number_callouts:%d\n", + rn->co_number_callouts); + printf("Callouts that bailed (Not PENDING or ACTIVE cleared):%d\n", + rn->co_return_npa); + printf("Callouts that completed:%d\n", rn->co_completed); + printf("Drain calls:%d\n", rn->drain_calls); + printf("Zero returns:%d non-zero:%d\n", + rn->cnt_zero, + rn->cnt_one); + +} + +static void +drainit(void *arg) +{ + struct callout_run *rn; + + rn = (struct callout_run *)arg; + mtx_lock(&rn->lock); + rn->drain_calls++; + mtx_unlock(&rn->lock); +} + +static void +test_callout(void *arg) +{ + struct callout_run *rn; + int cpu; + + critical_enter(); + cpu = curcpu; + critical_exit(); + rn = (struct callout_run *)arg; + atomic_add_int(&rn->callout_waiting, 1); + mtx_lock(&rn->lock); + if (callout_pending(&rn->co_array[cpu]) || + !callout_active(&rn->co_array[cpu])) { + rn->co_return_npa++; + atomic_subtract_int(&rn->callout_waiting, 1); + mtx_unlock(&rn->lock); + return; + } + callout_deactivate(&rn->co_array[cpu]); + rn->co_completed++; + mtx_unlock(&rn->lock); + atomic_subtract_int(&rn->callout_waiting, 1); +} + +void +execute_the_co_test(struct callout_run *rn) +{ + int i, ret, cpu; + uint32_t tk_s, tk_e, tk_d; + + mtx_lock(&rn->lock); + rn->callout_waiting = 0; + for (i = 0; i < rn->co_number_callouts; i++) { + if (rn->co_test == 1) { + /* start all on spread out cpu's */ + cpu = i % mp_ncpus; + callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn, + cpu, 0); + } else { + /* Start all on the same CPU */ + callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn, + rn->index, 0); + } + } + tk_s = ticks; + while (rn->callout_waiting != rn->co_number_callouts) { + cpu_spinwait(); + tk_e = ticks; + tk_d = tk_e - tk_s; + if (tk_d > 100) { + break; + } + } + /* OK everyone is waiting and we have the lock */ + for (i = 0; i < rn->co_number_callouts; i++) { + ret = callout_async_drain(&rn->co_array[i], drainit); + if (ret) { + rn->cnt_one++; + } else { + rn->cnt_zero++; + } + } + rn->callout_waiting -= rn->cnt_one; + mtx_unlock(&rn->lock); + /* Now wait until all are done */ + tk_s = ticks; + while (rn->callout_waiting > 0) { + cpu_spinwait(); + tk_e = ticks; + tk_d = tk_e - tk_s; + if (tk_d > 100) { + break; + 
} + } + co_saydone((void *)rn); +} + + +static void +run_callout_test(struct kern_test *test) +{ + struct callout_test *u; + size_t sz; + int i; + struct callout_run *rn; + int index = test->tot_threads_running; + + u = (struct callout_test *)test->test_options; + if (comaster[index] == NULL) { + rn = comaster[index] = malloc(sizeof(struct callout_run), M_CALLTMP, M_WAITOK); + memset(comaster[index], 0, sizeof(struct callout_run)); + mtx_init(&rn->lock, "callouttest", NULL, MTX_DUPOK); + rn->index = index; + } else { + rn = comaster[index]; + rn->co_number_callouts = rn->co_return_npa = 0; + rn->co_completed = rn->callout_waiting = 0; + rn->drain_calls = rn->cnt_zero = rn->cnt_one = 0; + if (rn->co_array) { + free(rn->co_array, M_CALLTMP); + rn->co_array = NULL; + } + } + rn->co_number_callouts = u->number_of_callouts; + rn->co_test = u->test_number; + sz = sizeof(struct callout) * rn->co_number_callouts; + rn->co_array = malloc(sz, M_CALLTMP, M_WAITOK); + for (i = 0; i < rn->co_number_callouts; i++) { + callout_init(&rn->co_array[i], CALLOUT_MPSAFE); + } + execute_the_co_test(rn); +} + +int callout_test_is_loaded = 0; + +static void +cocleanup(void) +{ + int i; + + for (i = 0; i < MAXCPU; i++) { + if (comaster[i]) { + if (comaster[i]->co_array) { + free(comaster[i]->co_array, M_CALLTMP); + comaster[i]->co_array = NULL; + } + free(comaster[i], M_CALLTMP); + comaster[i] = NULL; + } + } +} + +static int +callout_test_modevent(module_t mod, int type, void *data) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + err = kern_testframework_register("callout_test", + run_callout_test); + if (err) { + printf("Can't load callout_test err:%d returned\n", + err); + } else { + memset(comaster, 0, sizeof(comaster)); + callout_test_is_loaded = 1; + } + break; + case MOD_QUIESCE: + err = kern_testframework_deregister("callout_test"); + if (err == 0) { + callout_test_is_loaded = 0; + cocleanup(); + } + break; + case MOD_UNLOAD: + if (callout_test_is_loaded) { + err = kern_testframework_deregister("callout_test"); + if (err == 0) { + cocleanup(); + callout_test_is_loaded = 0; + } + } + break; + default: + return (EOPNOTSUPP); + } + return (err); +} + +static moduledata_t callout_test_mod = { + .name = "callout_test", + .evhand = callout_test_modevent, + .priv = 0 +}; + +MODULE_DEPEND(callout_test, kern_testframework, 1, 1, 1); +DECLARE_MODULE(callout_test, callout_test_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Property changes on: trunk/sys/tests/callout_test/callout_test.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/tests/callout_test.h =================================================================== --- trunk/sys/tests/callout_test.h (rev 0) +++ trunk/sys/tests/callout_test.h 2020-02-08 19:47:26 UTC (rev 12321) @@ -0,0 +1,35 @@ +/* $MidnightBSD$ */ +#ifndef __callout_test_h__ +#define __callout_test_h__ +/*- + * Copyright (c) 2015 + * Netflix Incorporated, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + *__FBSDID("$FreeBSD: stable/11/sys/tests/callout_test.h 290663 2015-11-10 14:14:41Z rrs $"); + * + */ +struct callout_test { + int number_of_callouts; + int test_number; +}; +#endif Property changes on: trunk/sys/tests/callout_test.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/tests/framework/kern_testfrwk.c =================================================================== --- trunk/sys/tests/framework/kern_testfrwk.c (rev 0) +++ trunk/sys/tests/framework/kern_testfrwk.c 2020-02-08 19:47:26 UTC (rev 12321) @@ -0,0 +1,342 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 + * Netflix Incorporated, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: stable/11/sys/tests/framework/kern_testfrwk.c 319174 2017-05-30 03:10:05Z ngie $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sdt.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/queue.h> +#include <tests/kern_testfrwk.h> +#ifdef SMP +#include <machine/cpu.h> +#endif + +struct kern_test_list { + TAILQ_ENTRY(kern_test_list) next; + char name[TEST_NAME_LEN]; + kerntfunc func; +}; + +TAILQ_HEAD(ktestlist, kern_test_list); + +struct kern_test_entry { + TAILQ_ENTRY(kern_test_entry) next; + struct kern_test_list *kt_e; + struct kern_test kt_data; +}; + +TAILQ_HEAD(ktestqueue, kern_test_entry); + +MALLOC_DEFINE(M_KTFRWK, "kern_tfrwk", "Kernel Test Framework"); +struct kern_totfrwk { + struct taskqueue *kfrwk_tq; + struct task kfrwk_que; + struct ktestlist kfrwk_testlist; + struct ktestqueue kfrwk_testq; + struct mtx kfrwk_mtx; + int kfrwk_waiting; +}; + +struct kern_totfrwk kfrwk; +static int ktest_frwk_inited = 0; + +#define KTFRWK_MUTEX_INIT() mtx_init(&kfrwk.kfrwk_mtx, "kern_test_frwk", "tfrwk", MTX_DEF) + +#define KTFRWK_DESTROY() mtx_destroy(&kfrwk.kfrwk_mtx) + +#define KTFRWK_LOCK() mtx_lock(&kfrwk.kfrwk_mtx) + +#define KTFRWK_UNLOCK() mtx_unlock(&kfrwk.kfrwk_mtx) + +static void +kfrwk_task(void *context, int pending) +{ + struct kern_totfrwk *tf; + struct kern_test_entry *wk; + int free_mem = 0; + struct kern_test kt_data; + kerntfunc ktf; + + memset(&kt_data, 0, sizeof(kt_data)); + ktf = NULL; + tf = (struct kern_totfrwk *)context; + KTFRWK_LOCK(); + wk = TAILQ_FIRST(&tf->kfrwk_testq); + if (wk) { + wk->kt_data.tot_threads_running--; + tf->kfrwk_waiting--; + memcpy(&kt_data, &wk->kt_data, sizeof(kt_data)); + if (wk->kt_data.tot_threads_running == 0) { + TAILQ_REMOVE(&tf->kfrwk_testq, wk, next); + free_mem = 1; + } else { + /* Wake one of my colleagues up to help too */ + taskqueue_enqueue(tf->kfrwk_tq, &tf->kfrwk_que); + } + if (wk->kt_e) { + ktf = wk->kt_e->func; + } + } + KTFRWK_UNLOCK(); + if (wk && free_mem) { + free(wk, M_KTFRWK); + } + /* Execute the test */ + if (ktf) { + (*ktf) (&kt_data); + } + /* We are done */ + atomic_add_int(&tf->kfrwk_waiting, 1); +} + +static int +kerntest_frwk_init(void) +{ + u_int ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; + + KTFRWK_MUTEX_INIT(); + TAILQ_INIT(&kfrwk.kfrwk_testq); + TAILQ_INIT(&kfrwk.kfrwk_testlist); + /* Now lets start up a number of tasks to do the work */ + TASK_INIT(&kfrwk.kfrwk_que, 0, kfrwk_task, &kfrwk); + kfrwk.kfrwk_tq = taskqueue_create_fast("sbtls_task", M_NOWAIT, + taskqueue_thread_enqueue, &kfrwk.kfrwk_tq); + if (kfrwk.kfrwk_tq == NULL) { + printf("Can't start taskqueue for Kernel Test Framework\n"); + panic("Taskqueue init fails for kfrwk"); + } + taskqueue_start_threads(&kfrwk.kfrwk_tq, ncpus, PI_NET, "[kt_frwk task]"); + kfrwk.kfrwk_waiting = ncpus; + ktest_frwk_inited = 1; + return (0); +} + +static int +kerntest_frwk_fini(void) +{ + KTFRWK_LOCK(); + if (!TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) { + /* Still modules registered */ + KTFRWK_UNLOCK(); + return (EBUSY); + } + ktest_frwk_inited = 0; + KTFRWK_UNLOCK(); + taskqueue_free(kfrwk.kfrwk_tq); + /* Ok lets destroy the mutex on the way outs */ + KTFRWK_DESTROY(); + return (0); +} + + +static int kerntest_execute(SYSCTL_HANDLER_ARGS); + +SYSCTL_NODE(_kern, OID_AUTO, testfrwk, CTLFLAG_RW, 0, "Kernel Test Framework"); +SYSCTL_PROC(_kern_testfrwk, OID_AUTO, runtest, (CTLTYPE_STRUCT | CTLFLAG_RW), + 0, 0, kerntest_execute, "IU", "Execute a kernel test"); + +int +kerntest_execute(SYSCTL_HANDLER_ARGS) +{ + struct kern_test kt; + struct kern_test_list *li, *te = NULL; + struct kern_test_entry *kte = NULL; + int error = 0; + + if (ktest_frwk_inited == 0) { + return (ENOENT); + } + /* Find the entry if possible */ + error = SYSCTL_IN(req, &kt, sizeof(struct kern_test)); + if (error) { + return (error); + } + if (kt.num_threads <= 0) { + return (EINVAL); + } + /* Grab some memory */ + kte = malloc(sizeof(struct kern_test_entry), M_KTFRWK, M_WAITOK); + if (kte == NULL) { + error = ENOMEM; + goto out; + } + KTFRWK_LOCK(); + TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { + if (strcmp(li->name, kt.name) == 0) { + te = li; + break; + } + } + if (te == NULL) { + printf("Can't find the test %s\n", kt.name); + error = ENOENT; + free(kte, M_KTFRWK); + goto out; + } + /* Ok we have a test item to run, can we? */ + if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) { + /* We don't know if there is enough threads */ + error = EAGAIN; + free(kte, M_KTFRWK); + goto out; + } + if (kfrwk.kfrwk_waiting < kt.num_threads) { + error = E2BIG; + free(kte, M_KTFRWK); + goto out; + } + kt.tot_threads_running = kt.num_threads; + /* Ok it looks like we can do it, lets get an entry */ + kte->kt_e = li; + memcpy(&kte->kt_data, &kt, sizeof(kt)); + TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testq, kte, next); + taskqueue_enqueue(kfrwk.kfrwk_tq, &kfrwk.kfrwk_que); +out: + KTFRWK_UNLOCK(); + return (error); +} + +int +kern_testframework_register(const char *name, kerntfunc func) +{ + int error = 0; + struct kern_test_list *li, *te = NULL; + int len; + + len = strlen(name); + if (len >= TEST_NAME_LEN) { + return (E2BIG); + } + te = malloc(sizeof(struct kern_test_list), M_KTFRWK, M_WAITOK); + if (te == NULL) { + error = ENOMEM; + goto out; + } + KTFRWK_LOCK(); + /* First does it already exist? */ + TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { + if (strcmp(li->name, name) == 0) { + error = EALREADY; + free(te, M_KTFRWK); + goto out; + } + } + /* Ok we can do it, lets add it to the list */ + te->func = func; + strcpy(te->name, name); + TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testlist, te, next); +out: + KTFRWK_UNLOCK(); + return (error); +} + +int +kern_testframework_deregister(const char *name) +{ + struct kern_test_list *li, *te = NULL; + u_int ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; + int error = 0; + + KTFRWK_LOCK(); + /* First does it already exist? */ + TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { + if (strcmp(li->name, name) == 0) { + te = li; + break; + } + } + if (te == NULL) { + /* It is not registered so no problem */ + goto out; + } + if (ncpus != kfrwk.kfrwk_waiting) { + /* We are busy executing something -- can't unload */ + error = EBUSY; + goto out; + } + if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) { + /* Something still to execute */ + error = EBUSY; + goto out; + } + /* Ok we can remove the dude safely */ + TAILQ_REMOVE(&kfrwk.kfrwk_testlist, te, next); + memset(te, 0, sizeof(struct kern_test_list)); + free(te, M_KTFRWK); +out: + KTFRWK_UNLOCK(); + return (error); +} + +static int +kerntest_mod_init(module_t mod, int type, void *data) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = kerntest_frwk_init(); + break; + case MOD_QUIESCE: + KTFRWK_LOCK(); + if (TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) { + err = 0; + } else { + err = EBUSY; + } + KTFRWK_UNLOCK(); + break; + case MOD_UNLOAD: + err = kerntest_frwk_fini(); + break; + default: + return (EOPNOTSUPP); + } + return (err); +} + +static moduledata_t kern_test_framework = { + .name = "kernel_testfrwk", + .evhand = kerntest_mod_init, + .priv = 0 +}; + +MODULE_VERSION(kern_testframework, 1); +DECLARE_MODULE(kern_testframework, kern_test_framework, SI_SUB_PSEUDO, SI_ORDER_ANY); Property changes on: trunk/sys/tests/framework/kern_testfrwk.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/tests/kern_testfrwk.h =================================================================== --- trunk/sys/tests/kern_testfrwk.h (rev 0) +++ trunk/sys/tests/kern_testfrwk.h 2020-02-08 19:47:26 UTC (rev 12321) @@ -0,0 +1,50 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2015 + * Netflix Incorporated, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *__FBSDID("$FreeBSD: stable/11/sys/tests/kern_testfrwk.h 290663 2015-11-10 14:14:41Z rrs $"); + * + */ +#ifndef _SYS_KERN_TESTFRWKT_H_ +#define _SYS_KERN_TESTFRWKT_H_ + +#define TEST_NAME_LEN 32 +#define TEST_OPTION_SPACE 256 + +struct kern_test { + char name[TEST_NAME_LEN]; + int num_threads; /* Fill in how many threads you want */ + int tot_threads_running; /* For framework */ + uint8_t test_options[TEST_OPTION_SPACE]; +}; + + +typedef void (*kerntfunc)(struct kern_test *); + +#ifdef _KERNEL +int kern_testframework_register(const char *name, kerntfunc); + +int kern_testframework_deregister(const char *name); +#endif +#endif Property changes on: trunk/sys/tests/kern_testfrwk.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:49:05 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:49:05 -0500 (EST) Subject: [Midnightbsd-cvs] src [12322] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002081949.018Jn5fL064902@stargazer.midnightbsd.org> Revision: 12322 http://svnweb.midnightbsd.org/src/?rev=12322 Author: laffer1 Date: 2020-02-08 14:49:04 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Added Paths: ----------- trunk/sys/sys/zlib.h trunk/sys/sys/zutil.h Added: trunk/sys/sys/zlib.h =================================================================== --- trunk/sys/sys/zlib.h (rev 0) +++ trunk/sys/sys/zlib.h 2020-02-08 19:49:04 UTC (rev 12322) @@ -0,0 +1,1019 @@ +/* $MidnightBSD$ */ +/* $FreeBSD: stable/11/sys/sys/zlib.h 281855 2015-04-22 14:38:58Z rodrigc $ */ + +/* + * This file is derived from zlib.h and zconf.h from the zlib-1.0.4 + * distribution by Jean-loup Gailly and Mark Adler, with some additions + * by Paul Mackerras to aid in implementing Deflate compression and + * decompression for PPP packets. + */ + +/* + * ==FILEVERSION 971127== + * + * This marker is used by the Linux installation script to determine + * whether an up-to-date version of this file is already installed. + */ + + +/* +++ zlib.h */ +/*- + zlib.h -- interface of the 'zlib' general purpose compression library + version 1.0.4, Jul 24th, 1996. + + Copyright (C) 1995-1996 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + Jean-loup Gailly Mark Adler + gzip at prep.ai.mit.edu madler at alumni.caltech.edu +*/ +/* + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef _ZLIB_H +#define _ZLIB_H + +#ifdef __cplusplus +extern "C" { +#endif + + +/* +++ zconf.h */ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* From: zconf.h,v 1.20 1996/07/02 15:09:28 me Exp $ */ + +#ifndef _ZCONF_H +#define _ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + */ +#ifdef Z_PREFIX +# define deflateInit_ z_deflateInit_ +# define deflate z_deflate +# define deflateEnd z_deflateEnd +# define inflateInit_ z_inflateInit_ +# define inflate z_inflate +# define inflateEnd z_inflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateSetDictionary z_deflateSetDictionary +# define deflateCopy z_deflateCopy +# define deflateReset z_deflateReset +# define deflateParams z_deflateParams +# define inflateInit2_ z_inflateInit2_ +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateReset z_inflateReset +# define compress z_compress +# define uncompress z_uncompress +# define adler32 z_adler32 +#if 0 +# define crc32 z_crc32 +# define get_crc_table z_get_crc_table +#endif + +# define Byte z_Byte +# define uInt z_uInt +# define uLong z_uLong +# define Bytef z_Bytef +# define charf z_charf +# define intf z_intf +# define uIntf z_uIntf +# define uLongf z_uLongf +# define voidpf z_voidpf +# define voidp z_voidp +#endif + +#if (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32) +# define WIN32 +#endif +#if defined(__GNUC__) || defined(WIN32) || defined(__386__) || defined(__i386__) +# ifndef __32BIT__ +# define __32BIT__ +# endif +#endif +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#if defined(MSDOS) && !defined(__32BIT__) +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#if (defined(MSDOS) || defined(_WINDOWS) || defined(WIN32)) && !defined(STDC) +# define STDC +#endif +#if (defined(__STDC__) || defined(__cplusplus)) && !defined(STDC) +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const +# endif +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__) || defined(applec) ||defined(THINK_C) ||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2 */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + 1 << (windowBits+2) + 1 << (memLevel+9) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(__32BIT__) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR __far +# else +# define FAR far +# endif +#endif +#if defined(__BORLANDC__) && (defined(__SMALL__) || defined(__MEDIUM__)) +# ifndef __32BIT__ +# define SMALL_MEDIUM +# define FAR __far +# endif +#endif +#ifndef FAR +# define FAR +#endif + +typedef unsigned char Byte; /* 8 bits */ +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#if defined(__BORLANDC__) && defined(SMALL_MEDIUM) + /* Borland C/C++ ignores FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + + +/* Compile with -DZLIB_DLL for Windows DLL support */ +#if (defined(_WINDOWS) || defined(WINDOWS)) && defined(ZLIB_DLL) +# include <windows.h> +# define EXPORT WINAPI +#else +# define EXPORT +#endif + +#endif /* _ZCONF_H */ +/* --- zconf.h */ + +#define ZLIB_VERSION "1.0.4P" + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. This version of the library supports only one compression method + (deflation) but other algorithms may be added later and will have the same + stream interface. + + For compression the application must provide the output buffer and + may optionally provide the input buffer for optimization. For decompression, + the application must provide the input buffer and may optionally provide + the output buffer for optimization. + + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The library does not install any signal handler. It is recommended to + add at least a handler for SIGSEGV when decompressing; the library checks + the consistency of the input data whenever possible but may go nuts + for some forms of corrupted input. 
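+
+   As a minimal illustrative sketch of the single-step utility interface
+   (the buffers src and dest, the lengths slen and dlen, and handle_error()
+   are assumed caller-supplied names, not definitions from this header):
+
+     uLong dlen = slen + slen / 1000 + 12;
+     if (compress(dest, &dlen, src, slen) != Z_OK)
+         handle_error();
+
+   The initial dlen is the worst-case output bound documented for compress()
+   later in this header; on success dlen is updated to the compressed size.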
+*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + const char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: ascii or binary */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 +#define Z_PACKET_FLUSH 2 +#define Z_SYNC_FLUSH 3 +#define Z_FULL_FLUSH 4 +#define Z_FINISH 5 +/* Allowed flush values; see deflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_ASCII 1 +#define Z_UNKNOWN 2 +/* Possible values of the data_type field */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +extern const char * EXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +extern int EXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +extern int EXPORT deflate OF((z_streamp strm, int flush)); +/* + Performs one or both of the following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. 
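+
+   A hedged sketch of that calling pattern (strm is an initialized z_stream
+   with all input already present in next_in/avail_in; outbuf, err and
+   write_output() are assumed caller-supplied names, not defined here):
+
+     do {
+         strm.next_out = outbuf;
+         strm.avail_out = sizeof(outbuf);
+         err = deflate(&strm, Z_FINISH);
+         write_output(outbuf, sizeof(outbuf) - strm.avail_out);
+     } while (err == Z_OK);
+
+   The loop ends with err == Z_STREAM_END on success, per the Z_FINISH
+   behavior described further below.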
+ + If the parameter flush is set to Z_PARTIAL_FLUSH, the current compression + block is terminated and flushed to the output buffer so that the + decompressor can get all input data available so far. For method 9, a future + variant on method 8, the current block will be flushed but not terminated. + Z_SYNC_FLUSH has the same effect as partial flush except that the compressed + output is byte aligned (the compressor can clear its internal bit buffer) + and the current block is always terminated; this can be useful if the + compressor has to be restarted from scratch after an interruption (in which + case the internal state of the compressor may be lost). + If flush is set to Z_FULL_FLUSH, the compression block is terminated, a + special marker is output and the compression dictionary is discarded; this + is useful to allow the decompressor to synchronize if one compressed block + has been damaged (see inflateSync below). Flushing degrades compression and + so should be used only when necessary. Using Z_FULL_FLUSH too often can + seriously degrade the compression. If deflate returns with avail_out == 0, + this function must be called again with the same value of the flush + parameter and more output space (updated avail_out), until the flush is + complete (deflate returns with non-zero avail_out). + + If the parameter flush is set to Z_PACKET_FLUSH, the compression + block is terminated, and a zero-length stored block is output, + omitting the length bytes (the effect of this is that the 3-bit type + code 000 for a stored block is output, and the output is then + byte-aligned). This is designed for use at the end of a PPP packet. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd. + + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + 0.1% larger than avail_in plus 12 bytes. If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() may update data_type if it can make a good guess about + the input data type (Z_ASCII or Z_BINARY). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible. +*/ + + +extern int EXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). 
In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +extern int EXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + zalloc, zfree and opaque must be initialized before by the caller. If + zalloc and zfree are set to Z_NULL, inflateInit updates them to use default + allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_VERSION_ERROR if the zlib library version is incompatible + with the version assumed by the caller. msg is set to null if there is no + error message. inflateInit does not perform any decompression: this will be + done by inflate(). +*/ + +#if defined(__FreeBSD__) && defined(_KERNEL) +#define inflate _zlib104_inflate /* FreeBSD already has an inflate :-( */ +#endif + +extern int EXPORT inflate OF((z_streamp strm, int flush)); +/* + Performs one or both of the following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + If the parameter flush is set to Z_PARTIAL_FLUSH or Z_PACKET_FLUSH, + inflate flushes as much output as possible to the output buffer. The + flushing behavior of inflate is not specified for values of the flush + parameter other than Z_PARTIAL_FLUSH, Z_PACKET_FLUSH or Z_FINISH, but the + current implementation actually flushes as much output as possible + anyway. For Z_PACKET_FLUSH, inflate checks that once all the input data + has been consumed, it is expecting to see the length field of a stored + block; if not, it returns Z_DATA_ERROR. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster routine + may be used for the single inflate() call. 
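+
+   A hedged sketch of that usage (strm is an initialized z_stream with
+   next_in/avail_in set; outbuf, err and consume_output() are assumed
+   caller-supplied names, not defined here):
+
+     do {
+         strm.next_out = outbuf;
+         strm.avail_out = sizeof(outbuf);
+         err = inflate(&strm, Z_PARTIAL_FLUSH);
+         consume_output(outbuf, sizeof(outbuf) - strm.avail_out);
+     } while (err == Z_OK);
+
+   The loop stops at Z_STREAM_END (all output produced) or at one of the
+   error codes listed in the next paragraph.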
+ + inflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if the end of the + compressed data has been reached and all uncompressed output has been + produced, Z_NEED_DICT if a preset dictionary is needed at this point (see + inflateSetDictionary below), Z_DATA_ERROR if the input data was corrupted, + Z_STREAM_ERROR if the stream structure was inconsistent (for example if + next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in + the output buffer when Z_FINISH is used. In the Z_DATA_ERROR case, the + application may then call inflateSync to look for a good compression block. + In the Z_NEED_DICT case, strm->adler is set to the Adler32 value of the + dictionary chosen by the compressor. +*/ + + +extern int EXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +extern int EXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. (Method 9 will allow a 64K history buffer and + partial block flushes.) + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library (the value 16 will be allowed for method 9). Larger + values of this parameter result in better compression at the expense of + memory usage. The default value is 15 if deflateInit is used instead. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), or Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match). Filtered data consists mostly of small values with a + somewhat random distribution. In this case, the compression algorithm is + tuned to compress them better. The effect of Z_FILTERED is to force more + Huffman coding and less string matching; it is somewhat intermediate + between Z_DEFAULT and Z_HUFFMAN_ONLY. The strategy parameter only affects + the compression ratio but not the correctness of the compressed output even + if it is not set appropriately. + + If next_in is not null, the library will use this buffer to hold also + some history information; the buffer must either hold the entire input + data, or have at least 1<<(windowBits+1) bytes and be writable. 
If next_in + is null, the library will allocate its own history buffer (and leave next_in + null). next_out need not be provided here but must be provided by the + application for the next call of deflate(). + + If the history buffer is provided by the application, next_in must + must never be changed by the application since the compressor maintains + information inside this buffer from call to call; the application + must provide more input only by increasing avail_in. next_in is always + reset by the library in this case. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was + not enough memory, Z_STREAM_ERROR if a parameter is invalid (such as + an invalid method). msg is set to null if there is no error message. + deflateInit2 does not perform any compression: this will be done by + deflate(). +*/ + +extern int EXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary (history buffer) from the given + byte sequence without producing any compressed output. This function must + be called immediately after deflateInit or deflateInit2, before any call + of deflate. The compressor and decompressor must use exactly the same + dictionary (see inflateSetDictionary). + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and + can be predicted with good accuracy; the data can then be compressed better + than with the default empty dictionary. In this version of the library, + only the last 32K bytes of the dictionary are used. + Upon return of this function, strm->adler is set to the Adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The Adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state + is inconsistent (for example if deflate has already been called for this + stream). deflateSetDictionary does not perform any compression: this will + be done by deflate(). +*/ + +extern int EXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. If + the source stream is using an application-supplied history buffer, a new + buffer is allocated for the destination stream. The compressed output + buffer is always application-supplied. It's the responsibility of the + application to provide the correct values of next_out and avail_out for the + next call of deflate. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. 
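+
+   A hedged sketch of the "try several strategies" use mentioned above
+   (base is an assumed, already-initialized z_stream, and trial_deflate()
+   is a hypothetical caller helper that runs deflate() on the copy):
+
+     z_stream trial;
+     if (deflateCopy(&trial, &base) == Z_OK) {
+         trial_deflate(&trial);
+         deflateEnd(&trial);
+     }
+
+   Each discarded trial stream is released with deflateEnd, as noted above.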
+*/ + +extern int EXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +extern int EXPORT deflateParams OF((z_streamp strm, int level, int strategy)); +/* + Dynamically update the compression level and compression strategy. + This can be used to switch between compression and straight copy of + the input data, or to switch to a different kind of input data requiring + a different strategy. If the compression level is changed, the input + available so far is compressed with the old level (and may be flushed); + the new level will take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +extern int EXPORT deflateOutputPending OF((z_streamp strm)); +/* + Returns the number of bytes of output which are immediately + available from the compressor (i.e. without any further input + or flush). +*/ + +/* +extern int EXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with more compression options. The + fields next_out, zalloc, zfree and opaque must be initialized before by + the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library (the value 16 will be allowed soon). The + default value is 15 if inflateInit is used instead. If a compressed stream + with a larger window size is given as input, inflate() will return with + the error code Z_DATA_ERROR instead of trying to allocate a larger window. + + If next_out is not null, the library will use this buffer for the history + buffer; the buffer must either be large enough to hold the entire output + data, or have at least 1<<windowBits bytes. If next_out is null, the + library will allocate its own buffer (and leave next_out null). next_in + need not be provided here but must be provided by the application for the + next call of inflate(). + + If the history buffer is provided by the application, next_out must + never be changed by the application since the decompressor maintains + history information inside this buffer from call to call; the application + can only reset next_out to the beginning of the history buffer when + avail_out is zero and all output has been consumed. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was + not enough memory, Z_STREAM_ERROR if a parameter is invalid (such as + windowBits < 8). msg is set to null if there is no error message. + inflateInit2 does not perform any decompression: this will be done by + inflate(). +*/ + +extern int EXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary (history buffer) from the given + uncompressed byte sequence. 
This function must be called immediately after + a call of inflate if this call returned Z_NEED_DICT. The dictionary chosen + by the compressor can be determined from the Adler32 value returned by this + call of inflate. The compressor and decompressor must use exactly the same + dictionary (see deflateSetDictionary). + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect Adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +extern int EXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until the special marker (see deflate() + above) can be found, or until all available input is skipped. No output + is provided. + + inflateSync returns Z_OK if the special marker has been found, Z_BUF_ERROR + if no more input was provided, Z_DATA_ERROR if no marker has been found, + or Z_STREAM_ERROR if the stream structure was inconsistent. In the success + case, the application may save the current current value of total_in which + indicates where valid compressed data was found. In the error case, the + application may repeatedly call inflateSync, providing more input each time, + until success or end of the input data. +*/ + +extern int EXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. + The stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +extern int inflateIncomp OF((z_stream *strm)); +/* + This function adds the data at next_in (avail_in bytes) to the output + history without performing any output. There must be no pending output, + and the decompressor must be expecting to see the start of a block. + Calling this function is equivalent to decompressing a stored block + containing the data at next_in (except that the data is not output). +*/ + + /* utility functions */ + +/* + The following utility functions are implemented on top of the + basic stream-oriented functions. To simplify the interface, some + default options are assumed (compression level, window size, + standard memory allocation functions). The source code of these + utility functions can easily be modified if you need special options. +*/ + +extern int EXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least 0.1% larger than + sourceLen plus 12 bytes. Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +extern int EXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. 
Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. (The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the compressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted. +*/ + + +typedef voidp gzFile; + +extern gzFile EXPORT gzopen OF((const char *path, const char *mode)); +/* + Opens a gzip (.gz) file for reading or writing. The mode parameter + is as in fopen ("rb" or "wb") but can also include a compression level + ("wb9"). gzopen can be used to read a file which is not in gzip format; + in this case gzread will directly read from the file without decompression. + gzopen returns NULL if the file could not be opened or if there was + insufficient memory to allocate the (de)compression state; errno + can be checked to distinguish the two cases (if errno is zero, the + zlib error is Z_MEM_ERROR). +*/ + +extern gzFile EXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen() associates a gzFile with the file descriptor fd. File + descriptors are obtained from calls like open, dup, creat, pipe or + fileno (in the file has been previously opened with fopen). + The mode parameter is as in gzopen. + The next call of gzclose on the returned gzFile will also close the + file descriptor fd, just like fclose(fdopen(fd), mode) closes the file + descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). + gzdopen returns NULL if there was insufficient memory to allocate + the (de)compression state. +*/ + +extern int EXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. + If the input file was not in gzip format, gzread copies the given number + of bytes into the buffer. + gzread returns the number of uncompressed bytes actually read (0 for + end of file, -1 for error). */ + +extern int EXPORT gzwrite OF((gzFile file, const voidp buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes actually written + (0 in case of error). +*/ + +extern int EXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +extern int EXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +extern const char * EXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. 
If an + error occurred in the filesystem and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the + compression library. +*/ + +extern uLong EXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); + +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is NULL, this function returns + the required initial value for the checksum. + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +#if 0 +extern uLong EXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running crc with the bytes buf[0..len-1] and return the updated + crc. If buf is NULL, this function returns the required initial value + for the crc. Pre- and post-conditioning (one's complement) is performed + within this function so it shouldn't be done by the application. + Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ +#endif + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +extern int EXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +extern int EXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +extern int EXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, int strategy, + const char *version, int stream_size)); +extern int EXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) + +#if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; /* hack for buggy compilers */ +#endif + +uLongf *get_crc_table OF((void)); /* can be used by asm versions of crc32() */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZLIB_H */ +/* --- zlib.h */ Property changes on: trunk/sys/sys/zlib.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: trunk/sys/sys/zutil.h =================================================================== --- trunk/sys/sys/zutil.h (rev 0) +++ trunk/sys/sys/zutil.h 2020-02-08 19:49:04 UTC (rev 12322) @@ -0,0 +1,232 @@ +/* $MidnightBSD$ */ +/* zutil.h -- internal interface and configuration of the compression 
library + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* From: zutil.h,v 1.16 1996/07/24 13:41:13 me Exp $ */ +/* $FreeBSD: stable/11/sys/sys/zutil.h 281855 2015-04-22 14:38:58Z rodrigc $ */ + +#ifndef _Z_UTIL_H +#define _Z_UTIL_H + +#define ZEXPORT + +#ifdef _KERNEL +#include <sys/zlib.h> +#else +#include "zlib.h" +#endif + +#ifdef _KERNEL +/* Assume this is a *BSD or SVR4 kernel */ +#include <sys/types.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/module.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/kernel.h> +# define HAVE_MEMCPY +# define memcpy(d, s, n) bcopy((s), (d), (n)) +# define memset(d, v, n) bzero((d), (n)) +# define memcmp bcmp + +#else +#if defined(__KERNEL__) +/* Assume this is a Linux kernel */ +#include <linux/string.h> +#define HAVE_MEMCPY + +#else /* not kernel */ + +#if defined(MSDOS)||defined(VMS)||defined(CRAY)||defined(WIN32)||defined(RISCOS) +# include <stddef.h> +# include <errno.h> +#else + extern int errno; +#endif +#ifdef STDC +# include <string.h> +# include <stdlib.h> +#endif +#endif /* __KERNEL__ */ +#endif /* _KERNEL */ + +#ifndef local +# define local static +#endif +/* compile with -Dlocal if your debugger can't find static symbols */ + +typedef unsigned char uch; +typedef uch FAR uchf; +typedef unsigned short ush; +typedef ush FAR ushf; +typedef unsigned long ulg; + +#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] + +#define ERR_RETURN(strm,err) \ + return (strm->msg = (const char*)ERR_MSG(err), (err)) +/* To be used only when the state is known to be valid */ + + /* common constants */ + +#ifndef DEF_WBITS +# define DEF_WBITS MAX_WBITS +#endif +/* default windowBits for decompression. 
MAX_WBITS is for compression only */ + +#if MAX_MEM_LEVEL >= 8 +# define DEF_MEM_LEVEL 8 +#else +# define DEF_MEM_LEVEL MAX_MEM_LEVEL +#endif +/* default memLevel */ + +#define STORED_BLOCK 0 +#define STATIC_TREES 1 +#define DYN_TREES 2 +/* The three kinds of block type */ + +#define MIN_MATCH 3 +#define MAX_MATCH 258 +/* The minimum and maximum match lengths */ + +#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ + + /* target dependencies */ + +#ifdef MSDOS +# define OS_CODE 0x00 +# ifdef __TURBOC__ +# include <alloc.h> +# else /* MSC or DJGPP */ +# include <malloc.h> +# endif +#endif + +#ifdef OS2 +# define OS_CODE 0x06 +#endif + +#ifdef WIN32 /* Window 95 & Windows NT */ +# define OS_CODE 0x0b +#endif + +#if defined(VAXC) || defined(VMS) +# define OS_CODE 0x02 +# define FOPEN(name, mode) \ + fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") +#endif + +#ifdef AMIGA +# define OS_CODE 0x01 +#endif + +#if defined(ATARI) || defined(atarist) +# define OS_CODE 0x05 +#endif + +#ifdef MACOS +# define OS_CODE 0x07 +#endif + +#ifdef __50SERIES /* Prime/PRIMOS */ +# define OS_CODE 0x0F +#endif + +#ifdef TOPS20 +# define OS_CODE 0x0a +#endif + +#if defined(_BEOS_) || defined(RISCOS) +# define fdopen(fd,mode) NULL /* No fdopen() */ +#endif + + /* Common defaults */ + +#ifndef OS_CODE +# define OS_CODE 0x03 /* assume Unix */ +#endif + +#ifndef FOPEN +# define FOPEN(name, mode) fopen((name), (mode)) +#endif + + /* functions */ + +#ifdef HAVE_STRERROR + extern char *strerror OF((int)); +# define zstrerror(errnum) strerror(errnum) +#else +# define zstrerror(errnum) "" +#endif + +#if defined(pyr) +# define NO_MEMCPY +#endif +#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(_MSC_VER) + /* Use our own functions for small and medium model with MSC <= 5.0. + * You may have to use the same strategy for Borland C (untested). 
+ */ +# define NO_MEMCPY +#endif +#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) +# define HAVE_MEMCPY +#endif +#ifdef HAVE_MEMCPY +# ifdef SMALL_MEDIUM /* MSDOS small or medium model */ +# define zmemcpy _fmemcpy +# define zmemcmp _fmemcmp +# define zmemzero(dest, len) _fmemset(dest, 0, len) +# else +# define zmemcpy memcpy +# define zmemcmp memcmp +# define zmemzero(dest, len) memset(dest, 0, len) +# endif +#else + extern void zmemcpy OF((Bytef* dest, Bytef* source, uInt len)); + extern int zmemcmp OF((Bytef* s1, Bytef* s2, uInt len)); + extern void zmemzero OF((Bytef* dest, uInt len)); +#endif + +/* Diagnostic functions */ +#ifdef DEBUG_ZLIB +# include <stdio.h> +# ifndef verbose +# define verbose 0 +# endif + extern void z_error OF((char *m)); +# define Assert(cond,msg) {if(!(cond)) z_error(msg);} +# define Trace(x) fprintf x +# define Tracev(x) {if (verbose) fprintf x ;} +# define Tracevv(x) {if (verbose>1) fprintf x ;} +# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} +# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + + +typedef uLong (*check_func) OF((uLong check, const Bytef *buf, uInt len)); + +voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); +void zcfree OF((voidpf opaque, voidpf ptr)); + +#define ZALLOC(strm, items, size) \ + (*((strm)->zalloc))((strm)->opaque, (items), (size)) +#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) +#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} + +#endif /* _Z_UTIL_H */ Property changes on: trunk/sys/sys/zutil.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property From laffer1 at midnightbsd.org Sat Feb 8 14:49:57 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:49:57 -0500 (EST) Subject: [Midnightbsd-cvs] src [12323] trunk/sys/sys/watchdog.h: sync with FreeBSD 11-stable Message-ID: <202002081949.018Jnvrx064978@stargazer.midnightbsd.org> Revision: 12323 http://svnweb.midnightbsd.org/src/?rev=12323 Author: laffer1 Date: 2020-02-08 14:49:56 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/watchdog.h Modified: trunk/sys/sys/watchdog.h =================================================================== --- trunk/sys/sys/watchdog.h 2020-02-08 19:49:04 UTC (rev 12322) +++ trunk/sys/sys/watchdog.h 2020-02-08 19:49:56 UTC (rev 12323) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/watchdog.h 247405 2013-02-27 19:03:31Z alfred $ + * $FreeBSD: stable/11/sys/sys/watchdog.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_WATCHDOG_H #define _SYS_WATCHDOG_H @@ -111,6 +111,14 @@ u_int wdog_kern_last_timeout(void); int wdog_kern_pat(u_int utim); + +/* + * The following function pointer is used to attach a software watchdog + * if no hardware watchdog has been attached, and if the software module + * has initialized the function pointer. 
+ */ + +extern void (*wdog_software_attach)(void); #endif #endif /* _SYS_WATCHDOG_H */ From laffer1 at midnightbsd.org Sat Feb 8 14:51:00 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:51:00 -0500 (EST) Subject: [Midnightbsd-cvs] src [12324] trunk/sys/sys/wait.h: sync with FreeBSD 11-stable Message-ID: <202002081951.018Jp0qN065697@stargazer.midnightbsd.org> Revision: 12324 http://svnweb.midnightbsd.org/src/?rev=12324 Author: laffer1 Date: 2020-02-08 14:51:00 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/wait.h Modified: trunk/sys/sys/wait.h =================================================================== --- trunk/sys/sys/wait.h 2020-02-08 19:49:56 UTC (rev 12323) +++ trunk/sys/sys/wait.h 2020-02-08 19:51:00 UTC (rev 12324) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)wait.h 8.2 (Berkeley) 7/10/94 - * $FreeBSD: stable/10/sys/sys/wait.h 254218 2013-08-11 14:15:01Z jilles $ + * $FreeBSD: stable/11/sys/sys/wait.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_WAIT_H_ @@ -139,7 +139,19 @@ #define WAIT_MYPGRP 0 /* any process in my process group */ #endif /* __BSD_VISIBLE */ +#if defined(_KERNEL) || defined(_WANT_KW_EXITCODE) + +/* + * Clamp the return code to the low 8 bits from full 32 bit value. + * Should be used in kernel to construct the wait(2)-compatible process + * status to usermode. + */ +#define KW_EXITCODE(ret, sig) W_EXITCODE((ret) & 0xff, (sig)) + +#endif /* _KERNEL || _WANT_KW_EXITCODE */ + #ifndef _KERNEL + #include <sys/types.h> __BEGIN_DECLS From laffer1 at midnightbsd.org Sat Feb 8 14:52:44 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:52:44 -0500 (EST) Subject: [Midnightbsd-cvs] src [12325] trunk/sys/sys/vnode.h: sync with FreeBSD 11-stable Message-ID: <202002081952.018JqijP065804@stargazer.midnightbsd.org> Revision: 12325 http://svnweb.midnightbsd.org/src/?rev=12325 Author: laffer1 Date: 2020-02-08 14:52:43 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/vnode.h Modified: trunk/sys/sys/vnode.h =================================================================== --- trunk/sys/sys/vnode.h 2020-02-08 19:51:00 UTC (rev 12324) +++ trunk/sys/sys/vnode.h 2020-02-08 19:52:43 UTC (rev 12325) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 - * $FreeBSD: stable/10/sys/sys/vnode.h 301100 2016-06-01 04:07:33Z kib $ + * $FreeBSD: stable/11/sys/sys/vnode.h 355443 2019-12-06 11:48:22Z kib $ */ #ifndef _SYS_VNODE_H_ @@ -78,6 +78,7 @@ * c - namecache mutex * f - freelist mutex * i - interlock + * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. @@ -163,8 +164,8 @@ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ - int v_holdcnt; /* i prevents recycling. */ - int v_usecount; /* i ref count of users */ + u_int v_holdcnt; /* I prevents recycling. */ + u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ @@ -234,7 +235,6 @@ * are required for writing but the status may be checked with either. 
*/ #define VI_MOUNT 0x0020 /* Mount in progress */ -#define VI_AGE 0x0040 /* Insert vnode at head of free list */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_ACTIVE 0x0200 /* This vnode is on the active list */ @@ -254,6 +254,7 @@ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ +#define VV_READLINK 0x2000 /* fdescfs linux vnode */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value @@ -303,6 +304,7 @@ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ +#define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ @@ -338,6 +340,8 @@ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ +#define VCREAT 000400000000 /* creating new file */ +#define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. @@ -372,6 +376,8 @@ MALLOC_DECLARE(M_VNODE); #endif +extern u_int ncsizefactor; + /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). @@ -393,6 +399,8 @@ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ +#define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ +#define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ @@ -420,7 +428,6 @@ */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ -extern int async_io_version; /* 0 or POSIX version of AIO i'face */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ @@ -508,7 +515,9 @@ * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single - * filesystem using a single-threaded test. + * filesystem using a single-threaded test. Note that the unreliability is + * limited to false negatives; efforts were made to ensure that false + * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); @@ -576,6 +585,7 @@ /* * Finally, include the default set of vnode operations. */ +typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ @@ -582,6 +592,7 @@ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 +#define VN_OPEN_INVFS 0x00000008 /* * Public vnode manipulation functions. 
@@ -598,10 +609,13 @@ struct ucred; struct uio; struct vattr; +struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); +int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, + daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(int newhashsize); #define cache_enter(dvp, vp, cnp) \ @@ -613,9 +627,8 @@ struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_purge(struct vnode *vp); void cache_purge_negative(struct vnode *vp); -void cache_purgevfs(struct mount *mp); +void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); -int change_root(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, @@ -651,20 +664,20 @@ struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); -void vdrop(struct vnode *); -void vdropl(struct vnode *); +#define vdrop(vp) _vdrop((vp), 0) +#define vdropl(vp) _vdrop((vp), 1) +void _vdrop(struct vnode *, bool); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); -void vhold(struct vnode *); -void vholdl(struct vnode *); +#define vhold(vp) _vhold((vp), 0) +#define vholdl(vp) _vhold((vp), 1) +void _vhold(struct vnode *, bool); void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); -int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, - int blksize); +int vtruncbuf(struct vnode *vp, off_t length, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) 
__printflike(2,3); -#define vprint(label, vp) vn_printf((vp), "%s\n", (label)) int vrecycle(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); @@ -691,7 +704,7 @@ struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, - const struct thread *td); + struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); @@ -730,7 +743,9 @@ void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); +void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); +int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); @@ -771,8 +786,6 @@ void vop_create_post(void *a, int rc); void vop_deleteextattr_post(void *a, int rc); void vop_link_post(void *a, int rc); -void vop_lock_pre(void *a); -void vop_lock_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); @@ -787,10 +800,21 @@ void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_setextattr_post(void *a, int rc); +void vop_symlink_post(void *a, int rc); + +#ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *a); -void vop_symlink_post(void *a, int rc); +void vop_lock_pre(void *a); +void vop_lock_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); +#else +#define vop_strategy_pre(x) do { } while (0) +#define vop_lock_pre(x) do { } while (0) +#define vop_lock_post(x, y) do { } while (0) +#define vop_unlock_post(x, y) do { } while (0) +#define vop_unlock_pre(x) do { } while (0) +#endif void vop_rename_fail(struct vop_rename_args *ap); @@ -821,6 +845,8 @@ void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); +void vrefl(struct vnode *vp); +void vrefact(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); From laffer1 at midnightbsd.org Sat Feb 8 14:53:26 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:53:26 -0500 (EST) Subject: [Midnightbsd-cvs] src [12326] trunk/sys/sys/vmmeter.h: sync with FreeBSD 11-stable Message-ID: <202002081953.018JrQVv065866@stargazer.midnightbsd.org> Revision: 12326 http://svnweb.midnightbsd.org/src/?rev=12326 Author: laffer1 Date: 2020-02-08 14:53:25 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/vmmeter.h Modified: trunk/sys/sys/vmmeter.h =================================================================== --- trunk/sys/sys/vmmeter.h 2020-02-08 19:52:43 UTC (rev 12325) +++ trunk/sys/sys/vmmeter.h 2020-02-08 19:53:25 UTC (rev 12326) @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
* * @(#)vmmeter.h 8.2 (Berkeley) 7/10/94 - * $FreeBSD: stable/10/sys/sys/vmmeter.h 330047 2018-02-27 01:28:19Z jhb $ + * $FreeBSD: stable/11/sys/sys/vmmeter.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_VMMETER_H_ @@ -76,9 +76,10 @@ u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (f) pages reactivated from free list */ - u_int v_pdwakeups; /* (f) times daemon has awaken from sleep */ + u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */ + u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */ u_int v_pdpages; /* (p) pages analyzed by daemon */ + u_int v_pdshortfalls; /* (p) page reclamation shortfalls */ u_int v_tcached; /* (p) total pages cached */ u_int v_dfree; /* (p) pages freed by daemon */ @@ -97,9 +98,8 @@ u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ + u_int v_laundry_count; /* (q) pages eligible for laundering */ u_int v_cache_count; /* (f) pages on cache queue */ - u_int v_cache_min; /* (c) min pages desired on cache queue */ - u_int v_cache_max; /* (c) max pages in cached obj (unused) */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ u_int v_free_severe; /* (c) severe page depletion point */ @@ -117,9 +117,9 @@ }; #ifdef _KERNEL -extern struct vmmeter cnt; +extern struct vmmeter vm_cnt; -extern int vm_pageout_wakeup_thresh; +extern u_int vm_pageout_wakeup_thresh; /* * Return TRUE if we are under our severe low-free-pages threshold @@ -127,12 +127,11 @@ * This routine is typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ - -static __inline -int +static inline int vm_page_count_severe(void) { - return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count)); + + return (vm_cnt.v_free_severe > vm_cnt.v_free_count); } /* @@ -142,14 +141,13 @@ * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more - * desparate. + * desperate. */ - -static __inline -int +static inline int vm_page_count_min(void) { - return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count)); + + return (vm_cnt.v_free_min > vm_cnt.v_free_count); } /* @@ -156,12 +154,11 @@ * Return TRUE if we have not reached our free page target during * free page recovery operations. */ - -static __inline -int +static inline int vm_page_count_target(void) { - return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count)); + + return (vm_cnt.v_free_target > vm_cnt.v_free_count); } /* @@ -168,26 +165,42 @@ * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ - -static __inline -int +static inline int vm_paging_target(void) { - return (cnt.v_free_target - (cnt.v_free_count + cnt.v_cache_count)); + + return (vm_cnt.v_free_target - vm_cnt.v_free_count); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ +static inline int +vm_paging_needed(u_int free_count) +{ -static __inline -int -vm_paging_needed(void) + return (free_count < vm_pageout_wakeup_thresh); +} + +/* + * Return the number of pages we need to launder. 
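For orientation, a hedged sketch of how a caller might consult the predicates defined above; the function is hypothetical and only illustrates the documented intent of vm_page_count_severe():

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/vmmeter.h>

    /* Decide whether an optional, droppable allocation should be skipped. */
    static int
    example_can_cache(void)
    {
            if (vm_page_count_severe())     /* below the severe free-page floor */
                    return (0);
            return (1);
    }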
+ * A positive number indicates that we have a shortfall of clean pages. + */ +static inline int +vm_laundry_target(void) { - return (cnt.v_free_count + cnt.v_cache_count < - (u_int)vm_pageout_wakeup_thresh); + + return (vm_paging_target()); } +/* + * Obtain the value of a per-CPU counter. + */ +#define VM_METER_PCPU_CNT(member) \ + vm_meter_cnt(__offsetof(struct vmmeter, member)) + +u_int vm_meter_cnt(size_t); + #endif struct vmtotal { From laffer1 at midnightbsd.org Sat Feb 8 14:54:27 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:54:27 -0500 (EST) Subject: [Midnightbsd-cvs] src [12327] trunk/sys/sys/vdso.h: sync with FreeBSD 11-stable Message-ID: <202002081954.018JsR7s065965@stargazer.midnightbsd.org> Revision: 12327 http://svnweb.midnightbsd.org/src/?rev=12327 Author: laffer1 Date: 2020-02-08 14:54:27 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/vdso.h Modified: trunk/sys/sys/vdso.h =================================================================== --- trunk/sys/sys/vdso.h 2020-02-08 19:53:25 UTC (rev 12326) +++ trunk/sys/sys/vdso.h 2020-02-08 19:54:27 UTC (rev 12327) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /*- * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>. * All rights reserved. @@ -22,7 +23,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD$ + * $FreeBSD: stable/11/sys/sys/vdso.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_VDSO_H @@ -53,6 +54,9 @@ #define VDSO_TK_VER_1 0x1 #define VDSO_TK_VER_CURR VDSO_TK_VER_1 #define VDSO_TH_ALGO_1 0x1 +#define VDSO_TH_ALGO_2 0x2 +#define VDSO_TH_ALGO_3 0x3 +#define VDSO_TH_ALGO_4 0x4 #ifndef _KERNEL @@ -62,7 +66,7 @@ int __vdso_clock_gettime(clockid_t clock_id, struct timespec *ts); int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); -u_int __vdso_gettc(const struct vdso_timehands *vdso_th); +int __vdso_gettc(const struct vdso_timehands *vdso_th, u_int *tc); int __vdso_gettimekeep(struct vdso_timekeep **tk); #endif @@ -69,6 +73,14 @@ #ifdef _KERNEL +struct timecounter; + +struct vdso_sv_tk { + int sv_timekeep_off; + int sv_timekeep_curr; + uint32_t sv_timekeep_gen; +}; + void timekeep_push_vdso(void); uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th); @@ -81,8 +93,11 @@ * global sysctl enable override is handled by machine-independed code * after cpu_fill_vdso_timehands() call is made. 
*/ -uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th); +uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th, + struct timecounter *tc); +struct vdso_sv_tk *alloc_sv_tk(void); + #define VDSO_TH_NUM 4 #ifdef COMPAT_FREEBSD32 @@ -110,7 +125,9 @@ }; uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32); -uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32); +uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32, + struct timecounter *tc); +struct vdso_sv_tk *alloc_sv_tk_compat32(void); #endif #endif From laffer1 at midnightbsd.org Sat Feb 8 14:55:14 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:55:14 -0500 (EST) Subject: [Midnightbsd-cvs] src [12328] trunk/sys/sys/user.h: sync with FreeBSD 11-stable Message-ID: <202002081955.018JtETg066674@stargazer.midnightbsd.org> Revision: 12328 http://svnweb.midnightbsd.org/src/?rev=12328 Author: laffer1 Date: 2020-02-08 14:55:14 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/user.h Modified: trunk/sys/sys/user.h =================================================================== --- trunk/sys/sys/user.h 2020-02-08 19:54:27 UTC (rev 12327) +++ trunk/sys/sys/user.h 2020-02-08 19:55:14 UTC (rev 12328) @@ -30,7 +30,7 @@ * SUCH DAMAGE. * * @(#)user.h 8.2 (Berkeley) 9/23/93 - * $FreeBSD: stable/10/sys/sys/user.h 310121 2016-12-15 16:52:17Z vangyzen $ + * $FreeBSD: stable/11/sys/sys/user.h 341778 2018-12-10 01:38:48Z kib $ */ #ifndef _SYS_USER_H_ @@ -85,7 +85,7 @@ * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and * function kvm_proclist in lib/libkvm/kvm_proc.c . */ -#define KI_NSPARE_INT 7 +#define KI_NSPARE_INT 4 #define KI_NSPARE_LONG 12 #define KI_NSPARE_PTR 6 @@ -172,8 +172,8 @@ signed char ki_nice; /* Process "nice" value */ char ki_lock; /* Process lock (prevent swap) count */ char ki_rqindex; /* Run queue index */ - u_char ki_oncpu; /* Which cpu we are on */ - u_char ki_lastcpu; /* Last cpu we were on */ + u_char ki_oncpu_old; /* Which cpu we are on (legacy) */ + u_char ki_lastcpu_old; /* Last cpu we were on (legacy) */ char ki_tdname[TDNAMLEN+1]; /* thread name */ char ki_wmesg[WMESGLEN+1]; /* wchan message */ char ki_login[LOGNAMELEN+1]; /* setlogin name */ @@ -189,6 +189,9 @@ */ char ki_sparestrings[46]; /* spare string space */ int ki_spareints[KI_NSPARE_INT]; /* spare room for growth */ + int ki_oncpu; /* Which cpu we are on */ + int ki_lastcpu; /* Last cpu we were on */ + int ki_tracer; /* Pid of tracing process */ int ki_flag2; /* P2_* flags */ int ki_fibnum; /* Default FIB number */ u_int ki_cr_flags; /* Credential flags */ @@ -257,6 +260,7 @@ #define KF_TYPE_SEM 9 #define KF_TYPE_PTS 10 #define KF_TYPE_PROCDESC 11 +#define KF_TYPE_DEV 12 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -273,7 +277,7 @@ #define KF_FD_TYPE_CWD -1 /* Current working directory */ #define KF_FD_TYPE_ROOT -2 /* Root directory */ #define KF_FD_TYPE_JAIL -3 /* Jail directory */ -#define KF_FD_TYPE_TRACE -4 /* ptrace vnode */ +#define KF_FD_TYPE_TRACE -4 /* Ktrace vnode */ #define KF_FD_TYPE_TEXT -5 /* Text vnode */ #define KF_FD_TYPE_CTTY -6 /* Controlling terminal */ @@ -556,6 +560,7 @@ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags); +int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen); int kern_proc_out(struct proc *p, struct sbuf *sb, int flags); int kern_proc_vmmap_out(struct proc *p, 
struct sbuf *sb, ssize_t maxlen, int flags); From laffer1 at midnightbsd.org Sat Feb 8 14:56:27 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:56:27 -0500 (EST) Subject: [Midnightbsd-cvs] src [12329] trunk/sys/sys/umtx.h: sync with FreeBSD 11-stable Message-ID: <202002081956.018JuR8i066770@stargazer.midnightbsd.org> Revision: 12329 http://svnweb.midnightbsd.org/src/?rev=12329 Author: laffer1 Date: 2020-02-08 14:56:26 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/umtx.h Modified: trunk/sys/sys/umtx.h =================================================================== --- trunk/sys/sys/umtx.h 2020-02-08 19:55:14 UTC (rev 12328) +++ trunk/sys/sys/umtx.h 2020-02-08 19:56:26 UTC (rev 12329) @@ -24,7 +24,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/umtx.h 233912 2012-04-05 02:24:08Z davidxu $ + * $FreeBSD: stable/11/sys/sys/umtx.h 331722 2018-03-29 02:50:57Z eadler $ * */ @@ -32,20 +32,28 @@ #define _SYS_UMTX_H_ #include <sys/_umtx.h> -#include <sys/limits.h> -#define UMTX_UNOWNED 0x0 -#define UMTX_CONTESTED LONG_MIN - +/* Common lock flags */ #define USYNC_PROCESS_SHARED 0x0001 /* Process shared sync objs */ +/* umutex flags */ +#define UMUTEX_PRIO_INHERIT 0x0004 /* Priority inherited mutex */ +#define UMUTEX_PRIO_PROTECT 0x0008 /* Priority protect mutex */ +#define UMUTEX_ROBUST 0x0010 /* Robust mutex */ +#define UMUTEX_NONCONSISTENT 0x0020 /* Robust locked but not consistent */ + +/* + * The umutex.m_lock values and bits. The m_owner is the word which + * serves as the lock. Its high bit is the contention indicator and + * rest of bits records the owner TID. TIDs values start with PID_MAX + * + 2 and end by INT32_MAX. The low range [1..PID_MAX] is guaranteed + * to be useable as the special markers. 
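To make the owner-word encoding described above concrete, a hedged userland sketch of the uncontested fast path, modelled loosely on what libthr does; the helper name is invented, and real code must also deal with errno, the robust variants, and the priority protocols:

    #include <sys/types.h>
    #include <sys/umtx.h>
    #include <machine/atomic.h>

    /* Try to take an unowned, plain umutex; fall back to the kernel. */
    static int
    umutex_trylock_sketch(struct umutex *m, uint32_t tid)
    {
            /* Fast path: UMUTEX_UNOWNED -> our thread id, uncontested. */
            if (atomic_cmpset_acq_32((volatile uint32_t *)&m->m_owner,
                UMUTEX_UNOWNED, tid))
                    return (0);
            /* Slow path: let the kernel arbitrate. */
            return (_umtx_op(m, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL));
    }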
+ */ #define UMUTEX_UNOWNED 0x0 #define UMUTEX_CONTESTED 0x80000000U +#define UMUTEX_RB_OWNERDEAD (UMUTEX_CONTESTED | 0x10) +#define UMUTEX_RB_NOTRECOV (UMUTEX_CONTESTED | 0x11) -#define UMUTEX_ERROR_CHECK 0x0002 /* Error-checking mutex */ -#define UMUTEX_PRIO_INHERIT 0x0004 /* Priority inherited mutex */ -#define UMUTEX_PRIO_PROTECT 0x0008 /* Priority protect mutex */ - /* urwlock flags */ #define URWLOCK_PREFER_READER 0x0002 @@ -58,9 +66,14 @@ /* _usem flags */ #define SEM_NAMED 0x0002 +/* _usem2 count field */ +#define USEM_HAS_WAITERS 0x80000000U +#define USEM_MAX_COUNT 0x7fffffffU +#define USEM_COUNT(c) ((c) & USEM_MAX_COUNT) + /* op code for _umtx_op */ -#define UMTX_OP_LOCK 0 -#define UMTX_OP_UNLOCK 1 +#define UMTX_OP_RESERVED0 0 +#define UMTX_OP_RESERVED1 1 #define UMTX_OP_WAIT 2 #define UMTX_OP_WAKE 3 #define UMTX_OP_MUTEX_TRYLOCK 4 @@ -78,11 +91,14 @@ #define UMTX_OP_WAKE_PRIVATE 16 #define UMTX_OP_MUTEX_WAIT 17 #define UMTX_OP_MUTEX_WAKE 18 /* deprecated */ -#define UMTX_OP_SEM_WAIT 19 -#define UMTX_OP_SEM_WAKE 20 +#define UMTX_OP_SEM_WAIT 19 /* deprecated */ +#define UMTX_OP_SEM_WAKE 20 /* deprecated */ #define UMTX_OP_NWAKE_PRIVATE 21 #define UMTX_OP_MUTEX_WAKE2 22 -#define UMTX_OP_MAX 23 +#define UMTX_OP_SEM2_WAIT 23 +#define UMTX_OP_SEM2_WAKE 24 +#define UMTX_OP_SHM 25 +#define UMTX_OP_ROBUST_LISTS 26 /* Flags for UMTX_OP_CV_WAIT */ #define CVWAIT_CHECK_UNPARKING 0x01 @@ -93,86 +109,26 @@ #define UMTX_CHECK_UNPARKING CVWAIT_CHECK_UNPARKING -#ifndef _KERNEL +/* Flags for UMTX_OP_SHM */ +#define UMTX_SHM_CREAT 0x0001 +#define UMTX_SHM_LOOKUP 0x0002 +#define UMTX_SHM_DESTROY 0x0004 +#define UMTX_SHM_ALIVE 0x0008 -int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2); +struct umtx_robust_lists_params { + uintptr_t robust_list_offset; + uintptr_t robust_priv_list_offset; + uintptr_t robust_inact_offset; +}; -/* - * Old (deprecated) userland mutex system calls. - */ -int _umtx_lock(struct umtx *mtx); -int _umtx_unlock(struct umtx *mtx); +#ifndef _KERNEL -/* - * Standard api. Try uncontested acquire/release and asks the - * kernel to resolve failures. - */ -static __inline void -umtx_init(struct umtx *umtx) -{ - umtx->u_owner = UMTX_UNOWNED; -} +__BEGIN_DECLS -static __inline u_long -umtx_owner(struct umtx *umtx) -{ - return (umtx->u_owner & ~LONG_MIN); -} +int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2); -static __inline int -umtx_lock(struct umtx *umtx, u_long id) -{ - if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0) - if (_umtx_lock(umtx) == -1) - return (errno); - return (0); -} +__END_DECLS -static __inline int -umtx_trylock(struct umtx *umtx, u_long id) -{ - if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0) - return (EBUSY); - return (0); -} - -static __inline int -umtx_timedlock(struct umtx *umtx, u_long id, const struct timespec *timeout) -{ - if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0) - if (_umtx_op(umtx, UMTX_OP_LOCK, id, 0, - __DECONST(void *, timeout)) == -1) - return (errno); - return (0); -} - -static __inline int -umtx_unlock(struct umtx *umtx, u_long id) -{ - if (atomic_cmpset_rel_long(&umtx->u_owner, id, UMTX_UNOWNED) == 0) - if (_umtx_unlock(umtx) == -1) - return (errno); - return (0); -} - -static __inline int -umtx_wait(u_long *p, long val, const struct timespec *timeout) -{ - if (_umtx_op(p, UMTX_OP_WAIT, val, 0, - __DECONST(void *, timeout)) == -1) - return (errno); - return (0); -} - -/* Wake threads waiting on a user address. 
*/ -static __inline int -umtx_wake(u_long *p, int nr_wakeup) -{ - if (_umtx_op(p, UMTX_OP_WAKE, nr_wakeup, 0, 0) == -1) - return (errno); - return (0); -} - #else /* @@ -189,7 +145,10 @@ TYPE_PI_UMUTEX, TYPE_PP_UMUTEX, TYPE_RWLOCK, - TYPE_FUTEX + TYPE_FUTEX, + TYPE_SHM, + TYPE_PI_ROBUST_UMUTEX, + TYPE_PP_ROBUST_UMUTEX, }; /* Key to represent a unique userland synchronous object */ @@ -228,7 +187,7 @@ } int umtx_copyin_timeout(const void *, struct timespec *); -int umtx_key_get(void *, int, int, struct umtx_key *); +int umtx_key_get(const void *, int, int, struct umtx_key *); void umtx_key_release(struct umtx_key *); struct umtx_q *umtxq_alloc(void); void umtxq_free(struct umtx_q *); From laffer1 at midnightbsd.org Sat Feb 8 14:57:07 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 14:57:07 -0500 (EST) Subject: [Midnightbsd-cvs] src [12330] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002081957.018Jv70S066845@stargazer.midnightbsd.org> Revision: 12330 http://svnweb.midnightbsd.org/src/?rev=12330 Author: laffer1 Date: 2020-02-08 14:57:06 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/unistd.h trunk/sys/sys/unpcb.h Modified: trunk/sys/sys/unistd.h =================================================================== --- trunk/sys/sys/unistd.h 2020-02-08 19:56:26 UTC (rev 12329) +++ trunk/sys/sys/unistd.h 2020-02-08 19:57:06 UTC (rev 12330) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)unistd.h 8.2 (Berkeley) 1/7/94 - * $FreeBSD: stable/10/sys/sys/unistd.h 312171 2017-01-14 11:27:11Z kib $ + * $FreeBSD: stable/11/sys/sys/unistd.h 353789 2019-10-21 01:24:21Z kevans $ */ #ifndef _SYS_UNISTD_H_ @@ -51,7 +51,7 @@ * returns -1, the functions may be stubbed out. */ #define _POSIX_ADVISORY_INFO 200112L -#define _POSIX_ASYNCHRONOUS_IO 0 +#define _POSIX_ASYNCHRONOUS_IO 200112L #define _POSIX_CHOWN_RESTRICTED 1 #define _POSIX_CLOCK_SELECTION (-1) #define _POSIX_CPUTIME 200112L @@ -187,11 +187,14 @@ #define RFTSIGNUM(flags) (((flags) >> RFTSIGSHIFT) & RFTSIGMASK) #define RFTSIGFLAGS(signum) ((signum) << RFTSIGSHIFT) #define RFPROCDESC (1<<28) /* return a process descriptor */ -#define RFPPWAIT (1<<31) /* parent sleeps until child exits (vfork) */ +/* kernel: parent sleeps until child exits (vfork) */ +#define RFPPWAIT (1<<31) +/* user: vfork(2) semantics, clear signals */ +#define RFSPAWN (1U<<31) #define RFFLAGS (RFFDG | RFPROC | RFMEM | RFNOWAIT | RFCFDG | \ RFTHREAD | RFSIGSHARE | RFLINUXTHPN | RFSTOPPED | RFHIGHPID | RFTSIGZMB | \ - RFPROCDESC | RFPPWAIT) -#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT | RFPROCDESC) + RFPROCDESC | RFSPAWN | RFPPWAIT) +#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPROCDESC) #endif /* __BSD_VISIBLE */ Modified: trunk/sys/sys/unpcb.h =================================================================== --- trunk/sys/sys/unpcb.h 2020-02-08 19:56:26 UTC (rev 12329) +++ trunk/sys/sys/unpcb.h 2020-02-08 19:57:06 UTC (rev 12330) @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
* * @(#)unpcb.h 8.1 (Berkeley) 6/2/93 - * $FreeBSD: stable/10/sys/sys/unpcb.h 305261 2016-09-02 00:14:28Z markj $ + * $FreeBSD: stable/11/sys/sys/unpcb.h 339067 2018-10-01 17:36:58Z asomers $ */ #ifndef _SYS_UNPCB_H_ @@ -151,4 +151,13 @@ }; #endif /* _SYS_SOCKETVAR_H_ */ +#if defined(_KERNEL) +struct thread; + +/* In uipc_userreq.c */ +void +unp_copy_peercred(struct thread *td, struct unpcb *client_unp, + struct unpcb *server_unp, struct unpcb *listen_unp); +#endif + #endif /* _SYS_UNPCB_H_ */ From laffer1 at midnightbsd.org Sat Feb 8 15:00:10 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:00:10 -0500 (EST) Subject: [Midnightbsd-cvs] src [12331] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002082000.018K0AWi067616@stargazer.midnightbsd.org> Revision: 12331 http://svnweb.midnightbsd.org/src/?rev=12331 Author: laffer1 Date: 2020-02-08 15:00:09 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/tty.h trunk/sys/sys/ttycom.h trunk/sys/sys/ttydefaults.h trunk/sys/sys/ttydisc.h trunk/sys/sys/ttyqueue.h trunk/sys/sys/turnstile.h trunk/sys/sys/types.h trunk/sys/sys/ucontext.h trunk/sys/sys/ucred.h Modified: trunk/sys/sys/tty.h =================================================================== --- trunk/sys/sys/tty.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/tty.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/tty.h 271773 2014-09-18 14:44:47Z grehan $ + * $FreeBSD: stable/11/sys/sys/tty.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TTY_H_ @@ -63,6 +63,7 @@ struct mtx *t_mtx; /* TTY lock. */ struct mtx t_mtxobj; /* Per-TTY lock (when not borrowing). */ TAILQ_ENTRY(tty) t_list; /* (l) TTY list entry. */ + int t_drainwait; /* (t) TIOCDRAIN timeout seconds. */ unsigned int t_flags; /* (t) Terminal option flags. */ /* Keep flags in sync with db_show_tty and pstat(8). */ #define TF_NOPREFIX 0x00001 /* Don't prepend "tty" to device name. */ @@ -172,11 +173,11 @@ #define tty_getlock(tp) ((tp)->t_mtx) /* Device node creation. */ -void tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) - __printflike(3, 4); int tty_makedevf(struct tty *tp, struct ucred *cred, int flags, const char *fmt, ...) __printflike(4, 5); #define TTYMK_CLONING 0x1 +#define tty_makedev(tp, cred, fmt, ...) \ + (void )tty_makedevf((tp), (cred), 0, (fmt), ## __VA_ARGS__) #define tty_makealias(tp,fmt,...) \ make_dev_alias((tp)->t_dev, fmt, ## __VA_ARGS__) Modified: trunk/sys/sys/ttycom.h =================================================================== --- trunk/sys/sys/ttycom.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ttycom.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)ttycom.h 8.1 (Berkeley) 3/28/94 - * $FreeBSD: stable/10/sys/sys/ttycom.h 231095 2012-02-06 18:15:46Z ed $ + * $FreeBSD: stable/11/sys/sys/ttycom.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TTYCOM_H_ Modified: trunk/sys/sys/ttydefaults.h =================================================================== --- trunk/sys/sys/ttydefaults.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ttydefaults.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -33,7 +33,7 @@ * SUCH DAMAGE. 
* * @(#)ttydefaults.h 8.4 (Berkeley) 1/21/94 - * $FreeBSD: stable/10/sys/sys/ttydefaults.h 249311 2013-04-09 16:16:34Z ed $ + * $FreeBSD: stable/11/sys/sys/ttydefaults.h 331722 2018-03-29 02:50:57Z eadler $ */ /* Modified: trunk/sys/sys/ttydisc.h =================================================================== --- trunk/sys/sys/ttydisc.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ttydisc.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /*- * Copyright (c) 2008 Ed Schouten <ed at FreeBSD.org> * All rights reserved. @@ -26,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $MidnightBSD$ + * $FreeBSD: stable/11/sys/sys/ttydisc.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TTYDISC_H_ Modified: trunk/sys/sys/ttyqueue.h =================================================================== --- trunk/sys/sys/ttyqueue.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ttyqueue.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /*- * Copyright (c) 2008 Ed Schouten <ed at FreeBSD.org> * All rights reserved. @@ -26,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $MidnightBSD$ + * $FreeBSD: stable/11/sys/sys/ttyqueue.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TTYQUEUE_H_ @@ -69,7 +70,7 @@ #ifdef _KERNEL /* Input queue handling routines. */ -void ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t len); +int ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t len); void ttyinq_free(struct ttyinq *ti); int ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio, size_t readlen, size_t flushlen); @@ -136,7 +137,7 @@ /* Output queue handling routines. */ void ttyoutq_flush(struct ttyoutq *to); -void ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t len); +int ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t len); void ttyoutq_free(struct ttyoutq *to); size_t ttyoutq_read(struct ttyoutq *to, void *buf, size_t len); int ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio); Modified: trunk/sys/sys/turnstile.h =================================================================== --- trunk/sys/sys/turnstile.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/turnstile.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/turnstile.h 262192 2014-02-18 20:27:17Z jhb $ + * $FreeBSD: stable/11/sys/sys/turnstile.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TURNSTILE_H_ @@ -34,7 +34,7 @@ * Turnstile interface. Non-sleepable locks use a turnstile for the * queue of threads blocked on them when they are contested. Each * turnstile contains two sub-queues: one for threads waiting for a - * shared, or eread, lock, and one for threads waiting for an + * shared, or read, lock, and one for threads waiting for an * exclusive, or write, lock. * * A thread calls turnstile_chain_lock() to lock the turnstile chain Modified: trunk/sys/sys/types.h =================================================================== --- trunk/sys/sys/types.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/types.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -33,7 +33,7 @@ * SUCH DAMAGE. 
* * @(#)types.h 8.6 (Berkeley) 2/19/95 - * $FreeBSD: stable/10/sys/sys/types.h 289107 2015-10-10 05:50:42Z kib $ + * $FreeBSD: stable/11/sys/sys/types.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TYPES_H_ @@ -175,6 +175,11 @@ #define _OFF_T_DECLARED #endif +#ifndef _OFF64_T_DECLARED +typedef __off64_t off64_t; /* file offset (alias) */ +#define _OFF64_T_DECLARED +#endif + #ifndef _PID_T_DECLARED typedef __pid_t pid_t; /* process id */ #define _PID_T_DECLARED @@ -233,6 +238,11 @@ #define _USECONDS_T_DECLARED #endif +#ifndef _CAP_IOCTL_T_DECLARED +#define _CAP_IOCTL_T_DECLARED +typedef unsigned long cap_ioctl_t; +#endif + #ifndef _CAP_RIGHTS_T_DECLARED #define _CAP_RIGHTS_T_DECLARED struct cap_rights; @@ -241,11 +251,13 @@ #endif typedef __vm_offset_t vm_offset_t; -typedef __vm_ooffset_t vm_ooffset_t; +typedef __int64_t vm_ooffset_t; typedef __vm_paddr_t vm_paddr_t; -typedef __vm_pindex_t vm_pindex_t; +typedef __uint64_t vm_pindex_t; typedef __vm_size_t vm_size_t; +typedef __rman_res_t rman_res_t; + #ifdef _KERNEL typedef int boolean_t; typedef struct device *device_t; Modified: trunk/sys/sys/ucontext.h =================================================================== --- trunk/sys/sys/ucontext.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ucontext.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -26,7 +26,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/ucontext.h 278347 2015-02-07 08:47:15Z kib $ + * $FreeBSD: stable/11/sys/sys/ucontext.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_UCONTEXT_H_ @@ -34,25 +34,9 @@ #include <sys/signal.h> #include <machine/ucontext.h> +#include <sys/_ucontext.h> -typedef struct __ucontext { - /* - * Keep the order of the first two fields. Also, - * keep them the first two fields in the structure. - * This way we can have a union with struct - * sigcontext and ucontext_t. This allows us to - * support them both at the same time. - * note: the union is not defined, though. - */ - sigset_t uc_sigmask; - mcontext_t uc_mcontext; - - struct __ucontext *uc_link; - stack_t uc_stack; - int uc_flags; #define UCF_SWAPPED 0x00000001 /* Used by swapcontext(3). */ - int __spare__[4]; -} ucontext_t; #if defined(_KERNEL) && defined(COMPAT_FREEBSD4) #if defined(__i386__) Modified: trunk/sys/sys/ucred.h =================================================================== --- trunk/sys/sys/ucred.h 2020-02-08 19:57:06 UTC (rev 12330) +++ trunk/sys/sys/ucred.h 2020-02-08 20:00:09 UTC (rev 12331) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)ucred.h 8.4 (Berkeley) 1/9/95 - * $FreeBSD: stable/10/sys/sys/ucred.h 303846 2016-08-08 18:31:28Z bdrewery $ + * $FreeBSD: stable/11/sys/sys/ucred.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_UCRED_H_ @@ -38,6 +38,8 @@ struct loginclass; +#define XU_NGROUPS 16 + /* * Credentials. * @@ -65,13 +67,12 @@ struct auditinfo_addr cr_audit; /* Audit properties. */ gid_t *cr_groups; /* groups */ int cr_agroups; /* Available groups */ + gid_t cr_smallgroups[XU_NGROUPS]; /* storage for small groups */ }; #define NOCRED ((struct ucred *)0) /* no credential available */ #define FSCRED ((struct ucred *)-1) /* filesystem credential */ #endif /* _KERNEL || _WANT_UCRED */ -#define XU_NGROUPS 16 - /* * Flags for cr_flags. 
*/ @@ -106,13 +107,11 @@ struct ucred *crcopysafe(struct proc *p, struct ucred *cr); struct ucred *crdup(struct ucred *cr); void crextend(struct ucred *cr, int n); -void cred_update_thread(struct thread *td); void proc_set_cred_init(struct proc *p, struct ucred *cr); struct ucred *proc_set_cred(struct proc *p, struct ucred *cr); void crfree(struct ucred *cr); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); -int crshared(struct ucred *cr); void cru2x(struct ucred *cr, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups); int groupmember(gid_t gid, struct ucred *cred); From laffer1 at midnightbsd.org Sat Feb 8 15:01:04 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:01:04 -0500 (EST) Subject: [Midnightbsd-cvs] src [12332] trunk/sys/sys/sx.h: sync with FreeBSD 11-stable Message-ID: <202002082001.018K146H067722@stargazer.midnightbsd.org> Revision: 12332 http://svnweb.midnightbsd.org/src/?rev=12332 Author: laffer1 Date: 2020-02-08 15:01:03 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sx.h Modified: trunk/sys/sys/sx.h =================================================================== --- trunk/sys/sys/sx.h 2020-02-08 20:00:09 UTC (rev 12331) +++ trunk/sys/sys/sx.h 2020-02-08 20:01:03 UTC (rev 12332) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sx.h 323870 2017-09-21 19:24:11Z marius $ + * $FreeBSD: stable/11/sys/sys/sx.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_SX_H_ @@ -87,6 +87,13 @@ #ifdef _KERNEL +#define sx_recurse lock_object.lo_data + +#define SX_READ_VALUE(sx) ((sx)->sx_lock) + +#define lv_sx_owner(v) \ + ((v & SX_LOCK_SHARED) ? NULL : (struct thread *)SX_OWNER(v)) + /* * Function prototipes. Routines that start with an underscore are not part * of the public interface and are wrappered with a macro. 
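Before the prototypes in the next hunk, a hedged sketch of typical consumer code that stays on the long-standing public sx(9) names; the lock and function names are invented:

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/sx.h>

    static struct sx example_lock;

    static void
    example_init(void)
    {
            sx_init(&example_lock, "example");      /* sx_init_flags(..., 0) */
    }

    static void
    example_update(void)
    {
            sx_xlock(&example_lock);        /* exclusive; may sleep */
            /* ... modify shared state ... */
            sx_xunlock(&example_lock);
    }

    static void
    example_read(void)
    {
            sx_slock(&example_lock);        /* shared */
            /* ... read shared state ... */
            sx_sunlock(&example_lock);
    }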
@@ -95,20 +102,22 @@ #define sx_init(sx, desc) sx_init_flags((sx), (desc), 0) void sx_init_flags(struct sx *sx, const char *description, int opts); void sx_destroy(struct sx *sx); +int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_slock_(struct sx *sx, const char *file, int line); +int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_xlock_(struct sx *sx, const char *file, int line); +int sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_upgrade_(struct sx *sx, const char *file, int line); +void sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); void sx_downgrade_(struct sx *sx, const char *file, int line); +int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF); int _sx_slock(struct sx *sx, int opts, const char *file, int line); int _sx_xlock(struct sx *sx, int opts, const char *file, int line); +void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); void _sx_sunlock(struct sx *sx, const char *file, int line); void _sx_xunlock(struct sx *sx, const char *file, int line); -int _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, - const char *file, int line); -int _sx_slock_hard(struct sx *sx, int opts, const char *file, int line); -void _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int - line); -void _sx_sunlock_hard(struct sx *sx, const char *file, int line); +int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF); +void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _sx_assert(const struct sx *sx, int what, const char *file, int line); #endif @@ -141,6 +150,7 @@ * deferred to 'tougher' functions. */ +#if (LOCK_DEBUG == 0) /* Acquire an exclusive lock. */ static __inline int __sx_xlock(struct sx *sx, struct thread *td, int opts, const char *file, @@ -147,14 +157,12 @@ int line) { uintptr_t tid = (uintptr_t)td; + uintptr_t v = SX_LOCK_UNLOCKED; int error = 0; - if (sx->sx_lock != SX_LOCK_UNLOCKED || - !atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) - error = _sx_xlock_hard(sx, tid, opts, file, line); - else - LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, - sx, 0, 0, file, line); + if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) || + !atomic_fcmpset_acq_ptr(&sx->sx_lock, &v, tid))) + error = _sx_xlock_hard(sx, v, opts); return (error); } @@ -163,48 +171,15 @@ static __inline void __sx_xunlock(struct sx *sx, struct thread *td, const char *file, int line) { - uintptr_t tid = (uintptr_t)td; + uintptr_t x = (uintptr_t)td; - if (sx->sx_lock != tid || - !atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) - _sx_xunlock_hard(sx, tid, file, line); + if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) || + !atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, SX_LOCK_UNLOCKED))) + _sx_xunlock_hard(sx, x); } +#endif -/* Acquire a shared lock. */ -static __inline int -__sx_slock(struct sx *sx, int opts, const char *file, int line) -{ - uintptr_t x = sx->sx_lock; - int error = 0; - - if (!(x & SX_LOCK_SHARED) || - !atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) - error = _sx_slock_hard(sx, opts, file, line); - else - LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx, 0, - 0, file, line); - - return (error); -} - /* - * Release a shared lock. We can just drop a single shared lock so - * long as we aren't trying to drop the last shared lock when other - * threads are waiting for an exclusive lock. 
This takes advantage of - * the fact that an unlocked lock is encoded as a shared lock with a - * count of 0. - */ -static __inline void -__sx_sunlock(struct sx *sx, const char *file, int line) -{ - uintptr_t x = sx->sx_lock; - - if (x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS) || - !atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER)) - _sx_sunlock_hard(sx, file, line); -} - -/* * Public interface for lock operations. */ #ifndef LOCK_DEBUG @@ -217,12 +192,6 @@ _sx_xlock((sx), SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ _sx_xunlock((sx), (file), (line)) -#define sx_slock_(sx, file, line) \ - (void)_sx_slock((sx), 0, (file), (line)) -#define sx_slock_sig_(sx, file, line) \ - _sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line)) -#define sx_sunlock_(sx, file, line) \ - _sx_sunlock((sx), (file), (line)) #else #define sx_xlock_(sx, file, line) \ (void)__sx_xlock((sx), curthread, 0, (file), (line)) @@ -230,17 +199,30 @@ __sx_xlock((sx), curthread, SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ __sx_xunlock((sx), curthread, (file), (line)) +#endif /* LOCK_DEBUG > 0 || SX_NOINLINE */ +#if (LOCK_DEBUG > 0) #define sx_slock_(sx, file, line) \ - (void)__sx_slock((sx), 0, (file), (line)) + (void)_sx_slock((sx), 0, (file), (line)) #define sx_slock_sig_(sx, file, line) \ - __sx_slock((sx), SX_INTERRUPTIBLE, (file), (line)) + _sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line)) #define sx_sunlock_(sx, file, line) \ - __sx_sunlock((sx), (file), (line)) -#endif /* LOCK_DEBUG > 0 || SX_NOINLINE */ + _sx_sunlock((sx), (file), (line)) #define sx_try_slock(sx) sx_try_slock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_xlock(sx) sx_try_xlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_upgrade(sx) sx_try_upgrade_((sx), LOCK_FILE, LOCK_LINE) #define sx_downgrade(sx) sx_downgrade_((sx), LOCK_FILE, LOCK_LINE) +#else +#define sx_slock_(sx, file, line) \ + (void)_sx_slock_int((sx), 0) +#define sx_slock_sig_(sx, file, line) \ + _sx_slock_int((sx), SX_INTERRUPTIBLE) +#define sx_sunlock_(sx, file, line) \ + _sx_sunlock_int((sx)) +#define sx_try_slock(sx) sx_try_slock_int((sx)) +#define sx_try_xlock(sx) sx_try_xlock_int((sx)) +#define sx_try_upgrade(sx) sx_try_upgrade_int((sx)) +#define sx_downgrade(sx) sx_downgrade_int((sx)) +#endif #ifdef INVARIANTS #define sx_assert_(sx, what, file, line) \ _sx_assert((sx), (what), (file), (line)) From laffer1 at midnightbsd.org Sat Feb 8 15:01:56 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:01:56 -0500 (EST) Subject: [Midnightbsd-cvs] src [12333] trunk/sys/sys/time.h: sync with FreeBSD 11-stable Message-ID: <202002082001.018K1u3r067791@stargazer.midnightbsd.org> Revision: 12333 http://svnweb.midnightbsd.org/src/?rev=12333 Author: laffer1 Date: 2020-02-08 15:01:56 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/time.h Modified: trunk/sys/sys/time.h =================================================================== --- trunk/sys/sys/time.h 2020-02-08 20:01:03 UTC (rev 12332) +++ trunk/sys/sys/time.h 2020-02-08 20:01:56 UTC (rev 12333) @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
* * @(#)time.h 8.5 (Berkeley) 5/4/95 - * $FreeBSD: stable/10/sys/sys/time.h 304894 2016-08-27 10:56:04Z kib $ + * $FreeBSD: stable/11/sys/sys/time.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TIME_H_ @@ -129,7 +129,7 @@ #define SBT_1M (SBT_1S * 60) #define SBT_1MS (SBT_1S / 1000) #define SBT_1US (SBT_1S / 1000000) -#define SBT_1NS (SBT_1S / 1000000000) +#define SBT_1NS (SBT_1S / 1000000000) /* beware rounding, see nstosbt() */ #define SBT_MAX 0x7fffffffffffffffLL static __inline int @@ -156,6 +156,53 @@ return (_bt); } +/* + * Decimal<->sbt conversions. Multiplying or dividing by SBT_1NS results in + * large roundoff errors which sbttons() and nstosbt() avoid. Millisecond and + * microsecond functions are also provided for completeness. + */ +static __inline int64_t +sbttons(sbintime_t _sbt) +{ + + return ((1000000000 * _sbt) >> 32); +} + +static __inline sbintime_t +nstosbt(int64_t _ns) +{ + + return ((_ns * (((uint64_t)1 << 63) / 500000000)) >> 32); +} + +static __inline int64_t +sbttous(sbintime_t _sbt) +{ + + return ((1000000 * _sbt) >> 32); +} + +static __inline sbintime_t +ustosbt(int64_t _us) +{ + + return ((_us * (((uint64_t)1 << 63) / 500000)) >> 32); +} + +static __inline int64_t +sbttoms(sbintime_t _sbt) +{ + + return ((1000 * _sbt) >> 32); +} + +static __inline sbintime_t +mstosbt(int64_t _ms) +{ + + return ((_ms * (((uint64_t)1 << 63) / 500)) >> 32); +} + /*- * Background information: * @@ -211,7 +258,7 @@ struct timespec _ts; _ts.tv_sec = _sbt >> 32; - _ts.tv_nsec = ((uint64_t)1000000000 * (uint32_t)_sbt) >> 32; + _ts.tv_nsec = sbttons((uint32_t)_sbt); return (_ts); } @@ -219,8 +266,7 @@ tstosbt(struct timespec _ts) { - return (((sbintime_t)_ts.tv_sec << 32) + - (_ts.tv_nsec * (((uint64_t)1 << 63) / 500000000) >> 32)); + return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec)); } static __inline struct timeval @@ -229,7 +275,7 @@ struct timeval _tv; _tv.tv_sec = _sbt >> 32; - _tv.tv_usec = ((uint64_t)1000000 * (uint32_t)_sbt) >> 32; + _tv.tv_usec = sbttous((uint32_t)_sbt); return (_tv); } @@ -237,8 +283,7 @@ tvtosbt(struct timeval _tv) { - return (((sbintime_t)_tv.tv_sec << 32) + - (_tv.tv_usec * (((uint64_t)1 << 63) / 500000) >> 32)); + return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec)); } #endif /* __BSD_VISIBLE */ @@ -373,8 +418,6 @@ extern volatile time_t time_second; extern volatile time_t time_uptime; -extern struct bintime boottimebin; -extern struct timeval boottime; extern struct bintime tc_tick_bt; extern sbintime_t tc_tick_sbt; extern struct bintime tick_bt; @@ -386,6 +429,8 @@ extern sbintime_t sbt_timethreshold; extern sbintime_t sbt_tickthreshold; +extern volatile int rtc_generation; + /* * Functions for looking at our clock: [get]{bin,nano,micro}[up]time() * @@ -399,7 +444,7 @@ * Functions containing "up" returns time relative to boot and * should be used for calculating time intervals. * - * Functions without "up" returns GMT time. + * Functions without "up" returns UTC time. 
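
For illustration only, not part of the patch above: the sbttons()/nstosbt() family added earlier in this header exists because multiplying by SBT_1NS is too coarse; SBT_1S / 1000000000 truncates from about 4.29 to 4, so a plain SBT_1NS multiply comes out roughly 7% short. A small usage sketch:

        struct timespec ts = { .tv_sec = 0, .tv_nsec = 500000000 };  /* 500 ms */
        sbintime_t sbt;
        int64_t ns;

        sbt = tstosbt(ts);      /* tstosbt() is now built on nstosbt() */
        ns = sbttons(sbt);      /* back to ~500000000 ns */

        /* By contrast, 500000000 * SBT_1NS corresponds to ~0.466 s, not 0.5 s. */
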
* * Functions with the "get" prefix returns a less precise result * much faster than the functions without "get" prefix and should @@ -441,6 +486,9 @@ void getnanotime(struct timespec *tsp); void getmicrotime(struct timeval *tvp); +void getboottime(struct timeval *boottime); +void getboottimebin(struct bintime *boottimebin); + /* Other functions */ int itimerdecr(struct itimerval *itp, int usec); int itimerfix(struct timeval *tv); From laffer1 at midnightbsd.org Sat Feb 8 15:02:26 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:02:26 -0500 (EST) Subject: [Midnightbsd-cvs] src [12334] trunk/sys/sys/systm.h: sync with FreeBSD 11-stable Message-ID: <202002082002.018K2QO8067963@stargazer.midnightbsd.org> Revision: 12334 http://svnweb.midnightbsd.org/src/?rev=12334 Author: laffer1 Date: 2020-02-08 15:02:25 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/systm.h Modified: trunk/sys/sys/systm.h =================================================================== --- trunk/sys/sys/systm.h 2020-02-08 20:01:56 UTC (rev 12333) +++ trunk/sys/sys/systm.h 2020-02-08 20:02:25 UTC (rev 12334) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 - * $FreeBSD: stable/10/sys/sys/systm.h 303433 2016-07-28 11:51:20Z kib $ + * $FreeBSD: stable/11/sys/sys/systm.h 354405 2019-11-06 18:02:18Z mav $ */ #ifndef _SYS_SYSTM_H_ @@ -46,6 +46,8 @@ #include <sys/queue.h> #include <sys/stdint.h> /* for people using printf mainly */ +__NULLABILITY_PRAGMA_PUSH + extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ @@ -75,9 +77,9 @@ * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, - VM_GUEST_VMWARE, VM_LAST }; + VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_LAST }; -#if defined(WITNESS) || defined(INVARIANTS) +#if defined(WITNESS) || defined(INVARIANT_SUPPORT) void kassert_panic(const char *fmt, ...) __printflike(1, 2); #endif @@ -84,12 +86,12 @@ #ifdef INVARIANTS /* The option is always available */ #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ - kassert_panic msg; \ + kassert_panic msg; \ } while (0) #define VNASSERT(exp, vp, msg) do { \ if (__predict_false(!(exp))) { \ vn_printf(vp, "VNASSERT failed\n"); \ - kassert_panic msg; \ + kassert_panic msg; \ } \ } while (0) #else @@ -127,9 +129,20 @@ * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ -#define SCHEDULER_STOPPED() __predict_false(curthread->td_stopsched) +#define SCHEDULER_STOPPED_TD(td) ({ \ + MPASS((td) == curthread); \ + __predict_false((td)->td_stopsched); \ +}) +#define SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread) /* + * Align variables. + */ +#define __read_mostly __section(".data.read_mostly") +#define __read_frequently __section(".data.read_frequently") +#define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ + __section(".data.exclusive_cache_line") +/* * XXX the hints declarations are even more misplaced than most declarations * in this file, since they are needed in one file (per arch) and only used * in two files. @@ -136,11 +149,10 @@ * XXX most of these variables should be const. */ extern int osreldate; -extern int envmode; -extern int hintmode; /* 0 = off. 
1 = config, 2 = fallback */ -extern int dynamic_kenv; +extern bool dynamic_kenv; extern struct mtx kenv_lock; extern char *kern_envp; +extern char *md_envp; extern char static_env[]; extern char static_hints[]; /* by config for now */ @@ -149,11 +161,15 @@ extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; -extern int iosize_max_clamp; -extern int devfs_iosize_max_clamp; -#define IOSIZE_MAX (iosize_max_clamp ? INT_MAX : SSIZE_MAX) -#define DEVFS_IOSIZE_MAX (devfs_iosize_max_clamp ? INT_MAX : SSIZE_MAX) +#ifdef __LP64__ +#define IOSIZE_MAX iosize_max() +#define DEVFS_IOSIZE_MAX devfs_iosize_max() +#else +#define IOSIZE_MAX SSIZE_MAX +#define DEVFS_IOSIZE_MAX SSIZE_MAX +#endif + /* * General function declarations. */ @@ -186,6 +202,8 @@ #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); +void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, + int flags); void g_waitidle(void); void panic(const char *, ...) __dead2 __printflike(1, 2); @@ -208,6 +226,7 @@ __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); +void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); @@ -221,12 +240,12 @@ int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int ttyprintf(struct tty *, const char *, ...) __printflike(2, 3); -int sscanf(const char *, char const *, ...) __nonnull(1) __nonnull(2); -int vsscanf(const char *, char const *, __va_list) __nonnull(1) __nonnull(2); -long strtol(const char *, char **, int) __nonnull(1); -u_long strtoul(const char *, char **, int) __nonnull(1); -quad_t strtoq(const char *, char **, int) __nonnull(1); -u_quad_t strtouq(const char *, char **, int) __nonnull(1); +int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); +int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); +long strtol(const char *, char **, int); +u_long strtoul(const char *, char **, int); +quad_t strtoq(const char *, char **, int); +u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) 
__printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); @@ -237,32 +256,27 @@ #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) -void bcopy(const void *from, void *to, size_t len) __nonnull(1) __nonnull(2); -void bzero(void *buf, size_t len) __nonnull(1); -#define bzero(buf, len) ({ \ - if (__builtin_constant_p(len) && (len) <= 64) \ - __builtin_memset((buf), 0, (len)); \ - else \ - bzero((buf), (len)); \ -}) +void bcopy(const void * _Nonnull from, void * _Nonnull to, size_t len); +void bzero(void * _Nonnull buf, size_t len); +void explicit_bzero(void * _Nonnull, size_t); -void *memcpy(void *to, const void *from, size_t len) __nonnull(1) __nonnull(2); -void *memmove(void *dest, const void *src, size_t n) __nonnull(1) __nonnull(2); +void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); +void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); -int copystr(const void * __restrict kfaddr, void * __restrict kdaddr, - size_t len, size_t * __restrict lencopied) - __nonnull(1) __nonnull(2); -int copyinstr(const void * __restrict udaddr, void * __restrict kaddr, - size_t len, size_t * __restrict lencopied) - __nonnull(1) __nonnull(2); -int copyin(const void * __restrict udaddr, void * __restrict kaddr, - size_t len) __nonnull(1) __nonnull(2); -int copyin_nofault(const void * __restrict udaddr, void * __restrict kaddr, - size_t len) __nonnull(1) __nonnull(2); -int copyout(const void * __restrict kaddr, void * __restrict udaddr, - size_t len) __nonnull(1) __nonnull(2); -int copyout_nofault(const void * __restrict kaddr, void * __restrict udaddr, - size_t len) __nonnull(1) __nonnull(2); +int copystr(const void * _Nonnull __restrict kfaddr, + void * _Nonnull __restrict kdaddr, size_t len, + size_t * __restrict lencopied); +int copyinstr(const void * __restrict udaddr, + void * _Nonnull __restrict kaddr, size_t len, + size_t * __restrict lencopied); +int copyin(const void * __restrict udaddr, + void * _Nonnull __restrict kaddr, size_t len); +int copyin_nofault(const void * __restrict udaddr, + void * _Nonnull __restrict kaddr, size_t len); +int copyout(const void * _Nonnull __restrict kaddr, + void * __restrict udaddr, size_t len); +int copyout_nofault(const void * _Nonnull __restrict kaddr, + void * __restrict udaddr, size_t len); int fubyte(volatile const void *base); long fuword(volatile const void *base); @@ -304,11 +318,12 @@ void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); +void suspendclock(void); +void resumeclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); -extern int cpu_deepest_sleep; extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; @@ -316,7 +331,7 @@ int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp); -char *getenv(const char *name); +char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); @@ -323,11 +338,18 @@ int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); +int getenv_int64(const char *name, int64_t *data); +int 
getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); -int setenv(const char *name, const char *value); -int unsetenv(const char *name); +int kern_setenv(const char *name, const char *value); +int kern_unsetenv(const char *name); int testenv(const char *name); +int getenv_array(const char *name, void *data, int size, int *psize, + int type_size, bool allow_signed); +#define GETENV_UNSIGNED false /* negative numbers not allowed */ +#define GETENV_SIGNED true /* negative numbers allowed */ + typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var); extern cpu_tick_f *cpu_ticks; @@ -369,7 +391,6 @@ static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } -static __inline intrmask_t splvm(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* @@ -376,8 +397,8 @@ * Common `proc' functions are declared here so that proc.h can be included * less often. */ -int _sleep(void *chan, struct lock_object *lock, int pri, const char *wmesg, - sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1); +int _sleep(void * _Nonnull chan, struct lock_object *lock, int pri, + const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) @@ -384,8 +405,8 @@ #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) -int msleep_spin_sbt(void *chan, struct mtx *mtx, const char *wmesg, - sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1); +int msleep_spin_sbt(void * _Nonnull chan, struct mtx *mtx, + const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) @@ -393,13 +414,16 @@ int flags); #define pause(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK) +#define pause_sig(wmesg, timo) \ + pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) -void wakeup(void *chan) __nonnull(1); -void wakeup_one(void *chan) __nonnull(1); +void wakeup(void * chan); +void wakeup_one(void * chan); +void wakeup_any(void * chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning @@ -409,6 +433,11 @@ dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); +#ifdef __LP64__ +size_t devfs_iosize_max(void); +size_t iosize_max(void); +#endif + int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ @@ -419,7 +448,6 @@ struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_rel(struct root_hold_token *h); -void root_mount_wait(void); int root_mounted(void); @@ -439,8 +467,27 @@ void intr_prof_stack_use(struct thread *td, struct trapframe *frame); -extern void (*softdep_ast_cleanup)(void); - void counted_warning(unsigned *counter, const char *msg); +/* + * APIs to manage deprecation and obsolescence. 
+ */ +struct device; +void _gone_in(int major, const char *msg); +void _gone_in_dev(struct device *dev, int major, const char *msg); +#ifdef NO_OBSOLETE_CODE +#define __gone_ok(m, msg) \ + _Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version)), \ + "Obsolete code" msg); +#else +#define __gone_ok(m, msg) +#endif +#define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) +#define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) +#define gone_by_fcp101_dev(dev) \ + gone_in_dev((dev), 13, \ + "see https://github.com/freebsd/fcp/blob/master/fcp-0101.md") + +__NULLABILITY_PRAGMA_POP + #endif /* !_SYS_SYSTM_H_ */ From laffer1 at midnightbsd.org Sat Feb 8 15:02:45 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:02:45 -0500 (EST) Subject: [Midnightbsd-cvs] src [12335] trunk/sys/sys/stat.h: sync with FreeBSD 11-stable Message-ID: <202002082002.018K2jV3068019@stargazer.midnightbsd.org> Revision: 12335 http://svnweb.midnightbsd.org/src/?rev=12335 Author: laffer1 Date: 2020-02-08 15:02:44 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/stat.h Modified: trunk/sys/sys/stat.h =================================================================== --- trunk/sys/sys/stat.h 2020-02-08 20:02:25 UTC (rev 12334) +++ trunk/sys/sys/stat.h 2020-02-08 20:02:44 UTC (rev 12335) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)stat.h 8.12 (Berkeley) 6/16/95 - * $FreeBSD: stable/10/sys/sys/stat.h 293474 2016-01-09 14:20:23Z dchagin $ + * $FreeBSD: stable/11/sys/sys/stat.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_STAT_H_ @@ -348,12 +348,12 @@ #endif int stat(const char * __restrict, struct stat * __restrict); mode_t umask(mode_t); -#if __BSD_VISIBLE || __POSIX_VISIBLE >= 200809 +#if __POSIX_VISIBLE >= 200809 int fstatat(int, const char *, struct stat *, int); int mkdirat(int, const char *, mode_t); int mkfifoat(int, const char *, mode_t); #endif -#if __BSD_VISIBLE || __XSI_VISIBLE >= 700 +#if __XSI_VISIBLE >= 700 int mknodat(int, const char *, mode_t, dev_t); #endif __END_DECLS From laffer1 at midnightbsd.org Sat Feb 8 15:03:36 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:03:36 -0500 (EST) Subject: [Midnightbsd-cvs] src [12336] trunk/sys/sys/smp.h: sync with FreeBSD 11-stable Message-ID: <202002082003.018K3aHi068089@stargazer.midnightbsd.org> Revision: 12336 http://svnweb.midnightbsd.org/src/?rev=12336 Author: laffer1 Date: 2020-02-08 15:03:36 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/smp.h Modified: trunk/sys/sys/smp.h =================================================================== --- trunk/sys/sys/smp.h 2020-02-08 20:02:44 UTC (rev 12335) +++ trunk/sys/sys/smp.h 2020-02-08 20:03:36 UTC (rev 12336) @@ -7,7 +7,7 @@ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * - * $FreeBSD: stable/10/sys/sys/smp.h 331910 2018-04-03 07:52:06Z avg $ + * $FreeBSD: stable/11/sys/sys/smp.h 331909 2018-04-03 07:31:22Z avg $ */ #ifndef _SYS_SMP_H_ @@ -18,9 +18,52 @@ #ifndef LOCORE #include <sys/cpuset.h> +#include <sys/queue.h> /* - * Topology of a NUMA or HTT system. + * Types of nodes in the topological tree. + */ +typedef enum { + /* No node has this type; can be used in topo API calls. 
*/ + TOPO_TYPE_DUMMY, + /* Processing unit aka computing unit aka logical CPU. */ + TOPO_TYPE_PU, + /* Physical subdivision of a package. */ + TOPO_TYPE_CORE, + /* CPU L1/L2/L3 cache. */ + TOPO_TYPE_CACHE, + /* Package aka chip, equivalent to socket. */ + TOPO_TYPE_PKG, + /* NUMA node. */ + TOPO_TYPE_NODE, + /* Other logical or physical grouping of PUs. */ + /* E.g. PUs on the same dye, or PUs sharing an FPU. */ + TOPO_TYPE_GROUP, + /* The whole system. */ + TOPO_TYPE_SYSTEM +} topo_node_type; + +/* Hardware indenitifier of a topology component. */ +typedef unsigned int hwid_t; +/* Logical CPU idenitifier. */ +typedef int cpuid_t; + +/* A node in the topology. */ +struct topo_node { + struct topo_node *parent; + TAILQ_HEAD(topo_children, topo_node) children; + TAILQ_ENTRY(topo_node) siblings; + cpuset_t cpuset; + topo_node_type type; + uintptr_t subtype; + hwid_t hwid; + cpuid_t id; + int nchildren; + int cpu_count; +}; + +/* + * Scheduling topology of a NUMA or SMP system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also @@ -53,6 +96,8 @@ #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 +#define MAX_CACHE_LEVELS CG_SHARE_L3 + /* * Behavior modifiers for load balancing and affinity. */ @@ -61,10 +106,29 @@ #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ /* - * Convenience routines for building topologies. + * Convenience routines for building and traversing topologies. */ #ifdef SMP +void topo_init_node(struct topo_node *node); +void topo_init_root(struct topo_node *root); +struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +void topo_promote_child(struct topo_node *child); +struct topo_node * topo_next_node(struct topo_node *top, + struct topo_node *node); +struct topo_node * topo_next_nonchild_node(struct topo_node *top, + struct topo_node *node); +void topo_set_pu_id(struct topo_node *node, cpuid_t id); +int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count, + int *cores_per_pkg, int *thrs_per_core); + +#define TOPO_FOREACH(i, root) \ + for (i = root; i != NULL; i = topo_next_node(root, i)) + struct cpu_group *smp_topo(void); +struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, @@ -89,6 +153,7 @@ extern volatile int smp_started; extern cpuset_t all_cpus; +extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ /* * Macro allowing us to determine whether a CPU is absent at any given @@ -179,7 +244,14 @@ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); +/* + * smp_no_rendevous_barrier was renamed to smp_no_rendezvous_barrier + * in __FreeBSD_version 1101508, with the old name remaining in 11.x + * as an alias for compatibility. The old name will be gone in 12.0 + * (__FreeBSD_version >= 1200028). 
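
For illustration only, not part of the patch above: TOPO_FOREACH() visits every node reachable from a root via topo_next_node(), so counting logical CPUs in a topology looks like the sketch below. 'root' stands for whatever struct topo_node pointer the caller already has (for example one set up with topo_init_root()); the variable names are placeholders.

        struct topo_node *node;
        int npus = 0;

        TOPO_FOREACH(node, root) {
                if (node->type == TOPO_TYPE_PU)
                        npus++;         /* one per logical CPU */
        }
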
+ */ void smp_no_rendevous_barrier(void *); +void smp_no_rendezvous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), From laffer1 at midnightbsd.org Sat Feb 8 15:04:23 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:04:23 -0500 (EST) Subject: [Midnightbsd-cvs] src [12337] trunk/sys/sys/timeet.h: sync with FreeBSD 11-stable Message-ID: <202002082004.018K4NxK068152@stargazer.midnightbsd.org> Revision: 12337 http://svnweb.midnightbsd.org/src/?rev=12337 Author: laffer1 Date: 2020-02-08 15:04:23 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/timeet.h Modified: trunk/sys/sys/timeet.h =================================================================== --- trunk/sys/sys/timeet.h 2020-02-08 20:03:36 UTC (rev 12336) +++ trunk/sys/sys/timeet.h 2020-02-08 20:04:23 UTC (rev 12337) @@ -24,7 +24,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/timeet.h 266347 2014-05-17 20:10:12Z ian $ + * $FreeBSD: stable/11/sys/sys/timeet.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_TIMEEC_H_ @@ -54,7 +54,7 @@ struct eventtimer { SLIST_ENTRY(eventtimer) et_all; /* Pointer to the next event timer. */ - char *et_name; + const char *et_name; /* Name of the event timer. */ int et_flags; /* Set of capabilities flags: */ From laffer1 at midnightbsd.org Sat Feb 8 15:04:47 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:04:47 -0500 (EST) Subject: [Midnightbsd-cvs] src [12338] trunk/sys/sys/timetc.h: sync with FreeBSD 11-stable Message-ID: <202002082004.018K4lbI068211@stargazer.midnightbsd.org> Revision: 12338 http://svnweb.midnightbsd.org/src/?rev=12338 Author: laffer1 Date: 2020-02-08 15:04:46 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/timetc.h Modified: trunk/sys/sys/timetc.h =================================================================== --- trunk/sys/sys/timetc.h 2020-02-08 20:04:23 UTC (rev 12337) +++ trunk/sys/sys/timetc.h 2020-02-08 20:04:46 UTC (rev 12338) @@ -7,7 +7,7 @@ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * - * $FreeBSD: stable/10/sys/sys/timetc.h 280973 2015-04-02 01:02:42Z jhb $ + * $FreeBSD: stable/11/sys/sys/timetc.h 305866 2016-09-16 10:04:28Z kib $ */ #ifndef _SYS_TIMETC_H_ @@ -29,8 +29,14 @@ */ struct timecounter; +struct vdso_timehands; +struct vdso_timehands32; typedef u_int timecounter_get_t(struct timecounter *); typedef void timecounter_pps_t(struct timecounter *); +typedef uint32_t timecounter_fill_vdso_timehands_t(struct vdso_timehands *, + struct timecounter *); +typedef uint32_t timecounter_fill_vdso_timehands32_t(struct vdso_timehands32 *, + struct timecounter *); struct timecounter { timecounter_get_t *tc_get_timecount; @@ -50,7 +56,7 @@ /* This mask should mask off any unimplemented bits. */ uint64_t tc_frequency; /* Frequency of the counter in Hz. */ - char *tc_name; + const char *tc_name; /* Name of the timecounter. */ int tc_quality; /* @@ -69,6 +75,8 @@ /* Pointer to the timecounter's private parts. */ struct timecounter *tc_next; /* Pointer to the next timecounter. 
*/ + timecounter_fill_vdso_timehands_t *tc_fill_vdso_timehands; + timecounter_fill_vdso_timehands32_t *tc_fill_vdso_timehands32; }; extern struct timecounter *timecounter; From laffer1 at midnightbsd.org Sat Feb 8 15:06:07 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:06:07 -0500 (EST) Subject: [Midnightbsd-cvs] src [12339] trunk/sys/sys/sysproto.h: sync with FreeBSD 11-stable Message-ID: <202002082006.018K67tU068928@stargazer.midnightbsd.org> Revision: 12339 http://svnweb.midnightbsd.org/src/?rev=12339 Author: laffer1 Date: 2020-02-08 15:06:06 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sysproto.h Modified: trunk/sys/sys/sysproto.h =================================================================== --- trunk/sys/sys/sysproto.h 2020-02-08 20:04:46 UTC (rev 12338) +++ trunk/sys/sys/sysproto.h 2020-02-08 20:06:06 UTC (rev 12339) @@ -1,8 +1,9 @@ +/* $MidnightBSD$ */ /* * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $MidnightBSD$ + * $FreeBSD: stable/11/sys/sys/sysproto.h 330964 2018-03-15 02:20:06Z eadler $ */ #ifndef _SYS_SYSPROTO_H_ @@ -182,7 +183,7 @@ struct dup_args { char fd_l_[PADL_(u_int)]; u_int fd; char fd_r_[PADR_(u_int)]; }; -struct pipe_args { +struct freebsd10_pipe_args { register_t dummy; }; struct getegid_args { @@ -531,20 +532,6 @@ char a3_l_[PADL_(int)]; int a3; char a3_r_[PADR_(int)]; char a4_l_[PADL_(int)]; int a4; char a4_r_[PADR_(int)]; }; -struct freebsd6_pread_args { - char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; - char buf_l_[PADL_(void *)]; void * buf; char buf_r_[PADR_(void *)]; - char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; -}; -struct freebsd6_pwrite_args { - char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; - char buf_l_[PADL_(const void *)]; const void * buf; char buf_r_[PADR_(const void *)]; - char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; -}; struct setfib_args { char fibnum_l_[PADL_(int)]; int fibnum; char fibnum_r_[PADR_(int)]; }; @@ -594,31 +581,6 @@ char count_l_[PADL_(u_int)]; u_int count; char count_r_[PADR_(u_int)]; char basep_l_[PADL_(long *)]; long * basep; char basep_r_[PADR_(long *)]; }; -struct freebsd6_mmap_args { - char addr_l_[PADL_(caddr_t)]; caddr_t addr; char addr_r_[PADR_(caddr_t)]; - char len_l_[PADL_(size_t)]; size_t len; char len_r_[PADR_(size_t)]; - char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)]; - char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; - char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char pos_l_[PADL_(off_t)]; off_t pos; char pos_r_[PADR_(off_t)]; -}; -struct freebsd6_lseek_args { - char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; - char whence_l_[PADL_(int)]; int whence; char whence_r_[PADR_(int)]; -}; -struct freebsd6_truncate_args { - char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char 
length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; -}; -struct freebsd6_ftruncate_args { - char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; - char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; - char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; -}; struct sysctl_args { char name_l_[PADL_(int *)]; int * name; char name_r_[PADR_(int *)]; char namelen_l_[PADL_(u_int)]; u_int namelen; char namelen_r_[PADR_(u_int)]; @@ -736,6 +698,12 @@ struct ffclock_getestimate_args { char cest_l_[PADL_(struct ffclock_estimate *)]; struct ffclock_estimate * cest; char cest_r_[PADR_(struct ffclock_estimate *)]; }; +struct clock_nanosleep_args { + char clock_id_l_[PADL_(clockid_t)]; clockid_t clock_id; char clock_id_r_[PADR_(clockid_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char rqtp_l_[PADL_(const struct timespec *)]; const struct timespec * rqtp; char rqtp_r_[PADR_(const struct timespec *)]; + char rmtp_l_[PADL_(struct timespec *)]; struct timespec * rmtp; char rmtp_r_[PADR_(struct timespec *)]; +}; struct clock_getcpuclockid2_args { char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; char which_l_[PADL_(int)]; int which; char which_r_[PADR_(int)]; @@ -882,18 +850,6 @@ struct aio_error_args { char aiocbp_l_[PADL_(struct aiocb *)]; struct aiocb * aiocbp; char aiocbp_r_[PADR_(struct aiocb *)]; }; -struct oaio_read_args { - char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)]; -}; -struct oaio_write_args { - char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)]; -}; -struct olio_listio_args { - char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; - char acb_list_l_[PADL_(struct oaiocb *const *)]; struct oaiocb *const * acb_list; char acb_list_r_[PADR_(struct oaiocb *const *)]; - char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; - char sig_l_[PADL_(struct osigevent *)]; struct osigevent * sig; char sig_r_[PADR_(struct osigevent *)]; -}; struct yield_args { register_t dummy; }; @@ -1157,7 +1113,7 @@ struct getfsstat_args { char buf_l_[PADL_(struct statfs *)]; struct statfs * buf; char buf_r_[PADR_(struct statfs *)]; char bufsize_l_[PADL_(long)]; long bufsize; char bufsize_r_[PADR_(long)]; - char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; }; struct statfs_args { char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; @@ -1300,12 +1256,6 @@ char id_l_[PADL_(long)]; long id; char id_r_[PADR_(long)]; char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)]; }; -struct _umtx_lock_args { - char umtx_l_[PADL_(struct umtx *)]; struct umtx * umtx; char umtx_r_[PADR_(struct umtx *)]; -}; -struct _umtx_unlock_args { - char umtx_l_[PADL_(struct umtx *)]; struct umtx * umtx; char umtx_r_[PADR_(struct umtx *)]; -}; struct jail_attach_args { char jid_l_[PADL_(int)]; int jid; char jid_r_[PADR_(int)]; }; @@ -1834,6 +1784,19 @@ char times_l_[PADL_(struct timespec *)]; struct timespec * times; char times_r_[PADR_(struct timespec *)]; char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; }; +struct numa_getaffinity_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char policy_l_[PADL_(struct vm_domain_policy_entry *)]; struct vm_domain_policy_entry * policy; char policy_r_[PADR_(struct vm_domain_policy_entry *)]; 
+}; +struct numa_setaffinity_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char policy_l_[PADL_(const struct vm_domain_policy_entry *)]; const struct vm_domain_policy_entry * policy; char policy_r_[PADR_(const struct vm_domain_policy_entry *)]; +}; +struct fdatasync_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -1870,7 +1833,6 @@ int sys_kill(struct thread *, struct kill_args *); int sys_getppid(struct thread *, struct getppid_args *); int sys_dup(struct thread *, struct dup_args *); -int sys_pipe(struct thread *, struct pipe_args *); int sys_getegid(struct thread *, struct getegid_args *); int sys_profil(struct thread *, struct profil_args *); int sys_ktrace(struct thread *, struct ktrace_args *); @@ -1946,8 +1908,6 @@ int sys_semsys(struct thread *, struct semsys_args *); int sys_msgsys(struct thread *, struct msgsys_args *); int sys_shmsys(struct thread *, struct shmsys_args *); -int freebsd6_pread(struct thread *, struct freebsd6_pread_args *); -int freebsd6_pwrite(struct thread *, struct freebsd6_pwrite_args *); int sys_setfib(struct thread *, struct setfib_args *); int sys_ntp_adjtime(struct thread *, struct ntp_adjtime_args *); int sys_setgid(struct thread *, struct setgid_args *); @@ -1961,10 +1921,6 @@ int sys_getrlimit(struct thread *, struct __getrlimit_args *); int sys_setrlimit(struct thread *, struct __setrlimit_args *); int sys_getdirentries(struct thread *, struct getdirentries_args *); -int freebsd6_mmap(struct thread *, struct freebsd6_mmap_args *); -int freebsd6_lseek(struct thread *, struct freebsd6_lseek_args *); -int freebsd6_truncate(struct thread *, struct freebsd6_truncate_args *); -int freebsd6_ftruncate(struct thread *, struct freebsd6_ftruncate_args *); int sys___sysctl(struct thread *, struct sysctl_args *); int sys_mlock(struct thread *, struct mlock_args *); int sys_munlock(struct thread *, struct munlock_args *); @@ -1992,6 +1948,7 @@ int sys_ffclock_getcounter(struct thread *, struct ffclock_getcounter_args *); int sys_ffclock_setestimate(struct thread *, struct ffclock_setestimate_args *); int sys_ffclock_getestimate(struct thread *, struct ffclock_getestimate_args *); +int sys_clock_nanosleep(struct thread *, struct clock_nanosleep_args *); int sys_clock_getcpuclockid2(struct thread *, struct clock_getcpuclockid2_args *); int sys_ntp_gettime(struct thread *, struct ntp_gettime_args *); int sys_minherit(struct thread *, struct minherit_args *); @@ -2029,9 +1986,6 @@ int sys_aio_suspend(struct thread *, struct aio_suspend_args *); int sys_aio_cancel(struct thread *, struct aio_cancel_args *); int sys_aio_error(struct thread *, struct aio_error_args *); -int sys_oaio_read(struct thread *, struct oaio_read_args *); -int sys_oaio_write(struct thread *, struct oaio_write_args *); -int sys_olio_listio(struct thread *, struct olio_listio_args *); int sys_yield(struct thread *, struct yield_args *); int sys_mlockall(struct thread *, struct mlockall_args *); int sys_munlockall(struct thread *, struct munlockall_args *); @@ -2123,8 +2077,6 @@ int sys_thr_exit(struct thread *, struct thr_exit_args *); int sys_thr_self(struct thread *, struct thr_self_args *); int sys_thr_kill(struct thread *, struct thr_kill_args *); -int sys__umtx_lock(struct thread *, struct 
_umtx_lock_args *); -int sys__umtx_unlock(struct thread *, struct _umtx_unlock_args *); int sys_jail_attach(struct thread *, struct jail_attach_args *); int sys_extattr_list_fd(struct thread *, struct extattr_list_fd_args *); int sys_extattr_list_file(struct thread *, struct extattr_list_file_args *); @@ -2230,6 +2182,9 @@ int sys_ppoll(struct thread *, struct ppoll_args *); int sys_futimens(struct thread *, struct futimens_args *); int sys_utimensat(struct thread *, struct utimensat_args *); +int sys_numa_getaffinity(struct thread *, struct numa_getaffinity_args *); +int sys_numa_setaffinity(struct thread *, struct numa_setaffinity_args *); +int sys_fdatasync(struct thread *, struct fdatasync_args *); #ifdef COMPAT_43 @@ -2408,7 +2363,7 @@ struct freebsd4_getfsstat_args { char buf_l_[PADL_(struct ostatfs *)]; struct ostatfs * buf; char buf_r_[PADR_(struct ostatfs *)]; char bufsize_l_[PADL_(long)]; long bufsize; char bufsize_r_[PADR_(long)]; - char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; }; struct freebsd4_statfs_args { char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; @@ -2466,6 +2421,66 @@ #ifdef COMPAT_FREEBSD6 +struct freebsd6_pread_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(void *)]; void * buf; char buf_r_[PADR_(void *)]; + char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; +}; +struct freebsd6_pwrite_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(const void *)]; const void * buf; char buf_r_[PADR_(const void *)]; + char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; +}; +struct freebsd6_mmap_args { + char addr_l_[PADL_(caddr_t)]; caddr_t addr; char addr_r_[PADR_(caddr_t)]; + char len_l_[PADL_(size_t)]; size_t len; char len_r_[PADR_(size_t)]; + char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char pos_l_[PADL_(off_t)]; off_t pos; char pos_r_[PADR_(off_t)]; +}; +struct freebsd6_lseek_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; + char whence_l_[PADL_(int)]; int whence; char whence_r_[PADR_(int)]; +}; +struct freebsd6_truncate_args { + char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; +}; +struct freebsd6_ftruncate_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)]; + char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; +}; +struct freebsd6_aio_read_args { + char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)]; +}; +struct freebsd6_aio_write_args { + char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct 
oaiocb *)]; +}; +struct freebsd6_lio_listio_args { + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char acb_list_l_[PADL_(struct oaiocb *const *)]; struct oaiocb *const * acb_list; char acb_list_r_[PADR_(struct oaiocb *const *)]; + char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; + char sig_l_[PADL_(struct osigevent *)]; struct osigevent * sig; char sig_r_[PADR_(struct osigevent *)]; +}; +int freebsd6_pread(struct thread *, struct freebsd6_pread_args *); +int freebsd6_pwrite(struct thread *, struct freebsd6_pwrite_args *); +int freebsd6_mmap(struct thread *, struct freebsd6_mmap_args *); +int freebsd6_lseek(struct thread *, struct freebsd6_lseek_args *); +int freebsd6_truncate(struct thread *, struct freebsd6_truncate_args *); +int freebsd6_ftruncate(struct thread *, struct freebsd6_ftruncate_args *); +int freebsd6_aio_read(struct thread *, struct freebsd6_aio_read_args *); +int freebsd6_aio_write(struct thread *, struct freebsd6_aio_write_args *); +int freebsd6_lio_listio(struct thread *, struct freebsd6_lio_listio_args *); #endif /* COMPAT_FREEBSD6 */ @@ -2494,11 +2509,18 @@ #endif /* COMPAT_FREEBSD7 */ + +#ifdef COMPAT_FREEBSD10 + +int freebsd10_pipe(struct thread *, struct freebsd10_pipe_args *); + +#endif /* COMPAT_FREEBSD10 */ + #define SYS_AUE_syscall AUE_NULL #define SYS_AUE_exit AUE_EXIT #define SYS_AUE_fork AUE_FORK -#define SYS_AUE_read AUE_NULL -#define SYS_AUE_write AUE_NULL +#define SYS_AUE_read AUE_READ +#define SYS_AUE_write AUE_WRITE #define SYS_AUE_open AUE_OPEN_RWTC #define SYS_AUE_close AUE_CLOSE #define SYS_AUE_wait4 AUE_WAIT4 @@ -2535,7 +2557,7 @@ #define SYS_AUE_getppid AUE_GETPPID #define SYS_AUE_olstat AUE_LSTAT #define SYS_AUE_dup AUE_DUP -#define SYS_AUE_pipe AUE_PIPE +#define SYS_AUE_freebsd10_pipe AUE_PIPE #define SYS_AUE_getegid AUE_GETEGID #define SYS_AUE_profil AUE_PROFILE #define SYS_AUE_ktrace AUE_KTRACE @@ -2698,6 +2720,7 @@ #define SYS_AUE_ffclock_getcounter AUE_NULL #define SYS_AUE_ffclock_setestimate AUE_NULL #define SYS_AUE_ffclock_getestimate AUE_NULL +#define SYS_AUE_clock_nanosleep AUE_NULL #define SYS_AUE_clock_getcpuclockid2 AUE_NULL #define SYS_AUE_ntp_gettime AUE_NULL #define SYS_AUE_minherit AUE_MINHERIT @@ -2736,9 +2759,9 @@ #define SYS_AUE_aio_suspend AUE_NULL #define SYS_AUE_aio_cancel AUE_NULL #define SYS_AUE_aio_error AUE_NULL -#define SYS_AUE_oaio_read AUE_NULL -#define SYS_AUE_oaio_write AUE_NULL -#define SYS_AUE_olio_listio AUE_NULL +#define SYS_AUE_freebsd6_aio_read AUE_NULL +#define SYS_AUE_freebsd6_aio_write AUE_NULL +#define SYS_AUE_freebsd6_lio_listio AUE_NULL #define SYS_AUE_yield AUE_NULL #define SYS_AUE_mlockall AUE_MLOCKALL #define SYS_AUE_munlockall AUE_MUNLOCKALL @@ -2833,8 +2856,6 @@ #define SYS_AUE_thr_exit AUE_NULL #define SYS_AUE_thr_self AUE_NULL #define SYS_AUE_thr_kill AUE_NULL -#define SYS_AUE__umtx_lock AUE_NULL -#define SYS_AUE__umtx_unlock AUE_NULL #define SYS_AUE_jail_attach AUE_NULL #define SYS_AUE_extattr_list_fd AUE_EXTATTR_LIST_FD #define SYS_AUE_extattr_list_file AUE_EXTATTR_LIST_FILE @@ -2940,6 +2961,9 @@ #define SYS_AUE_ppoll AUE_POLL #define SYS_AUE_futimens AUE_FUTIMES #define SYS_AUE_utimensat AUE_FUTIMESAT +#define SYS_AUE_numa_getaffinity AUE_NULL +#define SYS_AUE_numa_setaffinity AUE_NULL +#define SYS_AUE_fdatasync AUE_FSYNC #undef PAD_ #undef PADL_ From laffer1 at midnightbsd.org Sat Feb 8 15:07:15 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:07:15 -0500 (EST) Subject: [Midnightbsd-cvs] src [12340] 
trunk/sys/sys/taskqueue.h: sync with FreeBSD 11-stable Message-ID: <202002082007.018K7Fth069005@stargazer.midnightbsd.org> Revision: 12340 http://svnweb.midnightbsd.org/src/?rev=12340 Author: laffer1 Date: 2020-02-08 15:07:14 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/taskqueue.h Modified: trunk/sys/sys/taskqueue.h =================================================================== --- trunk/sys/sys/taskqueue.h 2020-02-08 20:06:06 UTC (rev 12339) +++ trunk/sys/sys/taskqueue.h 2020-02-08 20:07:14 UTC (rev 12340) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/taskqueue.h 315268 2017-03-14 16:00:33Z hselasky $ + * $FreeBSD: stable/11/sys/sys/taskqueue.h 341154 2018-11-28 17:00:18Z markj $ */ #ifndef _SYS_TASKQUEUE_H_ @@ -37,8 +37,10 @@ #include <sys/queue.h> #include <sys/_task.h> #include <sys/_callout.h> +#include <sys/_cpuset.h> struct taskqueue; +struct taskqgroup; struct thread; struct timeout_task { @@ -55,6 +57,7 @@ #define TASKQUEUE_CALLBACK_TYPE_MIN TASKQUEUE_CALLBACK_TYPE_INIT #define TASKQUEUE_CALLBACK_TYPE_MAX TASKQUEUE_CALLBACK_TYPE_SHUTDOWN #define TASKQUEUE_NUM_CALLBACKS TASKQUEUE_CALLBACK_TYPE_MAX + 1 +#define TASKQUEUE_NAMELEN 32 typedef void (*taskqueue_callback_fn)(void *context); @@ -72,9 +75,14 @@ void *context); int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) __printflike(4, 5); +int taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, + int pri, cpuset_t *mask, const char *name, ...) __printflike(5, 6); int taskqueue_enqueue(struct taskqueue *queue, struct task *task); int taskqueue_enqueue_timeout(struct taskqueue *queue, struct timeout_task *timeout_task, int ticks); +int taskqueue_enqueue_timeout_sbt(struct taskqueue *queue, + struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, + int flags); int taskqueue_poll_is_busy(struct taskqueue *queue, struct task *task); int taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp); @@ -84,6 +92,7 @@ void taskqueue_drain_timeout(struct taskqueue *queue, struct timeout_task *timeout_task); void taskqueue_drain_all(struct taskqueue *queue); +void taskqueue_quiesce(struct taskqueue *queue); void taskqueue_free(struct taskqueue *queue); void taskqueue_run(struct taskqueue *queue); void taskqueue_block(struct taskqueue *queue); @@ -142,7 +151,7 @@ init; \ } \ \ -SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND, \ +SYSINIT(taskqueue_##name, SI_SUB_TASKQ, SI_ORDER_SECOND, \ taskqueue_define_##name, NULL); \ \ struct __hack @@ -167,7 +176,7 @@ init; \ } \ \ -SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND, \ +SYSINIT(taskqueue_##name, SI_SUB_TASKQ, SI_ORDER_SECOND, \ taskqueue_define_##name, NULL); \ \ struct __hack @@ -197,7 +206,6 @@ * from a fast interrupt handler context. 
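
For illustration only, not part of the patch above: taskqueue_enqueue_timeout_sbt() takes an sbintime_t delay plus a precision and flags, much like callout_reset_sbt(9), instead of the tick count used by taskqueue_enqueue_timeout(). A minimal sketch against the stock taskqueue_thread queue, using the pre-existing TIMEOUT_TASK_INIT() macro from this header; the task and function names are placeholders:

        static struct timeout_task flush_task;

        static void
        flush_fn(void *arg, int pending)
        {
                /* deferred work; 'pending' counts coalesced enqueues */
        }

        /* at initialization time */
        TIMEOUT_TASK_INIT(taskqueue_thread, &flush_task, 0, flush_fn, NULL);

        /* later: run roughly 50 ms from now (precision 0, no flags) */
        taskqueue_enqueue_timeout_sbt(taskqueue_thread, &flush_task,
            50 * SBT_1MS, 0, 0);
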
*/ TASKQUEUE_DECLARE(fast); -int taskqueue_enqueue_fast(struct taskqueue *queue, struct task *task); struct taskqueue *taskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context); From laffer1 at midnightbsd.org Sat Feb 8 15:07:52 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:07:52 -0500 (EST) Subject: [Midnightbsd-cvs] src [12341] trunk/sys/sys/syslimits.h: sync with FreeBSD 11-stable Message-ID: <202002082007.018K7qGg069072@stargazer.midnightbsd.org> Revision: 12341 http://svnweb.midnightbsd.org/src/?rev=12341 Author: laffer1 Date: 2020-02-08 15:07:51 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/syslimits.h Modified: trunk/sys/sys/syslimits.h =================================================================== --- trunk/sys/sys/syslimits.h 2020-02-08 20:07:14 UTC (rev 12340) +++ trunk/sys/sys/syslimits.h 2020-02-08 20:07:51 UTC (rev 12341) @@ -1,3 +1,4 @@ +/* $MidnightBSD$ */ /*- * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. @@ -27,13 +28,13 @@ * SUCH DAMAGE. * * @(#)syslimits.h 8.1 (Berkeley) 6/2/93 - * $MidnightBSD$ + * $FreeBSD: stable/11/sys/sys/syslimits.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_SYSLIMITS_H_ #define _SYS_SYSLIMITS_H_ -#if !defined(_KERNEL) && !defined(_LIMITS_H_) && !defined(_SYS_PARAM_H_) +#if !defined(_STANDALONE) && !defined(_KERNEL) && !defined(_LIMITS_H_) && !defined(_SYS_PARAM_H_) #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif From laffer1 at midnightbsd.org Sat Feb 8 15:08:43 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:08:43 -0500 (EST) Subject: [Midnightbsd-cvs] src [12342] trunk/sys/sys/sem.h: sync with FreeBSD 11-stable Message-ID: <202002082008.018K8hoi069149@stargazer.midnightbsd.org> Revision: 12342 http://svnweb.midnightbsd.org/src/?rev=12342 Author: laffer1 Date: 2020-02-08 15:08:42 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sem.h Modified: trunk/sys/sys/sem.h =================================================================== --- trunk/sys/sys/sem.h 2020-02-08 20:07:51 UTC (rev 12341) +++ trunk/sys/sys/sem.h 2020-02-08 20:08:42 UTC (rev 12342) @@ -1,5 +1,5 @@ /* $MidnightBSD$ */ -/* $FreeBSD: stable/10/sys/sys/sem.h 224016 2011-07-14 14:18:14Z bz $ */ +/* $FreeBSD: stable/11/sys/sys/sem.h 347995 2019-05-20 16:31:45Z kib $ */ /* $NetBSD: sem.h,v 1.5 1994/06/29 06:45:15 cgd Exp $ */ /* @@ -11,6 +11,9 @@ #ifndef _SYS_SEM_H_ #define _SYS_SEM_H_ +#ifdef _WANT_SYSVSEM_INTERNALS +#define _WANT_SYSVIPC_INTERNALS +#endif #include <sys/ipc.h> #ifndef _PID_T_DECLARED @@ -38,7 +41,7 @@ long sem_pad1; /* SVABI/386 says I need this here */ time_t sem_ctime; /* last change time */ /* Times measured in secs since */ - /* 00:00:00 GMT, Jan. 1, 1970 */ + /* 00:00:00 UTC, Jan. 1, 1970, without leap seconds */ long sem_pad2; /* SVABI/386 says I need this here */ long sem_pad3[4]; /* SVABI/386 says I need this here */ }; @@ -51,7 +54,7 @@ time_t sem_otime; /* last operation time */ time_t sem_ctime; /* last change time */ /* Times measured in secs since */ - /* 00:00:00 GMT, Jan. 1, 1970 */ + /* 00:00:00 UTC, Jan. 
1, 1970, without leap seconds */ }; /* @@ -102,8 +105,7 @@ #define SEM_A IPC_W /* alter permission */ #define SEM_R IPC_R /* read permission */ -#ifdef _KERNEL - +#if defined(_KERNEL) || defined(_WANT_SYSVSEM_INTERNALS) /* * semaphore info struct */ @@ -118,7 +120,6 @@ semvmx, /* semaphore maximum value */ semaem; /* adjust on exit max value */ }; -extern struct seminfo seminfo; /* * Kernel wrapper for the user-level structure @@ -132,13 +133,16 @@ /* internal "mode" bits */ #define SEM_ALLOC 01000 /* semaphore is allocated */ #define SEM_DEST 02000 /* semaphore will be destroyed on last detach */ +#endif +#ifdef _KERNEL +extern struct seminfo seminfo; /* * Process sem_undo vectors at proc exit. */ void semexit(struct proc *p); -#else /* ! _KERNEL */ +#else /* !_KERNEL */ __BEGIN_DECLS #if __BSD_VISIBLE From laffer1 at midnightbsd.org Sat Feb 8 15:08:58 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:08:58 -0500 (EST) Subject: [Midnightbsd-cvs] src [12343] trunk/sys/sys/stack.h: sync with FreeBSD 11-stable Message-ID: <202002082008.018K8wgq069202@stargazer.midnightbsd.org> Revision: 12343 http://svnweb.midnightbsd.org/src/?rev=12343 Author: laffer1 Date: 2020-02-08 15:08:57 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/stack.h Modified: trunk/sys/sys/stack.h =================================================================== --- trunk/sys/sys/stack.h 2020-02-08 20:08:42 UTC (rev 12342) +++ trunk/sys/sys/stack.h 2020-02-08 20:08:57 UTC (rev 12343) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/stack.h 227581 2011-11-16 19:06:55Z pjd $ + * $FreeBSD: stable/11/sys/sys/stack.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_STACK_H_ @@ -57,9 +57,10 @@ #define CTRSTACK(m, st, depth, cheap) #endif -/* MD Routine. */ +/* MD Routines. */ struct thread; void stack_save(struct stack *); void stack_save_td(struct stack *, struct thread *); +int stack_save_td_running(struct stack *, struct thread *); #endif From laffer1 at midnightbsd.org Sat Feb 8 15:09:30 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sat, 8 Feb 2020 15:09:30 -0500 (EST) Subject: [Midnightbsd-cvs] src [12344] trunk/sys/sys/sysent.h: sync with FreeBSD 11-stable Message-ID: <202002082009.018K9UHA069258@stargazer.midnightbsd.org> Revision: 12344 http://svnweb.midnightbsd.org/src/?rev=12344 Author: laffer1 Date: 2020-02-08 15:09:29 -0500 (Sat, 08 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sysent.h Modified: trunk/sys/sys/sysent.h =================================================================== --- trunk/sys/sys/sysent.h 2020-02-08 20:08:57 UTC (rev 12343) +++ trunk/sys/sys/sysent.h 2020-02-08 20:09:29 UTC (rev 12344) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sysent.h 303395 2016-07-27 16:27:41Z julian $ + * $FreeBSD: stable/11/sys/sys/sysent.h 346815 2019-04-28 13:16:54Z dchagin $ */ #ifndef _SYS_SYSENT_H_ @@ -39,19 +39,19 @@ struct sysent; struct thread; struct ksiginfo; +struct syscall_args; +enum systrace_probe_t { + SYSTRACE_ENTRY, + SYSTRACE_RETURN, +}; + typedef int sy_call_t(struct thread *, void *); -/* Used by the machine dependent syscall() code. 
*/ -typedef void (*systrace_probe_func_t)(u_int32_t, int, struct sysent *, void *, - int); +typedef void (*systrace_probe_func_t)(struct syscall_args *, + enum systrace_probe_t, int); +typedef void (*systrace_args_func_t)(int, void *, uint64_t *, int *); -/* - * Used by loaded syscalls to convert arguments to a DTrace array - * of 64-bit arguments. - */ -typedef void (*systrace_args_func_t)(int, void *, u_int64_t *, int *); - extern systrace_probe_func_t systrace_probe_func; struct sysent { /* system call table */ @@ -77,9 +77,14 @@ #define SY_THR_ABSENT 0x4 #define SY_THR_INCR 0x8 +#ifdef KLD_MODULE +#define SY_THR_STATIC_KLD 0 +#else +#define SY_THR_STATIC_KLD SY_THR_STATIC +#endif + struct image_params; struct __sigset; -struct syscall_args; struct trapframe; struct vnode; @@ -87,10 +92,8 @@ int sv_size; /* number of entries */ struct sysent *sv_table; /* pointer to sysent */ u_int sv_mask; /* optional mask to index */ - int sv_sigsize; /* size of signal translation table */ - int *sv_sigtbl; /* signal translation table */ int sv_errsize; /* size of errno translation table */ - int *sv_errtbl; /* errno translation table */ + const int *sv_errtbl; /* errno translation table */ int (*sv_transtrap)(int, int); /* translate trap-to-signal mapping */ int (*sv_fixup)(register_t **, struct image_params *); @@ -99,8 +102,6 @@ /* send signal */ char *sv_sigcode; /* start of sigtramp code */ int *sv_szsigcode; /* size of sigtramp code */ - void (*sv_prepsyscall)(struct trapframe *, int *, u_int *, - caddr_t *); char *sv_name; /* name of binary type */ int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); /* function to dump core, or NULL */ @@ -119,27 +120,28 @@ u_long *sv_maxssiz; u_int sv_flags; void (*sv_set_syscall_retval)(struct thread *, int); - int (*sv_fetch_syscall_args)(struct thread *, struct - syscall_args *); + int (*sv_fetch_syscall_args)(struct thread *); const char **sv_syscallnames; + vm_offset_t sv_timekeep_base; vm_offset_t sv_shared_page_base; vm_offset_t sv_shared_page_len; vm_offset_t sv_sigcode_base; - vm_offset_t sv_timekeep_base; - int sv_timekeep_off; - int sv_timekeep_curr; - uint32_t sv_timekeep_gen; void *sv_shared_page_obj; void (*sv_schedtail)(struct thread *); void (*sv_thread_detach)(struct thread *); int (*sv_trap)(struct thread *); + u_long *sv_hwcap; /* Value passed in AT_HWCAP. */ + u_long *sv_hwcap2; /* Value passed in AT_HWCAP2. */ }; -#define SV_ILP32 0x000100 -#define SV_LP64 0x000200 -#define SV_IA32 0x004000 -#define SV_AOUT 0x008000 -#define SV_SHP 0x010000 +#define SV_ILP32 0x000100 /* 32-bit executable. */ +#define SV_LP64 0x000200 /* 64-bit executable. */ +#define SV_IA32 0x004000 /* Intel 32-bit executable. */ +#define SV_AOUT 0x008000 /* a.out executable. */ +#define SV_SHP 0x010000 /* Shared page. */ +#define SV_CAPSICUM 0x020000 /* Force cap_enter() on startup. */ +#define SV_TIMEKEEP 0x040000 /* Shared page timehands. */ +#define SV_HWCAP 0x080000 /* sv_hwcap field is valid. */ #define SV_ABI_MASK 0xff #define SV_ABI_ERRNO(p, e) ((p)->p_sysent->sv_errsize <= 0 ? 
e : \ @@ -151,6 +153,7 @@ /* same as ELFOSABI_XXX, to prevent header pollution */ #define SV_ABI_LINUX 3 #define SV_ABI_FREEBSD 9 +#define SV_ABI_CLOUDABI 17 #define SV_ABI_UNDEF 255 #ifdef _KERNEL @@ -158,7 +161,7 @@ extern struct sysent sysent[]; extern const char *syscallnames[]; -#if defined(__amd64__) || defined(__ia64__) +#if defined(__amd64__) extern int i386_read_exec; #endif @@ -172,6 +175,7 @@ int *offset; /* offset into sysent */ struct sysent *new_sysent; /* new sysent */ struct sysent old_sysent; /* old sysent */ + int flags; /* flags for syscall_register */ }; /* separate initialization vector so it can be used in a substructure */ @@ -230,33 +234,39 @@ int syscall_no; int registered; }; -#define SYSCALL_INIT_HELPER(syscallname) { \ +#define SYSCALL_INIT_HELPER_F(syscallname, flags) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& sys_ ## syscallname, \ - .sy_auevent = SYS_AUE_##syscallname \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ }, \ .syscall_no = SYS_##syscallname \ } -#define SYSCALL_INIT_HELPER_COMPAT(syscallname) { \ +#define SYSCALL_INIT_HELPER_COMPAT_F(syscallname, flags) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& syscallname, \ - .sy_auevent = SYS_AUE_##syscallname \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ }, \ .syscall_no = SYS_##syscallname \ } +#define SYSCALL_INIT_HELPER(syscallname) \ + SYSCALL_INIT_HELPER_F(syscallname, 0) +#define SYSCALL_INIT_HELPER_COMPAT(syscallname) \ + SYSCALL_INIT_HELPER_COMPAT_F(syscallname, 0) #define SYSCALL_INIT_LAST { \ .syscall_no = NO_SYSCALL \ } int syscall_register(int *offset, struct sysent *new_sysent, - struct sysent *old_sysent); + struct sysent *old_sysent, int flags); int syscall_deregister(int *offset, struct sysent *old_sysent); int syscall_module_handler(struct module *mod, int what, void *arg); -int syscall_helper_register(struct syscall_helper_data *sd); +int syscall_helper_register(struct syscall_helper_data *sd, int flags); int syscall_helper_unregister(struct syscall_helper_data *sd); struct proc; @@ -275,6 +285,7 @@ int shared_page_fill(int size, int align, const void *data); void shared_page_write(int base, int size, const void *data); void exec_sysvec_init(void *param); +void exec_inittk(void); #define INIT_SYSENTVEC(name, sv) \ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ From laffer1 at midnightbsd.org Sun Feb 9 11:49:32 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 11:49:32 -0500 (EST) Subject: [Midnightbsd-cvs] src [12345] trunk/sys/sys/spigenio.h: sync with FreeBSD 11-stable Message-ID: <202002091649.019GnWGP073936@stargazer.midnightbsd.org> Revision: 12345 http://svnweb.midnightbsd.org/src/?rev=12345 Author: laffer1 Date: 2020-02-09 11:49:31 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Added Paths: ----------- trunk/sys/sys/spigenio.h Added: trunk/sys/sys/spigenio.h =================================================================== --- trunk/sys/sys/spigenio.h (rev 0) +++ trunk/sys/sys/spigenio.h 2020-02-09 16:49:31 UTC (rev 12345) @@ -0,0 +1,55 @@ +/* $MidnightBSD$ */ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: stable/11/sys/sys/spigenio.h 332942 2018-04-24 17:00:08Z ian $ + */ + +#ifndef _SYS_SPIGENIO_H_ +#define _SYS_SPIGENIO_H_ + +#include <sys/_iovec.h> + +struct spigen_transfer { + struct iovec st_command; /* master to slave */ + struct iovec st_data; /* slave to master and/or master to slave */ +}; + +struct spigen_transfer_mmapped { + size_t stm_command_length; /* at offset 0 in mmap(2) area */ + size_t stm_data_length; /* at offset stm_command_length */ +}; + +#define SPIGENIOC_BASE 'S' +#define SPIGENIOC_TRANSFER _IOW(SPIGENIOC_BASE, 0, \ + struct spigen_transfer) +#define SPIGENIOC_TRANSFER_MMAPPED _IOW(SPIGENIOC_BASE, 1, \ + struct spigen_transfer_mmapped) +#define SPIGENIOC_GET_CLOCK_SPEED _IOR(SPIGENIOC_BASE, 2, uint32_t) +#define SPIGENIOC_SET_CLOCK_SPEED _IOW(SPIGENIOC_BASE, 3, uint32_t) +#define SPIGENIOC_GET_SPI_MODE _IOR(SPIGENIOC_BASE, 4, uint32_t) +#define SPIGENIOC_SET_SPI_MODE _IOW(SPIGENIOC_BASE, 5, uint32_t) + +#endif /* !_SYS_SPIGENIO_H_ */ Property changes on: trunk/sys/sys/spigenio.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +MidnightBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property From laffer1 at midnightbsd.org Sun Feb 9 12:03:30 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:03:30 -0500 (EST) Subject: [Midnightbsd-cvs] src [12346] trunk/sys/sys/syscallsubr.h: sync with FreeBSD 11-stable Message-ID: <202002091703.019H3UID076491@stargazer.midnightbsd.org> Revision: 12346 http://svnweb.midnightbsd.org/src/?rev=12346 Author: laffer1 Date: 2020-02-09 12:03:29 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/syscallsubr.h Modified: trunk/sys/sys/syscallsubr.h =================================================================== --- trunk/sys/sys/syscallsubr.h 2020-02-09 16:49:31 UTC (rev 12345) +++ trunk/sys/sys/syscallsubr.h 2020-02-09 17:03:29 UTC (rev 12346) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
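As a rough illustration of the spigen ioctl interface added in r12345 above, the following userland sketch issues one SPIGENIOC_TRANSFER and reads back the bus clock. It is only a sketch: the /dev/spigen0.0 device path and the 0x9f command byte are assumptions for illustration, not part of the change.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/spigenio.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    struct spigen_transfer st;
    uint8_t cmd[1] = { 0x9f };      /* example command byte (assumed) */
    uint8_t data[3];
    uint32_t speed;
    int fd;

    fd = open("/dev/spigen0.0", O_RDWR);   /* device name is an assumption */
    if (fd < 0)
        err(1, "open");

    if (ioctl(fd, SPIGENIOC_GET_CLOCK_SPEED, &speed) == 0)
        printf("clock %u Hz\n", speed);

    memset(&st, 0, sizeof(st));
    st.st_command.iov_base = cmd;          /* master to slave */
    st.st_command.iov_len = sizeof(cmd);
    st.st_data.iov_base = data;            /* slave to master */
    st.st_data.iov_len = sizeof(data);

    if (ioctl(fd, SPIGENIOC_TRANSFER, &st) < 0)
        err(1, "SPIGENIOC_TRANSFER");

    close(fd);
    return (0);
}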
* - * $FreeBSD: stable/10/sys/sys/syscallsubr.h 321009 2017-07-15 14:48:31Z dchagin $ + * $FreeBSD: stable/11/sys/sys/syscallsubr.h 356634 2020-01-11 15:06:06Z kevans $ */ #ifndef _SYS_SYSCALLSUBR_H_ @@ -34,8 +34,10 @@ #include <sys/socket.h> #include <sys/mac.h> #include <sys/mount.h> +#include <sys/_cpuset.h> struct file; +struct filecaps; enum idtype; struct itimerval; struct image_args; @@ -59,6 +61,8 @@ struct sched_param; struct __wrusage; +typedef int (*mmap_check_fp_fn)(struct file *, int, int, int); + int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen, u_int path_max); int kern_accept(struct thread *td, int s, struct sockaddr **name, @@ -65,8 +69,6 @@ socklen_t *namelen, struct file **fp); int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp); -int kern_access(struct thread *td, char *path, enum uio_seg pathseg, - int flags); int kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flags, int mode); int kern_adjtime(struct thread *td, struct timeval *delta, @@ -73,14 +75,11 @@ struct timeval *olddelta); int kern_alternate_path(struct thread *td, const char *prefix, const char *path, enum uio_seg pathseg, char **pathbuf, int create, int dirfd); -int kern_bind(struct thread *td, int fd, struct sockaddr *sa); +int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa); int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds); +int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights); int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg); -int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, - int mode); -int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, - int gid); int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, clockid_t *clk_id); int kern_clock_getres(struct thread *td, clockid_t clock_id, @@ -87,12 +86,23 @@ struct timespec *ts); int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats); +int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, + const struct timespec *rqtp, struct timespec *rmtp); int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats); int kern_close(struct thread *td, int fd); -int kern_connect(struct thread *td, int fd, struct sockaddr *sa); -int kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, - int flags); +int kern_connectat(struct thread *td, int dirfd, int fd, + struct sockaddr *sa); +int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp); +int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, + const cpuset_t *maskp); +int kern_cpuset_getid(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, cpusetid_t *setid); +int kern_cpuset_setid(struct thread *td, cpuwhich_t which, + id_t id, cpusetid_t setid); +int kern_dup(struct thread *td, u_int mode, int flags, int old, int new); int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p); int kern_fchmodat(struct thread *td, int fd, char *path, @@ -103,8 +113,10 @@ int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg); int kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf); int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf); +int kern_fpathconf(struct thread *td, 
int fd, int name); int kern_fstat(struct thread *td, int fd, struct stat *sbp); int kern_fstatfs(struct thread *td, int fd, struct statfs *buf); +int kern_fsync(struct thread *td, int fd, bool fullsync); int kern_ftruncate(struct thread *td, int fd, off_t length); int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg); @@ -113,9 +125,9 @@ int kern_getdirentries(struct thread *td, int fd, char *buf, u_int count, long *basep, ssize_t *residp, enum uio_seg bufseg); int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, - enum uio_seg bufseg, int flags); -int kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups); + size_t *countp, enum uio_seg bufseg, int mode); int kern_getitimer(struct thread *, u_int, struct itimerval *); +int kern_getppid(struct thread *); int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen); int kern_getrusage(struct thread *td, int who, struct rusage *rup); @@ -129,52 +141,56 @@ int kern_jail_set(struct thread *td, struct uio *options, int flags); int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); +int kern_kevent_anonymous(struct thread *td, int nevents, + struct kevent_copyops *k_ops); int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); -int kern_kqueue(struct thread *td, int flags); +int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps); int kern_kldload(struct thread *td, const char *file, int *fileid); int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); int kern_kldunload(struct thread *td, int fileid, int flags); -int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, - int uid, int gid); -int kern_link(struct thread *td, char *path, char *link, - enum uio_seg segflg); int kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2, enum uio_seg segflg, int follow); -int kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, - struct stat *sbp); +int kern_listen(struct thread *td, int s, int backlog); +int kern_lseek(struct thread *td, int fd, off_t offset, int whence); int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); -int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, - int mode); +int kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav); +int kern_mincore(struct thread *td, uintptr_t addr, size_t len, char *vec); int kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg, int mode); -int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, - int mode); int kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode); -int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, - int mode, int dev); int kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode, int dev); +int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr, + size_t len); +int kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot, + int flags, int fd, off_t pos); +int kern_mmap_fpcheck(struct thread *td, uintptr_t addr, size_t len, + int prot, int flags, int fd, off_t pos, + mmap_check_fp_fn check_fp_fn); +int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot); int kern_msgctl(struct thread *, int, int, struct msqid_ds *); 
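The kern_*() helpers above take already-decoded arguments, which is what lets compatibility ABIs (linux, freebsd32, cloudabi) reuse them without bouncing through userland copies. A minimal hypothetical sketch of that pattern using kern_lseek(); the foo_* names and the argument struct are invented for illustration only.

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>

/* Invented argument layout for a hypothetical foreign ABI. */
struct foo_lseek_args {
    int     fd;
    off_t   offset;
    int     whence;
};

static int
foo_lseek(struct thread *td, struct foo_lseek_args *uap)
{

    /* kern_lseek() leaves the resulting offset in td->td_retval[]. */
    return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
}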
int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *); int kern_msgsnd(struct thread *, int, const void *, size_t, int, long); +int kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags); +int kern_munlock(struct thread *td, uintptr_t addr, size_t size); +int kern_munmap(struct thread *td, uintptr_t addr, size_t size); int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt); int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff); -int kern_open(struct thread *td, char *path, enum uio_seg pathseg, - int flags, int mode); int kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flags, int mode); int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name, u_long flags); -int kern_pipe(struct thread *td, int fildes[2]); -int kern_pipe2(struct thread *td, int fildes[2], int flags); +int kern_pipe(struct thread *td, int fildes[2], int flags, + struct filecaps *fcaps1, struct filecaps *fcaps2); int kern_poll(struct thread *td, struct pollfd *fds, u_int nfds, struct timespec *tsp, sigset_t *uset); +int kern_posix_error(struct thread *td, int error); int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice); int kern_posix_fallocate(struct thread *td, int fd, off_t offset, @@ -181,24 +197,23 @@ off_t len); int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, void *data); +int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, + off_t offset); int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset); int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits); int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data); +int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, + off_t offset); int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset); -int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, - char *buf, enum uio_seg bufseg, size_t count); int kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count); int kern_readv(struct thread *td, int fd, struct uio *auio); int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp); -int kern_rename(struct thread *td, char *from, char *to, - enum uio_seg pathseg); int kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new, enum uio_seg pathseg); -int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg); int kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg); int kern_sched_getparam(struct thread *td, struct thread *targettd, @@ -229,11 +244,14 @@ void *optval, enum uio_seg valseg, socklen_t valsize); int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp); +int kern_shm_open(struct thread *td, const char *userpath, int flags, + mode_t mode, struct filecaps *fcaps); int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg); int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz); -int kern_sigaction(struct thread *td, int sig, struct sigaction *act, +int kern_shutdown(struct thread *td, int s, int how); +int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags); int kern_sigaltstack(struct thread *td, 
stack_t *ss, stack_t *oss); int kern_sigprocmask(struct thread *td, int how, @@ -241,19 +259,14 @@ int kern_sigsuspend(struct thread *td, sigset_t mask); int kern_sigtimedwait(struct thread *td, sigset_t waitset, struct ksiginfo *ksi, struct timespec *timeout); -int kern_stat(struct thread *td, char *path, enum uio_seg pathseg, - struct stat *sbp); int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value); +int kern_socket(struct thread *td, int domain, int type, int protocol); int kern_statat(struct thread *td, int flag, int fd, char *path, - enum uio_seg pathseg, struct stat *sbp); -int kern_statat_vnhook(struct thread *td, int flag, int fd, char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)); int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf); -int kern_symlink(struct thread *td, char *path, char *link, - enum uio_seg segflg); int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, enum uio_seg segflg); int kern_ktimer_create(struct thread *td, clockid_t clock_id, @@ -270,11 +283,8 @@ int kern_thr_suspend(struct thread *td, struct timespec *tsp); int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length); -int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg); int kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, ino_t oldinum); -int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, - struct timeval *tptr, enum uio_seg tptrseg); int kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); int kern_utimensat(struct thread *td, int fd, char *path, From laffer1 at midnightbsd.org Sun Feb 9 12:05:27 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:05:27 -0500 (EST) Subject: [Midnightbsd-cvs] src [12347] trunk/sys/sys/timex.h: sync with FreeBSD 11-stable Message-ID: <202002091705.019H5RcO077226@stargazer.midnightbsd.org> Revision: 12347 http://svnweb.midnightbsd.org/src/?rev=12347 Author: laffer1 Date: 2020-02-09 12:05:26 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/timex.h Modified: trunk/sys/sys/timex.h =================================================================== --- trunk/sys/sys/timex.h 2020-02-09 17:03:29 UTC (rev 12346) +++ trunk/sys/sys/timex.h 2020-02-09 17:05:26 UTC (rev 12347) @@ -3,6 +3,7 @@ *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * + * Copyright (c) Poul-Henning Kamp 2000-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * @@ -16,94 +17,30 @@ * purpose. It is provided "as is" without express or implied * * warranty. * * * - **********************************************************************/ - -/* - * Modification history timex.h + *********************************************************************** * - * 16 Aug 00 David L. Mills - * API Version 4. Added MOD_TAI and tai member of ntptimeval - * structure. + * $FreeBSD: stable/11/sys/sys/timex.h 298981 2016-05-03 15:14:17Z pfg $ * - * 17 Nov 98 David L. Mills - * Revised for nanosecond kernel and user interface. - * - * 26 Sep 94 David L. Mills - * Added defines for hybrid phase/frequency-lock loop. - * - * 19 Mar 94 David L. 
Mills - * Moved defines from kernel routines to header file and added new - * defines for PPS phase-lock loop. - * - * 20 Feb 94 David L. Mills - * Revised status codes and structures for external clock and PPS - * signal discipline. - * - * 28 Nov 93 David L. Mills - * Adjusted parameters to improve stability and increase poll - * interval. - * - * 17 Sep 93 David L. Mills - * Created file - * - * $FreeBSD: stable/10/sys/sys/timex.h 250889 2013-05-21 21:50:11Z ed $ - */ -/* * This header file defines the Network Time Protocol (NTP) interfaces - * for user and daemon application programs. These are implemented using - * defined syscalls and data structures and require specific kernel - * support. + * for user and daemon application programs. * - * The original precision time kernels developed from 1993 have an - * ultimate resolution of one microsecond; however, the most recent - * kernels have an ultimate resolution of one nanosecond. In these - * kernels, a ntp_adjtime() syscalls can be used to determine which - * resolution is in use and to select either one at any time. The - * resolution selected affects the scaling of certain fields in the - * ntp_gettime() and ntp_adjtime() syscalls, as described below. + * This file was originally created 17 Sep 93 by David L. Mills, Professor + * of University of Delaware, building on work which had already been ongoing + * for a decade and a half at that point in time. * - * NAME - * ntp_gettime - NTP user application interface + * In 2000 the APIs got a upgrade from microseconds to nanoseconds, + * a joint work between Poul-Henning Kamp and David L. Mills. * - * SYNOPSIS - * #include <sys/timex.h> - * - * int ntp_gettime(struct ntptimeval *ntv); - * - * DESCRIPTION - * The time returned by ntp_gettime() is in a timespec structure, - * but may be in either microsecond (seconds and microseconds) or - * nanosecond (seconds and nanoseconds) format. The particular - * format in use is determined by the STA_NANO bit of the status - * word returned by the ntp_adjtime() syscall. - * - * NAME - * ntp_adjtime - NTP daemon application interface - * - * SYNOPSIS - * #include <sys/timex.h> - * #include <sys/syscall.h> - * - * int syscall(SYS_ntp_adjtime, tptr); - * int SYS_ntp_adjtime; - * struct timex *tptr; - * - * DESCRIPTION - * Certain fields of the timex structure are interpreted in either - * microseconds or nanoseconds according to the state of the - * STA_NANO bit in the status word. See the description below for - * further information. */ + #ifndef _SYS_TIMEX_H_ #define _SYS_TIMEX_H_ 1 -#define NTP_API 4 /* NTP API version */ +#define NTP_API 4 /* NTP API version */ + #ifdef __MidnightBSD__ #include <sys/_timespec.h> #endif /* __MidnightBSD__ */ -#ifndef MSDOS /* Microsoft specific */ -#include <sys/syscall.h> -#endif /* MSDOS */ /* * The following defines establish the performance envelope of the @@ -114,98 +51,93 @@ * mode. Between these two limits the operating mode is selected by the * STA_FLL bit in the status word. 
*/ -#define MAXPHASE 500000000L /* max phase error (ns) */ -#define MAXFREQ 500000L /* max freq error (ns/s) */ -#define MINSEC 256 /* min FLL update interval (s) */ -#define MAXSEC 2048 /* max PLL update interval (s) */ -#define NANOSECOND 1000000000L /* nanoseconds in one second */ -#define SCALE_PPM (65536 / 1000) /* crude ns/s to scaled PPM */ -#define MAXTC 10 /* max time constant */ +#define MAXPHASE 500000000L /* max phase error (ns) */ +#define MAXFREQ 500000L /* max freq error (ns/s) */ +#define MINSEC 256 /* min FLL update interval (s) */ +#define MAXSEC 2048 /* max PLL update interval (s) */ +#define NANOSECOND 1000000000L /* nanoseconds in one second */ +#define SCALE_PPM (65536 / 1000) /* crude ns/s to scaled PPM */ +#define MAXTC 10 /* max time constant */ + /* - * The following defines and structures define the user interface for - * the ntp_gettime() and ntp_adjtime() syscalls. - * * Control mode codes (timex.modes) */ -#define MOD_OFFSET 0x0001 /* set time offset */ -#define MOD_FREQUENCY 0x0002 /* set frequency offset */ -#define MOD_MAXERROR 0x0004 /* set maximum time error */ -#define MOD_ESTERROR 0x0008 /* set estimated time error */ -#define MOD_STATUS 0x0010 /* set clock status bits */ -#define MOD_TIMECONST 0x0020 /* set PLL time constant */ -#define MOD_PPSMAX 0x0040 /* set PPS maximum averaging time */ -#define MOD_TAI 0x0080 /* set TAI offset */ -#define MOD_MICRO 0x1000 /* select microsecond resolution */ -#define MOD_NANO 0x2000 /* select nanosecond resolution */ -#define MOD_CLKB 0x4000 /* select clock B */ -#define MOD_CLKA 0x8000 /* select clock A */ +#define MOD_OFFSET 0x0001 /* set time offset */ +#define MOD_FREQUENCY 0x0002 /* set frequency offset */ +#define MOD_MAXERROR 0x0004 /* set maximum time error */ +#define MOD_ESTERROR 0x0008 /* set estimated time error */ +#define MOD_STATUS 0x0010 /* set clock status bits */ +#define MOD_TIMECONST 0x0020 /* set PLL time constant */ +#define MOD_PPSMAX 0x0040 /* set PPS maximum averaging time */ +#define MOD_TAI 0x0080 /* set TAI offset */ +#define MOD_MICRO 0x1000 /* select microsecond resolution */ +#define MOD_NANO 0x2000 /* select nanosecond resolution */ +#define MOD_CLKB 0x4000 /* select clock B */ +#define MOD_CLKA 0x8000 /* select clock A */ /* * Status codes (timex.status) */ -#define STA_PLL 0x0001 /* enable PLL updates (rw) */ -#define STA_PPSFREQ 0x0002 /* enable PPS freq discipline (rw) */ -#define STA_PPSTIME 0x0004 /* enable PPS time discipline (rw) */ -#define STA_FLL 0x0008 /* enable FLL mode (rw) */ -#define STA_INS 0x0010 /* insert leap (rw) */ -#define STA_DEL 0x0020 /* delete leap (rw) */ -#define STA_UNSYNC 0x0040 /* clock unsynchronized (rw) */ -#define STA_FREQHOLD 0x0080 /* hold frequency (rw) */ -#define STA_PPSSIGNAL 0x0100 /* PPS signal present (ro) */ -#define STA_PPSJITTER 0x0200 /* PPS signal jitter exceeded (ro) */ -#define STA_PPSWANDER 0x0400 /* PPS signal wander exceeded (ro) */ -#define STA_PPSERROR 0x0800 /* PPS signal calibration error (ro) */ -#define STA_CLOCKERR 0x1000 /* clock hardware fault (ro) */ -#define STA_NANO 0x2000 /* resolution (0 = us, 1 = ns) (ro) */ -#define STA_MODE 0x4000 /* mode (0 = PLL, 1 = FLL) (ro) */ -#define STA_CLK 0x8000 /* clock source (0 = A, 1 = B) (ro) */ +#define STA_PLL 0x0001 /* enable PLL updates (rw) */ +#define STA_PPSFREQ 0x0002 /* enable PPS freq discipline (rw) */ +#define STA_PPSTIME 0x0004 /* enable PPS time discipline (rw) */ +#define STA_FLL 0x0008 /* enable FLL mode (rw) */ +#define STA_INS 0x0010 /* insert leap (rw) */ 
+#define STA_DEL 0x0020 /* delete leap (rw) */ +#define STA_UNSYNC 0x0040 /* clock unsynchronized (rw) */ +#define STA_FREQHOLD 0x0080 /* hold frequency (rw) */ +#define STA_PPSSIGNAL 0x0100 /* PPS signal present (ro) */ +#define STA_PPSJITTER 0x0200 /* PPS signal jitter exceeded (ro) */ +#define STA_PPSWANDER 0x0400 /* PPS signal wander exceeded (ro) */ +#define STA_PPSERROR 0x0800 /* PPS signal calibration error (ro) */ +#define STA_CLOCKERR 0x1000 /* clock hardware fault (ro) */ +#define STA_NANO 0x2000 /* resolution (0 = us, 1 = ns) (ro) */ +#define STA_MODE 0x4000 /* mode (0 = PLL, 1 = FLL) (ro) */ +#define STA_CLK 0x8000 /* clock source (0 = A, 1 = B) (ro) */ #define STA_RONLY (STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | \ STA_PPSERROR | STA_CLOCKERR | STA_NANO | STA_MODE | STA_CLK) /* - * Clock states (time_state) + * Clock states (ntptimeval.time_state) */ -#define TIME_OK 0 /* no leap second warning */ -#define TIME_INS 1 /* insert leap second warning */ -#define TIME_DEL 2 /* delete leap second warning */ -#define TIME_OOP 3 /* leap second in progress */ -#define TIME_WAIT 4 /* leap second has occured */ -#define TIME_ERROR 5 /* error (see status word) */ +#define TIME_OK 0 /* no leap second warning */ +#define TIME_INS 1 /* insert leap second warning */ +#define TIME_DEL 2 /* delete leap second warning */ +#define TIME_OOP 3 /* leap second in progress */ +#define TIME_WAIT 4 /* leap second has occurred */ +#define TIME_ERROR 5 /* error (see status word) */ /* - * NTP user interface (ntp_gettime()) - used to read kernel clock values - * - * Note: The time member is in microseconds if STA_NANO is zero and - * nanoseconds if not. + * NTP user interface -- ntp_gettime(2) - used to read kernel clock values */ struct ntptimeval { - struct timespec time; /* current time (ns) (ro) */ - long maxerror; /* maximum error (us) (ro) */ - long esterror; /* estimated error (us) (ro) */ - long tai; /* TAI offset */ - int time_state; /* time status */ + struct timespec time; /* current time (ns) (ro) */ + long maxerror; /* maximum error (us) (ro) */ + long esterror; /* estimated error (us) (ro) */ + long tai; /* TAI offset */ + int time_state; /* time status */ }; /* - * NTP daemon interface (ntp_adjtime()) - used to discipline CPU clock - * oscillator and determine status. + * NTP daemon interface -- ntp_adjtime(2) -- used to discipline CPU clock + * oscillator and control/determine status. * * Note: The offset, precision and jitter members are in microseconds if * STA_NANO is zero and nanoseconds if not. 
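Per the note above, whether these fields are in micro- or nanoseconds is reported through STA_NANO. A minimal userland sketch that queries it; ntp_adjtime() with modes set to 0 is a read-only probe, so nothing is changed.

#include <sys/timex.h>

#include <stdio.h>
#include <string.h>

int
main(void)
{
    struct ntptimeval ntv;
    struct timex tx;

    if (ntp_gettime(&ntv) < 0)
        return (1);
    printf("time_state %d, maxerror %ld us, tai %ld\n",
        ntv.time_state, ntv.maxerror, ntv.tai);

    memset(&tx, 0, sizeof(tx));            /* modes == 0: query only */
    if (ntp_adjtime(&tx) < 0)
        return (1);
    printf("kernel reports %s resolution\n",
        (tx.status & STA_NANO) ? "nanosecond" : "microsecond");
    return (0);
}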
*/ struct timex { - unsigned int modes; /* clock mode bits (wo) */ - long offset; /* time offset (ns/us) (rw) */ - long freq; /* frequency offset (scaled PPM) (rw) */ - long maxerror; /* maximum error (us) (rw) */ - long esterror; /* estimated error (us) (rw) */ - int status; /* clock status bits (rw) */ - long constant; /* poll interval (log2 s) (rw) */ - long precision; /* clock precision (ns/us) (ro) */ - long tolerance; /* clock frequency tolerance (scaled - * PPM) (ro) */ + unsigned int modes; /* clock mode bits (wo) */ + long offset; /* time offset (ns/us) (rw) */ + long freq; /* frequency offset (scaled PPM) (rw) */ + long maxerror; /* maximum error (us) (rw) */ + long esterror; /* estimated error (us) (rw) */ + int status; /* clock status bits (rw) */ + long constant; /* poll interval (log2 s) (rw) */ + long precision; /* clock precision (ns/us) (ro) */ + long tolerance; /* clock frequency tolerance (scaled + * PPM) (ro) */ /* * The following read-only structure members are implemented * only if the PPS signal discipline is configured in the @@ -212,14 +144,14 @@ * kernel. They are included in all configurations to insure * portability. */ - long ppsfreq; /* PPS frequency (scaled PPM) (ro) */ - long jitter; /* PPS jitter (ns/us) (ro) */ - int shift; /* interval duration (s) (shift) (ro) */ - long stabil; /* PPS stability (scaled PPM) (ro) */ - long jitcnt; /* jitter limit exceeded (ro) */ - long calcnt; /* calibration intervals (ro) */ - long errcnt; /* calibration errors (ro) */ - long stbcnt; /* stability limit exceeded (ro) */ + long ppsfreq; /* PPS frequency (scaled PPM) (ro) */ + long jitter; /* PPS jitter (ns/us) (ro) */ + int shift; /* interval duration (s) (shift) (ro) */ + long stabil; /* PPS stability (scaled PPM) (ro) */ + long jitcnt; /* jitter limit exceeded (ro) */ + long calcnt; /* calibration intervals (ro) */ + long errcnt; /* calibration errors (ro) */ + long stbcnt; /* stability limit exceeded (ro) */ }; #ifdef __MidnightBSD__ From laffer1 at midnightbsd.org Sun Feb 9 12:06:02 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:06:02 -0500 (EST) Subject: [Midnightbsd-cvs] src [12348] trunk/sys/sys/sysctl.h: sync with FreeBSD 11-stable Message-ID: <202002091706.019H62nM077308@stargazer.midnightbsd.org> Revision: 12348 http://svnweb.midnightbsd.org/src/?rev=12348 Author: laffer1 Date: 2020-02-09 12:06:01 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sysctl.h Modified: trunk/sys/sys/sysctl.h =================================================================== --- trunk/sys/sys/sysctl.h 2020-02-09 17:05:26 UTC (rev 12347) +++ trunk/sys/sys/sysctl.h 2020-02-09 17:06:01 UTC (rev 12348) @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * @(#)sysctl.h 8.1 (Berkeley) 6/2/93 - * $FreeBSD: stable/10/sys/sys/sysctl.h 324749 2017-10-19 08:00:34Z avg $ + * $FreeBSD: stable/11/sys/sys/sysctl.h 354762 2019-11-16 00:33:02Z scottl $ */ #ifndef _SYS_SYSCTL_H_ @@ -74,6 +74,12 @@ #define CTLTYPE_LONG 7 /* name describes a long */ #define CTLTYPE_ULONG 8 /* name describes an unsigned long */ #define CTLTYPE_U64 9 /* name describes an unsigned 64-bit number */ +#define CTLTYPE_U8 0xa /* name describes an unsigned 8-bit number */ +#define CTLTYPE_U16 0xb /* name describes an unsigned 16-bit number */ +#define CTLTYPE_S8 0xc /* name describes a signed 8-bit number */ +#define CTLTYPE_S16 0xd /* name describes a signed 16-bit number */ +#define CTLTYPE_S32 0xe /* name describes a signed 32-bit number */ +#define CTLTYPE_U32 0xf /* name describes an unsigned 32-bit number */ #define CTLFLAG_RD 0x80000000 /* Allow reads of variable */ #define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */ @@ -85,7 +91,7 @@ #define CTLFLAG_DYN 0x02000000 /* Dynamic oid - can be freed */ #define CTLFLAG_SKIP 0x01000000 /* Skip this sysctl when listing */ #define CTLMASK_SECURE 0x00F00000 /* Secure level */ -#define CTLFLAG_TUN 0x00080000 /* Tunable variable */ +#define CTLFLAG_TUN 0x00080000 /* Default value is loaded from getenv() */ #define CTLFLAG_RDTUN (CTLFLAG_RD|CTLFLAG_TUN) #define CTLFLAG_RWTUN (CTLFLAG_RW|CTLFLAG_TUN) #define CTLFLAG_MPSAFE 0x00040000 /* Handler is MP safe */ @@ -133,7 +139,7 @@ #endif #define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ - intptr_t arg2, struct sysctl_req *req + intmax_t arg2, struct sysctl_req *req /* definitions for sysctl_req 'lock' member */ #define REQ_UNWIRED 1 @@ -140,7 +146,7 @@ #define REQ_WIRED 2 /* definitions for sysctl_req 'flags' member */ -#if defined(__amd64__) || defined(__ia64__) || defined(__powerpc64__) ||\ +#if defined(__amd64__) || defined(__powerpc64__) ||\ (defined(__mips__) && defined(__mips_n64)) #define SCTL_MASK32 1 /* 32 bit emulation */ #endif @@ -171,12 +177,13 @@ * be hidden behind it, expanded by the handler. 
*/ struct sysctl_oid { + struct sysctl_oid_list oid_children; struct sysctl_oid_list *oid_parent; SLIST_ENTRY(sysctl_oid) oid_link; int oid_number; u_int oid_kind; void *oid_arg1; - intptr_t oid_arg2; + intmax_t oid_arg2; const char *oid_name; int (*oid_handler)(SYSCTL_HANDLER_ARGS); const char *oid_fmt; @@ -187,18 +194,26 @@ #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) +#define SYSCTL_OUT_STR(r, p) (r->oldfunc)(r, p, strlen(p) + 1) +int sysctl_handle_bool(SYSCTL_HANDLER_ARGS); +int sysctl_handle_8(SYSCTL_HANDLER_ARGS); +int sysctl_handle_16(SYSCTL_HANDLER_ARGS); +int sysctl_handle_32(SYSCTL_HANDLER_ARGS); +int sysctl_handle_64(SYSCTL_HANDLER_ARGS); int sysctl_handle_int(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS); int sysctl_handle_long(SYSCTL_HANDLER_ARGS); -int sysctl_handle_64(SYSCTL_HANDLER_ARGS); int sysctl_handle_string(SYSCTL_HANDLER_ARGS); int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS); +int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS); +int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS); + int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS); @@ -212,15 +227,16 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); /* Declare a static oid to allow child oids to be added to it. */ -#define SYSCTL_DECL(name) \ - extern struct sysctl_oid_list sysctl_##name##_children +#define SYSCTL_DECL(name) \ + extern struct sysctl_oid sysctl__##name /* Hide these in macros. */ -#define SYSCTL_CHILDREN(oid_ptr) \ - (struct sysctl_oid_list *)(oid_ptr)->oid_arg1 -#define SYSCTL_PARENT(oid_ptr) NULL /* not supported */ -#define SYSCTL_CHILDREN_SET(oid_ptr, val) (oid_ptr)->oid_arg1 = (val) -#define SYSCTL_STATIC_CHILDREN(oid_name) (&sysctl_##oid_name##_children) +#define SYSCTL_CHILDREN(oid_ptr) (&(oid_ptr)->oid_children) +#define SYSCTL_PARENT(oid_ptr) \ + (((oid_ptr)->oid_parent != &sysctl__children) ? \ + __containerof((oid_ptr)->oid_parent, struct sysctl_oid, \ + oid_children) : (struct sysctl_oid *)NULL) +#define SYSCTL_STATIC_CHILDREN(oid_name) (&sysctl__##oid_name.oid_children) /* === Structs and macros related to context handling. === */ @@ -233,7 +249,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); #define SYSCTL_NODE_CHILDREN(parent, name) \ - sysctl_##parent##_##name##_children + sysctl__##parent##_##name.oid_children #ifndef NO_SYSCTL_DESCR #define __DESCR(d) d @@ -241,44 +257,52 @@ #define __DESCR(d) "" #endif -/* This constructs a "raw" MIB oid. 
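Handlers keep spelling their prototype as SYSCTL_HANDLER_ARGS, so the arg2 type change above is transparent at the source level. A minimal handler sketch with invented names, registered through the stock SYSCTL_PROC() macro defined elsewhere in this header; it exports an int read/write and silently clamps new values to 1..16.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int example_limit = 4;

static int
sysctl_example_limit(SYSCTL_HANDLER_ARGS)
{
    int error, val;

    val = example_limit;
    error = sysctl_handle_int(oidp, &val, 0, req);
    if (error != 0 || req->newptr == NULL)
        return (error);
    if (val < 1)
        val = 1;
    else if (val > 16)
        val = 16;
    example_limit = val;
    return (0);
}
SYSCTL_PROC(_kern, OID_AUTO, example_limit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_example_limit, "I", "illustrative clamped tunable");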
*/ -#define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr)\ - static struct sysctl_oid sysctl__##parent##_##name = { \ - &sysctl_##parent##_children, \ - { NULL }, \ - nbr, \ - kind, \ - a1, \ - a2, \ - #name, \ - handler, \ - fmt, \ - 0, \ - 0, \ - __DESCR(descr) \ - }; \ - DATA_SET(sysctl_set, sysctl__##parent##_##name) +/* This macro is only for internal use */ +#define SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr) \ + struct sysctl_oid id = { \ + .oid_parent = (parent_child_head), \ + .oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \ + .oid_number = (nbr), \ + .oid_kind = (kind), \ + .oid_arg1 = (a1), \ + .oid_arg2 = (a2), \ + .oid_name = (name), \ + .oid_handler = (handler), \ + .oid_fmt = (fmt), \ + .oid_descr = __DESCR(descr) \ + }; \ + DATA_SET(sysctl_set, id) +/* This constructs a static "raw" MIB oid. */ +#define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ + static SYSCTL_OID_RAW(sysctl__##parent##_##name, \ + SYSCTL_CHILDREN(&sysctl__##parent), \ + nbr, #name, kind, a1, a2, handler, fmt, descr) + +/* This constructs a global "raw" MIB oid. */ +#define SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ + SYSCTL_OID_RAW(sysctl__##parent##_##name, \ + SYSCTL_CHILDREN(&sysctl__##parent), \ + nbr, #name, kind, a1, a2, handler, fmt, descr) + #define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr)) /* This constructs a root node from which other nodes can hang. */ -#define SYSCTL_ROOT_NODE(nbr, name, access, handler, descr) \ - SYSCTL_NODE(, nbr, name, access, handler, descr); \ +#define SYSCTL_ROOT_NODE(nbr, name, access, handler, descr) \ + SYSCTL_OID_RAW(sysctl___##name, &sysctl__children, \ + nbr, #name, CTLTYPE_NODE|(access), NULL, 0, \ + handler, "N", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) /* This constructs a node from which other oids can hang. */ -#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ - struct sysctl_oid_list SYSCTL_NODE_CHILDREN(parent, name); \ - SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|(access), \ - (void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, "N", descr); \ +#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ + SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access), \ + NULL, 0, handler, "N", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) -#define SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr) \ - SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(), nbr, name, access, handler, descr) - #define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ @@ -287,6 +311,15 @@ NULL, 0, handler, "N", __DESCR(descr)); \ }) +#define SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr) \ +({ \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE); \ + sysctl_add_oid(ctx, &sysctl__children, nbr, name, \ + CTLTYPE_NODE|(access), \ + NULL, 0, handler, "N", __DESCR(descr)); \ +}) + /* Oid for a string. len can be 0 to indicate '\0' termination. 
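The context API is the usual consumer of these ADD macros, since sysctl_ctx_free() tears down everything added through the context in one call. A small sketch of that attach/detach pairing; the mydev names are invented for illustration.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static struct sysctl_ctx_list mydev_ctx;

static int
mydev_sysctl_attach(void)
{
    struct sysctl_oid *tree;

    sysctl_ctx_init(&mydev_ctx);
    tree = SYSCTL_ADD_NODE(&mydev_ctx, SYSCTL_STATIC_CHILDREN(_hw),
        OID_AUTO, "mydev", CTLFLAG_RD, NULL, "illustrative subtree");
    if (tree == NULL) {
        sysctl_ctx_free(&mydev_ctx);
        return (ENOMEM);
    }
    return (0);
}

static void
mydev_sysctl_detach(void)
{

    /* Removes every oid added through the context, children first. */
    sysctl_ctx_free(&mydev_ctx);
}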
*/ #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ @@ -303,6 +336,202 @@ __arg, len, sysctl_handle_string, "A", __DESCR(descr)); \ }) +/* Oid for a constant '\0' terminated string. */ +#define SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr) \ + SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ + __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \ + CTASSERT(!(access & CTLFLAG_WR)); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) + +#define SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \ +({ \ + char *__arg = __DECONST(char *, arg); \ + CTASSERT(!(access & CTLFLAG_WR)); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ + sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \ + __arg, 0, sysctl_handle_string, "A", __DESCR(descr)); \ +}) + +/* Oid for a bool. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_BOOL_PTR ((bool *)NULL) +#define SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_bool, "CU", descr); \ + CTASSERT(((access) & CTLTYPE) == 0 && \ + sizeof(bool) == sizeof(*(ptr))) + +#define SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + bool *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr)); \ +}) + +/* Oid for a signed 8-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_S8_PTR ((int8_t *)NULL) +#define SYSCTL_S8(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_8, "C", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \ + sizeof(int8_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + int8_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_8, "C", __DESCR(descr)); \ +}) + +/* Oid for an unsigned 8-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_U8_PTR ((uint8_t *)NULL) +#define SYSCTL_U8(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_8, "CU", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \ + sizeof(uint8_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + uint8_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_8, "CU", __DESCR(descr)); \ +}) + +/* Oid for a signed 16-bit int. If ptr is NULL, val is returned. 
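For purely static trees the declarations stay one-liners. A short sketch that creates a subtree and hangs a read-only string off it, using the node and string macros shown above; the example names are invented.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

SYSCTL_NODE(_kern, OID_AUTO, example, CTLFLAG_RD, NULL,
    "illustrative subtree");

static char example_version[] = "1.0";
SYSCTL_STRING(_kern_example, OID_AUTO, version, CTLFLAG_RD,
    example_version, 0, "illustrative version string");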
*/ +#define SYSCTL_NULL_S16_PTR ((int16_t *)NULL) +#define SYSCTL_S16(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_16, "S", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \ + sizeof(int16_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + int16_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_16, "S", __DESCR(descr)); \ +}) + +/* Oid for an unsigned 16-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_U16_PTR ((uint16_t *)NULL) +#define SYSCTL_U16(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_16, "SU", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \ + sizeof(uint16_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + uint16_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_16, "SU", __DESCR(descr)); \ +}) + +/* Oid for a signed 32-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_S32_PTR ((int32_t *)NULL) +#define SYSCTL_S32(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_32, "I", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \ + sizeof(int32_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + int32_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_32, "I", __DESCR(descr)); \ +}) + +/* Oid for an unsigned 32-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_U32_PTR ((uint32_t *)NULL) +#define SYSCTL_U32(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_32, "IU", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \ + sizeof(uint32_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + uint32_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_32, "IU", __DESCR(descr)); \ +}) + +/* Oid for a signed 64-bit int. If ptr is NULL, val is returned. 
*/ +#define SYSCTL_NULL_S64_PTR ((int64_t *)NULL) +#define SYSCTL_S64(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_64, "Q", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ + sizeof(int64_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + int64_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_64, "Q", __DESCR(descr)); \ +}) + +/* Oid for an unsigned 64-bit int. If ptr is NULL, val is returned. */ +#define SYSCTL_NULL_U64_PTR ((uint64_t *)NULL) +#define SYSCTL_U64(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ + ptr, val, sysctl_handle_64, "QU", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ + sizeof(uint64_t) == sizeof(*(ptr))) + +#define SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr) \ +({ \ + uint64_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ + __ptr, val, sysctl_handle_64, "QU", __DESCR(descr)); \ +}) + /* Oid for an int. If ptr is SYSCTL_NULL_INT_PTR, val is returned. */ #define SYSCTL_NULL_INT_PTR ((int *)NULL) #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \ @@ -309,9 +538,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "I", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ - CTASSERT(sizeof(int) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \ + sizeof(int) == sizeof(*(ptr))) #define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ @@ -329,9 +558,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "IU", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT);\ - CTASSERT(sizeof(unsigned) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \ + sizeof(unsigned) == sizeof(*(ptr))) #define SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ @@ -349,9 +578,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_LONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "L", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG);\ - CTASSERT(sizeof(long) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \ + sizeof(long) == sizeof(*(ptr))) #define SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ @@ -369,9 +598,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "LU", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG); \ - CTASSERT(sizeof(unsigned long) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ 
+ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) && \ + sizeof(unsigned long) == sizeof(*(ptr))) #define SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ @@ -389,9 +618,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "Q", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ - CTASSERT(sizeof(int64_t) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ + sizeof(int64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ @@ -408,9 +637,9 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "QU", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ - CTASSERT(sizeof(uint64_t) == sizeof(*(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ + sizeof(uint64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ @@ -426,9 +655,9 @@ #define SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ struct sysctl_oid *__ret; \ - CTASSERT(sizeof(uint64_t) == sizeof(*(ptr)) || \ - sizeof(unsigned) == sizeof(*(ptr))); \ - CTASSERT(((access) & CTLTYPE) == 0); \ + CTASSERT((sizeof(uint64_t) == sizeof(*(ptr)) || \ + sizeof(unsigned) == sizeof(*(ptr))) && \ + ((access) & CTLTYPE) == 0); \ if (sizeof(uint64_t) == sizeof(*(ptr))) { \ __ret = sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ @@ -448,10 +677,10 @@ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_counter_u64, "QU", descr); \ - CTASSERT(((access) & CTLTYPE) == 0 || \ - ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ - CTASSERT(sizeof(counter_u64_t) == sizeof(*(ptr))); \ - CTASSERT(sizeof(uint64_t) == sizeof(**(ptr))) + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ + sizeof(counter_u64_t) == sizeof(*(ptr)) && \ + sizeof(uint64_t) == sizeof(**(ptr))) #define SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ @@ -463,6 +692,28 @@ __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr)); \ }) +/* Oid for an array of counter(9)s. The pointer and length must be non zero. */ +#define SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ + (ptr), (len), sysctl_handle_counter_u64_array, "S", descr); \ + CTASSERT((((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) && \ + sizeof(counter_u64_t) == sizeof(*(ptr)) && \ + sizeof(uint64_t) == sizeof(**(ptr))) + +#define SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access, \ + ptr, len, descr) \ +({ \ + counter_u64_t *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ + __ptr, len, sysctl_handle_counter_u64_array, "S", \ + __DESCR(descr)); \ +}) + /* Oid for an opaque object. Specified by a pointer and a length. 
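The fixed-width variants added above let a driver export exactly-sized counters instead of forcing everything through int or long. A brief sketch with one static and one dynamically added leaf; the frob names are invented, and the parent oid and context are assumed to come from the caller.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static uint32_t frob_count;
SYSCTL_U32(_kern, OID_AUTO, frob_count, CTLFLAG_RD,
    &frob_count, 0, "illustrative 32-bit event counter");

static uint64_t frob_bytes;

static void
frob_sysctl_attach(struct sysctl_ctx_list *ctx, struct sysctl_oid *parent)
{

    SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(parent), OID_AUTO, "frob_bytes",
        CTLFLAG_RD, &frob_bytes, 0, "illustrative 64-bit byte count");
}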
*/ #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \ @@ -544,6 +795,24 @@ __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr)); \ }) +/* OID expressing a struct timeval as seconds */ +#define SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ + (ptr), 0, sysctl_sec_to_timeval, "I", descr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) +#define SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \ +({ \ + struct timeval *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ + __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr), \ + NULL); \ +}) + /* * A macro to generate a read-only sysctl to indicate the presence of optional * kernel features. @@ -567,7 +836,6 @@ #define CTL_MACHDEP 7 /* machine dependent */ #define CTL_USER 8 /* user-level */ #define CTL_P1003_1B 9 /* POSIX 1003.1B */ -#define CTL_MAXID 10 /* number of valid top-level ids */ /* * CTL_KERN identifiers @@ -609,7 +877,6 @@ #define KERN_IOV_MAX 35 /* int: value of UIO_MAXIOV */ #define KERN_HOSTUUID 36 /* string: host UUID identifier */ #define KERN_ARND 37 /* int: from arc4rand() */ -#define KERN_MAXID 38 /* number of valid kern ids */ /* * KERN_PROC subtypes */ @@ -644,6 +911,8 @@ #define KERN_PROC_UMASK 39 /* process umask */ #define KERN_PROC_OSREL 40 /* osreldate for process binary */ #define KERN_PROC_SIGTRAMP 41 /* signal trampoline location */ +#define KERN_PROC_CWD 42 /* process current working directory */ +#define KERN_PROC_NFDS 43 /* number of open file descriptors */ /* * KERN_IPC identifiers @@ -671,7 +940,6 @@ #define HW_FLOATINGPT 10 /* int: has HW floating point? 
*/ #define HW_MACHINE_ARCH 11 /* string: machine architecture */ #define HW_REALMEM 12 /* int: 'real' memory */ -#define HW_MAXID 13 /* number of valid hw ids */ /* * CTL_USER definitions @@ -696,7 +964,6 @@ #define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */ #define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */ #define USER_TZNAME_MAX 20 /* int: POSIX2_TZNAME_MAX */ -#define USER_MAXID 21 /* number of valid user ids */ #define CTL_P1003_1B_ASYNCHRONOUS_IO 1 /* boolean */ #define CTL_P1003_1B_MAPPED_FILES 2 /* boolean */ @@ -752,6 +1019,7 @@ SYSCTL_DECL(_hw_bus_devices); SYSCTL_DECL(_hw_bus_info); SYSCTL_DECL(_machdep); +SYSCTL_DECL(_machdep_mitigations); SYSCTL_DECL(_user); SYSCTL_DECL(_compat); SYSCTL_DECL(_regression); @@ -766,7 +1034,7 @@ /* Dynamic oid handling */ struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int nbr, const char *name, int kind, - void *arg1, intptr_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), + void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr); int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse); @@ -794,8 +1062,8 @@ size_t *retval, int flags); int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req); -void sysctl_lock(void); -void sysctl_unlock(void); +void sysctl_wlock(void); +void sysctl_wunlock(void); int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len); struct sbuf; From laffer1 at midnightbsd.org Sun Feb 9 12:07:19 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:07:19 -0500 (EST) Subject: [Midnightbsd-cvs] src [12349] trunk/sys/sys/syscall.mk: sync with FreeBSD 11-stable Message-ID: <202002091707.019H7J60077410@stargazer.midnightbsd.org> Revision: 12349 http://svnweb.midnightbsd.org/src/?rev=12349 Author: laffer1 Date: 2020-02-09 12:07:18 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/syscall.mk Modified: trunk/sys/sys/syscall.mk =================================================================== --- trunk/sys/sys/syscall.mk 2020-02-09 17:06:01 UTC (rev 12348) +++ trunk/sys/sys/syscall.mk 2020-02-09 17:07:18 UTC (rev 12349) @@ -18,7 +18,6 @@ chmod.o \ chown.o \ break.o \ - freebsd4_getfsstat.o \ getpid.o \ mount.o \ unmount.o \ @@ -39,7 +38,7 @@ kill.o \ getppid.o \ dup.o \ - pipe.o \ + freebsd10_pipe.o \ getegid.o \ profil.o \ ktrace.o \ @@ -108,20 +107,13 @@ quotactl.o \ nlm_syscall.o \ nfssvc.o \ - freebsd4_statfs.o \ - freebsd4_fstatfs.o \ lgetfh.o \ getfh.o \ - freebsd4_getdomainname.o \ - freebsd4_setdomainname.o \ - freebsd4_uname.o \ sysarch.o \ rtprio.o \ semsys.o \ msgsys.o \ shmsys.o \ - freebsd6_pread.o \ - freebsd6_pwrite.o \ setfib.o \ ntp_adjtime.o \ setgid.o \ @@ -135,11 +127,7 @@ getrlimit.o \ setrlimit.o \ getdirentries.o \ - freebsd6_mmap.o \ __syscall.o \ - freebsd6_lseek.o \ - freebsd6_truncate.o \ - freebsd6_ftruncate.o \ __sysctl.o \ mlock.o \ munlock.o \ @@ -170,6 +158,7 @@ ffclock_getcounter.o \ ffclock_setestimate.o \ ffclock_getestimate.o \ + clock_nanosleep.o \ clock_getcpuclockid2.o \ ntp_gettime.o \ minherit.o \ @@ -190,7 +179,6 @@ nlstat.o \ preadv.o \ pwritev.o \ - freebsd4_fhstatfs.o \ fhopen.o \ fhstat.o \ modnext.o \ @@ -210,9 +198,6 @@ aio_suspend.o \ aio_cancel.o \ aio_error.o \ - oaio_read.o \ - oaio_write.o \ - olio_listio.o \ yield.o \ mlockall.o \ munlockall.o \ @@ -226,15 +211,12 @@ sched_get_priority_min.o 
\ sched_rr_get_interval.o \ utrace.o \ - freebsd4_sendfile.o \ kldsym.o \ jail.o \ nnpfs_syscall.o \ sigprocmask.o \ sigsuspend.o \ - freebsd4_sigaction.o \ sigpending.o \ - freebsd4_sigreturn.o \ sigtimedwait.o \ sigwaitinfo.o \ __acl_get_file.o \ @@ -307,8 +289,6 @@ thr_exit.o \ thr_self.o \ thr_kill.o \ - _umtx_lock.o \ - _umtx_unlock.o \ jail_attach.o \ extattr_list_fd.o \ extattr_list_file.o \ @@ -413,4 +393,7 @@ procctl.o \ ppoll.o \ futimens.o \ - utimensat.o + utimensat.o \ + numa_getaffinity.o \ + numa_setaffinity.o \ + fdatasync.o From laffer1 at midnightbsd.org Sun Feb 9 12:08:07 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:08:07 -0500 (EST) Subject: [Midnightbsd-cvs] src [12350] trunk/sys/sys/syscall.h: sync with FreeBSD 11-stable Message-ID: <202002091708.019H87Ur077472@stargazer.midnightbsd.org> Revision: 12350 http://svnweb.midnightbsd.org/src/?rev=12350 Author: laffer1 Date: 2020-02-09 12:08:07 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/syscall.h Modified: trunk/sys/sys/syscall.h =================================================================== --- trunk/sys/sys/syscall.h 2020-02-09 17:07:18 UTC (rev 12349) +++ trunk/sys/sys/syscall.h 2020-02-09 17:08:07 UTC (rev 12350) @@ -1,8 +1,8 @@ +/* $MidnightBSD$ */ /* * System call numbers. * * DO NOT EDIT-- this file is automatically generated. - * $MidnightBSD$ */ #define SYS_syscall 0 @@ -23,7 +23,7 @@ #define SYS_chmod 15 #define SYS_chown 16 #define SYS_break 17 -#define SYS_freebsd4_getfsstat 18 + /* 18 is freebsd4 getfsstat */ /* 19 is old lseek */ #define SYS_getpid 20 #define SYS_mount 21 @@ -47,7 +47,7 @@ #define SYS_getppid 39 /* 40 is old lstat */ #define SYS_dup 41 -#define SYS_pipe 42 +#define SYS_freebsd10_pipe 42 #define SYS_getegid 43 #define SYS_profil 44 #define SYS_ktrace 45 @@ -156,20 +156,20 @@ #define SYS_nlm_syscall 154 #define SYS_nfssvc 155 /* 156 is old getdirentries */ -#define SYS_freebsd4_statfs 157 -#define SYS_freebsd4_fstatfs 158 + /* 157 is freebsd4 statfs */ + /* 158 is freebsd4 fstatfs */ #define SYS_lgetfh 160 #define SYS_getfh 161 -#define SYS_freebsd4_getdomainname 162 -#define SYS_freebsd4_setdomainname 163 -#define SYS_freebsd4_uname 164 + /* 162 is freebsd4 getdomainname */ + /* 163 is freebsd4 setdomainname */ + /* 164 is freebsd4 uname */ #define SYS_sysarch 165 #define SYS_rtprio 166 #define SYS_semsys 169 #define SYS_msgsys 170 #define SYS_shmsys 171 -#define SYS_freebsd6_pread 173 -#define SYS_freebsd6_pwrite 174 + /* 173 is freebsd6 pread */ + /* 174 is freebsd6 pwrite */ #define SYS_setfib 175 #define SYS_ntp_adjtime 176 #define SYS_setgid 181 @@ -183,11 +183,11 @@ #define SYS_getrlimit 194 #define SYS_setrlimit 195 #define SYS_getdirentries 196 -#define SYS_freebsd6_mmap 197 + /* 197 is freebsd6 mmap */ #define SYS___syscall 198 -#define SYS_freebsd6_lseek 199 -#define SYS_freebsd6_truncate 200 -#define SYS_freebsd6_ftruncate 201 + /* 199 is freebsd6 lseek */ + /* 200 is freebsd6 truncate */ + /* 201 is freebsd6 ftruncate */ #define SYS___sysctl 202 #define SYS_mlock 203 #define SYS_munlock 204 @@ -218,6 +218,7 @@ #define SYS_ffclock_getcounter 241 #define SYS_ffclock_setestimate 242 #define SYS_ffclock_getestimate 243 +#define SYS_clock_nanosleep 244 #define SYS_clock_getcpuclockid2 247 #define SYS_ntp_gettime 248 #define SYS_minherit 250 @@ -238,7 +239,7 @@ #define SYS_nlstat 280 #define SYS_preadv 289 #define SYS_pwritev 290 -#define 
SYS_freebsd4_fhstatfs 297 + /* 297 is freebsd4 fhstatfs */ #define SYS_fhopen 298 #define SYS_fhstat 299 #define SYS_modnext 300 @@ -259,9 +260,9 @@ #define SYS_aio_suspend 315 #define SYS_aio_cancel 316 #define SYS_aio_error 317 -#define SYS_oaio_read 318 -#define SYS_oaio_write 319 -#define SYS_olio_listio 320 + /* 318 is freebsd6 aio_read */ + /* 319 is freebsd6 aio_write */ + /* 320 is freebsd6 lio_listio */ #define SYS_yield 321 /* 322 is obsolete thr_sleep */ /* 323 is obsolete thr_wakeup */ @@ -277,15 +278,15 @@ #define SYS_sched_get_priority_min 333 #define SYS_sched_rr_get_interval 334 #define SYS_utrace 335 -#define SYS_freebsd4_sendfile 336 + /* 336 is freebsd4 sendfile */ #define SYS_kldsym 337 #define SYS_jail 338 #define SYS_nnpfs_syscall 339 #define SYS_sigprocmask 340 #define SYS_sigsuspend 341 -#define SYS_freebsd4_sigaction 342 + /* 342 is freebsd4 sigaction */ #define SYS_sigpending 343 -#define SYS_freebsd4_sigreturn 344 + /* 344 is freebsd4 sigreturn */ #define SYS_sigtimedwait 345 #define SYS_sigwaitinfo 346 #define SYS___acl_get_file 347 @@ -358,8 +359,6 @@ #define SYS_thr_exit 431 #define SYS_thr_self 432 #define SYS_thr_kill 433 -#define SYS__umtx_lock 434 -#define SYS__umtx_unlock 435 #define SYS_jail_attach 436 #define SYS_extattr_list_fd 437 #define SYS_extattr_list_file 438 @@ -466,4 +465,7 @@ #define SYS_ppoll 545 #define SYS_futimens 546 #define SYS_utimensat 547 -#define SYS_MAXSYSCALL 548 +#define SYS_numa_getaffinity 548 +#define SYS_numa_setaffinity 549 +#define SYS_fdatasync 550 +#define SYS_MAXSYSCALL 551 From laffer1 at midnightbsd.org Sun Feb 9 12:49:22 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:49:22 -0500 (EST) Subject: [Midnightbsd-cvs] src [12351] trunk/sys/sys/stdint.h: sync with FreeBSD 11-stable Message-ID: <202002091749.019HnMe3083984@stargazer.midnightbsd.org> Revision: 12351 http://svnweb.midnightbsd.org/src/?rev=12351 Author: laffer1 Date: 2020-02-09 12:49:21 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/stdint.h Modified: trunk/sys/sys/stdint.h =================================================================== --- trunk/sys/sys/stdint.h 2020-02-09 17:08:07 UTC (rev 12350) +++ trunk/sys/sys/stdint.h 2020-02-09 17:49:21 UTC (rev 12351) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/sys/stdint.h 291134 2015-11-21 16:21:27Z kib $ + * $FreeBSD: stable/11/sys/sys/stdint.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_STDINT_H_ @@ -67,4 +67,11 @@ #define WCHAR_MIN __WCHAR_MIN #define WCHAR_MAX __WCHAR_MAX +#if __EXT1_VISIBLE +/* ISO/IEC 9899:2011 K.3.4.4 */ +#ifndef RSIZE_MAX +#define RSIZE_MAX (SIZE_MAX >> 1) +#endif +#endif /* __EXT1_VISIBLE */ + #endif /* !_SYS_STDINT_H_ */ From laffer1 at midnightbsd.org Sun Feb 9 12:50:32 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:50:32 -0500 (EST) Subject: [Midnightbsd-cvs] src [12352] trunk/sys/sys/sockio.h: sync with FreeBSD 11-stable Message-ID: <202002091750.019HoWxS084700@stargazer.midnightbsd.org> Revision: 12352 http://svnweb.midnightbsd.org/src/?rev=12352 Author: laffer1 Date: 2020-02-09 12:50:31 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sockio.h Modified: trunk/sys/sys/sockio.h =================================================================== --- trunk/sys/sys/sockio.h 2020-02-09 17:49:21 UTC (rev 12351) +++ trunk/sys/sys/sockio.h 2020-02-09 17:50:31 UTC (rev 12352) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)sockio.h 8.1 (Berkeley) 3/28/94 - * $FreeBSD: stable/10/sys/sys/sockio.h 324462 2017-10-10 02:35:04Z sephe $ + * $FreeBSD: stable/11/sys/sys/sockio.h 352649 2019-09-24 06:36:25Z kib $ */ #ifndef _SYS_SOCKIO_H_ @@ -51,28 +51,28 @@ #define SIOCGETSGCNT _IOWR('r', 16, struct sioc_sg_req) /* get s,g pkt cnt */ #define SIOCSIFADDR _IOW('i', 12, struct ifreq) /* set ifnet address */ -#define OSIOCGIFADDR _IOWR('i', 13, struct ifreq) /* get ifnet address */ +/* OSIOCGIFADDR _IOWR('i', 13, struct ifreq) 4.3BSD */ #define SIOCGIFADDR _IOWR('i', 33, struct ifreq) /* get ifnet address */ #define SIOCSIFDSTADDR _IOW('i', 14, struct ifreq) /* set p-p address */ -#define OSIOCGIFDSTADDR _IOWR('i', 15, struct ifreq) /* get p-p address */ +/* OSIOCGIFDSTADDR _IOWR('i', 15, struct ifreq) 4.3BSD */ #define SIOCGIFDSTADDR _IOWR('i', 34, struct ifreq) /* get p-p address */ #define SIOCSIFFLAGS _IOW('i', 16, struct ifreq) /* set ifnet flags */ #define SIOCGIFFLAGS _IOWR('i', 17, struct ifreq) /* get ifnet flags */ -#define OSIOCGIFBRDADDR _IOWR('i', 18, struct ifreq) /* get broadcast addr */ +/* OSIOCGIFBRDADDR _IOWR('i', 18, struct ifreq) 4.3BSD */ #define SIOCGIFBRDADDR _IOWR('i', 35, struct ifreq) /* get broadcast addr */ #define SIOCSIFBRDADDR _IOW('i', 19, struct ifreq) /* set broadcast addr */ -#define OSIOCGIFCONF _IOWR('i', 20, struct ifconf) /* get ifnet list */ +/* OSIOCGIFCONF _IOWR('i', 20, struct ifconf) 4.3BSD */ #define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */ -#define OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) /* get net addr mask */ +/* OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) 4.3BSD */ #define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */ #define SIOCSIFNETMASK _IOW('i', 22, struct ifreq) /* set net addr mask */ #define SIOCGIFMETRIC _IOWR('i', 23, struct ifreq) /* get IF metric */ #define SIOCSIFMETRIC _IOW('i', 24, struct ifreq) /* set IF metric */ #define SIOCDIFADDR _IOW('i', 25, struct ifreq) /* delete IF addr */ -#define OSIOCAIFADDR _IOW('i', 26, struct oifaliasreq)/* add/chg IF alias */ -#define SIOCALIFADDR _IOW('i', 27, struct if_laddrreq) /* add IF addr */ -#define SIOCGLIFADDR _IOWR('i', 28, struct if_laddrreq) /* get IF addr */ -#define SIOCDLIFADDR _IOW('i', 29, struct if_laddrreq) /* delete IF 
addr */ +#define OSIOCAIFADDR _IOW('i', 26, struct oifaliasreq) /* FreeBSD 9.x */ +/* SIOCALIFADDR _IOW('i', 27, struct if_laddrreq) KAME */ +/* SIOCGLIFADDR _IOWR('i', 28, struct if_laddrreq) KAME */ +/* SIOCDLIFADDR _IOW('i', 29, struct if_laddrreq) KAME */ #define SIOCSIFCAP _IOW('i', 30, struct ifreq) /* set IF features */ #define SIOCGIFCAP _IOWR('i', 31, struct ifreq) /* get IF features */ #define SIOCGIFINDEX _IOWR('i', 32, struct ifreq) /* get IF index */ @@ -104,8 +104,8 @@ #define SIOCGIFPSRCADDR _IOWR('i', 71, struct ifreq) /* get gif psrc addr */ #define SIOCGIFPDSTADDR _IOWR('i', 72, struct ifreq) /* get gif pdst addr */ #define SIOCDIFPHYADDR _IOW('i', 73, struct ifreq) /* delete gif addrs */ -#define SIOCSLIFPHYADDR _IOW('i', 74, struct if_laddrreq) /* set gif addrs */ -#define SIOCGLIFPHYADDR _IOWR('i', 75, struct if_laddrreq) /* get gif addrs */ +/* SIOCSLIFPHYADDR _IOW('i', 74, struct if_laddrreq) KAME */ +/* SIOCGLIFPHYADDR _IOWR('i', 75, struct if_laddrreq) KAME */ #define SIOCGPRIVATE_0 _IOWR('i', 80, struct ifreq) /* device private 0 */ #define SIOCGPRIVATE_1 _IOWR('i', 81, struct ifreq) /* device private 1 */ @@ -139,4 +139,9 @@ #define SIOCGIFRSSHASH _IOWR('i', 151, struct ifrsshash)/* get the current RSS type/func settings */ +#define SIOCGLANPCP _IOWR('i', 152, struct ifreq) /* Get (V)LAN PCP */ +#define SIOCSLANPCP _IOW('i', 153, struct ifreq) /* Set (V)LAN PCP */ + +#define SIOCGIFDOWNREASON _IOWR('i', 154, struct ifdownreason) + #endif /* !_SYS_SOCKIO_H_ */ From laffer1 at midnightbsd.org Sun Feb 9 12:50:44 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:50:44 -0500 (EST) Subject: [Midnightbsd-cvs] src [12353] trunk/sys/sys/shm.h: sync with FreeBSD 11-stable Message-ID: <202002091750.019HoiV4084753@stargazer.midnightbsd.org> Revision: 12353 http://svnweb.midnightbsd.org/src/?rev=12353 Author: laffer1 Date: 2020-02-09 12:50:44 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/shm.h Modified: trunk/sys/sys/shm.h =================================================================== --- trunk/sys/sys/shm.h 2020-02-09 17:50:31 UTC (rev 12352) +++ trunk/sys/sys/shm.h 2020-02-09 17:50:44 UTC (rev 12353) @@ -1,5 +1,5 @@ /* $MidnightBSD$ */ -/* $FreeBSD: stable/10/sys/sys/shm.h 328294 2018-01-23 18:22:41Z jhb $ */ +/* $FreeBSD: stable/11/sys/sys/shm.h 347995 2019-05-20 16:31:45Z kib $ */ /* $NetBSD: shm.h,v 1.15 1994/06/29 06:45:17 cgd Exp $ */ /*- @@ -41,11 +41,17 @@ #define _SYS_SHM_H_ #include <sys/cdefs.h> +#ifdef _WANT_SYSVSHM_INTERNALS +#define _WANT_SYSVIPC_INTERNALS +#endif #include <sys/ipc.h> #include <sys/_types.h> +#include <machine/param.h> + #define SHM_RDONLY 010000 /* Attach read-only (else read-write) */ #define SHM_RND 020000 /* Round attach address to SHMLBA */ +#define SHM_REMAP 030000 /* Unmap before mapping */ #define SHMLBA PAGE_SIZE /* Segment low boundary address multiple */ /* "official" access mode definitions; somewhat braindead since you have @@ -102,9 +108,7 @@ time_t shm_ctime; /* time of last change by shmctl() */ }; -#ifdef _KERNEL -#include <vm/vm.h> - +#if defined(_KERNEL) || defined(_WANT_SYSVSHM_INTERNALS) /* * System 5 style catch-all structure for shared memory constants that * might be of interest to user programs. Do we really want/need this? 
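A minimal sketch of how the new guard above is meant to be consumed (illustrative only, not part of the committed patch; the consumer program is hypothetical): instead of being compiled with _KERNEL, a userland tool that wants the kernel-side SysV shm structures now opts in explicitly before including the header.

    /* hypothetical userland consumer of the internals guarded above */
    #define _WANT_SYSVSHM_INTERNALS  /* shm.h then defines _WANT_SYSVIPC_INTERNALS */
    #include <sys/shm.h>

    /* struct shminfo and the other guarded internals are now visible
     * here without pulling in <vm/vm.h> */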
@@ -117,6 +121,8 @@ u_long shmall; /* max amount of shared memory (pages) */ }; +struct vm_object; + /* * Add a kernel wrapper to the shmid_ds struct so that private info (like the * MAC label) can be added to it, without changing the user interface. @@ -123,13 +129,12 @@ */ struct shmid_kernel { struct shmid_ds u; - vm_object_t object; + struct vm_object *object; struct label *label; /* MAC label */ struct ucred *cred; /* creator's credendials */ }; +#endif -extern struct shminfo shminfo; - struct shm_info { int used_ids; unsigned long shm_tot; @@ -139,12 +144,15 @@ unsigned long swap_successes; }; -struct thread; +#ifdef _KERNEL struct proc; struct vmspace; +extern struct shminfo shminfo; + void shmexit(struct vmspace *); void shmfork(struct proc *, struct proc *); + #else /* !_KERNEL */ #include <sys/cdefs.h> @@ -164,6 +172,6 @@ int shmdt(const void *); __END_DECLS -#endif /* !_KERNEL */ +#endif /* _KERNEL */ #endif /* !_SYS_SHM_H_ */ From laffer1 at midnightbsd.org Sun Feb 9 12:50:57 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:50:57 -0500 (EST) Subject: [Midnightbsd-cvs] src [12354] trunk/sys/sys/socketvar.h: sync with FreeBSD 11-stable Message-ID: <202002091750.019HovSY084797@stargazer.midnightbsd.org> Revision: 12354 http://svnweb.midnightbsd.org/src/?rev=12354 Author: laffer1 Date: 2020-02-09 12:50:57 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/socketvar.h Modified: trunk/sys/sys/socketvar.h =================================================================== --- trunk/sys/sys/socketvar.h 2020-02-09 17:50:44 UTC (rev 12353) +++ trunk/sys/sys/socketvar.h 2020-02-09 17:50:57 UTC (rev 12354) @@ -29,7 +29,7 @@ * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * - * $FreeBSD: stable/10/sys/sys/socketvar.h 321021 2017-07-15 17:28:03Z dchagin $ + * $FreeBSD: stable/11/sys/sys/socketvar.h 338617 2018-09-12 18:52:18Z sobomax $ */ #ifndef _SYS_SOCKETVAR_H_ @@ -39,6 +39,7 @@ #include <sys/selinfo.h> /* for struct selinfo */ #include <sys/_lock.h> #include <sys/_mutex.h> +#include <sys/osd.h> #include <sys/_sx.h> #include <sys/sockbuf.h> #include <sys/sockstate.h> @@ -64,7 +65,6 @@ * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (d) locked by SOCKBUF_LOCK(&so->so_snd). * (e) locked by ACCEPT_LOCK(). * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. 
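To make the locking key above concrete, a minimal sketch (hypothetical helper, kernel context assumed; ACCEPT_LOCK()/ACCEPT_UNLOCK() are the accept-queue lock macros from this header) of reading the (e)-annotated queue counters under the lock the key prescribes:

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/socketvar.h>

    /* hypothetical helper: (e) fields must only be read under ACCEPT_LOCK() */
    static u_int
    example_pending_connections(struct socket *head)
    {
            u_int qlen;

            ACCEPT_LOCK();
            qlen = head->so_qlen + head->so_incqlen;
            ACCEPT_UNLOCK();
            return (qlen);
    }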
@@ -95,16 +95,15 @@ TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_short so_qlen; /* (e) number of unaccepted connections */ - u_short so_incqlen; /* (e) number of unaccepted incomplete + u_int so_qlen; /* (e) number of unaccepted connections */ + u_int so_incqlen; /* (e) number of unaccepted incomplete connections */ - u_short so_qlimit; /* (e) max number queued connections */ + u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ u_long so_oobmark; /* (c) chars to oob mark */ - TAILQ_HEAD(, aiocblist) so_aiojobq; /* AIO ops waiting on socket */ struct sockbuf so_rcv, so_snd; @@ -119,6 +118,7 @@ void *so_accept_filter_arg; /* saved filter args */ char *so_accept_filter_str; /* saved user args */ } *so_accf; + struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be @@ -127,6 +127,11 @@ */ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; + + int so_ts_clock; /* type of the clock used for timestamps */ + + void *so_pspare[2]; /* packet pacing / general use */ + int so_ispare[2]; /* packet pacing / general use */ }; /* @@ -171,9 +176,9 @@ caddr_t so_pcb; /* another convenient handle */ int xso_protocol; int xso_family; - u_short so_qlen; - u_short so_incqlen; - u_short so_qlimit; + u_int so_qlen; + u_int so_incqlen; + u_int so_qlimit; short so_timeo; u_short so_error; pid_t so_pgid; @@ -207,7 +212,7 @@ /* can we read something from so? */ #define soreadabledata(so) \ - ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \ + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) @@ -294,11 +299,32 @@ MALLOC_DECLARE(M_SONAME); #endif +/* + * Socket specific helper hook point identifiers + * Do not leave holes in the sequence, hook registration is a loop. 
+ */ +#define HHOOK_SOCKET_OPT 0 +#define HHOOK_SOCKET_CREATE 1 +#define HHOOK_SOCKET_RCV 2 +#define HHOOK_SOCKET_SND 3 +#define HHOOK_FILT_SOREAD 4 +#define HHOOK_FILT_SOWRITE 5 +#define HHOOK_SOCKET_CLOSE 6 +#define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE + +struct socket_hhook_data { + struct socket *so; + struct mbuf *m; + void *hctx; /* hook point specific data*/ + int status; +}; + extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; +struct filecaps; struct filedesc; struct mbuf; struct sockaddr; @@ -316,12 +342,14 @@ /* * From uipc_socket and friends */ -int sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type); int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp); + struct file **fpp, u_int *fflagp, struct filecaps *havecaps); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); +void soaio_enqueue(struct task *task); +void soaio_rcv(void *context, int pending); +void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, @@ -376,6 +404,7 @@ void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg); void sowakeup(struct socket *so, struct sockbuf *sb); +void sowakeup_aio(struct socket *so, struct sockbuf *sb); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); From laffer1 at midnightbsd.org Sun Feb 9 12:51:23 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:51:23 -0500 (EST) Subject: [Midnightbsd-cvs] src [12355] trunk/sys/sys/socket.h: sync with FreeBSD 11-stable Message-ID: <202002091751.019HpNAL084848@stargazer.midnightbsd.org> Revision: 12355 http://svnweb.midnightbsd.org/src/?rev=12355 Author: laffer1 Date: 2020-02-09 12:51:22 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/socket.h Modified: trunk/sys/sys/socket.h =================================================================== --- trunk/sys/sys/socket.h 2020-02-09 17:50:57 UTC (rev 12354) +++ trunk/sys/sys/socket.h 2020-02-09 17:51:22 UTC (rev 12355) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)socket.h 8.4 (Berkeley) 2/21/94 - * $FreeBSD: stable/10/sys/sys/socket.h 254925 2013-08-26 18:16:05Z jhb $ + * $FreeBSD: stable/11/sys/sys/socket.h 338617 2018-09-12 18:52:18Z sobomax $ */ #ifndef _SYS_SOCKET_H_ @@ -85,6 +85,16 @@ #endif #endif +#ifndef _UINT32_T_DECLARED +typedef __uint32_t uint32_t; +#define _UINT32_T_DECLARED +#endif + +#ifndef _UINTPTR_T_DECLARED +typedef __uintptr_t uintptr_t; +#define _UINTPTR_T_DECLARED +#endif + /* * Types */ @@ -149,8 +159,18 @@ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) 
*/ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ +#define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #endif +#if __BSD_VISIBLE +#define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ +#define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ +#define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ +#define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ +#define SO_TS_DEFAULT SO_TS_REALTIME_MICRO +#define SO_TS_CLOCK_MAX SO_TS_MONOTONIC +#endif + /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options @@ -367,9 +387,8 @@ * Second level is protocol family. * Third level is protocol number. * - * Further levels are defined by the individual families below. + * Further levels are defined by the individual families. */ -#define NET_MAXID AF_MAX /* * PF_ROUTE - Routing table @@ -385,14 +404,12 @@ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ -#define NET_RT_MAXID 6 - #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ -#define SOMAXCONN 256 +#define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. @@ -425,9 +442,11 @@ #define MSG_NBIO 0x4000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x8000 /* used in sendit() */ #define MSG_CMSG_CLOEXEC 0x40000 /* make received fds close-on-exec */ +#define MSG_WAITFORONE 0x80000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x10000 /* for use by socket callbacks - soreceive (TCP) */ +#define MSG_MORETOCOME 0x100000 /* additional data pending */ #endif /* @@ -493,7 +512,7 @@ /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ - ((char *)(cmsg) == NULL ? CMSG_FIRSTHDR(mhdr) : \ + ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ @@ -508,7 +527,7 @@ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? 
\ (struct cmsghdr *)(mhdr)->msg_control : \ - (struct cmsghdr *)NULL) + (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ @@ -526,6 +545,8 @@ #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ +#define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ +#define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #endif #if __BSD_VISIBLE @@ -581,12 +602,22 @@ * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 -#define SF_MNOWAIT 0x00000002 +#define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 +#define SF_NOCACHE 0x00000010 +#define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL -#define SFK_COMPAT 0x00000001 +#define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ + +/* + * Sendmmsg/recvmmsg specific structure(s) + */ +struct mmsghdr { + struct msghdr msg_hdr; /* message header */ + ssize_t msg_len; /* message length */ +}; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL @@ -609,6 +640,11 @@ ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); +#if __BSD_VISIBLE +struct timespec; +ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, + const struct timespec * __restrict); +#endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); @@ -615,6 +651,7 @@ ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); +ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); From laffer1 at midnightbsd.org Sun Feb 9 12:51:39 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:51:39 -0500 (EST) Subject: [Midnightbsd-cvs] src [12356] trunk/sys/sys/sockbuf.h: sync with FreeBSD 11-stable Message-ID: <202002091751.019Hpdsc084897@stargazer.midnightbsd.org> Revision: 12356 http://svnweb.midnightbsd.org/src/?rev=12356 Author: laffer1 Date: 2020-02-09 12:51:38 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sockbuf.h Modified: trunk/sys/sys/sockbuf.h =================================================================== --- trunk/sys/sys/sockbuf.h 2020-02-09 17:51:22 UTC (rev 12355) +++ trunk/sys/sys/sockbuf.h 2020-02-09 17:51:38 UTC (rev 12356) @@ -29,7 +29,7 @@ * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * - * $FreeBSD: stable/10/sys/sys/sockbuf.h 279930 2015-03-12 17:07:45Z sjg $ + * $FreeBSD: stable/11/sys/sys/sockbuf.h 337975 2018-08-17 16:04:20Z markj $ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ @@ -37,6 +37,7 @@ #include <sys/_lock.h> #include <sys/_mutex.h> #include <sys/_sx.h> +#include <sys/_task.h> #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ @@ -54,6 +55,7 @@ #define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ +#define SB_AIO_RUNNING 0x2000 /* AIO operation running */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ @@ -78,31 +80,38 @@ /* * Variables for socket 
buffering. + * + * Locking key to struct sockbuf: + * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { struct selinfo sb_sel; /* process selecting read/write */ struct mtx sb_mtx; /* sockbuf lock */ struct sx sb_sx; /* prevent I/O interlacing */ - short sb_state; /* (c/d) socket state on sockbuf */ + short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb - struct mbuf *sb_mb; /* (c/d) the mbuf chain */ - struct mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */ - struct mbuf *sb_lastrecord; /* (c/d) first mbuf of last + struct mbuf *sb_mb; /* (a) the mbuf chain */ + struct mbuf *sb_mbtail; /* (a) the last mbuf in the chain */ + struct mbuf *sb_lastrecord; /* (a) first mbuf of last * record in socket buffer */ - struct mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */ - u_int sb_sndptroff; /* (c/d) byte offset of ptr into chain */ - u_int sb_cc; /* (c/d) actual chars in buffer */ - u_int sb_hiwat; /* (c/d) max actual char count */ - u_int sb_mbcnt; /* (c/d) chars of mbufs used */ - u_int sb_mcnt; /* (c/d) number of mbufs in buffer */ - u_int sb_ccnt; /* (c/d) number of clusters in buffer */ - u_int sb_mbmax; /* (c/d) max chars of mbufs to use */ - u_int sb_ctl; /* (c/d) non-data chars in buffer */ - int sb_lowat; /* (c/d) low water mark */ - sbintime_t sb_timeo; /* (c/d) timeout for read/write */ - short sb_flags; /* (c/d) flags, see below */ - int (*sb_upcall)(struct socket *, void *, int); /* (c/d) */ - void *sb_upcallarg; /* (c/d) */ + struct mbuf *sb_sndptr; /* (a) pointer into mbuf chain */ + struct mbuf *sb_fnrdy; /* (a) pointer to first not ready buffer */ + u_int sb_sndptroff; /* (a) byte offset of ptr into chain */ + u_int sb_acc; /* (a) available chars in buffer */ + u_int sb_ccc; /* (a) claimed chars in buffer */ + u_int sb_hiwat; /* (a) max actual char count */ + u_int sb_mbcnt; /* (a) chars of mbufs used */ + u_int sb_mcnt; /* (a) number of mbufs in buffer */ + u_int sb_ccnt; /* (a) number of clusters in buffer */ + u_int sb_mbmax; /* (a) max chars of mbufs to use */ + u_int sb_ctl; /* (a) non-data chars in buffer */ + int sb_lowat; /* (a) low water mark */ + sbintime_t sb_timeo; /* (a) timeout for read/write */ + short sb_flags; /* (a) flags, see below */ + int (*sb_upcall)(struct socket *, void *, int); /* (a) */ + void *sb_upcallarg; /* (a) */ + TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */ + struct task sb_aiotask; /* AIO task */ }; #ifdef _KERNEL @@ -121,10 +130,17 @@ #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) #define SOCKBUF_UNLOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED) -void sbappend(struct sockbuf *sb, struct mbuf *m); -void sbappend_locked(struct sockbuf *sb, struct mbuf *m); -void sbappendstream(struct sockbuf *sb, struct mbuf *m); -void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m); +/* + * Socket buffer private mbuf(9) flags. 
+ */ +#define M_NOTREADY M_PROTO1 /* m_data not populated yet */ +#define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ +#define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) + +void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); +void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); +void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags); +void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, @@ -131,13 +147,12 @@ struct mbuf *m0, struct mbuf *control); int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); -int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, +void sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); -int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, +void sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); -void sbcheck(struct sockbuf *sb); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); @@ -165,58 +180,61 @@ int sbwait(struct sockbuf *sb); int sblock(struct sockbuf *sb, int flags); void sbunlock(struct sockbuf *sb); +void sballoc(struct sockbuf *, struct mbuf *); +void sbfree(struct sockbuf *, struct mbuf *); +int sbready(struct sockbuf *, struct mbuf *, int); /* + * Return how much data is available to be taken out of socket + * buffer right now. + */ +static inline u_int +sbavail(struct sockbuf *sb) +{ + +#if 0 + SOCKBUF_LOCK_ASSERT(sb); +#endif + return (sb->sb_acc); +} + +/* + * Return how much data sits there in the socket buffer + * It might be that some data is not yet ready to be read. + */ +static inline u_int +sbused(struct sockbuf *sb) +{ + +#if 0 + SOCKBUF_LOCK_ASSERT(sb); +#endif + return (sb->sb_ccc); +} + +/* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might - * still be negative (cc > hiwat or mbcnt > mbmax). Should detect - * overflow and return 0. Should use "lmin" but it doesn't exist now. + * still be negative (ccc > hiwat or mbcnt > mbmax). */ -static __inline -long +static inline long sbspace(struct sockbuf *sb) { int bleft, mleft; /* size should match sockbuf fields */ +#if 0 + SOCKBUF_LOCK_ASSERT(sb); +#endif + if (sb->sb_flags & SB_STOP) return(0); - bleft = sb->sb_hiwat - sb->sb_cc; + + bleft = sb->sb_hiwat - sb->sb_ccc; mleft = sb->sb_mbmax - sb->sb_mbcnt; - return((bleft < mleft) ? bleft : mleft); -} -/* adjust counters in sb reflecting allocation of m */ -#define sballoc(sb, m) { \ - (sb)->sb_cc += (m)->m_len; \ - if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ - (sb)->sb_ctl += (m)->m_len; \ - (sb)->sb_mbcnt += MSIZE; \ - (sb)->sb_mcnt += 1; \ - if ((m)->m_flags & M_EXT) { \ - (sb)->sb_mbcnt += (m)->m_ext.ext_size; \ - (sb)->sb_ccnt += 1; \ - } \ + return ((bleft < mleft) ? 
bleft : mleft); } -/* adjust counters in sb reflecting freeing of m */ -#define sbfree(sb, m) { \ - (sb)->sb_cc -= (m)->m_len; \ - if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ - (sb)->sb_ctl -= (m)->m_len; \ - (sb)->sb_mbcnt -= MSIZE; \ - (sb)->sb_mcnt -= 1; \ - if ((m)->m_flags & M_EXT) { \ - (sb)->sb_mbcnt -= (m)->m_ext.ext_size; \ - (sb)->sb_ccnt -= 1; \ - } \ - if ((sb)->sb_sndptr == (m)) { \ - (sb)->sb_sndptr = NULL; \ - (sb)->sb_sndptroff = 0; \ - } \ - if ((sb)->sb_sndptroff != 0) \ - (sb)->sb_sndptroff -= (m)->m_len; \ -} - #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ @@ -226,13 +244,15 @@ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); +void sblastmbufchk(struct sockbuf *, const char *, int); +void sbcheck(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) - -void sblastmbufchk(struct sockbuf *, const char *, int); #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) +#define SBCHECK(sb) sbcheck((sb), __FILE__, __LINE__) #else -#define SBLASTRECORDCHK(sb) /* nothing */ -#define SBLASTMBUFCHK(sb) /* nothing */ +#define SBLASTRECORDCHK(sb) do {} while (0) +#define SBLASTMBUFCHK(sb) do {} while (0) +#define SBCHECK(sb) do {} while (0) #endif /* SOCKBUF_DEBUG */ #endif /* _KERNEL */ From laffer1 at midnightbsd.org Sun Feb 9 12:52:06 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 12:52:06 -0500 (EST) Subject: [Midnightbsd-cvs] src [12357] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002091752.019Hq6ij084969@stargazer.midnightbsd.org> Revision: 12357 http://svnweb.midnightbsd.org/src/?rev=12357 Author: laffer1 Date: 2020-02-09 12:52:06 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sleepqueue.h trunk/sys/sys/slicer.h Modified: trunk/sys/sys/sleepqueue.h =================================================================== --- trunk/sys/sys/sleepqueue.h 2020-02-09 17:51:38 UTC (rev 12356) +++ trunk/sys/sys/sleepqueue.h 2020-02-09 17:52:06 UTC (rev 12357) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sleepqueue.h 274139 2014-11-05 16:24:57Z lwhsu $ + * $FreeBSD: stable/11/sys/sys/sleepqueue.h 354405 2019-11-06 18:02:18Z mav $ */ #ifndef _SYS_SLEEPQUEUE_H_ @@ -84,6 +84,7 @@ #define SLEEPQ_SX 0x03 /* Used by an sx lock. */ #define SLEEPQ_LK 0x04 /* Used by a lockmgr. */ #define SLEEPQ_INTERRUPTIBLE 0x100 /* Sleep is interruptible. */ +#define SLEEPQ_UNFAIR 0x200 /* Unfair wakeup order. 
*/ void init_sleepqueues(void); int sleepq_abort(struct thread *td, int intrval); @@ -91,11 +92,14 @@ int flags, int queue); struct sleepqueue *sleepq_alloc(void); int sleepq_broadcast(void *wchan, int flags, int pri, int queue); +void sleepq_chains_remove_matching(bool (*matches)(struct thread *)); void sleepq_free(struct sleepqueue *sq); void sleepq_lock(void *wchan); struct sleepqueue *sleepq_lookup(void *wchan); void sleepq_release(void *wchan); void sleepq_remove(struct thread *td, void *wchan); +int sleepq_remove_matching(struct sleepqueue *sq, int queue, + bool (*matches)(struct thread *), int pri); int sleepq_signal(void *wchan, int flags, int pri, int queue); void sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags); @@ -108,5 +112,11 @@ void sleepq_wait(void *wchan, int pri); int sleepq_wait_sig(void *wchan, int pri); +#ifdef STACK +struct sbuf; +int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, + int *count_stacks_printed); +#endif + #endif /* _KERNEL */ #endif /* !_SYS_SLEEPQUEUE_H_ */ Modified: trunk/sys/sys/slicer.h =================================================================== --- trunk/sys/sys/slicer.h 2020-02-09 17:51:38 UTC (rev 12356) +++ trunk/sys/sys/slicer.h 2020-02-09 17:52:06 UTC (rev 12357) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/slicer.h 318159 2017-05-10 21:42:16Z marius $ + * $FreeBSD: stable/11/sys/sys/slicer.h 346557 2019-04-22 15:04:11Z ian $ */ #ifndef _FLASH_SLICER_H_ @@ -57,7 +57,7 @@ #define FLASH_SLICES_TYPE_SPI 2 #define FLASH_SLICES_TYPE_MMC 3 -/* Use NULL for deregistering a slicer */ +/* Use NULL and set force to true for deregistering a slicer */ void flash_register_slicer(flash_slicer_t slicer, u_int type, bool force); #endif /* _KERNEL */ From laffer1 at midnightbsd.org Sun Feb 9 13:26:51 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:26:51 -0500 (EST) Subject: [Midnightbsd-cvs] src [12358] trunk/sys/sys/signal.h: sync with FreeBSD 11-stable Message-ID: <202002091826.019IQpod090805@stargazer.midnightbsd.org> Revision: 12358 http://svnweb.midnightbsd.org/src/?rev=12358 Author: laffer1 Date: 2020-02-09 13:26:51 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/signal.h Modified: trunk/sys/sys/signal.h =================================================================== --- trunk/sys/sys/signal.h 2020-02-09 17:52:06 UTC (rev 12357) +++ trunk/sys/sys/signal.h 2020-02-09 18:26:51 UTC (rev 12358) @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)signal.h 8.4 (Berkeley) 5/4/95 - * $FreeBSD: stable/10/sys/sys/signal.h 233519 2012-03-26 19:12:09Z rmh $ + * $FreeBSD: stable/11/sys/sys/signal.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_SIGNAL_H_ @@ -46,6 +46,23 @@ #include <machine/_limits.h> /* __MINSIGSTKSZ */ #include <machine/signal.h> /* sig_atomic_t; trap codes; sigcontext */ +#if __POSIX_VISIBLE >= 200809 + +#include <sys/_pthreadtypes.h> +#include <sys/_timespec.h> + +#ifndef _SIZE_T_DECLARED +typedef __size_t size_t; +#define _SIZE_T_DECLARED +#endif + +#ifndef _UID_T_DECLARED +typedef __uid_t uid_t; +#define _UID_T_DECLARED +#endif + +#endif /* __POSIX_VISIBLE >= 200809 */ + /* * System defined signals. 
*/ @@ -158,9 +175,22 @@ int sigval_int; void *sigval_ptr; }; + +#if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__)) +union sigval32 { + int sival_int; + uint32_t sival_ptr; + /* 6.0 compatibility */ + int sigval_int; + uint32_t sigval_ptr; +}; #endif +#endif #if __POSIX_VISIBLE >= 199309 + +struct pthread_attr; + struct sigevent { int sigev_notify; /* Notification type */ int sigev_signo; /* Signal number */ @@ -169,7 +199,7 @@ __lwpid_t _threadid; struct { void (*_function)(union sigval); - void *_attribute; /* pthread_attr_t * */ + struct pthread_attr **_attribute; } _sigev_thread; unsigned short _kevent_flags; long __spare__[8]; @@ -191,6 +221,7 @@ #define SIGEV_KEVENT 3 /* Generate a kevent. */ #define SIGEV_THREAD_ID 4 /* Send signal to a kernel thread. */ #endif + #endif /* __POSIX_VISIBLE >= 199309 */ #if __POSIX_VISIBLE >= 199309 || __XSI_VISIBLE @@ -236,6 +267,38 @@ #define si_mqd _reason._mesgq._mqd #define si_band _reason._poll._band +#if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__)) +struct siginfo32 { + int si_signo; /* signal number */ + int si_errno; /* errno association */ + int si_code; /* signal code */ + __pid_t si_pid; /* sending process */ + __uid_t si_uid; /* sender's ruid */ + int si_status; /* exit value */ + uint32_t si_addr; /* faulting instruction */ + union sigval32 si_value; /* signal value */ + union { + struct { + int _trapno;/* machine specific trap code */ + } _fault; + struct { + int _timerid; + int _overrun; + } _timer; + struct { + int _mqd; + } _mesgq; + struct { + int32_t _band; /* band event for SIGPOLL */ + } _poll; /* was this ever used ? */ + struct { + int32_t __spare1__; + int __spare2__[7]; + } __spare__; + } _reason; +}; +#endif + /** si_code **/ /* codes for SIGILL */ #define ILL_ILLOPC 1 /* Illegal opcode. */ @@ -271,6 +334,7 @@ #define TRAP_BRKPT 1 /* Process breakpoint. */ #define TRAP_TRACE 2 /* Process trace trap. */ #define TRAP_DTRACE 3 /* DTrace induced trap. */ +#define TRAP_CAP 4 /* Capabilities protective trap. */ /* codes for SIGCHLD */ #define CLD_EXITED 1 /* Child has exited */ @@ -355,18 +419,10 @@ #endif #if __XSI_VISIBLE -/* - * Structure used in sigaltstack call. - */ #if __BSD_VISIBLE -typedef struct sigaltstack { -#else -typedef struct { +#define __stack_t sigaltstack #endif - char *ss_sp; /* signal stack base */ - __size_t ss_size; /* signal stack length */ - int ss_flags; /* SS_DISABLE and/or SS_ONSTACK */ -} stack_t; +typedef struct __stack_t stack_t; #define SS_ONSTACK 0x0001 /* take signal on alternate stack */ #define SS_DISABLE 0x0004 /* disable taking signals on alternate stack */ @@ -374,6 +430,17 @@ #define SIGSTKSZ (MINSIGSTKSZ + 32768) /* recommended stack size */ #endif +/* + * Structure used in sigaltstack call. Its definition is always + * needed for __ucontext. If __BSD_VISIBLE is defined, the structure + * tag is actually sigaltstack. + */ +struct __stack_t { + void *ss_sp; /* signal stack base */ + __size_t ss_size; /* signal stack length */ + int ss_flags; /* SS_DISABLE and/or SS_ONSTACK */ +}; + #if __BSD_VISIBLE /* * 4.3 compatibility: @@ -407,8 +474,7 @@ * Structure used in sigstack call. */ struct sigstack { - /* XXX ss_sp's type should be `void *'. 
*/ - char *ss_sp; /* signal stack pointer */ + void *ss_sp; /* signal stack pointer */ int ss_onstack; /* current status */ }; #endif From laffer1 at midnightbsd.org Sun Feb 9 13:28:48 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:28:48 -0500 (EST) Subject: [Midnightbsd-cvs] src [12359] trunk/sys/sys/signalvar.h: sync with FreeBSD 11-stable Message-ID: <202002091828.019ISmK8090921@stargazer.midnightbsd.org> Revision: 12359 http://svnweb.midnightbsd.org/src/?rev=12359 Author: laffer1 Date: 2020-02-09 13:28:48 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/signalvar.h Modified: trunk/sys/sys/signalvar.h =================================================================== --- trunk/sys/sys/signalvar.h 2020-02-09 18:26:51 UTC (rev 12358) +++ trunk/sys/sys/signalvar.h 2020-02-09 18:28:48 UTC (rev 12359) @@ -28,7 +28,7 @@ * SUCH DAMAGE. * * @(#)signalvar.h 8.6 (Berkeley) 2/19/95 - * $FreeBSD: stable/10/sys/sys/signalvar.h 315949 2017-03-25 13:33:23Z badger $ + * $FreeBSD: stable/11/sys/sys/signalvar.h 353789 2019-10-21 01:24:21Z kevans $ */ #ifndef _SYS_SIGNALVAR_H_ @@ -200,6 +200,7 @@ return (1); } +#ifdef COMPAT_FREEBSD6 struct osigevent { int sigev_notify; /* Notification type */ union { @@ -208,6 +209,7 @@ } __sigev_u; union sigval sigev_value; /* Signal value */ }; +#endif typedef struct ksiginfo { TAILQ_ENTRY(ksiginfo) ksi_link; @@ -326,9 +328,41 @@ #define SIGPROCMASK_PROC_LOCKED 0x0002 #define SIGPROCMASK_PS_LOCKED 0x0004 +/* + * Modes for sigdeferstop(). Manages behaviour of + * thread_suspend_check() in the region delimited by + * sigdeferstop()/sigallowstop(). Must be restored to + * SIGDEFERSTOP_OFF before returning to userspace. 
+ */ +#define SIGDEFERSTOP_NOP 0 /* continue doing whatever is done now */ +#define SIGDEFERSTOP_OFF 1 /* stop ignoring STOPs */ +#define SIGDEFERSTOP_SILENT 2 /* silently ignore STOPs */ +#define SIGDEFERSTOP_EINTR 3 /* ignore STOPs, return EINTR */ +#define SIGDEFERSTOP_ERESTART 4 /* ignore STOPs, return ERESTART */ + +#define SIGDEFERSTOP_VAL_NCHG (-1) /* placeholder indicating no state change */ +int sigdeferstop_impl(int mode); +void sigallowstop_impl(int prev); + +static inline int +sigdeferstop(int mode) +{ + + if (mode == SIGDEFERSTOP_NOP) + return (SIGDEFERSTOP_VAL_NCHG); + return (sigdeferstop_impl(mode)); +} + +static inline void +sigallowstop(int prev) +{ + + if (prev == SIGDEFERSTOP_VAL_NCHG) + return; + sigallowstop_impl(prev); +} + int cursig(struct thread *td); -int sigdeferstop(void); -int sigallowstop(void); void execsigs(struct proc *p); void gsignal(int pgid, int sig, ksiginfo_t *ksi); void killproc(struct proc *p, char *why); @@ -346,6 +380,7 @@ void sigacts_free(struct sigacts *ps); struct sigacts *sigacts_hold(struct sigacts *ps); int sigacts_shared(struct sigacts *ps); +void sig_drop_caught(struct proc *p); void sigexit(struct thread *td, int sig) __dead2; int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **); int sig_ffs(sigset_t *set); From laffer1 at midnightbsd.org Sun Feb 9 13:30:16 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:30:16 -0500 (EST) Subject: [Midnightbsd-cvs] src [12360] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002091830.019IUGsh091647@stargazer.midnightbsd.org> Revision: 12360 http://svnweb.midnightbsd.org/src/?rev=12360 Author: laffer1 Date: 2020-02-09 13:30:15 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sf_buf.h trunk/sys/sys/sglist.h Modified: trunk/sys/sys/sf_buf.h =================================================================== --- trunk/sys/sys/sf_buf.h 2020-02-09 18:28:48 UTC (rev 12359) +++ trunk/sys/sys/sf_buf.h 2020-02-09 18:30:15 UTC (rev 12360) @@ -1,5 +1,6 @@ /* $MidnightBSD$ */ /*- + * Copyright (c) 2014 Gleb Smirnoff <glebius at FreeBSD.org> * Copyright (c) 2003-2004 Alan L. Cox <alc at cs.rice.edu> * All rights reserved. * @@ -24,13 +25,166 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sf_buf.h 255786 2013-09-22 13:36:52Z glebius $ + * $FreeBSD: stable/11/sys/sys/sf_buf.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_SF_BUF_H_ #define _SYS_SF_BUF_H_ +struct sfstat { /* sendfile statistics */ + uint64_t sf_syscalls; /* times sendfile was called */ + uint64_t sf_noiocnt; /* times sendfile didn't require I/O */ + uint64_t sf_iocnt; /* times sendfile had to do disk I/O */ + uint64_t sf_pages_read; /* pages read as part of a request */ + uint64_t sf_pages_valid; /* pages were valid for a request */ + uint64_t sf_rhpages_requested; /* readahead pages requested */ + uint64_t sf_rhpages_read; /* readahead pages read */ + uint64_t sf_busy; /* times aborted on a busy page */ + uint64_t sf_allocfail; /* times sfbuf allocation failed */ + uint64_t sf_allocwait; /* times sfbuf allocation had to wait */ +}; + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_page.h> + /* + * Sf_bufs, or sendfile(2) buffers provide a vm_page that is mapped + * into kernel address space. 
Note, that they aren't used only + * by sendfile(2)! + * + * Sf_bufs could be implemented as a feature of vm_page_t, but that + * would require growth of the structure. That's why they are implemented + * as a separate hash indexed by vm_page address. Implementation lives in + * kern/subr_sfbuf.c. Meanwhile, most 64-bit machines have a physical map, + * so they don't require this hash at all, thus ignore subr_sfbuf.c. + * + * Different 32-bit architectures demand different requirements on sf_buf + * hash and functions. They request features in machine/vmparam.h, which + * enable parts of this file. They can also optionally provide helpers in + * machine/sf_buf.h + * + * Defines are: + * SFBUF This machine requires sf_buf hash. + * subr_sfbuf.c should be compiled. + * SFBUF_CPUSET This machine can perform SFB_CPUPRIVATE mappings, + * that do no invalidate cache on the rest of CPUs. + * SFBUF_NOMD This machine doesn't have machine/sf_buf.h + * + * SFBUF_OPTIONAL_DIRECT_MAP Value of this define is used as boolean + * variable that tells whether machine is + * capable of direct map or not at runtime. + * SFBUF_MAP This machine provides its own sf_buf_map() and + * sf_buf_unmap(). + * SFBUF_PROCESS_PAGE This machine provides sf_buf_process_page() + * function. + */ + +#ifdef SFBUF +#if defined(SMP) && defined(SFBUF_CPUSET) +#include <sys/_cpuset.h> +#endif +#include <sys/queue.h> + +struct sf_buf { + LIST_ENTRY(sf_buf) list_entry; /* list of buffers */ + TAILQ_ENTRY(sf_buf) free_entry; /* list of buffers */ + vm_page_t m; /* currently mapped page */ + vm_offset_t kva; /* va of mapping */ + int ref_count; /* usage of this mapping */ +#if defined(SMP) && defined(SFBUF_CPUSET) + cpuset_t cpumask; /* where mapping is valid */ +#endif +}; +#else /* ! SFBUF */ +struct sf_buf; +#endif /* SFBUF */ + +#ifndef SFBUF_NOMD +#include <machine/sf_buf.h> +#endif +#ifdef SFBUF_OPTIONAL_DIRECT_MAP +#include <machine/md_var.h> +#endif + +#ifdef SFBUF +struct sf_buf *sf_buf_alloc(struct vm_page *, int); +void sf_buf_free(struct sf_buf *); +void sf_buf_ref(struct sf_buf *); + +static inline vm_offset_t +sf_buf_kva(struct sf_buf *sf) +{ +#ifdef SFBUF_OPTIONAL_DIRECT_MAP + if (SFBUF_OPTIONAL_DIRECT_MAP) + return (SFBUF_PHYS_DMAP(VM_PAGE_TO_PHYS((vm_page_t)sf))); +#endif + + return (sf->kva); +} + +static inline vm_page_t +sf_buf_page(struct sf_buf *sf) +{ +#ifdef SFBUF_OPTIONAL_DIRECT_MAP + if (SFBUF_OPTIONAL_DIRECT_MAP) + return ((vm_page_t)sf); +#endif + + return (sf->m); +} + +#ifndef SFBUF_MAP +#include <vm/pmap.h> + +static inline void +sf_buf_map(struct sf_buf *sf, int flags) +{ + + pmap_qenter(sf->kva, &sf->m, 1); +} + +static inline int +sf_buf_unmap(struct sf_buf *sf) +{ + + return (0); +} +#endif /* SFBUF_MAP */ + +#if defined(SMP) && defined(SFBUF_CPUSET) +void sf_buf_shootdown(struct sf_buf *, int); +#endif + +#ifdef SFBUF_PROCESS_PAGE +boolean_t sf_buf_process_page(vm_page_t, void (*)(struct sf_buf *)); +#endif + +#else /* ! SFBUF */ + +static inline struct sf_buf * +sf_buf_alloc(struct vm_page *m, int pri) +{ + + return ((struct sf_buf *)m); +} + +static inline void +sf_buf_free(struct sf_buf *sf) +{ +} + +static inline void +sf_buf_ref(struct sf_buf *sf) +{ +} +#endif /* SFBUF */ + +/* * Options to sf_buf_alloc() are specified through its flags argument. This * argument's value should be the result of a bitwise or'ing of one or more * of the following values. @@ -41,20 +195,6 @@ #define SFB_DEFAULT 0 #define SFB_NOWAIT 4 /* Return NULL if all bufs are used. 
*/ -struct vm_page; - -struct sfstat { /* sendfile statistics */ - uint64_t sf_iocnt; /* times sendfile had to do disk I/O */ - uint64_t sf_allocfail; /* times sfbuf allocation failed */ - uint64_t sf_allocwait; /* times sfbuf allocation had to wait */ -}; - -#ifdef _KERNEL -#include <machine/sf_buf.h> -#include <sys/systm.h> -#include <sys/counter.h> -struct mbuf; /* for sf_buf_mext() */ - extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; #define SFSTAT_ADD(name, val) \ counter_u64_add(sfstat[offsetof(struct sfstat, name) / sizeof(uint64_t)],\ @@ -61,7 +201,4 @@ (val)) #define SFSTAT_INC(name) SFSTAT_ADD(name, 1) #endif /* _KERNEL */ - -int sf_buf_mext(struct mbuf *mb, void *addr, void *args); - #endif /* !_SYS_SF_BUF_H_ */ Modified: trunk/sys/sys/sglist.h =================================================================== --- trunk/sys/sys/sglist.h 2020-02-09 18:28:48 UTC (rev 12359) +++ trunk/sys/sys/sglist.h 2020-02-09 18:30:15 UTC (rev 12360) @@ -28,7 +28,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sglist.h 260856 2014-01-18 18:36:41Z bryanv $ + * $FreeBSD: stable/11/sys/sys/sglist.h 345039 2019-03-11 22:48:51Z jhb $ */ /* @@ -89,13 +89,18 @@ int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0); int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len); +int sglist_append_sglist(struct sglist *sg, struct sglist *source, + size_t offset, size_t length); int sglist_append_uio(struct sglist *sg, struct uio *uio); int sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td); +int sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff, + size_t len); struct sglist *sglist_build(void *buf, size_t len, int mflags); struct sglist *sglist_clone(struct sglist *sg, int mflags); int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid); int sglist_count(void *buf, size_t len); +int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len); void sglist_free(struct sglist *sg); int sglist_join(struct sglist *first, struct sglist *second); size_t sglist_length(struct sglist *sg); From laffer1 at midnightbsd.org Sun Feb 9 13:31:04 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:31:04 -0500 (EST) Subject: [Midnightbsd-cvs] src [12361] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002091831.019IV4Du091736@stargazer.midnightbsd.org> Revision: 12361 http://svnweb.midnightbsd.org/src/?rev=12361 Author: laffer1 Date: 2020-02-09 13:31:04 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sdt.h trunk/sys/sys/seq.h Modified: trunk/sys/sys/sdt.h =================================================================== --- trunk/sys/sys/sdt.h 2020-02-09 18:30:15 UTC (rev 12360) +++ trunk/sys/sys/sdt.h 2020-02-09 18:31:04 UTC (rev 12361) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/sdt.h 289795 2015-10-23 07:37:44Z avg $ + * $FreeBSD: stable/11/sys/sys/sdt.h 331722 2018-03-29 02:50:57Z eadler $ * * Statically Defined Tracing (SDT) definitions. 
* @@ -81,6 +81,8 @@ #include <sys/cdefs.h> #include <sys/linker_set.h> +extern volatile bool sdt_probes_enabled; + #ifndef KDTRACE_HOOKS #define SDT_PROVIDER_DEFINE(prov) @@ -162,10 +164,12 @@ extern struct sdt_probe sdt_##prov##_##mod##_##func##_##name[1] #define SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) do { \ - if (sdt_##prov##_##mod##_##func##_##name->id) \ + if (__predict_false(sdt_probes_enabled)) { \ + if (__predict_false(sdt_##prov##_##mod##_##func##_##name->id)) \ (*sdt_probe_func)(sdt_##prov##_##mod##_##func##_##name->id, \ (uintptr_t) arg0, (uintptr_t) arg1, (uintptr_t) arg2, \ (uintptr_t) arg3, (uintptr_t) arg4); \ + } \ } while (0) #define SDT_PROBE_ARGTYPE(prov, mod, func, name, num, type, xtype) \ Modified: trunk/sys/sys/seq.h =================================================================== --- trunk/sys/sys/seq.h 2020-02-09 18:30:15 UTC (rev 12360) +++ trunk/sys/sys/seq.h 2020-02-09 18:31:04 UTC (rev 12361) @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/seq.h 273109 2014-10-14 21:19:23Z mjg $ + * $FreeBSD: stable/11/sys/sys/seq.h 312714 2017-01-24 19:39:24Z mjg $ */ #ifndef _SYS_SEQ_H_ @@ -60,7 +60,6 @@ * lobj = gobj; * if (seq_consistent(&gobj->seq, seq)) * break; - * cpu_spinwait(); * } * foo(lobj); */ @@ -70,25 +69,6 @@ #include <machine/cpu.h> -/* - * This is a temporary hack until memory barriers are cleaned up. - * - * atomic_load_acq_int at least on amd64 provides a full memory barrier, - * in a way which affects perforance. - * - * Hack below covers all architectures and avoids most of the penalty at least - * on amd64. - */ -static __inline int -atomic_load_acq_rmb_int(volatile u_int *p) -{ - volatile u_int v; - - v = *p; - atomic_load_acq_int(&v); - return (v); -} - static __inline bool seq_in_modify(seq_t seqp) { @@ -101,7 +81,8 @@ { MPASS(!seq_in_modify(*seqp)); - atomic_add_acq_int(seqp, 1); + *seqp += 1; + atomic_thread_fence_rel(); } static __inline void @@ -108,17 +89,17 @@ seq_write_end(seq_t *seqp) { - atomic_add_rel_int(seqp, 1); + atomic_store_rel_int(seqp, *seqp + 1); MPASS(!seq_in_modify(*seqp)); } static __inline seq_t -seq_read(seq_t *seqp) +seq_read(const seq_t *seqp) { seq_t ret; for (;;) { - ret = atomic_load_acq_rmb_int(seqp); + ret = atomic_load_acq_int(__DECONST(seq_t *, seqp)); if (seq_in_modify(ret)) { cpu_spinwait(); continue; @@ -130,17 +111,18 @@ } static __inline seq_t -seq_consistent(seq_t *seqp, seq_t oldseq) +seq_consistent_nomb(const seq_t *seqp, seq_t oldseq) { - return (atomic_load_acq_rmb_int(seqp) == oldseq); + return (*seqp == oldseq); } static __inline seq_t -seq_consistent_nomb(seq_t *seqp, seq_t oldseq) +seq_consistent(const seq_t *seqp, seq_t oldseq) { - return (*seqp == oldseq); + atomic_thread_fence_acq(); + return (seq_consistent_nomb(seqp, oldseq)); } #endif /* _KERNEL */ From laffer1 at midnightbsd.org Sun Feb 9 13:33:36 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:33:36 -0500 (EST) Subject: [Midnightbsd-cvs] src [12362] trunk/sys/sys/param.h: sync with FreeBSD 11-stable Message-ID: <202002091833.019IXa5V091877@stargazer.midnightbsd.org> Revision: 12362 http://svnweb.midnightbsd.org/src/?rev=12362 Author: laffer1 Date: 2020-02-09 13:33:35 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/param.h Modified: trunk/sys/sys/param.h 
=================================================================== --- trunk/sys/sys/param.h 2020-02-09 18:31:04 UTC (rev 12361) +++ trunk/sys/sys/param.h 2020-02-09 18:33:35 UTC (rev 12362) @@ -55,7 +55,7 @@ /* Version of FreeBSD we're compatible with */ #undef __FreeBSD_version -#define __FreeBSD_version 1004501 /* Master, propagated to newvers */ +#define __FreeBSD_version 1103507 /* Master, propagated to newvers */ /* * It is tempting to use this macro in userland code when we want to enable @@ -70,12 +70,17 @@ #define __MidnightBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) -#define P_OSREL_SIGWAIT 700000 -#define P_OSREL_SIGSEGV 700004 -#define P_OSREL_MAP_ANON 800104 +#define P_OSREL_SIGWAIT 700000 +#define P_OSREL_SIGSEGV 700004 +#define P_OSREL_MAP_ANON 800104 +#define P_OSREL_MAP_FSTRICT 1100036 +#define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 +#define P_OSREL_MAP_GUARD_11 1101501 +#define P_OSREL_WRFSBASE 1200041 +#define P_OSREL_WRFSBASE_11 1101503 -#define P_OSREL_MAJOR(x) ((x) / 100000) +#define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE From laffer1 at midnightbsd.org Sun Feb 9 13:34:07 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:34:07 -0500 (EST) Subject: [Midnightbsd-cvs] src [12363] trunk/sys/sys/select.h: sync with FreeBSD 11-stable Message-ID: <202002091834.019IY7Jm091927@stargazer.midnightbsd.org> Revision: 12363 http://svnweb.midnightbsd.org/src/?rev=12363 Author: laffer1 Date: 2020-02-09 13:34:06 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/select.h Modified: trunk/sys/sys/select.h =================================================================== --- trunk/sys/sys/select.h 2020-02-09 18:33:35 UTC (rev 12362) +++ trunk/sys/sys/select.h 2020-02-09 18:34:06 UTC (rev 12363) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/select.h 154090 2006-01-06 22:12:46Z marcel $ + * $FreeBSD: stable/11/sys/sys/select.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS_SELECT_H_ @@ -57,7 +57,7 @@ * be enough for most uses. */ #ifndef FD_SETSIZE -#define FD_SETSIZE 1024U +#define FD_SETSIZE 1024 #endif #define _NFDBITS (sizeof(__fd_mask) * 8) /* bits per mask */ From laffer1 at midnightbsd.org Sun Feb 9 13:34:17 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:34:17 -0500 (EST) Subject: [Midnightbsd-cvs] src [12364] trunk/sys/sys/sched.h: sync with FreeBSD 11-stable Message-ID: <202002091834.019IYHlM091974@stargazer.midnightbsd.org> Revision: 12364 http://svnweb.midnightbsd.org/src/?rev=12364 Author: laffer1 Date: 2020-02-09 13:34:16 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/sched.h Modified: trunk/sys/sys/sched.h =================================================================== --- trunk/sys/sys/sched.h 2020-02-09 18:34:06 UTC (rev 12363) +++ trunk/sys/sys/sched.h 2020-02-09 18:34:16 UTC (rev 12364) @@ -57,7 +57,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* - * $FreeBSD: stable/10/sys/sys/sched.h 253604 2013-07-24 09:45:31Z avg $ + * $FreeBSD: stable/11/sys/sys/sched.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SCHED_H_ @@ -91,6 +91,7 @@ * priorities inherited from their procs, and use up cpu time. */ void sched_exit_thread(struct thread *td, struct thread *child); +u_int sched_estcpu(struct thread *td); void sched_fork_thread(struct thread *td, struct thread *child); void sched_lend_prio(struct thread *td, u_char prio); void sched_lend_user_prio(struct thread *td, u_char pri); @@ -103,7 +104,6 @@ void sched_user_prio(struct thread *td, u_char prio); void sched_userret(struct thread *td); void sched_wakeup(struct thread *td); -void sched_preempt(struct thread *td); #ifdef RACCT #ifdef SCHED_4BSD fixpt_t sched_pctcpu_delta(struct thread *td); @@ -115,8 +115,8 @@ */ void sched_add(struct thread *td, int flags); void sched_clock(struct thread *td); +void sched_preempt(struct thread *td); void sched_rem(struct thread *td); -void sched_tick(int cnt); void sched_relinquish(struct thread *td); struct thread *sched_choose(void); void sched_idletd(void *); @@ -223,6 +223,7 @@ */ #ifndef _KERNEL #include <sys/cdefs.h> +#include <sys/_timespec.h> #include <sys/_types.h> #ifndef _PID_T_DECLARED @@ -230,8 +231,6 @@ #define _PID_T_DECLARED #endif -struct timespec; - __BEGIN_DECLS int sched_get_priority_max(int); int sched_get_priority_min(int); From laffer1 at midnightbsd.org Sun Feb 9 13:34:37 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:34:37 -0500 (EST) Subject: [Midnightbsd-cvs] src [12365] trunk/sys/sys/rwlock.h: sync with FreeBSD 11-stable Message-ID: <202002091834.019IYbF3092027@stargazer.midnightbsd.org> Revision: 12365 http://svnweb.midnightbsd.org/src/?rev=12365 Author: laffer1 Date: 2020-02-09 13:34:36 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/rwlock.h Modified: trunk/sys/sys/rwlock.h =================================================================== --- trunk/sys/sys/rwlock.h 2020-02-09 18:34:16 UTC (rev 12364) +++ trunk/sys/sys/rwlock.h 2020-02-09 18:34:36 UTC (rev 12365) @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/rwlock.h 323870 2017-09-21 19:24:11Z marius $ + * $FreeBSD: stable/11/sys/sys/rwlock.h 343420 2019-01-25 11:01:11Z kib $ */ #ifndef _SYS_RWLOCK_H_ @@ -59,13 +59,14 @@ #define RW_LOCK_READ_WAITERS 0x02 #define RW_LOCK_WRITE_WAITERS 0x04 #define RW_LOCK_WRITE_SPINNER 0x08 +#define RW_LOCK_WRITER_RECURSED 0x10 #define RW_LOCK_FLAGMASK \ (RW_LOCK_READ | RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS | \ - RW_LOCK_WRITE_SPINNER) + RW_LOCK_WRITE_SPINNER | RW_LOCK_WRITER_RECURSED) #define RW_LOCK_WAITERS (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS) #define RW_OWNER(x) ((x) & ~RW_LOCK_FLAGMASK) -#define RW_READERS_SHIFT 4 +#define RW_READERS_SHIFT 5 #define RW_READERS(x) (RW_OWNER((x)) >> RW_READERS_SHIFT) #define RW_READERS_LOCK(x) ((x) << RW_READERS_SHIFT | RW_LOCK_READ) #define RW_ONE_READER (1 << RW_READERS_SHIFT) @@ -77,6 +78,8 @@ #define rw_recurse lock_object.lo_data +#define RW_READ_VALUE(x) ((x)->rw_lock) + /* Very simple operations on rw_lock. */ /* Try to obtain a write lock once. 
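
In the rwlock.h hunk above, the new RW_LOCK_WRITER_RECURSED flag claims bit 0x10, which is why RW_READERS_SHIFT moves from 4 to 5: the reader count now starts one bit higher in the lock word. A standalone userland sketch of the resulting layout, simplified relative to the in-tree macros (which mask the flag bits through RW_OWNER() before shifting):

#include <stdint.h>
#include <stdio.h>

#define RW_LOCK_READ		0x01
#define RW_LOCK_READ_WAITERS	0x02
#define RW_LOCK_WRITE_WAITERS	0x04
#define RW_LOCK_WRITE_SPINNER	0x08
#define RW_LOCK_WRITER_RECURSED	0x10	/* new flag bit */
#define RW_READERS_SHIFT	5	/* reader count sits above the five flag bits */
#define RW_READERS(x)		((x) >> RW_READERS_SHIFT)
#define RW_READERS_LOCK(x)	((x) << RW_READERS_SHIFT | RW_LOCK_READ)
#define RW_ONE_READER		((uintptr_t)1 << RW_READERS_SHIFT)

int
main(void)
{
	uintptr_t v = RW_READERS_LOCK((uintptr_t)3);	/* three readers hold the lock */

	v += RW_ONE_READER;				/* a fourth reader arrives */
	printf("lock word %#jx encodes %ju readers\n",
	    (uintmax_t)v, (uintmax_t)RW_READERS(v));	/* prints 4 */
	return (0);
}
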
*/ @@ -83,10 +86,16 @@ #define _rw_write_lock(rw, tid) \ atomic_cmpset_acq_ptr(&(rw)->rw_lock, RW_UNLOCKED, (tid)) +#define _rw_write_lock_fetch(rw, vp, tid) \ + atomic_fcmpset_acq_ptr(&(rw)->rw_lock, vp, (tid)) + /* Release a write lock quickly if there are no waiters. */ #define _rw_write_unlock(rw, tid) \ atomic_cmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED) +#define _rw_write_unlock_fetch(rw, tid) \ + atomic_fcmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED) + /* * Full lock operations that are suitable to be inlined in non-debug * kernels. If the lock cannot be acquired or released trivially then @@ -96,22 +105,20 @@ /* Acquire a write lock. */ #define __rw_wlock(rw, tid, file, line) do { \ uintptr_t _tid = (uintptr_t)(tid); \ - \ - if ((rw)->rw_lock != RW_UNLOCKED || !_rw_write_lock((rw), _tid))\ - _rw_wlock_hard((rw), _tid, (file), (line)); \ - else \ - LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, \ - rw, 0, 0, (file), (line)); \ + uintptr_t _v = RW_UNLOCKED; \ + \ + if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__acquire) || \ + !_rw_write_lock_fetch((rw), &_v, _tid))) \ + _rw_wlock_hard((rw), _v, (file), (line)); \ } while (0) /* Release a write lock. */ #define __rw_wunlock(rw, tid, file, line) do { \ - uintptr_t _tid = (uintptr_t)(tid); \ + uintptr_t _v = (uintptr_t)(tid); \ \ - if ((rw)->rw_recurse) \ - (rw)->rw_recurse--; \ - else if (!_rw_write_unlock((rw), _tid)) \ - _rw_wunlock_hard((rw), _tid, (file), (line)); \ + if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__release) || \ + !_rw_write_unlock_fetch((rw), &_v))) \ + _rw_wunlock_hard((rw), _v, (file), (line)); \ } while (0) /* @@ -122,19 +129,24 @@ void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts); void _rw_destroy(volatile uintptr_t *c); void rw_sysinit(void *arg); -void rw_sysinit_flags(void *arg); int _rw_wowned(const volatile uintptr_t *c); void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line); +int __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line); void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line); +void __rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); void __rw_rlock(volatile uintptr_t *c, const char *file, int line); +int __rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line); +void _rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line); -void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, - int line); -void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, - const char *file, int line); +void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v + LOCK_FILE_LINE_ARG_DEF); +void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v + LOCK_FILE_LINE_ARG_DEF); +int __rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line); +void __rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); void __rw_downgrade(volatile uintptr_t *c, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void __rw_assert(const volatile uintptr_t *c, int what, const char *file, @@ -160,20 +172,38 @@ __rw_try_wlock(&(rw)->rw_lock, f, l) #define _rw_wunlock(rw, f, l) \ _rw_wunlock_cookie(&(rw)->rw_lock, f, l) +#define _rw_try_rlock(rw, f, l) \ + __rw_try_rlock(&(rw)->rw_lock, 
f, l) +#if LOCK_DEBUG > 0 #define _rw_rlock(rw, f, l) \ __rw_rlock(&(rw)->rw_lock, f, l) -#define _rw_try_rlock(rw, f, l) \ - __rw_try_rlock(&(rw)->rw_lock, f, l) #define _rw_runlock(rw, f, l) \ _rw_runlock_cookie(&(rw)->rw_lock, f, l) -#define _rw_wlock_hard(rw, t, f, l) \ - __rw_wlock_hard(&(rw)->rw_lock, t, f, l) -#define _rw_wunlock_hard(rw, t, f, l) \ - __rw_wunlock_hard(&(rw)->rw_lock, t, f, l) +#else +#define _rw_rlock(rw, f, l) \ + __rw_rlock_int((struct rwlock *)rw) +#define _rw_runlock(rw, f, l) \ + _rw_runlock_cookie_int((struct rwlock *)rw) +#endif +#if LOCK_DEBUG > 0 +#define _rw_wlock_hard(rw, v, f, l) \ + __rw_wlock_hard(&(rw)->rw_lock, v, f, l) +#define _rw_wunlock_hard(rw, v, f, l) \ + __rw_wunlock_hard(&(rw)->rw_lock, v, f, l) #define _rw_try_upgrade(rw, f, l) \ __rw_try_upgrade(&(rw)->rw_lock, f, l) #define _rw_downgrade(rw, f, l) \ __rw_downgrade(&(rw)->rw_lock, f, l) +#else +#define _rw_wlock_hard(rw, v, f, l) \ + __rw_wlock_hard(&(rw)->rw_lock, v) +#define _rw_wunlock_hard(rw, v, f, l) \ + __rw_wunlock_hard(&(rw)->rw_lock, v) +#define _rw_try_upgrade(rw, f, l) \ + __rw_try_upgrade_int(rw) +#define _rw_downgrade(rw, f, l) \ + __rw_downgrade_int(rw) +#endif #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) #define _rw_assert(rw, w, f, l) \ __rw_assert(&(rw)->rw_lock, w, f, l) @@ -212,23 +242,19 @@ _sleep((chan), &(rw)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) -#define rw_initialized(rw) lock_initalized(&(rw)->lock_object) +#define rw_initialized(rw) lock_initialized(&(rw)->lock_object) struct rw_args { void *ra_rw; const char *ra_desc; -}; - -struct rw_args_flags { - void *ra_rw; - const char *ra_desc; int ra_flags; }; -#define RW_SYSINIT(name, rw, desc) \ +#define RW_SYSINIT_FLAGS(name, rw, desc, flags) \ static struct rw_args name##_args = { \ (rw), \ (desc), \ + (flags), \ }; \ SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ rw_sysinit, &name##_args); \ @@ -235,18 +261,8 @@ SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock)) +#define RW_SYSINIT(name, rw, desc) RW_SYSINIT_FLAGS(name, rw, desc, 0) -#define RW_SYSINIT_FLAGS(name, rw, desc, flags) \ - static struct rw_args_flags name##_args = { \ - (rw), \ - (desc), \ - (flags), \ - }; \ - SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ - rw_sysinit_flags, &name##_args); \ - SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ - _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock)) - /* * Options passed to rw_init_flags(). */ From laffer1 at midnightbsd.org Sun Feb 9 13:38:04 2020 From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org) Date: Sun, 9 Feb 2020 13:38:04 -0500 (EST) Subject: [Midnightbsd-cvs] src [12366] trunk/sys/sys: sync with FreeBSD 11-stable Message-ID: <202002091838.019Ic4Un092848@stargazer.midnightbsd.org> Revision: 12366 http://svnweb.midnightbsd.org/src/?rev=12366 Author: laffer1 Date: 2020-02-09 13:38:04 -0500 (Sun, 09 Feb 2020) Log Message: ----------- sync with FreeBSD 11-stable Modified Paths: -------------- trunk/sys/sys/_bitset.h trunk/sys/sys/rman.h trunk/sys/sys/rmlock.h Modified: trunk/sys/sys/_bitset.h =================================================================== --- trunk/sys/sys/_bitset.h 2020-02-09 18:34:36 UTC (rev 12365) +++ trunk/sys/sys/_bitset.h 2020-02-09 18:38:04 UTC (rev 12366) @@ -27,7 +27,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
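
The tail of the rwlock.h diff above folds RW_SYSINIT() into RW_SYSINIT_FLAGS() by giving struct rw_args its own ra_flags member, mirroring the RM_SYSINIT() consolidation in the rmlock.h hunk further down. A usage sketch from a hypothetical kernel source file; the lock names and description strings are made up:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

/* Plain reader/writer lock, initialized at SI_SUB_LOCK time. */
static struct rwlock foo_list_lock;
RW_SYSINIT(foo_list_lock_init, &foo_list_lock, "foo list");

/* Same thing, but passing rw_init_flags() options explicitly. */
static struct rwlock bar_cfg_lock;
RW_SYSINIT_FLAGS(bar_cfg_lock_init, &bar_cfg_lock, "bar config", RW_RECURSE);
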
* - * $FreeBSD: stable/10/sys/sys/_bitset.h 290739 2015-11-13 01:37:08Z markj $ + * $FreeBSD: stable/11/sys/sys/_bitset.h 331722 2018-03-29 02:50:57Z eadler $ */ #ifndef _SYS__BITSET_H_ @@ -37,26 +37,23 @@ * Macros addressing word and bit within it, tuned to make compiler * optimize cases when SETSIZE fits into single machine word. */ -#define _BITSET_BITS (sizeof(long) * NBBY) +#define _BITSET_BITS (sizeof(long) * 8) -#define __bitset_words(_s) (howmany(_s, _BITSET_BITS)) +#define __howmany(x, y) (((x) + ((y) - 1)) / (y)) -#define __bitset_mask(_s, n) \ - (1L << ((__bitset_words((_s)) == 1) ? \ - (__size_t)(n) : ((n) % _BITSET_BITS))) +#define __bitset_words(_s) (__howmany(_s, _BITSET_BITS)) -#define __bitset_word(_s, n) \ - ((__bitset_words((_s)) == 1) ? 0 : ((n) / _BITSET_BITS)) - #define BITSET_DEFINE(t, _s) \ struct t { \ long __bits[__bitset_words((_s))]; \ } -#define BITSET_T_INITIALIZER(x) \ - { .__bits = { x } } +/* + * Helper to declare a bitset without it's size being a constant. + * + * Sadly we cannot declare a bitset struct with '__bits[]', because it's + * the only member of the struct and the compiler complains. + */ +#define BITSET_DEFINE_VAR(t) BITSET_DEFINE(t, 1) -#define BITSET_FSET(n) \ - [ 0 ... ((n) - 1) ] = (-1L) - #endif /* !_SYS__BITSET_H_ */ Modified: trunk/sys/sys/rman.h =================================================================== --- trunk/sys/sys/rman.h 2020-02-09 18:34:36 UTC (rev 12365) +++ trunk/sys/sys/rman.h 2020-02-09 18:38:04 UTC (rev 12366) @@ -27,7 +27,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/rman.h 303254 2016-07-24 04:38:50Z jhb $ + * $FreeBSD: stable/11/sys/sys/rman.h 300317 2016-05-20 17:57:47Z jhb $ */ #ifndef _SYS_RMAN_H_ @@ -48,6 +48,7 @@ #define RF_FIRSTSHARE 0x0020 /* first in sharing list */ #define RF_PREFETCHABLE 0x0040 /* resource is prefetchable */ #define RF_OPTIONAL 0x0080 /* for bus_alloc_resources() */ +#define RF_UNMAPPED 0x0100 /* don't map resource when activating */ #define RF_ALIGNMENT_SHIFT 10 /* alignment size bit starts bit 10 */ #define RF_ALIGNMENT_MASK (0x003F << RF_ALIGNMENT_SHIFT) @@ -62,6 +63,10 @@ */ #define RM_TEXTLEN 32 +#define RM_MAX_END (~(rman_res_t)0) + +#define RMAN_IS_DEFAULT_RANGE(s,e) ((s) == 0 && (e) == RM_MAX_END) + /* * Userspace-exported structures. 
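
The _bitset.h hunk above swaps howmany() for a private __howmany() and adds BITSET_DEFINE_VAR() for bitsets whose size is only known at run time. A quick illustration of what the macros produce; the type names and the size 128 are arbitrary, and the word counts assume 64-bit longs:

#include <sys/bitset.h>		/* pulls in sys/_bitset.h */

/*
 * BITSET_DEFINE(foo_set, 128) expands to:
 *
 *	struct foo_set {
 *		long __bits[__bitset_words(128)];
 *	};
 *
 * where __bitset_words(128) == __howmany(128, 64) == 2, so the set
 * occupies two machine words.
 */
BITSET_DEFINE(foo_set, 128);

/* One-word placeholder type for a set whose real size is chosen at runtime. */
BITSET_DEFINE_VAR(foo_varset);
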
*/ @@ -71,8 +76,8 @@ uintptr_t r_device; /* device owning this resource */ char r_devname[RM_TEXTLEN]; /* device name XXX obsolete */ - u_long r_start; /* offset in resource space */ - u_long r_size; /* size in resource space */ + rman_res_t r_start; /* offset in resource space */ + rman_res_t r_size; /* size in resource space */ u_int r_flags; /* RF_* flags */ }; @@ -80,8 +85,8 @@ uintptr_t rm_handle; /* rman uniquifier */ char rm_descr[RM_TEXTLEN]; /* rman description */ - u_long rm_start; /* base of managed region */ - u_long rm_size; /* size of managed region */ + rman_res_t rm_start; /* base of managed region */ + rman_res_t rm_size; /* size of managed region */ enum rman_type rm_type; /* region type */ }; @@ -102,6 +107,7 @@ }; struct resource_i; +struct resource_map; TAILQ_HEAD(resource_head, resource_i); @@ -109,8 +115,8 @@ struct resource_head rm_list; struct mtx *rm_mtx; /* mutex used to protect rm_list */ TAILQ_ENTRY(rman) rm_link; /* link in list of all rmans */ - u_long rm_start; /* index of globally first entry */ - u_long rm_end; /* index of globally last entry */ + rman_res_t rm_start; /* index of globally first entry */ + rman_res_t rm_end; /* index of globally last entry */ enum rman_type rm_type; /* what type of resource this is */ const char *rm_descr; /* text descripion of this resource */ }; @@ -117,38 +123,40 @@ TAILQ_HEAD(rman_head, rman); int rman_activate_resource(struct resource *r); -int rman_adjust_resource(struct resource *r, u_long start, u_long end); -int rman_first_free_region(struct rman *rm, u_long *start, u_long *end); +int rman_adjust_resource(struct resource *r, rman_res_t start, rman_res_t end); +int rman_first_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end); bus_space_handle_t rman_get_bushandle(struct resource *); bus_space_tag_t rman_get_bustag(struct resource *); -u_long rman_get_end(struct resource *); +rman_res_t rman_get_end(struct resource *); struct device *rman_get_device(struct resource *); u_int rman_get_flags(struct resource *); +void rman_get_mapping(struct resource *, struct resource_map *); int rman_get_rid(struct resource *); -u_long rman_get_size(struct resource *); -u_long rman_get_start(struct resource *); +rman_res_t rman_get_size(struct resource *); +rman_res_t rman_get_start(struct resource *); void *rman_get_virtual(struct resource *); int rman_deactivate_resource(struct resource *r); int rman_fini(struct rman *rm); int rman_init(struct rman *rm); int rman_init_from_resource(struct rman *rm, struct resource *r); -int rman_last_free_region(struct rman *rm, u_long *start, u_long *end); +int rman_last_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end); uint32_t rman_make_alignment_flags(uint32_t size); -int rman_manage_region(struct rman *rm, u_long start, u_long end); +int rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end); int rman_is_region_manager(struct resource *r, struct rman *rm); int rman_release_resource(struct resource *r); -struct resource *rman_reserve_resource(struct rman *rm, u_long start, - u_long end, u_long count, +struct resource *rman_reserve_resource(struct rman *rm, rman_res_t start, + rman_res_t end, rman_res_t count, u_int flags, struct device *dev); -struct resource *rman_reserve_resource_bound(struct rman *rm, u_long start, - u_long end, u_long count, u_long bound, +struct resource *rman_reserve_resource_bound(struct rman *rm, rman_res_t start, + rman_res_t end, rman_res_t count, rman_res_t bound, u_int flags, struct device *dev); void 
rman_set_bushandle(struct resource *_r, bus_space_handle_t _h); void rman_set_bustag(struct resource *_r, bus_space_tag_t _t); void rman_set_device(struct resource *_r, struct device *_dev); -void rman_set_end(struct resource *_r, u_long _end); +void rman_set_end(struct resource *_r, rman_res_t _end); +void rman_set_mapping(struct resource *, struct resource_map *); void rman_set_rid(struct resource *_r, int _rid); -void rman_set_start(struct resource *_r, u_long _start); +void rman_set_start(struct resource *_r, rman_res_t _start); void rman_set_virtual(struct resource *_r, void *_v); extern struct rman_head rman_head; Modified: trunk/sys/sys/rmlock.h =================================================================== --- trunk/sys/sys/rmlock.h 2020-02-09 18:34:36 UTC (rev 12365) +++ trunk/sys/sys/rmlock.h 2020-02-09 18:38:04 UTC (rev 12366) @@ -27,7 +27,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: stable/10/sys/sys/rmlock.h 323870 2017-09-21 19:24:11Z marius $ + * $FreeBSD: stable/11/sys/sys/rmlock.h 343420 2019-01-25 11:01:11Z kib $ */ #ifndef _SYS_RMLOCK_H_ @@ -53,7 +53,6 @@ void rm_destroy(struct rmlock *rm); int rm_wowned(const struct rmlock *rm); void rm_sysinit(void *arg); -void rm_sysinit_flags(void *arg); void _rm_wlock_debug(struct rmlock *rm, const char *file, int line); void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line); @@ -102,18 +101,14 @@ struct rm_args { struct rmlock *ra_rm; const char *ra_desc; + int ra_flags; }; -struct rm_args_flags { - struct rmlock *ra_rm; - const char *ra_desc; - int ra_opts; -}; - -#define RM_SYSINIT(name, rm, desc) \ +#define RM_SYSINIT_FLAGS(name, rm, desc, flags) \ static struct rm_args name##_args = { \ (rm), \ (desc), \ + (flags), \ }; \ SYSINIT(name##_rm_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ rm_sysinit, &name##_args); \ @@ -120,18 +115,8 @@ SYSUNINIT(name##_rm_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ rm_destroy, (rm)) +#define RM_SYSINIT(name, rm, desc) RM_SYSINIT_FLAGS(name, rm, desc, 0) -#define RM_SYSINIT_FLAGS(name, rm, desc, opts) \ - static struct rm_args name##_args = { \ - (rm), \ - (desc), \ - (opts), \ - }; \ - SYSINIT(name##_rm_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ - rm_sysinit_flags, &name##_args); \ - SYSUNINIT(name##_rm_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ - rm_destroy, (rm)) - #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) #define RA_LOCKED LA_LOCKED #define RA_RLOCKED LA_SLOCKED
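
The rmlock.h hunk above gives RM_SYSINIT() the same flags-folding treatment as RW_SYSINIT() earlier, retiring the separate rm_sysinit_flags() initializer. For completeness, a sketch of typical rmlock usage with a statically-initialized lock; the names and the protected variable are hypothetical:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock foo_cfg_lock;
RM_SYSINIT(foo_cfg_lock_init, &foo_cfg_lock, "foo config");

static int foo_cfg_value;

/* Read path: cheap and frequent; the tracker lives on the caller's stack. */
static int
foo_cfg_get(void)
{
	struct rm_priotracker tracker;
	int v;

	rm_rlock(&foo_cfg_lock, &tracker);
	v = foo_cfg_value;
	rm_runlock(&foo_cfg_lock, &tracker);
	return (v);
}

/* Write path: rare; takes the lock exclusively. */
static void
foo_cfg_set(int v)
{
	rm_wlock(&foo_cfg_lock);
	foo_cfg_value = v;
	rm_wunlock(&foo_cfg_lock);
}
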