[Midnightbsd-cvs] src [8963] trunk/sys: merge in TOE update from FreeBSD 252555.

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Tue Sep 27 12:43:00 EDT 2016


Revision: 8963
          http://svnweb.midnightbsd.org/src/?rev=8963
Author:   laffer1
Date:     2016-09-27 12:42:59 -0400 (Tue, 27 Sep 2016)
Log Message:
-----------
merge in TOE update from FreeBSD 252555.

Modified Paths:
--------------
    trunk/sys/conf/NOTES
    trunk/sys/contrib/rdma/krping/krping.c
    trunk/sys/contrib/rdma/krping/krping.h
    trunk/sys/contrib/rdma/krping/krping_dev.c
    trunk/sys/contrib/rdma/rdma_addr.c
    trunk/sys/contrib/rdma/rdma_cache.c
    trunk/sys/modules/rdma/krping/Makefile
    trunk/sys/modules/toecore/Makefile
    trunk/sys/net/if_llatbl.h
    trunk/sys/net/if_vlan.c
    trunk/sys/netinet/if_ether.c
    trunk/sys/netinet/if_ether.h
    trunk/sys/netinet/tcp_input.c
    trunk/sys/netinet/tcp_offload.h
    trunk/sys/netinet/tcp_output.c
    trunk/sys/netinet/tcp_subr.c
    trunk/sys/netinet/tcp_syncache.c
    trunk/sys/netinet/tcp_syncache.h
    trunk/sys/netinet/tcp_timer.c
    trunk/sys/netinet/tcp_usrreq.c
    trunk/sys/netinet/tcp_var.h
    trunk/sys/netinet/toecore.c
    trunk/sys/netinet/toecore.h
    trunk/sys/netinet6/nd6.c
    trunk/sys/netinet6/nd6_nbr.c

Modified: trunk/sys/conf/NOTES
===================================================================
--- trunk/sys/conf/NOTES	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/conf/NOTES	2016-09-27 16:42:59 UTC (rev 8963)
@@ -548,6 +548,8 @@
 
 options 	TCP_OFFLOAD		# TCP offload support.
 
+options 	TCP_OFFLOAD		# TCP offload support.
+
 # In order to enable IPSEC you MUST also add device crypto to 
 # your kernel configuration
 options 	IPSEC			#IP security (requires device crypto)

Modified: trunk/sys/contrib/rdma/krping/krping.c
===================================================================
--- trunk/sys/contrib/rdma/krping/krping.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/contrib/rdma/krping/krping.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -41,7 +41,6 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
-#include <sys/module.h>
 #include <sys/endian.h>
 #include <sys/limits.h>
 #include <sys/proc.h>
@@ -53,11 +52,13 @@
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/syslog.h>
+#include <netinet/in.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
-#include <contrib/rdma/rdma_cm.h>
+#include <linux/types.h>
+#include <rdma/rdma_cm.h>
 
 #include "getopt.h"
 #include "krping.h"
@@ -83,6 +84,7 @@
 	{"bw", OPT_NOPARAM, 'B'},
 	{"tx-depth", OPT_INT, 't'},
   	{"poll", OPT_NOPARAM, 'P'},
+  	{"memlimit", OPT_INT, 'm'},
 	{NULL, 0, 0}
 };
 
@@ -254,10 +256,14 @@
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
 		if (wc.status) {
-			if (wc.status != IB_WC_WR_FLUSH_ERR)
-				log(LOG_ERR, "cq completion failed status %d\n",
+			if (wc.status == IB_WC_WR_FLUSH_ERR) {
+				DEBUG_LOG("cq flushed\n");
+				continue;
+			} else {
+				log(LOG_CRIT, "cq completion failed status %d\n",
 					wc.status);
-			goto error;
+				goto error;
+			}
 		}
 
 		switch (wc.opcode) {
@@ -432,8 +438,17 @@
 		}
 	}
 
-	cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
-		PAGE_SIZE, 0);
+	/* RNIC adapters have a limit up to which they can register physical
+	 * memory.  If DMA-MR memory mode is set, the driver normally registers the
+	 * maximum supported memory.  If contigmalloc then allocates memory beyond
+	 * the specified RNIC limit, Krping may not work.
+	 */
+	if (cb->use_dmamr && cb->memlimit)
+		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit,
+					    PAGE_SIZE, 0);
+	else 
+		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
+					    PAGE_SIZE, 0);
 
 	if (!cb->rdma_buf) {
 		log(LOG_ERR, "rdma_buf malloc failed\n");
@@ -458,8 +473,12 @@
 	}
 
 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
-		cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
-			0, -1UL, PAGE_SIZE, 0);
+		if (cb->use_dmamr && cb->memlimit)
+			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
+						     0, cb->memlimit, PAGE_SIZE, 0);
+		else
+			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
+						     0, -1UL, PAGE_SIZE, 0);
 		if (!cb->start_buf) {
 			log(LOG_ERR, "start_buf malloc failed\n");
 			ret = ENOMEM;
@@ -1636,6 +1655,8 @@
 	cb->state = IDLE;
 	cb->size = 64;
 	cb->txdepth = RPING_SQ_DEPTH;
+	cb->use_dmamr = 1;
+	cb->memlimit = 0;
 	mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
 
 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
@@ -1713,6 +1734,15 @@
 		case 'd':
 			debug++;
 			break;
+		case 'm':
+                        cb->memlimit = optint;
+                        if (cb->memlimit < 1) {
+                                log(LOG_ERR, "Invalid memory limit %ju\n",
+				    cb->memlimit);
+                                ret = EINVAL;
+                        } else
+                                DEBUG_LOG(PFX "memory limit %d\n", (int)optint);
+                        break;
 		default:
 			log(LOG_ERR, "unknown opt %s\n", optarg);
 			ret = EINVAL;

Modified: trunk/sys/contrib/rdma/krping/krping.h
===================================================================
--- trunk/sys/contrib/rdma/krping/krping.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/contrib/rdma/krping/krping.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -1,7 +1,7 @@
 /*
  * $FreeBSD$
  */
-#include <contrib/rdma/ib_verbs.h>
+#include <rdma/ib_verbs.h>
 #include <netinet/in.h>
 
 /*
@@ -92,6 +92,8 @@
 	int count;			/* ping count */
 	int size;			/* ping data size */
 	int validate;			/* validate ping data */
+	uint64_t memlimit;		/* limit of the physical memory that
+					   can be registered with dma_mr mode */
 
 	/* CM stuff */
 	struct rdma_cm_id *cm_id;	/* connection on client side,*/

Modified: trunk/sys/contrib/rdma/krping/krping_dev.c
===================================================================
--- trunk/sys/contrib/rdma/krping/krping_dev.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/contrib/rdma/krping/krping_dev.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -14,7 +14,6 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
-#include <sys/module.h>
 #include <sys/systm.h>  /* uprintf */
 #include <sys/errno.h>
 #include <sys/param.h>  /* defines used in kernel.h */
@@ -51,6 +50,9 @@
 /* vars */
 static struct cdev *krping_dev;
 
+#undef MODULE_VERSION
+#include <sys/module.h>
+
 static int
 krping_loader(struct module *m, int what, void *arg)
 {
@@ -175,6 +177,4 @@
 	return(err);
 }
 
-MODULE_DEPEND(krping, rdma_core, 1, 1, 1);
-MODULE_DEPEND(krping, rdma_cma, 1, 1, 1);
 DEV_MODULE(krping,krping_loader,NULL);

Modified: trunk/sys/contrib/rdma/rdma_addr.c
===================================================================
--- trunk/sys/contrib/rdma/rdma_addr.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/contrib/rdma/rdma_addr.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -117,7 +117,8 @@
 		     const unsigned char *dst_dev_addr)
 {
 	dev_addr->dev_type = RDMA_NODE_RNIC;
-	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), MAX_ADDR_LEN);
+	memset(dev_addr->src_dev_addr, 0, MAX_ADDR_LEN);
+	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
 	memcpy(dev_addr->broadcast, dev->if_broadcastaddr, MAX_ADDR_LEN);
 	if (dst_dev_addr)
 		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
@@ -207,7 +208,7 @@
 		goto put;
 	}
  	ret = arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL, 
-		rt_key(iproute.ro_rt), dmac, &lle);
+		(struct sockaddr *)dst_in, dmac, &lle);
 	if (ret) {
 		goto put;
 	}

Modified: trunk/sys/contrib/rdma/rdma_cache.c
===================================================================
--- trunk/sys/contrib/rdma/rdma_cache.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/contrib/rdma/rdma_cache.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -132,7 +132,7 @@
 	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
 		cache = device->cache.gid_cache[p];
 		for (i = 0; i < cache->table_len; ++i) {
-			if (!memcmp(gid, &cache->table[i], 6)) { /* XXX */
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
 				*port_num = p + start_port(device);
 				if (index)
 					*index = i;

Modified: trunk/sys/modules/rdma/krping/Makefile
===================================================================
--- trunk/sys/modules/rdma/krping/Makefile	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/modules/rdma/krping/Makefile	2016-09-27 16:42:59 UTC (rev 8963)
@@ -6,5 +6,7 @@
 KMOD= krping
 SRCS= krping.c krping_dev.c getopt.c
 SRCS+=  bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h
+SRCS+=  vnode_if.h
+CFLAGS+= -I${.CURDIR}/../../../ofed/include 
 
 .include <bsd.kmod.mk>

Modified: trunk/sys/modules/toecore/Makefile
===================================================================
--- trunk/sys/modules/toecore/Makefile	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/modules/toecore/Makefile	2016-09-27 16:42:59 UTC (rev 8963)
@@ -1,3 +1,12 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../netinet
+
+KMOD=	toecore
+SRCS=	toecore.c
+SRCS+=	opt_ofed.h
+
+.include <bsd.kmod.mk>
 # $MidnightBSD$
 # $FreeBSD: release/9.2.0/sys/modules/toecore/Makefile 237263 2012-06-19 07:34:13Z np $
 

Modified: trunk/sys/net/if_llatbl.h
===================================================================
--- trunk/sys/net/if_llatbl.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/net/if_llatbl.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -205,4 +205,14 @@
 }
 
 int		lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *);
+
+#include <sys/eventhandler.h>
+enum {
+	LLENTRY_RESOLVED,
+	LLENTRY_TIMEDOUT,
+	LLENTRY_DELETED,
+	LLENTRY_EXPIRED,
+};
+typedef void (*lle_event_fn)(void *, struct llentry *, int);
+EVENTHANDLER_DECLARE(lle_event, lle_event_fn);
 #endif  /* _NET_IF_LLATBL_H_ */

Modified: trunk/sys/net/if_vlan.c
===================================================================
--- trunk/sys/net/if_vlan.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/net/if_vlan.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -742,8 +742,8 @@
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_tag_p = NULL;
-		vlan_cookie_p = vlan_cookie;
-		vlan_setcookie_p = vlan_setcookie;
+		vlan_cookie_p = NULL;
+		vlan_setcookie_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCK_DESTROY();
 		if (bootverbose)
@@ -1504,6 +1504,22 @@
 		ifp->if_capenable &= ~(p->if_capenable & IFCAP_TSO);
 		ifp->if_hwassist &= ~(p->if_hwassist & CSUM_TSO);
 	}
+
+	/*
+	 * If the parent interface can offload TCP connections over VLANs then
+	 * propagate its TOE capability to the VLAN interface.
+	 *
+	 * All TOE drivers in the tree today can deal with VLANs.  If this
+	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
+	 * with its own bit.
+	 */
+#define	IFCAP_VLAN_TOE IFCAP_TOE
+	if (p->if_capabilities & IFCAP_VLAN_TOE)
+		ifp->if_capabilities |= p->if_capabilities & IFCAP_TOE;
+	if (p->if_capenable & IFCAP_VLAN_TOE) {
+		TOEDEV(ifp) = TOEDEV(p);
+		ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
+	}
 }
 
 static void

Modified: trunk/sys/netinet/if_ether.c
===================================================================
--- trunk/sys/netinet/if_ether.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/if_ether.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -179,6 +179,16 @@
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
+	if ((lle->la_flags & LLE_DELETED) == 0) {
+		int evt;
+
+		if (lle->la_flags & LLE_VALID)
+			evt = LLENTRY_EXPIRED;
+		else
+			evt = LLENTRY_TIMEDOUT;
+		EVENTHANDLER_INVOKE(lle_event, lle, evt);
+	}
+
 	callout_stop(&lle->la_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
@@ -722,7 +732,7 @@
 		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
 		la->la_flags |= LLE_VALID;
 
-		EVENTHANDLER_INVOKE(arp_update_event, la);
+		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;

Modified: trunk/sys/netinet/if_ether.h
===================================================================
--- trunk/sys/netinet/if_ether.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/if_ether.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -117,11 +117,6 @@
 		    struct llentry **lle);
 void	arp_ifinit(struct ifnet *, struct ifaddr *);
 void	arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *);
-
-#include <sys/eventhandler.h>
-typedef void (*llevent_arp_update_fn)(void *, struct llentry *);
-EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn);
-
 #endif
 
 #endif

Modified: trunk/sys/netinet/tcp_input.c
===================================================================
--- trunk/sys/netinet/tcp_input.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_input.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -105,6 +105,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -1000,6 +1003,14 @@
 		goto dropwithreset;
 	}
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE) {
+		tcp_offload_input(tp, m);
+		m = NULL;	/* consumed by the TOE driver */
+		goto dropunlock;
+	}
+#endif
+
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock but don't hold it.  In this case, attempt to

Modified: trunk/sys/netinet/tcp_offload.h
===================================================================
--- trunk/sys/netinet/tcp_offload.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_offload.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -34,321 +34,15 @@
 #error "no user-serviceable parts inside"
 #endif
 
-/*
- * A driver publishes that it provides offload services
- * by setting IFCAP_TOE in the ifnet. The offload connect
- * will bypass any further work if the interface that a
- * connection would use does not support TCP offload.
- *
- * The TOE API assumes that the tcp offload engine can offload the 
- * the entire connection from set up to teardown, with some provision 
- * being made to allowing the software stack to handle time wait. If
- * the device does not meet these criteria, it is the driver's responsibility
- * to overload the functions that it needs to in tcp_usrreqs and make
- * its own calls to tcp_output if it needs to do so.
- *
- * There is currently no provision for the device advertising the congestion
- * control algorithms it supports as there is currently no API for querying 
- * an operating system for the protocols that it has loaded. This is a desirable
- * future extension.
- *
- *
- *
- * It is assumed that individuals deploying TOE will want connections
- * to be offloaded without software changes so all connections on an
- * interface providing TOE are offloaded unless the SO_NO_OFFLOAD 
- * flag is set on the socket.
- *
- *
- * The toe_usrreqs structure constitutes the TOE driver's 
- * interface to the TCP stack for functionality that doesn't
- * interact directly with userspace. If one wants to provide
- * (optional) functionality to do zero-copy to/from
- * userspace one still needs to override soreceive/sosend 
- * with functions that fault in and pin the user buffers.
- *
- * + tu_send
- *   - tells the driver that new data may have been added to the 
- *     socket's send buffer - the driver should not fail if the
- *     buffer is in fact unchanged
- *   - the driver is responsible for providing credits (bytes in the send window)
- *     back to the socket by calling sbdrop() as segments are acknowledged.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_rcvd
- *   - returns credits to the driver and triggers window updates
- *     to the peer (a credit as used here is a byte in the peer's receive window)
- *   - the driver is expected to determine how many bytes have been 
- *     consumed and credit that back to the card so that it can grow
- *     the window again by maintaining its own state between invocations.
- *   - In principle this could be used to shrink the window as well as
- *     grow the window, although it is not used for that now.
- *   - this function needs to correctly handle being called any number of
- *     times without any bytes being consumed from the receive buffer.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_disconnect
- *   - tells the driver to send FIN to peer
- *   - driver is expected to send the remaining data and then do a clean half close
- *   - disconnect implies at least half-close so only send, reset, and detach
- *     are legal
- *   - the driver is expected to handle transition through the shutdown
- *     state machine and allow the stack to support SO_LINGER.
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- * + tu_reset
- *   - closes the connection and sends a RST to peer
- *   - driver is expectd to trigger an RST and detach the toepcb
- *   - no further calls are legal after reset
- *   - The driver expects the inpcb lock to be held - the driver is expected
- *     not to drop the lock. Hence the driver is not allowed to acquire the
- *     pcbinfo lock during this call.
- *
- *   The following fields in the tcpcb are expected to be referenced by the driver:
- *	+ iss
- *	+ rcv_nxt
- *	+ rcv_wnd
- *	+ snd_isn
- *	+ snd_max
- *	+ snd_nxt
- *	+ snd_una
- *	+ t_flags
- *	+ t_inpcb
- *	+ t_maxseg
- *	+ t_toe
- *
- *   The following fields in the inpcb are expected to be referenced by the driver:
- *	+ inp_lport
- *	+ inp_fport
- *	+ inp_laddr
- *	+ inp_fport
- *	+ inp_socket
- *	+ inp_ip_tos
- *
- *   The following fields in the socket are expected to be referenced by the
- *   driver:
- *	+ so_comp
- *	+ so_error
- *	+ so_linger
- *	+ so_options
- *	+ so_rcv
- *	+ so_snd
- *	+ so_state
- *	+ so_timeo
- *
- *   These functions all return 0 on success and can return the following errors
- *   as appropriate:
- *	+ EPERM:
- *	+ ENOBUFS: memory allocation failed
- *	+ EMSGSIZE: MTU changed during the call
- *	+ EHOSTDOWN:
- *	+ EHOSTUNREACH:
- *	+ ENETDOWN:
- *	* ENETUNREACH: the peer is no longer reachable
- *
- * + tu_detach
- *   - tells driver that the socket is going away so disconnect
- *     the toepcb and free appropriate resources
- *   - allows the driver to cleanly handle the case of connection state
- *     outliving the socket
- *   - no further calls are legal after detach
- *   - the driver is expected to provide its own synchronization between
- *     detach and receiving new data.
- * 
- * + tu_syncache_event
- *   - even if it is not actually needed, the driver is expected to
- *     call syncache_add for the initial SYN and then syncache_expand
- *     for the SYN,ACK
- *   - tells driver that a connection either has not been added or has 
- *     been dropped from the syncache
- *   - the driver is expected to maintain state that lives outside the 
- *     software stack so the syncache needs to be able to notify the
- *     toe driver that the software stack is not going to create a connection
- *     for a received SYN
- *   - The driver is responsible for any synchronization required between
- *     the syncache dropping an entry and the driver processing the SYN,ACK.
- * 
- */
-struct toe_usrreqs {
-	int (*tu_send)(struct tcpcb *tp);
-	int (*tu_rcvd)(struct tcpcb *tp);
-	int (*tu_disconnect)(struct tcpcb *tp);
-	int (*tu_reset)(struct tcpcb *tp);
-	void (*tu_detach)(struct tcpcb *tp);
-	void (*tu_syncache_event)(int event, void *toep);
-};
+extern int registered_toedevs;
 
-/*
- * Proxy for struct tcpopt between TOE drivers and TCP functions.
- */
-struct toeopt {
-	u_int64_t	to_flags;	/* see tcpopt in tcp_var.h */
-	u_int16_t	to_mss;		/* maximum segment size */
-	u_int8_t	to_wscale;	/* window scaling */
+int  tcp_offload_connect(struct socket *, struct sockaddr *);
+void tcp_offload_listen_start(struct tcpcb *);
+void tcp_offload_listen_stop(struct tcpcb *);
+void tcp_offload_input(struct tcpcb *, struct mbuf *);
+int  tcp_offload_output(struct tcpcb *);
+void tcp_offload_rcvd(struct tcpcb *);
+void tcp_offload_ctloutput(struct tcpcb *, int, int);
+void tcp_offload_detach(struct tcpcb *);
 
-	u_int8_t	_pad1;		/* explicit pad for 64bit alignment */
-	u_int32_t	_pad2;		/* explicit pad for 64bit alignment */
-	u_int64_t	_pad3[4];	/* TBD */
-};
-
-#define	TOE_SC_ENTRY_PRESENT		1	/* 4-tuple already present */
-#define	TOE_SC_DROP			2	/* connection was timed out */
-
-/*
- * Because listen is a one-to-many relationship (a socket can be listening 
- * on all interfaces on a machine some of which may be using different TCP
- * offload devices), listen uses a publish/subscribe mechanism. The TCP
- * offload driver registers a listen notification function with the stack.
- * When a listen socket is created all TCP offload devices are notified
- * so that they can do the appropriate set up to offload connections on the
- * port to which the socket is bound. When the listen socket is closed,
- * the offload devices are notified so that they will stop listening on that
- * port and free any associated resources as well as sending RSTs on any
- * connections in the SYN_RCVD state.
- *
- */
-
-typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
-typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
-
-EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
-EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
-
-/*
- * Check if the socket can be offloaded by the following steps:
- * - determine the egress interface
- * - check the interface for TOE capability and TOE is enabled
- * - check if the device has resources to offload the connection
- */
-int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
-
-/*
- * The tcp_output_* routines are wrappers around the toe_usrreqs calls
- * which trigger packet transmission. In the non-offloaded case they
- * translate to tcp_output. The tcp_offload_* routines notify TOE
- * of specific events. I the non-offloaded case they are no-ops.
- *
- * Listen is a special case because it is a 1 to many relationship
- * and there can be more than one offload driver in the system.
- */
-
-/*
- * Connection is offloaded
- */
-#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
-
-/*
- * hackish way of allowing this file to also be included by TOE
- * which needs to be kept ignorant of socket implementation details
- */
-#ifdef _SYS_SOCKETVAR_H_
-/*
- * The socket has not been marked as "do not offload"
- */
-#define	SO_OFFLOADABLE(so)	((so->so_options & SO_NO_OFFLOAD) == 0)
-
-static __inline int
-tcp_output_connect(struct socket *so, struct sockaddr *nam)
-{
-	struct tcpcb *tp = sototcpcb(so);
-	int error;
-
-	/*
-	 * If offload has been disabled for this socket or the 
-	 * connection cannot be offloaded just call tcp_output
-	 * to start the TCP state machine.
-	 */
-#ifndef TCP_OFFLOAD_DISABLE	
-	if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0)
-#endif		
-		error = tcp_output(tp);
-	return (error);
-}
-
-static __inline int
-tcp_output_send(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_send(tp));
 #endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_rcvd(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_rcvd(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_disconnect(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_disconnect(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_reset(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		return (tp->t_tu->tu_reset(tp));
-#endif
-	return (tcp_output(tp));
-}
-
-static __inline void
-tcp_offload_detach(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (tp_offload(tp))
-		tp->t_tu->tu_detach(tp);
-#endif	
-}
-
-static __inline void
-tcp_offload_listen_open(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
-		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
-#endif	
-}
-
-static __inline void
-tcp_offload_listen_close(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
-	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
-#endif	
-}
-#undef SO_OFFLOADABLE
-#endif /* _SYS_SOCKETVAR_H_ */
-#undef tp_offload
-
-void tcp_offload_twstart(struct tcpcb *tp);
-struct tcpcb *tcp_offload_close(struct tcpcb *tp);
-struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
-
-#endif /* _NETINET_TCP_OFFLOAD_H_ */

Modified: trunk/sys/netinet/tcp_output.c
===================================================================
--- trunk/sys/netinet/tcp_output.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_output.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -75,6 +75,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -196,6 +199,11 @@
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		return (tcp_offload_output(tp));
+#endif
+
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.

Modified: trunk/sys/netinet/tcp_subr.c
===================================================================
--- trunk/sys/netinet/tcp_subr.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_subr.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -85,7 +85,6 @@
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
@@ -96,6 +95,9 @@
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -824,7 +826,7 @@
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tp->t_state = TCPS_CLOSED;
-		(void) tcp_output_reset(tp);
+		(void) tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
@@ -924,8 +926,12 @@
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
+
+#ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
-	tcp_offload_detach(tp);
+	if (tp->t_flags & TF_TOE)
+		tcp_offload_detach(tp);
+#endif
 		
 	tcp_free_sackholes(tp);
 
@@ -954,9 +960,10 @@
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
-	/* Notify any offload devices of listener close */
+#ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
-		tcp_offload_listen_close(tp);
+		tcp_offload_listen_stop(tp);
+#endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
@@ -1695,7 +1702,7 @@
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
-	tcp_output_send(tp);
+	tcp_output(tp);
 	return (inp);
 }
 

Modified: trunk/sys/netinet/tcp_syncache.c
===================================================================
--- trunk/sys/netinet/tcp_syncache.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_syncache.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -81,10 +81,12 @@
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
+#ifdef TCP_OFFLOAD
+#include <netinet/toecore.h>
+#endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -110,10 +112,8 @@
     &VNET_NAME(tcp_syncookiesonly), 0,
     "Use only TCP SYN cookies");
 
-#ifdef TCP_OFFLOAD_DISABLE
-#define TOEPCB_ISSET(sc) (0)
-#else
-#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
+#ifdef TCP_OFFLOAD
+#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
 #endif
 
 static void	 syncache_drop(struct syncache *, struct syncache_head *);
@@ -332,6 +332,14 @@
 	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length++;
 
+#ifdef TCP_OFFLOAD
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_syncache_added(tod, sc->sc_todctx);
+	}
+#endif
+
 	/* Reinitialize the bucket row's timer. */
 	if (sch->sch_length == 1)
 		sch->sch_nextc = ticks + INT_MAX;
@@ -356,10 +364,14 @@
 	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length--;
 
-#ifndef TCP_OFFLOAD_DISABLE
-	if (sc->sc_tu)
-		sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
-#endif		    
+#ifdef TCP_OFFLOAD
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_syncache_removed(tod, sc->sc_todctx);
+	}
+#endif
+
 	syncache_free(sc);
 	V_tcp_syncache.cache_count--;
 }
@@ -846,7 +858,19 @@
 	if (sc->sc_rxmits > 1)
 		tp->snd_cwnd = tp->t_maxseg;
 
+#ifdef TCP_OFFLOAD
 	/*
+	 * Allow a TOE driver to install its hooks.  Note that we hold the
+	 * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
+	 * new connection before the TOE driver has done its thing.
+	 */
+	if (ADDED_BY_TOE(sc)) {
+		struct toedev *tod = sc->sc_tod;
+
+		tod->tod_offload_socket(tod, sc->sc_todctx, so);
+	}
+#endif
+	/*
 	 * Copy and activate timers.
 	 */
 	tp->t_keepinit = sototcpcb(lso)->t_keepinit;
@@ -926,6 +950,13 @@
 		/* Pull out the entry to unlock the bucket row. */
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
+#ifdef TCP_OFFLOAD
+		if (ADDED_BY_TOE(sc)) {
+			struct toedev *tod = sc->sc_tod;
+
+			tod->tod_syncache_removed(tod, sc->sc_todctx);
+		}
+#endif
 		V_tcp_syncache.cache_count--;
 		SCH_UNLOCK(sch);
 	}
@@ -934,7 +965,7 @@
 	 * Segment validation:
 	 * ACK must match our initial sequence number + 1 (the SYN|ACK).
 	 */
-	if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
+	if (th->th_ack != sc->sc_iss + 1) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
@@ -945,9 +976,8 @@
 	 * The SEQ must fall in the window starting at the received
 	 * initial receive sequence number + 1 (the SYN).
 	 */
-	if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
-	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
-	    !TOEPCB_ISSET(sc)) {
+	if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
@@ -964,8 +994,7 @@
 	 * If timestamps were negotiated the reflected timestamp
 	 * must be equal to what we actually sent in the SYN|ACK.
 	 */
-	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
-	    !TOEPCB_ISSET(sc)) {
+	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
 			    "segment rejected\n",
@@ -993,25 +1022,6 @@
 	return (0);
 }
 
-int
-tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
-    struct tcphdr *th, struct socket **lsop, struct mbuf *m)
-{
-	struct tcpopt to;
-	int rc;
-
-	bzero(&to, sizeof(struct tcpopt));
-	to.to_mss = toeo->to_mss;
-	to.to_wscale = toeo->to_wscale;
-	to.to_flags = toeo->to_flags;
-	
-	INP_INFO_WLOCK(&V_tcbinfo);
-	rc = syncache_expand(inc, &to, th, lsop, m);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
-
-	return (rc);
-}
-
 /*
  * Given a LISTEN socket and an inbound SYN request, add
  * this to the syn cache, and send back a segment:
@@ -1027,8 +1037,8 @@
  */
 static void
 _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
-    struct inpcb *inp, struct socket **lsop, struct mbuf *m,
-    struct toe_usrreqs *tu, void *toepcb)
+    struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
+    void *todctx)
 {
 	struct tcpcb *tp;
 	struct socket *so;
@@ -1114,11 +1124,6 @@
 	sc = syncache_lookup(inc, &sch);	/* returns locked entry */
 	SCH_LOCK_ASSERT(sch);
 	if (sc != NULL) {
-#ifndef TCP_OFFLOAD_DISABLE
-		if (sc->sc_tu)
-			sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
-			    sc->sc_toepcb);
-#endif		    
 		TCPSTAT_INC(tcps_sc_dupsyn);
 		if (ipopts) {
 			/*
@@ -1151,7 +1156,7 @@
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
-		if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
+		if (syncache_respond(sc) == 0) {
 			sc->sc_rxmits = 0;
 			syncache_timeout(sc, sch, 1);
 			TCPSTAT_INC(tcps_sndacks);
@@ -1202,9 +1207,9 @@
 		sc->sc_ip_tos = ip_tos;
 		sc->sc_ip_ttl = ip_ttl;
 	}
-#ifndef TCP_OFFLOAD_DISABLE	
-	sc->sc_tu = tu;
-	sc->sc_toepcb = toepcb;
+#ifdef TCP_OFFLOAD
+	sc->sc_tod = tod;
+	sc->sc_todctx = todctx;
 #endif
 	sc->sc_irs = th->th_seq;
 	sc->sc_iss = arc4random();
@@ -1299,7 +1304,7 @@
 	/*
 	 * Do a standard 3-way handshake.
 	 */
-	if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
+	if (syncache_respond(sc) == 0) {
 		if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
 			syncache_free(sc);
 		else if (sc != &scs)
@@ -1480,6 +1485,15 @@
 		th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen,
 		    IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
+#ifdef TCP_OFFLOAD
+		if (ADDED_BY_TOE(sc)) {
+			struct toedev *tod = sc->sc_tod;
+
+			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
+
+			return (error);
+		}
+#endif
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 	}
 #endif
@@ -1491,6 +1505,15 @@
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(tlen + optlen - hlen + IPPROTO_TCP));
+#ifdef TCP_OFFLOAD
+		if (ADDED_BY_TOE(sc)) {
+			struct toedev *tod = sc->sc_tod;
+
+			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
+
+			return (error);
+		}
+#endif
 		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
 	}
 #endif
@@ -1505,23 +1528,12 @@
 }
 
 void
-tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo,
-    struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
-    struct toe_usrreqs *tu, void *toepcb)
+tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+    struct inpcb *inp, struct socket **lsop, void *tod, void *todctx)
 {
-	struct tcpopt to;
 
-	bzero(&to, sizeof(struct tcpopt));
-	to.to_mss = toeo->to_mss;
-	to.to_wscale = toeo->to_wscale;
-	to.to_flags = toeo->to_flags;
-
-	INP_INFO_WLOCK(&V_tcbinfo);
-	INP_WLOCK(inp);
-
-	_syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb);
+	_syncache_add(inc, to, th, inp, lsop, NULL, tod, todctx);
 }
-
 /*
  * The purpose of SYN cookies is to avoid keeping track of all SYN's we
  * receive and to be able to handle SYN floods from bogus source addresses

Modified: trunk/sys/netinet/tcp_syncache.h
===================================================================
--- trunk/sys/netinet/tcp_syncache.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_syncache.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -34,8 +34,6 @@
 #define _NETINET_TCP_SYNCACHE_H_
 #ifdef _KERNEL
 
-struct toeopt;
-
 void	 syncache_init(void);
 #ifdef VIMAGE
 void	syncache_destroy(void);
@@ -43,14 +41,10 @@
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcpopt *,
 	     struct tcphdr *, struct socket **, struct mbuf *);
-int	 tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
-             struct tcphdr *th, struct socket **lsop, struct mbuf *m);
 void	 syncache_add(struct in_conninfo *, struct tcpopt *,
 	     struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
-void	 tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *,
-             struct tcphdr *, struct inpcb *, struct socket **,
-             struct toe_usrreqs *tu, void *toepcb);
-
+void	 tcp_offload_syncache_add(struct in_conninfo *, struct tcpopt *,
+	     struct tcphdr *, struct inpcb *, struct socket **, void *, void *);
 void	 syncache_chkrst(struct in_conninfo *, struct tcphdr *);
 void	 syncache_badack(struct in_conninfo *);
 int	 syncache_pcbcount(void);
@@ -75,10 +69,10 @@
 	u_int8_t	sc_requested_s_scale:4,
 			sc_requested_r_scale:4;
 	u_int16_t	sc_flags;
-#ifndef TCP_OFFLOAD_DISABLE
-	struct toe_usrreqs *sc_tu;		/* TOE operations */
-	void		*sc_toepcb;		/* TOE protocol block */
-#endif			
+#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE)
+	struct toedev	*sc_tod;		/* entry added by this TOE */
+	void		*sc_todctx;		/* TOE driver context */
+#endif
 	struct label	*sc_label;		/* MAC label reference */
 	struct ucred	*sc_cred;		/* cred cache for jail checks */
 

Modified: trunk/sys/netinet/tcp_timer.c
===================================================================
--- trunk/sys/netinet/tcp_timer.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_timer.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -32,6 +32,7 @@
 #include <sys/cdefs.h>
 __MBSDID("$MidnightBSD$");
 
+#include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
@@ -637,6 +638,11 @@
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = INP_CPU(inp);
 
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		return;
+#endif
+
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;

Modified: trunk/sys/netinet/tcp_usrreq.c
===================================================================
--- trunk/sys/netinet/tcp_usrreq.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_usrreq.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -87,7 +87,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
+#endif
 
 /*
  * TCP protocol interface to socket abstraction.
@@ -367,7 +369,10 @@
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
-		tcp_offload_listen_open(tp);
+#ifdef TCP_OFFLOAD
+		if ((so->so_options & SO_NO_OFFLOAD) == 0)
+			tcp_offload_listen_start(tp);
+#endif
 	}
 	SOCK_UNLOCK(so);
 
@@ -409,6 +414,10 @@
 	if (error == 0) {
 		tp->t_state = TCPS_LISTEN;
 		solisten_proto(so, backlog);
+#ifdef TCP_OFFLOAD
+		if ((so->so_options & SO_NO_OFFLOAD) == 0)
+			tcp_offload_listen_start(tp);
+#endif
 	}
 	SOCK_UNLOCK(so);
 
@@ -459,7 +468,14 @@
 	TCPDEBUG1();
 	if ((error = tcp_connect(tp, nam, td)) != 0)
 		goto out;
-	error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+	if (registered_toedevs > 0 &&
+	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
+	    (error = tcp_offload_connect(so, nam)) == 0)
+		goto out;
+#endif
+	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+	error = tcp_output(tp);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	INP_WUNLOCK(inp);
@@ -519,7 +535,13 @@
 			goto out;
 		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
 			goto out;
-		error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+		if (registered_toedevs > 0 &&
+		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
+		    (error = tcp_offload_connect(so, nam)) == 0)
+			goto out;
+#endif
+		error = tcp_output(tp);
 		goto out;
 	}
 #endif
@@ -530,7 +552,14 @@
 		goto out;
 	if ((error = tcp6_connect(tp, nam, td)) != 0)
 		goto out;
-	error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+	if (registered_toedevs > 0 &&
+	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
+	    (error = tcp_offload_connect(so, nam)) == 0)
+		goto out;
+#endif
+	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+	error = tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_CONNECT);
@@ -709,7 +738,7 @@
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	if (!(inp->inp_flags & INP_DROPPED))
-		error = tcp_output_disconnect(tp);
+		error = tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_SHUTDOWN);
@@ -739,7 +768,12 @@
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
-	tcp_output_rcvd(tp);
+#ifdef TCP_OFFLOAD
+	if (tp->t_flags & TF_TOE)
+		tcp_offload_rcvd(tp);
+	else
+#endif
+	tcp_output(tp);
 
 out:
 	TCPDEBUG2(PRU_RCVD);
@@ -835,7 +869,7 @@
 		if (!(inp->inp_flags & INP_DROPPED)) {
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
-			error = tcp_output_send(tp);
+			error = tcp_output(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
@@ -884,7 +918,7 @@
 		}
 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
 		tp->t_flags |= TF_FORCEDATA;
-		error = tcp_output_send(tp);
+		error = tcp_output(tp);
 		tp->t_flags &= ~TF_FORCEDATA;
 	}
 out:
@@ -1119,7 +1153,6 @@
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tp->t_state = TCPS_SYN_SENT;
-	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
@@ -1192,7 +1225,6 @@
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tp->t_state = TCPS_SYN_SENT;
-	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
@@ -1323,9 +1355,9 @@
 				tp->t_flags |= TF_SIGNATURE;
 			else
 				tp->t_flags &= ~TF_SIGNATURE;
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 #endif /* TCP_SIGNATURE */
+
 		case TCP_NODELAY:
 		case TCP_NOOPT:
 			INP_WUNLOCK(inp);
@@ -1351,6 +1383,13 @@
 				tp->t_flags |= opt;
 			else
 				tp->t_flags &= ~opt;
+unlock_and_done:
+#ifdef TCP_OFFLOAD
+			if (tp->t_flags & TF_TOE) {
+				tcp_offload_ctloutput(tp, sopt->sopt_dir,
+				    sopt->sopt_name);
+			}
+#endif
 			INP_WUNLOCK(inp);
 			break;
 
@@ -1369,8 +1408,7 @@
 				if (TCPS_HAVEESTABLISHED(tp->t_state))
 					error = tcp_output(tp);
 			}
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_MAXSEG:
 			INP_WUNLOCK(inp);
@@ -1385,8 +1423,7 @@
 				tp->t_maxseg = optval;
 			else
 				error = EINVAL;
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_INFO:
 			INP_WUNLOCK(inp);
@@ -1438,8 +1475,7 @@
 				}
 			}
 			CC_LIST_RUNLOCK();
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
@@ -1483,8 +1519,7 @@
 					    TP_KEEPINIT(tp));
 				break;
 			}
-			INP_WUNLOCK(inp);
-			break;
+			goto unlock_and_done;
 
 		case TCP_KEEPCNT:
 			INP_WUNLOCK(inp);
@@ -1654,7 +1689,7 @@
 		sbflush(&so->so_rcv);
 		tcp_usrclosed(tp);
 		if (!(inp->inp_flags & INP_DROPPED))
-			tcp_output_disconnect(tp);
+			tcp_output(tp);
 	}
 }
 
@@ -1677,7 +1712,9 @@
 
 	switch (tp->t_state) {
 	case TCPS_LISTEN:
-		tcp_offload_listen_close(tp);
+#ifdef TCP_OFFLOAD
+		tcp_offload_listen_stop(tp);
+#endif
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		tp->t_state = TCPS_CLOSED;

Modified: trunk/sys/netinet/tcp_var.h
===================================================================
--- trunk/sys/netinet/tcp_var.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/tcp_var.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -194,7 +194,7 @@
 	int	t_rttlow;		/* smallest observerved RTT */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
-	struct toe_usrreqs *t_tu;	/* offload operations vector */
+	struct toedev	*tod;		/* toedev handling this connection */
 	int	t_sndrexmitpack;	/* retransmit packets sent */
 	int	t_rcvoopack;		/* out-of-order packets received */
 	void	*t_toe;			/* TOE pcb pointer */

Modified: trunk/sys/netinet/toecore.c
===================================================================
--- trunk/sys/netinet/toecore.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/toecore.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -1,3 +1,649 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np at FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/types.h>
+#include <sys/sockopt.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/if_llatbl.h>
+#include <net/route.h>
+
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/nd6.h>
+#define TCPSTATES
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_offload.h>
+#include <netinet/toecore.h>
+
+static struct mtx toedev_lock;		/* held around toedev_list access */
+static TAILQ_HEAD(, toedev) toedev_list;	/* registered TOE devices */
+static eventhandler_tag listen_start_eh;	/* tcp_offload_listen_start */
+static eventhandler_tag listen_stop_eh;		/* tcp_offload_listen_stop */
+static eventhandler_tag lle_event_eh;		/* lle_event */
+static eventhandler_tag route_redirect_eh;	/* route_redirect_event */
+
+static int
+toedev_connect(struct toedev *tod __unused, struct socket *so __unused,
+    struct rtentry *rt __unused, struct sockaddr *nam __unused)
+{
+
+	return (ENOTSUP);	/* default tod_connect installed by init_toedev */
+}
+
+static int
+toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);	/* default tod_listen_start: ignores tp */
+}
+
+static int
+toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);	/* default tod_listen_stop: ignores tp */
+}
+
+static void
+toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused,
+    struct mbuf *m)
+{
+
+	m_freem(m);	/* default tod_input: the frame is freed, not processed */
+	return;
+}
+
+static void
+toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return;		/* default tod_rcvd: no-op */
+}
+
+static int
+toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return (ENOTSUP);	/* default for tod_output, tod_send_rst, tod_send_fin */
+}
+
+static void
+toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused)
+{
+
+	return;		/* default tod_pcb_detach: no-op */
+}
+
+static void
+toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused,
+    struct sockaddr *sa __unused, uint8_t *lladdr __unused,
+    uint16_t vtag __unused)
+{
+
+	return;		/* default tod_l2_update: the L2 information is ignored */
+}
+
+static void
+toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused,
+    struct rtentry *rt0 __unused, struct rtentry *rt1 __unused)
+{
+
+	return;		/* default tod_route_redirect: no-op */
+}
+
+static void
+toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused)
+{
+
+	return;		/* default tod_syncache_added: no-op */
+}
+
+static void
+toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused)
+{
+
+	return;		/* default tod_syncache_removed: no-op */
+}
+
+static int
+toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused,
+    struct mbuf *m)
+{
+
+	m_freem(m);	/* default tod_syncache_respond: m is freed, nothing is sent */
+	return (0);	/* 0 is still returned to the caller */
+}
+
+static void
+toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused,
+    struct socket *so __unused)
+{
+
+	return;		/* default tod_offload_socket: no-op */
+}
+
+static void
+toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused,
+    int sopt_dir __unused, int sopt_name __unused)
+{
+
+	return;		/* default tod_ctloutput: no-op */
+}
+
+/*
+ * Offer a listening socket to one TOE device (arg) or to all (arg == NULL).
+ */
+static void
+toe_listen_start(struct inpcb *inp, void *arg)
+{
+	struct toedev *t, *tod;
+	struct tcpcb *tp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(inp->inp_pcbinfo == &V_tcbinfo,
+	    ("%s: inp is not a TCP inp", __func__));
+
+	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
+		return;		/* dropped or TIME_WAIT inps are skipped */
+
+	tp = intotcpcb(inp);
+	if (tp->t_state != TCPS_LISTEN)
+		return;		/* only sockets in LISTEN are reported */
+
+	t = arg;		/* NULL selects every registered device */
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(tod, &toedev_list, link) {
+		if (t == NULL || t == tod)
+			tod->tod_listen_start(tod, tp);	/* return value unused */
+	}
+	mtx_unlock(&toedev_lock);
+}
+
+static void
+toe_listen_start_event(void *arg __unused, struct tcpcb *tp)
+{
+	struct inpcb *inp = tp->t_inpcb;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_state == TCPS_LISTEN,
+	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
+
+	toe_listen_start(inp, NULL);	/* NULL arg: notify all TOE devices */
+}
+
+static void
+toe_listen_stop_event(void *arg __unused, struct tcpcb *tp)
+{
+	struct toedev *tod;
+#ifdef INVARIANTS	/* inp is referenced only by INP_WLOCK_ASSERT below */
+	struct inpcb *inp = tp->t_inpcb;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(tp->t_state == TCPS_LISTEN,
+	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(tod, &toedev_list, link)
+	    tod->tod_listen_stop(tod, tp);	/* every registered device is told */
+	mtx_unlock(&toedev_lock);
+}
+
+/*
+ * Fill up a freshly allocated toedev struct with reasonable defaults.
+ */
+void
+init_toedev(struct toedev *tod)
+{
+
+	tod->tod_softc = NULL;	/* driver private data starts out unset */
+
+	/*
+	 * Provide no-op defaults so that the kernel can call any toedev
+	 * function without having to check whether the TOE driver supplied one
+	 * or not.
+	 */
+	tod->tod_connect = toedev_connect;
+	tod->tod_listen_start = toedev_listen_start;
+	tod->tod_listen_stop = toedev_listen_stop;
+	tod->tod_input = toedev_input;
+	tod->tod_rcvd = toedev_rcvd;
+	tod->tod_output = toedev_output;
+	tod->tod_send_rst = toedev_output;	/* shares the ENOTSUP stub */
+	tod->tod_send_fin = toedev_output;	/* shares the ENOTSUP stub */
+	tod->tod_pcb_detach = toedev_pcb_detach;
+	tod->tod_l2_update = toedev_l2_update;
+	tod->tod_route_redirect = toedev_route_redirect;
+	tod->tod_syncache_added = toedev_syncache_added;
+	tod->tod_syncache_removed = toedev_syncache_removed;
+	tod->tod_syncache_respond = toedev_syncache_respond;
+	tod->tod_offload_socket = toedev_offload_socket;
+	tod->tod_ctloutput = toedev_ctloutput;
+}
+
+/*
+ * Register an active TOE device with the system.  This allows it to receive
+ * notifications from the kernel.
+ */
+int
+register_toedev(struct toedev *tod)
+{
+	struct toedev *t;
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH(t, &toedev_list, link) {
+		if (t == tod) {
+			mtx_unlock(&toedev_lock);
+			return (EEXIST);	/* tod is already on the list */
+		}
+	}
+
+	TAILQ_INSERT_TAIL(&toedev_list, tod, link);
+	registered_toedevs++;	/* updated under toedev_lock */
+	mtx_unlock(&toedev_lock);
+
+	inp_apply_all(toe_listen_start, tod);	/* arg = tod: only this device is notified */
+
+	return (0);
+}
+
+/*
+ * Remove the TOE device from the global list of active TOE devices.  It is the
+ * caller's responsibility to ensure that the TOE device is quiesced prior to
+ * this call.
+ */
+int
+unregister_toedev(struct toedev *tod)
+{
+	struct toedev *t, *t2;
+	int rc = ENODEV;	/* returned when tod is not on the list */
+
+	mtx_lock(&toedev_lock);
+	TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) {
+		if (t == tod) {
+			TAILQ_REMOVE(&toedev_list, tod, link);
+			registered_toedevs--;
+			rc = 0;
+			break;	/* register_toedev rejects duplicates, so stop here */
+		}
+	}
+	KASSERT(registered_toedevs >= 0,
+	    ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs));
+	mtx_unlock(&toedev_lock);
+	return (rc);
+}
+
+void
+toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+    struct inpcb *inp, void *tod, void *todctx)
+{
+	struct socket *lso = inp->inp_socket;	/* inp's socket; &lso is passed as lsop */
+
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	tcp_offload_syncache_add(inc, to, th, inp, &lso, tod, todctx);
+}
+
+int
+toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
+    struct tcphdr *th, struct socket **lsop)
+{
+
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+	return (syncache_expand(inc, to, th, lsop, NULL));	/* no mbuf is supplied */
+}
+
+/*
+ * General purpose check to see if a 4-tuple is in use by the kernel.  If a TCP
+ * header (presumably for an incoming SYN) is also provided, an existing 4-tuple
+ * in TIME_WAIT may be assassinated freeing it up for re-use.
+ *
+ * Note that the TCP header must have been run through tcp_fields_to_host() or
+ * equivalent.
+ */
+int
+toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
+{
+	struct inpcb *inp;
+
+	if (inc->inc_flags & INC_ISIPV6) {
+		inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr,
+		    inc->inc_fport, &inc->inc6_laddr, inc->inc_lport,
+		    INPLOOKUP_WLOCKPCB, ifp);
+	} else {
+		inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport,
+		    inc->inc_laddr, inc->inc_lport, INPLOOKUP_WLOCKPCB, ifp);
+	}
+	if (inp != NULL) {
+		INP_WLOCK_ASSERT(inp);
+
+		if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) {
+
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
+			if (!tcp_twcheck(inp, NULL, th, NULL, 0))
+				return (EADDRINUSE);	/* NOTE(review): no INP_WUNLOCK on this path; presumably tcp_twcheck drops the lock -- confirm */
+		} else {
+			INP_WUNLOCK(inp);
+			return (EADDRINUSE);	/* PCB exists and is not a TIME_WAIT candidate */
+		}
+	}
+
+	return (0);	/* no PCB found, or tcp_twcheck returned non-zero */
+}
+
+static void
+toe_lle_event(void *arg __unused, struct llentry *lle, int evt)
+{
+	struct toedev *tod;
+	struct ifnet *ifp;
+	struct sockaddr *sa;
+	uint8_t *lladdr;
+	uint16_t vtag;
+
+	LLE_WLOCK_ASSERT(lle);
+
+	ifp = lle->lle_tbl->llt_ifp;
+	sa = L3_ADDR(lle);
+
+	KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6,
+	    ("%s: lle_event %d for lle %p but sa %p !INET && !INET6",
+	    __func__, evt, lle, sa));
+
+	/*
+	 * Not interested if the interface's TOE capability is not enabled.
+	 */
+	if ((sa->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) ||
+	    (sa->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)))
+		return;
+
+	tod = TOEDEV(ifp);
+	if (tod == NULL)
+		return;		/* nothing to notify for this ifp */
+
+	vtag = 0xfff;		/* value passed on unless VLAN_TAG overwrites it */
+	if (evt != LLENTRY_RESOLVED) {
+
+		/*
+		 * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean
+		 * this entry is going to be deleted.
+		 */
+
+		lladdr = NULL;	/* no link-layer address is reported */
+	} else {
+
+		KASSERT(lle->la_flags & LLE_VALID,
+		    ("%s: %p resolved but not valid?", __func__, lle));
+
+		lladdr = (uint8_t *)&lle->ll_addr;
+#ifdef VLAN_TAG
+		VLAN_TAG(ifp, &vtag);	/* result ignored; vtag keeps 0xfff if untouched */
+#endif
+	}
+
+	tod->tod_l2_update(tod, ifp, sa, lladdr, vtag);
+}
+
+/*
+ * XXX: implement.
+ */
+static void
+toe_route_redirect_event(void *arg __unused, struct rtentry *rt0,
+    struct rtentry *rt1, struct sockaddr *sa)
+{
+
+	return;		/* placeholder: tod_route_redirect is never called from here */
+}
+
+#ifdef INET6
+/*
+ * XXX: no checks to verify that sa is really a neighbor because we assume it is
+ * the result of a route lookup and is on-link on the given ifp.
+ */
+static int
+toe_nd6_resolve(struct ifnet *ifp, struct sockaddr *sa, uint8_t *lladdr)
+{
+	struct llentry *lle;
+	struct sockaddr_in6 *sin6 = (void *)sa;
+	int rc, flags = 0;	/* flags gains LLE_EXCLUSIVE on the STALE retry */
+
+restart:
+	IF_AFDATA_RLOCK(ifp);
+	lle = lla_lookup(LLTABLE6(ifp), flags, sa);
+	IF_AFDATA_RUNLOCK(ifp);
+	if (lle == NULL) {
+		IF_AFDATA_LOCK(ifp);
+		lle = nd6_lookup(&sin6->sin6_addr, ND6_CREATE | ND6_EXCLUSIVE,
+		    ifp);
+		IF_AFDATA_UNLOCK(ifp);
+		if (lle == NULL)
+			return (ENOMEM); /* Couldn't create entry in cache. */
+		lle->ln_state = ND6_LLINFO_INCOMPLETE;
+		nd6_llinfo_settimer_locked(lle,
+		    (long)ND_IFINFO(ifp)->retrans * hz / 1000);
+		LLE_WUNLOCK(lle);
+
+		nd6_ns_output(ifp, NULL, &sin6->sin6_addr, NULL, 0);
+
+		return (EWOULDBLOCK);	/* new entry created; lladdr not filled in */
+	}
+
+	if (lle->ln_state == ND6_LLINFO_STALE) {
+		if ((flags & LLE_EXCLUSIVE) == 0) {
+			LLE_RUNLOCK(lle);
+			flags |= LLE_EXCLUSIVE;	/* redo the lookup asking for exclusive access */
+			goto restart;
+		}
+
+		LLE_WLOCK_ASSERT(lle);
+
+		lle->la_asked = 0;
+		lle->ln_state = ND6_LLINFO_DELAY;
+		nd6_llinfo_settimer_locked(lle, (long)V_nd6_delay * hz);
+	}
+
+	if (lle->la_flags & LLE_VALID) {
+		memcpy(lladdr, &lle->ll_addr, ifp->if_addrlen);
+		rc = 0;
+	} else
+		rc = EWOULDBLOCK;	/* entry exists but is not LLE_VALID */
+
+	if (flags & LLE_EXCLUSIVE)	/* release with the mode used for the lookup */
+		LLE_WUNLOCK(lle);
+	else
+		LLE_RUNLOCK(lle);
+
+	return (rc);
+}
+#endif
+
+/*
+ * Returns 0 or EWOULDBLOCK on success (any other value is an error).  0 means
+ * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's
+ * tod_l2_update will be called later, when the entry is resolved or times out.
+ */
+int
+toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+    uint8_t *lladdr, uint16_t *vtag)
+{
+#ifdef INET
+	struct llentry *lle;	/* filled in by arpresolve; not read afterwards */
+#endif
+	int rc;
+
+	switch (sa->sa_family) {
+#ifdef INET
+	case AF_INET:
+		rc = arpresolve(ifp, NULL, NULL, sa, lladdr, &lle);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		rc = toe_nd6_resolve(ifp, sa, lladdr);
+		break;
+#endif
+	default:
+		return (EPROTONOSUPPORT);	/* family not handled (or not compiled in) */
+	}
+
+	if (rc == 0) {
+#ifdef VLAN_TAG
+		if (VLAN_TAG(ifp, vtag) != 0)
+#endif
+			*vtag = 0xfff;	/* used when VLAN_TAG is undefined or returns non-zero */
+	}
+
+	return (rc);
+}
+
+void
+toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
+{
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (!(inp->inp_flags & INP_DROPPED)) {	/* nothing is done for a dropped inp */
+		struct tcpcb *tp = intotcpcb(inp);
+
+		KASSERT(tp->t_flags & TF_TOE,
+		    ("%s: tp %p not offloaded.", __func__, tp));
+
+		if (err == EAGAIN) {
+
+			/*
+			 * Temporary failure during offload, take this PCB back.
+			 * Detach from the TOE driver and do the rest of what
+			 * TCP's pru_connect would have done if the connection
+			 * wasn't offloaded.
+			 */
+
+			tod->tod_pcb_detach(tod, tp);
+			KASSERT(!(tp->t_flags & TF_TOE),
+			    ("%s: tp %p still offloaded.", __func__, tp));
+			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+			(void) tcp_output(tp);	/* result deliberately discarded */
+		} else {
+
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+			tp = tcp_drop(tp, err);	/* any error other than EAGAIN drops the connection */
+			if (tp == NULL)
+				INP_WLOCK(inp);	/* re-acquire */
+		}
+	}
+	INP_WLOCK_ASSERT(inp);	/* asserted again on exit, as on entry */
+}
+
+static int
+toecore_load(void)
+{
+
+	mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF);
+	TAILQ_INIT(&toedev_list);	/* start with no registered devices */
+
+	listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
+	    toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY);
+	listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
+	    toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY);
+	lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	route_redirect_eh = EVENTHANDLER_REGISTER(route_redirect_event,
+	    toe_route_redirect_event, NULL, EVENTHANDLER_PRI_ANY);
+
+	return (0);	/* tags are kept for toecore_unload; always returns 0 */
+}
+
+static int
+toecore_unload(void)
+{
+
+	mtx_lock(&toedev_lock);
+	if (!TAILQ_EMPTY(&toedev_list)) {
+		mtx_unlock(&toedev_lock);
+		return (EBUSY);	/* TOE devices are still registered */
+	}
+
+	EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh);
+	EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh);
+	EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
+	EVENTHANDLER_DEREGISTER(route_redirect_event, route_redirect_eh);
+
+	mtx_unlock(&toedev_lock);	/* NOTE(review): lock was held across the deregistrations -- confirm that is safe */
+	mtx_destroy(&toedev_lock);
+
+	return (0);
+}
+
+static int
+toecore_mod_handler(module_t mod, int cmd, void *arg)
+{
+
+	if (cmd == MOD_LOAD)
+		return (toecore_load());
+
+	if (cmd == MOD_UNLOAD)
+		return (toecore_unload());	/* may be EBUSY */
+
+	return (EOPNOTSUPP);	/* any other cmd; mod and arg are never used */
+}
+
+static moduledata_t mod_data= {
+	"toecore",		/* module name */
+	toecore_mod_handler,	/* handles MOD_LOAD / MOD_UNLOAD */
+	0			/* no extra data for the handler (its arg is unused) */
+};
+
+MODULE_VERSION(toecore, 1);
+DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
 /* $MidnightBSD$ */
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.

Modified: trunk/sys/netinet/toecore.h
===================================================================
--- trunk/sys/netinet/toecore.h	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet/toecore.h	2016-09-27 16:42:59 UTC (rev 8963)
@@ -1,3 +1,133 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TOE_H_
+#define	_NETINET_TOE_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+struct tcpopt;
+struct tcphdr;
+struct in_conninfo;
+
+struct toedev {
+	TAILQ_ENTRY(toedev) link;	/* glue for toedev_list */
+	void *tod_softc;		/* TOE driver private data */
+
+	/*
+	 * Active open.  If a failure occurs, it is reported back by the driver
+	 * via toe_connect_failed.
+	 */
+	int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *,
+	    struct sockaddr *);
+
+	/* Passive open. */
+	int (*tod_listen_start)(struct toedev *, struct tcpcb *);
+	int (*tod_listen_stop)(struct toedev *, struct tcpcb *);
+
+	/*
+	 * The kernel uses this routine to pass on any frame it receives for an
+	 * offloaded connection to the TOE driver.  This is an unusual event.
+	 */
+	void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *);
+
+	/*
+	 * This is called by the kernel during pru_rcvd for an offloaded TCP
+	 * connection and provides an opportunity for the TOE driver to manage
+	 * its rx window and credits.
+	 */
+	void (*tod_rcvd)(struct toedev *, struct tcpcb *);
+
+	/*
+	 * Transmit routine.  The kernel calls this to have the TOE driver
+	 * evaluate whether there is data to be transmitted, and transmit it.
+	 */
+	int (*tod_output)(struct toedev *, struct tcpcb *);
+
+	/* Immediate teardown: send RST to peer. */
+	int (*tod_send_rst)(struct toedev *, struct tcpcb *);
+
+	/* Initiate orderly disconnect by sending FIN to the peer. */
+	int (*tod_send_fin)(struct toedev *, struct tcpcb *);
+
+	/* Called to indicate that the kernel is done with this TCP PCB. */
+	void (*tod_pcb_detach)(struct toedev *, struct tcpcb *);
+
+	/*
+	 * The kernel calls this once it has information about an L2 entry that
+	 * the TOE driver enquired about previously (via toe_l2_resolve).
+	 */
+	void (*tod_l2_update)(struct toedev *, struct ifnet *,
+	    struct sockaddr *, uint8_t *, uint16_t);	/* ..., lladdr, vtag */
+
+	/* XXX.  Route has been redirected. */
+	void (*tod_route_redirect)(struct toedev *, struct ifnet *,
+	    struct rtentry *, struct rtentry *);
+
+	/* Syncache interaction; the void * is the driver context (sc_todctx). */
+	void (*tod_syncache_added)(struct toedev *, void *);
+	void (*tod_syncache_removed)(struct toedev *, void *);
+	int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *);
+	void (*tod_offload_socket)(struct toedev *, void *, struct socket *);
+
+	/* TCP socket option notification: (tod, tp, sopt_dir, sopt_name). */
+	void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int);
+};
+
+#include <sys/eventhandler.h>
+typedef	void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
+typedef	void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
+EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
+EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
+
+void init_toedev(struct toedev *);
+int register_toedev(struct toedev *);
+int unregister_toedev(struct toedev *);
+
+/*
+ * General interface for looking up L2 information for an IP address.  If an
+ * answer is not available right away then the TOE driver's tod_l2_update will
+ * be called later.
+ */
+int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *,
+    uint8_t *, uint16_t *);
+
+void toe_connect_failed(struct toedev *, struct inpcb *, int);
+
+void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+    struct inpcb *, void *, void *);
+int  toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+    struct socket **);
+
+int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *);
+#endif
 /* $MidnightBSD$ */
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.

Modified: trunk/sys/netinet6/nd6.c
===================================================================
--- trunk/sys/netinet6/nd6.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet6/nd6.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -509,6 +509,7 @@
 				ln->la_hold = m0;
 				clear_llinfo_pqueue(ln);
 			}
+			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 			if (m != NULL)
@@ -526,6 +527,7 @@
 	case ND6_LLINFO_STALE:
 		/* Garbage Collection(RFC 2461 5.3) */
 		if (!ND6_LLINFO_PERMANENT(ln)) {
+			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 1);
 			ln = NULL;
 		}
@@ -553,6 +555,7 @@
 			nd6_ns_output(ifp, dst, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
+			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 		}
@@ -1617,6 +1620,7 @@
 		 */
 		bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 		ln->la_flags |= LLE_VALID;
+		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 	}
 
 	if (!is_newentry) {
@@ -2185,7 +2189,7 @@
 
 	*lle = NULL;
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
-	if (m->m_flags & M_MCAST) {
+	if (m != NULL && m->m_flags & M_MCAST) {
 		int i;
 
 		switch (ifp->if_type) {

Modified: trunk/sys/netinet6/nd6_nbr.c
===================================================================
--- trunk/sys/netinet6/nd6_nbr.c	2016-09-27 16:35:26 UTC (rev 8962)
+++ trunk/sys/netinet6/nd6_nbr.c	2016-09-27 16:42:59 UTC (rev 8963)
@@ -755,6 +755,7 @@
 		 */
 		bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 		ln->la_flags |= LLE_VALID;
+		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 		if (is_solicited) {
 			ln->ln_state = ND6_LLINFO_REACHABLE;
 			ln->ln_byhint = 0;
@@ -830,6 +831,8 @@
 			if (lladdr != NULL) {
 				bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 				ln->la_flags |= LLE_VALID;
+				EVENTHANDLER_INVOKE(lle_event, ln,
+				    LLENTRY_RESOLVED);
 			}
 
 			/*



More information about the Midnightbsd-cvs mailing list