[Midnightbsd-cvs] src [8018] trunk/sys: Update cxgbe

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Thu Sep 15 16:11:39 EDT 2016


Revision: 8018
          http://svnweb.midnightbsd.org/src/?rev=8018
Author:   laffer1
Date:     2016-09-15 16:11:39 -0400 (Thu, 15 Sep 2016)
Log Message:
-----------
Update cxgbe

Convert some fixed params to tunables
if_iqdrops should include frames truncated within the chip.

Assume INET/INET6 and TCP_OFFLOAD when the driver is built out of tree.

Fix some buffer sizes.

Modified Paths:
--------------
    trunk/sys/dev/cxgb/ulp/tom/cxgb_listen.c
    trunk/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
    trunk/sys/dev/cxgbe/adapter.h
    trunk/sys/dev/cxgbe/common/t4_hw.h
    trunk/sys/dev/cxgbe/common/t4_msg.h
    trunk/sys/dev/cxgbe/firmware/t4fw_cfg.txt
    trunk/sys/dev/cxgbe/offload.h
    trunk/sys/dev/cxgbe/t4_main.c
    trunk/sys/dev/cxgbe/t4_sge.c
    trunk/sys/dev/cxgbe/tom/t4_connect.c
    trunk/sys/dev/cxgbe/tom/t4_cpl_io.c
    trunk/sys/dev/cxgbe/tom/t4_listen.c
    trunk/sys/dev/cxgbe/tom/t4_tom.c
    trunk/sys/dev/cxgbe/tom/t4_tom.h
    trunk/sys/modules/cxgbe/if_cxgbe/Makefile
    trunk/sys/modules/cxgbe/tom/Makefile

Modified: trunk/sys/dev/cxgb/ulp/tom/cxgb_listen.c
===================================================================
--- trunk/sys/dev/cxgb/ulp/tom/cxgb_listen.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgb/ulp/tom/cxgb_listen.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -41,6 +41,7 @@
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
+#include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
@@ -759,6 +760,15 @@
 		goto reset;
 	}
 
+	if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
+		struct inpcb *new_inp = sotoinpcb(so);
+
+		INP_WLOCK(new_inp);
+		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
+		t3_offload_socket(tod, synqe, so);
+		INP_WUNLOCK(new_inp);
+	}
+
 	/* Remove the synq entry and release its reference on the lctx */
 	TAILQ_REMOVE(&lctx->synq, synqe, link);
 	inp = release_lctx(td, lctx);
@@ -1136,5 +1146,6 @@
 	offload_socket(so, toep);
 	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
 	update_tid(td, toep, synqe->tid);
+	synqe->flags |= TP_SYNQE_EXPANDED;
 }
 #endif

Modified: trunk/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
===================================================================
--- trunk/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -44,6 +44,7 @@
 #define TP_IS_A_SYNQ_ENTRY	(1 << 9)
 #define TP_ABORT_RPL_SENT	(1 << 10)
 #define TP_SEND_FIN          	(1 << 11)
+#define TP_SYNQE_EXPANDED	(1 << 12)
 
 struct toepcb {
 	TAILQ_ENTRY(toepcb) link; /* toep_list */

Modified: trunk/sys/dev/cxgbe/adapter.h
===================================================================
--- trunk/sys/dev/cxgbe/adapter.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/adapter.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -135,6 +135,7 @@
 #else
 	FL_BUF_SIZES = 3,	/* cluster, jumbo9k, jumbo16k */
 #endif
+	OFLD_BUF_SIZE = MJUM16BYTES,	/* size of fl buffer for TOE rxq */
 
 	CTRL_EQ_QSIZE = 128,
 
@@ -143,6 +144,12 @@
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
+#ifdef T4_PKT_TIMESTAMP
+#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
+#else
+#define RX_COPY_THRESHOLD MINCLSIZE
+#endif
+
 enum {
 	/* adapter intr_type */
 	INTR_INTX	= (1 << 0),
@@ -510,6 +517,7 @@
 typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *);
+typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *);
 
 struct adapter {
 	SLIST_ENTRY(adapter) link;
@@ -582,7 +590,8 @@
 	struct callout sfl_callout;
 
 	an_handler_t an_handler __aligned(CACHE_LINE_SIZE);
-	cpl_handler_t cpl_handler[256];
+	fw_msg_handler_t fw_msg_handler[4];	/* NUM_FW6_TYPES */
+	cpl_handler_t cpl_handler[0xef];	/* NUM_CPL_CMDS */
 };
 
 #define ADAPTER_LOCK(sc)		mtx_lock(&(sc)->sc_lock)
@@ -741,6 +750,8 @@
 void t4_iterate(void (*)(struct adapter *, void *), void *);
 int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t);
 int t4_register_an_handler(struct adapter *, an_handler_t);
+int t4_register_fw_msg_handler(struct adapter *, int, fw_msg_handler_t);
+int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 
 /* t4_sge.c */
 void t4_sge_modload(void);

Modified: trunk/sys/dev/cxgbe/common/t4_hw.h
===================================================================
--- trunk/sys/dev/cxgbe/common/t4_hw.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/common/t4_hw.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -161,10 +161,12 @@
 #define S_PPOD_TAG    6
 #define M_PPOD_TAG    0xFFFFFF
 #define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
+#define G_PPOD_TAG(x) (((x) >> S_PPOD_TAG) & M_PPOD_TAG)
 
 #define S_PPOD_PGSZ    30
 #define M_PPOD_PGSZ    0x3
 #define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
+#define G_PPOD_PGSZ(x) (((x) >> S_PPOD_PGSZ) & M_PPOD_PGSZ)
 
 #define S_PPOD_TID    32
 #define M_PPOD_TID    0xFFFFFF

Modified: trunk/sys/dev/cxgbe/common/t4_msg.h
===================================================================
--- trunk/sys/dev/cxgbe/common/t4_msg.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/common/t4_msg.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -792,6 +792,14 @@
 	__be64 val;
 };
 
+struct cpl_set_tcb_field_core {
+	union opcode_tid ot;
+	__be16 reply_ctrl;
+	__be16 word_cookie;
+	__be64 mask;
+	__be64 val;
+};
+
 /* cpl_set_tcb_field.word_cookie fields */
 #define S_WORD    0
 #define M_WORD    0x1F
@@ -1376,6 +1384,11 @@
 	__be32 credit_dack;
 };
 
+struct cpl_rx_data_ack_core {
+	union opcode_tid ot;
+	__be32 credit_dack;
+};
+
 /* cpl_rx_data_ack.ack_seq fields */
 #define S_RX_CREDITS    0
 #define M_RX_CREDITS    0x3FFFFFF
@@ -2281,6 +2294,8 @@
 	FW6_TYPE_WR_RPL = 1,
 	FW6_TYPE_CQE = 2,
 	FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3,
+
+	NUM_FW6_TYPES
 };
 
 struct cpl_fw6_msg_ofld_connection_wr_rpl {

Modified: trunk/sys/dev/cxgbe/firmware/t4fw_cfg.txt
===================================================================
--- trunk/sys/dev/cxgbe/firmware/t4fw_cfg.txt	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/firmware/t4fw_cfg.txt	2016-09-15 20:11:39 UTC (rev 8018)
@@ -20,7 +20,7 @@
 	filterMode = fragmentation, mpshittype, protocol, vlan, port, fcoe
 
 	# TP rx and tx payload memory (% of the total EDRAM + DDR3).
-	tp_pmrx = 40
+	tp_pmrx = 38
 	tp_pmtx = 60
 	tp_pmrx_pagesize = 64K
 	tp_pmtx_pagesize = 64K
@@ -67,7 +67,8 @@
 	# driver will mask off features it won't use
 	protocol = ofld
 
-	tp_l2t = 100
+	tp_l2t = 4096
+	tp_ddp = 2
 
 	# TCAM has 8K cells; each region must start at a multiple of 128 cell.
 	# Each entry in these categories takes 4 cells each.  nhash will use the
@@ -136,7 +137,7 @@
 
 [fini]
 	version = 0x1
-	checksum = 0xdb5813f9
+	checksum = 0x162df193
 #
 # $MidnightBSD$
 #

Modified: trunk/sys/dev/cxgbe/offload.h
===================================================================
--- trunk/sys/dev/cxgbe/offload.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/offload.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -31,13 +31,16 @@
 #ifndef __T4_OFFLOAD_H__
 #define __T4_OFFLOAD_H__
 
-#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \
-	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
-	(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
+#define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \
+	(w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
+	(w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
 			       V_FW_WR_FLOWID(tid)); \
-	(w)->wr.wr_lo = cpu_to_be64(0); \
+	(w)->wr_lo = cpu_to_be64(0); \
 } while (0)
 
+#define INIT_ULPTX_WR(w, wrlen, atomic, tid) \
+    INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid)
+
 #define INIT_TP_WR(w, tid) do { \
 	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \
                               V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \

Modified: trunk/sys/dev/cxgbe/t4_main.c
===================================================================
--- trunk/sys/dev/cxgbe/t4_main.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/t4_main.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -306,6 +306,7 @@
 static int cpl_not_handled(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *);
+static int fw_msg_not_handled(struct adapter *, const __be64 *);
 static int t4_sysctls(struct adapter *);
 static int cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
@@ -345,8 +346,6 @@
 static void clear_filter(struct filter_entry *);
 static int set_filter_wr(struct adapter *, int);
 static int del_filter_wr(struct adapter *, int);
-static int filter_rpl(struct sge_iq *, const struct rss_header *,
-    struct mbuf *);
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int read_card_mem(struct adapter *, struct t4_mem_range *);
 #ifdef TCP_OFFLOAD
@@ -381,6 +380,10 @@
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 
+/* No easy way to include t4_msg.h before adapter.h so we check this way */
+CTASSERT(ARRAY_SIZE(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
+CTASSERT(ARRAY_SIZE(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
+
 static int
 t4_probe(device_t dev)
 {
@@ -458,7 +461,9 @@
 	sc->an_handler = an_not_handled;
 	for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++)
 		sc->cpl_handler[i] = cpl_not_handled;
-	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl);
+	for (i = 0; i < ARRAY_SIZE(sc->fw_msg_handler); i++)
+		sc->fw_msg_handler[i] = fw_msg_not_handled;
+	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, t4_filter_rpl);
 
 	/* Prepare the adapter for operation */
 	rc = -t4_prep_adapter(sc);
@@ -510,18 +515,24 @@
 		goto done; /* error message displayed already */
 
 	if (sc->flags & MASTER_PF) {
+		uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 
 		/* final tweaks to some settings */
 
 		t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd,
 		    sc->params.b_wnd);
-		t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12));
+		/* 4K, 16K, 64K, 256K DDP "page sizes" */
+		t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, V_HPZ0(0) | V_HPZ1(2) |
+		    V_HPZ2(4) | V_HPZ3(6));
+		t4_set_reg_field(sc, A_ULP_RX_CTL, F_TDDPTAGTCB, F_TDDPTAGTCB);
 		t4_set_reg_field(sc, A_TP_PARA_REG3, F_TUNNELCNGDROP0 |
-		    F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 | F_TUNNELCNGDROP3, 0);
+		    F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 | F_TUNNELCNGDROP3,
+		    F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
+		    F_TUNNELCNGDROP3);
 		t4_set_reg_field(sc, A_TP_PARA_REG5,
 		    V_INDICATESIZE(M_INDICATESIZE) |
 		    F_REARMDDPOFFSET | F_RESETDDPOFFSET,
-		    V_INDICATESIZE(M_INDICATESIZE) |
+		    V_INDICATESIZE(indsz) |
 		    F_REARMDDPOFFSET | F_RESETDDPOFFSET);
 	} else {
 		/*
@@ -2942,7 +2953,8 @@
 	ifp->if_omcasts = s->tx_mcast_frames - s->tx_pause;
 	ifp->if_imcasts = s->rx_mcast_frames - s->rx_pause;
 	ifp->if_iqdrops = s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 +
-	    s->rx_ovflow3;
+	    s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 +
+	    s->rx_trunc3;
 
 	drops = s->tx_drop;
 	for_each_txq(pi, i, txq)
@@ -2977,7 +2989,7 @@
 	panic("%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 #else
-	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p",
+	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
 	    __func__, rss->opcode, iq, m);
 	m_freem(m);
 #endif
@@ -3006,7 +3018,7 @@
 #ifdef INVARIANTS
 	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
 #else
-	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)",
+	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
 	    __func__, iq, ctrl);
 #endif
 	return (EDOOFUS);
@@ -3025,6 +3037,35 @@
 }
 
 static int
+fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
+{
+	__be64 *r = __DECONST(__be64 *, rpl);
+	struct cpl_fw6_msg *cpl = member2struct(cpl_fw6_msg, data, r);
+
+#ifdef INVARIANTS
+	panic("%s: fw_msg type %d", __func__, cpl->type);
+#else
+	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
+#endif
+	return (EDOOFUS);
+}
+
+int
+t4_register_fw_msg_handler(struct adapter *sc, int type, fw_msg_handler_t h)
+{
+	uintptr_t *loc, new;
+
+	if (type >= ARRAY_SIZE(sc->fw_msg_handler))
+		return (EINVAL);
+
+	new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled;
+	loc = (uintptr_t *) &sc->fw_msg_handler[type];
+	atomic_store_rel_ptr(loc, new);
+
+	return (0);
+}
+
+static int
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
@@ -3191,10 +3232,13 @@
 		sc->tt.ddp = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW,
 		    &sc->tt.ddp, 0, "DDP allowed");
-		sc->tt.indsz = M_INDICATESIZE;
+
+		sc->tt.indsz = G_INDICATESIZE(t4_read_reg(sc, A_TP_PARA_REG5));
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "indsz", CTLFLAG_RW,
 		    &sc->tt.indsz, 0, "DDP max indicate size allowed");
-		sc->tt.ddp_thres = 3*4096;
+
+		sc->tt.ddp_thres =
+		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp_thres", CTLFLAG_RW,
 		    &sc->tt.ddp_thres, 0, "DDP threshold");
 	}
@@ -4961,8 +5005,8 @@
 	return (0);
 }
 
-static int
-filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+int
+t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1);

Modified: trunk/sys/dev/cxgbe/t4_sge.c
===================================================================
--- trunk/sys/dev/cxgbe/t4_sge.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/t4_sge.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -68,13 +68,38 @@
 #define FL_BUF_TYPE(x)	(fl_buf_info[x].type)
 #define FL_BUF_ZONE(x)	(fl_buf_info[x].zone)
 
-enum {
-	FL_PKTSHIFT = 2
-};
+/*
+ * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
+ * 0-7 are valid values.
+ */
+static int fl_pktshift = 2;
+TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
 
-static int fl_pad = CACHE_LINE_SIZE;
-static int spg_len = 64;
+/*
+ * Pad ethernet payload up to this boundary.
+ * -1: driver should figure out a good value.
+ *  Any power of 2, from 32 to 4096 (both inclusive) is a valid value.
+ */
+static int fl_pad = -1;
+TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
 
+/*
+ * Status page length.
+ * -1: driver should figure out a good value.
+ *  64 or 128 are the only other valid values.
+ */
+static int spg_len = -1;
+TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
+
+/*
+ * Congestion drops.
+ * -1: no congestion feedback (not recommended).
+ *  0: backpressure the channel instead of dropping packets right away.
+ *  1: no backpressure, drop packets for the congested queue immediately.
+ */
+static int cong_drop = 0;
+TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
+
 /* Used to track coalesced tx work request */
 struct txpkts {
 	uint64_t *flitp;	/* ptr to flit where next pkt should start */
@@ -160,7 +185,7 @@
 static __be64 get_flit(bus_dma_segment_t *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
-static int handle_fw_rpl(struct sge_iq *, const struct rss_header *,
+static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
@@ -170,7 +195,8 @@
 #endif
 
 /*
- * Called on MOD_LOAD and fills up fl_buf_info[].
+ * Called on MOD_LOAD.  Fills up fl_buf_info[] and validates/calculates the SGE
+ * tunables.
  */
 void
 t4_sge_modload(void)
@@ -191,10 +217,49 @@
 		FL_BUF_ZONE(i) = m_getzone(bufsize[i]);
 	}
 
+	if (fl_pktshift < 0 || fl_pktshift > 7) {
+		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
+		    " using 2 instead.\n", fl_pktshift);
+		fl_pktshift = 2;
+	}
+
+	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
+		int pad;
+
 #if defined(__i386__) || defined(__amd64__)
-	fl_pad = max(cpu_clflush_line_size, 32);
-	spg_len = cpu_clflush_line_size > 64 ? 128 : 64;
+		pad = max(cpu_clflush_line_size, 32);
+#else
+		pad = max(CACHE_LINE_SIZE, 32);
 #endif
+		pad = min(pad, 4096);
+
+		if (fl_pad != -1) {
+			printf("Invalid hw.cxgbe.fl_pad value (%d),"
+			    " using %d instead.\n", fl_pad, pad);
+		}
+		fl_pad = pad;
+	}
+
+	if (spg_len != 64 && spg_len != 128) {
+		int len;
+
+#if defined(__i386__) || defined(__amd64__)
+		len = cpu_clflush_line_size > 64 ? 128 : 64;
+#else
+		len = 64;
+#endif
+		if (spg_len != -1) {
+			printf("Invalid hw.cxgbe.spg_len value (%d),"
+			    " using %d instead.\n", spg_len, len);
+		}
+		spg_len = len;
+	}
+
+	if (cong_drop < -1 || cong_drop > 1) {
+		printf("Invalid hw.cxgbe.cong_drop value (%d),"
+		    " using 0 instead.\n", cong_drop);
+		cong_drop = 0;
+	}
 }
 
 /**
@@ -215,7 +280,7 @@
 	ctrl_mask = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
 	    V_INGPADBOUNDARY(M_INGPADBOUNDARY) |
 	    F_EGRSTATUSPAGESIZE;
-	ctrl_val = V_PKTSHIFT(FL_PKTSHIFT) | F_RXPKTCPLMODE |
+	ctrl_val = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 
@@ -296,11 +361,13 @@
 	sc->sge.timer_val[4] = G_TIMERVALUE4(v) / core_ticks_per_usec(sc);
 	sc->sge.timer_val[5] = G_TIMERVALUE5(v) / core_ticks_per_usec(sc);
 
-	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_rpl);
-	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_rpl);
+	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg);
+	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx);
 
+	t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
+
 	return (rc);
 }
 
@@ -477,6 +544,18 @@
 	return (iq);
 }
 
+static inline int
+mtu_to_bufsize(int mtu)
+{
+	int bufsize;
+
+	/* large enough for a frame even when VLAN extraction is disabled */
+	bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu;
+	bufsize = roundup(bufsize + fl_pktshift, fl_pad);
+
+	return (bufsize);
+}
+
 int
 t4_setup_port_queues(struct port_info *pi)
 {
@@ -493,6 +572,7 @@
 	struct adapter *sc = pi->adapter;
 	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
+	int bufsize = mtu_to_bufsize(pi->ifp->if_mtu);
 
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
 	    NULL, "rx queues");
@@ -522,7 +602,7 @@
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name);
+		init_fl(&rxq->fl, pi->qsize_rxq / 8, bufsize, name);
 
 		if (sc->flags & INTR_DIRECT
 #ifdef TCP_OFFLOAD
@@ -547,7 +627,7 @@
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(&ofld_rxq->fl, pi->qsize_rxq / 8, MJUM16BYTES, name);
+		init_fl(&ofld_rxq->fl, pi->qsize_rxq / 8, OFLD_BUF_SIZE, name);
 
 		if (sc->flags & INTR_DIRECT ||
 		    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -942,13 +1022,6 @@
 	return (0);
 }
 
-
-#ifdef T4_PKT_TIMESTAMP
-#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
-#else
-#define RX_COPY_THRESHOLD MINCLSIZE
-#endif
-
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
     int *fl_bufs_used)
@@ -1050,9 +1123,9 @@
 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
 	    rss->opcode));
 
-	m0->m_pkthdr.len -= FL_PKTSHIFT;
-	m0->m_len -= FL_PKTSHIFT;
-	m0->m_data += FL_PKTSHIFT;
+	m0->m_pkthdr.len -= fl_pktshift;
+	m0->m_len -= fl_pktshift;
+	m0->m_data += fl_pktshift;
 
 	m0->m_pkthdr.rcvif = ifp;
 	m0->m_flags |= M_FLOWID;
@@ -1386,11 +1459,8 @@
 	struct port_info *pi = ifp->if_softc;
 	struct sge_rxq *rxq;
 	struct sge_fl *fl;
-	int i, bufsize;
+	int i, bufsize = mtu_to_bufsize(ifp->if_mtu);
 
-	/* large enough for a frame even when VLAN extraction is disabled */
-	bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ifp->if_mtu;
-	bufsize = roundup(bufsize + FL_PKTSHIFT, fl_pad);
 	for_each_rxq(pi, i, rxq) {
 		fl = &rxq->fl;
 
@@ -1793,6 +1863,18 @@
 	return free_wrq(sc, &sc->sge.mgmtq);
 }
 
+static inline int
+tnl_cong(struct port_info *pi)
+{
+
+	if (cong_drop == -1)
+		return (-1);
+	else if (cong_drop == 1)
+		return (0);
+	else
+		return (1 << pi->tx_chan);
+}
+
 static int
 alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
@@ -1801,7 +1883,7 @@
 	struct sysctl_oid_list *children;
 	char name[16];
 
-	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, 1 << pi->tx_chan);
+	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(pi));
 	if (rc != 0)
 		return (rc);
 
@@ -3433,17 +3515,15 @@
 }
 
 static int
-handle_fw_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
+	struct adapter *sc = iq->adapter;
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
-	if (cpl->type == FW6_TYPE_CMD_RPL)
-		t4_handle_fw_rpl(iq->adapter, cpl->data);
-
-	return (0);
+	return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 }
 
 static int

Modified: trunk/sys/dev/cxgbe/tom/t4_connect.c
===================================================================
--- trunk/sys/dev/cxgbe/tom/t4_connect.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/tom/t4_connect.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -247,10 +247,14 @@
 	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
 	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
 
+#ifdef USE_DDP_RX_FLOW_CONTROL
+	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
+#endif
+
 	return (htobe32(opt2));
 }
 
-
 void
 t4_init_connect_cpl_handlers(struct adapter *sc)
 {
@@ -320,7 +324,10 @@
 
 	toep->tid = atid;
 	toep->l2te = e;
-	toep->ulp_mode = ULP_MODE_NONE;
+	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0)
+		set_tcpddp_ulp_mode(toep);
+	else
+		toep->ulp_mode = ULP_MODE_NONE;
 	SOCKBUF_LOCK(&so->so_rcv);
 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 	toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
@@ -354,7 +361,7 @@
 
 	rc = t4_l2t_send(sc, wr, e);
 	if (rc == 0) {
-		toepcb_set_flag(toep, TPF_CPL_PENDING);
+		toep->flags |= TPF_CPL_PENDING;
 		return (0);
 	}
 

Modified: trunk/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- trunk/sys/dev/cxgbe/tom/t4_cpl_io.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/tom/t4_cpl_io.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -53,6 +53,7 @@
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
+#include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
@@ -80,7 +81,7 @@
 	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
-	KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
@@ -130,7 +131,7 @@
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
-	toepcb_set_flag(toep, TPF_FLOWC_WR_SENT);
+	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
@@ -150,15 +151,15 @@
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
-	    toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ?
+	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
-	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
-	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+	toep->flags |= TPF_ABORT_SHUTDOWN;
 
-	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
@@ -173,7 +174,7 @@
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
-	req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT);
+	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
@@ -299,12 +300,14 @@
 }
 
 static int
-send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
+send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
+	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
+
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return (0);
@@ -323,25 +326,28 @@
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
-	struct sockbuf *so_rcv = &so->so_rcv;
+	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
-	int must_send;
+	int credits;
 
 	INP_WLOCK_ASSERT(inp);
 
-	SOCKBUF_LOCK(so_rcv);
-	KASSERT(toep->enqueued >= so_rcv->sb_cc,
-	    ("%s: so_rcv->sb_cc > enqueued", __func__));
-	toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
-	toep->enqueued = so_rcv->sb_cc;
-	SOCKBUF_UNLOCK(so_rcv);
+	SOCKBUF_LOCK(sb);
+	KASSERT(toep->sb_cc >= sb->sb_cc,
+	    ("%s: sb %p has more data (%d) than last time (%d).",
+	    __func__, sb, sb->sb_cc, toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	toep->sb_cc = sb->sb_cc;
+	credits = toep->rx_credits;
+	SOCKBUF_UNLOCK(sb);
 
-	must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
-	if (must_send || toep->rx_credits >= 15 * 1024) {
-		int credits;
+	if (credits > 0 &&
+	    (credits + 16384 >= tp->rcv_wnd || credits >= 15 * 1024)) {
 
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
+		credits = send_rx_credits(sc, toep, credits);
+		SOCKBUF_LOCK(sb);
 		toep->rx_credits -= credits;
+		SOCKBUF_UNLOCK(sb);
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
@@ -358,12 +364,12 @@
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
-	    toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : "");
+	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
-	if (toepcb_flag(toep, TPF_FIN_SENT))
+	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
-	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
@@ -381,8 +387,8 @@
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
-	toepcb_set_flag(toep, TPF_FIN_SENT);
-	toepcb_clr_flag(toep, TPF_SEND_FIN);
+	toep->flags |= TPF_FIN_SENT;
+	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
@@ -534,10 +540,11 @@
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	INP_WLOCK_ASSERT(inp);
-	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
-	if (toep->ulp_mode != ULP_MODE_NONE)
+	if (__predict_false(toep->ulp_mode != ULP_MODE_NONE &&
+	    toep->ulp_mode != ULP_MODE_TCPDDP))
 		CXGBE_UNIMPLEMENTED("ulp_mode");
 
 	/*
@@ -544,7 +551,7 @@
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
-	if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED)))
+	if (__predict_false(toep->flags & TPF_TX_SUSPENDED))
 		return;
 
 	do {
@@ -570,7 +577,7 @@
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
-					toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+					toep->flags |= TPF_TX_SUSPENDED;
 					SOCKBUF_UNLOCK(sb);
 					return;
 				}
@@ -613,7 +620,7 @@
 			break;
 		}
 
-		if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT)))
+		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		if (plen <= max_imm) {
@@ -624,7 +631,7 @@
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
-				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
@@ -642,7 +649,7 @@
 			wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
-				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
@@ -671,7 +678,7 @@
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
-		toepcb_set_flag(toep, TPF_TX_DATA_SENT);
+		toep->flags |= TPF_TX_DATA_SENT;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
@@ -687,7 +694,7 @@
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
-	if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN))
+	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		close_conn(sc, toep);
 }
 
@@ -724,7 +731,7 @@
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
-	toepcb_set_flag(toep, TPF_SEND_FIN);
+	toep->flags |= TPF_SEND_FIN;
 	t4_push_frames(sc, toep);
 
 	return (0);
@@ -745,7 +752,7 @@
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
-	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
@@ -765,7 +772,8 @@
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
-	struct socket *so = NULL;
+	struct socket *so;
+	struct sockbuf *sb;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
@@ -782,13 +790,38 @@
 	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
 
-	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
+	tp->rcv_nxt++;	/* FIN */
+
 	so = inp->inp_socket;
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	if (__predict_false(toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) {
+		m = m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL)
+			CXGBE_UNIMPLEMENTED("mbuf alloc failure");
 
-	socantrcvmore(so);
-	tp->rcv_nxt++;	/* FIN */
+		m->m_len = be32toh(cpl->rcv_nxt) - tp->rcv_nxt;
+		m->m_flags |= M_DDP;	/* Data is already where it should be */
+		m->m_data = "nothing to see here";
+		tp->rcv_nxt = be32toh(cpl->rcv_nxt);
+
+		toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
+
+		KASSERT(toep->sb_cc >= sb->sb_cc,
+		    ("%s: sb %p has more data (%d) than last time (%d).",
+		    __func__, sb, sb->sb_cc, toep->sb_cc));
+		toep->rx_credits += toep->sb_cc - sb->sb_cc;
+#ifdef USE_DDP_RX_FLOW_CONTROL
+		toep->rx_credits -= m->m_len;	/* adjust for F_RX_FC_DDP */
+#endif
+		sbappendstream_locked(sb, m);
+		toep->sb_cc = sb->sb_cc;
+	}
+	socantrcvmore_locked(so);	/* unlocks the sockbuf */
+
 	KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    be32toh(cpl->rcv_nxt)));
@@ -855,7 +888,7 @@
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
-	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
@@ -953,7 +986,7 @@
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
-	if (toepcb_flag(toep, TPF_SYNQE))
+	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
@@ -974,8 +1007,8 @@
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
-	    __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags,
-	    cpl->status);
+	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
+	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
@@ -982,11 +1015,11 @@
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
-	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) {
+	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
-	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	so_error_set(so, abort_status_to_errno(tp, cpl->status));
 	tp = tcp_close(tp);
@@ -1019,7 +1052,7 @@
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
-	if (toepcb_flag(toep, TPF_SYNQE))
+	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
@@ -1027,7 +1060,7 @@
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
-	KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	INP_WLOCK(inp);
@@ -1046,15 +1079,16 @@
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
-	struct sockbuf *so_rcv;
+	struct sockbuf *sb;
+	int len;
 
-	if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
+	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish failed and must be attempting to abort the
 		 * synqe's tid.  Meanwhile, the T4 has sent us data for such a
 		 * connection.
 		 */
-		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: synqe and tid isn't being aborted.", __func__));
 		m_freem(m);
 		return (0);
@@ -1064,11 +1098,12 @@
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
+	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
-		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
+		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
@@ -1084,21 +1119,20 @@
 	}
 #endif
 
-	tp->rcv_nxt += m->m_pkthdr.len;
-	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
-	    ("%s: negative window size", __func__));
-	tp->rcv_wnd -= m->m_pkthdr.len;
+	tp->rcv_nxt += len;
+	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
+	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	so = inp_inpcbtosocket(inp);
-	so_rcv = &so->so_rcv;
-	SOCKBUF_LOCK(so_rcv);
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
 
-	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
+	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
-		    __func__, tid, m->m_pkthdr.len);
+		    __func__, tid, len);
 		m_freem(m);
-		SOCKBUF_UNLOCK(so_rcv);
+		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		INP_INFO_WLOCK(&V_tcbinfo);
@@ -1112,23 +1146,76 @@
 	}
 
 	/* receive buffer autosize */
-	if (so_rcv->sb_flags & SB_AUTOSIZE &&
+	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
-	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
-	    m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) {
-		unsigned int hiwat = so_rcv->sb_hiwat;
+	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
+	    len > (sbspace(sb) / 8 * 7)) {
+		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
-		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
-			so_rcv->sb_flags &= ~SB_AUTOSIZE;
+		if (!sbreserve_locked(sb, newsize, so, NULL))
+			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
-	toep->enqueued += m->m_pkthdr.len;
-	sbappendstream_locked(so_rcv, m);
+
+	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+		int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off;
+
+		if (changed) {
+			if (__predict_false(!(toep->ddp_flags & DDP_SC_REQ))) {
+				/* XXX: handle this if legitimate */
+				panic("%s: unexpected DDP state change %d",
+				    __func__, cpl->ddp_off);
+			}
+			toep->ddp_flags ^= DDP_ON | DDP_SC_REQ;
+		}
+
+		if ((toep->ddp_flags & DDP_OK) == 0 &&
+		    time_uptime >= toep->ddp_disabled + DDP_RETRY_WAIT) {
+			toep->ddp_score = DDP_LOW_SCORE;
+			toep->ddp_flags |= DDP_OK;
+			CTR3(KTR_CXGBE, "%s: tid %u DDP_OK @ %u",
+			    __func__, tid, time_uptime);
+		}
+
+		if (toep->ddp_flags & DDP_ON) {
+
+			/*
+			 * CPL_RX_DATA with DDP on can only be an indicate.  Ask
+			 * soreceive to post a buffer or disable DDP.  The
+			 * payload that arrived in this indicate is appended to
+			 * the socket buffer as usual.
+			 */
+
+#if 0
+			CTR5(KTR_CXGBE,
+			    "%s: tid %u (0x%x) DDP indicate (seq 0x%x, len %d)",
+			    __func__, tid, toep->flags, be32toh(cpl->seq), len);
+#endif
+			sb->sb_flags |= SB_DDP_INDICATE;
+		} else if ((toep->ddp_flags & (DDP_OK|DDP_SC_REQ)) == DDP_OK &&
+		    tp->rcv_wnd > DDP_RSVD_WIN && len >= sc->tt.ddp_thres) {
+
+			/*
+			 * DDP allowed but isn't on (and a request to switch it
+			 * on isn't pending either), and conditions are ripe for
+			 * it to work.  Switch it on.
+			 */
+
+			enable_ddp(sc, toep);
+		}
+	}
+
+	KASSERT(toep->sb_cc >= sb->sb_cc,
+	    ("%s: sb %p has more data (%d) than last time (%d).",
+	    __func__, sb, sb->sb_cc, toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	sbappendstream_locked(sb, m);
+	toep->sb_cc = sb->sb_cc;
 	sorwakeup_locked(so);
-	SOCKBUF_UNLOCK_ASSERT(so_rcv);
+	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	return (0);
@@ -1179,8 +1266,8 @@
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
-	if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
-		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+	if (__predict_false(toep->flags & TPF_SYNQE)) {
+		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
@@ -1194,7 +1281,7 @@
 
 	INP_WLOCK(inp);
 
-	if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) {
+	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
@@ -1250,11 +1337,11 @@
 	}
 
 	/* XXX */
-	if ((toepcb_flag(toep, TPF_TX_SUSPENDED) &&
+	if ((toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= MIN_OFLD_TX_CREDITS) ||
 	    toep->tx_credits == toep->txsd_total *
 	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) {
-		toepcb_clr_flag(toep, TPF_TX_SUSPENDED);
+		toep->flags &= ~TPF_TX_SUSPENDED;
 		t4_push_frames(sc, toep);
 	}
 	INP_WUNLOCK(inp);
@@ -1262,7 +1349,52 @@
 	return (0);
 }
 
+static int
+do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+#ifdef INVARIANTS
+	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+	KASSERT(opcode == CPL_SET_TCB_RPL,
+	    ("%s: unexpected opcode 0x%x", __func__, opcode));
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+
+	if (tid >= sc->tids.ftid_base &&
+	    tid < sc->tids.ftid_base + sc->tids.nftids)
+		return (t4_filter_rpl(iq, rss, m)); /* TCB is a filter */
+
+	CXGBE_UNIMPLEMENTED(__func__);
+}
+
 void
+t4_set_tcb_field(struct adapter *sc, struct toepcb *toep, uint16_t word,
+    uint64_t mask, uint64_t val)
+{
+	struct wrqe *wr;
+	struct cpl_set_tcb_field *req;
+
+	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	req = wrtod(wr);
+
+	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
+	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
+	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
+	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
+	req->mask = htobe64(mask);
+	req->val = htobe64(val);
+
+	t4_wrq_tx(sc, wr);
+}
+
+void
 t4_init_cpl_io_handlers(struct adapter *sc)
 {
 
@@ -1272,5 +1404,13 @@
 	t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
 	t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
 	t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack);
+	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl);
 }
+
+void
+t4_uninit_cpl_io_handlers(struct adapter *sc)
+{
+
+	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, t4_filter_rpl);
+}
 #endif

Modified: trunk/sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- trunk/sys/dev/cxgbe/tom/t4_listen.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/tom/t4_listen.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -50,6 +50,7 @@
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
+#include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
@@ -283,11 +284,11 @@
 
 	CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s",
 	    __func__, synqe, synqe->tid,
-	    synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ?
+	    synqe->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
-	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN))
+	if (synqe->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
-	synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN);
+	synqe->flags |= TPF_ABORT_SHUTDOWN;
 
 	get_qids_from_mbuf(m, &txqid, &rxqid);
 	ofld_txq = &sc->sge.ofld_txq[txqid];
@@ -318,7 +319,7 @@
         flowc->mnemval[2].val = htobe32(pi->tx_chan);
         flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
         flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
-	synqe_set_flag(synqe, TPF_FLOWC_WR_SENT);
+	synqe->flags |= TPF_FLOWC_WR_SENT;
 
 	/* ... then ABORT request */
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
@@ -515,7 +516,7 @@
 {
 
 	if (refcount_release(&synqe->refcnt)) {
-		int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE);
+		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
 
 		m_freem(synqe->syn);
 		if (needfree)
@@ -740,7 +741,7 @@
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
-	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) {
+	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
@@ -775,7 +776,7 @@
 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
 
 	INP_WLOCK(inp);
-	KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
 	    __func__, synqe, synqe->flags));
 
@@ -798,13 +799,14 @@
 
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
 	INP_WLOCK_ASSERT(inp);
-	KASSERT(synqe_flag(synqe, TPF_SYNQE),
+	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: %p not a synq_entry?", __func__, arg));
 
 	offload_socket(so, toep);
 	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
-	toepcb_set_flag(toep, TPF_CPL_PENDING);
+	toep->flags |= TPF_CPL_PENDING;
 	update_tid(sc, synqe->tid, toep);
+	synqe->flags |= TPF_SYNQE_EXPANDED;
 }
 
 static inline void
@@ -843,14 +845,12 @@
 		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
 		if (synqe == NULL)
 			return (NULL);
-	} else
+		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
+	} else {
 		synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe));
+		synqe->flags = TPF_SYNQE;
+	}
 
-	synqe->flags = 0;
-	synqe_set_flag(synqe, TPF_SYNQE);
-	if (tspace < len)
-		synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE);
-
 	return (synqe);
 }
 
@@ -881,7 +881,7 @@
  */
 static uint32_t
 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
-    const struct tcp_options *tcpopt, struct tcphdr *th)
+    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
 {
 	uint32_t opt2 = 0;
 	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
@@ -902,6 +902,11 @@
 	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
 	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
 
+#ifdef USE_DDP_RX_FLOW_CONTROL
+	if (ulp_mode == ULP_MODE_TCPDDP)
+		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
+#endif
+
 	return htobe32(opt2);
 }
 
@@ -985,7 +990,7 @@
 	struct l2t_entry *e = NULL;
 	struct rtentry *rt;
 	struct sockaddr_in nam;
-	int rscale, mtu_idx, rx_credits, rxqid;
+	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
 	struct synq_entry *synqe = NULL;
 	int reject_reason;
 	uint16_t vid;
@@ -1108,9 +1113,13 @@
 	get_qids_from_mbuf(m, NULL, &rxqid);
 
 	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
-	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
-	    ULP_MODE_NONE);
-	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);
+	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
+		ulp_mode = ULP_MODE_TCPDDP;
+		synqe->flags |= TPF_SYNQE_TCPDDP;
+	} else
+		ulp_mode = ULP_MODE_NONE;
+	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, ulp_mode);
+	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);
 
 	synqe->tid = tid;
 	synqe->lctx = lctx;
@@ -1151,7 +1160,7 @@
 		INP_WLOCK(inp);
 		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 			/* listener closed.  synqe must have been aborted. */
-			KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 			    ("%s: listener %p closed but synqe %p not aborted",
 			    __func__, inp, synqe));
 
@@ -1169,7 +1178,7 @@
 		 * that can only happen if the listener was closed and we just
 		 * checked for that.
 		 */
-		KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+		KASSERT(!(synqe->flags & TPF_ABORT_SHUTDOWN),
 		    ("%s: synqe %p aborted, but listener %p not dropped.",
 		    __func__, synqe, inp));
 
@@ -1266,7 +1275,7 @@
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
-	KASSERT(synqe_flag(synqe, TPF_SYNQE),
+	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
 
 	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
@@ -1283,7 +1292,7 @@
 		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
 		 * for cleaning up.
 		 */
-		KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+		KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: listen socket dropped but tid %u not aborted.",
 		    __func__, tid));
 
@@ -1313,7 +1322,10 @@
 	}
 	toep->tid = tid;
 	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
-	toep->ulp_mode = ULP_MODE_NONE;
+	if (synqe->flags & TPF_SYNQE_TCPDDP)
+		set_tcpddp_ulp_mode(toep);
+	else
+		toep->ulp_mode = ULP_MODE_NONE;
 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 	toep->rx_credits = synqe->rcv_bufsize;
 
@@ -1339,6 +1351,24 @@
 		goto reset;
 	}
 
+	/*
+	 * This is for the unlikely case where the syncache entry that we added
+	 * has been evicted from the syncache, but the syncache_expand above
+	 * works because of syncookies.
+	 *
+	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
+	 * anyone accept'ing a connection before we've installed our hooks, but
+	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
+	 */
+	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
+		struct inpcb *new_inp = sotoinpcb(so);
+
+		INP_WLOCK(new_inp);
+		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
+		t4_offload_socket(TOEDEV(ifp), synqe, so);
+		INP_WUNLOCK(new_inp);
+	}
+
 	/* Done with the synqe */
 	TAILQ_REMOVE(&lctx->synq, synqe, link);
 	inp = release_lctx(sc, lctx);

Modified: trunk/sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- trunk/sys/dev/cxgbe/tom/t4_tom.c	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/tom/t4_tom.c	2016-09-15 20:11:39 UTC (rev 8018)
@@ -55,6 +55,9 @@
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
+static struct protosw ddp_protosw;
+static struct pr_usrreqs ddp_usrreqs;
+
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
@@ -138,9 +141,9 @@
 free_toepcb(struct toepcb *toep)
 {
 
-	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: attached to an inpcb", __func__));
-	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: CPL pending", __func__));
 
 	free(toep, M_CXGBE);
@@ -167,6 +170,8 @@
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
+	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+		so->so_proto = &ddp_protosw;
 	SOCKBUF_UNLOCK(sb);
 
 	/* Update TCP PCB */
@@ -176,7 +181,7 @@
 
 	/* Install an extra hold on inp */
 	toep->inp = inp;
-	toepcb_set_flag(toep, TPF_ATTACHED);
+	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Add the TOE PCB to the active list */
@@ -211,7 +216,7 @@
 	tp->t_flags &= ~TF_TOE;
 
 	toep->inp = NULL;
-	toepcb_clr_flag(toep, TPF_ATTACHED);
+	toep->flags &= ~TPF_ATTACHED;
 	if (in_pcbrele_wlocked(inp))
 		panic("%s: inp freed.", __func__);
 
@@ -227,14 +232,17 @@
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
-	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
-	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)",
 	    __func__, toep, tid, toep->l2te);
 
+	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+		release_ddp_resources(toep);
+
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
@@ -269,7 +277,7 @@
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
-	KASSERT(toepcb_flag(toep, TPF_ATTACHED),
+	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
@@ -287,9 +295,9 @@
 
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
-	toepcb_clr_flag(toep, TPF_ATTACHED);
+	toep->flags &= ~TPF_ATTACHED;
 
-	if (toepcb_flag(toep, TPF_CPL_PENDING) == 0)
+	if (!(toep->flags & TPF_CPL_PENDING))
 		release_offload_resources(toep);
 }
 
@@ -304,7 +312,7 @@
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
-	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING),
+	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
@@ -311,9 +319,9 @@
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	toep->inp = NULL;
-	toepcb_clr_flag(toep, TPF_CPL_PENDING);
+	toep->flags &= ~TPF_CPL_PENDING;
 
-	if (toepcb_flag(toep, TPF_ATTACHED) == 0)
+	if (!(toep->flags & TPF_ATTACHED))
 		release_offload_resources(toep);
 
 	if (!in_pcbrele_wlocked(inp))
@@ -568,6 +576,8 @@
 	    ("%s: lctx hash table is not empty.", __func__));
 
 	t4_uninit_l2t_cpl_handlers(sc);
+	t4_uninit_cpl_io_handlers(sc);
+	t4_uninit_ddp(sc, td);
 
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
@@ -613,6 +623,8 @@
 	if (rc != 0)
 		goto done;
 
+	t4_init_ddp(sc, td);
+
 	/* CPL handlers */
 	t4_init_connect_cpl_handlers(sc);
 	t4_init_l2t_cpl_handlers(sc);
@@ -688,7 +700,17 @@
 t4_tom_mod_load(void)
 {
 	int rc;
+	struct protosw *tcp_protosw;
 
+	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
+	if (tcp_protosw == NULL)
+		return (ENOPROTOOPT);
+
+	bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw));
+	bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs));
+	ddp_usrreqs.pru_soreceive = t4_soreceive_ddp;
+	ddp_protosw.pr_usrreqs = &ddp_usrreqs;
+
 	rc = t4_register_uld(&tom_uld_info);
 	if (rc != 0)
 		t4_tom_mod_unload();

Modified: trunk/sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- trunk/sys/dev/cxgbe/tom/t4_tom.h	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/dev/cxgbe/tom/t4_tom.h	2016-09-15 20:11:39 UTC (rev 8018)
@@ -46,25 +46,58 @@
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
+#define	DDP_RSVD_WIN (16 * 1024U)
+#define	SB_DDP_INDICATE	SB_IN_TOE	/* soreceive must respond to indicate */
+
+#define	M_DDP	M_PROTO1
+
+#define USE_DDP_RX_FLOW_CONTROL
+
 /* TOE PCB flags */
 enum {
-	TPF_ATTACHED,		/* a tcpcb refers to this toepcb */
-	TPF_FLOWC_WR_SENT,	/* firmware flow context WR sent */
-	TPF_TX_DATA_SENT,	/* some data sent */
-	TPF_TX_SUSPENDED,	/* tx suspended for lack of resources */
-	TPF_SEND_FIN,		/* send FIN after sending all pending data */
-	TPF_FIN_SENT,		/* FIN has been sent */
-	TPF_ABORT_SHUTDOWN,	/* connection abort is in progress */
-	TPF_CPL_PENDING,	/* haven't received the last CPL */
-	TPF_SYNQE,		/* synq_entry, not really a toepcb */
-	TPF_SYNQE_NEEDFREE,	/* synq_entry was allocated externally */
+	TPF_ATTACHED	   = (1 << 0),	/* a tcpcb refers to this toepcb */
+	TPF_FLOWC_WR_SENT  = (1 << 1),	/* firmware flow context WR sent */
+	TPF_TX_DATA_SENT   = (1 << 2),	/* some data sent */
+	TPF_TX_SUSPENDED   = (1 << 3),	/* tx suspended for lack of resources */
+	TPF_SEND_FIN	   = (1 << 4),	/* send FIN after all pending data */
+	TPF_FIN_SENT	   = (1 << 5),	/* FIN has been sent */
+	TPF_ABORT_SHUTDOWN = (1 << 6),	/* connection abort is in progress */
+	TPF_CPL_PENDING    = (1 << 7),	/* haven't received the last CPL */
+	TPF_SYNQE	   = (1 << 8),	/* synq_entry, not really a toepcb */
+	TPF_SYNQE_NEEDFREE = (1 << 9),	/* synq_entry was malloc'd separately */
+	TPF_SYNQE_TCPDDP   = (1 << 10),	/* ulp_mode TCPDDP in toepcb */
+	TPF_SYNQE_EXPANDED = (1 << 11),	/* toepcb ready, tid context updated */
 };
 
+enum {
+	DDP_OK		= (1 << 0),	/* OK to turn on DDP */
+	DDP_SC_REQ	= (1 << 1),	/* state change (on/off) requested */
+	DDP_ON		= (1 << 2),	/* DDP is turned on */
+	DDP_BUF0_ACTIVE	= (1 << 3),	/* buffer 0 in use (not invalidated) */
+	DDP_BUF1_ACTIVE	= (1 << 4),	/* buffer 1 in use (not invalidated) */
+};
+
 struct ofld_tx_sdesc {
 	uint32_t plen;		/* payload length */
 	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
 };
 
+struct ppod_region {
+	TAILQ_ENTRY(ppod_region) link;
+	int used;	/* # of pods used by this region */
+	int free;	/* # of contiguous pods free right after this region */
+};
+
+struct ddp_buffer {
+	uint32_t tag;	/* includes color, page pod addr, and DDP page size */
+	int nppods;
+	int offset;
+	int len;
+	struct ppod_region ppod_region;
+	int npages;
+	vm_page_t *pages;
+};
+
 struct toepcb {
 	TAILQ_ENTRY(toepcb) link; /* toep_list */
 	unsigned int flags;	/* miscellaneous flags */
@@ -77,11 +110,16 @@
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	int tid;		/* Connection identifier */
 	unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */
-	unsigned int enqueued;	/* # of bytes added to so_rcv (not yet read) */
+	unsigned int sb_cc;	/* last noted value of so_rcv->sb_cc */
 	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
 
 	unsigned int ulp_mode;	/* ULP mode */
 
+	unsigned int ddp_flags;
+	struct ddp_buffer *db[2];
+	time_t ddp_disabled;
+	uint8_t ddp_score;
+
 	/* Tx software descriptor */
 	uint8_t txsd_total;
 	uint8_t txsd_pidx;
@@ -97,27 +135,19 @@
 	unsigned int mss;
 };
 
-static inline int
-toepcb_flag(struct toepcb *toep, int flag)
-{
+#define	DDP_RETRY_WAIT	5	/* seconds to wait before re-enabling DDP */
+#define	DDP_LOW_SCORE	1
+#define	DDP_HIGH_SCORE	3
 
-	return isset(&toep->flags, flag);
-}
-
 static inline void
-toepcb_set_flag(struct toepcb *toep, int flag)
+set_tcpddp_ulp_mode(struct toepcb *toep)
 {
 
-	setbit(&toep->flags, flag);
+	toep->ulp_mode = ULP_MODE_TCPDDP;
+	toep->ddp_flags = DDP_OK;
+	toep->ddp_score = DDP_LOW_SCORE;
 }
 
-static inline void
-toepcb_clr_flag(struct toepcb *toep, int flag)
-{
-
-	clrbit(&toep->flags, flag);
-}
-
 /*
  * Compressed state for embryonic connections for a listener.  Barely fits in
  * 64B, try not to grow it further.
@@ -136,27 +166,6 @@
 	uint16_t rcv_bufsize;
 };
 
-static inline int
-synqe_flag(struct synq_entry *synqe, int flag)
-{
-
-	return isset(&synqe->flags, flag);
-}
-
-static inline void
-synqe_set_flag(struct synq_entry *synqe, int flag)
-{
-
-	setbit(&synqe->flags, flag);
-}
-
-static inline void
-synqe_clr_flag(struct synq_entry *synqe, int flag)
-{
-
-	clrbit(&synqe->flags, flag);
-}
-
 /* listen_ctx flags */
 #define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
 
@@ -171,6 +180,8 @@
 	TAILQ_HEAD(, synq_entry) synq;
 };
 
+TAILQ_HEAD(ppod_head, ppod_region);
+
 struct tom_data {
 	struct toedev tod;
 
@@ -178,10 +189,16 @@
 	struct mtx toep_list_lock;
 	TAILQ_HEAD(, toepcb) toep_list;
 
+	struct mtx lctx_hash_lock;
 	LIST_HEAD(, listen_ctx) *listen_hash;
 	u_long listen_mask;
 	int lctx_count;		/* # of lctx in the hash table */
-	struct mtx lctx_hash_lock;
+
+	struct mtx ppod_lock;
+	int nppods;
+	int nppods_free;	/* # of available ppods */
+	int nppods_free_head;	/* # of available ppods at the beginning */
+	struct ppod_head ppods;
 };
 
 static inline struct tom_data *
@@ -236,6 +253,7 @@
 
 /* t4_cpl_io.c */
 void t4_init_cpl_io_handlers(struct adapter *);
+void t4_uninit_cpl_io_handlers(struct adapter *);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
 void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
 void send_reset(struct adapter *, struct toepcb *, uint32_t);
@@ -244,5 +262,14 @@
 int t4_tod_output(struct toedev *, struct tcpcb *);
 int t4_send_fin(struct toedev *, struct tcpcb *);
 int t4_send_rst(struct toedev *, struct tcpcb *);
+void t4_set_tcb_field(struct adapter *, struct toepcb *, uint16_t, uint64_t,
+    uint64_t);
 
+/* t4_ddp.c */
+void t4_init_ddp(struct adapter *, struct tom_data *);
+void t4_uninit_ddp(struct adapter *, struct tom_data *);
+int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
+    struct mbuf **, struct mbuf **, int *);
+void enable_ddp(struct adapter *, struct toepcb *toep);
+void release_ddp_resources(struct toepcb *toep);
 #endif

Modified: trunk/sys/modules/cxgbe/if_cxgbe/Makefile
===================================================================
--- trunk/sys/modules/cxgbe/if_cxgbe/Makefile	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/modules/cxgbe/if_cxgbe/Makefile	2016-09-15 20:11:39 UTC (rev 8018)
@@ -2,6 +2,8 @@
 # $FreeBSD$
 #
 
+.include <bsd.own.mk>
+
 CXGBE = ${.CURDIR}/../../../dev/cxgbe
 .PATH: ${CXGBE} ${CXGBE}/common
 
@@ -17,4 +19,17 @@
 # Provide the timestamp of a packet in its header mbuf.
 #CFLAGS+= -DT4_PKT_TIMESTAMP
 
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	@echo "#define INET 1" > ${.TARGET}
+	@echo "#define TCP_OFFLOAD 1" >> ${.TARGET}
+.endif
+
+.if ${MK_INET6_SUPPORT} != "no"
+opt_inet6.h:
+	@echo "#define INET6 1" > ${.TARGET}
+.endif
+.endif
+
 .include <bsd.kmod.mk>

Modified: trunk/sys/modules/cxgbe/tom/Makefile
===================================================================
--- trunk/sys/modules/cxgbe/tom/Makefile	2016-09-15 09:05:39 UTC (rev 8017)
+++ trunk/sys/modules/cxgbe/tom/Makefile	2016-09-15 20:11:39 UTC (rev 8018)
@@ -2,14 +2,24 @@
 # $FreeBSD$
 #
 
+.include <bsd.own.mk>
+
 CXGBE = ${.CURDIR}/../../../dev/cxgbe
 .PATH: ${CXGBE}/tom
 
 KMOD = t4_tom
-SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c
+SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c t4_ddp.c
 SRCS+= device_if.h bus_if.h pci_if.h
 SRCS+= opt_inet.h
 
 CFLAGS+= -I${CXGBE}
 
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	echo "#define INET 1" > ${.TARGET}
+	echo "#define TCP_OFFLOAD 1" >> ${.TARGET}
+.endif
+.endif
+
 .include <bsd.kmod.mk>



More information about the Midnightbsd-cvs mailing list