[Midnightbsd-cvs] src [8728] trunk/sys: sync netmap with FreeBSD @ rev 246355

laffer1 at midnightbsd.org
Sun Sep 25 19:53:30 EDT 2016


Revision: 8728
          http://svnweb.midnightbsd.org/src/?rev=8728
Author:   laffer1
Date:     2016-09-25 19:53:30 -0400 (Sun, 25 Sep 2016)
Log Message:
-----------
sync netmap with FreeBSD @ rev 246355

Revision Links:
--------------
    http://svnweb.midnightbsd.org/src/?rev=246355

Modified Paths:
--------------
    trunk/sys/dev/netmap/if_em_netmap.h
    trunk/sys/dev/netmap/if_igb_netmap.h
    trunk/sys/dev/netmap/if_lem_netmap.h
    trunk/sys/dev/netmap/if_re_netmap.h
    trunk/sys/dev/netmap/netmap.c
    trunk/sys/dev/netmap/netmap_kern.h
    trunk/sys/dev/netmap/netmap_mem2.c
    trunk/sys/net/netmap.h
    trunk/sys/net/netmap_user.h

Removed Paths:
-------------
    trunk/sys/dev/netmap/netmap_mem1.c

Modified: trunk/sys/dev/netmap/if_em_netmap.h
===================================================================
--- trunk/sys/dev/netmap/if_em_netmap.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/if_em_netmap.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -171,7 +171,7 @@
 	u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1;
 
 	/* generate an interrupt approximately every half ring */
-	int report_frequency = kring->nkr_num_slots >> 1;
+	u_int report_frequency = kring->nkr_num_slots >> 1;
 
 	k = ring->cur;
 	if (k > lim)
@@ -292,6 +292,8 @@
 	l = rxr->next_to_check;
 	j = netmap_idx_n2k(kring, l);
 	if (netmap_no_pendintr || force_update) {
+		uint16_t slot_flags = kring->nkr_slot_flags;
+
 		for (n = 0; ; n++) {
 			struct e1000_rx_desc *curr = &rxr->rx_base[l];
 			uint32_t staterr = le32toh(curr->status);
@@ -299,6 +301,7 @@
 			if ((staterr & E1000_RXD_STAT_DD) == 0)
 				break;
 			ring->slot[j].len = le16toh(curr->length);
+			ring->slot[j].flags = slot_flags;
 			bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map,
 				BUS_DMASYNC_POSTREAD);
 			j = (j == lim) ? 0 : j + 1;

Modified: trunk/sys/dev/netmap/if_igb_netmap.h
===================================================================
--- trunk/sys/dev/netmap/if_igb_netmap.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/if_igb_netmap.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -125,7 +125,7 @@
 	u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1;
 
 	/* generate an interrupt approximately every half ring */
-	int report_frequency = kring->nkr_num_slots >> 1;
+	u_int report_frequency = kring->nkr_num_slots >> 1;
 
 	k = ring->cur;
 	if (k > lim)
@@ -263,6 +263,8 @@
 	l = rxr->next_to_check;
 	j = netmap_idx_n2k(kring, l);
 	if (netmap_no_pendintr || force_update) {
+		uint16_t slot_flags = kring->nkr_slot_flags;
+
 		for (n = 0; ; n++) {
 			union e1000_adv_rx_desc *curr = &rxr->rx_base[l];
 			uint32_t staterr = le32toh(curr->wb.upper.status_error);
@@ -270,6 +272,7 @@
 			if ((staterr & E1000_RXD_STAT_DD) == 0)
 				break;
 			ring->slot[j].len = le16toh(curr->wb.upper.length);
+			ring->slot[j].flags = slot_flags;
 			bus_dmamap_sync(rxr->ptag,
 				rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD);
 			j = (j == lim) ? 0 : j + 1;

Modified: trunk/sys/dev/netmap/if_lem_netmap.h
===================================================================
--- trunk/sys/dev/netmap/if_lem_netmap.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/if_lem_netmap.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -253,6 +253,8 @@
 	l = adapter->next_rx_desc_to_check;
 	j = netmap_idx_n2k(kring, l);
 	if (netmap_no_pendintr || force_update) {
+		uint16_t slot_flags = kring->nkr_slot_flags;
+
 		for (n = 0; ; n++) {
 			struct e1000_rx_desc *curr = &adapter->rx_desc_base[l];
 			uint32_t staterr = le32toh(curr->status);
@@ -266,6 +268,7 @@
 				len = 0;
 			}
 			ring->slot[j].len = len;
+			ring->slot[j].flags = slot_flags;
 			bus_dmamap_sync(adapter->rxtag,
 				adapter->rx_buffer_area[l].map,
 				    BUS_DMASYNC_POSTREAD);

Modified: trunk/sys/dev/netmap/if_re_netmap.h
===================================================================
--- trunk/sys/dev/netmap/if_re_netmap.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/if_re_netmap.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -245,6 +245,8 @@
 	l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */
 	j = netmap_idx_n2k(kring, l); /* the kring index */
 	if (netmap_no_pendintr || force_update) {
+		uint16_t slot_flags = kring->nkr_slot_flags;
+
 		for (n = kring->nr_hwavail; n < lim ; n++) {
 			struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l];
 			uint32_t rxstat = le32toh(cur_rx->rl_cmdstat);
@@ -256,6 +258,7 @@
 			/* XXX subtract crc */
 			total_len = (total_len < 4) ? 0 : total_len - 4;
 			kring->ring->slot[j].len = total_len;
+			kring->ring->slot[j].flags = slot_flags;
 			/*  sync was in re_newbuf() */
 			bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
 			    rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD);
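
The four driver changes above all apply the same pattern: the RX sync path latches kring->nkr_slot_flags once per pass and stamps it into every slot it completes, so the per-ring default slot flags reach userspace. A condensed sketch of that loop, reusing the variable names from the hunks above (j, l, lim, kring, ring); hw_rx_done() and hw_rx_len() are hypothetical stand-ins for each driver's descriptor accessors, not real functions:

	/* latched once per rxsync pass */
	uint16_t slot_flags = kring->nkr_slot_flags;

	for (n = 0; hw_rx_done(l); n++) {
		ring->slot[j].len = hw_rx_len(l);
		ring->slot[j].flags = slot_flags;	/* the new line in each driver */
		j = (j == lim) ? 0 : j + 1;
		l = (l == lim) ? 0 : l + 1;
	}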

Modified: trunk/sys/dev/netmap/netmap.c
===================================================================
--- trunk/sys/dev/netmap/netmap.c	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/netmap.c	2016-09-25 23:53:30 UTC (rev 8728)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -23,6 +23,8 @@
  * SUCH DAMAGE.
  */
 
+#define NM_BRIDGE
+
 /*
  * This module supports memory mapped access to network devices,
  * see netmap(4).
@@ -52,6 +54,16 @@
  *    transmit or receive queues (or all queues for a given interface).
  */
 
+#ifdef linux
+#include "bsd_glue.h"
+static netdev_tx_t linux_netmap_start(struct sk_buff *skb, struct net_device *dev);
+#endif /* linux */
+
+#ifdef __APPLE__
+#include "osx_glue.h"
+#endif /* __APPLE__ */
+
+#ifdef __FreeBSD__
 #include <sys/cdefs.h> /* prerequisite */
 __MBSDID("$MidnightBSD$");
 
@@ -78,21 +90,16 @@
 #include <net/if.h>
 #include <net/bpf.h>		/* BIOCIMMEDIATE */
 #include <net/vnet.h>
-#include <net/netmap.h>
-#include <dev/netmap/netmap_kern.h>
 #include <machine/bus.h>	/* bus_dmamap_* */
 
 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
+#endif /* __FreeBSD__ */
 
-/*
- * lock and unlock for the netmap memory allocator
- */
-#define NMA_LOCK()	mtx_lock(&nm_mem->nm_mtx);
-#define NMA_UNLOCK()	mtx_unlock(&nm_mem->nm_mtx);
-struct netmap_mem_d;
-static struct netmap_mem_d *nm_mem;	/* Our memory allocator. */
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
 
 u_int netmap_total_buffers;
+u_int netmap_buf_size;
 char *netmap_buffer_base;	/* address of an invalid buffer */
 
 /* user-controlled variables */
@@ -105,10 +112,6 @@
     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
-int netmap_buf_size = 2048;
-TUNABLE_INT("hw.netmap.buf_size", &netmap_buf_size);
-SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size,
-    CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers");
 int netmap_mitigate = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
 int netmap_no_pendintr = 1;
@@ -115,7 +118,210 @@
 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
 
+int netmap_drop = 0;	/* debugging */
+int netmap_flags = 0;	/* debug flags */
+int netmap_fwd = 0;	/* force transparent mode */
+int netmap_copy = 0;	/* debugging, copy content */
 
+SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, copy, CTLFLAG_RW, &netmap_copy, 0 , "");
+
+#ifdef NM_BRIDGE /* support for netmap bridge */
+
+/*
+ * system parameters.
+ *
+ * All switched ports have prefix NM_NAME.
+ * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap,
+ * so a practical upper bound is 64).
+ * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet).
+ * The virtual interfaces use per-queue lock instead of core lock.
+ * In the tx loop, we aggregate traffic in batches to make all operations
+ * faster. The batch size is NM_BDG_BATCH
+ */
+#define	NM_NAME			"vale"	/* prefix for the interface */
+#define NM_BDG_MAXPORTS		16	/* up to 64 ? */
+#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
+#define NM_BDG_HASH		1024	/* forwarding table entries */
+#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
+#define	NM_BRIDGES		4	/* number of bridges */
+int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */
+SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , "");
+
+#ifdef linux
+#define	ADD_BDG_REF(ifp)	(NA(ifp)->if_refcount++)
+#define	DROP_BDG_REF(ifp)	(NA(ifp)->if_refcount-- <= 1)
+#else /* !linux */
+#define	ADD_BDG_REF(ifp)	(ifp)->if_refcount++
+#define	DROP_BDG_REF(ifp)	refcount_release(&(ifp)->if_refcount)
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#include <sys/refcount.h>
+#endif /* __FreeBSD__ */
+#define prefetch(x)	__builtin_prefetch(x)
+#endif /* !linux */
+
+static void bdg_netmap_attach(struct ifnet *ifp);
+static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
+/* per-tx-queue entry */
+struct nm_bdg_fwd {	/* forwarding entry for a bridge */
+	void *buf;
+	uint64_t dst;	/* dst mask */
+	uint32_t src;	/* src index ? */
+	uint16_t len;	/* src len */
+};
+
+struct nm_hash_ent {
+	uint64_t	mac;	/* the top 2 bytes are the epoch */
+	uint64_t	ports;
+};
+
+/*
+ * Interfaces for a bridge are all in ports[].
+ * The array has fixed size, an empty entry does not terminate
+ * the search.
+ */
+struct nm_bridge {
+	struct ifnet *bdg_ports[NM_BDG_MAXPORTS];
+	int n_ports;
+	uint64_t act_ports;
+	int freelist;	/* first buffer index */
+	NM_SELINFO_T si;	/* poll/select wait queue */
+	NM_LOCK_T bdg_lock;	/* protect the selinfo ? */
+
+	/* the forwarding table, MAC+ports */
+	struct nm_hash_ent ht[NM_BDG_HASH];
+
+	int namelen;	/* 0 means free */
+	char basename[IFNAMSIZ];
+};
+
+struct nm_bridge nm_bridges[NM_BRIDGES];
+
+#define BDG_LOCK(b)	mtx_lock(&(b)->bdg_lock)
+#define BDG_UNLOCK(b)	mtx_unlock(&(b)->bdg_lock)
+
+/*
+ * NA(ifp)->bdg_port	port index
+ */
+
+// XXX only for multiples of 64 bytes, non overlapped.
+static inline void
+pkt_copy(void *_src, void *_dst, int l)
+{
+        uint64_t *src = _src;
+        uint64_t *dst = _dst;
+        if (unlikely(l >= 1024)) {
+                bcopy(src, dst, l);
+                return;
+        }
+        for (; likely(l > 0); l-=64) {
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+        }
+}
+
+/*
+ * locate a bridge among the existing ones.
+ * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
+ * We assume that this is called with a name of at least NM_NAME chars.
+ */
+static struct nm_bridge *
+nm_find_bridge(const char *name)
+{
+	int i, l, namelen, e;
+	struct nm_bridge *b = NULL;
+
+	namelen = strlen(NM_NAME);	/* base length */
+	l = strlen(name);		/* actual length */
+	for (i = namelen + 1; i < l; i++) {
+		if (name[i] == ':') {
+			namelen = i;
+			break;
+		}
+	}
+	if (namelen >= IFNAMSIZ)
+		namelen = IFNAMSIZ;
+	ND("--- prefix is '%.*s' ---", namelen, name);
+
+	/* use the first entry for locking */
+	BDG_LOCK(nm_bridges); // XXX do better
+	for (e = -1, i = 1; i < NM_BRIDGES; i++) {
+		b = nm_bridges + i;
+		if (b->namelen == 0)
+			e = i;	/* record empty slot */
+		else if (strncmp(name, b->basename, namelen) == 0) {
+			ND("found '%.*s' at %d", namelen, name, i);
+			break;
+		}
+	}
+	if (i == NM_BRIDGES) { /* all full */
+		if (e == -1) { /* no empty slot */
+			b = NULL;
+		} else {
+			b = nm_bridges + e;
+			strncpy(b->basename, name, namelen);
+			b->namelen = namelen;
+		}
+	}
+	BDG_UNLOCK(nm_bridges);
+	return b;
+}
+#endif /* NM_BRIDGE */
+
+
+/*
+ * Fetch configuration from the device, to cope with dynamic
+ * reconfigurations after loading the module.
+ */
+static int
+netmap_update_config(struct netmap_adapter *na)
+{
+	struct ifnet *ifp = na->ifp;
+	u_int txr, txd, rxr, rxd;
+
+	txr = txd = rxr = rxd = 0;
+	if (na->nm_config) {
+		na->nm_config(ifp, &txr, &txd, &rxr, &rxd);
+	} else {
+		/* take whatever we had at init time */
+		txr = na->num_tx_rings;
+		txd = na->num_tx_desc;
+		rxr = na->num_rx_rings;
+		rxd = na->num_rx_desc;
+	}	
+
+	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
+	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
+		return 0; /* nothing changed */
+	if (netmap_verbose || na->refcount > 0) {
+		D("stored config %s: txring %d x %d, rxring %d x %d",
+			ifp->if_xname,
+			na->num_tx_rings, na->num_tx_desc,
+			na->num_rx_rings, na->num_rx_desc);
+		D("new config %s: txring %d x %d, rxring %d x %d",
+			ifp->if_xname, txr, txd, rxr, rxd);
+	}
+	if (na->refcount == 0) {
+		D("configuration changed (but fine)");
+		na->num_tx_rings = txr;
+		na->num_tx_desc = txd;
+		na->num_rx_rings = rxr;
+		na->num_rx_desc = rxd;
+		return 0;
+	}
+	D("configuration changed while active, this is bad...");
+	return 1;
+}
+
 /*------------- memory allocator -----------------*/
 #ifdef NETMAP_MEM2
 #include "netmap_mem2.c"
@@ -124,17 +330,55 @@
 #endif /* !NETMAP_MEM2 */
 /*------------ end of memory allocator ----------*/
 
-/* Structure associated to each thread which registered an interface. */
+
+/* Structure associated to each thread which registered an interface.
+ *
+ * The first 4 fields of this structure are written by NIOCREGIF and
+ * read by poll() and NIOC?XSYNC.
+ * There is low contention among writers (actually, a correct user program
+ * should have no contention among writers) and among writers and readers,
+ * so we use a single global lock to protect the structure initialization.
+ * Since initialization involves the allocation of memory, we reuse the memory
+ * allocator lock.
+ * Read access to the structure is lock free. Readers must check that
+ * np_nifp is not NULL before using the other fields.
+ * If np_nifp is NULL initialization has not been performed, so they should
+ * return an error to userlevel.
+ *
+ * The ref_done field is used to regulate access to the refcount in the
+ * memory allocator. The refcount must be incremented at most once for
+ * each open("/dev/netmap"). The increment is performed by the first
+ * function that calls netmap_get_memory() (currently called by
+ * mmap(), NIOCGINFO and NIOCREGIF).
+ * If the refcount is incremented, it is then decremented when the
+ * private structure is destroyed.
+ */
 struct netmap_priv_d {
-	struct netmap_if *np_nifp;	/* netmap interface descriptor. */
+	struct netmap_if * volatile np_nifp;	/* netmap interface descriptor. */
 
 	struct ifnet	*np_ifp;	/* device for which we hold a reference */
 	int		np_ringid;	/* from the ioctl */
 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
 	uint16_t	np_txpoll;
+
+	unsigned long	ref_done;	/* use with NMA_LOCK held */
 };
 
 
+static int
+netmap_get_memory(struct netmap_priv_d* p)
+{
+	int error = 0;
+	NMA_LOCK();
+	if (!p->ref_done) {
+		error = netmap_memory_finalize();
+		if (!error)
+			p->ref_done = 1;
+	}
+	NMA_UNLOCK();
+	return error;
+}
+
 /*
  * File descriptor's private data destructor.
  *
@@ -141,6 +385,7 @@
  * Call nm_register(ifp,0) to stop netmap mode on the interface and
  * revert to normal operation. We expect that np_ifp has not gone.
  */
+/* call with NMA_LOCK held */
 static void
 netmap_dtor_locked(void *data)
 {
@@ -153,7 +398,8 @@
 	if (na->refcount <= 0) {	/* last instance */
 		u_int i, j, lim;
 
-		D("deleting last netmap instance for %s", ifp->if_xname);
+		if (netmap_verbose)
+			D("deleting last instance for %s", ifp->if_xname);
 		/*
 		 * there is a race here with *_netmap_task() and
 		 * netmap_poll(), which don't run under NETMAP_REG_LOCK.
@@ -180,7 +426,6 @@
 		selwakeuppri(&na->tx_si, PI_NET);
 		selwakeuppri(&na->rx_si, PI_NET);
 		/* release all buffers */
-		NMA_LOCK();
 		for (i = 0; i < na->num_tx_rings + 1; i++) {
 			struct netmap_ring *ring = na->tx_rings[i].ring;
 			lim = na->tx_rings[i].nkr_num_slots;
@@ -200,7 +445,6 @@
 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
 		/* knlist_destroy(&na->tx_si.si_note); */
 		/* knlist_destroy(&na->rx_si.si_note); */
-		NMA_UNLOCK();
 		netmap_free_rings(na);
 		wakeup(na);
 	}
@@ -207,24 +451,131 @@
 	netmap_if_free(nifp);
 }
 
+static void
+nm_if_rele(struct ifnet *ifp)
+{
+#ifndef NM_BRIDGE
+	if_rele(ifp);
+#else /* NM_BRIDGE */
+	int i, full;
+	struct nm_bridge *b;
 
+	if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+		if_rele(ifp);
+		return;
+	}
+	if (!DROP_BDG_REF(ifp))
+		return;
+	b = ifp->if_bridge;
+	BDG_LOCK(nm_bridges);
+	BDG_LOCK(b);
+	ND("want to disconnect %s from the bridge", ifp->if_xname);
+	full = 0;
+	for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+		if (b->bdg_ports[i] == ifp) {
+			b->bdg_ports[i] = NULL;
+			bzero(ifp, sizeof(*ifp));
+			free(ifp, M_DEVBUF);
+			break;
+		}
+		else if (b->bdg_ports[i] != NULL)
+			full = 1;
+	}
+	BDG_UNLOCK(b);
+	if (full == 0) {
+		ND("freeing bridge %d", b - nm_bridges);
+		b->namelen = 0;
+	}
+	BDG_UNLOCK(nm_bridges);
+	if (i == NM_BDG_MAXPORTS)
+		D("ouch, cannot find ifp to remove");
+#endif /* NM_BRIDGE */
+}
+
 static void
 netmap_dtor(void *data)
 {
 	struct netmap_priv_d *priv = data;
 	struct ifnet *ifp = priv->np_ifp;
-	struct netmap_adapter *na = NA(ifp);
+	struct netmap_adapter *na;
 
-	na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
-	netmap_dtor_locked(data);
-	na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+	NMA_LOCK();
+	if (ifp) {
+		na = NA(ifp);
+		na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+		netmap_dtor_locked(data);
+		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
 
-	if_rele(ifp);
+		nm_if_rele(ifp);
+	}
+	if (priv->ref_done) {
+		netmap_memory_deref();
+	}
+	NMA_UNLOCK();
 	bzero(priv, sizeof(*priv));	/* XXX for safety */
 	free(priv, M_DEVBUF);
 }
 
+#ifdef __FreeBSD__
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/uma.h>
 
+static struct cdev_pager_ops saved_cdev_pager_ops;
+
+static int
+netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+    vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+	if (netmap_verbose)
+		D("first mmap for %p", handle);
+	return saved_cdev_pager_ops.cdev_pg_ctor(handle,
+			size, prot, foff, cred, color);
+}
+
+static void
+netmap_dev_pager_dtor(void *handle)
+{
+	saved_cdev_pager_ops.cdev_pg_dtor(handle);
+	ND("ready to release memory for %p", handle);
+}
+
+
+static struct cdev_pager_ops netmap_cdev_pager_ops = {
+        .cdev_pg_ctor = netmap_dev_pager_ctor,
+        .cdev_pg_dtor = netmap_dev_pager_dtor,
+        .cdev_pg_fault = NULL,
+};
+
+static int
+netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
+	vm_size_t objsize,  vm_object_t *objp, int prot)
+{
+	vm_object_t obj;
+
+	ND("cdev %p foff %jd size %jd objp %p prot %d", cdev,
+	    (intmax_t )*foff, (intmax_t )objsize, objp, prot);
+	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
+            curthread->td_ucred);
+	ND("returns obj %p", obj);
+	if (obj == NULL)
+		return EINVAL;
+	if (saved_cdev_pager_ops.cdev_pg_fault == NULL) {
+		ND("initialize cdev_pager_ops");
+		saved_cdev_pager_ops = *(obj->un_pager.devp.ops);
+		netmap_cdev_pager_ops.cdev_pg_fault =
+			saved_cdev_pager_ops.cdev_pg_fault;
+	};
+	obj->un_pager.devp.ops = &netmap_cdev_pager_ops;
+	*objp = obj;
+	return 0;
+}
+#endif /* __FreeBSD__ */
+
+
 /*
  * mmap(2) support for the "netmap" device.
  *
@@ -235,6 +586,7 @@
  * Return 0 on success, -1 otherwise.
  */
 
+#ifdef __FreeBSD__
 static int
 netmap_mmap(__unused struct cdev *dev,
 #if __FreeBSD_version < 900000
@@ -245,53 +597,128 @@
 #endif
 	)
 {
+	int error = 0;
+	struct netmap_priv_d *priv;
+
 	if (nprot & PROT_EXEC)
 		return (-1);	// XXX -1 or EINVAL ?
 
+	error = devfs_get_cdevpriv((void **)&priv);
+	if (error == EBADF) {	/* called on fault, memory is initialized */
+		ND(5, "handling fault at ofs 0x%x", offset);
+		error = 0;
+	} else if (error == 0)	/* make sure memory is set */
+		error = netmap_get_memory(priv);
+	if (error)
+		return (error);
+
 	ND("request for offset 0x%x", (uint32_t)offset);
 	*paddr = netmap_ofstophys(offset);
 
-	return (0);
+	return (*paddr ? 0 : ENOMEM);
 }
 
+static int
+netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+	if (netmap_verbose)
+		D("dev %p fflag 0x%x devtype %d td %p",
+			dev, fflag, devtype, td);
+	return 0;
+}
 
+static int
+netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	struct netmap_priv_d *priv;
+	int error;
+
+	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+			      M_NOWAIT | M_ZERO);
+	if (priv == NULL)
+		return ENOMEM;
+
+	error = devfs_set_cdevpriv(priv, netmap_dtor);
+	if (error)
+	        return error;
+
+	return 0;
+}
+#endif /* __FreeBSD__ */
+
+
 /*
  * Handlers for synchronization of the queues from/to the host.
- *
- * netmap_sync_to_host() passes packets up. We are called from a
- * system call in user process context, and the only contention
- * can be among multiple user threads erroneously calling
- * this routine concurrently. In principle we should not even
- * need to lock.
+ * Netmap has two operating modes:
+ * - in the default mode, the rings connected to the host stack are
+ *   just another ring pair managed by userspace;
+ * - in transparent mode (XXX to be defined) incoming packets
+ *   (from the host or the NIC) are marked as NS_FORWARD upon
+ *   arrival, and the user application has a chance to reset the
+ *   flag for packets that should be dropped.
+ *   On the RXSYNC or poll(), packets in RX rings between
+ *   kring->nr_kcur and ring->cur with NS_FORWARD still set are moved
+ *   to the other side.
+ * The transfer NIC --> host is relatively easy, just encapsulate
+ * into mbufs and we are done. The host --> NIC side is slightly
+ * harder because there might not be room in the tx ring so it
+ * might take a while before releasing the buffer.
  */
+
+/*
+ * pass a chain of buffers to the host stack as coming from 'dst'
+ */
 static void
-netmap_sync_to_host(struct netmap_adapter *na)
+netmap_send_up(struct ifnet *dst, struct mbuf *head)
 {
-	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
-	struct netmap_ring *ring = kring->ring;
-	struct mbuf *head = NULL, *tail = NULL, *m;
-	u_int k, n, lim = kring->nkr_num_slots - 1;
+	struct mbuf *m;
 
-	k = ring->cur;
-	if (k > lim) {
-		netmap_ring_reinit(kring);
-		return;
+	/* send packets up, outside the lock */
+	while ((m = head) != NULL) {
+		head = head->m_nextpkt;
+		m->m_nextpkt = NULL;
+		if (netmap_verbose & NM_VERB_HOST)
+			D("sending up pkt %p size %d", m, MBUF_LEN(m));
+		NM_SEND_UP(dst, m);
 	}
-	// na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0);
+}
 
-	/* Take packets from hwcur to cur and pass them up.
+struct mbq {
+	struct mbuf *head;
+	struct mbuf *tail;
+	int count;
+};
+
+/*
+ * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
+ * Run from hwcur to cur - reserved
+ */
+static void
+netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
+{
+	/* Take packets from hwcur to cur-reserved and pass them up.
 	 * In case of no buffers we give up. At the end of the loop,
 	 * the queue is drained in all cases.
+	 * XXX handle reserved
 	 */
+	int k = kring->ring->cur - kring->ring->reserved;
+	u_int n, lim = kring->nkr_num_slots - 1;
+	struct mbuf *m, *tail = q->tail;
+
+	if (k < 0)
+		k = k + kring->nkr_num_slots;
 	for (n = kring->nr_hwcur; n != k;) {
-		struct netmap_slot *slot = &ring->slot[n];
+		struct netmap_slot *slot = &kring->ring->slot[n];
 
 		n = (n == lim) ? 0 : n + 1;
+		if ((slot->flags & NS_FORWARD) == 0 && !force)
+			continue;
 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) {
 			D("bad pkt at %d len %d", n, slot->len);
 			continue;
 		}
-		m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL);
+		slot->flags &= ~NS_FORWARD; // XXX needed ?
+		m = m_devget(NMB(slot), slot->len, 0, kring->na->ifp, NULL);
 
 		if (m == NULL)
 			break;
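
The comment block in this hunk describes the new "transparent" mode: when NR_FORWARD is set on a ring (or the netmap_fwd sysctl forces it), slots arrive with NS_FORWARD set and whatever the application leaves marked is pushed to the other side on the next sync. A rough user-side sketch of that contract; the ring walk and the want_to_forward() predicate are assumptions for illustration, not part of this commit:

/* clear NS_FORWARD on packets the application wants dropped; anything
 * still marked is forwarded by the kernel on the next rxsync/poll() */
static void
filter_and_forward(struct netmap_ring *ring)
{
	u_int i, idx = ring->cur;

	for (i = 0; i < ring->avail; i++) {
		struct netmap_slot *slot = &ring->slot[idx];

		if (!want_to_forward(slot))	/* hypothetical predicate */
			slot->flags &= ~NS_FORWARD;
		idx = (idx + 1 == ring->num_slots) ? 0 : idx + 1;
	}
	ring->cur = idx;	/* consume what was inspected */
	ring->avail = 0;
}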
@@ -298,22 +725,94 @@
 		if (tail)
 			tail->m_nextpkt = m;
 		else
-			head = m;
+			q->head = m;
 		tail = m;
+		q->count++;
 		m->m_nextpkt = NULL;
 	}
+	q->tail = tail;
+}
+
+/*
+ * called under main lock to send packets from the host to the NIC
+ * The host ring has packets from nr_hwcur to (cur - reserved)
+ * to be sent down. We scan the tx rings, which have just been
+ * flushed so nr_hwcur == cur. Pushing packets down means
+ * increment cur and decrement avail.
+ * XXX to be verified
+ */
+static void
+netmap_sw_to_nic(struct netmap_adapter *na)
+{
+	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
+	struct netmap_kring *k1 = &na->tx_rings[0];
+	int i, howmany, src_lim, dst_lim;
+
+	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
+
+	src_lim = kring->nkr_num_slots;
+	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
+		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
+		dst_lim = k1->nkr_num_slots;
+		while (howmany > 0 && k1->ring->avail > 0) {
+			struct netmap_slot *src, *dst, tmp;
+			src = &kring->ring->slot[kring->nr_hwcur];
+			dst = &k1->ring->slot[k1->ring->cur];
+			tmp = *src;
+			src->buf_idx = dst->buf_idx;
+			src->flags = NS_BUF_CHANGED;
+
+			dst->buf_idx = tmp.buf_idx;
+			dst->len = tmp.len;
+			dst->flags = NS_BUF_CHANGED;
+			ND("out len %d buf %d from %d to %d",
+				dst->len, dst->buf_idx,
+				kring->nr_hwcur, k1->ring->cur);
+
+			if (++kring->nr_hwcur >= src_lim)
+				kring->nr_hwcur = 0;
+			howmany--;
+			kring->nr_hwavail--;
+			if (++k1->ring->cur >= dst_lim)
+				k1->ring->cur = 0;
+			k1->ring->avail--;
+		}
+		kring->ring->cur = kring->nr_hwcur; // XXX
+		k1++;
+	}
+}
+
+/*
+ * netmap_sync_to_host() passes packets up. We are called from a
+ * system call in user process context, and the only contention
+ * can be among multiple user threads erroneously calling
+ * this routine concurrently.
+ */
+static void
+netmap_sync_to_host(struct netmap_adapter *na)
+{
+	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
+	struct netmap_ring *ring = kring->ring;
+	u_int k, lim = kring->nkr_num_slots - 1;
+	struct mbq q = { NULL, NULL };
+
+	k = ring->cur;
+	if (k > lim) {
+		netmap_ring_reinit(kring);
+		return;
+	}
+	// na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0);
+
+	/* Take packets from hwcur to cur and pass them up.
+	 * In case of no buffers we give up. At the end of the loop,
+	 * the queue is drained in all cases.
+	 */
+	netmap_grab_packets(kring, &q, 1);
 	kring->nr_hwcur = k;
 	kring->nr_hwavail = ring->avail = lim;
 	// na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0);
 
-	/* send packets up, outside the lock */
-	while ((m = head) != NULL) {
-		head = head->m_nextpkt;
-		m->m_nextpkt = NULL;
-		if (netmap_verbose & NM_VERB_HOST)
-			D("sending up pkt %p size %d", m, MBUF_LEN(m));
-		NM_SEND_UP(na->ifp, m);
-	}
+	netmap_send_up(na->ifp, q.head);
 }
 
 /*
@@ -323,9 +822,12 @@
  *
  * This routine also does the selrecord if called from the poll handler
  * (we know because td != NULL).
+ *
+ * NOTE: on linux, selrecord() is defined as a macro and uses pwait
+ *     as an additional hidden argument.
  */
 static void
-netmap_sync_from_host(struct netmap_adapter *na, struct thread *td)
+netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
 {
 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
 	struct netmap_ring *ring = kring->ring;
@@ -332,6 +834,7 @@
 	u_int j, n, lim = kring->nkr_num_slots;
 	u_int k = ring->cur, resvd = ring->reserved;
 
+	(void)pwait;	/* disable unused warnings */
 	na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0);
 	if (k >= lim) {
 		netmap_ring_reinit(kring);
@@ -370,6 +873,64 @@
 static int
 get_ifp(const char *name, struct ifnet **ifp)
 {
+#ifdef NM_BRIDGE
+	struct ifnet *iter = NULL;
+
+	do {
+		struct nm_bridge *b;
+		int i, l, cand = -1;
+
+		if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1))
+			break;
+		b = nm_find_bridge(name);
+		if (b == NULL) {
+			D("no bridges available for '%s'", name);
+			return (ENXIO);
+		}
+		/* XXX locking */
+		BDG_LOCK(b);
+		/* lookup in the local list of ports */
+		for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+			iter = b->bdg_ports[i];
+			if (iter == NULL) {
+				if (cand == -1)
+					cand = i; /* potential insert point */
+				continue;
+			}
+			if (!strcmp(iter->if_xname, name)) {
+				ADD_BDG_REF(iter);
+				ND("found existing interface");
+				BDG_UNLOCK(b);
+				break;
+			}
+		}
+		if (i < NM_BDG_MAXPORTS) /* already unlocked */
+			break;
+		if (cand == -1) {
+			D("bridge full, cannot create new port");
+no_port:
+			BDG_UNLOCK(b);
+			*ifp = NULL;
+			return EINVAL;
+		}
+		ND("create new bridge port %s", name);
+		/* space for forwarding list after the ifnet */
+		l = sizeof(*iter) +
+			 sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ;
+		iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!iter)
+			goto no_port;
+		strcpy(iter->if_xname, name);
+		bdg_netmap_attach(iter);
+		b->bdg_ports[cand] = iter;
+		iter->if_bridge = b;
+		ADD_BDG_REF(iter);
+		BDG_UNLOCK(b);
+		ND("attaching virtual bridge %p", b);
+	} while (0);
+	*ifp = iter;
+	if (! *ifp)
+#endif /* NM_BRIDGE */
 	*ifp = ifunit_ref(name);
 	if (*ifp == NULL)
 		return (ENXIO);
@@ -376,9 +937,9 @@
 	/* can do this if the capability exists and if_pspare[0]
 	 * points to the netmap descriptor.
 	 */
-	if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp))
+	if (NETMAP_CAPABLE(*ifp))
 		return 0;	/* valid pointer, we hold the refcount */
-	if_rele(*ifp);
+	nm_if_rele(*ifp);
 	return EINVAL;	// not NETMAP capable
 }
 
@@ -402,7 +963,7 @@
 	u_int i, lim = kring->nkr_num_slots - 1;
 	int errors = 0;
 
-	D("called for %s", kring->na->ifp->if_xname);
+	RD(10, "called for %s", kring->na->ifp->if_xname);
 	if (ring->cur > lim)
 		errors++;
 	for (i = 0; i <= lim; i++) {
@@ -424,9 +985,9 @@
 		int pos = kring - kring->na->tx_rings;
 		int n = kring->na->num_tx_rings + 1;
 
-		D("total %d errors", errors);
+		RD(10, "total %d errors", errors);
 		errors++;
-		D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
+		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
 			kring->na->ifp->if_xname,
 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
 			ring->cur, kring->nr_hwcur,
@@ -474,6 +1035,7 @@
 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
 	if (need_lock)
 		na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
+    if (netmap_verbose) {
 	if (ringid & NETMAP_SW_RING)
 		D("ringid %s set to SW RING", ifp->if_xname);
 	else if (ringid & NETMAP_HW_RING)
@@ -481,6 +1043,7 @@
 			priv->np_qfirst);
 	else
 		D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim);
+    }
 	return 0;
 }
 
@@ -498,8 +1061,8 @@
  * Return 0 on success, errno otherwise.
  */
 static int
-netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data,
-	__unused int fflag, struct thread *td)
+netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
+	int fflag, struct thread *td)
 {
 	struct netmap_priv_d *priv = NULL;
 	struct ifnet *ifp;
@@ -509,22 +1072,36 @@
 	u_int i, lim;
 	struct netmap_if *nifp;
 
+	(void)dev;	/* UNUSED */
+	(void)fflag;	/* UNUSED */
+#ifdef linux
+#define devfs_get_cdevpriv(pp)				\
+	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
+		(*pp ? 0 : ENOENT); })
+
+/* devfs_set_cdevpriv cannot fail on linux */
+#define devfs_set_cdevpriv(p, fn)				\
+	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
+
+
+#define devfs_clear_cdevpriv()	do {				\
+		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
+	} while (0)
+#endif /* linux */
+
 	CURVNET_SET(TD_TO_VNET(td));
 
 	error = devfs_get_cdevpriv((void **)&priv);
-	if (error != ENOENT && error != 0) {
+	if (error) {
 		CURVNET_RESTORE();
-		return (error);
+		/* XXX ENOENT should be impossible, since the priv
+		 * is now created in the open */
+		return (error == ENOENT ? ENXIO : error);
 	}
 
-	error = 0;	/* Could be ENOENT */
+	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
 	switch (cmd) {
 	case NIOCGINFO:		/* return capabilities etc */
-		/* memsize is always valid */
-		nmr->nr_memsize = nm_mem->nm_totalsize;
-		nmr->nr_offset = 0;
-		nmr->nr_rx_rings = nmr->nr_tx_rings = 0;
-		nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
 		if (nmr->nr_version != NETMAP_API) {
 			D("API mismatch got %d have %d",
 				nmr->nr_version, NETMAP_API);
@@ -532,6 +1109,16 @@
 			error = EINVAL;
 			break;
 		}
+		/* update configuration */
+		error = netmap_get_memory(priv);
+		ND("get_memory returned %d", error);
+		if (error)
+			break;
+		/* memsize is always valid */
+		nmr->nr_memsize = nm_mem.nm_totalsize;
+		nmr->nr_offset = 0;
+		nmr->nr_rx_rings = nmr->nr_tx_rings = 0;
+		nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
 		if (nmr->nr_name[0] == '\0')	/* just get memory info */
 			break;
 		error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
@@ -538,11 +1125,12 @@
 		if (error)
 			break;
 		na = NA(ifp); /* retrieve netmap_adapter */
+		netmap_update_config(na);
 		nmr->nr_rx_rings = na->num_rx_rings;
 		nmr->nr_tx_rings = na->num_tx_rings;
 		nmr->nr_rx_slots = na->num_rx_desc;
 		nmr->nr_tx_slots = na->num_tx_desc;
-		if_rele(ifp);	/* return the refcount */
+		nm_if_rele(ifp);	/* return the refcount */
 		break;
 
 	case NIOCREGIF:
@@ -551,26 +1139,26 @@
 			error = EINVAL;
 			break;
 		}
-		if (priv != NULL) {	/* thread already registered */
+		/* ensure allocators are ready */
+		error = netmap_get_memory(priv);
+		ND("get_memory returned %d", error);
+		if (error)
+			break;
+
+		/* protect access to priv from concurrent NIOCREGIF */
+		NMA_LOCK();
+		if (priv->np_ifp != NULL) {	/* thread already registered */
 			error = netmap_set_ringid(priv, nmr->nr_ringid);
+			NMA_UNLOCK();
 			break;
 		}
 		/* find the interface and a reference */
 		error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
-		if (error)
+		if (error) {
+			NMA_UNLOCK();
 			break;
+		}
 		na = NA(ifp); /* retrieve netmap adapter */
-		/*
-		 * Allocate the private per-thread structure.
-		 * XXX perhaps we can use a blocking malloc ?
-		 */
-		priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
-			      M_NOWAIT | M_ZERO);
-		if (priv == NULL) {
-			error = ENOMEM;
-			if_rele(ifp);   /* return the refcount */
-			break;
-		}
 
 		for (i = 10; i > 0; i--) {
 			na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
@@ -582,16 +1170,18 @@
 		if (i == 0) {
 			D("too many NIOCREGIF attempts, give up");
 			error = EINVAL;
-			free(priv, M_DEVBUF);
-			if_rele(ifp);	/* return the refcount */
+			nm_if_rele(ifp);	/* return the refcount */
+			NMA_UNLOCK();
 			break;
 		}
 
+		/* ring configuration may have changed, fetch from the card */
+		netmap_update_config(na);
 		priv->np_ifp = ifp;	/* store the reference */
 		error = netmap_set_ringid(priv, nmr->nr_ringid);
 		if (error)
 			goto error;
-		priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na);
+		nifp = netmap_if_new(nmr->nr_name, na);
 		if (nifp == NULL) { /* allocation failed */
 			error = ENOMEM;
 		} else if (ifp->if_capenable & IFCAP_NETMAP) {
@@ -601,33 +1191,36 @@
 			 * and make it use the shared buffers.
 			 */
 			for (i = 0 ; i < na->num_tx_rings + 1; i++)
-				mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", NULL, MTX_DEF);
-			for (i = 0 ; i < na->num_rx_rings + 1; i++)
-				mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", NULL, MTX_DEF);
+				mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", MTX_NETWORK_LOCK, MTX_DEF);
+			for (i = 0 ; i < na->num_rx_rings + 1; i++) {
+				mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF);
+			}
 			error = na->nm_register(ifp, 1); /* mode on */
-			if (error)
+			if (error) {
 				netmap_dtor_locked(priv);
+				netmap_if_free(nifp);
+			}
 		}
 
 		if (error) {	/* reg. failed, release priv and ref */
 error:
 			na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-			if_rele(ifp);	/* return the refcount */
-			bzero(priv, sizeof(*priv));
-			free(priv, M_DEVBUF);
+			nm_if_rele(ifp);	/* return the refcount */
+			priv->np_ifp = NULL;
+			priv->np_nifp = NULL;
+			NMA_UNLOCK();
 			break;
 		}
 
 		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-		error = devfs_set_cdevpriv(priv, netmap_dtor);
 
-		if (error != 0) {
-			/* could not assign the private storage for the
-			 * thread, call the destructor explicitly.
-			 */
-			netmap_dtor(priv);
-			break;
-		}
+		/* the following assignment is a commitment.
+		 * Readers (i.e., poll and *SYNC) check for
+		 * np_nifp != NULL without locking
+		 */
+		wmb(); /* make sure previous writes are visible to all CPUs */
+		priv->np_nifp = nifp;
+		NMA_UNLOCK();
 
 		/* return the offset of the netmap_if object */
 		nmr->nr_rx_rings = na->num_rx_rings;
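
The wmb()/np_nifp assignment above, paired with the np_nifp check and rmb() in the ioctl and poll paths, is a publish/subscribe pattern: all initialization writes become visible before the pointer that readers test, and readers issue a read barrier before trusting the other fields. Schematically (a restatement of the code in this diff, not additional code):

	/* writer (NIOCREGIF), under NMA_LOCK */
	priv->np_ifp = ifp;
	/* ... remaining initialization ... */
	wmb();				/* writes above visible before the flag */
	priv->np_nifp = nifp;		/* the commitment: readers may proceed */

	/* lock-free reader (poll, NIOC?XSYNC) */
	if (priv->np_nifp == NULL)
		return ENXIO;		/* registration not complete yet */
	rmb();				/* avoid stale reads of the fields above */
	ifp = priv->np_ifp;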
@@ -634,34 +1227,41 @@
 		nmr->nr_tx_rings = na->num_tx_rings;
 		nmr->nr_rx_slots = na->num_rx_desc;
 		nmr->nr_tx_slots = na->num_tx_desc;
-		nmr->nr_memsize = nm_mem->nm_totalsize;
+		nmr->nr_memsize = nm_mem.nm_totalsize;
 		nmr->nr_offset = netmap_if_offset(nifp);
 		break;
 
 	case NIOCUNREGIF:
-		if (priv == NULL) {
+		// XXX we have no data here ?
+		D("deprecated, data is %p", nmr);
+		error = EINVAL;
+		break;
+
+	case NIOCTXSYNC:
+	case NIOCRXSYNC:
+		nifp = priv->np_nifp;
+
+		if (nifp == NULL) {
 			error = ENXIO;
 			break;
 		}
+		rmb(); /* make sure following reads are not from cache */
 
-		/* the interface is unregistered inside the
-		   destructor of the private data. */
-		devfs_clear_cdevpriv();
-		break;
 
-	case NIOCTXSYNC:
-        case NIOCRXSYNC:
-		if (priv == NULL) {
+		ifp = priv->np_ifp;	/* we have a reference */
+
+		if (ifp == NULL) {
+			D("Internal error: nifp != NULL && ifp == NULL");
 			error = ENXIO;
 			break;
 		}
-		ifp = priv->np_ifp;	/* we have a reference */
+
 		na = NA(ifp); /* retrieve netmap adapter */
 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
 			if (cmd == NIOCTXSYNC)
 				netmap_sync_to_host(na);
 			else
-				netmap_sync_from_host(na, NULL);
+				netmap_sync_from_host(na, NULL, NULL);
 			break;
 		}
 		/* find the last ring to scan */
@@ -690,6 +1290,7 @@
 
 		break;
 
+#ifdef __FreeBSD__
 	case BIOCIMMEDIATE:
 	case BIOCGHDRCMPLT:
 	case BIOCSHDRCMPLT:
@@ -707,9 +1308,14 @@
 		so.so_vnet = ifp->if_vnet;
 		// so->so_proto not null.
 		error = ifioctl(&so, cmd, data, td);
-		if_rele(ifp);
+		nm_if_rele(ifp);
 		break;
 	    }
+
+#else /* linux */
+	default:
+		error = EOPNOTSUPP;
+#endif /* linux */
 	}
 
 	CURVNET_RESTORE();
@@ -726,9 +1332,13 @@
  * selfd or on the global one.
  * Device-dependent parts (locking and sync of tx/rx rings)
  * are done through callbacks.
+ *
+ * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
+ * The first one is remapped to pwait as selrecord() uses the name as an
+ * hidden argument.
  */
 static int
-netmap_poll(__unused struct cdev *dev, int events, struct thread *td)
+netmap_poll(struct cdev *dev, int events, struct thread *td)
 {
 	struct netmap_priv_d *priv = NULL;
 	struct netmap_adapter *na;
@@ -735,12 +1345,22 @@
 	struct ifnet *ifp;
 	struct netmap_kring *kring;
 	u_int core_lock, i, check_all, want_tx, want_rx, revents = 0;
-	u_int lim_tx, lim_rx;
+	u_int lim_tx, lim_rx, host_forwarded = 0;
+	struct mbq q = { NULL, NULL, 0 };
 	enum {NO_CL, NEED_CL, LOCKED_CL }; /* see below */
+	void *pwait = dev;	/* linux compatibility */
 
+	(void)pwait;
+
 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
 		return POLLERR;
 
+	if (priv->np_nifp == NULL) {
+		D("No if registered");
+		return POLLERR;
+	}
+	rmb(); /* make sure following reads are not from cache */
+
 	ifp = priv->np_ifp;
 	// XXX check for deleting() ?
 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
@@ -766,7 +1386,7 @@
 		if (want_rx) {
 			kring = &na->rx_rings[lim_rx];
 			if (kring->ring->avail == 0)
-				netmap_sync_from_host(na, td);
+				netmap_sync_from_host(na, td, dev);
 			if (kring->ring->avail > 0) {
 				revents |= want_rx;
 			}
@@ -774,6 +1394,17 @@
 		return (revents);
 	}
 
+	/* if we are in transparent mode, check also the host rx ring */
+	kring = &na->rx_rings[lim_rx];
+	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
+			&& want_rx
+			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
+		if (kring->ring->avail == 0)
+			netmap_sync_from_host(na, td, dev);
+		if (kring->ring->avail > 0)
+			revents |= want_rx;
+	}
+
 	/*
 	 * check_all is set if the card has more than one queue and
 	 * the client is polling all of them. If true, we sleep on
@@ -812,6 +1443,13 @@
 	 * LOCKED_CL	core lock is set, so we need to release it.
 	 */
 	core_lock = (check_all || !na->separate_locks) ? NEED_CL : NO_CL;
+#ifdef NM_BRIDGE
+	/* the bridge uses separate locks */
+	if (na->nm_register == bdg_netmap_reg) {
+		ND("not using core lock for %s", ifp->if_xname);
+		core_lock = NO_CL;
+	}
+#endif /* NM_BRIDGE */
 	if (priv->np_qlast != NETMAP_HW_RING) {
 		lim_tx = lim_rx = priv->np_qlast;
 	}
@@ -842,6 +1480,7 @@
 	 * to avoid that the tx rings stall).
 	 */
 	if (priv->np_txpoll || want_tx) {
+flush_tx:
 		for (i = priv->np_qfirst; i < lim_tx; i++) {
 			kring = &na->tx_rings[i];
 			/*
@@ -894,6 +1533,11 @@
 			}
 			if (na->separate_locks)
 				na->nm_lock(ifp, NETMAP_RX_LOCK, i);
+			if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
+				ND(10, "forwarding some buffers up %d to %d",
+				    kring->nr_hwcur, kring->ring->cur);
+				netmap_grab_packets(kring, &q, netmap_fwd);
+			}
 
 			if (na->nm_rxsync(ifp, i, 0 /* no lock */))
 				revents |= POLLERR;
@@ -916,8 +1560,28 @@
 		if (want_rx)
 			selrecord(td, &na->rx_si);
 	}
+
+	/* forward host to the netmap ring */
+	kring = &na->rx_rings[lim_rx];
+	if (kring->nr_hwavail > 0)
+		ND("host rx %d has %d packets", lim_rx, kring->nr_hwavail);
+	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
+			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
+			 && kring->nr_hwavail > 0 && !host_forwarded) {
+		if (core_lock == NEED_CL) {
+			na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
+			core_lock = LOCKED_CL;
+		}
+		netmap_sw_to_nic(na);
+		host_forwarded = 1; /* prevent another pass */
+		want_rx = 0;
+		goto flush_tx;
+	}
+
 	if (core_lock == LOCKED_CL)
 		na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
+	if (q.head)
+		netmap_send_up(na->ifp, q.head);
 
 	return (revents);
 }
@@ -981,55 +1645,43 @@
  * setups.
  */
 int
-netmap_attach(struct netmap_adapter *na, int num_queues)
+netmap_attach(struct netmap_adapter *arg, int num_queues)
 {
-	int n, size;
-	void *buf;
-	struct ifnet *ifp = na->ifp;
+	struct netmap_adapter *na = NULL;
+	struct ifnet *ifp = arg ? arg->ifp : NULL;
 
-	if (ifp == NULL) {
-		D("ifp not set, giving up");
-		return EINVAL;
-	}
-	/* clear other fields ? */
-	na->refcount = 0;
+	if (arg == NULL || ifp == NULL)
+		goto fail;
+	na = malloc(sizeof(*na), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (na == NULL)
+		goto fail;
+	WNA(ifp) = na;
+	*na = *arg; /* copy everything, trust the driver to not pass junk */
+	NETMAP_SET_CAPABLE(ifp);
 	if (na->num_tx_rings == 0)
 		na->num_tx_rings = num_queues;
 	na->num_rx_rings = num_queues;
-	/* on each direction we have N+1 resources
-	 * 0..n-1	are the hardware rings
-	 * n		is the ring attached to the stack.
-	 */
-	n = na->num_rx_rings + na->num_tx_rings + 2;
-	size = sizeof(*na) + n * sizeof(struct netmap_kring);
-
-	buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
-	if (buf) {
-		WNA(ifp) = buf;
-		na->tx_rings = (void *)((char *)buf + sizeof(*na));
-		na->rx_rings = na->tx_rings + na->num_tx_rings + 1;
-		bcopy(na, buf, sizeof(*na));
-		ifp->if_capabilities |= IFCAP_NETMAP;
-
-		na = buf;
-		/* Core lock initialized here.  Others are initialized after
-		 * netmap_if_new.
-		 */
-		mtx_init(&na->core_lock, "netmap core lock", NULL, MTX_DEF);
-		if (na->nm_lock == NULL) {
-			ND("using default locks for %s", ifp->if_xname);
-			na->nm_lock = netmap_lock_wrapper;
-		}
+	na->refcount = na->na_single = na->na_multi = 0;
+	/* Core lock initialized here, others after netmap_if_new. */
+	mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF);
+	if (na->nm_lock == NULL) {
+		ND("using default locks for %s", ifp->if_xname);
+		na->nm_lock = netmap_lock_wrapper;
 	}
 #ifdef linux
-	D("netdev_ops %p", ifp->netdev_ops);
-	/* prepare a clone of the netdev ops */
-	na->nm_ndo = *ifp->netdev_ops;
-	na->nm_ndo.ndo_start_xmit = netmap_start_linux;
+	if (ifp->netdev_ops) {
+		ND("netdev_ops %p", ifp->netdev_ops);
+		/* prepare a clone of the netdev ops */
+		na->nm_ndo = *ifp->netdev_ops;
+	}
+	na->nm_ndo.ndo_start_xmit = linux_netmap_start;
 #endif
-	D("%s for %s", buf ? "ok" : "failed", ifp->if_xname);
+	D("success for %s", ifp->if_xname);
+	return 0;
 
-	return (buf ? 0 : ENOMEM);
+fail:
+	D("fail, arg %p ifp %p na %p", arg, ifp, na);
+	return (na ? EINVAL : ENOMEM);
 }
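
netmap_attach() now copies the caller's template into a freshly allocated netmap_adapter, so a driver only fills in a stack-local struct and hands it over. A hedged sketch of the driver side; the foo_* names are placeholders and the callback field names follow the convention assumed from the surrounding code (nm_register, nm_txsync, nm_rxsync), not verbatim from this diff:

/* hypothetical NIC driver attach hook */
static void
foo_netmap_attach(struct foo_adapter *adapter)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = adapter->ifp;
	na.separate_locks = 0;
	na.num_tx_desc = adapter->num_tx_desc;
	na.num_rx_desc = adapter->num_rx_desc;
	na.nm_register = foo_netmap_reg;	/* switch NIC in/out of netmap mode */
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	/* netmap_attach() copies 'na', allocates the real adapter and
	 * marks the ifp as netmap capable */
	netmap_attach(&na, adapter->num_queues);
}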
 
 
@@ -1047,6 +1699,10 @@
 
 	mtx_destroy(&na->core_lock);
 
+	if (na->tx_rings) { /* XXX should not happen */
+		D("freeing leftover tx_rings");
+		free(na->tx_rings, M_DEVBUF);
+	}
 	bzero(na, sizeof(*na));
 	WNA(ifp) = NULL;
 	free(na, M_DEVBUF);
@@ -1064,7 +1720,7 @@
 	struct netmap_adapter *na = NA(ifp);
 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
 	u_int i, len = MBUF_LEN(m);
-	int error = EBUSY, lim = kring->nkr_num_slots - 1;
+	u_int error = EBUSY, lim = kring->nkr_num_slots - 1;
 	struct netmap_slot *slot;
 
 	if (netmap_verbose & NM_VERB_HOST)
@@ -1077,7 +1733,8 @@
 		goto done;	/* no space */
 	}
 	if (len > NETMAP_BUF_SIZE) {
-		D("drop packet size %d > %d", len, NETMAP_BUF_SIZE);
+		D("%s from_host, drop packet size %d > %d", ifp->if_xname,
+			len, NETMAP_BUF_SIZE);
 		goto done;	/* too long for us */
 	}
 
@@ -1088,6 +1745,7 @@
 	slot = &kring->ring->slot[i];
 	m_copydata(m, 0, len, NMB(slot));
 	slot->len = len;
+	slot->flags = kring->nkr_slot_flags;
 	kring->nr_hwavail++;
 	if (netmap_verbose  & NM_VERB_HOST)
 		D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings);
@@ -1124,9 +1782,13 @@
 		return NULL;	/* nothing to reinitialize */
 
 	if (tx == NR_TX) {
+		if (n >= na->num_tx_rings)
+			return NULL;
 		kring = na->tx_rings + n;
 		new_hwofs = kring->nr_hwcur - new_cur;
 	} else {
+		if (n >= na->num_rx_rings)
+			return NULL;
 		kring = na->rx_rings + n;
 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
 	}
@@ -1138,10 +1800,20 @@
 	kring->nkr_hwofs = new_hwofs;
 	if (tx == NR_TX)
 		kring->nr_hwavail = kring->nkr_num_slots - 1;
-	D("new hwofs %d on %s %s[%d]",
+	ND(10, "new hwofs %d on %s %s[%d]",
 			kring->nkr_hwofs, na->ifp->if_xname,
 			tx == NR_TX ? "TX" : "RX", n);
 
+#if 0 // def linux
+	/* XXX check that the mappings are correct */
+	/* need ring_nr, adapter->pdev, direction */
+	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
+	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
+		D("error mapping rx netmap buffer %d", i);
+		// XXX fix error handling
+	}
+
+#endif /* linux */
 	/*
 	 * Wakeup on the individual and global lock
 	 * We do the wakeup here, but the ring is not yet reconfigured.
@@ -1175,12 +1847,22 @@
 
 	if (!(ifp->if_capenable & IFCAP_NETMAP))
 		return 0;
+	ND(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
 	na = NA(ifp);
+	if (na->na_flags & NAF_SKIP_INTR) {
+		ND("use regular interrupt");
+		return 0;
+	}
+
 	if (work_done) { /* RX path */
+		if (q >= na->num_rx_rings)
+			return 0;	// regular queue
 		r = na->rx_rings + q;
 		r->nr_kflags |= NKR_PENDINTR;
 		main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL;
 	} else { /* tx path */
+		if (q >= na->num_tx_rings)
+			return 0;	// regular queue
 		r = na->tx_rings + q;
 		main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL;
 		work_done = &q; /* dummy */
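
This hunk hardens the generic interrupt handler (exported further down as netmap_rx_irq): with the NAF_SKIP_INTR and out-of-range queue checks it can be called unconditionally from a driver's interrupt path and will report whether netmap consumed the event. Typical driver usage looks roughly like the sketch below; the foo_*/rxq field names are placeholders, not code from this commit:

/* inside a hypothetical per-queue RX interrupt handler */
static void
foo_rxeof(struct foo_rx_queue *rxq)
{
	int dummy;	/* a non-NULL work_done pointer selects the RX path */

	if (netmap_rx_irq(rxq->adapter->ifp, rxq->me, &dummy))
		return;		/* netmap mode: the poller was woken up */

	/* ... normal mbuf-based receive processing ... */
}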
@@ -1206,15 +1888,547 @@
 }
 
 
+#ifdef linux	/* linux-specific routines */
+
+/*
+ * Remap linux arguments into the FreeBSD call.
+ * - pwait is the poll table, passed as 'dev';
+ *   If pwait == NULL someone else already woke up before. We can report
+ *   events but they are filtered upstream.
+ *   If pwait != NULL, then pwait->key contains the list of events.
+ * - events is computed from pwait as above.
+ * - file is passed as 'td';
+ */
+static u_int
+linux_netmap_poll(struct file * file, struct poll_table_struct *pwait)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
+	int events = pwait ? pwait->key : POLLIN | POLLOUT;
+#else /* in 3.4.0 field 'key' was renamed to '_key' */
+	int events = pwait ? pwait->_key : POLLIN | POLLOUT;
+#endif
+	return netmap_poll((void *)pwait, events, (void *)file);
+}
+
+static int
+linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
+{
+	int lut_skip, i, j;
+	int user_skip = 0;
+	struct lut_entry *l_entry;
+	int error = 0;
+	unsigned long off, tomap;
+	/*
+	 * vma->vm_start: start of mapping user address space
+	 * vma->vm_end: end of the mapping user address space
+	 * vma->vm_pfoff: offset of first page in the device
+	 */
+
+	// XXX security checks
+
+	error = netmap_get_memory(f->private_data);
+	ND("get_memory returned %d", error);
+	if (error)
+	    return -error;
+
+	off = vma->vm_pgoff << PAGE_SHIFT; /* offset in bytes */
+	tomap = vma->vm_end - vma->vm_start;
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {  /* loop through obj_pools */
+		const struct netmap_obj_pool *p = &nm_mem.pools[i];
+		/*
+		 * In each pool memory is allocated in clusters
+		 * of size _clustsize, each containing clustentries
+		 * entries. For each object k we already store the
+		 * vtophys mapping in lut[k] so we use that, scanning
+		 * the lut[] array in steps of clustentries,
+		 * and we map each cluster (not individual pages,
+		 * it would be overkill).
+		 */
+
+		/*
+		 * We interpret vm_pgoff as an offset into the whole
+		 * netmap memory, as if all clusters where contiguous.
+		 */
+		for (lut_skip = 0, j = 0; j < p->_numclusters; j++, lut_skip += p->clustentries) {
+			unsigned long paddr, mapsize;
+			if (p->_clustsize <= off) {
+				off -= p->_clustsize;
+				continue;
+			}
+			l_entry = &p->lut[lut_skip]; /* first obj in the cluster */
+			paddr = l_entry->paddr + off;
+			mapsize = p->_clustsize - off;
+			off = 0;
+			if (mapsize > tomap)
+				mapsize = tomap;
+			ND("remap_pfn_range(%lx, %lx, %lx)",
+				vma->vm_start + user_skip,
+				paddr >> PAGE_SHIFT, mapsize);
+			if (remap_pfn_range(vma, vma->vm_start + user_skip,
+					paddr >> PAGE_SHIFT, mapsize,
+					vma->vm_page_prot))
+				return -EAGAIN; // XXX check return value
+			user_skip += mapsize;
+			tomap -= mapsize;
+			if (tomap == 0)
+				goto done;
+		}
+	}
+done:
+
+	return 0;
+}
+
+static netdev_tx_t
+linux_netmap_start(struct sk_buff *skb, struct net_device *dev)
+{
+	netmap_start(dev, skb);
+	return (NETDEV_TX_OK);
+}
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)	// XXX was 38
+#define LIN_IOCTL_NAME	.ioctl
+int
+linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */)
+#else
+#define LIN_IOCTL_NAME	.unlocked_ioctl
+long
+linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */)
+#endif
+{
+	int ret;
+	struct nmreq nmr;
+	bzero(&nmr, sizeof(nmr));
+
+	if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0)
+		return -EFAULT;
+	ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file);
+	if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0)
+		return -EFAULT;
+	return -ret;
+}
+
+
+static int
+netmap_release(struct inode *inode, struct file *file)
+{
+	(void)inode;	/* UNUSED */
+	if (file->private_data)
+		netmap_dtor(file->private_data);
+	return (0);
+}
+
+static int
+linux_netmap_open(struct inode *inode, struct file *file)
+{
+	struct netmap_priv_d *priv;
+	(void)inode;	/* UNUSED */
+
+	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+			      M_NOWAIT | M_ZERO);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	file->private_data = priv;
+
+	return (0);
+}
+
+static struct file_operations netmap_fops = {
+    .open = linux_netmap_open,
+    .mmap = linux_netmap_mmap,
+    LIN_IOCTL_NAME = linux_netmap_ioctl,
+    .poll = linux_netmap_poll,
+    .release = netmap_release,
+};
+
+static struct miscdevice netmap_cdevsw = {	/* same name as FreeBSD */
+	MISC_DYNAMIC_MINOR,
+	"netmap",
+	&netmap_fops,
+};
+
+static int netmap_init(void);
+static void netmap_fini(void);
+
+/* Errors have negative values on linux */
+static int linux_netmap_init(void)
+{
+	return -netmap_init();
+}
+
+module_init(linux_netmap_init);
+module_exit(netmap_fini);
+/* export certain symbols to other modules */
+EXPORT_SYMBOL(netmap_attach);		// driver attach routines
+EXPORT_SYMBOL(netmap_detach);		// driver detach routines
+EXPORT_SYMBOL(netmap_ring_reinit);	// ring init on error
+EXPORT_SYMBOL(netmap_buffer_lut);
+EXPORT_SYMBOL(netmap_total_buffers);	// index check
+EXPORT_SYMBOL(netmap_buffer_base);
+EXPORT_SYMBOL(netmap_reset);		// ring init routines
+EXPORT_SYMBOL(netmap_buf_size);
+EXPORT_SYMBOL(netmap_rx_irq);		// default irq handler
+EXPORT_SYMBOL(netmap_no_pendintr);	// XXX mitigation - should go away
+
+
+MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/");
+MODULE_DESCRIPTION("The netmap packet I/O framework");
+MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
+
+#else /* __FreeBSD__ */
+
 static struct cdevsw netmap_cdevsw = {
 	.d_version = D_VERSION,
 	.d_name = "netmap",
+	.d_open = netmap_open,
 	.d_mmap = netmap_mmap,
+	.d_mmap_single = netmap_mmap_single,
 	.d_ioctl = netmap_ioctl,
 	.d_poll = netmap_poll,
+	.d_close = netmap_close,
 };
+#endif /* __FreeBSD__ */
 
+#ifdef NM_BRIDGE
+/*
+ *---- support for virtual bridge -----
+ */
 
+/* ----- FreeBSD if_bridge hash function ------- */
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ *
+ * http://www.burtleburtle.net/bob/hash/spooky.html
+ */
+#define mix(a, b, c)                                                    \
+do {                                                                    \
+        a -= b; a -= c; a ^= (c >> 13);                                 \
+        b -= c; b -= a; b ^= (a << 8);                                  \
+        c -= a; c -= b; c ^= (b >> 13);                                 \
+        a -= b; a -= c; a ^= (c >> 12);                                 \
+        b -= c; b -= a; b ^= (a << 16);                                 \
+        c -= a; c -= b; c ^= (b >> 5);                                  \
+        a -= b; a -= c; a ^= (c >> 3);                                  \
+        b -= c; b -= a; b ^= (a << 10);                                 \
+        c -= a; c -= b; c ^= (b >> 15);                                 \
+} while (/*CONSTCOND*/0)
+
+static __inline uint32_t
+nm_bridge_rthash(const uint8_t *addr)
+{
+        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key
+
+        b += addr[5] << 8;
+        b += addr[4];
+        a += addr[3] << 24;
+        a += addr[2] << 16;
+        a += addr[1] << 8;
+        a += addr[0];
+
+        mix(a, b, c);
+#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
+        return (c & BRIDGE_RTHASH_MASK);
+}
+
+#undef mix
+
+
+static int
+bdg_netmap_reg(struct ifnet *ifp, int onoff)
+{
+	int i, err = 0;
+	struct nm_bridge *b = ifp->if_bridge;
+
+	BDG_LOCK(b);
+	if (onoff) {
+		/* the interface must be already in the list.
+		 * only need to mark the port as active
+		 */
+		ND("should attach %s to the bridge", ifp->if_xname);
+		for (i=0; i < NM_BDG_MAXPORTS; i++)
+			if (b->bdg_ports[i] == ifp)
+				break;
+		if (i == NM_BDG_MAXPORTS) {
+			D("no more ports available");
+			err = EINVAL;
+			goto done;
+		}
+		ND("setting %s in netmap mode", ifp->if_xname);
+		ifp->if_capenable |= IFCAP_NETMAP;
+		NA(ifp)->bdg_port = i;
+		b->act_ports |= (1<<i);
+		b->bdg_ports[i] = ifp;
+	} else {
+		/* should be in the list, too -- remove from the mask */
+		ND("removing %s from netmap mode", ifp->if_xname);
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+		i = NA(ifp)->bdg_port;
+		b->act_ports &= ~(1<<i);
+	}
+done:
+	BDG_UNLOCK(b);
+	return err;
+}
+
+
+static int
+nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct ifnet *ifp)
+{
+	int i, ifn;
+	uint64_t all_dst, dst;
+	uint32_t sh, dh;
+	uint64_t mysrc = 1 << NA(ifp)->bdg_port;
+	uint64_t smac, dmac;
+	struct netmap_slot *slot;
+	struct nm_bridge *b = ifp->if_bridge;
+
+	ND("prepare to send %d packets, act_ports 0x%x", n, b->act_ports);
+	/* only consider valid destinations */
+	all_dst = (b->act_ports & ~mysrc);
+	/* first pass: hash and find destinations */
+	for (i = 0; likely(i < n); i++) {
+		uint8_t *buf = ft[i].buf;
+		dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+		smac = le64toh(*(uint64_t *)(buf + 4));
+		smac >>= 16;
+		if (unlikely(netmap_verbose)) {
+		    uint8_t *s = buf+6, *d = buf;
+		    D("%d len %4d %02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x",
+			i,
+			ft[i].len,
+			s[0], s[1], s[2], s[3], s[4], s[5],
+			d[0], d[1], d[2], d[3], d[4], d[5]);
+		}
+		/*
+		 * The hash is somewhat expensive, there might be some
+		 * worthwhile optimizations here.
+		 */
+		if ((buf[6] & 1) == 0) { /* valid src */
+		    	uint8_t *s = buf+6;
+			sh = nm_bridge_rthash(buf+6); // XXX hash of source
+			/* update source port forwarding entry */
+			b->ht[sh].mac = smac;	/* XXX expire ? */
+			b->ht[sh].ports = mysrc;
+			if (netmap_verbose)
+			    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
+				s[0], s[1], s[2], s[3], s[4], s[5], NA(ifp)->bdg_port);
+		}
+		dst = 0;
+		if ( (buf[0] & 1) == 0) { /* unicast */
+		    	uint8_t *d = buf;
+			dh = nm_bridge_rthash(buf); // XXX hash of dst
+			if (b->ht[dh].mac == dmac) {	/* found dst */
+				dst = b->ht[dh].ports;
+				if (netmap_verbose)
+				    D("dst %02x:%02x:%02x:%02x:%02x:%02x to port %x",
+					d[0], d[1], d[2], d[3], d[4], d[5], (uint32_t)(dst >> 16));
+			}
+		}
+		if (dst == 0)
+			dst = all_dst;
+		dst &= all_dst; /* only consider valid ports */
+		if (unlikely(netmap_verbose))
+			D("pkt goes to ports 0x%x", (uint32_t)dst);
+		ft[i].dst = dst;
+	}
+
+	/* second pass, scan interfaces and forward */
+	all_dst = (b->act_ports & ~mysrc);
+	for (ifn = 0; all_dst; ifn++) {
+		struct ifnet *dst_ifp = b->bdg_ports[ifn];
+		struct netmap_adapter *na;
+		struct netmap_kring *kring;
+		struct netmap_ring *ring;
+		int j, lim, sent, locked;
+
+		if (!dst_ifp)
+			continue;
+		ND("scan port %d %s", ifn, dst_ifp->if_xname);
+		dst = 1 << ifn;
+		if ((dst & all_dst) == 0)	/* skip if not set */
+			continue;
+		all_dst &= ~dst;	/* clear current node */
+		na = NA(dst_ifp);
+
+		ring = NULL;
+		kring = NULL;
+		lim = sent = locked = 0;
+		/* inside, scan slots */
+		for (i = 0; likely(i < n); i++) {
+			if ((ft[i].dst & dst) == 0)
+				continue;	/* not here */
+			if (!locked) {
+				kring = &na->rx_rings[0];
+				ring = kring->ring;
+				lim = kring->nkr_num_slots - 1;
+				na->nm_lock(dst_ifp, NETMAP_RX_LOCK, 0);
+				locked = 1;
+			}
+			if (unlikely(kring->nr_hwavail >= lim)) {
+				if (netmap_verbose)
+					D("rx ring full on %s", ifp->if_xname);
+				break;
+			}
+			j = kring->nr_hwcur + kring->nr_hwavail;
+			if (j > lim)
+				j -= kring->nkr_num_slots;
+			slot = &ring->slot[j];
+			ND("send %d %d bytes at %s:%d", i, ft[i].len, dst_ifp->if_xname, j);
+			pkt_copy(ft[i].buf, NMB(slot), ft[i].len);
+			slot->len = ft[i].len;
+			kring->nr_hwavail++;
+			sent++;
+		}
+		if (locked) {
+			ND("sent %d on %s", sent, dst_ifp->if_xname);
+			if (sent)
+				selwakeuppri(&kring->si, PI_NET);
+			na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, 0);
+		}
+	}
+	return 0;
+}
+
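A worked example of the port bitmask logic in nm_bdg_flush() above: with ports 0-3 attached and active and a packet arriving on port 1, mysrc = 0x2 and all_dst = 0xd. A hit on the destination MAC in b->ht narrows dst to that port's bit; a miss, or a multicast destination (buf[0] & 1 set), leaves dst at 0 so the packet is flooded to every port in all_dst. The second pass then walks the destination ports, copies the matching frames into each port's rx ring 0 and wakes up any thread sleeping on that ring.
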
+/*
+ * main dispatch routine
+ */
+static int
+bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+	struct netmap_adapter *na = NA(ifp);
+	struct netmap_kring *kring = &na->tx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	int i, j, k, lim = kring->nkr_num_slots - 1;
+	struct nm_bdg_fwd *ft = (struct nm_bdg_fwd *)(ifp + 1);
+	int ft_i;	/* position in the forwarding table */
+
+	k = ring->cur;
+	if (k > lim)
+		return netmap_ring_reinit(kring);
+	if (do_lock)
+		na->nm_lock(ifp, NETMAP_TX_LOCK, ring_nr);
+
+	if (netmap_bridge <= 0) { /* testing only */
+		j = k; // used all
+		goto done;
+	}
+	if (netmap_bridge > NM_BDG_BATCH)
+		netmap_bridge = NM_BDG_BATCH;
+
+	ft_i = 0;	/* start from 0 */
+	for (j = kring->nr_hwcur; likely(j != k); j = unlikely(j == lim) ? 0 : j+1) {
+		struct netmap_slot *slot = &ring->slot[j];
+		int len = ft[ft_i].len = slot->len;
+		char *buf = ft[ft_i].buf = NMB(slot);
+
+		prefetch(buf);
+		if (unlikely(len < 14))
+			continue;
+		if (unlikely(++ft_i == netmap_bridge))
+			ft_i = nm_bdg_flush(ft, ft_i, ifp);
+	}
+	if (ft_i)
+		ft_i = nm_bdg_flush(ft, ft_i, ifp);
+	/* count how many packets we sent */
+	i = k - j;
+	if (i < 0)
+		i += kring->nkr_num_slots;
+	kring->nr_hwavail = kring->nkr_num_slots - 1 - i;
+	if (j != k)
+		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
+
+done:
+	kring->nr_hwcur = j;
+	ring->avail = kring->nr_hwavail;
+	if (do_lock)
+		na->nm_lock(ifp, NETMAP_TX_UNLOCK, ring_nr);
+
+	if (netmap_verbose)
+		D("%s ring %d lock %d", ifp->if_xname, ring_nr, do_lock);
+	return 0;
+}
+
+static int
+bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+	struct netmap_adapter *na = NA(ifp);
+	struct netmap_kring *kring = &na->rx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int j, n, lim = kring->nkr_num_slots - 1;
+	u_int k = ring->cur, resvd = ring->reserved;
+
+	ND("%s ring %d lock %d avail %d",
+		ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail);
+
+	if (k > lim)
+		return netmap_ring_reinit(kring);
+	if (do_lock)
+		na->nm_lock(ifp, NETMAP_RX_LOCK, ring_nr);
+
+	/* skip past packets that userspace has released */
+	j = kring->nr_hwcur;    /* netmap ring index */
+	if (resvd > 0) {
+		if (resvd + ring->avail >= lim + 1) {
+			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
+			ring->reserved = resvd = 0; // XXX panic...
+		}
+		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
+	}
+
+	if (j != k) { /* userspace has released some packets. */
+		n = k - j;
+		if (n < 0)
+			n += kring->nkr_num_slots;
+		ND("userspace releases %d packets", n);
+                for (n = 0; likely(j != k); n++) {
+                        struct netmap_slot *slot = &ring->slot[j];
+                        void *addr = NMB(slot);
+
+                        if (addr == netmap_buffer_base) { /* bad buf */
+                                if (do_lock)
+                                        na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr);
+                                return netmap_ring_reinit(kring);
+                        }
+			/* decrease refcount for buffer */
+
+			slot->flags &= ~NS_BUF_CHANGED;
+                        j = unlikely(j == lim) ? 0 : j + 1;
+                }
+                kring->nr_hwavail -= n;
+                kring->nr_hwcur = k;
+        }
+        /* tell userspace that there are new packets */
+        ring->avail = kring->nr_hwavail - resvd;
+
+	if (do_lock)
+		na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr);
+	return 0;
+}
+
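To make the reserved-slot adjustment in bdg_netmap_rxsync() above concrete with made-up numbers: with lim = 255 (a 256-slot ring), cur = 2 and resvd = 4, the second branch gives k = 2 + 255 + 1 - 4 = 254, i.e. the kernel stops four slots behind the application's cur, wrapping backwards past slot 0. The slots between hwcur and that point are the ones userspace has genuinely released and can be recycled.
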
+static void
+bdg_netmap_attach(struct ifnet *ifp)
+{
+	struct netmap_adapter na;
+
+	ND("attaching virtual bridge");
+	bzero(&na, sizeof(na));
+
+	na.ifp = ifp;
+	na.separate_locks = 1;
+	na.num_tx_desc = NM_BRIDGE_RINGSIZE;
+	na.num_rx_desc = NM_BRIDGE_RINGSIZE;
+	na.nm_txsync = bdg_netmap_txsync;
+	na.nm_rxsync = bdg_netmap_rxsync;
+	na.nm_register = bdg_netmap_reg;
+	netmap_attach(&na, 1);
+}
+
+#endif /* NM_BRIDGE */
+
 static struct cdev *netmap_dev; /* /dev/netmap character device. */
 
 
@@ -1233,13 +2447,20 @@
 
 	error = netmap_memory_init();
 	if (error != 0) {
-		printf("netmap: unable to initialize the memory allocator.");
+		printf("netmap: unable to initialize the memory allocator.\n");
 		return (error);
 	}
-	printf("netmap: loaded module with %d Mbytes\n",
-		(int)(nm_mem->nm_totalsize >> 20));
+	printf("netmap: loaded module\n");
 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
 			      "netmap");
+
+#ifdef NM_BRIDGE
+	{
+	int i;
+	for (i = 0; i < NM_BRIDGES; i++)
+		mtx_init(&nm_bridges[i].bdg_lock, "bdg lock", "bdg_lock", MTX_DEF);
+	}
+#endif
 	return (error);
 }
 
@@ -1258,6 +2479,7 @@
 }
 
 
+#ifdef __FreeBSD__
 /*
  * Kernel entry point.
  *
@@ -1289,3 +2511,4 @@
 
 
 DEV_MODULE(netmap, netmap_loader, NULL);
+#endif /* __FreeBSD__ */

Modified: trunk/sys/dev/netmap/netmap_kern.h
===================================================================
--- trunk/sys/dev/netmap/netmap_kern.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/netmap_kern.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -9,7 +9,7 @@
  *   2. Redistributions in binary form must reproduce the above copyright
  *      notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -37,6 +37,9 @@
 #define NETMAP_MEM2    // use the new memory allocator
 
 #if defined(__FreeBSD__)
+#define likely(x)	__builtin_expect(!!(x), 1)
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+
 #define	NM_LOCK_T	struct mtx
 #define	NM_SELINFO_T	struct selinfo
 #define	MBUF_LEN(m)	((m)->m_pkthdr.len)
@@ -46,12 +49,34 @@
 #define	NM_SELINFO_T	wait_queue_head_t
 #define	MBUF_LEN(m)	((m)->len)
 #define	NM_SEND_UP(ifp, m)	netif_rx(m)
+
+#ifndef DEV_NETMAP
+#define DEV_NETMAP
+#endif
+
+/*
+ * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable).
+ * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older
+ * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT.
+ * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+#define IFCAP_NETMAP	0x8000
 #else
-#error unsupported platform
+#define IFCAP_NETMAP	0x100000
 #endif
 
-#ifdef MALLOC_DECLARE
-MALLOC_DECLARE(M_NETMAP);
+#elif defined (__APPLE__)
+#warning apple support is incomplete.
+#define likely(x)	__builtin_expect(!!(x), 1)
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define	NM_LOCK_T	IOLock *
+#define	NM_SELINFO_T	struct selinfo
+#define	MBUF_LEN(m)	((m)->m_pkthdr.len)
+#define	NM_SEND_UP(ifp, m)	((ifp)->if_input)(ifp, m)
+
+#else
+#error unsupported platform
 #endif
 
 #define ND(format, ...)
@@ -63,7 +88,19 @@
 		(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec,	\
 		__FUNCTION__, __LINE__, ##__VA_ARGS__);		\
 	} while (0)
- 
+
+/* rate limited, lps indicates how many per second */
+#define RD(lps, format, ...)					\
+	do {							\
+		static int t0, __cnt;				\
+		if (t0 != time_second) {			\
+			t0 = time_second;			\
+			__cnt = 0;				\
+		}						\
+		if (__cnt++ < lps)				\
+			D(format, ##__VA_ARGS__);		\
+	} while (0)
+
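A usage sketch for the rate-limited RD() macro above; the condition, ring index and interface variable are illustrative only:

	/* hypothetical per-packet path: log at most 5 times per second */
	if (ring_full)
		RD(5, "rx ring %u full on %s, dropping", ring_nr, ifp->if_xname);
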
 struct netmap_adapter;
 
 /*
@@ -82,6 +119,10 @@
  * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with
  * 	the next empty buffer as known by the hardware (next_to_check or so).
  * TX rings: hwcur + hwofs coincides with next_to_send
+ *
+ * For received packets, slot->flags is set to nkr_slot_flags
+ * so we can provide a proper initial value (e.g. set NS_FORWARD
+ * when operating in 'transparent' mode).
  */
 struct netmap_kring {
 	struct netmap_ring *ring;
@@ -91,6 +132,7 @@
 #define NKR_PENDINTR	0x1	// Pending interrupt.
 	u_int nkr_num_slots;
 
+	uint16_t	nkr_slot_flags;	/* initial value for flags */
 	int	nkr_hwofs;	/* offset between NIC and netmap ring */
 	struct netmap_adapter *na;
 	NM_SELINFO_T si;	/* poll/select wait queue */
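
A minimal sketch of how a driver's rxsync path is expected to consume nkr_slot_flags when filling freshly received slots; the helper name is hypothetical and only illustrates the intent of the field:

	static inline void
	nm_fill_rx_slot(struct netmap_kring *kring, struct netmap_ring *ring,
	    u_int j, uint16_t len)
	{
		ring->slot[j].len = len;
		/* ring-wide default, e.g. NS_FORWARD in 'transparent' mode */
		ring->slot[j].flags = kring->nkr_slot_flags;
	}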
@@ -103,6 +145,18 @@
  * support netmap operation.
  */
 struct netmap_adapter {
+	/*
+	 * On linux we do not have a good way to tell if an interface
+	 * is netmap-capable. So we use the following trick:
+	 * NA(ifp) points here, and the first entry (which hopefully
+	 * always exists and is at least 32 bits) contains a magic
+	 * value which we can use to detect that the interface is good.
+	 */
+	uint32_t magic;
+	uint32_t na_flags;	/* future place for IFCAP_NETMAP */
+#define NAF_SKIP_INTR	1	/* use the regular interrupt handler.
+				 * useful during initialization
+				 */
 	int refcount; /* number of user-space descriptors using this
 			 interface, which is equal to the number of
 			 struct netmap_if objs in the mapped region. */
@@ -123,7 +177,6 @@
 
 	u_int num_tx_desc; /* number of descriptor in each queue */
 	u_int num_rx_desc;
-	//u_int buff_size;	// XXX deprecate, use NETMAP_BUF_SIZE
 
 	/* tx_rings and rx_rings are private but allocated
 	 * as a contiguous chunk of memory. Each array has
@@ -150,13 +203,19 @@
 	void (*nm_lock)(struct ifnet *, int what, u_int ringid);
 	int (*nm_txsync)(struct ifnet *, u_int ring, int lock);
 	int (*nm_rxsync)(struct ifnet *, u_int ring, int lock);
+	/* return configuration information */
+	int (*nm_config)(struct ifnet *, u_int *txr, u_int *txd,
+					u_int *rxr, u_int *rxd);
+
+	int bdg_port;
 #ifdef linux
 	struct net_device_ops nm_ndo;
+	int if_refcount;	// XXX additions for bridge
 #endif /* linux */
 };
 
 /*
- * The combination of "enable" (ifp->if_capabilities &IFCAP_NETMAP)
+ * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP)
  * and refcount gives the status of the interface, namely:
  *
  *	enable	refcount	Status
@@ -212,7 +271,7 @@
 	enum txrx tx, int n, u_int new_cur);
 int netmap_ring_reinit(struct netmap_kring *);
 
-extern int netmap_buf_size;
+extern u_int netmap_buf_size;
 #define NETMAP_BUF_SIZE	netmap_buf_size
 extern int netmap_mitigate;
 extern int netmap_no_pendintr;
@@ -239,7 +298,38 @@
 #endif
 #define	NA(_ifp)	((struct netmap_adapter *)WNA(_ifp))
 
+/*
+ * Macros to determine if an interface is netmap capable or netmap enabled.
+ * See the magic field in struct netmap_adapter.
+ */
+#ifdef __FreeBSD__
+/*
+ * on FreeBSD just use if_capabilities and if_capenable.
+ */
+#define NETMAP_CAPABLE(ifp)	(NA(ifp) &&		\
+	(ifp)->if_capabilities & IFCAP_NETMAP )
 
+#define	NETMAP_SET_CAPABLE(ifp)				\
+	(ifp)->if_capabilities |= IFCAP_NETMAP
+
+#else	/* linux */
+
+/*
+ * on linux:
+ * we check if NA(ifp) is set and its first element has a related
+ * magic value. The capenable is within the struct netmap_adapter.
+ */
+#define	NETMAP_MAGIC	0x52697a7a
+
+#define NETMAP_CAPABLE(ifp)	(NA(ifp) &&		\
+	((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC )
+
+#define	NETMAP_SET_CAPABLE(ifp)				\
+	NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC
+
+#endif	/* linux */
+
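A hedged sketch of how a driver interrupt handler might use the macros above together with netmap_rx_irq(); the function and variable names are illustrative, not taken from any driver in this commit:

	/* hypothetical fragment of a driver interrupt handler */
	static int
	my_intr_netmap_hook(struct ifnet *ifp, int ring)
	{
		int work_done = 0;

		if (!NETMAP_CAPABLE(ifp))
			return 0;
		/* let netmap's default handler wake up pollers on this ring */
		return netmap_rx_irq(ifp, ring, &work_done);
	}
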
+#ifdef __FreeBSD__
 /* Callback invoked by the dma machinery after a successful dmamap_load */
 static void netmap_dmamap_cb(__unused void *arg,
     __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error)
@@ -267,8 +357,50 @@
 		    netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
 	}
 }
+#else /* linux */
 
 /*
+ * XXX How do we redefine these functions:
+ *
+ * on linux we need
+ *	dma_map_single(&pdev->dev, virt_addr, len, direction)
+ *	dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction)
+ * The len can be implicit (on netmap it is NETMAP_BUF_SIZE);
+ * unfortunately the direction is not, so we need to change
+ * something to have a cross API.
+ */
+#define netmap_load_map(_t, _m, _b)
+#define netmap_reload_map(_t, _m, _b)
+#if 0
+	struct e1000_buffer *buffer_info =  &tx_ring->buffer_info[l];
+	/* set time_stamp *before* dma to help avoid a possible race */
+	buffer_info->time_stamp = jiffies;
+	buffer_info->mapped_as_page = false;
+	buffer_info->length = len;
+	//buffer_info->next_to_watch = l;
+	/* reload dma map */
+	dma_unmap_single(&adapter->pdev->dev, buffer_info->dma,
+			NETMAP_BUF_SIZE, DMA_TO_DEVICE);
+	buffer_info->dma = dma_map_single(&adapter->pdev->dev,
+			addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE);
+
+	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
+		D("dma mapping error");
+		/* goto dma_error; See e1000_put_txbuf() */
+		/* XXX reset */
+	}
+	tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX
+
+#endif
+
+/*
+ * The bus_dmamap_sync() can be one of wmb() or rmb() depending on direction.
+ */
+#define bus_dmamap_sync(_a, _b, _c)
+
+#endif /* linux */
+
+/*
  * functions to map NIC to KRING indexes (n2k) and vice versa (k2n)
  */
 static inline int
@@ -322,7 +454,7 @@
 NMB(struct netmap_slot *slot)
 {
 	uint32_t i = slot->buf_idx;
-	return (i >= netmap_total_buffers) ?  NMB_VA(0) : NMB_VA(i);
+	return (unlikely(i >= netmap_total_buffers)) ?  NMB_VA(0) : NMB_VA(i);
 }
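
For reference, a sketch of what the n2k mapping mentioned above boils down to: adding nkr_hwofs to the NIC index and wrapping the result into [0, nkr_num_slots). The name below is made up; the actual helpers are defined in this header.

	/* sketch: NIC descriptor index -> netmap ring index */
	static inline int
	sketch_idx_n2k(struct netmap_kring *kr, int idx)
	{
		int n = kr->nkr_num_slots;

		idx += kr->nkr_hwofs;	/* may fall outside [0, n) */
		if (idx < 0)
			return idx + n;
		else if (idx < n)
			return idx;
		else
			return idx - n;
	}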
 
 static inline void *
@@ -341,4 +473,6 @@
 /* default functions to handle rx/tx interrupts */
 int netmap_rx_irq(struct ifnet *, int, int *);
 #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
+
+extern int netmap_copy;
 #endif /* _NET_NETMAP_KERN_H_ */

Deleted: trunk/sys/dev/netmap/netmap_mem1.c
===================================================================
--- trunk/sys/dev/netmap/netmap_mem1.c	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/netmap_mem1.c	2016-09-25 23:53:30 UTC (rev 8728)
@@ -1,521 +0,0 @@
-/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $MidnightBSD$
- *
- * The original netmap memory allocator, using a single large
- * chunk of memory allocated with contigmalloc.
- */
-
-/*
- * Default amount of memory pre-allocated by the module.
- * We start with a large size and then shrink our demand
- * according to what is avalable when the module is loaded.
- */
-#define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE)
-static void * netmap_malloc(size_t size, const char *msg);
-static void netmap_free(void *addr, const char *msg);
-
-#define netmap_if_malloc(len)   netmap_malloc(len, "nifp")
-#define netmap_if_free(v)	netmap_free((v), "nifp")
-
-#define netmap_ring_malloc(len) netmap_malloc(len, "ring")
-#define netmap_free_rings(na)		\
-	netmap_free((na)->tx_rings[0].ring, "shadow rings");
-
-/*
- * Allocator for a pool of packet buffers. For each buffer we have
- * one entry in the bitmap to signal the state. Allocation scans
- * the bitmap, but since this is done only on attach, we are not
- * too worried about performance
- * XXX if we need to allocate small blocks, a translation
- * table is used both for kernel virtual address and physical
- * addresses.
- */
-struct netmap_buf_pool {
-	u_int total_buffers;	/* total buffers. */
-	u_int free;
-	u_int bufsize;
-	char *base;		/* buffer base address */
-	uint32_t *bitmap;	/* one bit per buffer, 1 means free */
-};
-struct netmap_buf_pool nm_buf_pool;
-SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers,
-    CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
-SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
-    CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
-
-
-/*
- * Allocate n buffers from the ring, and fill the slot.
- * Buffer 0 is the 'junk' buffer.
- */
-static void
-netmap_new_bufs(struct netmap_if *nifp __unused,
-		struct netmap_slot *slot, u_int n)
-{
-	struct netmap_buf_pool *p = &nm_buf_pool;
-	uint32_t bi = 0;		/* index in the bitmap */
-	uint32_t mask, j, i = 0;	/* slot counter */
-
-	if (n > p->free) {
-		D("only %d out of %d buffers available", i, n);
-		return;
-	}
-	/* termination is guaranteed by p->free */
-	while (i < n && p->free > 0) {
-		uint32_t cur = p->bitmap[bi];
-		if (cur == 0) { /* bitmask is fully used */
-			bi++;
-			continue;
-		}
-		/* locate a slot */
-		for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
-		p->bitmap[bi] &= ~mask;		/* slot in use */
-		p->free--;
-		slot[i].buf_idx = bi*32+j;
-		slot[i].len = p->bufsize;
-		slot[i].flags = NS_BUF_CHANGED;
-		i++;
-	}
-	ND("allocated %d buffers, %d available", n, p->free);
-}
-
-
-static void
-netmap_free_buf(struct netmap_if *nifp __unused, uint32_t i)
-{
-	struct netmap_buf_pool *p = &nm_buf_pool;
-
-	uint32_t pos, mask;
-	if (i >= p->total_buffers) {
-		D("invalid free index %d", i);
-		return;
-	}
-	pos = i / 32;
-	mask = 1 << (i % 32);
-	if (p->bitmap[pos] & mask) {
-		D("slot %d already free", i);
-		return;
-	}
-	p->bitmap[pos] |= mask;
-	p->free++;
-}
-
-
-/* Descriptor of the memory objects handled by our memory allocator. */
-struct netmap_mem_obj {
-	TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
-						 chain. */
-	int nmo_used; /* flag set on used memory objects. */
-	size_t nmo_size; /* size of the memory area reserved for the
-			    object. */
-	void *nmo_data; /* pointer to the memory area. */
-};
-
-/* Wrap our memory objects to make them ``chainable``. */
-TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
-
-
-/* Descriptor of our custom memory allocator. */
-struct netmap_mem_d {
-	struct mtx nm_mtx; /* lock used to handle the chain of memory
-			      objects. */
-	struct netmap_mem_obj_h nm_molist; /* list of memory objects */
-	size_t nm_size; /* total amount of memory used for rings etc. */
-	size_t nm_totalsize; /* total amount of allocated memory
-		(the difference is used for buffers) */
-	size_t nm_buf_start; /* offset of packet buffers.
-			This is page-aligned. */
-	size_t nm_buf_len; /* total memory for buffers */
-	void *nm_buffer; /* pointer to the whole pre-allocated memory
-			    area. */
-};
-
-/* Shorthand to compute a netmap interface offset. */
-#define netmap_if_offset(v)                                     \
-    ((char *) (v) - (char *) nm_mem->nm_buffer)
-/* .. and get a physical address given a memory offset */
-#define netmap_ofstophys(o)                                     \
-    (vtophys(nm_mem->nm_buffer) + (o))
-
-
-/*------ netmap memory allocator -------*/
-/*
- * Request for a chunk of memory.
- *
- * Memory objects are arranged into a list, hence we need to walk this
- * list until we find an object with the needed amount of data free.
- * This sounds like a completely inefficient implementation, but given
- * the fact that data allocation is done once, we can handle it
- * flawlessly.
- *
- * Return NULL on failure.
- */
-static void *
-netmap_malloc(size_t size, __unused const char *msg)
-{
-	struct netmap_mem_obj *mem_obj, *new_mem_obj;
-	void *ret = NULL;
-
-	NMA_LOCK();
-	TAILQ_FOREACH(mem_obj, &nm_mem->nm_molist, nmo_next) {
-		if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
-			continue;
-
-		new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
-				     M_WAITOK | M_ZERO);
-		TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
-
-		new_mem_obj->nmo_used = 1;
-		new_mem_obj->nmo_size = size;
-		new_mem_obj->nmo_data = mem_obj->nmo_data;
-		memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
-
-		mem_obj->nmo_size -= size;
-		mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
-		if (mem_obj->nmo_size == 0) {
-			TAILQ_REMOVE(&nm_mem->nm_molist, mem_obj,
-				     nmo_next);
-			free(mem_obj, M_NETMAP);
-		}
-
-		ret = new_mem_obj->nmo_data;
-
-		break;
-	}
-	NMA_UNLOCK();
-	ND("%s: %d bytes at %p", msg, size, ret);
-
-	return (ret);
-}
-
-/*
- * Return the memory to the allocator.
- *
- * While freeing a memory object, we try to merge adjacent chunks in
- * order to reduce memory fragmentation.
- */
-static void
-netmap_free(void *addr, const char *msg)
-{
-	size_t size;
-	struct netmap_mem_obj *cur, *prev, *next;
-
-	if (addr == NULL) {
-		D("NULL addr for %s", msg);
-		return;
-	}
-
-	NMA_LOCK();
-	TAILQ_FOREACH(cur, &nm_mem->nm_molist, nmo_next) {
-		if (cur->nmo_data == addr && cur->nmo_used)
-			break;
-	}
-	if (cur == NULL) {
-		NMA_UNLOCK();
-		D("invalid addr %s %p", msg, addr);
-		return;
-	}
-
-	size = cur->nmo_size;
-	cur->nmo_used = 0;
-
-	/* merge current chunk of memory with the previous one,
-	   if present. */
-	prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
-	if (prev && prev->nmo_used == 0) {
-		TAILQ_REMOVE(&nm_mem->nm_molist, cur, nmo_next);
-		prev->nmo_size += cur->nmo_size;
-		free(cur, M_NETMAP);
-		cur = prev;
-	}
-
-	/* merge with the next one */
-	next = TAILQ_NEXT(cur, nmo_next);
-	if (next && next->nmo_used == 0) {
-		TAILQ_REMOVE(&nm_mem->nm_molist, next, nmo_next);
-		cur->nmo_size += next->nmo_size;
-		free(next, M_NETMAP);
-	}
-	NMA_UNLOCK();
-	ND("freed %s %d bytes at %p", msg, size, addr);
-}
-
-
-/*
- * Create and return a new ``netmap_if`` object, and possibly also
- * rings and packet buffors.
- *
- * Return NULL on failure.
- */
-static void *
-netmap_if_new(const char *ifname, struct netmap_adapter *na)
-{
-	struct netmap_if *nifp;
-	struct netmap_ring *ring;
-	struct netmap_kring *kring;
-	char *buff;
-	u_int i, len, ofs, numdesc;
-	u_int nrx = na->num_rx_rings + 1; /* shorthand, include stack queue */
-	u_int ntx = na->num_tx_rings + 1; /* shorthand, include stack queue */
-
-	/*
-	 * the descriptor is followed inline by an array of offsets
-	 * to the tx and rx rings in the shared memory region.
-	 */
-	len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t);
-	nifp = netmap_if_malloc(len);
-	if (nifp == NULL)
-		return (NULL);
-
-	/* initialize base fields */
-	*(int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings;
-	*(int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings;
-	strncpy(nifp->ni_name, ifname, IFNAMSIZ);
-
-	(na->refcount)++;	/* XXX atomic ? we are under lock */
-	if (na->refcount > 1)
-		goto final;
-
-	/*
-	 * First instance. Allocate the netmap rings
-	 * (one for each hw queue, one pair for the host).
-	 * The rings are contiguous, but have variable size.
-	 * The entire block is reachable at
-	 *	na->tx_rings[0]
-	 */
-	len = (ntx + nrx) * sizeof(struct netmap_ring) +
-	      (ntx * na->num_tx_desc + nrx * na->num_rx_desc) *
-		   sizeof(struct netmap_slot);
-	buff = netmap_ring_malloc(len);
-	if (buff == NULL) {
-		D("failed to allocate %d bytes for %s shadow ring",
-			len, ifname);
-error:
-		(na->refcount)--;
-		netmap_if_free(nifp);
-		return (NULL);
-	}
-	/* Check whether we have enough buffers */
-	len = ntx * na->num_tx_desc + nrx * na->num_rx_desc;
-	NMA_LOCK();
-	if (nm_buf_pool.free < len) {
-		NMA_UNLOCK();
-		netmap_free(buff, "not enough bufs");
-		goto error;
-	}
-	/*
-	 * in the kring, store the pointers to the shared rings
-	 * and initialize the rings. We are under NMA_LOCK().
-	 */
-	ofs = 0;
-	for (i = 0; i < ntx; i++) { /* Transmit rings */
-		kring = &na->tx_rings[i];
-		numdesc = na->num_tx_desc;
-		bzero(kring, sizeof(*kring));
-		kring->na = na;
-
-		ring = kring->ring = (struct netmap_ring *)(buff + ofs);
-		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
-			nm_buf_pool.base - (char *)ring;
-		ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
-		*(uint32_t *)(uintptr_t)&ring->num_slots =
-			kring->nkr_num_slots = numdesc;
-
-		/*
-		 * IMPORTANT:
-		 * Always keep one slot empty, so we can detect new
-		 * transmissions comparing cur and nr_hwcur (they are
-		 * the same only if there are no new transmissions).
-		 */
-		ring->avail = kring->nr_hwavail = numdesc - 1;
-		ring->cur = kring->nr_hwcur = 0;
-		*(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
-		netmap_new_bufs(nifp, ring->slot, numdesc);
-
-		ofs += sizeof(struct netmap_ring) +
-			numdesc * sizeof(struct netmap_slot);
-	}
-
-	for (i = 0; i < nrx; i++) { /* Receive rings */
-		kring = &na->rx_rings[i];
-		numdesc = na->num_rx_desc;
-		bzero(kring, sizeof(*kring));
-		kring->na = na;
-
-		ring = kring->ring = (struct netmap_ring *)(buff + ofs);
-		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
-			nm_buf_pool.base - (char *)ring;
-		ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
-		*(uint32_t *)(uintptr_t)&ring->num_slots =
-			kring->nkr_num_slots = numdesc;
-		ring->cur = kring->nr_hwcur = 0;
-		ring->avail = kring->nr_hwavail = 0; /* empty */
-		*(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
-		netmap_new_bufs(nifp, ring->slot, numdesc);
-		ofs += sizeof(struct netmap_ring) +
-			numdesc * sizeof(struct netmap_slot);
-	}
-	NMA_UNLOCK();
-	// XXX initialize the selrecord structs.
-
-final:
-	/*
-	 * fill the slots for the rx and tx queues. They contain the offset
-	 * between the ring and nifp, so the information is usable in
-	 * userspace to reach the ring from the nifp.
-	 */
-	for (i = 0; i < ntx; i++) {
-		*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
-			(char *)na->tx_rings[i].ring - (char *)nifp;
-	}
-	for (i = 0; i < nrx; i++) {
-		*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+ntx] =
-			(char *)na->rx_rings[i].ring - (char *)nifp;
-	}
-	return (nifp);
-}
-
-/*
- * Initialize the memory allocator.
- *
- * Create the descriptor for the memory , allocate the pool of memory
- * and initialize the list of memory objects with a single chunk
- * containing the whole pre-allocated memory marked as free.
- *
- * Start with a large size, then halve as needed if we fail to
- * allocate the block. While halving, always add one extra page
- * because buffers 0 and 1 are used for special purposes.
- * Return 0 on success, errno otherwise.
- */
-static int
-netmap_memory_init(void)
-{
-	struct netmap_mem_obj *mem_obj;
-	void *buf = NULL;
-	int i, n, sz = NETMAP_MEMORY_SIZE;
-	int extra_sz = 0; // space for rings and two spare buffers
-
-	for (; sz >= 1<<20; sz >>=1) {
-		extra_sz = sz/200;
-		extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
-	        buf = contigmalloc(sz + extra_sz,
-			     M_NETMAP,
-			     M_WAITOK | M_ZERO,
-			     0, /* low address */
-			     -1UL, /* high address */
-			     PAGE_SIZE, /* alignment */
-			     0 /* boundary */
-			    );
-		if (buf)
-			break;
-	}
-	if (buf == NULL)
-		return (ENOMEM);
-	sz += extra_sz;
-	nm_mem = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
-			      M_WAITOK | M_ZERO);
-	mtx_init(&nm_mem->nm_mtx, "netmap memory allocator lock", NULL,
-		 MTX_DEF);
-	TAILQ_INIT(&nm_mem->nm_molist);
-	nm_mem->nm_buffer = buf;
-	nm_mem->nm_totalsize = sz;
-
-	/*
-	 * A buffer takes 2k, a slot takes 8 bytes + ring overhead,
-	 * so the ratio is 200:1. In other words, we can use 1/200 of
-	 * the memory for the rings, and the rest for the buffers,
-	 * and be sure we never run out.
-	 */
-	nm_mem->nm_size = sz/200;
-	nm_mem->nm_buf_start =
-		(nm_mem->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
-	nm_mem->nm_buf_len = sz - nm_mem->nm_buf_start;
-
-	nm_buf_pool.base = nm_mem->nm_buffer;
-	nm_buf_pool.base += nm_mem->nm_buf_start;
-	netmap_buffer_base = nm_buf_pool.base;
-	D("netmap_buffer_base %p (offset %d)",
-		netmap_buffer_base, (int)nm_mem->nm_buf_start);
-	/* number of buffers, they all start as free */
-
-	netmap_total_buffers = nm_buf_pool.total_buffers =
-		nm_mem->nm_buf_len / NETMAP_BUF_SIZE;
-	nm_buf_pool.bufsize = NETMAP_BUF_SIZE;
-
-	D("Have %d MB, use %dKB for rings, %d buffers at %p",
-		(sz >> 20), (int)(nm_mem->nm_size >> 10),
-		nm_buf_pool.total_buffers, nm_buf_pool.base);
-
-	/* allocate and initialize the bitmap. Entry 0 is considered
-	 * always busy (used as default when there are no buffers left).
-	 */
-	n = (nm_buf_pool.total_buffers + 31) / 32;
-	nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP,
-			 M_WAITOK | M_ZERO);
-	nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */
-	for (i = 1; i < n; i++)
-		nm_buf_pool.bitmap[i] = ~0;
-	nm_buf_pool.free = nm_buf_pool.total_buffers - 2;
-	
-	mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
-			 M_WAITOK | M_ZERO);
-	TAILQ_INSERT_HEAD(&nm_mem->nm_molist, mem_obj, nmo_next);
-	mem_obj->nmo_used = 0;
-	mem_obj->nmo_size = nm_mem->nm_size;
-	mem_obj->nmo_data = nm_mem->nm_buffer;
-
-	return (0);
-}
-
-
-/*
- * Finalize the memory allocator.
- *
- * Free all the memory objects contained inside the list, and deallocate
- * the pool of memory; finally free the memory allocator descriptor.
- */
-static void
-netmap_memory_fini(void)
-{
-	struct netmap_mem_obj *mem_obj;
-
-	while (!TAILQ_EMPTY(&nm_mem->nm_molist)) {
-		mem_obj = TAILQ_FIRST(&nm_mem->nm_molist);
-		TAILQ_REMOVE(&nm_mem->nm_molist, mem_obj, nmo_next);
-		if (mem_obj->nmo_used == 1) {
-			printf("netmap: leaked %d bytes at %p\n",
-			       (int)mem_obj->nmo_size,
-			       mem_obj->nmo_data);
-		}
-		free(mem_obj, M_NETMAP);
-	}
-	contigfree(nm_mem->nm_buffer, nm_mem->nm_totalsize, M_NETMAP);
-	// XXX mutex_destroy(nm_mtx);
-	free(nm_mem, M_NETMAP);
-}
-/*------------- end of memory allocator -----------------*/

Modified: trunk/sys/dev/netmap/netmap_mem2.c
===================================================================
--- trunk/sys/dev/netmap/netmap_mem2.c	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/dev/netmap/netmap_mem2.c	2016-09-25 23:53:30 UTC (rev 8728)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2012 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -51,7 +51,8 @@
  * of the object, and from there locate the offset from the beginning
  * of the region.
  *
- * Allocator for a pool of memory objects of the same size.
+ * The individual allocators manage a pool of memory for objects of
+ * the same size.
  * The pool is split into smaller clusters, whose size is a
  * multiple of the page size. The cluster size is chosen
  * to minimize the waste for a given max cluster size
@@ -58,52 +59,90 @@
  * (we do it by brute force, as we have relatively few object
  * per cluster).
  *
- * To be polite with the cache, objects are aligned to
- * the cache line, or 64 bytes. Sizes are rounded to multiple of 64.
- * For each object we have
- * one entry in the bitmap to signal the state. Allocation scans
- * the bitmap, but since this is done only on attach, we are not
+ * Objects are aligned to the cache line (64 bytes) rounding up object
+ * sizes when needed. A bitmap contains the state of each object.
+ * Allocation scans the bitmap; this is done only on attach, so we are not
  * too worried about performance
- */
-
-/*
- *	MEMORY SIZES:
  *
- * (all the parameters below will become tunables)
+ * For each allocator we can define (through sysctl) the size and
+ * number of each object. Memory is allocated at the first use of a
+ * netmap file descriptor, and can be freed when all such descriptors
+ * have been released (including unmapping the memory).
+ * If memory is scarce, the system tries to get as much as possible
+ * and the sysctl values reflect the actual allocation.
+ * Together with the desired values, the sysctls also export the absolute
+ * minimum and maximum values that cannot be overridden.
  *
- * struct netmap_if is variable size but small.
- * Assuming each NIC has 8+2 rings, (4+1 tx, 4+1 rx) the netmap_if
- * uses 120 bytes on a 64-bit machine.
- * We allocate NETMAP_IF_MAX_SIZE  (1024) which should work even for
- * cards with 48 ring pairs.
- * The total number of 'struct netmap_if' could be slightly larger
- * that the total number of rings on all interfaces on the system.
+ * struct netmap_if:
+ *	variable size, max 16 bytes per ring pair plus some fixed amount.
+ *	1024 bytes should be large enough in practice.
+ *
+ *	In the worst case we have one netmap_if per ring in the system.
+ *
+ * struct netmap_ring
+ *	variable too, 8 byte per slot plus some fixed amount.
+ *	Rings can be large (e.g. 4k slots, or >32Kbytes).
+ *	We default to 36 KB (9 pages), and a few hundred rings.
+ *
+ * struct netmap_buffer
+ *	The more the better, both because fast interfaces tend to have
+ *	many slots, and because we may want to use buffers to store
+ *	packets in userspace avoiding copies.
+ *	Must contain a full frame (e.g. 1518, or more for VLANs, jumbo
+ *	frames etc.) plus be nicely aligned, plus some NICs restrict
+ *	the size to multiple of 1K or so. Default to 2K
  */
-#define NETMAP_IF_MAX_SIZE      1024
-#define NETMAP_IF_MAX_NUM       512
 
-/*
- * netmap rings are up to 2..4k descriptors, 8 bytes each,
- * plus some glue at the beginning (32 bytes).
- * We set the default ring size to 9 pages (36K) and enable
- * a few hundreds of them.
- */
-#define NETMAP_RING_MAX_SIZE    (9*PAGE_SIZE)
-#define NETMAP_RING_MAX_NUM     200	/* approx 8MB */
-
-/*
- * Buffers: the more the better. Buffer size is NETMAP_BUF_SIZE,
- * 2k or slightly less, aligned to 64 bytes.
- * A large 10G interface can have 2k*18 = 36k buffers per interface,
- * or about 72MB of memory. Up to us to use more.
- */
 #ifndef CONSERVATIVE
-#define NETMAP_BUF_MAX_NUM      100000  /* 200MB */
+#define NETMAP_BUF_MAX_NUM	20*4096*2	/* large machine */
 #else /* CONSERVATIVE */
 #define NETMAP_BUF_MAX_NUM      20000   /* 40MB */
 #endif
 
+#ifdef linux
+#define NMA_LOCK_T		struct semaphore
+#define NMA_LOCK_INIT()		sema_init(&nm_mem.nm_mtx, 1)
+#define NMA_LOCK_DESTROY()	
+#define NMA_LOCK()		down(&nm_mem.nm_mtx)
+#define NMA_UNLOCK()		up(&nm_mem.nm_mtx)
+#else /* !linux */
+#define NMA_LOCK_T		struct mtx
+#define NMA_LOCK_INIT()		mtx_init(&nm_mem.nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF)
+#define NMA_LOCK_DESTROY()	mtx_destroy(&nm_mem.nm_mtx)
+#define NMA_LOCK()		mtx_lock(&nm_mem.nm_mtx)
+#define NMA_UNLOCK()		mtx_unlock(&nm_mem.nm_mtx)
+#endif /* linux */
 
+enum {
+	NETMAP_IF_POOL   = 0,
+	NETMAP_RING_POOL,
+	NETMAP_BUF_POOL,
+	NETMAP_POOLS_NR
+};
+
+
+struct netmap_obj_params {
+	u_int size;
+	u_int num;
+};
+
+
+struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
+	[NETMAP_IF_POOL] = {
+		.size = 1024,
+		.num  = 100,
+	},
+	[NETMAP_RING_POOL] = {
+		.size = 9*PAGE_SIZE,
+		.num  = 200,
+	},
+	[NETMAP_BUF_POOL] = {
+		.size = 2048,
+		.num  = NETMAP_BUF_MAX_NUM,
+	},
+};
+
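A quick sizing example for the defaults above, assuming 4 KB pages (PAGE_SIZE is platform dependent): the requests come to 1024 * 100 = 100 KB of netmap_if objects, 9 * 4096 * 200 ≈ 7 MB of rings, and 2048 * 20*4096*2 ≈ 320 MB of buffers on the non-CONSERVATIVE build. If contiguous memory is scarce the allocator settles for less, and the *_curr_* sysctls declared further down report what was actually obtained.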
+
 struct netmap_obj_pool {
 	char name[16];		/* name of the allocator */
 	u_int objtotal;         /* actual total number of objects. */
@@ -110,6 +149,12 @@
 	u_int objfree;          /* number of free objects. */
 	u_int clustentries;	/* actual objects per cluster */
 
+	/* limits */
+	u_int objminsize;	/* minimum object size */
+	u_int objmaxsize;	/* maximum object size */
+	u_int nummin;		/* minimum number of objects */
+	u_int nummax;		/* maximum number of objects */
+
 	/* the total memory space is _numclusters*_clustsize */
 	u_int _numclusters;	/* how many clusters */
 	u_int _clustsize;        /* cluster size */
@@ -118,21 +163,70 @@
 	u_int _memtotal;	/* _numclusters*_clustsize */
 	struct lut_entry *lut;  /* virt,phys addresses, objtotal entries */
 	uint32_t *bitmap;       /* one bit per buffer, 1 means free */
+	uint32_t bitmap_slots;	/* number of uint32 entries in bitmap */
 };
 
+
 struct netmap_mem_d {
-	NM_LOCK_T nm_mtx; /* protect the allocator ? */
+	NMA_LOCK_T nm_mtx;  /* protect the allocator */
 	u_int nm_totalsize; /* shorthand */
 
-	/* pointers to the three allocators */
-	struct netmap_obj_pool *nm_if_pool;
-	struct netmap_obj_pool *nm_ring_pool;
-	struct netmap_obj_pool *nm_buf_pool;
+	int finalized;		/* !=0 iff preallocation done */
+	int lasterr;		/* last error for curr config */
+	int refcount;		/* existing priv structures */
+	/* the three allocators */
+	struct netmap_obj_pool pools[NETMAP_POOLS_NR];
 };
 
+
+static struct netmap_mem_d nm_mem = {	/* Our memory allocator. */
+	.pools = {
+		[NETMAP_IF_POOL] = {
+			.name 	= "netmap_if",
+			.objminsize = sizeof(struct netmap_if),
+			.objmaxsize = 4096,
+			.nummin     = 10,	/* don't be stingy */
+			.nummax	    = 10000,	/* XXX very large */
+		},
+		[NETMAP_RING_POOL] = {
+			.name 	= "netmap_ring",
+			.objminsize = sizeof(struct netmap_ring),
+			.objmaxsize = 32*PAGE_SIZE,
+			.nummin     = 2,
+			.nummax	    = 1024,
+		},
+		[NETMAP_BUF_POOL] = {
+			.name	= "netmap_buf",
+			.objminsize = 64,
+			.objmaxsize = 65536,
+			.nummin     = 4,
+			.nummax	    = 1000000, /* one million! */
+		},
+	},
+};
+
 struct lut_entry *netmap_buffer_lut;	/* exported */
 
+/* memory allocator related sysctls */
 
+#define STRINGIFY(x) #x
+
+#define DECLARE_SYSCTLS(id, name) \
+	/* TUNABLE_INT("hw.netmap." STRINGIFY(name) "_size", &netmap_params[id].size); */ \
+	SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \
+	    CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \
+        SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \
+            CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \
+	/* TUNABLE_INT("hw.netmap." STRINGIFY(name) "_num", &netmap_params[id].num); */ \
+        SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \
+            CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \
+        SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \
+            CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s")
+
+DECLARE_SYSCTLS(NETMAP_IF_POOL, if);
+DECLARE_SYSCTLS(NETMAP_RING_POOL, ring);
+DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf);
+
 /*
  * Convert a userspace offset to a physical address.
  * XXX re-do in a simpler way.
@@ -146,24 +240,25 @@
 static inline vm_paddr_t
 netmap_ofstophys(vm_offset_t offset)
 {
-	const struct netmap_obj_pool *p[] = {
-		nm_mem->nm_if_pool,
-		nm_mem->nm_ring_pool,
-		nm_mem->nm_buf_pool };
 	int i;
 	vm_offset_t o = offset;
+	struct netmap_obj_pool *p = nm_mem.pools;
 
-
-	for (i = 0; i < 3; offset -= p[i]->_memtotal, i++) {
-		if (offset >= p[i]->_memtotal)
+	for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i]._memtotal, i++) {
+		if (offset >= p[i]._memtotal)
 			continue;
 		// XXX now scan the clusters
-		return p[i]->lut[offset / p[i]->_objsize].paddr +
-			offset % p[i]->_objsize;
+		return p[i].lut[offset / p[i]._objsize].paddr +
+			offset % p[i]._objsize;
 	}
+	/* this is only in case of errors */
 	D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o,
-		p[0]->_memtotal, p[0]->_memtotal + p[1]->_memtotal,
-		p[0]->_memtotal + p[1]->_memtotal + p[2]->_memtotal);
+		p[NETMAP_IF_POOL]._memtotal,
+		p[NETMAP_IF_POOL]._memtotal
+			+ p[NETMAP_RING_POOL]._memtotal,
+		p[NETMAP_IF_POOL]._memtotal
+			+ p[NETMAP_RING_POOL]._memtotal
+			+ p[NETMAP_BUF_POOL]._memtotal);
 	return 0;	// XXX bad address
 }
 
@@ -198,20 +293,24 @@
 
 /* Helper functions which convert virtual addresses to offsets */
 #define netmap_if_offset(v)					\
-	netmap_obj_offset(nm_mem->nm_if_pool, (v))
+	netmap_obj_offset(&nm_mem.pools[NETMAP_IF_POOL], (v))
 
 #define netmap_ring_offset(v)					\
-    (nm_mem->nm_if_pool->_memtotal + 				\
-	netmap_obj_offset(nm_mem->nm_ring_pool, (v)))
+    (nm_mem.pools[NETMAP_IF_POOL]._memtotal + 				\
+	netmap_obj_offset(&nm_mem.pools[NETMAP_RING_POOL], (v)))
 
 #define netmap_buf_offset(v)					\
-    (nm_mem->nm_if_pool->_memtotal +				\
-	nm_mem->nm_ring_pool->_memtotal +			\
-	netmap_obj_offset(nm_mem->nm_buf_pool, (v)))
+    (nm_mem.pools[NETMAP_IF_POOL]._memtotal +				\
+	nm_mem.pools[NETMAP_RING_POOL]._memtotal +			\
+	netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v)))
 
 
+/*
+ * report the index, and use the start position as a hint;
+ * otherwise buffer allocation becomes terribly expensive.
+ */
 static void *
-netmap_obj_malloc(struct netmap_obj_pool *p, int len)
+netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t *index)
 {
 	uint32_t i = 0;			/* index in the bitmap */
 	uint32_t mask, j;		/* slot counter */
@@ -227,9 +326,11 @@
 		D("%s allocator: run out of memory", p->name);
 		return NULL;
 	}
+	if (start)
+		i = *start;
 
-	/* termination is guaranteed by p->free */
-	while (vaddr == NULL) {
+	/* termination is guaranteed by p->free, but better check bounds on i */
+	while (vaddr == NULL && i < p->bitmap_slots)  {
 		uint32_t cur = p->bitmap[i];
 		if (cur == 0) { /* bitmask is fully used */
 			i++;
@@ -243,9 +344,13 @@
 		p->objfree--;
 
 		vaddr = p->lut[i * 32 + j].vaddr;
+		if (index)
+			*index = i * 32 + j;
 	}
 	ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr);
 
+	if (start)
+		*start = i;
 	return vaddr;
 }
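
The start/index hints matter because netmap_new_bufs() below asks for thousands of buffers in a row: restarting the bitmap scan from word 0 on every call would make filling a large ring roughly quadratic in the ring size, while threading the previous position back in keeps the whole fill close to linear.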
 
@@ -283,51 +388,59 @@
 		netmap_obj_free(p, j);
 		return;
 	}
-	ND("address %p is not contained inside any cluster (%s)",
+	D("address %p is not contained inside any cluster (%s)",
 	    vaddr, p->name);
 }
 
-#define netmap_if_malloc(len)	netmap_obj_malloc(nm_mem->nm_if_pool, len)
-#define netmap_if_free(v)	netmap_obj_free_va(nm_mem->nm_if_pool, (v))
-#define netmap_ring_malloc(len)	netmap_obj_malloc(nm_mem->nm_ring_pool, len)
-#define netmap_buf_malloc()			\
-	netmap_obj_malloc(nm_mem->nm_buf_pool, NETMAP_BUF_SIZE)
+#define netmap_if_malloc(len)	netmap_obj_malloc(&nm_mem.pools[NETMAP_IF_POOL], len, NULL, NULL)
+#define netmap_if_free(v)	netmap_obj_free_va(&nm_mem.pools[NETMAP_IF_POOL], (v))
+#define netmap_ring_malloc(len)	netmap_obj_malloc(&nm_mem.pools[NETMAP_RING_POOL], len, NULL, NULL)
+#define netmap_ring_free(v)	netmap_obj_free_va(&nm_mem.pools[NETMAP_RING_POOL], (v))
+#define netmap_buf_malloc(_pos, _index)			\
+	netmap_obj_malloc(&nm_mem.pools[NETMAP_BUF_POOL], NETMAP_BUF_SIZE, _pos, _index)
 
 
 /* Return the index associated to the given packet buffer */
 #define netmap_buf_index(v)						\
-    (netmap_obj_offset(nm_mem->nm_buf_pool, (v)) / nm_mem->nm_buf_pool->_objsize)
+    (netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v)) / nm_mem.pools[NETMAP_BUF_POOL]._objsize)
 
 
-static void
-netmap_new_bufs(struct netmap_if *nifp __unused,
+/* Return nonzero on error */
+static int
+netmap_new_bufs(struct netmap_if *nifp,
                 struct netmap_slot *slot, u_int n)
 {
-	struct netmap_obj_pool *p = nm_mem->nm_buf_pool;
-	uint32_t i = 0;	/* slot counter */
+	struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL];
+	int i = 0;	/* slot counter */
+	uint32_t pos = 0;	/* slot in p->bitmap */
+	uint32_t index = 0;	/* buffer index */
 
+	(void)nifp;	/* UNUSED */
 	for (i = 0; i < n; i++) {
-		void *vaddr = netmap_buf_malloc();
+		void *vaddr = netmap_buf_malloc(&pos, &index);
 		if (vaddr == NULL) {
 			D("unable to locate empty packet buffer");
 			goto cleanup;
 		}
-
-		slot[i].buf_idx = netmap_buf_index(vaddr);
-		KASSERT(slot[i].buf_idx != 0,
-		    ("Assigning buf_idx=0 to just created slot"));
+		slot[i].buf_idx = index;
 		slot[i].len = p->_objsize;
-		slot[i].flags = NS_BUF_CHANGED; // XXX GAETANO hack
+		/* XXX setting flags=NS_BUF_CHANGED forces a pointer reload
+		 * in the NIC ring. This is a hack that hides missing
+		 * initializations in the drivers, and should go away.
+		 */
+		slot[i].flags = NS_BUF_CHANGED;
 	}
 
-	ND("allocated %d buffers, %d available", n, p->objfree);
-	return;
+	ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos);
+	return (0);
 
 cleanup:
 	while (i > 0) {
 		i--;
-		netmap_obj_free(nm_mem->nm_buf_pool, slot[i].buf_idx);
+		netmap_obj_free(p, slot[i].buf_idx);
 	}
+	bzero(slot, n * sizeof(slot[0]));
+	return (ENOMEM);
 }
 
 
@@ -334,25 +447,23 @@
 static void
 netmap_free_buf(struct netmap_if *nifp, uint32_t i)
 {
-	struct netmap_obj_pool *p = nm_mem->nm_buf_pool;
+	struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL];
+
 	if (i < 2 || i >= p->objtotal) {
 		D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal);
 		return;
 	}
-	netmap_obj_free(nm_mem->nm_buf_pool, i);
+	netmap_obj_free(p, i);
 }
 
-
-/*
- * Free all resources related to an allocator.
- */
 static void
-netmap_destroy_obj_allocator(struct netmap_obj_pool *p)
+netmap_reset_obj_allocator(struct netmap_obj_pool *p)
 {
 	if (p == NULL)
 		return;
 	if (p->bitmap)
 		free(p->bitmap, M_NETMAP);
+	p->bitmap = NULL;
 	if (p->lut) {
 		int i;
 		for (i = 0; i < p->objtotal; i += p->clustentries) {
@@ -360,13 +471,27 @@
 				contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP);
 		}
 		bzero(p->lut, sizeof(struct lut_entry) * p->objtotal);
+#ifdef linux
+		vfree(p->lut);
+#else
 		free(p->lut, M_NETMAP);
+#endif
 	}
-	bzero(p, sizeof(*p));
-	free(p, M_NETMAP);
+	p->lut = NULL;
 }
 
 /*
+ * Free all resources related to an allocator.
+ */
+static void
+netmap_destroy_obj_allocator(struct netmap_obj_pool *p)
+{
+	if (p == NULL)
+		return;
+	netmap_reset_obj_allocator(p);
+}
+
+/*
  * We receive a request for objtotal objects, of size objsize each.
  * Internally we may round up both numbers, as we allocate objects
  * in small clusters multiple of the page size.
@@ -377,10 +502,12 @@
  * XXX note -- userspace needs the buffers to be contiguous,
  *	so we cannot afford gaps at the end of a cluster.
  */
-static struct netmap_obj_pool *
-netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize)
+
+
+/* call with NMA_LOCK held */
+static int
+netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize)
 {
-	struct netmap_obj_pool *p;
 	int i, n;
 	u_int clustsize;	/* the cluster size, multiple of page size */
 	u_int clustentries;	/* how many objects per entry */
@@ -390,7 +517,7 @@
 	if (objsize >= MAX_CLUSTSIZE) {
 		/* we could do it but there is no point */
 		D("unsupported allocation for %d bytes", objsize);
-		return NULL;
+		goto error;
 	}
 	/* make sure objsize is a multiple of LINE_ROUND */
 	i = (objsize & (LINE_ROUND - 1));
@@ -398,6 +525,16 @@
 		D("XXX aligning object by %d bytes", LINE_ROUND - i);
 		objsize += LINE_ROUND - i;
 	}
+	if (objsize < p->objminsize || objsize > p->objmaxsize) {
+		D("requested objsize %d out of range [%d, %d]", 
+			objsize, p->objminsize, p->objmaxsize);
+		goto error;
+	}
+	if (objtotal < p->nummin || objtotal > p->nummax) {
+		D("requested objtotal %d out of range [%d, %d]", 
+			objtotal, p->nummin, p->nummax);
+		goto error;
+	}
 	/*
 	 * Compute number of objects using a brute-force approach:
 	 * given a max cluster size,
@@ -422,22 +559,14 @@
 	i =  (clustsize & (PAGE_SIZE - 1));
 	if (i)
 		clustsize += PAGE_SIZE - i;
-	D("objsize %d clustsize %d objects %d",
-		objsize, clustsize, clustentries);
+	if (netmap_verbose)
+		D("objsize %d clustsize %d objects %d",
+			objsize, clustsize, clustentries);
 
-	p = malloc(sizeof(struct netmap_obj_pool), M_NETMAP,
-	    M_WAITOK | M_ZERO);
-	if (p == NULL) {
-		D("Unable to create '%s' allocator", name);
-		return NULL;
-	}
 	/*
-	 * Allocate and initialize the lookup table.
-	 *
 	 * The number of clusters is n = ceil(objtotal/clustentries)
 	 * objtotal' = n * clustentries
 	 */
-	strncpy(p->name, name, sizeof(p->name));
 	p->clustentries = clustentries;
 	p->_clustsize = clustsize;
 	n = (objtotal + clustentries - 1) / clustentries;
@@ -444,55 +573,77 @@
 	p->_numclusters = n;
 	p->objtotal = n * clustentries;
 	p->objfree = p->objtotal - 2; /* obj 0 and 1 are reserved */
+	p->_memtotal = p->_numclusters * p->_clustsize;
 	p->_objsize = objsize;
-	p->_memtotal = p->_numclusters * p->_clustsize;
 
-	p->lut = malloc(sizeof(struct lut_entry) * p->objtotal,
-	    M_NETMAP, M_WAITOK | M_ZERO);
+	return 0;
+
+error:
+	p->_objsize = objsize;
+	p->objtotal = objtotal;
+
+	return EINVAL;
+}
+
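A worked example of the sizing logic above, assuming 4 KB pages: for the default 2048-byte buffers a single-page cluster already holds exactly 2 objects with no waste, so clustsize = PAGE_SIZE and clustentries = 2. For a 1536-byte object a single page would waste 1024 bytes, and the brute-force search can instead settle on a three-page cluster, which holds exactly 8 such objects (3 * 4096 = 8 * 1536) with no waste.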
+
+/* call with NMA_LOCK held */
+static int
+netmap_finalize_obj_allocator(struct netmap_obj_pool *p)
+{
+	int i, n;
+
+	n = sizeof(struct lut_entry) * p->objtotal;
+#ifdef linux
+	p->lut = vmalloc(n);
+#else
+	p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO);
+#endif
 	if (p->lut == NULL) {
-		D("Unable to create lookup table for '%s' allocator", name);
+		D("Unable to create lookup table (%d bytes) for '%s'", n, p->name);
 		goto clean;
 	}
 
 	/* Allocate the bitmap */
 	n = (p->objtotal + 31) / 32;
-	p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_WAITOK | M_ZERO);
+	p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_NOWAIT | M_ZERO);
 	if (p->bitmap == NULL) {
 		D("Unable to create bitmap (%d entries) for allocator '%s'", n,
-		    name);
+		    p->name);
 		goto clean;
 	}
+	p->bitmap_slots = n;
 
 	/*
 	 * Allocate clusters, init pointers and bitmap
 	 */
 	for (i = 0; i < p->objtotal;) {
-		int lim = i + clustentries;
+		int lim = i + p->clustentries;
 		char *clust;
 
-		clust = contigmalloc(clustsize, M_NETMAP, M_WAITOK | M_ZERO,
+		clust = contigmalloc(p->_clustsize, M_NETMAP, M_NOWAIT | M_ZERO,
 		    0, -1UL, PAGE_SIZE, 0);
 		if (clust == NULL) {
 			/*
 			 * If we get here, there is a severe memory shortage,
 			 * so halve the allocated memory to reclaim some.
+			 * XXX check boundaries
 			 */
 			D("Unable to create cluster at %d for '%s' allocator",
-			    i, name);
+			    i, p->name);
 			lim = i / 2;
-			for (; i >= lim; i--) {
+			for (i--; i >= lim; i--) {
 				p->bitmap[ (i>>5) ] &=  ~( 1 << (i & 31) );
-				if (i % clustentries == 0 && p->lut[i].vaddr)
+				if (i % p->clustentries == 0 && p->lut[i].vaddr)
 					contigfree(p->lut[i].vaddr,
 						p->_clustsize, M_NETMAP);
 			}
 			p->objtotal = i;
 			p->objfree = p->objtotal - 2;
-			p->_numclusters = i / clustentries;
+			p->_numclusters = i / p->clustentries;
 			p->_memtotal = p->_numclusters * p->_clustsize;
 			break;
 		}
-		for (; i < lim; i++, clust += objsize) {
+		for (; i < lim; i++, clust += p->_objsize) {
 			p->bitmap[ (i>>5) ] |=  ( 1 << (i & 31) );
 			p->lut[i].vaddr = clust;
 			p->lut[i].paddr = vtophys(clust);
@@ -499,85 +650,175 @@
 		}
 	}
 	p->bitmap[0] = ~3; /* objs 0 and 1 is always busy */
-	D("Pre-allocated %d clusters (%d/%dKB) for '%s'",
-	    p->_numclusters, p->_clustsize >> 10,
-	    p->_memtotal >> 10, name);
+	if (netmap_verbose)
+		D("Pre-allocated %d clusters (%d/%dKB) for '%s'",
+		    p->_numclusters, p->_clustsize >> 10,
+		    p->_memtotal >> 10, p->name);
 
-	return p;
+	return 0;
 
 clean:
-	netmap_destroy_obj_allocator(p);
-	return NULL;
+	netmap_reset_obj_allocator(p);
+	return ENOMEM;
 }
 
+/* call with lock held */
 static int
-netmap_memory_init(void)
+netmap_memory_config_changed(void)
 {
-	struct netmap_obj_pool *p;
+	int i;
 
-	nm_mem = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
-			      M_WAITOK | M_ZERO);
-	if (nm_mem == NULL)
-		goto clean;
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+		if (nm_mem.pools[i]._objsize != netmap_params[i].size ||
+		    nm_mem.pools[i].objtotal != netmap_params[i].num)
+		    return 1;
+	}
+	return 0;
+}
 
-	p = netmap_new_obj_allocator("netmap_if",
-	    NETMAP_IF_MAX_NUM, NETMAP_IF_MAX_SIZE);
-	if (p == NULL)
-		goto clean;
-	nm_mem->nm_if_pool = p;
 
-	p = netmap_new_obj_allocator("netmap_ring",
-	    NETMAP_RING_MAX_NUM, NETMAP_RING_MAX_SIZE);
-	if (p == NULL)
-		goto clean;
-	nm_mem->nm_ring_pool = p;
+/* call with lock held */
+static int
+netmap_memory_config(void)
+{
+	int i;
 
-	p = netmap_new_obj_allocator("netmap_buf",
-	    NETMAP_BUF_MAX_NUM, NETMAP_BUF_SIZE);
-	if (p == NULL)
-		goto clean;
-	netmap_total_buffers = p->objtotal;
-	netmap_buffer_lut = p->lut;
-	nm_mem->nm_buf_pool = p;
-	netmap_buffer_base = p->lut[0].vaddr;
 
-	mtx_init(&nm_mem->nm_mtx, "netmap memory allocator lock", NULL,
-		 MTX_DEF);
-	nm_mem->nm_totalsize =
-	    nm_mem->nm_if_pool->_memtotal +
-	    nm_mem->nm_ring_pool->_memtotal +
-	    nm_mem->nm_buf_pool->_memtotal;
+	if (!netmap_memory_config_changed())
+		goto out;
 
+	D("reconfiguring");
+
+	if (nm_mem.finalized) {
+		/* reset previous allocation */
+		for (i = 0; i < NETMAP_POOLS_NR; i++) {
+			netmap_reset_obj_allocator(&nm_mem.pools[i]);
+		}    
+		nm_mem.finalized = 0;
+        }
+
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+		nm_mem.lasterr = netmap_config_obj_allocator(&nm_mem.pools[i],
+				netmap_params[i].num, netmap_params[i].size);
+		if (nm_mem.lasterr)
+			goto out;
+	}
+
 	D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers",
-	    nm_mem->nm_if_pool->_memtotal >> 10,
-	    nm_mem->nm_ring_pool->_memtotal >> 10,
-	    nm_mem->nm_buf_pool->_memtotal >> 20);
-	return 0;
+	    nm_mem.pools[NETMAP_IF_POOL]._memtotal >> 10,
+	    nm_mem.pools[NETMAP_RING_POOL]._memtotal >> 10,
+	    nm_mem.pools[NETMAP_BUF_POOL]._memtotal >> 20);
 
-clean:
-	if (nm_mem) {
-		netmap_destroy_obj_allocator(nm_mem->nm_ring_pool);
-		netmap_destroy_obj_allocator(nm_mem->nm_if_pool);
-		free(nm_mem, M_NETMAP);
+out:
+
+	return nm_mem.lasterr;
+}
+
+/* call with lock held */
+static int
+netmap_memory_finalize(void)
+{
+	int i;
+	u_int totalsize = 0;
+
+	nm_mem.refcount++;
+	if (nm_mem.refcount > 1) {
+		ND("busy (refcount %d)", nm_mem.refcount);
+		goto out;
 	}
-	return ENOMEM;
+
+	/* update configuration if changed */
+	if (netmap_memory_config())
+		goto out;
+
+	if (nm_mem.finalized) {
+		/* may happen if config is not changed */
+		ND("nothing to do");
+		goto out;
+	}
+
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+		nm_mem.lasterr = netmap_finalize_obj_allocator(&nm_mem.pools[i]);
+		if (nm_mem.lasterr)
+			goto cleanup;
+		totalsize += nm_mem.pools[i]._memtotal;
+	}
+	nm_mem.nm_totalsize = totalsize;
+
+	/* backward compatibility */
+	netmap_buf_size = nm_mem.pools[NETMAP_BUF_POOL]._objsize;
+	netmap_total_buffers = nm_mem.pools[NETMAP_BUF_POOL].objtotal;
+
+	netmap_buffer_lut = nm_mem.pools[NETMAP_BUF_POOL].lut;
+	netmap_buffer_base = nm_mem.pools[NETMAP_BUF_POOL].lut[0].vaddr;
+
+	nm_mem.finalized = 1;
+	nm_mem.lasterr = 0;
+
+	/* make sysctl values match actual values in the pools */
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+		netmap_params[i].size = nm_mem.pools[i]._objsize;
+		netmap_params[i].num  = nm_mem.pools[i].objtotal;
+	}
+
+out:
+	if (nm_mem.lasterr)
+		nm_mem.refcount--;
+
+	return nm_mem.lasterr;
+
+cleanup:
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+		netmap_reset_obj_allocator(&nm_mem.pools[i]);
+	}
+	nm_mem.refcount--;
+
+	return nm_mem.lasterr;
 }
 
+static int
+netmap_memory_init(void)
+{
+	NMA_LOCK_INIT();
+	return (0);
+}
 
 static void
 netmap_memory_fini(void)
 {
-	if (!nm_mem)
+	int i;
+
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {
+	    netmap_destroy_obj_allocator(&nm_mem.pools[i]);
+	}
+	NMA_LOCK_DESTROY();
+}
+
+static void
+netmap_free_rings(struct netmap_adapter *na)
+{
+	int i;
+	if (!na->tx_rings)
 		return;
-	netmap_destroy_obj_allocator(nm_mem->nm_if_pool);
-	netmap_destroy_obj_allocator(nm_mem->nm_ring_pool);
-	netmap_destroy_obj_allocator(nm_mem->nm_buf_pool);
-	mtx_destroy(&nm_mem->nm_mtx);
-	free(nm_mem, M_NETMAP);
+	for (i = 0; i < na->num_tx_rings + 1; i++) {
+		netmap_ring_free(na->tx_rings[i].ring);
+		na->tx_rings[i].ring = NULL;
+	}
+	for (i = 0; i < na->num_rx_rings + 1; i++) {
+		netmap_ring_free(na->rx_rings[i].ring);
+		na->rx_rings[i].ring = NULL;
+	}
+	free(na->tx_rings, M_DEVBUF);
+	na->tx_rings = na->rx_rings = NULL;
 }
 
 
 
+/* call with NMA_LOCK held */
+/*
+ * Allocate the per-fd structure netmap_if.
+ * If this is the first instance, also allocate the krings, rings etc.
+ */
 static void *
 netmap_if_new(const char *ifname, struct netmap_adapter *na)
 {
@@ -584,12 +825,15 @@
 	struct netmap_if *nifp;
 	struct netmap_ring *ring;
 	ssize_t base; /* handy for relative offsets between rings and nifp */
-	u_int i, len, ndesc;
-	u_int ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */
-	u_int nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */
+	u_int i, len, ndesc, ntx, nrx;
 	struct netmap_kring *kring;
 
-	NMA_LOCK();
+	if (netmap_update_config(na)) {
+		/* configuration mismatch, report and fail */
+		return NULL;
+	}
+	ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */
+	nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */
 	/*
 	 * the descriptor is followed inline by an array of offsets
 	 * to the tx and rx rings in the shared memory region.
@@ -597,7 +841,6 @@
 	len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t);
 	nifp = netmap_if_malloc(len);
 	if (nifp == NULL) {
-		NMA_UNLOCK();
 		return NULL;
 	}
 
@@ -608,10 +851,17 @@
 
 	(na->refcount)++;	/* XXX atomic ? we are under lock */
 	if (na->refcount > 1) { /* already setup, we are done */
-		NMA_UNLOCK();
 		goto final;
 	}
 
+	len = (ntx + nrx) * sizeof(struct netmap_kring);
+	na->tx_rings = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (na->tx_rings == NULL) {
+		D("Cannot allocate krings for %s", ifname);
+		goto cleanup;
+	}
+	na->rx_rings = na->tx_rings + ntx;
+
 	/*
 	 * First instance, allocate netmap rings and buffers for this card
 	 * The rings are contiguous, but have variable size.
@@ -632,8 +882,8 @@
 		kring->ring = ring;
 		*(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc;
 		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
-		    (nm_mem->nm_if_pool->_memtotal +
-			nm_mem->nm_ring_pool->_memtotal) -
+		    (nm_mem.pools[NETMAP_IF_POOL]._memtotal +
+			nm_mem.pools[NETMAP_RING_POOL]._memtotal) -
 			netmap_ring_offset(ring);
 
 		/*
@@ -646,7 +896,10 @@
 		ring->cur = kring->nr_hwcur = 0;
 		*(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
 		ND("initializing slots for txring[%d]", i);
-		netmap_new_bufs(nifp, ring->slot, ndesc);
+		if (netmap_new_bufs(nifp, ring->slot, ndesc)) {
+			D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname);
+			goto cleanup;
+		}
 	}
 
 	for (i = 0; i < nrx; i++) { /* Receive rings */
@@ -666,8 +919,8 @@
 		kring->ring = ring;
 		*(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc;
 		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
-		    (nm_mem->nm_if_pool->_memtotal +
-		        nm_mem->nm_ring_pool->_memtotal) -
+		    (nm_mem.pools[NETMAP_IF_POOL]._memtotal +
+		        nm_mem.pools[NETMAP_RING_POOL]._memtotal) -
 			netmap_ring_offset(ring);
 
 		ring->cur = kring->nr_hwcur = 0;
@@ -674,17 +927,19 @@
 		ring->avail = kring->nr_hwavail = 0; /* empty */
 		*(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
 		ND("initializing slots for rxring[%d]", i);
-		netmap_new_bufs(nifp, ring->slot, ndesc);
+		if (netmap_new_bufs(nifp, ring->slot, ndesc)) {
+			D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname);
+			goto cleanup;
+		}
 	}
-	NMA_UNLOCK();
 #ifdef linux
 	// XXX initialize the selrecord structs.
 	for (i = 0; i < ntx; i++)
+		init_waitqueue_head(&na->tx_rings[i].si);
+	for (i = 0; i < nrx; i++)
 		init_waitqueue_head(&na->rx_rings[i].si);
-	for (i = 0; i < nrx; i++)
-		init_waitqueue_head(&na->tx_rings[i].si);
+	init_waitqueue_head(&na->tx_si);
 	init_waitqueue_head(&na->rx_si);
-	init_waitqueue_head(&na->tx_si);
 #endif
 final:
 	/*
@@ -703,19 +958,17 @@
 	}
 	return (nifp);
 cleanup:
-	// XXX missing
-	NMA_UNLOCK();
+	netmap_free_rings(na);
+	netmap_if_free(nifp);
+	(na->refcount)--;
 	return NULL;
 }
 
+/* call with NMA_LOCK held */
 static void
-netmap_free_rings(struct netmap_adapter *na)
+netmap_memory_deref(void)
 {
-	int i;
-	for (i = 0; i < na->num_tx_rings + 1; i++)
-		netmap_obj_free_va(nm_mem->nm_ring_pool,
-			na->tx_rings[i].ring);
-	for (i = 0; i < na->num_rx_rings + 1; i++)
-		netmap_obj_free_va(nm_mem->nm_ring_pool,
-			na->rx_rings[i].ring);
+	nm_mem.refcount--;
+	if (netmap_verbose)
+		D("refcount = %d", nm_mem.refcount);
 }
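
The allocator rework above replaces the old malloc'ed nm_mem with a single
static instance whose pools are (re)configured on demand: netmap_memory_config()
rebuilds a pool only when the sysctl-backed netmap_params differ from the live
configuration, and netmap_memory_finalize()/netmap_memory_deref() act as a
reference-counted get/put pair, both expected to run with the allocator lock
held.  A minimal sketch of a hypothetical caller, assuming the NMA_LOCK()/
NMA_UNLOCK() macros seen elsewhere in this file (the function names below are
invented for illustration; the real call sites are elsewhere in the netmap
code and not part of this hunk):

	static int
	example_get_memory(void)
	{
		int error;

		NMA_LOCK();
		error = netmap_memory_finalize();	/* (re)configure pools, refcount++ */
		NMA_UNLOCK();
		return (error);
	}

	static void
	example_put_memory(void)
	{
		NMA_LOCK();
		netmap_memory_deref();			/* refcount-- */
		NMA_UNLOCK();
	}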

Modified: trunk/sys/net/netmap.h
===================================================================
--- trunk/sys/net/netmap.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/net/netmap.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -113,15 +113,28 @@
  * In the kernel, buffers do not necessarily need to be contiguous,
  * and the virtual and physical addresses are derived through
  * a lookup table.
- * To associate a different buffer to a slot, applications must
- * write the new index in buf_idx, and set NS_BUF_CHANGED flag to
- * make sure that the kernel updates the hardware ring as needed.
  *
- * Normally the driver is not requested to report the result of
- * transmissions (this can dramatically speed up operation).
- * However the user may request to report completion by setting
- * NS_REPORT.
+ * struct netmap_slot:
+ *
+ * buf_idx	is the index of the buffer associated to the slot.
+ * len		is the length of the payload
+ * NS_BUF_CHANGED	must be set whenever userspace wants
+ *		to change buf_idx (it might be necessary to
+ *		reprogram the NIC slot)
+ * NS_REPORT	must be set if we want the NIC to generate an interrupt
+ *		when this slot is used. Leaving it at 0 improves
+ *		performance.
+ * NS_FORWARD	if set on a receive ring, and the device is in
+ *		transparent mode, buffers released with the flag set
+ *		will be forwarded to the 'other' side (host stack
+ *		or NIC, respectively) on the next select() or ioctl()
+ * NS_NO_LEARN	on a VALE switch, do not 'learn' the source port for
+ *		this packet.
+ * NS_PORT_MASK	the high 8 bits of the flag, if not zero, indicate the
+ *		destination port for the VALE switch, overriding
+ *		the lookup table.
  */
+
 struct netmap_slot {
 	uint32_t buf_idx; /* buffer index */
 	uint16_t len;	/* packet length, to be copied to/from the hw ring */
@@ -130,6 +143,9 @@
 #define	NS_REPORT	0x0002	/* ask the hardware to report results
 				 * e.g. by generating an interrupt
 				 */
+#define	NS_FORWARD	0x0004	/* pass packet to the other endpoint
+				 * (host stack or device)
+				 */
 };
 
 /*
@@ -186,6 +202,18 @@
  *	a system call.
  *
  *	The netmap_kring is only modified by the upper half of the kernel.
+ *
+ * FLAGS
+ *	NR_TIMESTAMP	updates the 'ts' field on each syscall. This is
+ *			a global timestamp for all packets.
+ *	NR_RX_TSTMP	if set, the last 64 bytes in each buffer will
+ *			contain a timestamp for the frame supplied by
+ *			the hardware (if supported)
+ *	NR_FORWARD	if set, the NS_FORWARD flag in each slot of the
+ *			RX ring is checked, and if set the packet is
+ *			passed to the other side (host stack or device,
+ *			respectively). This permits bpf-like behaviour
+ *			or transparency for selected packets.
  */
 struct netmap_ring {
 	/*
@@ -202,6 +230,8 @@
 	const uint16_t	nr_buf_size;
 	uint16_t	flags;
 #define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
+#define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
+#define	NR_RX_TSTMP	0x0008		/* set rx timestamp in slots */
 
 	struct timeval	ts;		/* time of last *sync() */
 

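To make the expanded slot and ring flag documentation concrete, a minimal
userspace sketch (txring, rxring, spare_idx and pkt_len are hypothetical
application-side variables, not part of this change):

	struct netmap_slot *slot = &txring->slot[txring->cur];

	slot->buf_idx = spare_idx;			/* attach an application-owned buffer */
	slot->len = pkt_len;				/* payload length */
	slot->flags = NS_BUF_CHANGED | NS_REPORT;	/* reprogram the NIC slot, ask for an interrupt */

	rxring->flags |= NR_FORWARD;	/* honour NS_FORWARD on slots released from this RX ring */
	rxring->flags |= NR_TIMESTAMP;	/* refresh rxring->ts on each *sync() */
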
Modified: trunk/sys/net/netmap_user.h
===================================================================
--- trunk/sys/net/netmap_user.h	2016-09-25 22:40:55 UTC (rev 8727)
+++ trunk/sys/net/netmap_user.h	2016-09-25 23:53:30 UTC (rev 8728)
@@ -62,16 +62,17 @@
 #ifndef _NET_NETMAP_USER_H_
 #define _NET_NETMAP_USER_H_
 
-#define NETMAP_IF(b, o)	(struct netmap_if *)((char *)(b) + (o))
+#define _NETMAP_OFFSET(type, ptr, offset) \
+	((type)(void *)((char *)(ptr) + (offset)))
 
-#define NETMAP_TXRING(nifp, index)			\
-	((struct netmap_ring *)((char *)(nifp) +	\
-		(nifp)->ring_ofs[index] ) )
+#define NETMAP_IF(b, o)	_NETMAP_OFFSET(struct netmap_if *, b, o)
 
-#define NETMAP_RXRING(nifp, index)			\
-	((struct netmap_ring *)((char *)(nifp) +	\
-	    (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] ) )
+#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
+	nifp, (nifp)->ring_ofs[index] )
 
+#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *,	\
+	nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] )
+
 #define NETMAP_BUF(ring, index)				\
 	((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size))
 

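For reference, a minimal userspace sketch of the rewritten accessors above
(mem, req and consume() are hypothetical application-side names; the usual
open()/NIOCREGIF/mmap() setup is assumed and not shown):

	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);	/* first RX ring */

	if (rxring->avail > 0) {
		struct netmap_slot *slot = &rxring->slot[rxring->cur];
		char *payload = NETMAP_BUF(rxring, slot->buf_idx);

		consume(payload, slot->len);	/* hypothetical consumer */
		rxring->cur = (rxring->cur + 1 == rxring->num_slots) ?
		    0 : rxring->cur + 1;
		rxring->avail--;
	}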

