[Midnightbsd-cvs] src [7911] U trunk/sys/ofed/include/rdma: sync with FreeBSD rev 244242

laffer1 at midnightbsd.org
Wed Sep 14 15:35:22 EDT 2016


Revision: 7911
          http://svnweb.midnightbsd.org/src/?rev=7911
Author:   laffer1
Date:     2016-09-14 15:35:22 -0400 (Wed, 14 Sep 2016)
Log Message:
-----------
sync with FreeBSD rev 244242

Revision Links:
--------------
    http://svnweb.midnightbsd.org/src/?rev=244242

Modified Paths:
--------------
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/user.h
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
    trunk/sys/ofed/drivers/infiniband/util/madeye.c
    trunk/sys/ofed/drivers/net/mlx4/Makefile
    trunk/sys/ofed/drivers/net/mlx4/alloc.c
    trunk/sys/ofed/drivers/net/mlx4/catas.c
    trunk/sys/ofed/drivers/net/mlx4/cmd.c
    trunk/sys/ofed/drivers/net/mlx4/cq.c
    trunk/sys/ofed/drivers/net/mlx4/en_cq.c
    trunk/sys/ofed/drivers/net/mlx4/en_main.c
    trunk/sys/ofed/drivers/net/mlx4/en_netdev.c
    trunk/sys/ofed/drivers/net/mlx4/en_port.c
    trunk/sys/ofed/drivers/net/mlx4/en_port.h
    trunk/sys/ofed/drivers/net/mlx4/en_resources.c
    trunk/sys/ofed/drivers/net/mlx4/en_rx.c
    trunk/sys/ofed/drivers/net/mlx4/en_tx.c
    trunk/sys/ofed/drivers/net/mlx4/eq.c
    trunk/sys/ofed/drivers/net/mlx4/fw.c
    trunk/sys/ofed/drivers/net/mlx4/fw.h
    trunk/sys/ofed/drivers/net/mlx4/icm.c
    trunk/sys/ofed/drivers/net/mlx4/icm.h
    trunk/sys/ofed/drivers/net/mlx4/intf.c
    trunk/sys/ofed/drivers/net/mlx4/main.c
    trunk/sys/ofed/drivers/net/mlx4/mcg.c
    trunk/sys/ofed/drivers/net/mlx4/mlx4.h
    trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h
    trunk/sys/ofed/drivers/net/mlx4/mr.c
    trunk/sys/ofed/drivers/net/mlx4/pd.c
    trunk/sys/ofed/drivers/net/mlx4/port.c
    trunk/sys/ofed/drivers/net/mlx4/profile.c
    trunk/sys/ofed/drivers/net/mlx4/qp.c
    trunk/sys/ofed/drivers/net/mlx4/reset.c
    trunk/sys/ofed/drivers/net/mlx4/sense.c
    trunk/sys/ofed/drivers/net/mlx4/srq.c
    trunk/sys/ofed/include/asm/atomic-long.h
    trunk/sys/ofed/include/asm/atomic.h
    trunk/sys/ofed/include/asm/byteorder.h
    trunk/sys/ofed/include/asm/fcntl.h
    trunk/sys/ofed/include/asm/io.h
    trunk/sys/ofed/include/asm/pgtable.h
    trunk/sys/ofed/include/asm/types.h
    trunk/sys/ofed/include/asm/uaccess.h
    trunk/sys/ofed/include/linux/bitops.h
    trunk/sys/ofed/include/linux/cdev.h
    trunk/sys/ofed/include/linux/compat.h
    trunk/sys/ofed/include/linux/compiler.h
    trunk/sys/ofed/include/linux/completion.h
    trunk/sys/ofed/include/linux/delay.h
    trunk/sys/ofed/include/linux/device.h
    trunk/sys/ofed/include/linux/dma-attrs.h
    trunk/sys/ofed/include/linux/dma-mapping.h
    trunk/sys/ofed/include/linux/dmapool.h
    trunk/sys/ofed/include/linux/err.h
    trunk/sys/ofed/include/linux/errno.h
    trunk/sys/ofed/include/linux/file.h
    trunk/sys/ofed/include/linux/fs.h
    trunk/sys/ofed/include/linux/gfp.h
    trunk/sys/ofed/include/linux/hardirq.h
    trunk/sys/ofed/include/linux/idr.h
    trunk/sys/ofed/include/linux/if_arp.h
    trunk/sys/ofed/include/linux/if_ether.h
    trunk/sys/ofed/include/linux/if_vlan.h
    trunk/sys/ofed/include/linux/in.h
    trunk/sys/ofed/include/linux/in6.h
    trunk/sys/ofed/include/linux/inetdevice.h
    trunk/sys/ofed/include/linux/interrupt.h
    trunk/sys/ofed/include/linux/io-mapping.h
    trunk/sys/ofed/include/linux/io.h
    trunk/sys/ofed/include/linux/ioctl.h
    trunk/sys/ofed/include/linux/jiffies.h
    trunk/sys/ofed/include/linux/kdev_t.h
    trunk/sys/ofed/include/linux/kernel.h
    trunk/sys/ofed/include/linux/kobject.h
    trunk/sys/ofed/include/linux/kref.h
    trunk/sys/ofed/include/linux/kthread.h
    trunk/sys/ofed/include/linux/linux_compat.c
    trunk/sys/ofed/include/linux/linux_idr.c
    trunk/sys/ofed/include/linux/linux_radix.c
    trunk/sys/ofed/include/linux/list.h
    trunk/sys/ofed/include/linux/lockdep.h
    trunk/sys/ofed/include/linux/log2.h
    trunk/sys/ofed/include/linux/miscdevice.h
    trunk/sys/ofed/include/linux/mlx4/cmd.h
    trunk/sys/ofed/include/linux/mlx4/cq.h
    trunk/sys/ofed/include/linux/mlx4/device.h
    trunk/sys/ofed/include/linux/mlx4/doorbell.h
    trunk/sys/ofed/include/linux/mlx4/driver.h
    trunk/sys/ofed/include/linux/mlx4/qp.h
    trunk/sys/ofed/include/linux/mlx4/srq.h
    trunk/sys/ofed/include/linux/mm.h
    trunk/sys/ofed/include/linux/module.h
    trunk/sys/ofed/include/linux/moduleparam.h
    trunk/sys/ofed/include/linux/mutex.h
    trunk/sys/ofed/include/linux/net.h
    trunk/sys/ofed/include/linux/netdevice.h
    trunk/sys/ofed/include/linux/notifier.h
    trunk/sys/ofed/include/linux/page.h
    trunk/sys/ofed/include/linux/pci.h
    trunk/sys/ofed/include/linux/poll.h
    trunk/sys/ofed/include/linux/radix-tree.h
    trunk/sys/ofed/include/linux/random.h
    trunk/sys/ofed/include/linux/rbtree.h
    trunk/sys/ofed/include/linux/rwlock.h
    trunk/sys/ofed/include/linux/rwsem.h
    trunk/sys/ofed/include/linux/scatterlist.h
    trunk/sys/ofed/include/linux/sched.h
    trunk/sys/ofed/include/linux/semaphore.h
    trunk/sys/ofed/include/linux/slab.h
    trunk/sys/ofed/include/linux/socket.h
    trunk/sys/ofed/include/linux/spinlock.h
    trunk/sys/ofed/include/linux/string.h
    trunk/sys/ofed/include/linux/sysfs.h
    trunk/sys/ofed/include/linux/timer.h
    trunk/sys/ofed/include/linux/types.h
    trunk/sys/ofed/include/linux/uaccess.h
    trunk/sys/ofed/include/linux/vmalloc.h
    trunk/sys/ofed/include/linux/wait.h
    trunk/sys/ofed/include/linux/workqueue.h
    trunk/sys/ofed/include/net/ip.h
    trunk/sys/ofed/include/net/ipv6.h
    trunk/sys/ofed/include/net/netevent.h
    trunk/sys/ofed/include/net/tcp.h
    trunk/sys/ofed/include/rdma/ib_addr.h
    trunk/sys/ofed/include/rdma/ib_cm.h
    trunk/sys/ofed/include/rdma/ib_mad.h
    trunk/sys/ofed/include/rdma/ib_sa.h
    trunk/sys/ofed/include/rdma/ib_smi.h
    trunk/sys/ofed/include/rdma/ib_umem.h
    trunk/sys/ofed/include/rdma/ib_user_cm.h
    trunk/sys/ofed/include/rdma/ib_user_verbs.h
    trunk/sys/ofed/include/rdma/ib_verbs.h
    trunk/sys/ofed/include/rdma/iw_cm.h
    trunk/sys/ofed/include/rdma/sdp_socket.h

Added Paths:
-----------
    trunk/sys/ofed/drivers/infiniband/core/
    trunk/sys/ofed/drivers/infiniband/core/Makefile
    trunk/sys/ofed/drivers/infiniband/core/addr.c
    trunk/sys/ofed/drivers/infiniband/core/agent.c
    trunk/sys/ofed/drivers/infiniband/core/agent.h
    trunk/sys/ofed/drivers/infiniband/core/cache.c
    trunk/sys/ofed/drivers/infiniband/core/cm.c
    trunk/sys/ofed/drivers/infiniband/core/cm_msgs.h
    trunk/sys/ofed/drivers/infiniband/core/cma.c
    trunk/sys/ofed/drivers/infiniband/core/core_priv.h
    trunk/sys/ofed/drivers/infiniband/core/device.c
    trunk/sys/ofed/drivers/infiniband/core/fmr_pool.c
    trunk/sys/ofed/drivers/infiniband/core/iwcm.c
    trunk/sys/ofed/drivers/infiniband/core/iwcm.h
    trunk/sys/ofed/drivers/infiniband/core/local_sa.c
    trunk/sys/ofed/drivers/infiniband/core/mad.c
    trunk/sys/ofed/drivers/infiniband/core/mad_priv.h
    trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.c
    trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.h
    trunk/sys/ofed/drivers/infiniband/core/multicast.c
    trunk/sys/ofed/drivers/infiniband/core/notice.c
    trunk/sys/ofed/drivers/infiniband/core/packer.c
    trunk/sys/ofed/drivers/infiniband/core/sa.h
    trunk/sys/ofed/drivers/infiniband/core/sa_query.c
    trunk/sys/ofed/drivers/infiniband/core/smi.c
    trunk/sys/ofed/drivers/infiniband/core/smi.h
    trunk/sys/ofed/drivers/infiniband/core/sysfs.c
    trunk/sys/ofed/drivers/infiniband/core/ucm.c
    trunk/sys/ofed/drivers/infiniband/core/ucma.c
    trunk/sys/ofed/drivers/infiniband/core/ud_header.c
    trunk/sys/ofed/drivers/infiniband/core/umem.c
    trunk/sys/ofed/drivers/infiniband/core/user_mad.c
    trunk/sys/ofed/drivers/infiniband/core/uverbs.h
    trunk/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
    trunk/sys/ofed/drivers/infiniband/core/uverbs_main.c
    trunk/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
    trunk/sys/ofed/drivers/infiniband/core/verbs.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
    trunk/sys/ofed/drivers/net/mlx4/mlx4_stats.h
    trunk/sys/ofed/drivers/net/mlx4/resource_tracker.c
    trunk/sys/ofed/drivers/net/mlx4/sys_tune.c
    trunk/sys/ofed/drivers/net/mlx4/utils.c
    trunk/sys/ofed/drivers/net/mlx4/utils.h
    trunk/sys/ofed/include/linux/cache.h
    trunk/sys/ofed/include/linux/clocksource.h
    trunk/sys/ofed/include/linux/etherdevice.h
    trunk/sys/ofed/include/linux/kmod.h
    trunk/sys/ofed/include/linux/ktime.h
    trunk/sys/ofed/include/linux/math64.h
    trunk/sys/ofed/include/linux/printk.h
    trunk/sys/ofed/include/net/if_inet6.h
    trunk/sys/ofed/include/rdma/ib_pma.h

Property Changed:
----------------
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/user.h
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
    trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/Kconfig
    trunk/sys/ofed/drivers/infiniband/hw/mthca/Makefile
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h
    trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
    trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
    trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h
    trunk/sys/ofed/drivers/net/mlx4/en_port.h
    trunk/sys/ofed/drivers/net/mlx4/fw.h
    trunk/sys/ofed/drivers/net/mlx4/icm.h
    trunk/sys/ofed/drivers/net/mlx4/mlx4.h
    trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h
    trunk/sys/ofed/include/asm/atomic-long.h
    trunk/sys/ofed/include/asm/atomic.h
    trunk/sys/ofed/include/asm/byteorder.h
    trunk/sys/ofed/include/asm/current.h
    trunk/sys/ofed/include/asm/fcntl.h
    trunk/sys/ofed/include/asm/io.h
    trunk/sys/ofed/include/asm/page.h
    trunk/sys/ofed/include/asm/pgtable.h
    trunk/sys/ofed/include/asm/semaphore.h
    trunk/sys/ofed/include/asm/system.h
    trunk/sys/ofed/include/asm/types.h
    trunk/sys/ofed/include/asm/uaccess.h
    trunk/sys/ofed/include/linux/linux_compat.c
    trunk/sys/ofed/include/linux/linux_idr.c
    trunk/sys/ofed/include/linux/linux_radix.c
    trunk/sys/ofed/include/linux/mlx4/cmd.h
    trunk/sys/ofed/include/linux/mlx4/cq.h
    trunk/sys/ofed/include/linux/mlx4/device.h
    trunk/sys/ofed/include/linux/mlx4/doorbell.h
    trunk/sys/ofed/include/linux/mlx4/driver.h
    trunk/sys/ofed/include/linux/mlx4/qp.h
    trunk/sys/ofed/include/linux/mlx4/srq.h
    trunk/sys/ofed/include/rdma/Kbuild
    trunk/sys/ofed/include/rdma/ib_addr.h
    trunk/sys/ofed/include/rdma/ib_cache.h
    trunk/sys/ofed/include/rdma/ib_cm.h
    trunk/sys/ofed/include/rdma/ib_fmr_pool.h
    trunk/sys/ofed/include/rdma/ib_mad.h
    trunk/sys/ofed/include/rdma/ib_marshall.h
    trunk/sys/ofed/include/rdma/ib_pack.h
    trunk/sys/ofed/include/rdma/ib_sa.h
    trunk/sys/ofed/include/rdma/ib_smi.h
    trunk/sys/ofed/include/rdma/ib_umem.h
    trunk/sys/ofed/include/rdma/ib_user_cm.h
    trunk/sys/ofed/include/rdma/ib_user_mad.h
    trunk/sys/ofed/include/rdma/ib_user_sa.h
    trunk/sys/ofed/include/rdma/ib_user_verbs.h
    trunk/sys/ofed/include/rdma/ib_verbs.h
    trunk/sys/ofed/include/rdma/iw_cm.h
    trunk/sys/ofed/include/rdma/rdma_cm.h
    trunk/sys/ofed/include/rdma/rdma_cm_ib.h
    trunk/sys/ofed/include/rdma/rdma_user_cm.h
    trunk/sys/ofed/include/rdma/sdp_socket.h

Added: trunk/sys/ofed/drivers/infiniband/core/Makefile
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/Makefile	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/Makefile	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,32 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= ib_addr.o rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_ucm.o
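+# kbuild: obj-$(CONFIG_FOO) expands to obj-y (built-in) or obj-m (module),
+# and the two lines above resolve to infiniband-y/user_access-y only when
+# CONFIG_INFINIBAND_ADDR_TRANS is set, feeding $(infiniband-y) below.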
+
+obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_sa.o \
+					ib_cm.o iw_cm.o $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
+					$(user_access-y)
+
+ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+				device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+
+ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y :=			sa_query.o multicast.o notice.o local_sa.o
+
+ib_cm-y :=			cm.o
+
+iw_cm-y :=			iwcm.o
+
+rdma_cm-y :=			cma.o
+
+rdma_ucm-y :=			ucma.o
+
+ib_addr-y :=			addr.o
+
+ib_umad-y :=			user_mad.o
+
+ib_ucm-y :=			ucm.o
+
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o


Property changes on: trunk/sys/ofed/drivers/infiniband/core/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/addr.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/addr.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/addr.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <rdma/ib_addr.h>
+#include <netinet/if_ether.h>
+
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {
+	struct list_head list;
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+	struct rdma_dev_addr *addr;
+	struct rdma_addr_client *client;
+	void *context;
+	void (*callback)(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *addr, void *context);
+	unsigned long timeout;
+	int status;
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static struct delayed_work work;
+static struct workqueue_struct *addr_wq;
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+	atomic_set(&client->refcount, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+	if (atomic_dec_and_test(&client->refcount))
+		complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+	put_client(client);
+	wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+#ifdef __linux__
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+		     const unsigned char *dst_dev_addr)
+{
+	dev_addr->dev_type = dev->type;
+	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+	if (dst_dev_addr)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	dev_addr->bound_dev_if = dev->ifindex;
+	return 0;
+}
+#else
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+		     const unsigned char *dst_dev_addr)
+{
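+	/*
+	 * Map the FreeBSD interface type onto the Linux ARPHRD_* constants
+	 * the OFED core expects; unknown types fall back to 0.
+	 */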
+	if (dev->if_type == IFT_INFINIBAND)
+		dev_addr->dev_type = ARPHRD_INFINIBAND;
+	else if (dev->if_type == IFT_ETHER)
+		dev_addr->dev_type = ARPHRD_ETHER;
+	else
+		dev_addr->dev_type = 0;
+	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
+	memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr),
+	    dev->if_addrlen);
+	if (dst_dev_addr)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen);
+	dev_addr->bound_dev_if = dev->if_index;
+	return 0;
+}
+#endif
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+	struct net_device *dev;
+	int ret = -EADDRNOTAVAIL;
+
+	if (dev_addr->bound_dev_if) {
+		dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		if (!dev)
+			return -ENODEV;
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		dev_put(dev);
+		return ret;
+	}
+
+	switch (addr->sa_family) {
+#ifdef INET
+	case AF_INET:
+		dev = ip_dev_find(NULL,
+			((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+		if (!dev)
+			return ret;
+
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		dev_put(dev);
+		break;
+#endif
+
+#if defined(INET6)
+	case AF_INET6:
+#ifdef __linux__
+		read_lock(&dev_base_lock);
+		for_each_netdev(&init_net, dev) {
+			if (ipv6_chk_addr(&init_net,
+					  &((struct sockaddr_in6 *) addr)->sin6_addr,
+					  dev, 1)) {
+				ret = rdma_copy_addr(dev_addr, dev, NULL);
+				break;
+			}
+		}
+		read_unlock(&dev_base_lock);
+#else
+		{
+			struct sockaddr_in6 *sin6;
+			struct ifaddr *ifa;
+			in_port_t port;
+
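+			/*
+			 * ifa_ifwithaddr() compares the whole sockaddr, so
+			 * clear the port for the lookup and restore it after.
+			 */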
+			sin6 = (struct sockaddr_in6 *)addr;
+			port = sin6->sin6_port;
+			sin6->sin6_port = 0;
+			ifa = ifa_ifwithaddr(addr);
+			sin6->sin6_port = port;
+			if (ifa == NULL) {
+				ret = -ENODEV;
+				break;
+			}
+			ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+			ifa_free(ifa);
+			break;
+		}
+#endif
+		break;
+#endif
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{
+	unsigned long delay;
+
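+	/*
+	 * The unsigned subtraction plus signed test copes with jiffies
+	 * wraparound: a deadline at or before now yields a non-positive
+	 * delta, which is clamped to the minimum delay of one tick.
+	 */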
+	delay = time - jiffies;
+	if ((long)delay <= 0)
+		delay = 1;
+
+	mod_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+	struct addr_req *temp_req;
+
+	mutex_lock(&lock);
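+	/*
+	 * req_list is kept sorted by timeout; walk it backwards to find
+	 * the last entry that expires no later than the new request.
+	 */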
+	list_for_each_entry_reverse(temp_req, &req_list, list) {
+		if (time_after_eq(req->timeout, temp_req->timeout))
+			break;
+	}
+
+	list_add(&req->list, &temp_req->list);
+
+	if (req_list.next == &req->list)
+		set_timeout(req->timeout);
+	mutex_unlock(&lock);
+}
+
+#ifdef __linux__
+static int addr4_resolve(struct sockaddr_in *src_in,
+			 struct sockaddr_in *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	__be32 src_ip = src_in->sin_addr.s_addr;
+	__be32 dst_ip = dst_in->sin_addr.s_addr;
+	struct flowi fl;
+	struct rtable *rt;
+	struct neighbour *neigh;
+	int ret;
+
+	memset(&fl, 0, sizeof fl);
+	fl.nl_u.ip4_u.daddr = dst_ip;
+	fl.nl_u.ip4_u.saddr = src_ip;
+	fl.oif = addr->bound_dev_if;
+
+	ret = ip_route_output_key(&init_net, &rt, &fl);
+	if (ret)
+		goto out;
+
+	src_in->sin_family = AF_INET;
+	src_in->sin_addr.s_addr = rt->rt_src;
+
+	if (rt->idev->dev->flags & IFF_LOOPBACK) {
+		ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+		if (!ret)
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (rt->idev->dev->flags & IFF_NOARP) {
+		rdma_copy_addr(addr, rt->idev->dev, NULL);
+		goto put;
+	}
+
+	neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+	if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+		neigh_event_send(rt->u.dst.neighbour, NULL);
+		ret = -ENODATA;
+		if (neigh)
+			goto release;
+		goto put;
+	}
+
+	ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
+release:
+	neigh_release(neigh);
+put:
+	ip_rt_put(rt);
+out:
+	return ret;
+}
+
+#if defined(INET6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	struct flowi fl;
+	struct neighbour *neigh;
+	struct dst_entry *dst;
+	int ret;
+
+	memset(&fl, 0, sizeof fl);
+	ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr);
+	ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr);
+	fl.oif = addr->bound_dev_if;
+
+	dst = ip6_route_output(&init_net, NULL, &fl);
+	if ((ret = dst->error))
+		goto put;
+
+	if (ipv6_addr_any(&fl.fl6_src)) {
+		ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+					 &fl.fl6_dst, 0, &fl.fl6_src);
+		if (ret)
+			goto put;
+
+		src_in->sin6_family = AF_INET6;
+		ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src);
+	}
+
+	if (dst->dev->flags & IFF_LOOPBACK) {
+		ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+		if (!ret)
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (dst->dev->flags & IFF_NOARP) {
+		ret = rdma_copy_addr(addr, dst->dev, NULL);
+		goto put;
+	}
+	
+	neigh = dst->neighbour;
+	if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+		neigh_event_send(dst->neighbour, NULL);
+		ret = -ENODATA;
+		goto put;
+	}
+
+	ret = rdma_copy_addr(addr, dst->dev, neigh->ha);
+put:
+	dst_release(dst);
+	return ret;
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	return -EADDRNOTAVAIL;
+}
+#endif
+
+#else
+#include <netinet/if_ether.h>
+
+static int addr_resolve(struct sockaddr *src_in,
+			struct sockaddr *dst_in,
+			struct rdma_dev_addr *addr)
+{
+	struct sockaddr_in *sin;
+	struct sockaddr_in6 *sin6;
+	struct ifaddr *ifa;
+	struct ifnet *ifp;
+#if defined(INET) || defined(INET6)
+	struct llentry *lle;
+#endif
+	struct rtentry *rte;
+	in_port_t port;
+	u_char edst[MAX_ADDR_LEN];
+	int multi;
+	int bcast;
+	int error = 0;
+
+	/*
+	 * Determine whether the address is unicast, multicast, or broadcast
+	 * and whether the source interface is valid.
+	 */
+	multi = 0;
+	bcast = 0;
+	sin = NULL;
+	sin6 = NULL;
+	ifp = NULL;
+	rte = NULL;
+	switch (dst_in->sa_family) {
+#ifdef INET
+	case AF_INET:
+		sin = (struct sockaddr_in *)dst_in;
+		if (sin->sin_addr.s_addr == INADDR_BROADCAST)
+			bcast = 1;
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+			multi = 1;
+		sin = (struct sockaddr_in *)src_in;
+		if (sin->sin_addr.s_addr != INADDR_ANY) {
+			/*
+			 * Address comparison fails if the port is set;
+			 * cache it here so it can be restored later.
+			 */
+			port = sin->sin_port;
+			sin->sin_port = 0;
+			memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		} else
+			src_in = NULL; 
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		sin6 = (struct sockaddr_in6 *)dst_in;
+		if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+			multi = 1;
+		sin6 = (struct sockaddr_in6 *)src_in;
+		if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+			port = sin6->sin6_port;
+			sin6->sin6_port = 0;
+		} else
+			src_in = NULL;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+	/*
+	 * If we have a source address to use look it up first and verify
+	 * that it is a local interface.
+	 */
+	if (src_in) {
+		ifa = ifa_ifwithaddr(src_in);
+		if (sin)
+			sin->sin_port = port;
+		if (sin6)
+			sin6->sin6_port = port;
+		if (ifa == NULL)
+			return -ENETUNREACH;
+		ifp = ifa->ifa_ifp;
+		ifa_free(ifa);
+		if (bcast || multi)
+			goto mcast;
+	}
+	/*
+	 * Make sure the route exists and has a valid link.
+	 */
+	rte = rtalloc1(dst_in, 1, 0);
+	if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
+		if (rte) 
+			RTFREE_LOCKED(rte);
+		return -EHOSTUNREACH;
+	}
+	/*
+	 * If it's not multicast or broadcast and the route doesn't match the
+	 * requested interface return unreachable.  Otherwise fetch the
+	 * correct interface pointer and unlock the route.
+	 */
+	if (multi || bcast) {
+		if (ifp == NULL)
+			ifp = rte->rt_ifp;
+		RTFREE_LOCKED(rte);
+	} else if (ifp && ifp != rte->rt_ifp) {
+		RTFREE_LOCKED(rte);
+		return -ENETUNREACH;
+	} else {
+		if (ifp == NULL)
+			ifp = rte->rt_ifp;
+		RT_UNLOCK(rte);
+	}
+mcast:
+	if (bcast)
+		return rdma_copy_addr(addr, ifp, ifp->if_broadcastaddr);
+	if (multi) {
+		struct sockaddr *llsa;
+
+		error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
+		if (error)
+			return -error;
+		error = rdma_copy_addr(addr, ifp,
+		    LLADDR((struct sockaddr_dl *)llsa));
+		free(llsa, M_IFMADDR);
+		return error;
+	}
+	/*
+	 * Resolve the link local address.
+	 */
+	switch (dst_in->sa_family) {
+#ifdef INET
+	case AF_INET:
+		error = arpresolve(ifp, rte, NULL, dst_in, edst, &lle);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, &lle);
+		break;
+#endif
+	default:
+		/* XXX: Shouldn't happen. */
+		error = -EINVAL;
+	}
+	RTFREE(rte);
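+	/*
+	 * FreeBSD errnos are positive; negate to the Linux-style negative
+	 * convention used by the OFED stack.  EWOULDBLOCK means ARP/ND
+	 * resolution is still in flight and maps to -ENODATA so the
+	 * caller retries later.
+	 */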
+	if (error == 0)
+		return rdma_copy_addr(addr, ifp, edst);
+	if (error == EWOULDBLOCK)
+		return -ENODATA;
+	return -error;
+}
+
+#endif
+
+static void process_req(struct work_struct *work)
+{
+	struct addr_req *req, *temp_req;
+	struct sockaddr *src_in, *dst_in;
+	struct list_head done_list;
+
+	INIT_LIST_HEAD(&done_list);
+
+	mutex_lock(&lock);
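+	/*
+	 * Retry requests still waiting on neighbour resolution; move
+	 * completed or timed-out ones to a private list so their
+	 * callbacks run without holding the lock.
+	 */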
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->status == -ENODATA) {
+			src_in = (struct sockaddr *) &req->src_addr;
+			dst_in = (struct sockaddr *) &req->dst_addr;
+			req->status = addr_resolve(src_in, dst_in, req->addr);
+			if (req->status && time_after_eq(jiffies, req->timeout))
+				req->status = -ETIMEDOUT;
+			else if (req->status == -ENODATA)
+				continue;
+		}
+		list_move_tail(&req->list, &done_list);
+	}
+
+	if (!list_empty(&req_list)) {
+		req = list_entry(req_list.next, struct addr_req, list);
+		set_timeout(req->timeout);
+	}
+	mutex_unlock(&lock);
+
+	list_for_each_entry_safe(req, temp_req, &done_list, list) {
+		list_del(&req->list);
+		req->callback(req->status, (struct sockaddr *) &req->src_addr,
+			req->addr, req->context);
+		put_client(req->client);
+		kfree(req);
+	}
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
+		    struct rdma_dev_addr *addr, int timeout_ms,
+		    void (*callback)(int status, struct sockaddr *src_addr,
+				     struct rdma_dev_addr *addr, void *context),
+		    void *context)
+{
+	struct sockaddr *src_in, *dst_in;
+	struct addr_req *req;
+	int ret = 0;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	src_in = (struct sockaddr *) &req->src_addr;
+	dst_in = (struct sockaddr *) &req->dst_addr;
+
+	if (src_addr) {
+		if (src_addr->sa_family != dst_addr->sa_family) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		memcpy(src_in, src_addr, ip_addr_size(src_addr));
+	} else {
+		src_in->sa_family = dst_addr->sa_family;
+	}
+
+	memcpy(dst_in, dst_addr, ip_addr_size(dst_addr));
+	req->addr = addr;
+	req->callback = callback;
+	req->context = context;
+	req->client = client;
+	atomic_inc(&client->refcount);
+
+	req->status = addr_resolve(src_in, dst_in, addr);
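+	/*
+	 * Three outcomes: resolved immediately (complete via the work
+	 * queue), -ENODATA (queue and retry until timeout_ms elapses),
+	 * or a hard failure (reported synchronously).
+	 */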
+	switch (req->status) {
+	case 0:
+		req->timeout = jiffies;
+		queue_req(req);
+		break;
+	case -ENODATA:
+		req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+		queue_req(req);
+		break;
+	default:
+		ret = req->status;
+		atomic_dec(&client->refcount);
+		goto err;
+	}
+	return ret;
+err:
+	kfree(req);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+	struct addr_req *req, *temp_req;
+
+	mutex_lock(&lock);
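+	/*
+	 * Mark the matching request cancelled and move it to the list
+	 * head with an immediate timeout so process_req() finishes it
+	 * right away.
+	 */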
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->addr == addr) {
+			req->status = -ECANCELED;
+			req->timeout = jiffies;
+			list_move(&req->list, &req_list);
+			set_timeout(req->timeout);
+			break;
+		}
+	}
+	mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+	void *ctx)
+{
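+	/* A neighbour table update may unblock queued requests, so kick
+	 * process_req() immediately. */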
+	if (event == NETEVENT_NEIGH_UPDATE) {
+#ifdef __linux__
+		struct neighbour *neigh = ctx;
+
+		if (neigh->nud_state & NUD_VALID) {
+			set_timeout(jiffies);
+		}
+#else
+		set_timeout(jiffies);
+#endif
+	}
+	return 0;
+}
+
+static struct notifier_block nb = {
+	.notifier_call = netevent_callback
+};
+
+static int __init addr_init(void)
+{
+	INIT_DELAYED_WORK(&work, process_req);
+	addr_wq = create_singlethread_workqueue("ib_addr");
+	if (!addr_wq)
+		return -ENOMEM;
+
+	register_netevent_notifier(&nb);
+	return 0;
+}
+
+static void __exit addr_cleanup(void)
+{
+	unregister_netevent_notifier(&nb);
+	destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/addr.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/agent.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/agent.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/agent.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+	struct list_head port_list;
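+	/* agent[0] serves the SMI (QP0), agent[1] the GSI (QP1); ports
+	 * without an SMI (non-IB link layers) leave agent[0] NULL. */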
+	struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
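+/* Caller must hold ib_agent_port_list_lock; ib_get_agent_port() below
+ * is the locking wrapper. */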
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *entry;
+
+	list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+		if (entry->agent[1]->device == device &&
+		    entry->agent[1]->port_num == port_num)
+			return entry;
+	}
+	return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	entry = __ib_get_agent_port(device, port_num);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+	return entry;
+}
+
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+			 struct ib_wc *wc, struct ib_device *device,
+			 int port_num, int qpn)
+{
+	struct ib_agent_port_private *port_priv;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *ah;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		port_priv = ib_get_agent_port(device, 0);
+	else
+		port_priv = ib_get_agent_port(device, port_num);
+
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "Unable to find port agent\n");
+		return;
+	}
+
+	agent = port_priv->agent[qpn];
+	ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+	if (IS_ERR(ah)) {
+		printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n",
+			PTR_ERR(ah));
+		return;
+	}
+
+	send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+				      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_KERNEL);
+	if (IS_ERR(send_buf)) {
+		printk(KERN_ERR SPFX "ib_create_send_mad error\n");
+		goto err1;
+	}
+
+	memcpy(send_buf->mad, mad, sizeof *mad);
+	send_buf->ah = ah;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_send_wr->send_wr.wr.ud.port_num = port_num;
+	}
+
+	if (ib_post_send_mad(send_buf, NULL)) {
+		printk(KERN_ERR SPFX "ib_post_send_mad error\n");
+		goto err2;
+	}
+	return;
+err2:
+	ib_free_send_mad(send_buf);
+err1:
+	ib_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+			       struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_destroy_ah(mad_send_wc->send_buf->ah);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+	int ret;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+		/* Obtain send only MAD agent for SMI QP */
+		port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+							    IB_QPT_SMI, NULL, 0,
+							    &agent_send_handler,
+							    NULL, NULL);
+		if (IS_ERR(port_priv->agent[0])) {
+			ret = PTR_ERR(port_priv->agent[0]);
+			goto error2;
+		}
+	}
+
+	/* Obtain send only MAD agent for GSI QP */
+	port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+						    IB_QPT_GSI, NULL, 0,
+						    &agent_send_handler,
+						    NULL, NULL);
+	if (IS_ERR(port_priv->agent[1])) {
+		ret = PTR_ERR(port_priv->agent[1]);
+		goto error3;
+	}
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	return 0;
+
+error3:
+	if (port_priv->agent[0])
+		ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+	kfree(port_priv);
+error1:
+	return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	port_priv = __ib_get_agent_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+		printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	ib_unregister_mad_agent(port_priv->agent[1]);
+	if (port_priv->agent[0])
+		ib_unregister_mad_agent(port_priv->agent[0]);
+
+	kfree(port_priv);
+	return 0;
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/agent.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/agent.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/agent.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/agent.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+				struct ib_wc *wc, struct ib_device *device,
+				int port_num, int qpn);
+
+#endif	/* __AGENT_H_ */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/agent.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/cache.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/cache.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/cache.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+	int             table_len;
+	u16             table[0];
+};
+
+struct ib_gid_cache {
+	int             table_len;
+	union ib_gid    table[0];
+};
+
+struct ib_update_work {
+	struct work_struct work;
+	struct ib_device  *device;
+	u8                 port_num;
+};
+
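+/*
+ * IB switches expose a single management port numbered 0, while CAs
+ * number their physical ports from 1; these helpers return the
+ * per-device port range.
+ */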
+static inline int start_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+		0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.gid_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*gid = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int p, i;
+	int ret = -ENOENT;
+
+	*port_num = -1;
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		cache = device->cache.gid_cache[p];
+		for (i = 0; i < cache->table_len; ++i) {
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+				*port_num = p + start_port(device);
+				if (index)
+					*index = i;
+				ret = 0;
+				goto found;
+			}
+		}
+	}
+found:
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+		       u8                port_num,
+		       int               index,
+		       u16              *pkey)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*pkey = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+			u8                port_num,
+			u16               pkey,
+			u16              *index)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int i;
+	int ret = -ENOENT;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	*index = -1;
+
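+	/* Compare only the low 15 bits: the top bit of a P_Key is the
+	 * full/limited membership flag, not part of the key itself. */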
+	for (i = 0; i < cache->table_len; ++i)
+		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+			*index = i;
+			ret = 0;
+			break;
+		}
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8                port_num,
+		      u8                *lmc)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+	*lmc = device->cache.lmc_cache[port_num - start_port(device)];
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+static void ib_cache_update(struct ib_device *device,
+			    u8                port)
+{
+	struct ib_port_attr       *tprops = NULL;
+	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
+	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	int                        i;
+	int                        ret;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		return;
+
+	ret = ib_query_port(device, port, tprops);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+		       ret, device->name);
+		goto err;
+	}
+
+	pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+			     sizeof *pkey_cache->table, GFP_KERNEL);
+	if (!pkey_cache)
+		goto err;
+
+	pkey_cache->table_len = tprops->pkey_tbl_len;
+
+	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+			    sizeof *gid_cache->table, GFP_KERNEL);
+	if (!gid_cache)
+		goto err;
+
+	gid_cache->table_len = tprops->gid_tbl_len;
+
+	for (i = 0; i < pkey_cache->table_len; ++i) {
+		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
+	for (i = 0; i < gid_cache->table_len; ++i) {
+		ret = ib_query_gid(device, port, i, gid_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
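+	/* Swap in the freshly built tables under the write lock; free the
+	 * old ones after dropping it. */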
+	write_lock_irq(&device->cache.lock);
+
+	old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+	old_gid_cache  = device->cache.gid_cache [port - start_port(device)];
+
+	device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+	device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+	device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
+	write_unlock_irq(&device->cache.lock);
+
+	kfree(old_pkey_cache);
+	kfree(old_gid_cache);
+	kfree(tprops);
+	return;
+
+err:
+	kfree(pkey_cache);
+	kfree(gid_cache);
+	kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+	struct ib_update_work *work =
+		container_of(_work, struct ib_update_work, work);
+
+	ib_cache_update(work->device, work->port_num);
+	kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+			   struct ib_event *event)
+{
+	struct ib_update_work *work;
+
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE   ||
+	    event->event == IB_EVENT_CLIENT_REREGISTER ||
+	    event->event == IB_EVENT_GID_CHANGE) {
+		work = kmalloc(sizeof *work, GFP_ATOMIC);
+		if (work) {
+			INIT_WORK(&work->work, ib_cache_task);
+			work->device   = event->device;
+			work->port_num = event->element.port_num;
+			schedule_work(&work->work);
+		}
+	}
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+	int p;
+
+	rwlock_init(&device->cache.lock);
+
+	device->cache.pkey_cache =
+		kmalloc(sizeof *device->cache.pkey_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+	device->cache.gid_cache =
+		kmalloc(sizeof *device->cache.gid_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+					  (end_port(device) -
+					   start_port(device) + 1),
+					  GFP_KERNEL);
+
+	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	    !device->cache.lmc_cache) {
+		printk(KERN_WARNING "Couldn't allocate cache "
+		       "for %s\n", device->name);
+		goto err;
+	}
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		device->cache.pkey_cache[p] = NULL;
+		device->cache.gid_cache [p] = NULL;
+		ib_cache_update(device, p + start_port(device));
+	}
+
+	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+			      device, ib_cache_event);
+	if (ib_register_event_handler(&device->cache.event_handler))
+		goto err_cache;
+
+	return;
+
+err_cache:
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+err:
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+	int p;
+
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_scheduled_work();
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+static struct ib_client cache_client = {
+	.name   = "cache",
+	.add    = ib_cache_setup_one,
+	.remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+	return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+	ib_unregister_client(&cache_client);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/cache.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/cm.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/cm.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/cm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,3897 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/string.h>
+
+#include <asm/atomic-long.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define PFX    "ib_cm: "
+
+/*
+ * Limit CM message timeouts to something reasonable:
+ * 8 seconds per message, with up to 15 retries
+ */
+static int max_timeout = 21;
+module_param(max_timeout, int, 0644);
+MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout "
+                             "(default=21, or ~8 seconds)");
+
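+/*
+ * Sanity check on the default: in the 4.096us x 2^n encoding used by
+ * cm_convert_to_ms() below, n = 21 corresponds to roughly 8.6 seconds
+ * of real time (approximated as 8192 ms by the conversion helper).
+ */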
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+
+static struct ib_client cm_client = {
+	.name   = "cm",
+	.add    = cm_add_one,
+	.remove = cm_remove_one
+};
+
+static struct ib_cm {
+	spinlock_t lock;
+	struct list_head device_list;
+	rwlock_t device_lock;
+	struct rb_root listen_service_table;
+	u64 listen_service_id;
+	/* struct rb_root peer_service_table; todo: fix peer to peer */
+	struct rb_root remote_qp_table;
+	struct rb_root remote_id_table;
+	struct rb_root remote_sidr_table;
+	struct idr local_id_table;
+	__be32 random_id_operand;
+	struct list_head timewait_list;
+	struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+	CM_REQ_COUNTER,
+	CM_MRA_COUNTER,
+	CM_REJ_COUNTER,
+	CM_REP_COUNTER,
+	CM_RTU_COUNTER,
+	CM_DREQ_COUNTER,
+	CM_DREP_COUNTER,
+	CM_SIDR_REQ_COUNTER,
+	CM_SIDR_REP_COUNTER,
+	CM_LAP_COUNTER,
+	CM_APR_COUNTER,
+	CM_ATTR_COUNT,
+	CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+	CM_XMIT,
+	CM_XMIT_RETRIES,
+	CM_RECV,
+	CM_RECV_DUPLICATES,
+	CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+				     [sizeof("cm_rx_duplicates")] = {
+	"cm_tx_msgs", "cm_tx_retries",
+	"cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+	struct kobject obj;
+	atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+	struct attribute attr;
+	int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+	.attr = { .name = __stringify(_name), .mode = 0444 }, \
+	.index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+	&cm_req_counter_attr.attr,
+	&cm_mra_counter_attr.attr,
+	&cm_rej_counter_attr.attr,
+	&cm_rep_counter_attr.attr,
+	&cm_rtu_counter_attr.attr,
+	&cm_dreq_counter_attr.attr,
+	&cm_drep_counter_attr.attr,
+	&cm_sidr_req_counter_attr.attr,
+	&cm_sidr_rep_counter_attr.attr,
+	&cm_lap_counter_attr.attr,
+	&cm_apr_counter_attr.attr,
+	NULL
+};
+
+struct cm_port {
+	struct cm_device *cm_dev;
+	struct ib_mad_agent *mad_agent;
+	struct kobject port_obj;
+	u8 port_num;
+	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+struct cm_device {
+	struct list_head list;
+	struct ib_device *ib_device;
+	struct device *device;
+	u8 ack_delay;
+	struct cm_port *port[0];
+};
+
+struct cm_av {
+	struct cm_port *port;
+	union ib_gid dgid;
+	struct ib_ah_attr ah_attr;
+	u16 pkey_index;
+	u8 timeout;
+};
+
+struct cm_work {
+	struct delayed_work work;
+	struct list_head list;
+	struct cm_port *port;
+	struct ib_mad_recv_wc *mad_recv_wc;	/* Received MADs */
+	__be32 local_id;			/* Established / timewait */
+	__be32 remote_id;
+	struct ib_cm_event cm_event;
+	struct ib_sa_path_rec path[0];
+};
+
+struct cm_timewait_info {
+	struct cm_work work;			/* Must be first. */
+	struct list_head list;
+	struct rb_node remote_qp_node;
+	struct rb_node remote_id_node;
+	__be64 remote_ca_guid;
+	__be32 remote_qpn;
+	u8 inserted_remote_qp;
+	u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+	struct ib_cm_id	id;
+
+	struct rb_node service_node;
+	struct rb_node sidr_id_node;
+	spinlock_t lock;	/* Do not acquire inside cm.lock */
+	struct completion comp;
+	atomic_t refcount;
+
+	struct ib_mad_send_buf *msg;
+	struct cm_timewait_info *timewait_info;
+	/* todo: use alternate port on send failure */
+	struct cm_av av;
+	struct cm_av alt_av;
+	struct ib_cm_compare_data *compare_data;
+
+	void *private_data;
+	__be64 tid;
+	__be32 local_qpn;
+	__be32 remote_qpn;
+	enum ib_qp_type qp_type;
+	__be32 sq_psn;
+	__be32 rq_psn;
+	int timeout_ms;
+	enum ib_mtu path_mtu;
+	__be16 pkey;
+	u8 private_data_len;
+	u8 max_cm_retries;
+	u8 peer_to_peer;
+	u8 responder_resources;
+	u8 initiator_depth;
+	u8 retry_count;
+	u8 rnr_retry_count;
+	u8 service_timeout;
+	u8 target_ack_delay;
+
+	struct list_head work_list;
+	atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+	if (atomic_dec_and_test(&cm_id_priv->refcount))
+		complete(&cm_id_priv->comp);
+}
+
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+			struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_agent *mad_agent;
+	struct ib_mad_send_buf *m;
+	struct ib_ah *ah;
+
+	mad_agent = cm_id_priv->av.port->mad_agent;
+	ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
+	if (IS_ERR(ah))
+		return PTR_ERR(ah);
+
+	m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+			       cm_id_priv->av.pkey_index,
+			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+			       GFP_ATOMIC);
+	if (IS_ERR(m)) {
+		ib_destroy_ah(ah);
+		return PTR_ERR(m);
+	}
+
+	/* Timeout set by caller if response is expected. */
+	m->ah = ah;
+	m->retries = cm_id_priv->max_cm_retries;
+
+	atomic_inc(&cm_id_priv->refcount);
+	m->context[0] = cm_id_priv;
+	*msg = m;
+	return 0;
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+				 struct ib_mad_recv_wc *mad_recv_wc,
+				 struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_send_buf *m;
+	struct ib_ah *ah;
+
+	ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+				  mad_recv_wc->recv_buf.grh, port->port_num);
+	if (IS_ERR(ah))
+		return PTR_ERR(ah);
+
+	m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+			       GFP_ATOMIC);
+	if (IS_ERR(m)) {
+		ib_destroy_ah(ah);
+		return PTR_ERR(m);
+	}
+	m->ah = ah;
+	*msg = m;
+	return 0;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+	ib_destroy_ah(msg->ah);
+	if (msg->context[0])
+		cm_deref_id(msg->context[0]);
+	ib_free_send_mad(msg);
+}
+
+static void * cm_copy_private_data(const void *private_data,
+				   u8 private_data_len)
+{
+	void *data;
+
+	if (!private_data || !private_data_len)
+		return NULL;
+
+	data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+				 void *private_data, u8 private_data_len)
+{
+	if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+		kfree(cm_id_priv->private_data);
+
+	cm_id_priv->private_data = private_data;
+	cm_id_priv->private_data_len = private_data_len;
+}
+
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+				    struct ib_grh *grh, struct cm_av *av)
+{
+	av->port = port;
+	av->pkey_index = wc->pkey_index;
+	ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
+			   grh, &av->ah_attr);
+}
+
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port = NULL;
+	unsigned long flags;
+	int ret;
+	u8 p;
+
+	read_lock_irqsave(&cm.device_lock, flags);
+	list_for_each_entry(cm_dev, &cm.device_list, list) {
+		if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+					&p, NULL)) {
+			port = cm_dev->port[p-1];
+			break;
+		}
+	}
+	read_unlock_irqrestore(&cm.device_lock, flags);
+
+	if (!port)
+		return -EINVAL;
+
+	ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+				  be16_to_cpu(path->pkey), &av->pkey_index);
+	if (ret)
+		return ret;
+
+	av->port = port;
+	ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
+			     &av->ah_attr);
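+	/*
+	 * In the 4.096us x 2^n encoding, adding 1 to packet_life_time
+	 * doubles it, allowing for a round trip on the path.
+	 */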
+	av->timeout = path->packet_life_time + 1;
+	return 0;
+}
+
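+/*
+ * Allocate a local communication ID using the old two-step IDR API:
+ * idr_pre_get() preloads memory outside the spinlock, and the
+ * idr_get_new_above() call is retried while it returns -EAGAIN.
+ */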
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+	int ret, id;
+	static int next_id;
+
+	do {
+		spin_lock_irqsave(&cm.lock, flags);
+		ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
+					next_id, &id);
+		if (!ret)
+			next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+		spin_unlock_irqrestore(&cm.lock, flags);
+	} while ((ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL));
+
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+	return ret;
+}
+
+static void cm_free_id(__be32 local_id)
+{
+	spin_lock_irq(&cm.lock);
+	idr_remove(&cm.local_id_table,
+		   (__force int) (local_id ^ cm.random_id_operand));
+	spin_unlock_irq(&cm.lock);
+}
+
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = idr_find(&cm.local_id_table,
+			      (__force int) (local_id ^ cm.random_id_operand));
+	if (cm_id_priv) {
+		if (cm_id_priv->id.remote_id == remote_id)
+			atomic_inc(&cm_id_priv->refcount);
+		else
+			cm_id_priv = NULL;
+	}
+
+	return cm_id_priv;
+}
+
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *cm_id_priv;
+
+	spin_lock_irq(&cm.lock);
+	cm_id_priv = cm_get_id(local_id, remote_id);
+	spin_unlock_irq(&cm.lock);
+
+	return cm_id_priv;
+}
+
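+/*
+ * Word-wise masked copy of private/compare data; this relies on
+ * IB_CM_COMPARE_SIZE being a multiple of sizeof(unsigned long).
+ */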
+static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
+{
+	int i;
+
+	for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++)
+		((unsigned long *) dst)[i] = ((unsigned long *) src)[i] &
+					     ((unsigned long *) mask)[i];
+}
+
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+			   struct ib_cm_compare_data *dst_data)
+{
+	u8 src[IB_CM_COMPARE_SIZE];
+	u8 dst[IB_CM_COMPARE_SIZE];
+
+	if (!src_data || !dst_data)
+		return 0;
+
+	cm_mask_copy(src, src_data->data, dst_data->mask);
+	cm_mask_copy(dst, dst_data->data, src_data->mask);
+	return memcmp(src, dst, IB_CM_COMPARE_SIZE);
+}
+
+static int cm_compare_private_data(u8 *private_data,
+				   struct ib_cm_compare_data *dst_data)
+{
+	u8 src[IB_CM_COMPARE_SIZE];
+
+	if (!dst_data)
+		return 0;
+
+	cm_mask_copy(src, private_data, dst_data->mask);
+	return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+	return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+	return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+	return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+	return (__force u64) a > (__force u64) b;
+}
+
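+/*
+ * Insert a listen cm_id into the service RB tree, keyed by device,
+ * then service ID, then compare data.  If an existing entry matches
+ * (mask-adjusted service IDs and compare data agree on the same
+ * device), it is returned instead and nothing is inserted.
+ */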
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+	struct rb_node **link = &cm.listen_service_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur_cm_id_priv;
+	__be64 service_id = cm_id_priv->id.service_id;
+	__be64 service_mask = cm_id_priv->id.service_mask;
+	int data_cmp;
+
+	while (*link) {
+		parent = *link;
+		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+					  service_node);
+		data_cmp = cm_compare_data(cm_id_priv->compare_data,
+					   cur_cm_id_priv->compare_data);
+		if ((cur_cm_id_priv->id.service_mask & service_id) ==
+		    (service_mask & cur_cm_id_priv->id.service_id) &&
+		    (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+		    !data_cmp)
+			return cur_cm_id_priv;
+
+		if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+			link = &(*link)->rb_left;
+		else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+			link = &(*link)->rb_right;
+		else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+			link = &(*link)->rb_left;
+		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+			link = &(*link)->rb_right;
+		else if (data_cmp < 0)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+	rb_link_node(&cm_id_priv->service_node, parent, link);
+	rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+	return NULL;
+}
+
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+					     __be64 service_id,
+					     u8 *private_data)
+{
+	struct rb_node *node = cm.listen_service_table.rb_node;
+	struct cm_id_private *cm_id_priv;
+	int data_cmp;
+
+	while (node) {
+		cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+		data_cmp = cm_compare_private_data(private_data,
+						   cm_id_priv->compare_data);
+		if ((cm_id_priv->id.service_mask & service_id) ==
+		     cm_id_priv->id.service_id &&
+		    (cm_id_priv->id.device == device) && !data_cmp)
+			return cm_id_priv;
+
+		if (device < cm_id_priv->id.device)
+			node = node->rb_left;
+		else if (device > cm_id_priv->id.device)
+			node = node->rb_right;
+		else if (be64_lt(service_id, cm_id_priv->id.service_id))
+			node = node->rb_left;
+		else if (be64_gt(service_id, cm_id_priv->id.service_id))
+			node = node->rb_right;
+		else if (data_cmp < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+						     *timewait_info)
+{
+	struct rb_node **link = &cm.remote_id_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur_timewait_info;
+	__be64 remote_ca_guid = timewait_info->remote_ca_guid;
+	__be32 remote_id = timewait_info->work.remote_id;
+
+	while (*link) {
+		parent = *link;
+		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+					     remote_id_node);
+		if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+			link = &(*link)->rb_right;
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_left;
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_right;
+		else
+			return cur_timewait_info;
+	}
+	timewait_info->inserted_remote_id = 1;
+	rb_link_node(&timewait_info->remote_id_node, parent, link);
+	rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+						   __be32 remote_id)
+{
+	struct rb_node *node = cm.remote_id_table.rb_node;
+	struct cm_timewait_info *timewait_info;
+
+	while (node) {
+		timewait_info = rb_entry(node, struct cm_timewait_info,
+					 remote_id_node);
+		if (be32_lt(remote_id, timewait_info->work.remote_id))
+			node = node->rb_left;
+		else if (be32_gt(remote_id, timewait_info->work.remote_id))
+			node = node->rb_right;
+		else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+			node = node->rb_left;
+		else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+			node = node->rb_right;
+		else
+			return timewait_info;
+	}
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+						      *timewait_info)
+{
+	struct rb_node **link = &cm.remote_qp_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur_timewait_info;
+	__be64 remote_ca_guid = timewait_info->remote_ca_guid;
+	__be32 remote_qpn = timewait_info->remote_qpn;
+
+	while (*link) {
+		parent = *link;
+		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+					     remote_qp_node);
+		if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+			link = &(*link)->rb_right;
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_left;
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_right;
+		else
+			return cur_timewait_info;
+	}
+	timewait_info->inserted_remote_qp = 1;
+	rb_link_node(&timewait_info->remote_qp_node, parent, link);
+	rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+	return NULL;
+}
+
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+						    *cm_id_priv)
+{
+	struct rb_node **link = &cm.remote_sidr_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur_cm_id_priv;
+	union ib_gid *port_gid = &cm_id_priv->av.dgid;
+	__be32 remote_id = cm_id_priv->id.remote_id;
+
+	while (*link) {
+		parent = *link;
+		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+					  sidr_id_node);
+		if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+			link = &(*link)->rb_right;
+		else {
+			int cmp;
+			cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid,
+				     sizeof *port_gid);
+			if (cmp < 0)
+				link = &(*link)->rb_left;
+			else if (cmp > 0)
+				link = &(*link)->rb_right;
+			else
+				return cur_cm_id_priv;
+		}
+	}
+	rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+	rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	return NULL;
+}
+
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+			       enum ib_cm_sidr_status status)
+{
+	struct ib_cm_sidr_rep_param param;
+
+	memset(&param, 0, sizeof param);
+	param.status = status;
+	ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.remote_cm_qpn = 1;
+	ret = cm_alloc_id(cm_id_priv);
+	if (ret)
+		goto error;
+
+	spin_lock_init(&cm_id_priv->lock);
+	init_completion(&cm_id_priv->comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	atomic_set(&cm_id_priv->work_count, -1);
+	atomic_set(&cm_id_priv->refcount, 1);
+	return &cm_id_priv->id;
+
+error:
+	kfree(cm_id_priv);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+	struct cm_work *work;
+
+	if (list_empty(&cm_id_priv->work_list))
+		return NULL;
+
+	work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+	list_del(&work->list);
+	return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+	if (work->mad_recv_wc)
+		ib_free_recv_mad(work->mad_recv_wc);
+	kfree(work);
+}
+
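+/*
+ * e.g. iba_time = 21 (the max_timeout default) gives
+ * 1 << (21 - 8) = 8192 ms, matching the "~8 seconds" noted above.
+ */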
+static inline int cm_convert_to_ms(int iba_time)
+{
+	/* approximate conversion to ms from 4.096us x 2^iba_time */
+	return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * Calculate: 4.096us x 2^ack_timeout = 4.096us x 2^ack_delay +
+ *            2 x (4.096us x 2^life_time)
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
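+/*
+ * Worked example: ca_ack_delay = 15 and packet_life_time = 14 give
+ * ack_timeout = 15; since 15 >= 15 and 15 >= 14, it is rounded up
+ * to 16, i.e. the resulting timeout is doubled.
+ */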
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+	int ack_timeout = packet_life_time + 1;
+
+	if (ack_timeout >= ca_ack_delay)
+		ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+	else
+		ack_timeout = ca_ack_delay +
+			      (ack_timeout >= (ca_ack_delay - 1));
+
+	return min(31, ack_timeout);
+}
+
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+	if (timewait_info->inserted_remote_id) {
+		rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+		timewait_info->inserted_remote_id = 0;
+	}
+
+	if (timewait_info->inserted_remote_qp) {
+		rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+		timewait_info->inserted_remote_qp = 0;
+	}
+}
+
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+	struct cm_timewait_info *timewait_info;
+
+	timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+	if (!timewait_info)
+		return ERR_PTR(-ENOMEM);
+
+	timewait_info->work.local_id = local_id;
+	INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+	timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+	return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+	int wait_time;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	cm_cleanup_timewait(cm_id_priv->timewait_info);
+	list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	/*
+	 * The cm_id could be destroyed by the user before we exit timewait.
+	 * To protect against this, we search for the cm_id after exiting
+	 * timewait before notifying the user that we've exited timewait.
+	 */
+	cm_id_priv->id.state = IB_CM_TIMEWAIT;
+	wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+	queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+			   msecs_to_jiffies(wait_time));
+	cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	if (cm_id_priv->timewait_info) {
+		spin_lock_irqsave(&cm.lock, flags);
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irqrestore(&cm.lock, flags);
+		kfree(cm_id_priv->timewait_info);
+		cm_id_priv->timewait_info = NULL;
+	}
+}
+
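+/*
+ * Destroy a cm_id from whatever state it is in.  The ESTABLISHED arm
+ * sends a DREQ, which moves the id into IB_CM_DREQ_SENT, then jumps
+ * back to "retest" so the DREQ_SENT arm finishes the teardown.
+ */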
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id->state) {
+	case IB_CM_LISTEN:
+		cm_id->state = IB_CM_IDLE;
+		spin_unlock_irq(&cm_id_priv->lock);
+		spin_lock_irq(&cm.lock);
+		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+		spin_unlock_irq(&cm.lock);
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id->state = IB_CM_IDLE;
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_SIDR_REQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+		break;
+	case IB_CM_REQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+			       &cm_id_priv->id.device->node_guid,
+			       sizeof cm_id_priv->id.device->node_guid,
+			       NULL, 0);
+		break;
+	case IB_CM_REQ_RCVD:
+		if (err == -ENOMEM) {
+			/* Do not reject, to allow future retries. */
+			cm_reset_to_idle(cm_id_priv);
+			spin_unlock_irq(&cm_id_priv->lock);
+		} else {
+			spin_unlock_irq(&cm_id_priv->lock);
+			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+				       NULL, 0, NULL, 0);
+		}
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* Fall through */
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+			       NULL, 0, NULL, 0);
+		break;
+	case IB_CM_ESTABLISHED:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_dreq(cm_id, NULL, 0);
+		goto retest;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_DREQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_drep(cm_id, NULL, 0);
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	}
+
+	cm_free_id(cm_id->local_id);
+	cm_deref_id(cm_id_priv);
+	wait_for_completion(&cm_id_priv->comp);
+	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+		cm_free_work(work);
+	kfree(cm_id_priv->compare_data);
+	kfree(cm_id_priv->private_data);
+	kfree(cm_id_priv);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+	cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data)
+{
+	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+	service_id &= service_mask;
+	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	if (cm_id->state != IB_CM_IDLE)
+		return -EINVAL;
+
+	if (compare_data) {
+		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+						   GFP_KERNEL);
+		if (!cm_id_priv->compare_data)
+			return -ENOMEM;
+		cm_mask_copy(cm_id_priv->compare_data->data,
+			     compare_data->data, compare_data->mask);
+		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+		       IB_CM_COMPARE_SIZE);
+	}
+
+	cm_id->state = IB_CM_LISTEN;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id->service_mask = ~cpu_to_be64(0);
+	} else {
+		cm_id->service_id = service_id;
+		cm_id->service_mask = service_mask;
+	}
+	cur_cm_id_priv = cm_insert_listen(cm_id_priv);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	if (cur_cm_id_priv) {
+		cm_id->state = IB_CM_IDLE;
+		kfree(cm_id_priv->compare_data);
+		cm_id_priv->compare_data = NULL;
+		ret = -EBUSY;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
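+/*
+ * Build the 64-bit MAD transaction ID: the upper 32 bits carry the MAD
+ * agent's hi_tid, bits 31:30 the CM message sequence, and the remaining
+ * bits the local communication ID.
+ */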
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+			  enum cm_msg_sequence msg_seq)
+{
+	u64 hi_tid, low_tid;
+
+	hi_tid   = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+	low_tid  = (u64) ((__force u32)cm_id_priv->id.local_id |
+			  (msg_seq << 30));
+	return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+			      __be16 attr_id, __be64 tid)
+{
+	hdr->base_version  = IB_MGMT_BASE_VERSION;
+	hdr->mgmt_class	   = IB_MGMT_CLASS_CM;
+	hdr->class_version = IB_CM_CLASS_VERSION;
+	hdr->method	   = IB_MGMT_METHOD_SEND;
+	hdr->attr_id	   = attr_id;
+	hdr->tid	   = tid;
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+
+	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+	req_msg->local_comm_id = cm_id_priv->id.local_id;
+	req_msg->service_id = param->service_id;
+	req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+	cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+	cm_req_set_resp_res(req_msg, param->responder_resources);
+	cm_req_set_init_depth(req_msg, param->initiator_depth);
+	cm_req_set_remote_resp_timeout(req_msg,
+				       param->remote_cm_response_timeout);
+	if (param->remote_cm_response_timeout > (u8) max_timeout) {
+		printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > "
+		       "%d, decreasing\n", param->remote_cm_response_timeout,
+		       max_timeout);
+		cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout);
+	}
+	cm_req_set_qp_type(req_msg, param->qp_type);
+	cm_req_set_flow_ctrl(req_msg, param->flow_control);
+	cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+	cm_req_set_local_resp_timeout(req_msg,
+				      param->local_cm_response_timeout);
+	if (param->local_cm_response_timeout > (u8) max_timeout) {
+		printk(KERN_WARNING PFX "req local_cm_response_timeout %d > "
+		       "%d, decreasing\n", param->local_cm_response_timeout,
+		       max_timeout);
+		cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout);
+	}
+	cm_req_set_retry_count(req_msg, param->retry_count);
+	req_msg->pkey = param->primary_path->pkey;
+	cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+	cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+	cm_req_set_srq(req_msg, param->srq);
+
+	if (pri_path->hop_limit <= 1) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+	cm_req_set_primary_local_ack_timeout(req_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       pri_path->packet_life_time));
+
+	if (alt_path) {
+		if (alt_path->hop_limit <= 1) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
+		cm_req_set_alt_flow_label(req_msg,
+					  alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+		cm_req_set_alt_local_ack_timeout(req_msg,
+			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+				       alt_path->packet_life_time));
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+	/* peer-to-peer not supported */
+	if (param->peer_to_peer)
+		return -EINVAL;
+
+	if (!param->primary_path)
+		return -EINVAL;
+
+	if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC)
+		return -EINVAL;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	if (param->alternate_path &&
+	    (param->alternate_path->pkey != param->primary_path->pkey ||
+	     param->alternate_path->mtu != param->primary_path->mtu))
+		return -EINVAL;
+
+	return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_req_msg *req_msg;
+	unsigned long flags;
+	int ret;
+
+	ret = cm_validate_req_param(param);
+	if (ret)
+		return ret;
+
+	/* Verify that we're not in timewait. */
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = -EINVAL;
+		goto out;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	cm_id_priv->timewait_info =
+		cm_create_timewait_info(cm_id_priv->id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+	if (ret)
+		goto error1;
+	if (param->alternate_path) {
+		ret = cm_init_av_by_path(param->alternate_path,
+					 &cm_id_priv->alt_av);
+		if (ret)
+			goto error1;
+	}
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+				    param->primary_path->packet_life_time) * 2 +
+				 cm_convert_to_ms(
+				    param->remote_cm_response_timeout);
+	if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+		printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n",
+		       cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout));
+		cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+	}
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->retry_count = param->retry_count;
+	cm_id_priv->path_mtu = param->primary_path->mtu;
+	cm_id_priv->pkey = param->primary_path->pkey;
+	cm_id_priv->qp_type = param->qp_type;
+
+	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+	if (ret)
+		goto error1;
+
+	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+	cm_format_req(req_msg, cm_id_priv, param);
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+	cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+	cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		goto error2;
+	}
+	BUG_ON(cm_id->state != IB_CM_IDLE);
+	cm_id->state = IB_CM_REQ_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error2:	cm_free_msg(cm_id_priv->msg);
+error1:	kfree(cm_id_priv->timewait_info);
+out:	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+			struct ib_mad_recv_wc *mad_recv_wc,
+			enum ib_cm_rej_reason reason,
+			enum cm_msg_response msg_rejected,
+			void *ari, u8 ari_length)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_rej_msg *rej_msg, *rcv_msg;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (ret)
+		return ret;
+
+	/* We just need common CM header information.  Cast to any message. */
+	rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+	rej_msg = (struct cm_rej_msg *) msg->mad;
+
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+	rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+	rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+	cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+	rej_msg->reason = cpu_to_be16(reason);
+
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+	return ret;
+}
+
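+/*
+ * Peer-to-peer tie-break: the side with the numerically larger CA GUID
+ * (or, on a GUID tie, the larger QPN) is considered the active peer.
+ */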
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+				    __be32 local_qpn, __be32 remote_qpn)
+{
+	return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
+		((local_ca_guid == remote_ca_guid) &&
+		 (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
+}
+
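+/*
+ * Note the swap below: the REQ describes the path from the sender's
+ * perspective, so its local GID/LID become our destination (dgid/dlid)
+ * and its remote ones our source.
+ */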
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+					    struct ib_sa_path_rec *primary_path,
+					    struct ib_sa_path_rec *alt_path)
+{
+	memset(primary_path, 0, sizeof *primary_path);
+	primary_path->dgid = req_msg->primary_local_gid;
+	primary_path->sgid = req_msg->primary_remote_gid;
+	primary_path->dlid = req_msg->primary_local_lid;
+	primary_path->slid = req_msg->primary_remote_lid;
+	primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+	primary_path->hop_limit = req_msg->primary_hop_limit;
+	primary_path->traffic_class = req_msg->primary_traffic_class;
+	primary_path->reversible = 1;
+	primary_path->pkey = req_msg->pkey;
+	primary_path->sl = cm_req_get_primary_sl(req_msg);
+	primary_path->mtu_selector = IB_SA_EQ;
+	primary_path->mtu = cm_req_get_path_mtu(req_msg);
+	primary_path->rate_selector = IB_SA_EQ;
+	primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+	primary_path->packet_life_time_selector = IB_SA_EQ;
+	primary_path->packet_life_time =
+		cm_req_get_primary_local_ack_timeout(req_msg);
+	primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+
+	if (req_msg->alt_local_lid) {
+		memset(alt_path, 0, sizeof *alt_path);
+		alt_path->dgid = req_msg->alt_local_gid;
+		alt_path->sgid = req_msg->alt_remote_gid;
+		alt_path->dlid = req_msg->alt_local_lid;
+		alt_path->slid = req_msg->alt_remote_lid;
+		alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+		alt_path->hop_limit = req_msg->alt_hop_limit;
+		alt_path->traffic_class = req_msg->alt_traffic_class;
+		alt_path->reversible = 1;
+		alt_path->pkey = req_msg->pkey;
+		alt_path->sl = cm_req_get_alt_sl(req_msg);
+		alt_path->mtu_selector = IB_SA_EQ;
+		alt_path->mtu = cm_req_get_path_mtu(req_msg);
+		alt_path->rate_selector = IB_SA_EQ;
+		alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+		alt_path->packet_life_time_selector = IB_SA_EQ;
+		alt_path->packet_life_time =
+			cm_req_get_alt_local_ack_timeout(req_msg);
+		alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+	}
+}
+
+static void cm_format_req_event(struct cm_work *work,
+				struct cm_id_private *cm_id_priv,
+				struct ib_cm_id *listen_id)
+{
+	struct cm_req_msg *req_msg;
+	struct ib_cm_req_event_param *param;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.req_rcvd;
+	param->listen_id = listen_id;
+	param->port = cm_id_priv->av.port->port_num;
+	param->primary_path = &work->path[0];
+	if (req_msg->alt_local_lid)
+		param->alternate_path = &work->path[1];
+	else
+		param->alternate_path = NULL;
+	param->remote_ca_guid = req_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+	param->qp_type = cm_req_get_qp_type(req_msg);
+	param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+	param->responder_resources = cm_req_get_init_depth(req_msg);
+	param->initiator_depth = cm_req_get_resp_res(req_msg);
+	param->local_cm_response_timeout =
+					cm_req_get_remote_resp_timeout(req_msg);
+	param->flow_control = cm_req_get_flow_ctrl(req_msg);
+	param->remote_cm_response_timeout =
+					cm_req_get_local_resp_timeout(req_msg);
+	param->retry_count = cm_req_get_retry_count(req_msg);
+	param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	param->srq = cm_req_get_srq(req_msg);
+	work->cm_event.private_data = &req_msg->private_data;
+}
+
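+/*
+ * work_count starts at -1 and is incremented for each queued event;
+ * atomic_add_negative(-1, ...) below keeps draining queued cm_work
+ * entries into the consumer's handler until the count drops back
+ * below zero.
+ */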
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+			    struct cm_work *work)
+{
+	int ret;
+
+	/* We will typically only have the current event to report. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+	cm_free_work(work);
+
+	while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+		spin_lock_irq(&cm_id_priv->lock);
+		work = cm_dequeue_work(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		BUG_ON(!work);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+						&work->cm_event);
+		cm_free_work(work);
+	}
+	cm_deref_id(cm_id_priv);
+	if (ret)
+		cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum cm_msg_response msg_mraed, u8 service_timeout,
+			  const void *private_data, u8 private_data_len)
+{
+	cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+	cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+	mra_msg->local_comm_id = cm_id_priv->id.local_id;
+	mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+	if (private_data && private_data_len)
+		memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_rej_reason reason,
+			  void *ari,
+			  u8 ari_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+	rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	switch(cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		rej_msg->local_comm_id = 0;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+		break;
+	case IB_CM_MRA_REQ_SENT:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+		break;
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP);
+		break;
+	default:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER);
+		break;
+	}
+
+	rej_msg->reason = cpu_to_be16(reason);
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+			       struct cm_id_private *cm_id_priv)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REQ_COUNTER]);
+
+	/* Quick state check to discard duplicate REQs. */
+	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+		return;
+
+	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+	if (ret)
+		return;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_MRA_REQ_SENT:
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		break;
+	case IB_CM_TIMEWAIT:
+		cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+			      IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+		break;
+	default:
+		goto unlock;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		goto free;
+	return;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+free:	cm_free_msg(msg);
+}
+
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+					   struct cm_id_private *cm_id_priv)
+{
+	struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+	struct cm_timewait_info *timewait_info;
+	struct cm_req_msg *req_msg;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	/* Check for possible duplicate REQ. */
+	spin_lock_irq(&cm.lock);
+	timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+					   timewait_info->work.remote_id);
+		spin_unlock_irq(&cm.lock);
+		if (cur_cm_id_priv) {
+			cm_dup_req_handler(work, cur_cm_id_priv);
+			cm_deref_id(cur_cm_id_priv);
+		}
+		return NULL;
+	}
+
+	/* Check for stale connections. */
+	timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		return NULL;
+	}
+
+	/* Find matching listen request. */
+	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+					   req_msg->service_id,
+					   req_msg->private_data);
+	if (!listen_cm_id_priv) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		goto out;
+	}
+	atomic_inc(&listen_cm_id_priv->refcount);
+	atomic_inc(&cm_id_priv->refcount);
+	cm_id_priv->id.state = IB_CM_REQ_RCVD;
+	atomic_inc(&cm_id_priv->work_count);
+	spin_unlock_irq(&cm.lock);
+out:
+	return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections.  If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	if (!cm_req_get_primary_subnet_local(req_msg)) {
+		if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_primary_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+
+	if (!cm_req_get_alt_subnet_local(req_msg)) {
+		if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_alt_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+	struct cm_req_msg *req_msg;
+	int ret;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	cm_id_priv->id.remote_id = req_msg->local_comm_id;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->timewait_info =
+		cm_create_timewait_info(cm_id_priv->id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+	cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+	if (!listen_cm_id_priv) {
+		ret = -EINVAL;
+		kfree(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	if (ret) {
+		ib_get_cached_gid(work->port->cm_dev->ib_device,
+				  work->port->port_num, 0, &work->path[0].sgid);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			       &work->path[0].sgid, sizeof work->path[0].sgid,
+			       NULL, 0);
+		goto rejected;
+	}
+	if (req_msg->alt_local_lid) {
+		ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+		if (ret) {
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+				       &work->path[0].sgid,
+				       sizeof work->path[0].sgid, NULL, 0);
+			goto rejected;
+		}
+	}
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+					cm_req_get_local_resp_timeout(req_msg));
+	if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) {
+		printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, "
+		       "decreasing used timeout_ms\n",
+		       cm_req_get_local_resp_timeout(req_msg), max_timeout);
+		cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+	}
+
+	cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+	cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+	cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+	cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+	cm_id_priv->pkey = req_msg->pkey;
+	cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+	cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+	cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+	cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(listen_cm_id_priv);
+	return 0;
+
+rejected:
+	atomic_dec(&cm_id_priv->refcount);
+	cm_deref_id(listen_cm_id_priv);
+destroy:
+	ib_destroy_cm_id(cm_id);
+	return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_rep_param *param)
+{
+	cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+	rep_msg->local_comm_id = cm_id_priv->id.local_id;
+	rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+	cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+	rep_msg->resp_resources = param->responder_resources;
+	rep_msg->initiator_depth = param->initiator_depth;
+	cm_rep_set_target_ack_delay(rep_msg,
+				    cm_id_priv->av.port->cm_dev->ack_delay);
+	cm_rep_set_failover(rep_msg, param->failover_accepted);
+	cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+	cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+	cm_rep_set_srq(rep_msg, param->srq);
+	rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+	if (param->private_data && param->private_data_len)
+		memcpy(rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	struct cm_rep_msg *rep_msg;
+	unsigned long flags;
+	int ret;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_REQ_RCVD &&
+	    cm_id->state != IB_CM_MRA_REQ_SENT) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	rep_msg = (struct cm_rep_msg *) msg->mad;
+	cm_format_rep(rep_msg, cm_id_priv, param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_REP_SENT;
+	cm_id_priv->msg = msg;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+	rtu_msg->local_comm_id = cm_id_priv->id.local_id;
+	rtu_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	if (private_data && private_data_len)
+		memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_REP_RCVD &&
+	    cm_id->state != IB_CM_MRA_REP_SENT) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+		      private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		kfree(data);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_ESTABLISHED;
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work)
+{
+	struct cm_rep_msg *rep_msg;
+	struct ib_cm_rep_event_param *param;
+
+	rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.rep_rcvd;
+	param->remote_ca_guid = rep_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_rep_get_local_qpn(rep_msg));
+	param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+	param->responder_resources = rep_msg->initiator_depth;
+	param->initiator_depth = rep_msg->resp_resources;
+	param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	param->failover_accepted = cm_rep_get_failover(rep_msg);
+	param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+	param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	param->srq = cm_rep_get_srq(rep_msg);
+	work->cm_event.private_data = &rep_msg->private_data;
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+				   rep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
+	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+	if (ret)
+		goto deref;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+		cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else
+		goto unlock;
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		goto free;
+	goto deref;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+free:	cm_free_msg(msg);
+deref:	cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+	if (!cm_id_priv) {
+		cm_dup_rep_handler(work);
+		return -EINVAL;
+	}
+
+	cm_format_rep_event(work);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+
+	cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+
+	spin_lock(&cm.lock);
+	/* Check for duplicate REP. */
+	if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+	/* Check for a stale connection. */
+	if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+		rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+			 &cm.remote_id_table);
+		cm_id_priv->timewait_info->inserted_remote_id = 0;
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+			     NULL, 0);
+		ret = -EINVAL;
+		goto error;
+	}
+	spin_unlock(&cm.lock);
+
+	cm_id_priv->id.state = IB_CM_REP_RCVD;
+	cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+	cm_id_priv->initiator_depth = rep_msg->resp_resources;
+	cm_id_priv->responder_resources = rep_msg->initiator_depth;
+	cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	cm_id_priv->av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->av.timeout - 1);
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	/* todo: handle peer_to_peer */
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+error:
+	cm_deref_id(cm_id_priv);
+	return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	/* See comment in cm_establish about lookup. */
+	cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rtu_msg *rtu_msg;
+	int ret;
+
+	rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+				   rtu_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &rtu_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+	    cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+	dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+	dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+	if (private_data && private_data_len)
+		memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret) {
+		cm_enter_timewait(cm_id_priv);
+		goto out;
+	}
+
+	cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_DREQ_SENT;
+	cm_id_priv->msg = msg;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+	drep_msg->local_comm_id = cm_id_priv->id.local_id;
+	drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	if (private_data && private_data_len)
+		memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	cm_enter_timewait(cm_id_priv);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
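+/*
+ * Reply to a DREQ that matches no local connection with a bare DREP so
+ * the remote side can finish its disconnect instead of retransmitting.
+ */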
+static int cm_issue_drep(struct cm_port *port,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_dreq_msg *dreq_msg;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (ret)
+		return ret;
+
+	dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+	drep_msg = (struct cm_drep_msg *) msg->mad;
+
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+	drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+	drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+	return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_dreq_msg *dreq_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+				   dreq_msg->local_comm_id);
+	if (!cm_id_priv) {
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		cm_issue_drep(work->port, work->mad_recv_wc);
+		return -EINVAL;
+	}
+
+	work->cm_event.private_data = &dreq_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+		goto unlock;
+
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REP_SENT:
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		break;
+	case IB_CM_ESTABLISHED:
+	case IB_CM_MRA_REP_RCVD:
+		break;
+	case IB_CM_TIMEWAIT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+			       cm_id_priv->private_data,
+			       cm_id_priv->private_data_len);
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_DREQ_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+	cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+	cm_id_priv->tid = dreq_msg->hdr.tid;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+				   drep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &drep_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+	    cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_enter_timewait(cm_id_priv);
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+	    (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id->state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (!ret)
+			cm_format_rej((struct cm_rej_msg *) msg->mad,
+				      cm_id_priv, reason, ari, ari_length,
+				      private_data, private_data_len);
+
+		cm_reset_to_idle(cm_id_priv);
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (!ret)
+			cm_format_rej((struct cm_rej_msg *) msg->mad,
+				      cm_id_priv, reason, ari, ari_length,
+				      private_data, private_data_len);
+
+		cm_enter_timewait(cm_id_priv);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (ret)
+		goto out;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+	struct cm_rej_msg *rej_msg;
+	struct ib_cm_rej_event_param *param;
+
+	rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.rej_rcvd;
+	param->ari = rej_msg->ari;
+	param->ari_length = cm_rej_get_reject_info_len(rej_msg);
+	param->reason = __be16_to_cpu(rej_msg->reason);
+	work->cm_event.private_data = &rej_msg->private_data;
+}
+
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	__be32 remote_id;
+
+	remote_id = rej_msg->local_comm_id;
+
+	if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+		spin_lock_irq(&cm.lock);
+		timewait_info = cm_find_remote_id(*((__be64 *) rej_msg->ari),
+						  remote_id);
+		if (!timewait_info) {
+			spin_unlock_irq(&cm.lock);
+			return NULL;
+		}
+		cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+				      (timewait_info->work.local_id ^
+				       cm.random_id_operand));
+		if (cm_id_priv) {
+			if (cm_id_priv->id.remote_id == remote_id)
+				atomic_inc(&cm_id_priv->refcount);
+			else
+				cm_id_priv = NULL;
+		}
+		spin_unlock_irq(&cm.lock);
+	} else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+	else
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+	return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rej_msg *rej_msg;
+	int ret;
+
+	rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_rejected_id(rej_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	cm_format_rej_event(work);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+		if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+			cm_enter_timewait(cm_id_priv);
+		else
+			cm_reset_to_idle(cm_id_priv);
+		break;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_ESTABLISHED:
+		cm_enter_timewait(cm_id_priv);
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	enum ib_cm_state cm_state;
+	enum ib_cm_lap_state lap_state;
+	enum cm_msg_response msg_response;
+	void *data;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		cm_state = IB_CM_MRA_REQ_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+		cm_state = IB_CM_MRA_REP_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REP;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+			cm_state = cm_id->state;
+			lap_state = IB_CM_MRA_LAP_SENT;
+			msg_response = CM_MSG_RESPONSE_OTHER;
+			break;
+		}
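+		/* else fall through: an MRA is invalid in this lap_state */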
+	default:
+		ret = -EINVAL;
+		goto error1;
+	}
+
+	if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			goto error1;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      msg_response, service_timeout,
+			      private_data, private_data_len);
+		ret = ib_post_send_mad(msg, NULL);
+		if (ret)
+			goto error2;
+	}
+
+	cm_id->state = cm_state;
+	cm_id->lap_state = lap_state;
+	cm_id_priv->service_timeout = service_timeout;
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error1:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+
+error2:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	cm_free_msg(msg);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+	switch (cm_mra_get_msg_mraed(mra_msg)) {
+	case CM_MSG_RESPONSE_REQ:
+		return cm_acquire_id(mra_msg->remote_comm_id, 0);
+	case CM_MSG_RESPONSE_REP:
+	case CM_MSG_RESPONSE_OTHER:
+		return cm_acquire_id(mra_msg->remote_comm_id,
+				     mra_msg->local_comm_id);
+	default:
+		return NULL;
+	}
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_mra_msg *mra_msg;
+	int timeout, ret;
+
+	mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_mraed_id(mra_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &mra_msg->private_data;
+	work->cm_event.param.mra_rcvd.service_timeout =
+					cm_mra_get_service_timeout(mra_msg);
+	timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+		  cm_convert_to_ms(cm_id_priv->av.timeout);
+	if (timeout > cm_convert_to_ms(max_timeout)) {
+		printk(KERN_WARNING PFX "calculated mra timeout %d > %d, "
+		       "decreasing used timeout_ms\n", timeout,
+		       cm_convert_to_ms(max_timeout));
+		timeout = cm_convert_to_ms(max_timeout);
+	}
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+		break;
+	case IB_CM_REP_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout)) {
+			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+				atomic_long_inc(&work->port->
+						counter_group[CM_RECV_DUPLICATES].
+						counter[CM_MRA_COUNTER]);
+			goto out;
+		}
+		cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_MRA_REP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_MRA_COUNTER]);
+		/* fall through */
+	default:
+		goto out;
+	}
+
+	cm_id_priv->msg->context[1] = (void *) (unsigned long)
+				      cm_id_priv->id.state;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_sa_path_rec *alternate_path,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+	lap_msg->local_comm_id = cm_id_priv->id.local_id;
+	lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+	/* todo: need remote CM response timeout */
+	cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+	lap_msg->alt_local_lid = alternate_path->slid;
+	lap_msg->alt_remote_lid = alternate_path->dlid;
+	lap_msg->alt_local_gid = alternate_path->sgid;
+	lap_msg->alt_remote_gid = alternate_path->dgid;
+	cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+	cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+	lap_msg->alt_hop_limit = alternate_path->hop_limit;
+	cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+	cm_lap_set_sl(lap_msg, alternate_path->sl);
+	cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+	cm_lap_set_local_ack_timeout(lap_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       alternate_path->packet_life_time));
+
+	if (private_data && private_data_len)
+		memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+	     cm_id->lap_state != IB_CM_LAP_IDLE)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto out;
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+		      alternate_path, private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_SENT;
+	cm_id_priv->msg = msg;
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
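+/*
+ * The LAP carries the alternate path from the sender's point of view, so
+ * the local/remote LIDs and GIDs swap sides when rebuilding the path
+ * record for the receiver.
+ */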
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+				    struct ib_sa_path_rec *path,
+				    struct cm_lap_msg *lap_msg)
+{
+	memset(path, 0, sizeof *path);
+	path->dgid = lap_msg->alt_local_gid;
+	path->sgid = lap_msg->alt_remote_gid;
+	path->dlid = lap_msg->alt_local_lid;
+	path->slid = lap_msg->alt_remote_lid;
+	path->flow_label = cm_lap_get_flow_label(lap_msg);
+	path->hop_limit = lap_msg->alt_hop_limit;
+	path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+	path->reversible = 1;
+	path->pkey = cm_id_priv->pkey;
+	path->sl = cm_lap_get_sl(lap_msg);
+	path->mtu_selector = IB_SA_EQ;
+	path->mtu = cm_id_priv->path_mtu;
+	path->rate_selector = IB_SA_EQ;
+	path->rate = cm_lap_get_packet_rate(lap_msg);
+	path->packet_life_time_selector = IB_SA_EQ;
+	path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+	path->packet_life_time -= (path->packet_life_time > 0);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_lap_msg *lap_msg;
+	struct ib_cm_lap_event_param *param;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	/* todo: verify LAP request and send reject APR if invalid. */
+	lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+				   lap_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	param = &work->cm_event.param.lap_rcvd;
+	param->alternate_path = &work->path[0];
+	cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+	work->cm_event.private_data = &lap_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+		goto unlock;
+
+	switch (cm_id_priv->id.lap_state) {
+	case IB_CM_LAP_UNINIT:
+	case IB_CM_LAP_IDLE:
+		break;
+	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_OTHER,
+			      cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+
+	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+	cm_id_priv->tid = lap_msg->hdr.tid;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_apr_status status,
+			  void *info,
+			  u8 info_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+	apr_msg->local_comm_id = cm_id_priv->id.local_id;
+	apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	apr_msg->ap_status = (u8) status;
+
+	if (info && info_length) {
+		apr_msg->info_length = info_length;
+		memcpy(apr_msg->info, info, info_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+	    (info && info_length > IB_CM_APR_INFO_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_RCVD &&
+	     cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+		      info, info_length, private_data, private_data_len);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_IDLE;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+static int cm_apr_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_apr_msg *apr_msg;
+	int ret;
+
+	apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+				   apr_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+	work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+	work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+	work->cm_event.private_data = &apr_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+	    (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+	     cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	cm_id_priv->msg = NULL;
+
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	timewait_info = (struct cm_timewait_info *)work;
+	spin_lock_irq(&cm.lock);
+	list_del(&timewait_info->list);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+				   timewait_info->work.remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+	    cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
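+/*
+ * The SIDR (Service ID Resolution) exchanges below resolve a service ID
+ * to a remote QPN and Q_Key without establishing a connection.
+ */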
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_req_param *param)
+{
+	cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+	sidr_req_msg->request_id = cm_id_priv->id.local_id;
+	sidr_req_msg->pkey = param->path->pkey;
+	sidr_req_msg->service_id = param->service_id;
+
+	if (param->private_data && param->private_data_len)
+		memcpy(sidr_req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (!param->path || (param->private_data &&
+	     param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+	if (ret)
+		goto out;
+
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	cm_id_priv->timeout_ms = param->timeout_ms;
+	if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+		printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, "
+		       "decreasing used timeout_ms\n", param->timeout_ms,
+		       cm_convert_to_ms(max_timeout));
+		cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+	}
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+			   param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state == IB_CM_IDLE)
+		ret = ib_post_send_mad(msg, NULL);
+	else
+		ret = -EINVAL;
+
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		goto out;
+	}
+	cm_id->state = IB_CM_SIDR_REQ_SENT;
+	cm_id_priv->msg = msg;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+				     struct ib_cm_id *listen_id)
+{
+	struct cm_sidr_req_msg *sidr_req_msg;
+	struct ib_cm_sidr_req_event_param *param;
+
+	sidr_req_msg = (struct cm_sidr_req_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.sidr_req_rcvd;
+	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
+	param->listen_id = listen_id;
+	param->port = work->port->port_num;
+	work->cm_event.private_data = &sidr_req_msg->private_data;
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	struct cm_sidr_req_msg *sidr_req_msg;
+	struct ib_wc *wc;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	/* Record SGID/SLID and request ID for lookup. */
+	sidr_req_msg = (struct cm_sidr_req_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	wc = work->mad_recv_wc->wc;
+	cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+	cm_id_priv->av.dgid.global.interface_id = 0;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+	cm_id_priv->tid = sidr_req_msg->hdr.tid;
+	atomic_inc(&cm_id_priv->work_count);
+
+	spin_lock_irq(&cm.lock);
+	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+	if (cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
+		goto out; /* Duplicate message. */
+	}
+	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+	cur_cm_id_priv = cm_find_listen(cm_id->device,
+					sidr_req_msg->service_id,
+					sidr_req_msg->private_data);
+	if (!cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+		goto out; /* No match. */
+	}
+	atomic_inc(&cur_cm_id_priv->refcount);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = cur_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = sidr_req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(cur_cm_id_priv);
+	return 0;
+out:
+	ib_destroy_cm_id(&cm_id_priv->id);
+	return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_rep_param *param)
+{
+	cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+			  cm_id_priv->tid);
+	sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+	sidr_rep_msg->status = param->status;
+	cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+	sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+	sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+	if (param->info && param->info_length)
+		memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+	if (param->private_data && param->private_data_len)
+		memcpy(sidr_rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+	    (param->private_data &&
+	     param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+			   param);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+	cm_id->state = IB_CM_IDLE;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	spin_lock_irqsave(&cm.lock, flags);
+	rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	spin_unlock_irqrestore(&cm.lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *sidr_rep_msg;
+	struct ib_cm_sidr_rep_event_param *param;
+
+	sidr_rep_msg = (struct cm_sidr_rep_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.sidr_rep_rcvd;
+	param->status = sidr_rep_msg->status;
+	param->qkey = be32_to_cpu(sidr_rep_msg->qkey);
+	param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg));
+	param->info = &sidr_rep_msg->info;
+	param->info_len = sidr_rep_msg->info_length;
+	work->cm_event.private_data = &sidr_rep_msg->private_data;
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *sidr_rep_msg;
+	struct cm_id_private *cm_id_priv;
+
+	sidr_rep_msg = (struct cm_sidr_rep_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	cm_format_sidr_rep_event(work);
+	cm_process_work(cm_id_priv, work);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+				  enum ib_wc_status wc_status)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_cm_event cm_event;
+	enum ib_cm_state state;
+	int ret;
+
+	memset(&cm_event, 0, sizeof cm_event);
+	cm_id_priv = msg->context[0];
+
+	/* Discard old sends or ones without a response. */
+	spin_lock_irq(&cm_id_priv->lock);
+	state = (enum ib_cm_state) (unsigned long) msg->context[1];
+	if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+		goto discard;
+
+	switch (state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REQ_ERROR;
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REP_ERROR;
+		break;
+	case IB_CM_DREQ_SENT:
+		cm_enter_timewait(cm_id_priv);
+		cm_event.event = IB_CM_DREQ_ERROR;
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id_priv->id.state = IB_CM_IDLE;
+		cm_event.event = IB_CM_SIDR_REQ_ERROR;
+		break;
+	default:
+		goto discard;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_event.param.send_status = wc_status;
+
+	/* No other events can occur on the cm_id at this point. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+	cm_free_msg(msg);
+	if (ret)
+		ib_destroy_cm_id(&cm_id_priv->id);
+	return;
+discard:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_free_msg(msg);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port;
+	u16 attr_index;
+
+	port = mad_agent->context;
+	attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+				  msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries)
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
+
+	switch (mad_send_wc->status) {
+	case IB_WC_SUCCESS:
+	case IB_WC_WR_FLUSH_ERR:
+		cm_free_msg(msg);
+		break;
+	default:
+		if (msg->context[0] && msg->context[1])
+			cm_process_send_error(msg, mad_send_wc->status);
+		else
+			cm_free_msg(msg);
+		break;
+	}
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct cm_work *work = container_of(_work, struct cm_work, work.work);
+	int ret;
+
+	switch (work->cm_event.event) {
+	case IB_CM_REQ_RECEIVED:
+		ret = cm_req_handler(work);
+		break;
+	case IB_CM_MRA_RECEIVED:
+		ret = cm_mra_handler(work);
+		break;
+	case IB_CM_REJ_RECEIVED:
+		ret = cm_rej_handler(work);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ret = cm_rep_handler(work);
+		break;
+	case IB_CM_RTU_RECEIVED:
+		ret = cm_rtu_handler(work);
+		break;
+	case IB_CM_USER_ESTABLISHED:
+		ret = cm_establish_handler(work);
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		ret = cm_dreq_handler(work);
+		break;
+	case IB_CM_DREP_RECEIVED:
+		ret = cm_drep_handler(work);
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		ret = cm_sidr_req_handler(work);
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ret = cm_sidr_rep_handler(work);
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ret = cm_lap_handler(work);
+		break;
+	case IB_CM_APR_RECEIVED:
+		ret = cm_apr_handler(work);
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		ret = cm_timewait_handler(work);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	if (ret)
+		cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+	unsigned long flags;
+	int ret = 0;
+
+	work = kmalloc(sizeof *work, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id->state) {
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_id->state = IB_CM_ESTABLISHED;
+		break;
+	case IB_CM_ESTABLISHED:
+		ret = -EISCONN;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (ret) {
+		kfree(work);
+		goto out;
+	}
+
+	/*
+	 * The CM worker thread may try to destroy the cm_id before it
+	 * can execute this work item.  To prevent potential deadlock,
+	 * we need to find the cm_id once we're in the context of the
+	 * worker thread, rather than holding a reference on it.
+	 */
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->local_id = cm_id->local_id;
+	work->remote_id = cm_id->remote_id;
+	work->mad_recv_wc = NULL;
+	work->cm_event.event = IB_CM_USER_ESTABLISHED;
+	queue_delayed_work(cm.wq, &work->work, 0);
+out:
+	return ret;
+}
+
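+/*
+ * Promote the previously loaded alternate path to the primary address
+ * vector, mirroring the QP's APM transition reported by
+ * IB_EVENT_PATH_MIG.
+ */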
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state == IB_CM_ESTABLISHED &&
+	    (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+	     cm_id->lap_state == IB_CM_LAP_IDLE)) {
+		cm_id->lap_state = IB_CM_LAP_IDLE;
+		cm_id_priv->av = cm_id_priv->alt_av;
+	} else
+		ret = -EINVAL;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+	int ret;
+
+	switch (event) {
+	case IB_EVENT_COMM_EST:
+		ret = cm_establish(cm_id);
+		break;
+	case IB_EVENT_PATH_MIG:
+		ret = cm_migrate(cm_id);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct cm_port *port = mad_agent->context;
+	struct cm_work *work;
+	enum ib_cm_event_type event;
+	u16 attr_id;
+	int paths = 0;
+
+	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+	case CM_REQ_ATTR_ID:
+		paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)->
+						    alt_local_lid != 0);
+		event = IB_CM_REQ_RECEIVED;
+		break;
+	case CM_MRA_ATTR_ID:
+		event = IB_CM_MRA_RECEIVED;
+		break;
+	case CM_REJ_ATTR_ID:
+		event = IB_CM_REJ_RECEIVED;
+		break;
+	case CM_REP_ATTR_ID:
+		event = IB_CM_REP_RECEIVED;
+		break;
+	case CM_RTU_ATTR_ID:
+		event = IB_CM_RTU_RECEIVED;
+		break;
+	case CM_DREQ_ATTR_ID:
+		event = IB_CM_DREQ_RECEIVED;
+		break;
+	case CM_DREP_ATTR_ID:
+		event = IB_CM_DREP_RECEIVED;
+		break;
+	case CM_SIDR_REQ_ATTR_ID:
+		event = IB_CM_SIDR_REQ_RECEIVED;
+		break;
+	case CM_SIDR_REP_ATTR_ID:
+		event = IB_CM_SIDR_REP_RECEIVED;
+		break;
+	case CM_LAP_ATTR_ID:
+		paths = 1;
+		event = IB_CM_LAP_RECEIVED;
+		break;
+	case CM_APR_ATTR_ID:
+		event = IB_CM_APR_RECEIVED;
+		break;
+	default:
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+	atomic_long_inc(&port->counter_group[CM_RECV].
+			counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
+		       GFP_KERNEL);
+	if (!work) {
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->cm_event.event = event;
+	work->mad_recv_wc = mad_recv_wc;
+	work->port = port;
+	queue_delayed_work(cm.wq, &work->work, 0);
+}
+
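+/*
+ * The next three helpers fill in struct ib_qp_attr for the
+ * INIT -> RTR -> RTS transitions based on the current connection state;
+ * consumers reach them through ib_cm_init_qp_attr() below.
+ */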
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+				struct ib_qp_attr *qp_attr,
+				int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+				IB_QP_PKEY_INDEX | IB_QP_PORT;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+		if (cm_id_priv->responder_resources)
+			qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+						    IB_ACCESS_REMOTE_ATOMIC;
+		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+		qp_attr->port_num = cm_id_priv->av.port->port_num;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+				IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+		qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+		qp_attr->path_mtu = cm_id_priv->path_mtu;
+		qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+		qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+		if (cm_id_priv->qp_type == IB_QPT_RC) {
+			*qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+					 IB_QP_MIN_RNR_TIMER;
+			qp_attr->max_dest_rd_atomic =
+					cm_id_priv->responder_resources;
+			qp_attr->min_rnr_timer = 0;
+		}
+		if (cm_id_priv->alt_av.ah_attr.dlid) {
+			*qp_attr_mask |= IB_QP_ALT_PATH;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+		}
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	/* Allow transition to RTS before sending REP */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+			*qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+			qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+			if (cm_id_priv->qp_type == IB_QPT_RC) {
+				*qp_attr_mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+						 IB_QP_RNR_RETRY |
+						 IB_QP_MAX_QP_RD_ATOMIC;
+				qp_attr->timeout = cm_id_priv->av.timeout;
+				qp_attr->retry_cnt = cm_id_priv->retry_count;
+				qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+				qp_attr->max_rd_atomic =
+					cm_id_priv->initiator_depth;
+			}
+			if (cm_id_priv->alt_av.ah_attr.dlid) {
+				*qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+				qp_attr->path_mig_state = IB_MIG_REARM;
+			}
+		} else {
+			*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+			qp_attr->path_mig_state = IB_MIG_REARM;
+		}
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+		ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTR:
+		ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTS:
+		ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
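+/*
+ * Minimal usage sketch (assuming a ULP-owned QP "qp" and a cm_id that is
+ * progressing through connection establishment): set qp_state, let the
+ * CM supply the remaining attributes, then apply them.
+ *
+ *	struct ib_qp_attr qp_attr;
+ *	int qp_attr_mask;
+ *
+ *	qp_attr.qp_state = IB_QPS_INIT;
+ *	if (!ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask))
+ *		ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ *	... repeat with IB_QPS_RTR, then IB_QPS_RTS ...
+ */
+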
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+	struct ib_device_attr attr;
+
+	if (ib_query_device(cm_dev->ib_device, &attr))
+		cm_dev->ack_delay = 0; /* acks will rely on packet life time */
+	else
+		cm_dev->ack_delay = attr.local_ca_ack_delay;
+}
+
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_group *group;
+	struct cm_counter_attribute *cm_attr;
+
+	group = container_of(obj, struct cm_counter_group, obj);
+	cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+	return sprintf(buf, "%ld\n",
+		       atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+	.sysfs_ops = &cm_counter_ops,
+	.default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+	struct cm_port *cm_port;
+
+	cm_port = container_of(obj, struct cm_port, port_obj);
+	kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj
+};
+
+struct class cm_class = {
+	.name    = "infiniband_cm",
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   &port->cm_dev->device->kobj,
+				   "%d", port->port_num);
+	if (ret) {
+		kfree(port);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   &port->port_obj,
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	while (i--)
+		kobject_put(&port->counter_group[i].obj);
+	kobject_put(&port->port_obj);
+	return ret;
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int i;
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++)
+		kobject_put(&port->counter_group[i].obj);
+
+	kobject_put(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_CM,
+		.mgmt_class_version = IB_CM_CLASS_VERSION
+	};
+	struct ib_port_modify port_modify = {
+		.set_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int ret;
+	u8 i;
+
+	if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+			 ib_device->phys_port_cnt, GFP_KERNEL);
+	if (!cm_dev)
+		return;
+
+	cm_dev->ib_device = ib_device;
+	cm_get_ack_delay(cm_dev);
+
+	cm_dev->device = device_create(&cm_class, &ib_device->dev,
+				       MKDEV(0, 0), NULL,
+				       "%s", ib_device->name);
+	if (!cm_dev->device) {
+		kfree(cm_dev);
+		return;
+	}
+
+	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto error1;
+
+		cm_dev->port[i-1] = port;
+		port->cm_dev = cm_dev;
+		port->port_num = i;
+
+		ret = cm_create_port_fs(port);
+		if (ret)
+			goto error1;
+
+		port->mad_agent = ib_register_mad_agent(ib_device, i,
+							IB_QPT_GSI,
+							&reg_req,
+							0,
+							cm_send_handler,
+							cm_recv_handler,
+							port);
+		if (IS_ERR(port->mad_agent))
+			goto error2;
+
+		ret = ib_modify_port(ib_device, i, 0, &port_modify);
+		if (ret)
+			goto error3;
+	}
+	ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_add_tail(&cm_dev->list, &cm.device_list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+	return;
+
+error3:
+	ib_unregister_mad_agent(port->mad_agent);
+error2:
+	cm_remove_port_fs(port);
+error1:
+	port_modify.set_port_cap_mask = 0;
+	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+	while (--i) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+static void cm_remove_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_port_modify port_modify = {
+		.clr_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int i;
+
+	cm_dev = ib_get_client_data(ib_device, &cm_client);
+	if (!cm_dev)
+		return;
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_del(&cm_dev->list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		flush_workqueue(cm.wq);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+	int ret;
+
+	memset(&cm, 0, sizeof cm);
+	INIT_LIST_HEAD(&cm.device_list);
+	rwlock_init(&cm.device_lock);
+	spin_lock_init(&cm.lock);
+	cm.listen_service_table = RB_ROOT;
+	cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+	cm.remote_id_table = RB_ROOT;
+	cm.remote_qp_table = RB_ROOT;
+	cm.remote_sidr_table = RB_ROOT;
+	idr_init(&cm.local_id_table);
+	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+	idr_pre_get(&cm.local_id_table, GFP_KERNEL);
+	INIT_LIST_HEAD(&cm.timewait_list);
+
+	ret = class_register(&cm_class);
+	if (ret)
+		return -ENOMEM;
+
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	ret = ib_register_client(&cm_client);
+	if (ret)
+		goto error2;
+
+	return 0;
+error2:
+	destroy_workqueue(cm.wq);
+error1:
+	class_unregister(&cm_class);
+	return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+	struct cm_timewait_info *timewait_info, *tmp;
+
+	spin_lock_irq(&cm.lock);
+	list_for_each_entry(timewait_info, &cm.timewait_list, list)
+		cancel_delayed_work(&timewait_info->work.work);
+	spin_unlock_irq(&cm.lock);
+
+	ib_unregister_client(&cm_client);
+	destroy_workqueue(cm.wq);
+
+	list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+		list_del(&timewait_info->list);
+		kfree(timewait_info);
+	}
+
+	class_unregister(&cm_class);
+	idr_destroy(&cm.local_id_table);
+}
+
+module_init_order(ib_cm_init, SI_ORDER_SECOND);
+module_exit_order(ib_cm_cleanup, SI_ORDER_FIRST);
+


Property changes on: trunk/sys/ofed/drivers/infiniband/core/cm.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/cm_msgs.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/cm_msgs.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/cm_msgs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION	2 /* IB specification 1.2 */
+
+#define CM_REQ_ATTR_ID		cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID		cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID		cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID		cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID		cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID		cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID		cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID	cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID	cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID		cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID		cpu_to_be16(0x001A)
+
+enum cm_msg_sequence {
+	CM_MSG_SEQUENCE_REQ,
+	CM_MSG_SEQUENCE_LAP,
+	CM_MSG_SEQUENCE_DREQ,
+	CM_MSG_SEQUENCE_SIDR
+};
+
+struct cm_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 rsvd4;
+	__be64 service_id;
+	__be64 local_ca_guid;
+	__be32 rsvd24;
+	__be32 local_qkey;
+	/* local QPN:24, responder resources:8 */
+	__be32 offset32;
+	/* local EECN:24, initiator depth:8 */
+	__be32 offset36;
+	/*
+	 * remote EECN:24, remote CM response timeout:5,
+	 * transport service type:2, end-to-end flow control:1
+	 */
+	__be32 offset40;
+	/* starting PSN:24, local CM response timeout:5, retry count:3 */
+	__be32 offset44;
+	__be16 pkey;
+	/* path MTU:4, RDC exists:1, RNR retry count:3. */
+	u8 offset50;
+	/* max CM Retries:4, SRQ:1, rsvd:3 */
+	u8 offset51;
+
+	__be16 primary_local_lid;
+	__be16 primary_remote_lid;
+	union ib_gid primary_local_gid;
+	union ib_gid primary_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 primary_offset88;
+	u8 primary_traffic_class;
+	u8 primary_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 primary_offset94;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 primary_offset95;
+
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 alt_offset132;
+	u8 alt_traffic_class;
+	u8 alt_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 alt_offset138;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 alt_offset139;
+
+	u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8);
+}
+
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+	req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					 (be32_to_cpu(req_msg->offset32) &
+					  0x000000FF));
+}
+
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+	return (u8) be32_to_cpu(req_msg->offset32);
+}
+
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+	req_msg->offset32 = cpu_to_be32(resp_res |
+					(be32_to_cpu(req_msg->offset32) &
+					 0xFFFFFF00));
+}
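+
+/*
+ * Worked example (hypothetical values): offset32 packs the local QPN in
+ * its upper 24 bits and the responder resources count in the low 8.
+ * With a QPN of 0x000ABC and resp_res of 4, the host-order value is
+ * 0x000ABC04; cm_req_get_local_qpn() shifts right by 8 to recover the
+ * QPN, and cm_req_get_resp_res() truncates to the low byte.  The
+ * remaining accessors below follow the same shift-and-mask pattern.
+ */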
+
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+	return (u8) be32_to_cpu(req_msg->offset36);
+}
+
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+					 u8 init_depth)
+{
+	req_msg->offset36 = cpu_to_be32(init_depth |
+					(be32_to_cpu(req_msg->offset36) &
+					 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+						  u8 resp_timeout)
+{
+	req_msg->offset40 = cpu_to_be32((resp_timeout << 3) |
+					 (be32_to_cpu(req_msg->offset40) &
+					  0xFFFFFF07));
+}
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+	u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1;
+	switch(transport_type) {
+	case 0: return IB_QPT_RC;
+	case 1: return IB_QPT_UC;
+	default: return 0;
+	}
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+				      enum ib_qp_type qp_type)
+{
+	switch(qp_type) {
+	case IB_QPT_UC:
+		req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+						  req_msg->offset40) &
+						   0xFFFFFFF9) | 0x2);
+		break;
+	default:
+		req_msg->offset40 = cpu_to_be32(be32_to_cpu(
+						 req_msg->offset40) &
+						  0xFFFFFFF9);
+	}
+}
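+
+/*
+ * Example: the transport service type lives in bits 2:1 of offset40, so
+ * cm_req_set_qp_type(msg, IB_QPT_UC) ORs in 0x2 and a subsequent
+ * cm_req_get_qp_type() reads (0x2 & 0x06) >> 1 == 1, i.e. IB_QPT_UC;
+ * the RC encoding is 0, and any other value falls to the default case.
+ */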
+
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+	return be32_to_cpu(req_msg->offset40) & 0x1;
+}
+
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+					u8 flow_ctrl)
+{
+	req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) |
+					 (be32_to_cpu(req_msg->offset40) &
+					  0xFFFFFFFE));
+}
+
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8);
+}
+
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+					   __be32 starting_psn)
+{
+	req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+			    (be32_to_cpu(req_msg->offset44) & 0x000000FF));
+}
+
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+						 u8 resp_timeout)
+{
+	req_msg->offset44 = cpu_to_be32((resp_timeout << 3) |
+			    (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->offset44) & 0x7);
+}
+
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+					  u8 retry_count)
+{
+	req_msg->offset44 = cpu_to_be32((retry_count & 0x7) |
+			    (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8));
+}
+
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset50 >> 4;
+}
+
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+	req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4));
+}
+
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset50 & 0x7;
+}
+
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+					      u8 rnr_retry_count)
+{
+	req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) |
+				  (rnr_retry_count & 0x7));
+}
+
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset51 >> 4;
+}
+
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+					     u8 retries)
+{
+	req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4));
+}
+
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+	return (req_msg->offset51 & 0x8) >> 3;
+}
+
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+	req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) |
+				  ((srq & 0x1) << 3));
+}
+
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12);
+}
+
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+						 __be32 flow_label)
+{
+	req_msg->primary_offset88 = cpu_to_be32(
+				    (be32_to_cpu(req_msg->primary_offset88) &
+				     0x00000FFF) |
+				     (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F);
+}
+
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+						  u8 rate)
+{
+	req_msg->primary_offset88 = cpu_to_be32(
+				    (be32_to_cpu(req_msg->primary_offset88) &
+				     0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->primary_offset94 >> 4);
+}
+
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) |
+					  (sl << 4));
+}
+
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->primary_offset94 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+						   u8 subnet_local)
+{
+	req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) |
+					  ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->primary_offset95 >> 3);
+}
+
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+							u8 local_ack_timeout)
+{
+	req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) |
+					  (local_ack_timeout << 3));
+}
+
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12);
+}
+
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+					     __be32 flow_label)
+{
+	req_msg->alt_offset132 = cpu_to_be32(
+				 (be32_to_cpu(req_msg->alt_offset132) &
+				  0x00000FFF) |
+				  (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F);
+}
+
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+					      u8 rate)
+{
+	req_msg->alt_offset132 = cpu_to_be32(
+				 (be32_to_cpu(req_msg->alt_offset132) &
+				  0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->alt_offset138 >> 4);
+}
+
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) |
+				       (sl << 4));
+}
+
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->alt_offset138 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+					       u8 subnet_local)
+{
+	req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) |
+				       ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->alt_offset139 >> 3);
+}
+
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+						    u8 local_ack_timeout)
+{
+	req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) |
+				       (local_ack_timeout << 3));
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+	CM_MSG_RESPONSE_REQ = 0x0,
+	CM_MSG_RESPONSE_REP = 0x1,
+	CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+struct cm_mra_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message MRAed:2, rsvd:6 */
+	u8 offset8;
+	/* service timeout:5, rsvd:3 */
+	u8 offset9;
+
+	u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+	return (u8) (mra_msg->offset8 >> 6);
+}
+
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+	mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+	return (u8) (mra_msg->offset9 >> 3);
+}
+
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+					      u8 service_timeout)
+{
+	mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) |
+				 (service_timeout << 3));
+}
+
+struct cm_rej_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message REJected:2, rsvd:6 */
+	u8 offset8;
+	/* reject info length:7, rsvd:1. */
+	u8 offset9;
+	__be16 reason;
+	u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+	u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+	return (u8) (rej_msg->offset8 >> 6);
+}
+
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+	rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+	return (u8) (rej_msg->offset9 >> 1);
+}
+
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+					      u8 len)
+{
+	rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1));
+}
+
+struct cm_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	__be32 local_qkey;
+	/* local QPN:24, rsvd:8 */
+	__be32 offset12;
+	/* local EECN:24, rsvd:8 */
+	__be32 offset16;
+	/* starting PSN:24 rsvd:8 */
+	__be32 offset20;
+	u8 resp_resources;
+	u8 initiator_depth;
+	/* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+	u8 offset26;
+	/* RNR retry count:3, SRQ:1, rsvd:5 */
+	u8 offset27;
+	__be64 local_ca_guid;
+
+	u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8);
+}
+
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+	rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+			    (be32_to_cpu(rep_msg->offset12) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
+}
+
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+					   __be32 starting_psn)
+{
+	rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+			    (be32_to_cpu(rep_msg->offset20) & 0x000000FF));
+}
+
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset26 >> 3);
+}
+
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+					       u8 target_ack_delay)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) |
+				  (target_ack_delay << 3));
+}
+
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset26 & 0x06) >> 1);
+}
+
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) |
+				  ((failover & 0x3) << 1));
+}
+
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset26 & 0x01);
+}
+
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+					    u8 flow_ctrl)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) |
+				  (flow_ctrl & 0x1));
+}
+
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset27 >> 5);
+}
+
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+					      u8 rnr_retry_count)
+{
+	rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) |
+				  (rnr_retry_count << 5));
+}
+
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset27 >> 4) & 0x1);
+}
+
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+	rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) |
+				  ((srq & 0x1) << 4));
+}
+
+struct cm_rtu_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_dreq_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* remote QPN/EECN:24, rsvd:8 */
+	__be32 offset8;
+
+	u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+	return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8);
+}
+
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+	dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+			    (be32_to_cpu(dreq_msg->offset8) & 0x000000FF));
+}
+
+struct cm_drep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_lap_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	__be32 rsvd8;
+	/* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+	__be32 offset12;
+	__be32 rsvd16;
+
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:4, traffic class:8 */
+	__be32 offset56;
+	u8 alt_hop_limit;
+	/* rsvd:2, packet rate:6 */
+	u8 offset61;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 offset62;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 offset63;
+
+	u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+	lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					 (be32_to_cpu(lap_msg->offset12) &
+					  0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+	return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+						  u8 resp_timeout)
+{
+	lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+					 (be32_to_cpu(lap_msg->offset12) &
+					  0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+					 __be32 flow_label)
+{
+	lap_msg->offset56 = cpu_to_be32(
+				 (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+				 (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+	return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+					    u8 traffic_class)
+{
+	lap_msg->offset56 = cpu_to_be32(traffic_class |
+					 (be32_to_cpu(lap_msg->offset56) &
+					  0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+					  u8 packet_rate)
+{
+	lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+	lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+	return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+					   u8 subnet_local)
+{
+	lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+			     (lap_msg->offset62 & 0xF7);
+}
+
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+						u8 local_ack_timeout)
+{
+	lap_msg->offset63 = (local_ack_timeout << 3) |
+			    (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 info_length;
+	u8 ap_status;
+	u8 info[IB_CM_APR_INFO_LENGTH];
+
+	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	__be16 pkey;
+	__be16 rsvd;
+	__be64 service_id;
+
+	u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	u8 status;
+	u8 info_length;
+	__be16 rsvd;
+	/* QPN:24, rsvd:8 */
+	__be32 offset8;
+	__be64 service_id;
+	__be32 qkey;
+	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+				       __be32 qpn)
+{
+	sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					(be32_to_cpu(sidr_rep_msg->offset8) &
+					 0x000000FF));
+}
+
+#endif /* CM_MSGS_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/cm_msgs.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/cma.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/cma.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/cma.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,3420 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int tavor_quirk = 0;
+module_param_named(tavor_quirk, tavor_quirk, int, 0644);
+MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
+
+int unify_tcp_port_space = 1;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+		 "space allocation (default=1)");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define IBOE_PACKET_LIFETIME 18
+
+static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+module_param_named(cma_response_timeout, cma_response_timeout, int, 0644);
+MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20");
+
+static int def_prec2sl = 3;
+module_param_named(def_prec2sl, def_prec2sl, int, 0644);
+MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+	.name   = "cma",
+	.add    = cma_add_one,
+	.remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static DEFINE_IDR(sdp_ps);
+static DEFINE_IDR(tcp_ps);
+static DEFINE_IDR(udp_ps);
+static DEFINE_IDR(ipoib_ps);
+#if defined(INET)
+static int next_port;
+#endif
+
+struct cma_device {
+	struct list_head	list;
+	struct ib_device	*device;
+	struct completion	comp;
+	atomic_t		refcount;
+	struct list_head	id_list;
+};
+
+enum cma_state {
+	CMA_IDLE,
+	CMA_ADDR_QUERY,
+	CMA_ADDR_RESOLVED,
+	CMA_ROUTE_QUERY,
+	CMA_ROUTE_RESOLVED,
+	CMA_CONNECT,
+	CMA_DISCONNECT,
+	CMA_ADDR_BOUND,
+	CMA_LISTEN,
+	CMA_DEVICE_REMOVAL,
+	CMA_DESTROYING
+};
+
+struct rdma_bind_list {
+	struct idr		*ps;
+	struct hlist_head	owners;
+	unsigned short		port;
+};
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+	struct rdma_cm_id	id;
+
+	struct rdma_bind_list	*bind_list;
+	struct socket		*sock;
+	struct hlist_node	node;
+	struct list_head	list; /* listen_any_list or cma_device.list */
+	struct list_head	listen_list; /* per device listens */
+	struct cma_device	*cma_dev;
+	struct list_head	mc_list;
+
+	int			internal_id;
+	enum cma_state		state;
+	spinlock_t		lock;
+	struct mutex		qp_mutex;
+
+	struct completion	comp;
+	atomic_t		refcount;
+	struct mutex		handler_mutex;
+
+	int			backlog;
+	int			timeout_ms;
+	struct ib_sa_query	*query;
+	int			query_id;
+	union {
+		struct ib_cm_id	*ib;
+		struct iw_cm_id	*iw;
+	} cm_id;
+
+	u32			seq_num;
+	u32			qkey;
+	u32			qp_num;
+	u8			srq;
+	u8			tos;
+};
+
+struct cma_multicast {
+	struct rdma_id_private *id_priv;
+	union {
+		struct ib_sa_multicast *ib;
+	} multicast;
+	struct list_head	list;
+	void			*context;
+	struct sockaddr_storage	addr;
+	struct kref		mcref;
+};
+
+struct cma_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	enum cma_state		old_state;
+	enum cma_state		new_state;
+	struct rdma_cm_event	event;
+};
+
+struct cma_ndev_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	struct rdma_cm_event	event;
+};
+
+struct iboe_mcast_work {
+	struct work_struct	 work;
+	struct rdma_id_private	*id;
+	struct cma_multicast	*mc;
+};
+
+union cma_ip_addr {
+	struct in6_addr ip6;
+	struct {
+		__be32 pad[3];
+		__be32 addr;
+	} ip4;
+};
+
+struct cma_hdr {
+	u8 cma_version;
+	u8 ip_version;	/* IP version: 7:4 */
+	__be16 port;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+struct sdp_hh {
+	u8 bsdh[16];
+	u8 sdp_version; /* Major version: 7:4 */
+	u8 ip_version;	/* IP version: 7:4 */
+	u8 sdp_specific1[10];
+	__be16 port;
+	__be16 sdp_specific2;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+struct sdp_hah {
+	u8 bsdh[16];
+	u8 sdp_version;
+};
+
+#define CMA_VERSION 0x00
+#define SDP_MAJ_VERSION 0x2
+
+static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	ret = (id_priv->state == comp);
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+			 enum cma_state comp, enum cma_state exch)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if ((ret = (id_priv->state == comp)))
+		id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
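+
+/*
+ * cma_comp_exch() is the state machine's compare-and-swap: the id moves
+ * to "exch" only if it is still in "comp", all under id_priv->lock.
+ * cma_ib_handler() relies on this to make the CMA_CONNECT ->
+ * CMA_DISCONNECT transition atomic against concurrent events.
+ */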
+
+static enum cma_state cma_exch(struct rdma_id_private *id_priv,
+			       enum cma_state exch)
+{
+	unsigned long flags;
+	enum cma_state old;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	old = id_priv->state;
+	id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+	return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+	hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
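+
+/*
+ * Example: cma_hdr.ip_version keeps the IP version in bits 7:4, so
+ * cma_set_ip_ver(hdr, 4) leaves 0x4 in the high nibble and
+ * cma_get_ip_ver() shifts right by 4 to read it back.  The SDP header
+ * helpers below use the same encoding.
+ */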
+
+static inline u8 sdp_get_majv(u8 sdp_version)
+{
+	return sdp_version >> 4;
+}
+
+static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
+{
+	return hh->ip_version >> 4;
+}
+
+static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+{
+	hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
+}
+
+static inline int cma_is_ud_ps(enum rdma_port_space ps)
+{
+	return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	atomic_inc(&cma_dev->refcount);
+	id_priv->cma_dev = cma_dev;
+	id_priv->id.device = cma_dev->device;
+	id_priv->id.route.addr.dev_addr.transport =
+		rdma_node_get_transport(cma_dev->device->node_type);
+	list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+	if (atomic_dec_and_test(&cma_dev->refcount))
+		complete(&cma_dev->comp);
+}
+
+static inline void release_mc(struct kref *kref)
+{
+	struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
+
+	kfree(mc->multicast.ib);
+	kfree(mc);
+}
+
+static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+{
+	list_del(&id_priv->list);
+	cma_deref_dev(id_priv->cma_dev);
+	id_priv->cma_dev = NULL;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv)
+{
+	struct ib_sa_mcmember_rec rec;
+	int ret = 0;
+
+	if (id_priv->qkey)
+		return 0;
+
+	switch (id_priv->id.ps) {
+	case RDMA_PS_UDP:
+		id_priv->qkey = RDMA_UDP_QKEY;
+		break;
+	case RDMA_PS_IPOIB:
+		ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+		ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+					     id_priv->id.port_num, &rec.mgid,
+					     &rec);
+		if (!ret)
+			id_priv->qkey = be32_to_cpu(rec.qkey);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
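+
+/*
+ * Qkey policy above: RDMA_PS_UDP ids always use the well-known
+ * RDMA_UDP_QKEY, while RDMA_PS_IPOIB ids must match the qkey used by
+ * the IPoIB interface, so it is looked up from the SA's MCMember
+ * record for this port.
+ */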
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_device *cma_dev;
+	union ib_gid gid;
+	int ret = -ENODEV;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND) {
+		iboe_addr_get_sgid(dev_addr, &gid);
+		list_for_each_entry(cma_dev, &dev_list, list) {
+			ret = ib_find_cached_gid(cma_dev->device, &gid,
+						 &id_priv->id.port_num, NULL);
+			if (!ret)
+				goto out;
+		}
+	}
+
+	memcpy(&gid, dev_addr->src_dev_addr +
+	       rdma_addr_gid_offset(dev_addr), sizeof gid);
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		ret = ib_find_cached_gid(cma_dev->device, &gid,
+					 &id_priv->id.port_num, NULL);
+		if (!ret)
+			break;
+	}
+
+out:
+	if (!ret)
+		cma_attach_to_dev(id_priv, cma_dev);
+
+	return ret;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+	if (atomic_dec_and_test(&id_priv->refcount))
+		complete(&id_priv->comp);
+}
+
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+			      enum cma_state state)
+{
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != state) {
+		mutex_unlock(&id_priv->handler_mutex);
+		return -EINVAL;
+	}
+	return 0;
+}
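+
+/*
+ * Note: on success cma_disable_callback() returns with handler_mutex
+ * still held; the event handler then runs under it, which is why every
+ * callback below ends by dropping the mutex explicitly.
+ */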
+
+static int cma_has_cm_dev(struct rdma_id_private *id_priv)
+{
+	return (id_priv->id.device && id_priv->cm_id.ib);
+}
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+	if (!id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	id_priv->state = CMA_IDLE;
+	id_priv->id.context = context;
+	id_priv->id.event_handler = event_handler;
+	id_priv->id.ps = ps;
+	spin_lock_init(&id_priv->lock);
+	mutex_init(&id_priv->qp_mutex);
+	init_completion(&id_priv->comp);
+	atomic_set(&id_priv->refcount, 1);
+	mutex_init(&id_priv->handler_mutex);
+	INIT_LIST_HEAD(&id_priv->listen_list);
+	INIT_LIST_HEAD(&id_priv->mc_list);
+	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+
+	return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		return ret;
+
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret)
+		return ret;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+	if (ret)
+		return ret;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+	return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		return ret;
+
+	return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct rdma_id_private *id_priv;
+	struct ib_qp *qp;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id->device != pd->device)
+		return -EINVAL;
+
+	qp = ib_create_qp(pd, qp_init_attr);
+	if (IS_ERR(qp))
+		return PTR_ERR(qp);
+
+	if (cma_is_ud_ps(id_priv->id.ps))
+		ret = cma_init_ud_qp(id_priv, qp);
+	else
+		ret = cma_init_conn_qp(id_priv, qp);
+	if (ret)
+		goto err;
+
+	id->qp = qp;
+	id_priv->qp_num = qp->qp_num;
+	id_priv->srq = (qp->srq != NULL);
+	return 0;
+err:
+	ib_destroy_qp(qp);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	mutex_lock(&id_priv->qp_mutex);
+	ib_destroy_qp(id_priv->id.qp);
+	id_priv->id.qp = NULL;
+	mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Need to update QP attributes from default values. */
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+	if (ret)
+		goto out;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	if (conn_param)
+		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	if (conn_param)
+		qp_attr.max_rd_atomic = conn_param->initiator_depth;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	qp_attr.qp_state = IB_QPS_ERR;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+			       struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int ret;
+	u16 pkey;
+
+	if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		pkey = ib_addr_get_pkey(dev_addr);
+	else
+		pkey = 0xffff;
+
+	ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+				  pkey, &qp_attr->pkey_index);
+	if (ret)
+		return ret;
+
+	qp_attr->port_num = id_priv->id.port_num;
+	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	if (cma_is_ud_ps(id_priv->id.ps)) {
+		ret = cma_set_qkey(id_priv);
+		if (ret)
+			return ret;
+
+		qp_attr->qkey = id_priv->qkey;
+		*qp_attr_mask |= IB_QP_QKEY;
+	} else {
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+	}
+	return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct rdma_id_private *id_priv;
+	int ret = 0;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
+			ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+		else
+			ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+						 qp_attr_mask);
+		if (qp_attr->qp_state == IB_QPS_RTR)
+			qp_attr->rq_psn = id_priv->seq_num;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (!id_priv->cm_id.iw) {
+			qp_attr->qp_access_flags = 0;
+			*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		} else
+			ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+						 qp_attr_mask);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+	struct in6_addr *ip6;
+
+	if (addr->sa_family == AF_INET)
+		return ipv4_is_zeronet(
+			((struct sockaddr_in *)addr)->sin_addr.s_addr);
+	else {
+		ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
+		return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
+			ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
+	}
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET)
+		return ipv4_is_loopback(
+			((struct sockaddr_in *) addr)->sin_addr.s_addr);
+	else
+		return ipv6_addr_loopback(
+			&((struct sockaddr_in6 *) addr)->sin6_addr);
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+	return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static inline __be16 cma_port(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET)
+		return ((struct sockaddr_in *) addr)->sin_port;
+	else
+		return ((struct sockaddr_in6 *) addr)->sin6_port;
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+	return !cma_port(addr);
+}
+
+static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
+			    u8 *ip_ver, __be16 *port,
+			    union cma_ip_addr **src, union cma_ip_addr **dst)
+{
+	switch (ps) {
+	case RDMA_PS_SDP:
+		if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
+		    SDP_MAJ_VERSION)
+			return -EINVAL;
+
+		*ip_ver	= sdp_get_ip_ver(hdr);
+		*port	= ((struct sdp_hh *) hdr)->port;
+		*src	= &((struct sdp_hh *) hdr)->src_addr;
+		*dst	= &((struct sdp_hh *) hdr)->dst_addr;
+		break;
+	default:
+		if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
+			return -EINVAL;
+
+		*ip_ver	= cma_get_ip_ver(hdr);
+		*port	= ((struct cma_hdr *) hdr)->port;
+		*src	= &((struct cma_hdr *) hdr)->src_addr;
+		*dst	= &((struct cma_hdr *) hdr)->dst_addr;
+		break;
+	}
+
+	if (*ip_ver != 4 && *ip_ver != 6)
+		return -EINVAL;
+	return 0;
+}
+
+static void cma_save_net_info(struct rdma_addr *addr,
+			      struct rdma_addr *listen_addr,
+			      u8 ip_ver, __be16 port,
+			      union cma_ip_addr *src, union cma_ip_addr *dst)
+{
+	struct sockaddr_in *listen4, *ip4;
+	struct sockaddr_in6 *listen6, *ip6;
+
+	switch (ip_ver) {
+	case 4:
+		listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
+		ip4 = (struct sockaddr_in *) &addr->src_addr;
+		ip4->sin_family = listen4->sin_family;
+		ip4->sin_addr.s_addr = dst->ip4.addr;
+		ip4->sin_port = listen4->sin_port;
+
+		ip4 = (struct sockaddr_in *) &addr->dst_addr;
+		ip4->sin_family = listen4->sin_family;
+		ip4->sin_addr.s_addr = src->ip4.addr;
+		ip4->sin_port = port;
+		break;
+	case 6:
+		listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
+		ip6 = (struct sockaddr_in6 *) &addr->src_addr;
+		ip6->sin6_family = listen6->sin6_family;
+		ip6->sin6_addr = dst->ip6;
+		ip6->sin6_port = listen6->sin6_port;
+
+		ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
+		ip6->sin6_family = listen6->sin6_family;
+		ip6->sin6_addr = src->ip6;
+		ip6->sin6_port = port;
+		break;
+	default:
+		break;
+	}
+}
+
+static inline int cma_user_data_offset(enum rdma_port_space ps)
+{
+	switch (ps) {
+	case RDMA_PS_SDP:
+		return 0;
+	default:
+		return sizeof(struct cma_hdr);
+	}
+}
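+
+/*
+ * Example: for RDMA_PS_TCP the CMA prepends a struct cma_hdr to the CM
+ * private data, so user payload begins sizeof(struct cma_hdr) bytes in;
+ * SDP carries its own header format, hence offset 0.
+ */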
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+	switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		if (id_priv->query)
+			ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+		break;
+	default:
+		break;
+	}
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+	struct rdma_id_private *dev_id_priv;
+
+	/*
+	 * Remove from listen_any_list to prevent added devices from spawning
+	 * additional listen requests.
+	 */
+	mutex_lock(&lock);
+	list_del(&id_priv->list);
+
+	while (!list_empty(&id_priv->listen_list)) {
+		dev_id_priv = list_entry(id_priv->listen_list.next,
+					 struct rdma_id_private, listen_list);
+		/* sync with device removal to avoid duplicate destruction */
+		list_del_init(&dev_id_priv->list);
+		list_del(&dev_id_priv->listen_list);
+		mutex_unlock(&lock);
+
+		rdma_destroy_id(&dev_id_priv->id);
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+				 enum cma_state state)
+{
+	switch (state) {
+	case CMA_ADDR_QUERY:
+		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+		break;
+	case CMA_ROUTE_QUERY:
+		cma_cancel_route(id_priv);
+		break;
+	case CMA_LISTEN:
+		if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
+				&& !id_priv->cma_dev)
+			cma_cancel_listens(id_priv);
+		break;
+	default:
+		break;
+	}
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+	if (!bind_list)
+		return;
+
+	mutex_lock(&lock);
+	hlist_del(&id_priv->node);
+	if (hlist_empty(&bind_list->owners)) {
+		idr_remove(bind_list->ps, bind_list->port);
+		kfree(bind_list);
+	}
+	mutex_unlock(&lock);
+	if (id_priv->sock)
+		sock_release(id_priv->sock);
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+	struct cma_multicast *mc;
+
+	while (!list_empty(&id_priv->mc_list)) {
+		mc = container_of(id_priv->mc_list.next,
+				  struct cma_multicast, list);
+		list_del(&mc->list);
+		switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ib_sa_free_multicast(mc->multicast.ib);
+			kfree(mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			kref_put(&mc->mcref, release_mc);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	enum cma_state state;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	state = cma_exch(id_priv, CMA_DESTROYING);
+	cma_cancel_operation(id_priv, state);
+
+	mutex_lock(&lock);
+	if (id_priv->cma_dev) {
+		mutex_unlock(&lock);
+		switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+		case RDMA_TRANSPORT_IB:
+			if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+				ib_destroy_cm_id(id_priv->cm_id.ib);
+			break;
+		case RDMA_TRANSPORT_IWARP:
+			if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+				iw_destroy_cm_id(id_priv->cm_id.iw);
+			break;
+		default:
+			break;
+		}
+		cma_leave_mc_groups(id_priv);
+		mutex_lock(&lock);
+		cma_detach_from_dev(id_priv);
+	}
+	mutex_unlock(&lock);
+
+	cma_release_port(id_priv);
+	cma_deref_id(id_priv);
+	wait_for_completion(&id_priv->comp);
+
+	if (id_priv->internal_id)
+		cma_deref_id(id_priv->id.context);
+
+	kfree(id_priv->id.route.path_rec);
+	kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, NULL);
+	if (ret)
+		goto reject;
+
+	ret = cma_modify_qp_rts(id_priv, NULL);
+	if (ret)
+		goto reject;
+
+	ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+	if (ret)
+		goto reject;
+
+	return 0;
+reject:
+	cma_modify_qp_err(id_priv);
+	ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+		       NULL, 0, NULL, 0);
+	return ret;
+}
+
+static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
+{
+	if (id_priv->id.ps == RDMA_PS_SDP &&
+	    sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
+	    SDP_MAJ_VERSION)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_rep_event_param *rep_data,
+				   void *private_data)
+{
+	event->param.conn.private_data = private_data;
+	event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+	event->param.conn.responder_resources = rep_data->responder_resources;
+	event->param.conn.initiator_depth = rep_data->initiator_depth;
+	event->param.conn.flow_control = rep_data->flow_control;
+	event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+	event->param.conn.srq = rep_data->srq;
+	event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	int ret = 0;
+
+	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_CONNECT)) ||
+	    (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_DISCONNECT)))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_REQ_ERROR:
+	case IB_CM_REP_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_REP_RECEIVED:
+		event.status = cma_verify_rep(id_priv, ib_event->private_data);
+		if (event.status)
+			event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+		else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+			event.status = cma_rep_recv(id_priv);
+			event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+						     RDMA_CM_EVENT_ESTABLISHED;
+		} else
+			event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+		cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+				       ib_event->private_data);
+		break;
+	case IB_CM_RTU_RECEIVED:
+	case IB_CM_USER_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		break;
+	case IB_CM_DREQ_ERROR:
+		event.status = -ETIMEDOUT; /* fall through */
+	case IB_CM_DREQ_RECEIVED:
+	case IB_CM_DREP_RECEIVED:
+		if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+			goto out;
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		/* ignore event */
+		goto out;
+	case IB_CM_REJ_RECEIVED:
+		cma_modify_qp_err(id_priv);
+		event.status = ib_event->param.rej_rcvd.reason;
+		event.event = RDMA_CM_EVENT_REJECTED;
+		event.param.conn.private_data = ib_event->private_data;
+		event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+					       struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	struct rdma_route *rt;
+	union cma_ip_addr *src, *dst;
+	__be16 port;
+	u8 ip_ver;
+	int ret;
+
+	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+			     &ip_ver, &port, &src, &dst))
+		goto err;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps);
+	if (IS_ERR(id))
+		goto err;
+
+	cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+			  ip_ver, port, src, dst);
+
+	rt = &id->route;
+	rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+	rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
+			       GFP_KERNEL);
+	if (!rt->path_rec)
+		goto destroy_id;
+
+	rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+	if (rt->num_paths == 2)
+		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+	if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) {
+		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+		ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey);
+	} else {
+		ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr,
+					&rt->addr.dev_addr);
+		if (ret)
+			goto destroy_id;
+	}
+	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->state = CMA_CONNECT;
+	return id_priv;
+
+destroy_id:
+	rdma_destroy_id(id);
+err:
+	return NULL;
+}
+
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+					      struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	union cma_ip_addr *src, *dst;
+	__be16 port;
+	u8 ip_ver;
+	int ret;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps);
+	if (IS_ERR(id))
+		return NULL;
+
+	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+			     &ip_ver, &port, &src, &dst))
+		goto err;
+
+	cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+			  ip_ver, port, src, dst);
+
+	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
+		ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
+					&id->route.addr.dev_addr);
+		if (ret)
+			goto err;
+	}
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->state = CMA_CONNECT;
+	return id_priv;
+err:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_req_event_param *req_data,
+				   void *private_data, int offset)
+{
+	event->param.conn.private_data = private_data + offset;
+	event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+	event->param.conn.responder_resources = req_data->responder_resources;
+	event->param.conn.initiator_depth = req_data->initiator_depth;
+	event->param.conn.flow_control = req_data->flow_control;
+	event->param.conn.retry_count = req_data->retry_count;
+	event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+	event->param.conn.srq = req_data->srq;
+	event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *listen_id, *conn_id;
+	struct rdma_cm_event event;
+	int offset, ret;
+
+	listen_id = cm_id->context;
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
+		return -ECONNABORTED;
+
+	memset(&event, 0, sizeof event);
+	offset = cma_user_data_offset(listen_id->id.ps);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	if (cma_is_ud_ps(listen_id->id.ps)) {
+		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+		event.param.ud.private_data = ib_event->private_data + offset;
+		event.param.ud.private_data_len =
+				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+	} else {
+		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+				       ib_event->private_data, offset);
+	}
+	if (!conn_id) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock(&lock);
+	ret = cma_acquire_dev(conn_id);
+	mutex_unlock(&lock);
+	if (ret)
+		goto release_conn_id;
+
+	conn_id->cm_id.ib = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_ib_handler;
+
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (!ret) {
+		/*
+		 * Acquire mutex to prevent user executing rdma_destroy_id()
+		 * while we're accessing the cm_id.
+		 */
+		mutex_lock(&lock);
+		if (cma_comp(conn_id, CMA_CONNECT) &&
+		    !cma_is_ud_ps(conn_id->id.ps))
+			ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+		mutex_unlock(&lock);
+		mutex_unlock(&conn_id->handler_mutex);
+		goto out;
+	}
+
+	/* Destroy the CM ID by returning a non-zero value. */
+	conn_id->cm_id.ib = NULL;
+
+release_conn_id:
+	cma_exch(conn_id, CMA_DESTROYING);
+	mutex_unlock(&conn_id->handler_mutex);
+	rdma_destroy_id(&conn_id->id);
+
+out:
+	mutex_unlock(&listen_id->handler_mutex);
+	return ret;
+}
+
+static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
+{
+	return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr)));
+}
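+
+/*
+ * Example (assuming the conventional RDMA_PS_TCP value of 0x0106): a
+ * listen on TCP port 5000 (0x1388) yields service id 0x01061388,
+ * stored in network byte order, so IB CM listens are disambiguated per
+ * port space and port.
+ */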
+
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+				 struct ib_cm_compare_data *compare)
+{
+	struct cma_hdr *cma_data, *cma_mask;
+	struct sdp_hh *sdp_data, *sdp_mask;
+	__be32 ip4_addr;
+#ifdef INET6
+	struct in6_addr ip6_addr;
+#endif
+
+	memset(compare, 0, sizeof *compare);
+	cma_data = (void *) compare->data;
+	cma_mask = (void *) compare->mask;
+	sdp_data = (void *) compare->data;
+	sdp_mask = (void *) compare->mask;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+		if (ps == RDMA_PS_SDP) {
+			sdp_set_ip_ver(sdp_data, 4);
+			sdp_set_ip_ver(sdp_mask, 0xF);
+			sdp_data->dst_addr.ip4.addr = ip4_addr;
+			sdp_mask->dst_addr.ip4.addr = htonl(~0);
+		} else {
+			cma_set_ip_ver(cma_data, 4);
+			cma_set_ip_ver(cma_mask, 0xF);
+			cma_data->dst_addr.ip4.addr = ip4_addr;
+			cma_mask->dst_addr.ip4.addr = htonl(~0);
+		}
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+		if (ps == RDMA_PS_SDP) {
+			sdp_set_ip_ver(sdp_data, 6);
+			sdp_set_ip_ver(sdp_mask, 0xF);
+			sdp_data->dst_addr.ip6 = ip6_addr;
+			memset(&sdp_mask->dst_addr.ip6, 0xFF,
+			       sizeof sdp_mask->dst_addr.ip6);
+		} else {
+			cma_set_ip_ver(cma_data, 6);
+			cma_set_ip_ver(cma_mask, 0xF);
+			cma_data->dst_addr.ip6 = ip6_addr;
+			memset(&cma_mask->dst_addr.ip6, 0xFF,
+			       sizeof cma_mask->dst_addr.ip6);
+		}
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+	struct rdma_id_private *id_priv = iw_id->context;
+	struct rdma_cm_event event;
+	struct sockaddr_in *sin;
+	int ret = 0;
+
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CLOSE:
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+		*sin = iw_event->local_addr;
+		sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+		*sin = iw_event->remote_addr;
+		switch ((int)iw_event->status) {
+		case 0:
+			event.event = RDMA_CM_EVENT_ESTABLISHED;
+			break;
+		case -ECONNRESET:
+		case -ECONNREFUSED:
+			event.event = RDMA_CM_EVENT_REJECTED;
+			break;
+		case -ETIMEDOUT:
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			break;
+		default:
+			event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+			break;
+		}
+		break;
+	case IW_CM_EVENT_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		break;
+	default:
+		BUG_ON(1);
+	}
+
+	event.status = iw_event->status;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.iw = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+			       struct iw_cm_event *iw_event)
+{
+	struct rdma_cm_id *new_cm_id;
+	struct rdma_id_private *listen_id, *conn_id;
+	struct sockaddr_in *sin;
+	struct net_device *dev = NULL;
+	struct rdma_cm_event event;
+	int ret;
+	struct ib_device_attr attr;
+
+	listen_id = cm_id->context;
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
+		return -ECONNABORTED;
+
+	/* Create a new RDMA id for the new IW CM ID */
+	new_cm_id = rdma_create_id(listen_id->id.event_handler,
+				   listen_id->id.context,
+				   RDMA_PS_TCP);
+	if (IS_ERR(new_cm_id)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	conn_id->state = CMA_CONNECT;
+
+	dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr);
+	if (!dev) {
+		ret = -EADDRNOTAVAIL;
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+	ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	mutex_lock(&lock);
+	ret = cma_acquire_dev(conn_id);
+	mutex_unlock(&lock);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	conn_id->cm_id.iw = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_iw_handler;
+
+	sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
+	*sin = iw_event->local_addr;
+	sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
+	*sin = iw_event->remote_addr;
+
+	ret = ib_query_device(conn_id->id.device, &attr);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
+	event.param.conn.responder_resources = attr.max_qp_rd_atom;
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (ret) {
+		/* User wants to destroy the CM ID */
+		conn_id->cm_id.iw = NULL;
+		cma_exch(conn_id, CMA_DESTROYING);
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(&conn_id->id);
+		goto out;
+	}
+
+	mutex_unlock(&conn_id->handler_mutex);
+
+out:
+	if (dev)
+		dev_put(dev);
+	mutex_unlock(&listen_id->handler_mutex);
+	return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+	struct ib_cm_compare_data compare_data;
+	struct sockaddr *addr;
+	__be64 svc_id;
+	int ret;
+
+	id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
+					    id_priv);
+	if (IS_ERR(id_priv->cm_id.ib))
+		return PTR_ERR(id_priv->cm_id.ib);
+
+	addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	svc_id = cma_get_service_id(id_priv->id.ps, addr);
+	if (cma_any_addr(addr)) {
+		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+	} else {
+		cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+	}
+
+	if (ret) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+
+	return ret;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+	int ret;
+	struct sockaddr_in *sin;
+
+	id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device,
+					    id_priv->sock,
+					    iw_conn_req_handler,
+					    id_priv);
+	if (IS_ERR(id_priv->cm_id.iw))
+		return PTR_ERR(id_priv->cm_id.iw);
+
+	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+	id_priv->cm_id.iw->local_addr = *sin;
+
+	ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+	if (ret) {
+		iw_destroy_cm_id(id_priv->cm_id.iw);
+		id_priv->cm_id.iw = NULL;
+	}
+
+	return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+			      struct rdma_cm_event *event)
+{
+	struct rdma_id_private *id_priv = id->context;
+
+	id->context = id_priv->id.context;
+	id->event_handler = id_priv->id.event_handler;
+	return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	struct rdma_id_private *dev_id_priv;
+	struct rdma_cm_id *id;
+	int ret;
+
+	id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+	if (IS_ERR(id))
+		return;
+
+	dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+	dev_id_priv->state = CMA_ADDR_BOUND;
+	memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
+	       ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+
+	cma_attach_to_dev(dev_id_priv, cma_dev);
+	list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+	atomic_inc(&id_priv->refcount);
+	dev_id_priv->internal_id = 1;
+
+	ret = rdma_listen(id, id_priv->backlog);
+	if (ret)
+		printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+		       "listening on device %s\n", ret, cma_dev->device->name);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev;
+
+	mutex_lock(&lock);
+	list_add_tail(&id_priv->list, &listen_any_list);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		cma_listen_on_dev(id_priv, cma_dev);
+	mutex_unlock(&lock);
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id_priv->state == CMA_IDLE) {
+		((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
+		ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
+		if (ret)
+			return ret;
+	}
+
+	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+		return -EINVAL;
+
+	id_priv->backlog = backlog;
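+	/*
+	 * If the id is bound to a specific device, listen on it directly;
+	 * otherwise listen on every RDMA device present now or added later
+	 * (see cma_add_one).
+	 */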
+	if (id->device) {
+		switch (rdma_node_get_transport(id->device->node_type)) {
+		case RDMA_TRANSPORT_IB:
+			ret = cma_ib_listen(id_priv);
+			if (ret)
+				goto err;
+			break;
+		case RDMA_TRANSPORT_IWARP:
+			ret = cma_iw_listen(id_priv, backlog);
+			if (ret)
+				goto err;
+			break;
+		default:
+			ret = -ENOSYS;
+			goto err;
+		}
+	} else
+		cma_listen_on_all(id_priv);
+
+	return 0;
+err:
+	id_priv->backlog = 0;
+	cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+			      void *context)
+{
+	struct cma_work *work = context;
+	struct rdma_route *route;
+
+	route = &work->id->id.route;
+
+	if (!status) {
+		route->num_paths = 1;
+		*route->path_rec = *path_rec;
+	} else {
+		work->old_state = CMA_ROUTE_QUERY;
+		work->new_state = CMA_ADDR_RESOLVED;
+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+		work->event.status = status;
+	}
+
+	queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+			      struct cma_work *work)
+{
+	struct rdma_addr *addr = &id_priv->id.route.addr;
+	struct ib_sa_path_rec path_rec;
+	ib_sa_comp_mask comp_mask;
+	struct sockaddr_in6 *sin6;
+
+	memset(&path_rec, 0, sizeof path_rec);
+	rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
+	rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
+	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
+	path_rec.numb_path = 1;
+	path_rec.reversible = 1;
+	path_rec.service_id = cma_get_service_id(id_priv->id.ps,
+						 (struct sockaddr *) &addr->dst_addr);
+
+	comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+		    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+		    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	if (addr->src_addr.ss_family == AF_INET) {
+		path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+	} else {
+		sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+		path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+	}
+
+	if (tavor_quirk) {
+		path_rec.mtu_selector = IB_SA_LT;
+		path_rec.mtu = IB_MTU_2048;
+	}
+
+	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+					       id_priv->id.port_num, &path_rec,
+					       comp_mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
+
+	return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+	struct cma_work *work = container_of(_work, struct cma_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+		goto out;
+
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state == CMA_DESTROYING ||
+	    id_priv->state == CMA_DEVICE_REMOVAL)
+		goto out;
+
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct cma_work *work;
+	int ret;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = CMA_ROUTE_QUERY;
+	work->new_state = CMA_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+	route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	ret = cma_query_ib_route(id_priv, timeout_ms, work);
+	if (ret)
+		goto err2;
+
+	return 0;
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+err1:
+	kfree(work);
+	return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+		return -EINVAL;
+
+	id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+	if (!id->route.path_rec) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+	return 0;
+err:
+	cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct cma_work *work;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = CMA_ROUTE_QUERY;
+	work->new_state = CMA_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+}
+
+static u8 tos_to_sl(u8 tos)
+{
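+	/*
+	 * NB: the TOS argument is currently ignored; the SL is taken from
+	 * the global def_prec2sl setting, masked to the 3-bit SL field.
+	 */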
+	return def_prec2sl & 7;
+}
+
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct rdma_addr *addr = &route->addr;
+	struct cma_work *work;
+	int ret;
+	struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
+	struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
+	struct net_device *ndev = NULL;
+	u16 vid;
+
+	if (src_addr->sin_family != dst_addr->sin_family)
+		return -EINVAL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+
+	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	route->num_paths = 1;
+
+	if (addr->dev_addr.bound_dev_if)
+		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+	if (!ndev) {
+		ret = -ENODEV;
+		goto err2;
+	}
+
+	vid = rdma_vlan_dev_vlan_id(ndev);
+
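+	/*
+	 * RoCE has no SA to query; synthesize the path record directly from
+	 * the bound netdev's MAC/VLAN addressing and link parameters.
+	 */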
+	iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid);
+	iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid);
+
+	route->path_rec->hop_limit = 1;
+	route->path_rec->reversible = 1;
+	route->path_rec->pkey = cpu_to_be16(0xffff);
+	route->path_rec->mtu_selector = IB_SA_EQ;
+	route->path_rec->sl = tos_to_sl(id_priv->tos);
+
+#ifdef __linux__
+	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+#else
+	route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+	route->path_rec->rate_selector = IB_SA_EQ;
+	route->path_rec->rate = iboe_get_rate(ndev);
+	dev_put(ndev);
+	route->path_rec->packet_life_time_selector = IB_SA_EQ;
+	route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME;
+	if (!route->path_rec->mtu) {
+		ret = -EINVAL;
+		goto err2;
+	}
+
+	work->old_state = CMA_ROUTE_QUERY;
+	work->new_state = CMA_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	work->event.status = 0;
+
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+err1:
+	kfree(work);
+	return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_resolve_ib_route(id_priv, timeout_ms);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ret = cma_resolve_iboe_route(id_priv);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev;
+	struct ib_port_attr port_attr;
+	union ib_gid gid;
+	u16 pkey;
+	int ret;
+	u8 p;
+
+	mutex_lock(&lock);
+	if (list_empty(&dev_list)) {
+		ret = -ENODEV;
+		goto out;
+	}
+	list_for_each_entry(cma_dev, &dev_list, list)
+		for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
+			if (!ib_query_port(cma_dev->device, p, &port_attr) &&
+			    port_attr.state == IB_PORT_ACTIVE)
+				goto port_found;
+
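+	/* No active port found; fall back to port 1 of the first device. */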
+	p = 1;
+	cma_dev = list_entry(dev_list.next, struct cma_device, list);
+
+port_found:
+	ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+	if (ret)
+		goto out;
+
+	ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+	if (ret)
+		goto out;
+
+	id_priv->id.route.addr.dev_addr.dev_type =
+		(rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+		ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+	rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+	ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+	id_priv->id.port_num = p;
+	cma_attach_to_dev(id_priv, cma_dev);
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *dev_addr, void *context)
+{
+	struct rdma_id_private *id_priv = context;
+	struct rdma_cm_event event;
+
+	memset(&event, 0, sizeof event);
+	mutex_lock(&id_priv->handler_mutex);
+
+	/*
+	 * Grab mutex to block rdma_destroy_id() from removing the device while
+	 * we're trying to acquire it.
+	 */
+	mutex_lock(&lock);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
+		mutex_unlock(&lock);
+		goto out;
+	}
+
+	if (!status && !id_priv->cma_dev)
+		status = cma_acquire_dev(id_priv);
+	mutex_unlock(&lock);
+
+	if (status) {
+		if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+			goto out;
+		event.event = RDMA_CM_EVENT_ADDR_ERROR;
+		event.status = status;
+	} else {
+		memcpy(&id_priv->id.route.addr.src_addr, src_addr,
+		       ip_addr_size(src_addr));
+		event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	}
+
+	if (id_priv->id.event_handler(&id_priv->id, &event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		cma_deref_id(id_priv);
+		rdma_destroy_id(&id_priv->id);
+		return;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+	struct cma_work *work;
+	struct sockaddr *src, *dst;
+	union ib_gid gid;
+	int ret;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		ret = cma_bind_loopback(id_priv);
+		if (ret)
+			goto err;
+	}
+
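+	/*
+	 * Loopback: our own SGID doubles as the DGID, and a zero source IP
+	 * inherits the destination's address below.
+	 */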
+	rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+	src = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	if (cma_zero_addr(src)) {
+		dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+		if ((src->sa_family = dst->sa_family) == AF_INET) {
+			((struct sockaddr_in *) src)->sin_addr.s_addr =
+				((struct sockaddr_in *) dst)->sin_addr.s_addr;
+		} else {
+			ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr,
+				       &((struct sockaddr_in6 *) dst)->sin6_addr);
+		}
+	}
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = CMA_ADDR_QUERY;
+	work->new_state = CMA_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+err:
+	kfree(work);
+	return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+			 struct sockaddr *dst_addr)
+{
+	if (!src_addr || !src_addr->sa_family) {
+		src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+		if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) {
+			((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
+				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+		}
+	}
+	if (!cma_any_addr(src_addr))
+		return rdma_bind_addr(id, src_addr);
+	else {
+		struct sockaddr_in addr_in;
+
+		memset(&addr_in, 0, sizeof addr_in);
+		addr_in.sin_family = dst_addr->sa_family;
+		addr_in.sin_len = sizeof addr_in;
+		return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
+	}
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id_priv->state == CMA_IDLE) {
+		ret = cma_bind_addr(id, src_addr, dst_addr);
+		if (ret)
+			return ret;
+	}
+
+	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
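+	/* A wildcard destination address means loopback resolution. */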
+	if (cma_any_addr(dst_addr))
+		ret = cma_resolve_loopback(id_priv);
+	else
+		ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
+				      dst_addr, &id->route.addr.dev_addr,
+				      timeout_ms, addr_handler, id_priv);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv)
+{
+	struct sockaddr_in *sin;
+
+	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+	sin->sin_port = htons(bind_list->port);
+	id_priv->bind_list = bind_list;
+	hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+			  unsigned short snum)
+{
+	struct rdma_bind_list *bind_list;
+	int port, ret;
+
+	bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+	if (!bind_list)
+		return -ENOMEM;
+
+	do {
+		ret = idr_get_new_above(ps, bind_list, snum, &port);
+	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+	if (ret)
+		goto err1;
+
+	if (port != snum) {
+		ret = -EADDRNOTAVAIL;
+		goto err2;
+	}
+
+	bind_list->ps = ps;
+	bind_list->port = (unsigned short) port;
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+err2:
+	idr_remove(ps, port);
+err1:
+	kfree(bind_list);
+	return ret;
+}
+
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+#if defined(INET)
+	struct rdma_bind_list *bind_list;
+	int port, ret, low, high;
+
+	bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+	if (!bind_list)
+		return -ENOMEM;
+
+retry:
+	/* FIXME: add proper port randomization per like inet_csk_get_port */
+	do {
+		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+	if (ret)
+		goto err1;
+
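+	/* Wrap around to the bottom of the local port range once before failing. */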
+	inet_get_local_port_range(&low, &high);
+	if (port > high) {
+		if (next_port != low) {
+			idr_remove(ps, port);
+			next_port = low;
+			goto retry;
+		}
+		ret = -EADDRNOTAVAIL;
+		goto err2;
+	}
+
+	if (port == high)
+		next_port = low;
+	else
+		next_port = port + 1;
+
+	bind_list->ps = ps;
+	bind_list->port = (unsigned short) port;
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+err2:
+	idr_remove(ps, port);
+err1:
+	kfree(bind_list);
+	return ret;
+#else
+	return -ENOSPC;
+#endif
+}
+
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	struct rdma_id_private *cur_id;
+	struct sockaddr_in *sin, *cur_sin;
+	struct rdma_bind_list *bind_list;
+	struct hlist_node *node;
+	unsigned short snum;
+
+	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+	snum = ntohs(sin->sin_port);
+#ifdef __linux__
+	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+#endif
+
+	bind_list = idr_find(ps, snum);
+	if (!bind_list)
+		return cma_alloc_port(ps, id_priv, snum);
+
+	/*
+	 * We don't support binding to any address if anyone is bound to
+	 * a specific address on the same port.
+	 */
+	if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+		return -EADDRNOTAVAIL;
+
+	hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
+		if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr))
+			return -EADDRNOTAVAIL;
+
+		cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr;
+		if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr)
+			return -EADDRINUSE;
+	}
+
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+}
+
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+	int ret;
+	int size;
+	struct socket *sock;
+
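+	/*
+	 * Reserve the port in the host TCP port space by binding a kernel
+	 * socket; only used when unify_tcp_port_space is set (see
+	 * cma_get_port()).
+	 */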
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ret;
+#ifdef __linux__
+	ret = sock->ops->bind(sock,
+			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
+			ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+#else
+	ret = -sobind(sock,
+			(struct sockaddr *)&id_priv->id.route.addr.src_addr,
+			curthread);
+#endif
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+
+	size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
+	ret = sock_getname(sock,
+			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
+			&size, 0);
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+
+	id_priv->sock = sock;
+	return 0;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+	struct idr *ps;
+	int ret;
+
+	switch (id_priv->id.ps) {
+	case RDMA_PS_SDP:
+		ps = &sdp_ps;
+		break;
+	case RDMA_PS_TCP:
+		ps = &tcp_ps;
+		if (unify_tcp_port_space) {
+			ret = cma_get_tcp_port(id_priv);
+			if (ret)
+				goto out;
+		}
+		break;
+	case RDMA_PS_UDP:
+		ps = &udp_ps;
+		break;
+	case RDMA_PS_IPOIB:
+		ps = &ipoib_ps;
+		break;
+	default:
+		return -EPROTONOSUPPORT;
+	}
+
+	mutex_lock(&lock);
+	if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+		ret = cma_alloc_any_port(ps, id_priv);
+	else
+		ret = cma_use_port(ps, id_priv);
+	mutex_unlock(&lock);
+out:
+	return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+			       struct sockaddr *addr)
+{
+#if defined(INET6)
+	struct sockaddr_in6 *sin6;
+
+	if (addr->sa_family != AF_INET6)
+		return 0;
+
+	sin6 = (struct sockaddr_in6 *) addr;
+#ifdef __linux__
+	if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+#else
+	if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
+#endif
+	    !sin6->sin6_scope_id)
+		return -EINVAL;
+
+	dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+	return 0;
+}
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+		return -EINVAL;
+
+	ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+	if (ret)
+		goto err1;
+
+	if (!cma_any_addr(addr)) {
+		ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
+		if (ret)
+			goto err1;
+
+		mutex_lock(&lock);
+		ret = cma_acquire_dev(id_priv);
+		mutex_unlock(&lock);
+		if (ret)
+			goto err1;
+	}
+
+	memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+	ret = cma_get_port(id_priv);
+	if (ret)
+		goto err2;
+
+	return 0;
+err2:
+	if (id_priv->cma_dev) {
+		mutex_lock(&lock);
+		cma_detach_from_dev(id_priv);
+		mutex_unlock(&lock);
+	}
+err1:
+	cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
+			  struct rdma_route *route)
+{
+	struct cma_hdr *cma_hdr;
+	struct sdp_hh *sdp_hdr;
+
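+	/*
+	 * Build the private-data header carried in the CM REQ: SDP uses its
+	 * own hello header, all other port spaces use the cma_hdr format.
+	 */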
+	if (route->addr.src_addr.ss_family == AF_INET) {
+		struct sockaddr_in *src4, *dst4;
+
+		src4 = (struct sockaddr_in *) &route->addr.src_addr;
+		dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
+
+		switch (ps) {
+		case RDMA_PS_SDP:
+			sdp_hdr = hdr;
+			if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+				return -EINVAL;
+			sdp_set_ip_ver(sdp_hdr, 4);
+			sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+			sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+			sdp_hdr->port = src4->sin_port;
+			break;
+		default:
+			cma_hdr = hdr;
+			cma_hdr->cma_version = CMA_VERSION;
+			cma_set_ip_ver(cma_hdr, 4);
+			cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+			cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+			cma_hdr->port = src4->sin_port;
+			break;
+		}
+	} else {
+		struct sockaddr_in6 *src6, *dst6;
+
+		src6 = (struct sockaddr_in6 *) &route->addr.src_addr;
+		dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr;
+
+		switch (ps) {
+		case RDMA_PS_SDP:
+			sdp_hdr = hdr;
+			if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+				return -EINVAL;
+			sdp_set_ip_ver(sdp_hdr, 6);
+			sdp_hdr->src_addr.ip6 = src6->sin6_addr;
+			sdp_hdr->dst_addr.ip6 = dst6->sin6_addr;
+			sdp_hdr->port = src6->sin6_port;
+			break;
+		default:
+			cma_hdr = hdr;
+			cma_hdr->cma_version = CMA_VERSION;
+			cma_set_ip_ver(cma_hdr, 6);
+			cma_hdr->src_addr.ip6 = src6->sin6_addr;
+			cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+			cma_hdr->port = src6->sin6_port;
+			break;
+		}
+	}
+	return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+	int ret = 0;
+
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_SIDR_REQ_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		event.param.ud.private_data = ib_event->private_data;
+		event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		if (rep->status != IB_SIDR_SUCCESS) {
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			event.status = ib_event->param.sidr_rep_rcvd.status;
+			break;
+		}
+		ret = cma_set_qkey(id_priv);
+		if (ret) {
+			event.event = RDMA_CM_EVENT_ADDR_ERROR;
+			event.status = -EINVAL;
+			break;
+		}
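+		/*
+		 * The qkey in the SIDR reply must match the one we computed
+		 * locally, otherwise the UD endpoint is unusable.
+		 */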
+		if (id_priv->qkey != rep->qkey) {
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			event.status = -EINVAL;
+			break;
+		}
+		ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+				     id_priv->id.route.path_rec,
+				     &event.param.ud.ah_attr);
+		event.param.ud.qp_num = rep->qpn;
+		event.param.ud.qkey = rep->qkey;
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		event.status = 0;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+			      struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_sidr_req_param req;
+	struct rdma_route *route;
+	int ret;
+
+	req.private_data_len = sizeof(struct cma_hdr) +
+			       conn_param->private_data_len;
+	req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+	if (!req.private_data)
+		return -ENOMEM;
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy((void *) req.private_data + sizeof(struct cma_hdr),
+		       conn_param->private_data, conn_param->private_data_len);
+
+	route = &id_priv->id.route;
+	ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
+	if (ret)
+		goto out;
+
+	id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device,
+					    cma_sidr_rep_handler, id_priv);
+	if (IS_ERR(id_priv->cm_id.ib)) {
+		ret = PTR_ERR(id_priv->cm_id.ib);
+		goto out;
+	}
+
+	req.path = route->path_rec;
+	req.service_id = cma_get_service_id(id_priv->id.ps,
+					    (struct sockaddr *) &route->addr.dst_addr);
+	req.timeout_ms = 1 << (cma_response_timeout - 8);
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+	ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+	if (ret) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+out:
+	kfree(req.private_data);
+	return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_req_param req;
+	struct rdma_route *route;
+	void *private_data;
+	int offset, ret;
+
+	memset(&req, 0, sizeof req);
+	offset = cma_user_data_offset(id_priv->id.ps);
+	req.private_data_len = offset + conn_param->private_data_len;
+	private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+	if (!private_data)
+		return -ENOMEM;
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(private_data + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
+					    id_priv);
+	if (IS_ERR(id_priv->cm_id.ib)) {
+		ret = PTR_ERR(id_priv->cm_id.ib);
+		goto out;
+	}
+
+	route = &id_priv->id.route;
+	ret = cma_format_hdr(private_data, id_priv->id.ps, route);
+	if (ret)
+		goto out;
+	req.private_data = private_data;
+
+	req.primary_path = &route->path_rec[0];
+	if (route->num_paths == 2)
+		req.alternate_path = &route->path_rec[1];
+
+	req.service_id = cma_get_service_id(id_priv->id.ps,
+					    (struct sockaddr *) &route->addr.dst_addr);
+	req.qp_num = id_priv->qp_num;
+	req.qp_type = IB_QPT_RC;
+	req.starting_psn = id_priv->seq_num;
+	req.responder_resources = conn_param->responder_resources;
+	req.initiator_depth = conn_param->initiator_depth;
+	req.flow_control = conn_param->flow_control;
+	req.retry_count = conn_param->retry_count;
+	req.rnr_retry_count = conn_param->rnr_retry_count;
+	req.remote_cm_response_timeout = cma_response_timeout;
+	req.local_cm_response_timeout = cma_response_timeout;
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+	if (ret && !IS_ERR(id_priv->cm_id.ib)) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+
+	kfree(private_data);
+	return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_id *cm_id;
+	struct sockaddr_in *sin;
+	int ret;
+	struct iw_cm_conn_param iw_param;
+
+	cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock,
+				cma_iw_handler, id_priv);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		goto out;
+	}
+
+	id_priv->cm_id.iw = cm_id;
+
+	sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+	cm_id->local_addr = *sin;
+
+	sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+	cm_id->remote_addr = *sin;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	iw_param.ord = conn_param->initiator_depth;
+	iw_param.ird = conn_param->responder_resources;
+	iw_param.private_data = conn_param->private_data;
+	iw_param.private_data_len = conn_param->private_data_len;
+	if (id_priv->id.qp)
+		iw_param.qpn = id_priv->qp_num;
+	else
+		iw_param.qpn = conn_param->qp_num;
+	ret = iw_cm_connect(cm_id, &iw_param);
+out:
+	if (ret && !IS_ERR(cm_id)) {
+		iw_destroy_cm_id(cm_id);
+		id_priv->cm_id.iw = NULL;
+	}
+	return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (cma_is_ud_ps(id->ps))
+			ret = cma_resolve_ib_udp(id_priv, conn_param);
+		else
+			ret = cma_connect_ib(id_priv, conn_param);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_connect_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_rep_param rep;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	memset(&rep, 0, sizeof rep);
+	rep.qp_num = id_priv->qp_num;
+	rep.starting_psn = id_priv->seq_num;
+	rep.private_data = conn_param->private_data;
+	rep.private_data_len = conn_param->private_data_len;
+	rep.responder_resources = conn_param->responder_resources;
+	rep.initiator_depth = conn_param->initiator_depth;
+	rep.failover_accepted = 0;
+	rep.flow_control = conn_param->flow_control;
+	rep.rnr_retry_count = conn_param->rnr_retry_count;
+	rep.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+	return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+		  struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_conn_param iw_param;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	iw_param.ord = conn_param->initiator_depth;
+	iw_param.ird = conn_param->responder_resources;
+	iw_param.private_data = conn_param->private_data;
+	iw_param.private_data_len = conn_param->private_data_len;
+	if (id_priv->id.qp)
+		iw_param.qpn = id_priv->qp_num;
+	else
+		iw_param.qpn = conn_param->qp_num;
+
+	return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+			     enum ib_cm_sidr_status status,
+			     const void *private_data, int private_data_len)
+{
+	struct ib_cm_sidr_rep_param rep;
+	int ret;
+
+	memset(&rep, 0, sizeof rep);
+	rep.status = status;
+	if (status == IB_SIDR_SUCCESS) {
+		ret = cma_set_qkey(id_priv);
+		if (ret)
+			return ret;
+		rep.qp_num = id_priv->qp_num;
+		rep.qkey = id_priv->qkey;
+	}
+	rep.private_data = private_data;
+	rep.private_data_len = private_data_len;
+
+	return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, CMA_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp && conn_param) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
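+	/*
+	 * For UD port spaces "accepting" means sending a SIDR reply rather
+	 * than a CM REP; conn_param may legitimately be NULL here, so guard
+	 * against it.
+	 */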
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (cma_is_ud_ps(id->ps)) {
+			if (conn_param)
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							conn_param->private_data,
+							conn_param->private_data_len);
+			else
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							NULL, 0);
+		} else if (conn_param)
+			ret = cma_accept_ib(id_priv, conn_param);
+		else
+			ret = cma_rep_recv(id_priv);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_accept_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret)
+		goto reject;
+
+	return 0;
+reject:
+	cma_modify_qp_err(id_priv);
+	rdma_reject(id, NULL, 0);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	switch (id->device->node_type) {
+	case RDMA_NODE_IB_CA:
+		ret = ib_cm_notify(id_priv->cm_id.ib, event);
+		break;
+	default:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (cma_is_ud_ps(id->ps))
+			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
+						private_data, private_data_len);
+		else
+			ret = ib_send_cm_rej(id_priv->cm_id.ib,
+					     IB_CM_REJ_CONSUMER_DEFINED, NULL,
+					     0, private_data, private_data_len);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = iw_cm_reject(id_priv->cm_id.iw,
+				   private_data, private_data_len);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		ret = cma_modify_qp_err(id_priv);
+		if (ret)
+			goto out;
+		/* Initiate or respond to a disconnect. */
+		if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+			ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc = multicast->context;
+	struct rdma_cm_event event;
+	int ret;
+
+	id_priv = mc->id_priv;
+	if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
+		return 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!status && id_priv->id.qp)
+		status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+					 multicast->rec.mlid);
+	mutex_unlock(&id_priv->qp_mutex);
+
+	memset(&event, 0, sizeof event);
+	event.status = status;
+	event.param.ud.private_data = mc->context;
+	if (!status) {
+		event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+		ib_init_ah_from_mcmember(id_priv->id.device,
+					 id_priv->id.port_num, &multicast->rec,
+					 &event.param.ud.ah_attr);
+		event.param.ud.qp_num = 0xFFFFFF;
+		event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+	} else
+		event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return 0;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+			 struct sockaddr *addr, union ib_gid *mgid)
+{
+#if defined(INET) || defined(INET6)
+	unsigned char mc_map[MAX_ADDR_LEN];
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+#endif
+#ifdef INET
+	struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+#endif
+#ifdef INET6
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+#endif
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+#ifdef INET6
+	} else if ((addr->sa_family == AF_INET6) &&
+		   ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+								 0xFF10A01B)) {
+		/* IPv6 address is an SA assigned MGID. */
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+	} else if (addr->sa_family == AF_INET6) {
+		ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+		if (id_priv->id.ps == RDMA_PS_UDP)
+			mc_map[7] = 0x01;	/* Use RDMA CM signature */
+		*mgid = *(union ib_gid *) (mc_map + 4);
+#endif
+#ifdef INET
+	} else {
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+		if (id_priv->id.ps == RDMA_PS_UDP)
+			mc_map[7] = 0x01;	/* Use RDMA CM signature */
+		*mgid = *(union ib_gid *) (mc_map + 4);
+#endif
+	}
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+				 struct cma_multicast *mc)
+{
+	struct ib_sa_mcmember_rec rec;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	ib_sa_comp_mask comp_mask;
+	int ret;
+
+	ib_addr_get_mgid(dev_addr, &rec.mgid);
+	ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+				     &rec.mgid, &rec);
+	if (ret)
+		return ret;
+
+	cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+	rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+	rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	rec.join_state = 1;
+
+	comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+		    IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+		    IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+		    IB_SA_MCMEMBER_REC_FLOW_LABEL |
+		    IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+	if (id_priv->id.ps == RDMA_PS_IPOIB)
+		comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+			     IB_SA_MCMEMBER_REC_RATE_SELECTOR;
+
+	mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+						id_priv->id.port_num, &rec,
+						comp_mask, GFP_KERNEL,
+						cma_ib_mc_handler, mc);
+	if (IS_ERR(mc->multicast.ib))
+		return PTR_ERR(mc->multicast.ib);
+
+	return 0;
+}
+
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+	struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
+	struct cma_multicast *mc = mw->mc;
+	struct ib_sa_multicast *m = mc->multicast.ib;
+
+	mc->multicast.ib->context = mc;
+	cma_ib_mc_handler(0, m);
+	kref_put(&mc->mcref, release_mc);
+	kfree(mw);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+	} else if (addr->sa_family == AF_INET6)
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+	else {
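+		/* IPv4 multicast maps to an MGID of the form ff0e::ffff:<ipv4 addr>. */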
+		mgid->raw[0] = 0xff;
+		mgid->raw[1] = 0x0e;
+		mgid->raw[2] = 0;
+		mgid->raw[3] = 0;
+		mgid->raw[4] = 0;
+		mgid->raw[5] = 0;
+		mgid->raw[6] = 0;
+		mgid->raw[7] = 0;
+		mgid->raw[8] = 0;
+		mgid->raw[9] = 0;
+		mgid->raw[10] = 0xff;
+		mgid->raw[11] = 0xff;
+		*(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+	}
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+				   struct cma_multicast *mc)
+{
+	struct iboe_mcast_work *work;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int err;
+	struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+	struct net_device *ndev = NULL;
+
+	if (cma_zero_addr((struct sockaddr *)&mc->addr))
+		return -EINVAL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+	if (!mc->multicast.ib) {
+		err = -ENOMEM;
+		goto out1;
+	}
+
+	cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+	if (dev_addr->bound_dev_if)
+		ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+	if (!ndev) {
+		err = -ENODEV;
+		goto out2;
+	}
+
+	mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+	mc->multicast.ib->rec.hop_limit = 1;
+#ifdef __linux__
+	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+#else
+	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+	dev_put(ndev);
+	if (!mc->multicast.ib->rec.mtu) {
+		err = -EINVAL;
+		goto out2;
+	}
+	iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
+	work->id = id_priv;
+	work->mc = mc;
+	INIT_WORK(&work->work, iboe_mcast_work_handler);
+	kref_get(&mc->mcref);
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+out2:
+	kfree(mc->multicast.ib);
+out1:
+	kfree(work);
+	return err;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
+	    !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+		return -EINVAL;
+
+	mc = kmalloc(sizeof *mc, GFP_KERNEL);
+	if (!mc)
+		return -ENOMEM;
+
+	memcpy(&mc->addr, addr, ip_addr_size(addr));
+	mc->context = context;
+	mc->id_priv = id_priv;
+
+	spin_lock(&id_priv->lock);
+	list_add(&mc->list, &id_priv->mc_list);
+	spin_unlock(&id_priv->lock);
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_join_ib_multicast(id_priv, mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			kref_init(&mc->mcref);
+			ret = cma_iboe_join_multicast(id_priv, mc);
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret) {
+		spin_lock_irq(&id_priv->lock);
+		list_del(&mc->list);
+		spin_unlock_irq(&id_priv->lock);
+		kfree(mc);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	spin_lock_irq(&id_priv->lock);
+	list_for_each_entry(mc, &id_priv->mc_list, list) {
+		if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
+			list_del(&mc->list);
+			spin_unlock_irq(&id_priv->lock);
+
+			if (id->qp)
+				ib_detach_mcast(id->qp,
+						&mc->multicast.ib->rec.mgid,
+						mc->multicast.ib->rec.mlid);
+			if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+				switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+				case IB_LINK_LAYER_INFINIBAND:
+					ib_sa_free_multicast(mc->multicast.ib);
+					kfree(mc);
+					break;
+				case IB_LINK_LAYER_ETHERNET:
+					kref_put(&mc->mcref, release_mc);
+					break;
+				default:
+					break;
+				}
+			}
+			return;
+		}
+	}
+	spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct cma_ndev_work *work;
+
+	dev_addr = &id_priv->id.route.addr.dev_addr;
+
+#ifdef __linux__
+	if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+	    memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->name, &id_priv->id);
+#else
+	if ((dev_addr->bound_dev_if == ndev->if_index) &&
+	    memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->if_xname, &id_priv->id);
+#endif
+		work = kzalloc(sizeof *work, GFP_KERNEL);
+		if (!work)
+			return -ENOMEM;
+
+		INIT_WORK(&work->work, cma_ndev_work_handler);
+		work->id = id_priv;
+		work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+		atomic_inc(&id_priv->refcount);
+		queue_work(cma_wq, &work->work);
+	}
+
+	return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+			       void *ctx)
+{
+	struct net_device *ndev = (struct net_device *)ctx;
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+	int ret = NOTIFY_DONE;
+
+#ifdef __linux__
+	if (dev_net(ndev) != &init_net)
+		return NOTIFY_DONE;
+
+	if (event != NETDEV_BONDING_FAILOVER)
+		return NOTIFY_DONE;
+
+	if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+		return NOTIFY_DONE;
+#else
+	if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+#endif
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			ret = cma_netdev_change(ndev, id_priv);
+			if (ret)
+				goto out;
+		}
+
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static struct notifier_block cma_nb = {
+	.notifier_call = cma_netdev_callback
+};
+
+static void cma_add_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+
+	cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+	if (!cma_dev)
+		return;
+
+	cma_dev->device = device;
+
+	init_completion(&cma_dev->comp);
+	atomic_set(&cma_dev->refcount, 1);
+	INIT_LIST_HEAD(&cma_dev->id_list);
+	ib_set_client_data(device, &cma_client, cma_dev);
+
+	mutex_lock(&lock);
+	list_add_tail(&cma_dev->list, &dev_list);
+	list_for_each_entry(id_priv, &listen_any_list, list)
+		cma_listen_on_dev(id_priv, cma_dev);
+	mutex_unlock(&lock);
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+	struct rdma_cm_event event;
+	enum cma_state state;
+	int ret = 0;
+
+	/* Record that we want to remove the device */
+	state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
+	if (state == CMA_DESTROYING)
+		return 0;
+
+	cma_cancel_operation(id_priv, state);
+	mutex_lock(&id_priv->handler_mutex);
+
+	/* Check for destruction from another callback. */
+	if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
+		goto out;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	mutex_lock(&lock);
+	while (!list_empty(&cma_dev->id_list)) {
+		id_priv = list_entry(cma_dev->id_list.next,
+				     struct rdma_id_private, list);
+
+		list_del(&id_priv->listen_list);
+		list_del_init(&id_priv->list);
+		atomic_inc(&id_priv->refcount);
+		mutex_unlock(&lock);
+
+		ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+		cma_deref_id(id_priv);
+		if (ret)
+			rdma_destroy_id(&id_priv->id);
+
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+
+	cma_deref_dev(cma_dev);
+	wait_for_completion(&cma_dev->comp);
+}
+
+static void cma_remove_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev;
+
+	cma_dev = ib_get_client_data(device, &cma_client);
+	if (!cma_dev)
+		return;
+
+	mutex_lock(&lock);
+	list_del(&cma_dev->list);
+	mutex_unlock(&lock);
+
+	cma_process_remove(cma_dev);
+	kfree(cma_dev);
+}
+
+static int cma_init(void)
+{
+	int ret;
+#if defined(INET)
+	int low, high, remaining;
+
+	get_random_bytes(&next_port, sizeof next_port);
+	inet_get_local_port_range(&low, &high);
+	remaining = (high - low) + 1;
+	next_port = ((unsigned int) next_port % remaining) + low;
+#endif
+
+	cma_wq = create_singlethread_workqueue("rdma_cm");
+	if (!cma_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);
+
+	ret = ib_register_client(&cma_client);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	return ret;
+}
+
+static void cma_cleanup(void)
+{
+	ib_unregister_client(&cma_client);
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	idr_destroy(&sdp_ps);
+	idr_destroy(&tcp_ps);
+	idr_destroy(&udp_ps);
+	idr_destroy(&ipoib_ps);
+}
+
+module_init(cma_init);
+module_exit(cma_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/cma.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/core_priv.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/core_priv.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/core_priv.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+int  ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *,
+                                                  u8, struct kobject *));
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int  ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int  ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/core_priv.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/device.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/device.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/device.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,771 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef __ia64__
+/* workaround for a bug in the HP chipset that would cause a kernel
+   panic when DMA resources are exhausted */
+int dma_map_sg_hp_wa = 0;
+#endif
+
+struct ib_client_data {
+	struct list_head  list;
+	struct ib_client *client;
+	void *            data;
+};
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list.  In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+	static const struct {
+		size_t offset;
+		char  *name;
+	} mandatory_table[] = {
+		IB_MANDATORY_FUNC(query_device),
+		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_pkey),
+		IB_MANDATORY_FUNC(query_gid),
+		IB_MANDATORY_FUNC(alloc_pd),
+		IB_MANDATORY_FUNC(dealloc_pd),
+		IB_MANDATORY_FUNC(create_ah),
+		IB_MANDATORY_FUNC(destroy_ah),
+		IB_MANDATORY_FUNC(create_qp),
+		IB_MANDATORY_FUNC(modify_qp),
+		IB_MANDATORY_FUNC(destroy_qp),
+		IB_MANDATORY_FUNC(post_send),
+		IB_MANDATORY_FUNC(post_recv),
+		IB_MANDATORY_FUNC(create_cq),
+		IB_MANDATORY_FUNC(destroy_cq),
+		IB_MANDATORY_FUNC(poll_cq),
+		IB_MANDATORY_FUNC(req_notify_cq),
+		IB_MANDATORY_FUNC(get_dma_mr),
+		IB_MANDATORY_FUNC(dereg_mr)
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+		if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) {
+			printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+			       device->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+	struct ib_device *device;
+
+	list_for_each_entry(device, &device_list, core_list)
+		if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+			return device;
+
+	return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+	unsigned long *inuse;
+	char buf[IB_DEVICE_NAME_MAX];
+	struct ib_device *device;
+	int i;
+
+	inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+	if (!inuse)
+		return -ENOMEM;
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (!sscanf(device->name, name, &i))
+			continue;
+		if (i < 0 || i >= PAGE_SIZE * 8)
+			continue;
+		snprintf(buf, sizeof buf, name, i);
+		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+			set_bit(i, inuse);
+	}
+
+	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+	free_page((unsigned long) inuse);
+	snprintf(buf, sizeof buf, name, i);
+
+	if (__ib_device_get_by_name(buf))
+		return -ENFILE;
+
+	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+	return 0;
+}
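+
+/*
+ * Illustrative note (editor's sketch, not part of this sync): a driver
+ * registers with a template name such as "mthca%d"; alloc_name() then
+ * rewrites it in place to the first free instance, e.g. "mthca0".
+ */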
+
+static int start_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+		0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device.  @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+	BUG_ON(size < sizeof (struct ib_device));
+
+	return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ib_alloc_device);
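+
+/*
+ * Illustrative sketch (not part of this sync): a low-level driver
+ * typically embeds struct ib_device at the start of its private
+ * structure and lets ib_alloc_device() size the whole allocation.
+ * "example_hca" and "example_alloc" are hypothetical names.
+ */
+#if 0
+struct example_hca {
+	struct ib_device ibdev;		/* must come first */
+	void		*priv;		/* driver-private state follows */
+};
+
+static struct example_hca *example_alloc(void)
+{
+	/* NULL on allocation failure; zeroed otherwise (kzalloc) */
+	return (struct example_hca *) ib_alloc_device(sizeof(struct example_hca));
+}
+#endif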
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+	if (device->reg_state == IB_DEV_UNINITIALIZED) {
+		kfree(device);
+		return;
+	}
+
+	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+	kobject_put(&device->dev.kobj);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+		       device->name, client->name);
+		return -ENOMEM;
+	}
+
+	context->client = client;
+	context->data   = NULL;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_add(&context->list, &device->client_data_list);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+	struct ib_port_attr *tprops = NULL;
+	int num_ports, ret = -ENOMEM;
+	u8 port_index;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		goto out;
+
+	num_ports = end_port(device) - start_port(device) + 1;
+
+	device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+				       GFP_KERNEL);
+	device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+				      GFP_KERNEL);
+	if (!device->pkey_tbl_len || !device->gid_tbl_len)
+		goto err;
+
+	for (port_index = 0; port_index < num_ports; ++port_index) {
+		ret = ib_query_port(device, port_index + start_port(device),
+					tprops);
+		if (ret)
+			goto err;
+		device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+		device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
+	}
+
+	ret = 0;
+	goto out;
+
+err:
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+out:
+	kfree(tprops);
+	return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device,
+		       int (*port_callback)(struct ib_device *,
+					    u8, struct kobject *))
+{
+	int ret;
+
+	mutex_lock(&device_mutex);
+
+	if (strchr(device->name, '%')) {
+		ret = alloc_name(device->name);
+		if (ret)
+			goto out;
+	}
+
+	if (ib_device_check_mandatory(device)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	INIT_LIST_HEAD(&device->client_data_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+	device->ib_uverbs_xrcd_table = RB_ROOT;
+	mutex_init(&device->xrcd_table_mutex);
+
+	ret = read_port_table_lengths(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+		       device->name);
+		goto out;
+	}
+
+	ret = ib_device_register_sysfs(device, port_callback);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+		       device->name);
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
+		goto out;
+	}
+
+	list_add_tail(&device->core_list, &device_list);
+
+	device->reg_state = IB_DEV_REGISTERED;
+
+	{
+		struct ib_client *client;
+
+		list_for_each_entry(client, &client_list, list)
+			if (client->add && !add_client_context(device, client))
+				client->add(device);
+	}
+
+ out:
+	mutex_unlock(&device_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
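+
+/*
+ * Illustrative sketch (not part of this sync): the registration step in
+ * a hypothetical driver probe path, reusing "example_hca" from the
+ * sketch above.  The "%d" in the name is expanded by alloc_name(), and
+ * no port sysfs callback is supplied.
+ */
+#if 0
+static int example_register(struct example_hca *hca)
+{
+	strlcpy(hca->ibdev.name, "example%d", IB_DEVICE_NAME_MAX);
+	/* ... set the mandatory methods listed in mandatory_table ... */
+	return ib_register_device(&hca->ibdev, NULL);
+}
+#endif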
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	struct ib_client_data *context, *tmp;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (client->remove)
+			client->remove(device);
+
+	list_del(&device->core_list);
+
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+
+	mutex_unlock(&device_mutex);
+
+	ib_device_unregister_sysfs(device);
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		kfree(context);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal.  When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered).  In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+	struct ib_device *device;
+
+	mutex_lock(&device_mutex);
+
+	list_add_tail(&client->list, &client_list);
+	list_for_each_entry(device, &device_list, core_list)
+		if (client->add && !add_client_context(device, client))
+			client->add(device);
+
+	mutex_unlock(&device_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
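+
+/*
+ * Illustrative sketch (not part of this sync): a minimal client that
+ * stashes per-device context via ib_set_client_data()/
+ * ib_get_client_data() below.  All "example_*" names are hypothetical.
+ */
+#if 0
+static struct ib_client example_client;
+
+static void example_add(struct ib_device *device)
+{
+	void *ctx = kzalloc(64, GFP_KERNEL);	/* per-device context */
+
+	if (ctx)
+		ib_set_client_data(device, &example_client, ctx);
+}
+
+static void example_remove(struct ib_device *device)
+{
+	kfree(ib_get_client_data(device, &example_client));
+}
+
+static struct ib_client example_client = {
+	.name	= "example",
+	.add	= example_add,
+	.remove	= example_remove,
+};
+
+/* ib_register_client(&example_client) replays .add for devices that
+   are already registered. */
+#endif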
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration.  When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+	struct ib_client_data *context, *tmp;
+	struct ib_device *device;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (client->remove)
+			client->remove(device);
+
+		spin_lock_irqsave(&device->client_data_lock, flags);
+		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+			if (context->client == client) {
+				list_del(&context->list);
+				kfree(context);
+			}
+		spin_unlock_irqrestore(&device->client_data_lock, flags);
+	}
+	list_del(&client->list);
+
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	void *ret = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			ret = context->data;
+			break;
+		}
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			void *data)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			context->data = data;
+			goto out;
+		}
+
+	printk(KERN_WARNING "No client context found for %s/%s\n",
+	       device->name, client->name);
+
+out:
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification).  This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_add_tail(&event_handler->list,
+		      &event_handler->device->event_handler_list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
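+
+/*
+ * Illustrative sketch (not part of this sync): consuming async events.
+ * INIT_IB_EVENT_HANDLER() is assumed from <rdma/ib_verbs.h>;
+ * "example_event" is hypothetical.  Since dispatch may run in
+ * interrupt context, the handler must not sleep.
+ */
+#if 0
+static void example_event(struct ib_event_handler *handler,
+			  struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ERR)
+		printk(KERN_INFO "port %d went down\n",
+		       event->element.port_num);
+}
+
+static struct ib_event_handler example_handler;
+/* INIT_IB_EVENT_HANDLER(&example_handler, device, example_event);
+ * ib_register_event_handler(&example_handler); */
+#endif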
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_del(&event_handler->list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+	unsigned long flags;
+	struct ib_event_handler *handler;
+
+	spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+	list_for_each_entry(handler, &event->device->event_handler_list, list)
+		handler->handler(handler, event);
+
+	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr)
+{
+	return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+		  u8 port_num,
+		  struct ib_port_attr *port_attr)
+{
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid)
+{
+	return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey)
+{
+	return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify)
+{
+	return device->modify_device(device, device_modify_mask,
+				     device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify)
+{
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	return device->modify_port(device, port_num, port_modify_mask,
+				   port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index)
+{
+	union ib_gid tmp_gid;
+	int ret, port, i;
+
+	for (port = start_port(device); port <= end_port(device); ++port) {
+		for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+			ret = ib_query_gid(device, port, i, &tmp_gid);
+			if (ret)
+				return ret;
+			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+				*port_num = port;
+				if (index)
+					*index = i;
+				return 0;
+			}
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index)
+{
+	int ret, i;
+	u16 tmp_pkey;
+
+	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+		if (ret)
+			return ret;
+
+		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+			*index = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+static int __init ib_core_init(void)
+{
+	int ret;
+
+#ifdef __ia64__
+	if (ia64_platform_is("hpzx1"))
+		dma_map_sg_hp_wa = 1;
+#endif
+
+	ret = ib_sysfs_setup();
+	if (ret)
+		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+
+	ret = ib_cache_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		ib_sysfs_cleanup();
+	}
+
+	return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+	ib_cache_cleanup();
+	ib_sysfs_cleanup();
+	/* Make sure that any pending umem accounting work is done. */
+	flush_scheduled_work();
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+ibcore_evhand(module_t mod, int event, void *arg)
+{
+	return (0);
+}
+
+static moduledata_t ibcore_mod = {
+	.name = "ibcore",
+	.evhand = ibcore_evhand,
+};
+
+MODULE_VERSION(ibcore, 1);
+DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/device.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/fmr_pool.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/fmr_pool.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/fmr_pool.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+	IB_FMR_MAX_REMAPS = 32,
+
+	IB_FMR_HASH_BITS  = 8,
+	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,
+	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped).  In either of
+ * these cases it is a bug if the ref_count is not 0.  In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled).  This is independent of the reference
+ * count of the FMR.  When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate.  However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped.  In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR).  When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+	spinlock_t                pool_lock;
+
+	int                       pool_size;
+	int                       max_pages;
+	int			  max_remaps;
+	int                       dirty_watermark;
+	int                       dirty_len;
+	struct list_head          free_list;
+	struct list_head          dirty_list;
+	struct hlist_head        *cache_bucket;
+
+	void                     (*flush_function)(struct ib_fmr_pool *pool,
+						   void *              arg);
+	void                     *flush_arg;
+
+	struct task_struct       *thread;
+
+	atomic_t                  req_ser;
+	atomic_t                  flush_ser;
+
+	wait_queue_head_t         force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+	return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
+		(IB_FMR_HASH_SIZE - 1);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+						      u64 *page_list,
+						      int  page_list_len,
+						      u64  io_virtual_address)
+{
+	struct hlist_head *bucket;
+	struct ib_pool_fmr *fmr;
+	struct hlist_node *pos;
+
+	if (!pool->cache_bucket)
+		return NULL;
+
+	bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+	hlist_for_each_entry(fmr, pos, bucket, cache_node)
+		if (io_virtual_address == fmr->io_virtual_address &&
+		    page_list_len      == fmr->page_list_len      &&
+		    !memcmp(page_list, fmr->page_list,
+			    page_list_len * sizeof *page_list))
+			return fmr;
+
+	return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+	int                 ret;
+	struct ib_pool_fmr *fmr;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+
+	spin_lock_irq(&pool->pool_lock);
+
+	list_for_each_entry(fmr, &pool->dirty_list, list) {
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+		if (fmr->ref_count != 0) {
+			printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
+			       fmr, fmr->ref_count);
+		}
+#endif
+	}
+
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	pool->dirty_len = 0;
+
+	spin_unlock_irq(&pool->pool_lock);
+
+	if (list_empty(&unmap_list)) {
+		return;
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+	spin_lock_irq(&pool->pool_lock);
+	list_splice(&unmap_list, &pool->free_list);
+	spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+	struct ib_fmr_pool *pool = pool_ptr;
+
+	do {
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+			ib_fmr_batch_release(pool);
+
+			atomic_inc(&pool->flush_ser);
+			wake_up_interruptible(&pool->force_wait);
+
+			if (pool->flush_function)
+				pool->flush_function(pool, pool->flush_arg);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+		    !kthread_should_stop())
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs.  Returns a pointer to the new pool on success,
+ * or an ERR_PTR()-encoded error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params)
+{
+	struct ib_device   *device;
+	struct ib_fmr_pool *pool;
+	struct ib_device_attr *attr;
+	int i;
+	int ret;
+	int max_remaps;
+
+	if (!params)
+		return ERR_PTR(-EINVAL);
+
+	device = pd->device;
+	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
+	    !device->map_phys_fmr || !device->unmap_fmr) {
+		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+		       device->name);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret = ib_query_device(device, attr);
+	if (ret) {
+		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+		kfree(attr);
+		return ERR_PTR(ret);
+	}
+
+	if (!attr->max_map_per_fmr)
+		max_remaps = IB_FMR_MAX_REMAPS;
+	else
+		max_remaps = attr->max_map_per_fmr;
+
+	kfree(attr);
+
+	pool = kmalloc(sizeof *pool, GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->cache_bucket   = NULL;
+
+	pool->flush_function = params->flush_function;
+	pool->flush_arg      = params->flush_arg;
+
+	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->dirty_list);
+
+	if (params->cache) {
+		pool->cache_bucket =
+			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+				GFP_KERNEL);
+		if (!pool->cache_bucket) {
+			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+			ret = -ENOMEM;
+			goto out_free_pool;
+		}
+
+		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+			INIT_HLIST_HEAD(pool->cache_bucket + i);
+	}
+
+	pool->pool_size       = 0;
+	pool->max_pages       = params->max_pages_per_fmr;
+	pool->max_remaps      = max_remaps;
+	pool->dirty_watermark = params->dirty_watermark;
+	pool->dirty_len       = 0;
+	spin_lock_init(&pool->pool_lock);
+	atomic_set(&pool->req_ser,   0);
+	atomic_set(&pool->flush_ser, 0);
+	init_waitqueue_head(&pool->force_wait);
+
+	pool->thread = kthread_run(ib_fmr_cleanup_thread,
+				   pool,
+				   "ib_fmr(%s)",
+				   device->name);
+	if (IS_ERR(pool->thread)) {
+		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+		ret = PTR_ERR(pool->thread);
+		goto out_free_pool;
+	}
+
+	{
+		struct ib_pool_fmr *fmr;
+		struct ib_fmr_attr fmr_attr = {
+			.max_pages  = params->max_pages_per_fmr,
+			.max_maps   = pool->max_remaps,
+			.page_shift = params->page_shift
+		};
+		int bytes_per_fmr = sizeof *fmr;
+
+		if (pool->cache_bucket)
+			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+		for (i = 0; i < params->pool_size; ++i) {
+			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+			if (!fmr) {
+				printk(KERN_WARNING PFX "failed to allocate fmr "
+				       "struct for FMR %d\n", i);
+				goto out_fail;
+			}
+
+			fmr->pool             = pool;
+			fmr->remap_count      = 0;
+			fmr->ref_count        = 0;
+			INIT_HLIST_NODE(&fmr->cache_node);
+
+			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+			if (IS_ERR(fmr->fmr)) {
+				printk(KERN_WARNING PFX "fmr_create failed "
+				       "for FMR %d\n", i);
+				kfree(fmr);
+				goto out_fail;
+			}
+
+			list_add_tail(&fmr->list, &pool->free_list);
+			++pool->pool_size;
+		}
+	}
+
+	return pool;
+
+ out_free_pool:
+	kfree(pool->cache_bucket);
+	kfree(pool);
+
+	return ERR_PTR(ret);
+
+ out_fail:
+	ib_destroy_fmr_pool(pool);
+
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *fmr;
+	struct ib_pool_fmr *tmp;
+	LIST_HEAD(fmr_list);
+	int                 i;
+
+	kthread_stop(pool->thread);
+	ib_fmr_batch_release(pool);
+
+	i = 0;
+	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+		if (fmr->remap_count) {
+			INIT_LIST_HEAD(&fmr_list);
+			list_add_tail(&fmr->fmr->list, &fmr_list);
+			ib_unmap_fmr(&fmr_list);
+		}
+		ib_dealloc_fmr(fmr->fmr);
+		list_del(&fmr->list);
+		kfree(fmr);
+		++i;
+	}
+
+	if (i < pool->pool_size)
+		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+		       pool->pool_size - i);
+
+	kfree(pool->cache_bucket);
+	kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+	int serial;
+	struct ib_pool_fmr *fmr, *next;
+
+	/*
+	 * The free_list holds FMRs that may have been used
+	 * but have not been remapped enough times to be dirty.
+	 * Put them on the dirty list now so that the cleanup
+	 * thread will reap them too.
+	 */
+	spin_lock_irq(&pool->pool_lock);
+	list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+		if (fmr->remap_count > 0)
+			list_move(&fmr->list, &pool->dirty_list);
+	}
+	spin_unlock_irq(&pool->pool_lock);
+
+	serial = atomic_inc_return(&pool->req_ser);
+	wake_up_process(pool->thread);
+
+	if (wait_event_interruptible(pool->force_wait,
+				     atomic_read(&pool->flush_ser) - serial >= 0))
+		return -EINTR;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys - Map an FMR from an FMR pool
+ * @pool_handle:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                 io_virtual_address)
+{
+	struct ib_fmr_pool *pool = pool_handle;
+	struct ib_pool_fmr *fmr;
+	unsigned long       flags;
+	int                 result;
+
+	if (list_len < 1 || list_len > pool->max_pages)
+		return ERR_PTR(-EINVAL);
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+	fmr = ib_fmr_cache_lookup(pool,
+				  page_list,
+				  list_len,
+				  io_virtual_address);
+	if (fmr) {
+		/* found in cache */
+		++fmr->ref_count;
+		if (fmr->ref_count == 1) {
+			list_del(&fmr->list);
+		}
+
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		return fmr;
+	}
+
+	if (list_empty(&pool->free_list)) {
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+	list_del(&fmr->list);
+	hlist_del_init(&fmr->cache_node);
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+				 io_virtual_address);
+
+	if (result) {
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		list_add(&fmr->list, &pool->free_list);
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+		return ERR_PTR(result);
+	}
+
+	++fmr->remap_count;
+	fmr->ref_count = 1;
+
+	if (pool->cache_bucket) {
+		fmr->io_virtual_address = io_virtual_address;
+		fmr->page_list_len      = list_len;
+		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		hlist_add_head(&fmr->cache_node,
+			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR.  The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+	struct ib_fmr_pool *pool;
+	unsigned long flags;
+
+	pool = fmr->pool;
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+
+	--fmr->ref_count;
+	if (!fmr->ref_count) {
+		if (fmr->remap_count < pool->max_remaps) {
+			list_add_tail(&fmr->list, &pool->free_list);
+		} else {
+			list_add_tail(&fmr->list, &pool->dirty_list);
+			if (++pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
+		}
+	}
+
+#ifdef DEBUG
+	if (fmr->ref_count < 0)
+		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+		       fmr, fmr->ref_count);
+#endif
+
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
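+
+/*
+ * Illustrative sketch (not part of this sync): the typical pool
+ * lifecycle.  "example_use" and all parameter values are hypothetical.
+ */
+#if 0
+static void example_use(struct ib_pd *pd, u64 *pages, int npages, u64 iova)
+{
+	struct ib_fmr_pool_param params = {
+		.max_pages_per_fmr = 64,
+		.page_shift	   = PAGE_SHIFT,
+		.access		   = IB_ACCESS_LOCAL_WRITE,
+		.pool_size	   = 32,
+		.dirty_watermark   = 8,
+		.cache		   = 1,
+	};
+	struct ib_fmr_pool *pool;
+	struct ib_pool_fmr *fmr;
+
+	pool = ib_create_fmr_pool(pd, &params);
+	if (IS_ERR(pool))
+		return;
+
+	fmr = ib_fmr_pool_map_phys(pool, pages, npages, iova);
+	if (!IS_ERR(fmr)) {
+		/* ... post work requests using fmr->fmr->lkey ... */
+		ib_fmr_pool_unmap(fmr);
+	}
+
+	ib_flush_fmr_pool(pool);	/* invalidate unmapped FMRs now */
+	ib_destroy_fmr_pool(pool);
+}
+#endif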


Property changes on: trunk/sys/ofed/drivers/infiniband/core/fmr_pool.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/iwcm.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/iwcm.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/iwcm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/string.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+	struct work_struct work;
+	struct iwcm_id_private *cm_id;
+	struct list_head list;
+	struct iw_cm_event event;
+	struct list_head free_list;
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements.  The design pre-allocates them based on the cm_id type:
+ *	LISTENING IDS: 	Get enough elements preallocated to handle the
+ *			listen backlog.
+ *	ACTIVE IDS:	4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ *	PASSIVE IDS:	3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id.  If
+ *    the backlog is exceeded, then no more connection request events will
+ *    be processed.  cm_event_handler() returns -ENOMEM in this case.  It's up
+ *    to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ *    If work elements cannot be allocated for the new connect request cm_id,
+ *    then IWCM will call the provider reject method.  This is ok since
+ *    cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+	struct iwcm_work *work;
+
+	if (list_empty(&cm_id_priv->work_free_list))
+		return NULL;
+	work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+			  free_list);
+	list_del_init(&work->free_list);
+	return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+	list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *e, *tmp;
+
+	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
+		kfree(list_entry(e, struct iwcm_work, free_list));
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+	struct iwcm_work *work;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+	while (count--) {
+		work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+		if (!work) {
+			dealloc_work_entries(cm_id_priv);
+			return -ENOMEM;
+		}
+		work->cm_id = cm_id_priv;
+		INIT_LIST_HEAD(&work->list);
+		put_work(work);
+	}
+	return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+	void *p;
+
+	p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+	if (!p)
+		return -ENOMEM;
+	event->private_data = p;
+	return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+	dealloc_work_entries(cm_id_priv);
+	kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, enable the waiting thread (in iw_destroy_cm_id) to
+ * get woken up, and return 1 if a thread is already waiting.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+	BUG_ON(atomic_read(&cm_id_priv->refcount) == 0);
+	if (atomic_dec_and_test(&cm_id_priv->refcount)) {
+		BUG_ON(!list_empty(&cm_id_priv->work_list));
+		complete(&cm_id_priv->destroy_comp);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	atomic_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	if (iwcm_deref_id(cm_id_priv) &&
+	    test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) {
+		BUG_ON(!list_empty(&cm_id_priv->work_list));
+		free_cm_id(cm_id_priv);
+	}
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 struct socket *so,
+				 iw_cm_handler cm_handler,
+				 void *context)
+{
+	struct iwcm_id_private *cm_id_priv;
+
+	cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.event_handler = cm_event_handler;
+	cm_id_priv->id.add_ref = add_ref;
+	cm_id_priv->id.rem_ref = rem_ref;
+	cm_id_priv->id.so = so;
+	spin_lock_init(&cm_id_priv->lock);
+	atomic_set(&cm_id_priv->refcount, 1);
+	init_waitqueue_head(&cm_id_priv->connect_wait);
+	init_completion(&cm_id_priv->destroy_comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+	return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
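+
+/*
+ * Illustrative sketch (not part of this sync): creating a cm_id and
+ * listening on it.  "example_handler" and the backlog of 8 are
+ * hypothetical; note that in this tree a socket is passed in as well.
+ */
+#if 0
+static int example_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
+{
+	return 0;	/* nonzero from a CONNECT_REQUEST upcall destroys the new id */
+}
+
+static struct iw_cm_id *example_listen(struct ib_device *dev,
+				       struct socket *so)
+{
+	struct iw_cm_id *id;
+
+	id = iw_create_cm_id(dev, so, example_handler, NULL);
+	if (!IS_ERR(id) && iw_cm_listen(id, 8)) {
+		iw_destroy_cm_id(id);
+		id = ERR_PTR(-EINVAL);
+	}
+	return id;
+}
+#endif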
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+
+	if (!qp)
+		return -EINVAL;
+
+	qp_attr.qp_state = IB_QPS_ERR;
+	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+
+	BUG_ON(qp == NULL);
+	qp_attr.qp_state = IB_QPS_SQD;
+	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ *   based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ *   disconnecting concurrently with us and we've already seen the
+ *   DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+	struct ib_qp *qp = NULL;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/* Wait if we're currently in a connect or accept downcall */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+		/* QP could be NULL for a user-mode client */
+		if (cm_id_priv->qp)
+			qp = cm_id_priv->qp;
+		else
+			ret = -EINVAL;
+		break;
+	case IW_CM_STATE_LISTEN:
+		ret = -EINVAL;
+		break;
+	case IW_CM_STATE_CLOSING:
+		/* remote peer closed first */
+	case IW_CM_STATE_IDLE:
+		/* accept or connect returned !0 */
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called disconnect before/without calling accept after
+		 * the connect_request event was delivered.
+		 */
+		break;
+	case IW_CM_STATE_CONN_SENT:
+		/* Can only get here if wait above fails */
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (qp) {
+		if (abrupt)
+			ret = iwcm_modify_qp_err(qp);
+		else
+			ret = iwcm_modify_qp_sqd(qp);
+
+		/*
+		 * If both sides are disconnecting the QP could
+		 * already be in ERR or SQD states
+		 */
+		ret = 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/*
+	 * Wait if we're currently in a connect or accept downcall. A
+	 * listening endpoint should never block here.
+	 */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_LISTEN:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* destroy the listening endpoint */
+		ret = cm_id->device->iwcm->destroy_listen(cm_id);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* Abrupt close of the connection */
+		(void)iwcm_modify_qp_err(cm_id_priv->qp);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CLOSING:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called destroy before/without calling accept after
+		 * receiving the connection request event notification, or
+		 * returned nonzero from the event callback function.
+		 * In either case, we must tell the provider to reject.
+		 */
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_id->device->iwcm->reject(cm_id, NULL, 0);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_DESTROYING:
+	default:
+		BUG();
+		break;
+	}
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	(void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
+
+	destroy_cm_id(cm_id);
+
+	wait_for_completion(&cm_id_priv->destroy_comp);
+
+	free_cm_id(cm_id_priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, backlog);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+		cm_id_priv->state = IW_CM_STATE_LISTEN;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+		if (ret)
+			cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+		 const void *private_data,
+		 u8 private_data_len)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->reject(cm_id, private_data,
+					  private_data_len);
+
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+		 struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	struct ib_qp *qp;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+	if (ret) {
+		/* An error on accept precludes provider events */
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		if (cm_id_priv->qp) {
+			cm_id->device->iwcm->rem_ref(qp);
+			cm_id_priv->qp = NULL;
+		}
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
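+
+/*
+ * Illustrative CONNECT_REQUEST callback sketch (an added example, not
+ * part of this commit): my_qpn_for() and my_want_peer() are hypothetical
+ * helpers.  Returning non-zero makes cm_conn_req_handler() below reject
+ * and destroy the new id on the caller's behalf.
+ */
+extern u32 my_qpn_for(struct iw_cm_id *id);
+extern int my_want_peer(struct sockaddr_in *sin);
+
+static int my_req_handler(struct iw_cm_id *id, struct iw_cm_event *ev)
+{
+	struct iw_cm_conn_param param = {
+		.ord = 1,		/* placeholder RDMA read limits */
+		.ird = 1,
+		.qpn = my_qpn_for(id),
+	};
+
+	if (!my_want_peer(&ev->remote_addr))
+		return -ECONNREFUSED;
+	return iw_cm_accept(id, &param);
+}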
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_destroy_cm_id will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+	unsigned long flags;
+	struct ib_qp *qp;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, 4);
+	if (ret)
+		return ret;
+
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+	if (ret) {
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		if (cm_id_priv->qp) {
+			cm_id->device->iwcm->rem_ref(qp);
+			cm_id_priv->qp = NULL;
+		}
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
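+
+/*
+ * Illustrative active-side sketch (an added example, not part of this
+ * commit): ord/ird and the private data are placeholders.  The provider
+ * later delivers a CONNECT_REPLY event to the cm_handler with the
+ * outcome.
+ */
+static int my_connect_example(struct iw_cm_id *id, u32 qpn,
+			      const void *pdata, u8 pdata_len)
+{
+	struct iw_cm_conn_param param = {
+		.private_data = pdata,
+		.private_data_len = pdata_len,
+		.ord = 1,		/* placeholder RDMA read limits */
+		.ird = 1,
+		.qpn = qpn,
+	};
+
+	return iw_cm_connect(id, &param);
+}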
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied to the new cm_id when it is created. The
+ * event contains the new four-tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+				struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	struct iw_cm_id *cm_id;
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+
+	/*
+	 * The provider should never generate a connection request
+	 * event with a bad status.
+	 */
+	BUG_ON(iw_event->status);
+
+	/*
+	 * We could be destroying the listening id. If so, ignore this
+	 * upcall.
+	 */
+	spin_lock_irqsave(&listen_id_priv->lock, flags);
+	if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+		spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+		goto out;
+	}
+	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+	cm_id = iw_create_cm_id(listen_id_priv->id.device,
+				iw_event->so,
+				listen_id_priv->id.cm_handler,
+				listen_id_priv->id.context);
+	/* If the cm_id could not be created, ignore the request */
+	if (IS_ERR(cm_id))
+		goto out;
+
+	cm_id->provider_data = iw_event->provider_data;
+	cm_id->local_addr = iw_event->local_addr;
+	cm_id->remote_addr = iw_event->remote_addr;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+	ret = alloc_work_entries(cm_id_priv, 3);
+	if (ret) {
+		iw_cm_reject(cm_id, NULL, 0);
+		iw_destroy_cm_id(cm_id);
+		goto out;
+	}
+
+	/* Call the client CM handler */
+	ret = cm_id->cm_handler(cm_id, iw_event);
+	if (ret) {
+		iw_cm_reject(cm_id, NULL, 0);
+		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		destroy_cm_id(cm_id);
+		if (atomic_read(&cm_id_priv->refcount) == 0)
+			free_cm_id(cm_id_priv);
+	}
+
+out:
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/*
+	 * We clear the CONNECT_WAIT bit here to allow the callback
+	 * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+	 * from a callback handler is not allowed.
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+	cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post its requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_cm_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	/*
+	 * Clear the connect wait bit so a callback function calling
+	 * iw_cm_disconnect will not wait and deadlock this thread
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+	if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
+		cm_id_priv->id.local_addr = iw_event->local_addr;
+		cm_id_priv->id.remote_addr = iw_event->remote_addr;
+		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	} else {
+		/* REJECTED or RESET */
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+
+	/* Wake up waiters on connect complete */
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret = 0;
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+	case IW_CM_STATE_CLOSING:
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_DESTROYING:
+		break;
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+			 struct iw_cm_event *iw_event)
+{
+	int ret = 0;
+
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		cm_conn_req_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_ESTABLISHED:
+		ret = cm_conn_est_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_DISCONNECT:
+		cm_disconnect_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_CLOSE:
+		ret = cm_close_handler(cm_id_priv, iw_event);
+		break;
+	default:
+		BUG();
+	}
+
+	return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+	struct iw_cm_event levent;
+	struct iwcm_id_private *cm_id_priv = work->cm_id;
+	unsigned long flags;
+	int empty;
+	int ret = 0;
+	int destroy_id;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	empty = list_empty(&cm_id_priv->work_list);
+	while (!empty) {
+		work = list_entry(cm_id_priv->work_list.next,
+				  struct iwcm_work, list);
+		list_del_init(&work->list);
+		empty = list_empty(&cm_id_priv->work_list);
+		levent = work->event;
+		put_work(work);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+		ret = process_event(cm_id_priv, &levent);
+		if (ret) {
+			set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+			destroy_cm_id(&cm_id_priv->id);
+		}
+		BUG_ON(atomic_read(&cm_id_priv->refcount) == 0);
+		destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		if (iwcm_deref_id(cm_id_priv)) {
+			if (destroy_id) {
+				BUG_ON(!list_empty(&cm_id_priv->work_list));
+				free_cm_id(cm_id_priv);
+			}
+			return;
+		}
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called in interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block.  Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 	      0	- the event was handled.
+ *	-ENOMEM	- the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+			     struct iw_cm_event *iw_event)
+{
+	struct iwcm_work *work;
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	work = get_work(cm_id_priv);
+	if (!work) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_WORK(&work->work, cm_work_handler);
+	work->cm_id = cm_id_priv;
+	work->event = *iw_event;
+
+	if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+	     work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+	    work->event.private_data_len) {
+		ret = copy_private_data(&work->event);
+		if (ret) {
+			put_work(work);
+			goto out;
+		}
+	}
+
+	atomic_inc(&cm_id_priv->refcount);
+	if (list_empty(&cm_id_priv->work_list)) {
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+		queue_work(iwcm_wq, &work->work);
+	} else
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+out:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+					   IB_ACCESS_REMOTE_READ;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = 0;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+	case IB_QPS_RTR:
+		ret = iwcm_init_qp_init_attr(cm_id_priv,
+					     qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTS:
+		ret = iwcm_init_qp_rts_attr(cm_id_priv,
+					    qp_attr, qp_attr_mask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
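+
+/*
+ * Illustrative sketch (an added example, not part of this commit):
+ * my_move_qp() is a hypothetical helper that drives a QP through one
+ * state transition using the CM-supplied attributes and the standard
+ * ib_modify_qp() verb.
+ */
+static int my_move_qp(struct iw_cm_id *id, struct ib_qp *qp,
+		      enum ib_qp_state state)
+{
+	struct ib_qp_attr attr;
+	int mask, ret;
+
+	memset(&attr, 0, sizeof attr);
+	attr.qp_state = state;
+	ret = iw_cm_init_qp_attr(id, &attr, &mask);
+	if (ret)
+		return ret;
+	return ib_modify_qp(qp, &attr, mask);
+}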
+
+static int __init iw_cm_init(void)
+{
+	iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+	if (!iwcm_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+	destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/iwcm.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/iwcm.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/iwcm.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/iwcm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+	IW_CM_STATE_IDLE,             /* unbound, inactive */
+	IW_CM_STATE_LISTEN,           /* listen waiting for connect */
+	IW_CM_STATE_CONN_RECV,        /* inbound waiting for user accept */
+	IW_CM_STATE_CONN_SENT,        /* outbound waiting for peer accept */
+	IW_CM_STATE_ESTABLISHED,      /* established */
+	IW_CM_STATE_CLOSING,	      /* disconnect */
+	IW_CM_STATE_DESTROYING        /* object being deleted */
+};
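+
+/*
+ * Added annotation summarizing the transitions implemented in iwcm.c:
+ *
+ *	IDLE        --iw_cm_listen-->         LISTEN
+ *	LISTEN      --inbound request-->      new cm_id in CONN_RECV
+ *	CONN_RECV   --iw_cm_reject-->         IDLE
+ *	CONN_RECV   --iw_cm_accept + event--> ESTABLISHED
+ *	IDLE        --iw_cm_connect-->        CONN_SENT
+ *	CONN_SENT   --CONNECT_REPLY ok-->     ESTABLISHED (else IDLE)
+ *	ESTABLISHED --DISCONNECT event-->     CLOSING
+ *	ESTABLISHED/CLOSING --CLOSE evential->IDLE
+ *	any         --iw_destroy_cm_id-->     DESTROYING
+ */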
+
+struct iwcm_id_private {
+	struct iw_cm_id	id;
+	enum iw_cm_state state;
+	unsigned long flags;
+	struct ib_qp *qp;
+	struct completion destroy_comp;
+	wait_queue_head_t connect_wait;
+	struct list_head work_list;
+	spinlock_t lock;
+	atomic_t refcount;
+	struct list_head work_free_list;
+};
+
+#define IWCM_F_CALLBACK_DESTROY   1
+#define IWCM_F_CONNECT_WAIT       2
+
+#endif /* IWCM_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/iwcm.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/local_sa.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/local_sa.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/local_sa.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	SA_DB_MAX_PATHS_PER_DEST = 0x7F,
+	SA_DB_MIN_RETRY_TIMER	 = 4000,  /*   4 sec */
+	SA_DB_MAX_RETRY_TIMER	 = 256000 /* 256 sec */
+};
+
+static int set_paths_per_dest(const char *val, struct kernel_param *kp);
+static unsigned long paths_per_dest = 0;
+module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
+		  &paths_per_dest, 0644);
+MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
+				 "to each destination (DGID).  Set to 0 "
+				 "to disable cache.");
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
+static char subscribe_inform_info = 1;
+module_param_call(subscribe_inform_info, set_subscribe_inform_info,
+		  param_get_bool, &subscribe_inform_info, 0644);
+MODULE_PARM_DESC(subscribe_inform_info,
+		 "Subscribe for SA InformInfo/Notice events.");
+
+static int do_refresh(const char *val, struct kernel_param *kp);
+module_param_call(refresh, do_refresh, NULL, NULL, 0200);
+
+static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
+
+enum sa_db_lookup_method {
+	SA_DB_LOOKUP_LEAST_USED,
+	SA_DB_LOOKUP_RANDOM
+};
+
+static int set_lookup_method(const char *val, struct kernel_param *kp);
+static int get_lookup_method(char *buf, struct kernel_param *kp);
+static unsigned long lookup_method;
+module_param_call(lookup_method, set_lookup_method, get_lookup_method,
+		  &lookup_method, 0644);
+MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
+				"multiple paths exist to a given destination.");
+
+static void sa_db_add_dev(struct ib_device *device);
+static void sa_db_remove_dev(struct ib_device *device);
+
+static struct ib_client sa_db_client = {
+	.name   = "local_sa",
+	.add    = sa_db_add_dev,
+	.remove = sa_db_remove_dev
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(lock);
+static rwlock_t rwlock;
+static struct workqueue_struct *sa_wq;
+static struct ib_sa_client sa_client;
+
+enum sa_db_state {
+	SA_DB_IDLE,
+	SA_DB_REFRESH,
+	SA_DB_DESTROY
+};
+
+struct sa_db_port {
+	struct sa_db_device	*dev;
+	struct ib_mad_agent	*agent;
+	/* Limit number of outstanding MADs to SA to reduce SA flooding */
+	struct ib_mad_send_buf	*msg;
+	u16			sm_lid;
+	u8			sm_sl;
+	struct ib_inform_info	*in_info;
+	struct ib_inform_info	*out_info;
+	struct rb_root		paths;
+	struct list_head	update_list;
+	unsigned long		update_id;
+	enum sa_db_state	state;
+	struct work_struct	work;
+	union ib_gid		gid;
+	int			port_num;
+};
+
+struct sa_db_device {
+	struct list_head	list;
+	struct ib_device	*device;
+	struct ib_event_handler event_handler;
+	int			start_port;
+	int			port_count;
+	struct sa_db_port	port[0];
+};
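+
+/*
+ * Added annotation: port[0] above is an old-style flexible array;
+ * sa_db_add_dev() allocates sizeof(*dev) plus one sa_db_port per
+ * physical port in a single kzalloc().
+ */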
+
+struct ib_sa_iterator {
+	struct ib_sa_iterator	*next;
+};
+
+struct ib_sa_attr_iter {
+	struct ib_sa_iterator	*iter;
+	unsigned long		flags;
+};
+
+struct ib_sa_attr_list {
+	struct ib_sa_iterator	iter;
+	struct ib_sa_iterator	*tail;
+	int			update_id;
+	union ib_gid		gid;
+	struct rb_node		node;
+};
+
+struct ib_path_rec_info {
+	struct ib_sa_iterator	iter; /* keep first */
+	struct ib_sa_path_rec	rec;
+	unsigned long		lookups;
+};
+
+struct ib_sa_mad_iter {
+	struct ib_mad_recv_wc	*recv_wc;
+	struct ib_mad_recv_buf	*recv_buf;
+	int			attr_size;
+	int			attr_offset;
+	int			data_offset;
+	int			data_left;
+	void			*attr;
+	u8			attr_data[0];
+};
+
+enum sa_update_type {
+	SA_UPDATE_FULL,
+	SA_UPDATE_ADD,
+	SA_UPDATE_REMOVE
+};
+
+struct update_info {
+	struct list_head	list;
+	union ib_gid		gid;
+	enum sa_update_type	type;
+};
+
+struct sa_path_request {
+	struct work_struct	work;
+	struct ib_sa_client	*client;
+	void			(*callback)(int, struct ib_sa_path_rec *, void *);
+	void			*context;
+	struct ib_sa_path_rec	path_rec;
+};
+
+static void process_updates(struct sa_db_port *port);
+
+static void free_attr_list(struct ib_sa_attr_list *attr_list)
+{
+	struct ib_sa_iterator *cur;
+
+	for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) {
+		attr_list->iter.next = cur->next;
+		kfree(cur);
+	}
+	attr_list->tail = &attr_list->iter;
+}
+
+static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
+{
+	rb_erase(&attr_list->node, root);
+	free_attr_list(attr_list);
+	kfree(attr_list);
+}
+
+static void remove_all_attrs(struct rb_root *root)
+{
+	struct rb_node *node, *next_node;
+	struct ib_sa_attr_list *attr_list;
+
+	write_lock_irq(&rwlock);
+	for (node = rb_first(root); node; node = next_node) {
+		next_node = rb_next(node);
+		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+		remove_attr(root, attr_list);
+	}
+	write_unlock_irq(&rwlock);
+}
+
+static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
+{
+	struct rb_node *node, *next_node;
+	struct ib_sa_attr_list *attr_list;
+
+	write_lock_irq(&rwlock);
+	for (node = rb_first(root); node; node = next_node) {
+		next_node = rb_next(node);
+		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+		if (attr_list->update_id != update_id)
+			remove_attr(root, attr_list);
+	}
+	write_unlock_irq(&rwlock);
+}
+
+static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
+						struct ib_sa_attr_list *attr_list)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ib_sa_attr_list *cur_attr_list;
+	int cmp;
+
+	while (*link) {
+		parent = *link;
+		cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node);
+		cmp = memcmp(&cur_attr_list->gid, &attr_list->gid,
+			     sizeof attr_list->gid);
+		if (cmp < 0)
+			link = &(*link)->rb_left;
+		else if (cmp > 0)
+			link = &(*link)->rb_right;
+		else
+			return cur_attr_list;
+	}
+	rb_link_node(&attr_list->node, parent, link);
+	rb_insert_color(&attr_list->node, root);
+	return NULL;
+}
+
+static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
+{
+	struct rb_node *node = root->rb_node;
+	struct ib_sa_attr_list *attr_list;
+	int cmp;
+
+	while (node) {
+		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+		cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid);
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else
+			return attr_list;
+	}
+	return NULL;
+}
+
+static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
+		       struct ib_sa_iterator *iter)
+{
+	struct ib_sa_attr_list *attr_list;
+	void *err;
+
+	write_lock_irq(&rwlock);
+	attr_list = find_attr_list(root, key);
+	if (!attr_list) {
+		write_unlock_irq(&rwlock);
+		attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
+		if (!attr_list)
+			return -ENOMEM;
+
+		attr_list->iter.next = NULL;
+		attr_list->tail = &attr_list->iter;
+		attr_list->update_id = update_id;
+		memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);
+
+		write_lock_irq(&rwlock);
+		err = insert_attr_list(root, attr_list);
+		if (err) {
+			write_unlock_irq(&rwlock);
+			kfree(attr_list);
+			return PTR_ERR(err);
+		}
+	} else if (attr_list->update_id != update_id) {
+		free_attr_list(attr_list);
+		attr_list->update_id = update_id;
+	}
+
+	attr_list->tail->next = iter;
+	iter->next = NULL;
+	attr_list->tail = iter;
+	write_unlock_irq(&rwlock);
+	return 0;
+}
+
+static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_mad_iter *iter;
+	struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+	int attr_size, attr_offset;
+
+	attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
+	attr_size = 64;		/* path record length */
+	if (attr_offset < attr_size)
+		return ERR_PTR(-EINVAL);
+
+	iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL);
+	if (!iter)
+		return ERR_PTR(-ENOMEM);
+
+	iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
+	iter->recv_wc = mad_recv_wc;
+	iter->recv_buf = &mad_recv_wc->recv_buf;
+	iter->attr_offset = attr_offset;
+	iter->attr_size = attr_size;
+	return iter;
+}
+
+static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
+{
+	kfree(iter);
+}
+
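+/*
+ * Added annotation: returns the next packed attribute in the response.
+ * An attribute that straddles a receive-buffer boundary is reassembled
+ * piecewise into iter->attr_data and returned from there; otherwise the
+ * attribute is referenced in place inside the MAD data.
+ */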
+static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
+{
+	struct ib_sa_mad *mad;
+	int left, offset = 0;
+
+	while (iter->data_left >= iter->attr_offset) {
+		while (iter->data_offset < IB_MGMT_SA_DATA) {
+			mad = (struct ib_sa_mad *) iter->recv_buf->mad;
+
+			left = IB_MGMT_SA_DATA - iter->data_offset;
+			if (left < iter->attr_size) {
+				/* copy first piece of the attribute */
+				iter->attr = &iter->attr_data;
+				memcpy(iter->attr,
+				       &mad->data[iter->data_offset], left);
+				offset = left;
+				break;
+			} else if (offset) {
+				/* copy the second piece of the attribute */
+				memcpy(iter->attr + offset, &mad->data[0],
+				       iter->attr_size - offset);
+				iter->data_offset = iter->attr_size - offset;
+				offset = 0;
+			} else {
+				iter->attr = &mad->data[iter->data_offset];
+				iter->data_offset += iter->attr_size;
+			}
+
+			iter->data_left -= iter->attr_offset;
+			goto out;
+		}
+		iter->data_offset = 0;
+		iter->recv_buf = list_entry(iter->recv_buf->list.next,
+					    struct ib_mad_recv_buf, list);
+	}
+	iter->attr = NULL;
+out:
+	return iter->attr;
+}
+
+/*
+ * Copy path records from a received response and insert them into our cache.
+ * Path records in the MADs are in network order, packed, and may
+ * span multiple MAD buffers, just to make our life hard.
+ */
+static void update_path_db(struct sa_db_port *port,
+			   struct ib_mad_recv_wc *mad_recv_wc,
+			   enum sa_update_type type)
+{
+	struct ib_sa_mad_iter *iter;
+	struct ib_path_rec_info *path_info;
+	void *attr;
+	int ret;
+
+	iter = ib_sa_iter_create(mad_recv_wc);
+	if (IS_ERR(iter))
+		return;
+
+	port->update_id += (type == SA_UPDATE_FULL);
+
+	while ((attr = ib_sa_iter_next(iter)) &&
+	       (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
+
+		ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
+
+		ret = insert_attr(&port->paths, port->update_id,
+				  path_info->rec.dgid.raw, &path_info->iter);
+		if (ret) {
+			kfree(path_info);
+			break;
+		}
+	}
+	ib_sa_iter_free(iter);
+
+	if (type == SA_UPDATE_FULL)
+		remove_old_attrs(&port->paths, port->update_id);
+}
+
+static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
+					  struct update_info *update)
+{
+	struct ib_ah_attr ah_attr;
+	struct ib_mad_send_buf *msg;
+
+	msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+				 IB_MGMT_SA_DATA, GFP_KERNEL);
+	if (IS_ERR(msg))
+		return NULL;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid = port->sm_lid;
+	ah_attr.sl = port->sm_sl;
+	ah_attr.port_num = port->port_num;
+
+	msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+	if (IS_ERR(msg->ah)) {
+		ib_free_send_mad(msg);
+		return NULL;
+	}
+
+	msg->timeout_ms = retry_timer;
+	msg->retries = 0;
+	msg->context[0] = port;
+	msg->context[1] = update;
+	return msg;
+}
+
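+/*
+ * Added annotation: builds a 64-bit transaction ID from the agent's
+ * hi_tid in the upper 32 bits and a globally increasing counter in the
+ * lower 32 bits, keeping concurrent queries distinguishable.
+ */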
+static __be64 form_tid(u32 hi_tid)
+{
+	static atomic_t tid;
+	return cpu_to_be64((((u64) hi_tid) << 32) |
+			   ((u32) atomic_inc_return(&tid)));
+}
+
+static void format_path_req(struct sa_db_port *port,
+			    struct update_info *update,
+			    struct ib_mad_send_buf *msg)
+{
+	struct ib_sa_mad *mad = msg->mad;
+	struct ib_sa_path_rec path_rec;
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class	   = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+	mad->mad_hdr.method	   = IB_SA_METHOD_GET_TABLE;
+	mad->mad_hdr.attr_id	   = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->mad_hdr.tid	   = form_tid(msg->mad_agent->hi_tid);
+
+	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;
+
+	path_rec.sgid = port->gid;
+	path_rec.numb_path = (u8) paths_per_dest;
+
+	if (update->type == SA_UPDATE_ADD) {
+		mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
+		memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
+	}
+
+	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+static int send_query(struct sa_db_port *port,
+		      struct update_info *update)
+{
+	int ret;
+
+	port->msg = get_sa_msg(port, update);
+	if (!port->msg)
+		return -ENOMEM;
+
+	format_path_req(port, update, port->msg);
+
+	ret = ib_post_send_mad(port->msg, NULL);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	ib_destroy_ah(port->msg->ah);
+	ib_free_send_mad(port->msg);
+	return ret;
+}
+
+static void add_update(struct sa_db_port *port, u8 *gid,
+		       enum sa_update_type type)
+{
+	struct update_info *update;
+
+	update = kmalloc(sizeof *update, GFP_KERNEL);
+	if (update) {
+		if (gid)
+			memcpy(&update->gid, gid, sizeof update->gid);
+		update->type = type;
+		list_add(&update->list, &port->update_list);
+	}
+
+	if (port->state == SA_DB_IDLE) {
+		port->state = SA_DB_REFRESH;
+		process_updates(port);
+	}
+}
+
+static void clean_update_list(struct sa_db_port *port)
+{
+	struct update_info *update;
+
+	while (!list_empty(&port->update_list)) {
+		update = list_entry(port->update_list.next,
+				    struct update_info, list);
+		list_del(&update->list);
+		kfree(update);
+	}
+}
+
+static int notice_handler(int status, struct ib_inform_info *info,
+			  struct ib_sa_notice *notice)
+{
+	struct sa_db_port *port = info->context;
+	struct ib_sa_notice_data_gid *gid_data;
+	struct ib_inform_info **pinfo;
+	enum sa_update_type type;
+
+	if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
+		pinfo = &port->in_info;
+		type = SA_UPDATE_ADD;
+	} else {
+		pinfo = &port->out_info;
+		type = SA_UPDATE_REMOVE;
+	}
+
+	mutex_lock(&lock);
+	if (port->state == SA_DB_DESTROY || !*pinfo) {
+		mutex_unlock(&lock);
+		return 0;
+	}
+
+	if (notice) {
+		gid_data = (struct ib_sa_notice_data_gid *)
+			   &notice->data_details;
+		add_update(port, gid_data->gid, type);
+		mutex_unlock(&lock);
+	} else if (status == -ENETRESET) {
+		*pinfo = NULL;
+		mutex_unlock(&lock);
+	} else {
+		if (status)
+			*pinfo = ERR_PTR(-EINVAL);
+		port->state = SA_DB_IDLE;
+		clean_update_list(port);
+		mutex_unlock(&lock);
+		queue_work(sa_wq, &port->work);
+	}
+
+	return status;
+}
+
+static int reg_in_info(struct sa_db_port *port)
+{
+	int ret = 0;
+
+	port->in_info = ib_sa_register_inform_info(&sa_client,
+						   port->dev->device,
+						   port->port_num,
+						   IB_SA_SM_TRAP_GID_IN_SERVICE,
+						   GFP_KERNEL, notice_handler,
+						   port);
+	if (IS_ERR(port->in_info))
+		ret = PTR_ERR(port->in_info);
+
+	return ret;
+}
+
+static int reg_out_info(struct sa_db_port *port)
+{
+	int ret = 0;
+
+	port->out_info = ib_sa_register_inform_info(&sa_client,
+						    port->dev->device,
+						    port->port_num,
+						    IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
+						    GFP_KERNEL, notice_handler,
+						    port);
+	if (IS_ERR(port->out_info))
+		ret = PTR_ERR(port->out_info);
+
+	return ret;
+}
+
+static void unsubscribe_port(struct sa_db_port *port)
+{
+	if (port->in_info && !IS_ERR(port->in_info))
+		ib_sa_unregister_inform_info(port->in_info);
+
+	if (port->out_info && !IS_ERR(port->out_info))
+		ib_sa_unregister_inform_info(port->out_info);
+
+	port->out_info = NULL;
+	port->in_info = NULL;
+}
+
+static void cleanup_port(struct sa_db_port *port)
+{
+	unsubscribe_port(port);
+
+	clean_update_list(port);
+	remove_all_attrs(&port->paths);
+}
+
+static int update_port_info(struct sa_db_port *port)
+{
+	struct ib_port_attr port_attr;
+	int ret;
+
+	ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
+	if (ret)
+		return ret;
+
+	if (port_attr.state != IB_PORT_ACTIVE)
+		return -ENODATA;
+
+	port->sm_lid = port_attr.sm_lid;
+	port->sm_sl = port_attr.sm_sl;
+	return 0;
+}
+
+static void process_updates(struct sa_db_port *port)
+{
+	struct update_info *update;
+	struct ib_sa_attr_list *attr_list;
+	int ret;
+
+	if (!paths_per_dest || update_port_info(port)) {
+		cleanup_port(port);
+		goto out;
+	}
+
+	/* Event registration is an optimization, so ignore failures. */
+	if (subscribe_inform_info) {
+		if (!port->out_info) {
+			ret = reg_out_info(port);
+			if (!ret)
+				return;
+		}
+
+		if (!port->in_info) {
+			ret = reg_in_info(port);
+			if (!ret)
+				return;
+		}
+	} else
+		unsubscribe_port(port);
+
+	while (!list_empty(&port->update_list)) {
+		update = list_entry(port->update_list.next,
+				    struct update_info, list);
+
+		if (update->type == SA_UPDATE_REMOVE) {
+			write_lock_irq(&rwlock);
+			attr_list = find_attr_list(&port->paths,
+						   update->gid.raw);
+			if (attr_list)
+				remove_attr(&port->paths, attr_list);
+			write_unlock_irq(&rwlock);
+		} else {
+			ret = send_query(port, update);
+			if (!ret)
+				return;
+		}
+		list_del(&update->list);
+		kfree(update);
+	}
+out:
+	port->state = SA_DB_IDLE;
+}
+
+static void refresh_port_db(struct sa_db_port *port)
+{
+	if (port->state == SA_DB_DESTROY)
+		return;
+
+	if (port->state == SA_DB_REFRESH) {
+		clean_update_list(port);
+		ib_cancel_mad(port->agent, port->msg);
+	}
+
+	add_update(port, NULL, SA_UPDATE_FULL);
+}
+
+static void refresh_dev_db(struct sa_db_device *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->port_count; i++)
+		refresh_port_db(&dev->port[i]);
+}
+
+static void refresh_db(void)
+{
+	struct sa_db_device *dev;
+
+	list_for_each_entry(dev, &dev_list, list)
+		refresh_dev_db(dev);
+}
+
+static int do_refresh(const char *val, struct kernel_param *kp)
+{
+	mutex_lock(&lock);
+	refresh_db();
+	mutex_unlock(&lock);
+	return 0;
+}
+
+static int get_lookup_method(char *buf, struct kernel_param *kp)
+{
+	return sprintf(buf,
+		       "%c %d round robin\n"
+		       "%c %d random",
+		       (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ',
+		       SA_DB_LOOKUP_LEAST_USED,
+		       (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ',
+		       SA_DB_LOOKUP_RANDOM);
+}
+
+static int set_lookup_method(const char *val, struct kernel_param *kp)
+{
+	unsigned long method;
+	int ret = 0;
+
+	method = simple_strtoul(val, NULL, 0);
+
+	switch (method) {
+	case SA_DB_LOOKUP_LEAST_USED:
+	case SA_DB_LOOKUP_RANDOM:
+		lookup_method = method;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int set_paths_per_dest(const char *val, struct kernel_param *kp)
+{
+	int ret;
+
+	mutex_lock(&lock);
+	ret = param_set_ulong(val, kp);
+	if (ret)
+		goto out;
+
+	if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
+		paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+	refresh_db();
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
+{
+	int ret;
+
+	ret = param_set_bool(val, kp);
+	if (ret)
+		return ret;
+
+	return do_refresh(val, kp);
+}
+
+static void port_work_handler(struct work_struct *work)
+{
+	struct sa_db_port *port;
+
+	port = container_of(work, typeof(*port), work);
+	mutex_lock(&lock);
+	refresh_port_db(port);
+	mutex_unlock(&lock);
+}
+
+static void handle_event(struct ib_event_handler *event_handler,
+			 struct ib_event *event)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+
+	dev = container_of(event_handler, typeof(*dev), event_handler);
+	port = &dev->port[event->element.port_num - dev->start_port];
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_PORT_ACTIVE:
+		queue_work(sa_wq, &port->work);
+		break;
+	default:
+		break;
+	}
+}
+
+static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
+{
+	read_unlock_irqrestore(&rwlock, iter->flags);
+}
+
+static int ib_create_path_iter(struct ib_device *device, u8 port_num,
+			       union ib_gid *dgid, struct ib_sa_attr_iter *iter)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	struct ib_sa_attr_list *list;
+
+	dev = ib_get_client_data(device, &sa_db_client);
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+
+	read_lock_irqsave(&rwlock, iter->flags);
+	list = find_attr_list(&port->paths, dgid->raw);
+	if (!list) {
+		ib_free_path_iter(iter);
+		return -ENODATA;
+	}
+
+	iter->iter = &list->iter;
+	return 0;
+}
+
+static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
+{
+	struct ib_path_rec_info *next_path;
+
+	iter->iter = iter->iter->next;
+	if (iter->iter) {
+		next_path = container_of(iter->iter, struct ib_path_rec_info, iter);
+		return &next_path->rec;
+	} else
+		return NULL;
+}
+
+static int cmp_rec(struct ib_sa_path_rec *src,
+		   struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* DGID check already done */
+	if (comp_mask & IB_SA_PATH_REC_SGID &&
+	    memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
+	    src->raw_traffic != dst->raw_traffic)
+		return -EINVAL;
+
+	if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
+	    src->flow_label != dst->flow_label)
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
+	    src->hop_limit != dst->hop_limit)
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
+	    src->traffic_class != dst->traffic_class)
+		return -EINVAL;
+	if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
+	    dst->reversible && !src->reversible)
+		return -EINVAL;
+	/* Numb path check already done */
+	if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
+		return -EINVAL;
+
+	if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
+		return -EINVAL;
+
+	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
+				 IB_SA_PATH_REC_MTU, dst->mtu_selector,
+				 src->mtu, dst->mtu))
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
+				 IB_SA_PATH_REC_RATE, dst->rate_selector,
+				 src->rate, dst->rate))
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask,
+				 IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_PATH_REC_PACKET_LIFE_TIME,
+				 dst->packet_life_time_selector,
+				 src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+
+	return 0;
+}
+
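+/*
+ * Added annotation: single-pass random selection.  The i-th matching
+ * path replaces the current pick when (num % ++count) == 0, i.e. with
+ * probability roughly 1/i, so every matching path is returned with
+ * (approximately) equal probability (size-one reservoir sampling).
+ */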
+static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
+					      struct ib_sa_path_rec *req_path,
+					      ib_sa_comp_mask comp_mask)
+{
+	struct ib_sa_path_rec *path, *rand_path = NULL;
+	int num, count = 0;
+
+	for (path = ib_get_next_path(iter); path;
+	     path = ib_get_next_path(iter)) {
+		if (!cmp_rec(path, req_path, comp_mask)) {
+			get_random_bytes(&num, sizeof num);
+			if ((num % ++count) == 0)
+				rand_path = path;
+		}
+	}
+
+	return rand_path;
+}
+
+static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
+					    struct ib_sa_path_rec *req_path,
+					    ib_sa_comp_mask comp_mask)
+{
+	struct ib_path_rec_info *cur_path, *next_path = NULL;
+	struct ib_sa_path_rec *path;
+	unsigned long lookups = ~0;
+
+	for (path = ib_get_next_path(iter); path;
+	     path = ib_get_next_path(iter)) {
+		if (!cmp_rec(path, req_path, comp_mask)) {
+
+			cur_path = container_of(iter->iter, struct ib_path_rec_info,
+						iter);
+			if (cur_path->lookups < lookups) {
+				lookups = cur_path->lookups;
+				next_path = cur_path;
+			}
+		}
+	}
+
+	if (next_path) {
+		next_path->lookups++;
+		return &next_path->rec;
+	} else
+		return NULL;
+}
+
+static void report_path(struct work_struct *work)
+{
+	struct sa_path_request *req;
+
+	req = container_of(work, struct sa_path_request, work);
+	req->callback(0, &req->path_rec, req->context);
+	ib_sa_client_put(req->client);
+	kfree(req);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path.  The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+		       struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **sa_query)
+{
+	struct sa_path_request *req;
+	struct ib_sa_attr_iter iter;
+	struct ib_sa_path_rec *path_rec;
+	int ret;
+
+	if (!paths_per_dest)
+		goto query_sa;
+
+	if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
+	    !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
+		goto query_sa;
+
+	req = kmalloc(sizeof *req, gfp_mask);
+	if (!req)
+		goto query_sa;
+
+	ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
+	if (ret)
+		goto free_req;
+
+	if (lookup_method == SA_DB_LOOKUP_RANDOM)
+		path_rec = get_random_path(&iter, rec, comp_mask);
+	else
+		path_rec = get_next_path(&iter, rec, comp_mask);
+
+	if (!path_rec)
+		goto free_iter;
+
+	memcpy(&req->path_rec, path_rec, sizeof *path_rec);
+	ib_free_path_iter(&iter);
+
+	INIT_WORK(&req->work, report_path);
+	req->client = client;
+	req->callback = callback;
+	req->context = context;
+
+	ib_sa_client_get(client);
+	queue_work(sa_wq, &req->work);
+	*sa_query = ERR_PTR(-EEXIST);
+	return 0;
+
+free_iter:
+	ib_free_path_iter(&iter);
+free_req:
+	kfree(req);
+query_sa:
+	return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
+				    timeout_ms, gfp_mask, callback, context,
+				    sa_query);
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
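+
+/*
+ * Illustrative caller sketch (an added example, not part of this
+ * commit): my_* names and the 1000 ms timeout are placeholders.
+ * Requesting exactly one path with DGID and NUMB_PATH in the component
+ * mask is what allows the cache above to satisfy the query.
+ */
+static void my_path_cb(int status, struct ib_sa_path_rec *resp, void *context)
+{
+	/* resp is only valid when status is 0 */
+	if (!status)
+		memcpy(context, resp, sizeof *resp);
+}
+
+static int my_query_path(struct ib_sa_client *client, struct ib_device *dev,
+			 u8 port_num, union ib_gid *dgid,
+			 struct ib_sa_path_rec *out)
+{
+	struct ib_sa_path_rec rec;
+	struct ib_sa_query *query;
+
+	memset(&rec, 0, sizeof rec);
+	rec.dgid = *dgid;
+	rec.numb_path = 1;
+	return ib_sa_path_rec_get(client, dev, port_num, &rec,
+				  IB_SA_PATH_REC_DGID |
+				  IB_SA_PATH_REC_NUMB_PATH,
+				  1000, GFP_KERNEL, my_path_cb, out, &query);
+}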
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct sa_db_port *port;
+	struct update_info *update;
+	struct ib_mad_send_buf *msg;
+	enum sa_update_type type;
+
+	msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
+	port = msg->context[0];
+	update = msg->context[1];
+
+	mutex_lock(&lock);
+	if (port->state == SA_DB_DESTROY ||
+	    update != list_entry(port->update_list.next,
+				 struct update_info, list)) {
+		mutex_unlock(&lock);
+	} else {
+		type = update->type;
+		mutex_unlock(&lock);
+		update_path_db(mad_agent->context, mad_recv_wc, type);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
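+/*
+ * Added annotation: on a response timeout the query is reposted with its
+ * timeout doubled (msg->timeout_ms <<= 1) until SA_DB_MAX_RETRY_TIMER is
+ * reached, a simple exponential backoff that limits SA flooding.
+ */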
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct sa_db_port *port;
+	struct update_info *update;
+	int ret;
+
+	msg = mad_send_wc->send_buf;
+	port = msg->context[0];
+	update = msg->context[1];
+
+	mutex_lock(&lock);
+	if (port->state == SA_DB_DESTROY)
+		goto unlock;
+
+	if (update == list_entry(port->update_list.next,
+				 struct update_info, list)) {
+
+		if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
+		    msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
+
+			msg->timeout_ms <<= 1;
+			ret = ib_post_send_mad(msg, NULL);
+			if (!ret) {
+				mutex_unlock(&lock);
+				return;
+			}
+		}
+		list_del(&update->list);
+		kfree(update);
+	}
+	process_updates(port);
+unlock:
+	mutex_unlock(&lock);
+
+	ib_destroy_ah(msg->ah);
+	ib_free_send_mad(msg);
+}
+
+static int init_port(struct sa_db_device *dev, int port_num)
+{
+	struct sa_db_port *port;
+	int ret;
+
+	port = &dev->port[port_num - dev->start_port];
+	port->dev = dev;
+	port->port_num = port_num;
+	INIT_WORK(&port->work, port_work_handler);
+	port->paths = RB_ROOT;
+	INIT_LIST_HEAD(&port->update_list);
+
+	ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
+	if (ret)
+		return ret;
+
+	port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
+					    NULL, IB_MGMT_RMPP_VERSION,
+					    send_handler, recv_handler, port);
+	if (IS_ERR(port->agent))
+		ret = PTR_ERR(port->agent);
+
+	return ret;
+}
+
+static void destroy_port(struct sa_db_port *port)
+{
+	mutex_lock(&lock);
+	port->state = SA_DB_DESTROY;
+	mutex_unlock(&lock);
+
+	ib_unregister_mad_agent(port->agent);
+	cleanup_port(port);
+	flush_workqueue(sa_wq);
+}
+
+static void sa_db_add_dev(struct ib_device *device)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	int s, e, i, ret;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		s = e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
+	if (!dev)
+		return;
+
+	dev->start_port = s;
+	dev->port_count = e - s + 1;
+	dev->device = device;
+	for (i = 0; i < dev->port_count; i++) {
+		ret = init_port(dev, s + i);
+		if (ret)
+			goto err;
+	}
+
+	ib_set_client_data(device, &sa_db_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+
+	mutex_lock(&lock);
+	list_add_tail(&dev->list, &dev_list);
+	refresh_dev_db(dev);
+	mutex_unlock(&lock);
+
+	ib_register_event_handler(&dev->event_handler);
+	return;
+err:
+	while (i--)
+		destroy_port(&dev->port[i]);
+	kfree(dev);
+}
+
+static void sa_db_remove_dev(struct ib_device *device)
+{
+	struct sa_db_device *dev;
+	int i;
+
+	dev = ib_get_client_data(device, &sa_db_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(sa_wq);
+
+	for (i = 0; i < dev->port_count; i++)
+		destroy_port(&dev->port[i]);
+
+	mutex_lock(&lock);
+	list_del(&dev->list);
+	mutex_unlock(&lock);
+
+	kfree(dev);
+}
+
+int sa_db_init(void)
+{
+	int ret;
+
+	rwlock_init(&rwlock);
+	sa_wq = create_singlethread_workqueue("local_sa");
+	if (!sa_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	ret = ib_register_client(&sa_db_client);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(sa_wq);
+	return ret;
+}
+
+void sa_db_cleanup(void)
+{
+	ib_unregister_client(&sa_db_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(sa_wq);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/local_sa.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/mad.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/mad.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/mad.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,3057 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+static struct kmem_cache *ib_mad_cache;
+
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static spinlock_t ib_mad_port_list_lock;
+
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+					struct ib_mad_port_private *port_priv,
+					struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns an ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+
+	list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+		if (entry->device == device && entry->port_num == port_num)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Wrapper function to return an ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	entry = __ib_get_mad_port(device, port_num);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+	/* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+	return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+		0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+	switch (qp_type)
+	{
+	case IB_QPT_SMI:
+		return 0;
+	case IB_QPT_GSI:
+		return 1;
+	default:
+		return -1;
+	}
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+	return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+	if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+	    (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return 0;
+	return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+	if (oui[0] || oui[1] || oui[2])
+		return 1;
+	return 0;
+}
+
+static int is_vendor_method_in_use(
+		struct ib_mad_mgmt_vendor_class *vendor_class,
+		struct ib_mad_reg_req *mad_reg_req)
+{
+	struct ib_mad_mgmt_method_table *method;
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+			method = vendor_class->method_table[i];
+			if (method) {
+				if (method_in_use(&method, mad_reg_req))
+					return 1;
+				else
+					break;
+			}
+		}
+	}
+	return 0;
+}
+
+int ib_response_mad(struct ib_mad *mad)
+{
+	return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
+		(mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+		((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
+		 (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+static void timeout_callback(unsigned long data)
+{
+	struct ib_mad_agent_private *mad_agent_priv =
+		(struct ib_mad_agent_private *) data;
+
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		   &mad_agent_priv->timeout_work);
+}
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_reg_req *reg_req = NULL;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	struct ib_mad_mgmt_method_table *method;
+	int ret2, qpn;
+	unsigned long flags;
+	u8 mgmt_class, vclass;
+
+	/* Validate parameters */
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1)
+		goto error1;
+
+	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
+		goto error1;
+
+	/* Validate MAD registration request if supplied */
+	if (mad_reg_req) {
+		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+			goto error1;
+		if (!recv_handler)
+			goto error1;
+		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+			/*
+			 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+			 * one in this range currently allowed
+			 */
+			if (mad_reg_req->mgmt_class !=
+			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+				goto error1;
+		} else if (mad_reg_req->mgmt_class == 0) {
+			/*
+			 * Class 0 is reserved in IBA and is used for
+			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+			 */
+			goto error1;
+		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+			/*
+			 * If class is in "new" vendor range,
+			 * ensure supplied OUI is not zero
+			 */
+			if (!is_vendor_oui(mad_reg_req->oui))
+				goto error1;
+		}
+		/* Make sure class supplied is consistent with RMPP */
+		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+			if (rmpp_version)
+				goto error1;
+		}
+		/* Make sure class supplied is consistent with QP type */
+		if (qp_type == IB_QPT_SMI) {
+			if ((mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+			    (mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		} else {
+			if ((mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+			    (mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		}
+	} else {
+		/* No registration request supplied */
+		if (!send_handler)
+			goto error1;
+	}
+
+	/* Validate device and port */
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+
+	/* Allocate structures */
+	mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+	if (!mad_agent_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+						 IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(mad_agent_priv->agent.mr)) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error2;
+	}
+
+	if (mad_reg_req) {
+		reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+		if (!reg_req) {
+			ret = ERR_PTR(-ENOMEM);
+			goto error3;
+		}
+		/* Make a copy of the MAD registration request */
+		memcpy(reg_req, mad_reg_req, sizeof *reg_req);
+	}
+
+	/* Now, fill in the various structures */
+	mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_agent_priv->reg_req = reg_req;
+	mad_agent_priv->agent.rmpp_version = rmpp_version;
+	mad_agent_priv->agent.device = device;
+	mad_agent_priv->agent.recv_handler = recv_handler;
+	mad_agent_priv->agent.send_handler = send_handler;
+	mad_agent_priv->agent.context = context;
+	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_agent_priv->agent.port_num = port_num;
+	spin_lock_init(&mad_agent_priv->lock);
+	INIT_LIST_HEAD(&mad_agent_priv->send_list);
+	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+	INIT_LIST_HEAD(&mad_agent_priv->done_list);
+	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+	INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends);
+	setup_timer(&mad_agent_priv->timeout_timer, timeout_callback,
+		    (unsigned long) mad_agent_priv);
+	INIT_LIST_HEAD(&mad_agent_priv->local_list);
+	INIT_WORK(&mad_agent_priv->local_work, local_completions);
+	atomic_set(&mad_agent_priv->refcount, 1);
+	init_completion(&mad_agent_priv->comp);
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+	/*
+	 * Make sure the MAD registration (if supplied) does not
+	 * overlap with any existing registrations
+	 */
+	if (mad_reg_req) {
+		mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+		if (!is_vendor_class(mgmt_class)) {
+			class = port_priv->version[mad_reg_req->
+						   mgmt_class_version].class;
+			if (class) {
+				method = class->method_table[mgmt_class];
+				if (method) {
+					if (method_in_use(&method,
+							   mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+						  mgmt_class);
+		} else {
+			/* "New" vendor class range */
+			vendor = port_priv->version[mad_reg_req->
+						    mgmt_class_version].vendor;
+			if (vendor) {
+				vclass = vendor_class_index(mgmt_class);
+				vendor_class = vendor->vendor_class[vclass];
+				if (vendor_class) {
+					if (is_vendor_method_in_use(
+							vendor_class,
+							mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+		}
+		if (ret2) {
+			ret = ERR_PTR(ret2);
+			goto error4;
+		}
+	}
+
+	/* Add mad agent into port's agent list */
+	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return &mad_agent_priv->agent;
+
+error4:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+	kfree(reg_req);
+error3:
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+	kfree(mad_agent_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(/*IB_MAD_SNOOP_POSTED_SENDS |
+		 IB_MAD_SNOOP_RMPP_SENDS |*/
+		 IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+		 IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(IB_MAD_SNOOP_RECVS /*|
+		 IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
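+/*
+ * Find a free slot in the per-QP snoop table, growing the table by one
+ * entry if it is full.  The table is resized under the snoop lock,
+ * hence the atomic allocation.
+ */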
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+				struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_snoop_private **new_snoop_table;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	/* Check for empty slot in array. */
+	for (i = 0; i < qp_info->snoop_table_size; i++)
+		if (!qp_info->snoop_table[i])
+			break;
+
+	if (i == qp_info->snoop_table_size) {
+		/* Grow table. */
+		new_snoop_table = krealloc(qp_info->snoop_table,
+					   sizeof mad_snoop_priv *
+					   (qp_info->snoop_table_size + 1),
+					   GFP_ATOMIC);
+		if (!new_snoop_table) {
+			i = -ENOMEM;
+			goto out;
+		}
+
+		qp_info->snoop_table = new_snoop_table;
+		qp_info->snoop_table_size++;
+	}
+	qp_info->snoop_table[i] = mad_snoop_priv;
+	atomic_inc(&qp_info->snoop_count);
+out:
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+	return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	int qpn;
+
+	/* Validate parameters */
+	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+	/* Allocate structures */
+	mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+	if (!mad_snoop_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	/* Now, fill in the various structures */
+	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_snoop_priv->agent.device = device;
+	mad_snoop_priv->agent.recv_handler = recv_handler;
+	mad_snoop_priv->agent.snoop_handler = snoop_handler;
+	mad_snoop_priv->agent.context = context;
+	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_snoop_priv->agent.port_num = port_num;
+	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+	init_completion(&mad_snoop_priv->comp);
+	mad_snoop_priv->snoop_index = register_snoop_agent(
+						&port_priv->qp_info[qpn],
+						mad_snoop_priv);
+	if (mad_snoop_priv->snoop_index < 0) {
+		ret = ERR_PTR(mad_snoop_priv->snoop_index);
+		goto error2;
+	}
+
+	atomic_set(&mad_snoop_priv->refcount, 1);
+	return &mad_snoop_priv->agent;
+
+error2:
+	kfree(mad_snoop_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
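+/*
+ * Dropping the last reference signals the completion that the
+ * unregister paths wait on, so an agent is only torn down once all
+ * outstanding work that references it has finished.
+ */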
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	if (atomic_dec_and_test(&mad_agent_priv->refcount))
+		complete(&mad_agent_priv->comp);
+}
+
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+		complete(&mad_snoop_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	/* Note that we could still be handling received MADs */
+
+	/*
+	 * Canceling all sends results in dropping received response
+	 * MADs, preventing us from queuing additional work
+	 */
+	cancel_mads(mad_agent_priv);
+	port_priv = mad_agent_priv->qp_info->port_priv;
+	del_timer_sync(&mad_agent_priv->timeout_timer);
+	cancel_work_sync(&mad_agent_priv->timeout_work);
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	remove_mad_reg_req(mad_agent_priv);
+	list_del(&mad_agent_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	flush_workqueue(port_priv->wq);
+	ib_cancel_rmpp_recvs(mad_agent_priv);
+
+	deref_mad_agent(mad_agent_priv);
+	wait_for_completion(&mad_agent_priv->comp);
+
+	kfree(mad_agent_priv->reg_req);
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+	kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_qp_info *qp_info;
+	unsigned long flags;
+
+	qp_info = mad_snoop_priv->qp_info;
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+	atomic_dec(&qp_info->snoop_count);
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+	deref_snoop_agent(mad_snoop_priv);
+	wait_for_completion(&mad_snoop_priv->comp);
+
+	kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+
+	/* If the TID is zero, the agent can only snoop. */
+	if (mad_agent->hi_tid) {
+		mad_agent_priv = container_of(mad_agent,
+					      struct ib_mad_agent_private,
+					      agent);
+		unregister_mad_agent(mad_agent_priv);
+	} else {
+		mad_snoop_priv = container_of(mad_agent,
+					      struct ib_mad_snoop_private,
+					      agent);
+		unregister_mad_snoop(mad_snoop_priv);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+	struct ib_mad_queue *mad_queue;
+	unsigned long flags;
+
+	BUG_ON(!mad_list->mad_queue);
+	mad_queue = mad_list->mad_queue;
+	spin_lock_irqsave(&mad_queue->lock, flags);
+	list_del(&mad_list->list);
+	mad_queue->count--;
+	spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
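+/*
+ * Deliver a send completion to every snoop agent whose flags match.
+ * The snoop lock is dropped around the upcall; a reference taken under
+ * the lock keeps the agent alive while its handler runs.
+ */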
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_send_buf *send_buf,
+		       struct ib_mad_send_wc *mad_send_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+						    send_buf, mad_send_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_recv_wc *mad_recv_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+						   mad_recv_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
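+/*
+ * Fabricate a receive work completion for a directed route SMP that is
+ * consumed locally, so it can be fed through the normal receive path.
+ */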
+static void build_smp_wc(struct ib_qp *qp,
+			 u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+			 struct ib_wc *wc)
+{
+	memset(wc, 0, sizeof *wc);
+	wc->wr_id = wr_id;
+	wc->status = IB_WC_SUCCESS;
+	wc->opcode = IB_WC_RECV;
+	wc->pkey_index = pkey_index;
+	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+	wc->src_qp = IB_QP0;
+	wc->qp = qp;
+	wc->slid = slid;
+	wc->sl = 0;
+	wc->dlid_path_bits = 0;
+	wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+				  struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int ret = 0;
+	struct ib_smp *smp = mad_send_wr->send_buf.mad;
+	unsigned long flags;
+	struct ib_mad_local_private *local;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent_private *recv_mad_agent = NULL;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num;
+	struct ib_wc mad_wc;
+	struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		port_num = send_wr->wr.ud.port_num;
+	else
+		port_num = mad_agent_priv->agent.port_num;
+
+	/*
+	 * Directed route handling starts if the initial LID routed part of
+	 * a request or the ending LID routed part of a response is empty.
+	 * If we are at the start of the LID routed part, don't update the
+	 * hop_ptr or hop_cnt.  See section 14.2.2, Vol 1 IB spec.
+	 */
+	if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) !=
+	     IB_LID_PERMISSIVE)
+		goto out;
+	if (smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+	     IB_SMI_DISCARD) {
+		ret = -EINVAL;
+		printk(KERN_ERR PFX "Invalid directed route\n");
+		goto out;
+	}
+
+	/* Check to post send on QP or process locally */
+	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+	    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+		goto out;
+
+	local = kmalloc(sizeof *local, GFP_ATOMIC);
+	if (!local) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+		goto out;
+	}
+	local->mad_priv = NULL;
+	local->recv_mad_agent = NULL;
+	mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+	if (!mad_priv) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for local response MAD\n");
+		kfree(local);
+		goto out;
+	}
+
+	build_smp_wc(mad_agent_priv->agent.qp,
+		     send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+		     send_wr->wr.ud.pkey_index,
+		     send_wr->wr.ud.port_num, &mad_wc);
+
+	/* No GRH for DR SMP */
+	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+				  (struct ib_mad *)smp,
+				  (struct ib_mad *)&mad_priv->mad);
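+	/*
+	 * process_mad returns a bitmask: SUCCESS with REPLY means the
+	 * driver generated a response, SUCCESS with CONSUMED means it
+	 * handled the MAD outright, and bare SUCCESS means the MAD
+	 * should be treated as an ordinary incoming receive.
+	 */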
+	switch (ret) {
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+		if (ib_response_mad(&mad_priv->mad.mad) &&
+		    mad_agent_priv->agent.recv_handler) {
+			local->mad_priv = mad_priv;
+			local->recv_mad_agent = mad_agent_priv;
+			/*
+			 * Reference MAD agent until receive
+			 * side of local completion handled
+			 */
+			atomic_inc(&mad_agent_priv->refcount);
+		} else
+			kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS:
+		/* Treat like an incoming receive MAD */
+		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+					    mad_agent_priv->agent.port_num);
+		if (port_priv) {
+			memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+			recv_mad_agent = find_mad_agent(port_priv,
+						        &mad_priv->mad.mad);
+		}
+		if (!port_priv || !recv_mad_agent) {
+			/*
+			 * No receiving agent so drop packet and
+			 * generate send completion.
+			 */
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			break;
+		}
+		local->mad_priv = mad_priv;
+		local->recv_mad_agent = recv_mad_agent;
+		break;
+	default:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		kfree(local);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	local->mad_send_wr = mad_send_wr;
+	/* Reference MAD agent until send side of local completion handled */
+	atomic_inc(&mad_agent_priv->refcount);
+	/* Queue local completion to local list */
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		   &mad_agent_priv->local_work);
+	ret = 1;
+out:
+	return ret;
+}
+
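+/*
+ * Compute the padding needed to fill the last RMPP segment: each
+ * segment carries sizeof(struct ib_mad) - hdr_len data bytes (a MAD is
+ * 256 bytes, so e.g. a 56-byte header leaves 200 bytes per segment,
+ * and a 300-byte payload would get 100 bytes of padding).
+ */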
+static int get_pad_size(int hdr_len, int data_len)
+{
+	int seg_size, pad;
+
+	seg_size = sizeof(struct ib_mad) - hdr_len;
+	if (data_len && seg_size) {
+		pad = seg_size - data_len % seg_size;
+		return pad == seg_size ? 0 : pad;
+	} else
+		return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_segment *s, *t;
+
+	list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+				gfp_t gfp_mask)
+{
+	struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+	struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+	struct ib_rmpp_segment *seg = NULL;
+	int left, seg_size, pad;
+
+	send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+	seg_size = send_buf->seg_size;
+	pad = send_wr->pad;
+
+	/* Allocate data segments. */
+	for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+		seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+		if (!seg) {
+			printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem "
+			       "alloc failed for len %zd, gfp %#x\n",
+			       sizeof (*seg) + seg_size, gfp_mask);
+			free_send_rmpp_list(send_wr);
+			return -ENOMEM;
+		}
+		seg->num = ++send_buf->seg_count;
+		list_add_tail(&seg->list, &send_wr->rmpp_list);
+	}
+
+	/* Zero any padding */
+	if (pad)
+		memset(seg->data + seg_size - pad, 0, pad);
+
+	rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+					  agent.rmpp_version;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+					struct ib_rmpp_segment, list);
+	send_wr->last_ack_seg = send_wr->cur_seg;
+	return 0;
+}
+
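+/*
+ * The send buffer is a single allocation: the MAD itself comes first
+ * (hdr_len bytes for an RMPP send, a full 256-byte MAD otherwise),
+ * immediately followed by the private ib_mad_send_wr_private state.
+ */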
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					    u32 remote_qpn, u16 pkey_index,
+					    int rmpp_active,
+					    int hdr_len, int data_len,
+					    gfp_t gfp_mask)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int pad, message_size, ret, size;
+	void *buf;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	pad = get_pad_size(hdr_len, data_len);
+	message_size = hdr_len + data_len + pad;
+
+	if ((!mad_agent->rmpp_version &&
+	     (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+	    (!rmpp_active && message_size > sizeof(struct ib_mad)))
+		return ERR_PTR(-EINVAL);
+
+	size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+	buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	mad_send_wr = buf + size;
+	INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+	mad_send_wr->send_buf.mad = buf;
+	mad_send_wr->send_buf.hdr_len = hdr_len;
+	mad_send_wr->send_buf.data_len = data_len;
+	mad_send_wr->pad = pad;
+
+	mad_send_wr->mad_agent_priv = mad_agent_priv;
+	mad_send_wr->sg_list[0].length = hdr_len;
+	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+	mad_send_wr->send_wr.num_sge = 2;
+	mad_send_wr->send_wr.opcode = IB_WR_SEND;
+	mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+	mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+	mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+	mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+	if (rmpp_active) {
+		ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+		if (ret) {
+			kfree(buf);
+			return ERR_PTR(ret);
+		}
+	}
+
+	mad_send_wr->send_buf.mad_agent = mad_agent;
+	atomic_inc(&mad_agent_priv->refcount);
+	return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+		return IB_MGMT_SA_HDR;
+	else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+		 (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+		 (mgmt_class == IB_MGMT_CLASS_BIS))
+		return IB_MGMT_DEVICE_HDR;
+	else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+		 (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return IB_MGMT_VENDOR_HDR;
+	else
+		return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+	if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+	    (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+	    (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+	    (mgmt_class == IB_MGMT_CLASS_BIS) ||
+	    ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+	     (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
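+/*
+ * Walk the RMPP segment list forwards or backwards from the cached
+ * current segment to the one requested.
+ */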
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct list_head *list;
+
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+	list = &mad_send_wr->cur_seg->list;
+
+	if (mad_send_wr->cur_seg->num < seg_num) {
+		list_for_each_entry(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	} else if (mad_send_wr->cur_seg->num > seg_num) {
+		list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	}
+	return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	if (mad_send_wr->send_buf.seg_count)
+		return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+					   mad_send_wr->seg_num);
+	else
+		return mad_send_wr->send_buf.mad +
+		       mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	mad_agent_priv = container_of(send_buf->mad_agent,
+				      struct ib_mad_agent_private, agent);
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+
+	free_send_rmpp_list(mad_send_wr);
+	kfree(send_buf->mad);
+	deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
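+/*
+ * Map the header and payload for DMA and post the work request if the
+ * send queue has room; otherwise park it on the overflow list, from
+ * which the send completion handler reposts it once a slot frees up.
+ */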
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct list_head *list;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_mad_agent *mad_agent;
+	struct ib_sge *sge;
+	unsigned long flags;
+	int ret;
+
+	/* Set WR ID to find mad_send_wr upon completion */
+	qp_info = mad_send_wr->mad_agent_priv->qp_info;
+	mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+	mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+	mad_agent = mad_send_wr->send_buf.mad_agent;
+	sge = mad_send_wr->sg_list;
+	sge[0].addr = ib_dma_map_single(mad_agent->device,
+					mad_send_wr->send_buf.mad,
+					sge[0].length,
+					DMA_TO_DEVICE);
+	mad_send_wr->header_mapping = sge[0].addr;
+
+	sge[1].addr = ib_dma_map_single(mad_agent->device,
+					ib_get_payload(mad_send_wr),
+					sge[1].length,
+					DMA_TO_DEVICE);
+	mad_send_wr->payload_mapping = sge[1].addr;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+		ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+				   &bad_send_wr);
+		list = &qp_info->send_queue.list;
+	} else {
+		ret = 0;
+		list = &qp_info->overflow_list;
+	}
+
+	if (!ret) {
+		qp_info->send_queue.count++;
+		list_add_tail(&mad_send_wr->mad_list.list, list);
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+	if (ret) {
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->header_mapping,
+				    sge[0].length, DMA_TO_DEVICE);
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->payload_mapping,
+				    sge[1].length, DMA_TO_DEVICE);
+	}
+	return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *  with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+		     struct ib_mad_send_buf **bad_send_buf)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf *next_send_buf;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	/* Walk list of send WRs and post each on send list */
+	for (; send_buf; send_buf = next_send_buf) {
+
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+		if (!send_buf->mad_agent->send_handler ||
+		    (send_buf->timeout_ms &&
+		     !send_buf->mad_agent->recv_handler)) {
+			ret = -EINVAL;
+			goto error;
+		}
+
+		if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+			if (mad_agent_priv->agent.rmpp_version) {
+				ret = -EINVAL;
+				goto error;
+			}
+		}
+
+		/*
+		 * Save pointer to next work request to post in case the
+		 * current one completes, and the user modifies the work
+		 * request associated with the completion
+		 */
+		next_send_buf = send_buf->next;
+		mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+		if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+		    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			ret = handle_outgoing_dr_smp(mad_agent_priv,
+						     mad_send_wr);
+			if (ret < 0)		/* error */
+				goto error;
+			else if (ret == 1)	/* locally consumed */
+				continue;
+		}
+
+		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+		/* Timeout will be updated after send completes */
+		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+		mad_send_wr->max_retries = send_buf->retries;
+		mad_send_wr->retries_left = send_buf->retries;
+		send_buf->retries = 0;
+		/* Reference for work request to QP + response */
+		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+		mad_send_wr->status = IB_WC_SUCCESS;
+
+		/* Reference MAD agent until send completes */
+		atomic_inc(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_agent_priv->send_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		if (mad_agent_priv->agent.rmpp_version) {
+			ret = ib_send_rmpp_mad(mad_send_wr);
+			if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+				ret = ib_send_mad(mad_send_wr);
+		} else
+			ret = ib_send_mad(mad_send_wr);
+		if (ret < 0) {
+			/* Fail send request */
+			spin_lock_irqsave(&mad_agent_priv->lock, flags);
+			list_del(&mad_send_wr->agent_list);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			atomic_dec(&mad_agent_priv->refcount);
+			goto error;
+		}
+	}
+	return 0;
+error:
+	if (bad_send_buf)
+		*bad_send_buf = send_buf;
+	return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Releases the data buffers used to receive
+ *  a MAD back to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *priv;
+	struct list_head free_list;
+
+	INIT_LIST_HEAD(&free_list);
+	list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+	list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+					&free_list, list) {
+		mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+					   recv_buf);
+		mad_priv_hdr = container_of(mad_recv_wc,
+					    struct ib_mad_private_header,
+					    recv_wc);
+		priv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+		kmem_cache_free(ib_mad_cache, priv);
+	}
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context)
+{
+	return ERR_PTR(-EINVAL);	/* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc)
+{
+	printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+	return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
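+/*
+ * method_mask is a bitmap indexed by method; a registration conflicts
+ * if any requested method already has an agent in the method table.
+ */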
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req)
+{
+	int i;
+
+	for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		if ((*method)->agent[i]) {
+			printk(KERN_ERR PFX "Method %d already in use\n", i);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+	/* Allocate management method table */
+	*method = kzalloc(sizeof **method, GFP_ATOMIC);
+	if (!*method) {
+		printk(KERN_ERR PFX "No memory for "
+		       "ib_mad_mgmt_method_table\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+	int i;
+
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+		if (method->agent[i])
+			return 1;
+	return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_CLASS; i++)
+		if (class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+		if (vendor_class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+			   char *oui)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+		/* Is there a matching OUI for this vendor class? */
+		if (!memcmp(vendor_class->oui[i], oui, 3))
+			return i;
+
+	return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+		if (vendor->vendor_class[i])
+			return 1;
+
+	return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+				     struct ib_mad_agent_private *agent)
+{
+	int i;
+
+	/* Remove any methods for this mad agent */
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+		if (method->agent[i] == agent) {
+			method->agent[i] = NULL;
+		}
+	}
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table **class;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret;
+
+	port_priv = agent_priv->qp_info->port_priv;
+	class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+	if (!*class) {
+		/* Allocate management class table for "new" class version */
+		*class = kzalloc(sizeof **class, GFP_ATOMIC);
+		if (!*class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_class_table\n");
+			ret = -ENOMEM;
+			goto error1;
+		}
+
+		/* Allocate method table for this management class */
+		method = &(*class)->method_table[mgmt_class];
+		if ((ret = allocate_method_table(method)))
+			goto error2;
+	} else {
+		method = &(*class)->method_table[mgmt_class];
+		if (!*method) {
+			/* Allocate method table for this management class */
+			if ((ret = allocate_method_table(method)))
+				goto error1;
+		}
+	}
+
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error3;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error3:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+	goto error1;
+error2:
+	kfree(*class);
+	*class = NULL;
+error1:
+	return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_vendor_class_table **vendor_table;
+	struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+	struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret = -ENOMEM;
+	u8 vclass;
+
+	/* "New" vendor (with OUI) class */
+	vclass = vendor_class_index(mad_reg_req->mgmt_class);
+	port_priv = agent_priv->qp_info->port_priv;
+	vendor_table = &port_priv->version[
+				mad_reg_req->mgmt_class_version].vendor;
+	if (!*vendor_table) {
+		/* Allocate mgmt vendor class table for "new" class version */
+		vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+		if (!vendor) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class_table\n");
+			goto error1;
+		}
+
+		*vendor_table = vendor;
+	}
+	if (!(*vendor_table)->vendor_class[vclass]) {
+		/* Allocate table for this management vendor class */
+		vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+		if (!vendor_class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class\n");
+			goto error2;
+		}
+
+		(*vendor_table)->vendor_class[vclass] = vendor_class;
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* Is there a matching OUI for this vendor class? */
+		if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+			    mad_reg_req->oui, 3)) {
+			method = &(*vendor_table)->vendor_class[
+						vclass]->method_table[i];
+			BUG_ON(!*method);
+			goto check_in_use;
+		}
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* Is this OUI slot available? */
+		if (!is_vendor_oui((*vendor_table)->vendor_class[
+				vclass]->oui[i])) {
+			method = &(*vendor_table)->vendor_class[
+				vclass]->method_table[i];
+			BUG_ON(*method);
+			/* Allocate method table for this OUI */
+			if ((ret = allocate_method_table(method)))
+				goto error3;
+			memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+			       mad_reg_req->oui, 3);
+			goto check_in_use;
+		}
+	}
+	printk(KERN_ERR PFX "All OUI slots in use\n");
+	goto error3;
+
+check_in_use:
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error4;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error4:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+error3:
+	if (vendor_class) {
+		(*vendor_table)->vendor_class[vclass] = NULL;
+		kfree(vendor_class);
+	}
+error2:
+	if (vendor) {
+		*vendor_table = NULL;
+		kfree(vendor);
+	}
+error1:
+	return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_method_table *method;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	int index;
+	u8 mgmt_class;
+
+	/*
+	 * Was a MAD registration request supplied
+	 * with the original registration?
+	 */
+	if (!agent_priv->reg_req) {
+		goto out;
+	}
+
+	port_priv = agent_priv->qp_info->port_priv;
+	mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+	class = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].class;
+	if (!class)
+		goto vendor_check;
+
+	method = class->method_table[mgmt_class];
+	if (method) {
+		/* Remove any methods for this mad agent */
+		remove_methods_mad_agent(method, agent_priv);
+		/* Now, check to see if there are any methods still in use */
+		if (!check_method_table(method)) {
+			/* If not, release management method table */
+			kfree(method);
+			class->method_table[mgmt_class] = NULL;
+			/* Any management classes left? */
+			if (!check_class_table(class)) {
+				/* If not, release management class table */
+				kfree(class);
+				port_priv->version[
+					agent_priv->reg_req->
+					mgmt_class_version].class = NULL;
+			}
+		}
+	}
+
+vendor_check:
+	if (!is_vendor_class(mgmt_class))
+		goto out;
+
+	/* normalize mgmt_class to vendor range 2 */
+	mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+	vendor = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].vendor;
+
+	if (!vendor)
+		goto out;
+
+	vendor_class = vendor->vendor_class[mgmt_class];
+	if (vendor_class) {
+		index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+		if (index < 0)
+			goto out;
+		method = vendor_class->method_table[index];
+		if (method) {
+			/* Remove any methods for this mad agent */
+			remove_methods_mad_agent(method, agent_priv);
+			/*
+			 * Now, check to see if there are
+			 * any methods still in use
+			 */
+			if (!check_method_table(method)) {
+				/* If not, release management method table */
+				kfree(method);
+				vendor_class->method_table[index] = NULL;
+				memset(vendor_class->oui[index], 0, 3);
+				/* Any OUIs left? */
+				if (!check_vendor_class(vendor_class)) {
+					/* If not, release vendor class table */
+					kfree(vendor_class);
+					vendor->vendor_class[mgmt_class] = NULL;
+					/* Any other vendor classes left? */
+					if (!check_vendor_table(vendor)) {
+						kfree(vendor);
+						port_priv->version[
+							agent_priv->reg_req->
+							mgmt_class_version].
+							vendor = NULL;
+					}
+				}
+			}
+		}
+	}
+
+out:
+	return;
+}
+
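+/*
+ * Responses are routed back to the sender via the high 32 bits of the
+ * transaction ID; requests are routed through the class/method (and,
+ * for vendor classes, OUI) dispatch tables built at registration time.
+ */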
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+	       struct ib_mad *mad)
+{
+	struct ib_mad_agent_private *mad_agent = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	if (ib_response_mad(mad)) {
+		u32 hi_tid;
+		struct ib_mad_agent_private *entry;
+
+		/*
+		 * Routing is based on high 32 bits of transaction ID
+		 * of MAD.
+		 */
+		hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+		list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+			if (entry->agent.hi_tid == hi_tid) {
+				mad_agent = entry;
+				break;
+			}
+		}
+	} else {
+		struct ib_mad_mgmt_class_table *class;
+		struct ib_mad_mgmt_method_table *method;
+		struct ib_mad_mgmt_vendor_class_table *vendor;
+		struct ib_mad_mgmt_vendor_class *vendor_class;
+		struct ib_vendor_mad *vendor_mad;
+		int index;
+
+		/*
+		 * Routing is based on version, class, and method
+		 * For "newer" vendor MADs, also based on OUI
+		 */
+		if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+			goto out;
+		if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+			class = port_priv->version[
+					mad->mad_hdr.class_version].class;
+			if (!class)
+				goto out;
+			method = class->method_table[convert_mgmt_class(
+							mad->mad_hdr.mgmt_class)];
+			if (method)
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+		} else {
+			vendor = port_priv->version[
+					mad->mad_hdr.class_version].vendor;
+			if (!vendor)
+				goto out;
+			vendor_class = vendor->vendor_class[vendor_class_index(
+						mad->mad_hdr.mgmt_class)];
+			if (!vendor_class)
+				goto out;
+			/* Find matching OUI */
+			vendor_mad = (struct ib_vendor_mad *)mad;
+			index = find_vendor_oui(vendor_class, vendor_mad->oui);
+			if (index == -1)
+				goto out;
+			method = vendor_class->method_table[index];
+			if (method) {
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+			}
+		}
+	}
+
+	if (mad_agent) {
+		if (mad_agent->agent.recv_handler)
+			atomic_inc(&mad_agent->refcount);
+		else {
+			printk(KERN_NOTICE PFX "No receive handler for client "
+			       "%p on port %d\n",
+			       &mad_agent->agent, port_priv->port_num);
+			mad_agent = NULL;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return mad_agent;
+}
+
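+/*
+ * Only the subnet management classes are valid on QP0; all other
+ * classes must arrive on QP1 (the GSI).
+ */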
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+	int valid = 0;
+
+	/* Make sure MAD base version is understood */
+	if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+		printk(KERN_ERR PFX "MAD received with unsupported base "
+		       "version %d\n", mad->mad_hdr.base_version);
+		goto out;
+	}
+
+	/* Filter SMI packets sent to other than QP0 */
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+	    (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+		if (qp_num == 0)
+			valid = 1;
+	} else {
+		/* Filter GSI packets sent to QP0 */
+		if (qp_num != 0)
+			valid = 1;
+	}
+
+out:
+	return valid;
+}
+
+static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+		       struct ib_mad_hdr *mad_hdr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+	return !mad_agent_priv->agent.rmpp_version ||
+		!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+				    IB_MGMT_RMPP_FLAG_ACTIVE) ||
+		(rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+				     struct ib_mad_recv_wc *rwc)
+{
+	return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+		rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+				   struct ib_mad_send_wr_private *wr,
+				   struct ib_mad_recv_wc *rwc)
+{
+	struct ib_ah_attr attr;
+	u8 send_resp, rcv_resp;
+	union ib_gid sgid;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num = mad_agent_priv->agent.port_num;
+	u8 lmc;
+
+	send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+	rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+
+	if (send_resp == rcv_resp)
+		/* both requests or both responses; treat GIDs as different */
+		return 0;
+
+	if (ib_query_ah(wr->send_buf.ah, &attr))
+		/* Assume not equal, to avoid false positives. */
+		return 0;
+
+	if (!!(attr.ah_flags & IB_AH_GRH) !=
+	    !!(rwc->wc->wc_flags & IB_WC_GRH))
+		/* one has a GRH, the other does not; assume GIDs differ */
+		return 0;
+
+	if (!send_resp && rcv_resp) {
+		/* is request/response. */
+		if (!(attr.ah_flags & IB_AH_GRH)) {
+			if (ib_get_cached_lmc(device, port_num, &lmc))
+				return 0;
+			return (!lmc || !((attr.src_path_bits ^
+					   rwc->wc->dlid_path_bits) &
+					  ((1 << lmc) - 1)));
+		} else {
+			if (ib_get_cached_gid(device, port_num,
+					      attr.grh.sgid_index, &sgid))
+				return 0;
+			return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+				       16);
+		}
+	}
+
+	if (!(attr.ah_flags & IB_AH_GRH))
+		return attr.dlid == rwc->wc->slid;
+	else
+		return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+			       16);
+}
+
+static inline int is_direct(u8 class)
+{
+	return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *wc)
+{
+	struct ib_mad_send_wr_private *wr;
+	struct ib_mad *mad;
+
+	mad = (struct ib_mad *)wc->recv_buf.mad;
+
+	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+		if ((wr->tid == mad->mad_hdr.tid) &&
+		    rcv_has_same_class(wr, wc) &&
+		    /*
+		     * Don't check GID for direct routed MADs.
+		     * These might have permissive LIDs.
+		     */
+		    (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+		     rcv_has_same_gid(mad_agent_priv, wr, wc)))
+			return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+
+	/*
+	 * It's possible to receive the response before we've
+	 * been notified that the send has completed
+	 */
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+		    wr->tid == mad->mad_hdr.tid &&
+		    wr->timeout &&
+		    rcv_has_same_class(wr, wc) &&
+		    /*
+		     * Don't check GID for direct routed MADs.
+		     * These might have permissive LIDs.
+		     */
+		    (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+		     rcv_has_same_gid(mad_agent_priv, wr, wc)))
+			/* Verify request has not been canceled */
+			return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+	return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	mad_send_wr->timeout = 0;
+	if (mad_send_wr->refcount == 1)
+		list_move_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+				 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+	list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+	if (mad_agent_priv->agent.rmpp_version) {
+		mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+						      mad_recv_wc);
+		if (!mad_recv_wc) {
+			deref_mad_agent(mad_agent_priv);
+			return;
+		}
+	}
+
+	/* Complete corresponding request */
+	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+		if (!mad_send_wr) {
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(mad_recv_wc);
+			deref_mad_agent(mad_agent_priv);
+			return;
+		}
+		ib_mark_mad_done(mad_send_wr);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		/* Defined behavior is to complete response before request */
+		mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+		mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+						   mad_recv_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+	} else {
+		mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+						   mad_recv_wc);
+		deref_mad_agent(mad_agent_priv);
+	}
+}
+
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv, *response = NULL;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_agent_private *mad_agent;
+	int port_num;
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	dequeue_mad(mad_list);
+
+	mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+				    mad_list);
+	recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+	ib_dma_unmap_single(port_priv->device,
+			    recv->header.mapping,
+			    sizeof(struct ib_mad_private) -
+			      sizeof(struct ib_mad_private_header),
+			    DMA_FROM_DEVICE);
+
+	/* Setup MAD receive work completion from "normal" work completion */
+	recv->header.wc = *wc;
+	recv->header.recv_wc.wc = &recv->header.wc;
+	recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+	recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+	recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+	/* Validate MAD */
+	if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+		goto out;
+
+	response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+	if (!response) {
+		printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
+		       "for response buffer\n");
+		goto out;
+	}
+
+	if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+		port_num = wc->port_num;
+	else
+		port_num = port_priv->port_num;
+
+	if (recv->mad.mad.mad_hdr.mgmt_class ==
+	    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		enum smi_forward_action retsmi;
+
+		if (smi_handle_dr_smp_recv(&recv->mad.smp,
+					   port_priv->device->node_type,
+					   port_num,
+					   port_priv->device->phys_port_cnt) ==
+					   IB_SMI_DISCARD)
+			goto out;
+
+		retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+		if (retsmi == IB_SMI_LOCAL)
+			goto local;
+
+		if (retsmi == IB_SMI_SEND) { /* don't forward */
+			if (smi_handle_dr_smp_send(&recv->mad.smp,
+						   port_priv->device->node_type,
+						   port_num) == IB_SMI_DISCARD)
+				goto out;
+
+			if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+				goto out;
+		} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+			/* forward case for switches */
+			memcpy(response, recv, sizeof(*response));
+			response->header.recv_wc.wc = &response->header.wc;
+			response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+			response->header.recv_wc.recv_buf.grh = &response->grh;
+
+			agent_send_response(&response->mad.mad,
+					    &response->grh, wc,
+					    port_priv->device,
+					    smi_get_fwd_port(&recv->mad.smp),
+					    qp_info->qp->qp_num);
+
+			goto out;
+		}
+	}
+
+local:
+	/* Give driver "right of first refusal" on incoming MAD */
+	if (port_priv->device->process_mad) {
+		int ret;
+
+		ret = port_priv->device->process_mad(port_priv->device, 0,
+						     port_priv->port_num,
+						     wc, &recv->grh,
+						     &recv->mad.mad,
+						     &response->mad.mad);
+		if (ret & IB_MAD_RESULT_SUCCESS) {
+			if (ret & IB_MAD_RESULT_CONSUMED)
+				goto out;
+			if (ret & IB_MAD_RESULT_REPLY) {
+				agent_send_response(&response->mad.mad,
+						    &recv->grh, wc,
+						    port_priv->device,
+						    port_num,
+						    qp_info->qp->qp_num);
+				goto out;
+			}
+		}
+	}
+
+	mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+	if (mad_agent) {
+		ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+		/*
+		 * recv is freed inside ib_mad_complete_recv(), either on
+		 * its error paths or by the agent's recv_handler
+		 */
+		recv = NULL;
+	}
+
+out:
+	/* Post another receive request for this QP */
+	if (response) {
+		ib_mad_post_receive_mads(qp_info, response);
+		if (recv)
+			kmem_cache_free(ib_mad_cache, recv);
+	} else
+		ib_mad_post_receive_mads(qp_info, recv);
+}
+
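+/*
+ * Re-arm the per-agent timer so that it fires no later than the
+ * earliest timeout on the wait list, or cancel it if the list is
+ * empty.
+ */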
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	if (list_empty(&mad_agent_priv->wait_list)) {
+		del_timer(&mad_agent_priv->timeout_timer);
+	} else {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_agent_priv->timeout,
+			       mad_send_wr->timeout)) {
+			mad_agent_priv->timeout = mad_send_wr->timeout;
+			mod_timer(&mad_agent_priv->timeout_timer,
+				  mad_send_wr->timeout);
+		}
+	}
+}
+
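+/*
+ * Move a send that is awaiting a response onto the wait list, which is
+ * kept sorted by absolute timeout so the head is always the next
+ * request to expire.
+ */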
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *temp_mad_send_wr;
+	struct list_head *list_item;
+	unsigned long delay;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	list_del(&mad_send_wr->agent_list);
+
+	delay = mad_send_wr->timeout;
+	mad_send_wr->timeout += jiffies;
+
+	if (delay) {
+		list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+			temp_mad_send_wr = list_entry(list_item,
+						struct ib_mad_send_wr_private,
+						agent_list);
+			if (time_after(mad_send_wr->timeout,
+				       temp_mad_send_wr->timeout))
+				break;
+		}
+	} else
+		list_item = &mad_agent_priv->wait_list;
+	list_add(&mad_send_wr->agent_list, list_item);
+
+	/* Reschedule a work item if we have a shorter timeout */
+	if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+		mod_timer(&mad_agent_priv->timeout_timer,
+			  mad_send_wr->timeout);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms)
+{
+	mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+	wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_agent_private	*mad_agent_priv;
+	unsigned long			flags;
+	int				ret;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	if (mad_agent_priv->agent.rmpp_version) {
+		ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+		if (ret == IB_RMPP_RESULT_CONSUMED)
+			goto done;
+	} else
+		ret = IB_RMPP_RESULT_UNHANDLED;
+
+	if (mad_send_wc->status != IB_WC_SUCCESS &&
+	    mad_send_wr->status == IB_WC_SUCCESS) {
+		mad_send_wr->status = mad_send_wc->status;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	if (--mad_send_wr->refcount > 0) {
+		if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+		    mad_send_wr->status == IB_WC_SUCCESS) {
+			wait_for_response(mad_send_wr);
+		}
+		goto done;
+	}
+
+	/* Remove send from MAD agent and notify client of completion */
+	list_del(&mad_send_wr->agent_list);
+	adjust_timeout(mad_agent_priv);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	if (mad_send_wr->status != IB_WC_SUCCESS)
+		mad_send_wc->status = mad_send_wr->status;
+	if (ret == IB_RMPP_RESULT_INTERNAL)
+		ib_rmpp_send_handler(mad_send_wc);
+	else
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   mad_send_wc);
+
+	/* Release reference on agent taken when sending */
+	deref_mad_agent(mad_agent_priv);
+	return;
+done:
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_send_wr_private	*mad_send_wr, *queued_send_wr;
+	struct ib_mad_list_head		*mad_list;
+	struct ib_mad_qp_info		*qp_info;
+	struct ib_mad_queue		*send_queue;
+	struct ib_send_wr		*bad_send_wr;
+	struct ib_mad_send_wc		mad_send_wc;
+	unsigned long flags;
+	int ret;
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	send_queue = mad_list->mad_queue;
+	qp_info = send_queue->qp_info;
+
+retry:
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->header_mapping,
+			    mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->payload_mapping,
+			    mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+	queued_send_wr = NULL;
+	spin_lock_irqsave(&send_queue->lock, flags);
+	list_del(&mad_list->list);
+
+	/* Move queued send to the send queue */
+	if (send_queue->count-- > send_queue->max_active) {
+		mad_list = container_of(qp_info->overflow_list.next,
+					struct ib_mad_list_head, list);
+		queued_send_wr = container_of(mad_list,
+					struct ib_mad_send_wr_private,
+					mad_list);
+		list_move_tail(&mad_list->list, &send_queue->list);
+	}
+	spin_unlock_irqrestore(&send_queue->lock, flags);
+
+	mad_send_wc.send_buf = &mad_send_wr->send_buf;
+	mad_send_wc.status = wc->status;
+	mad_send_wc.vendor_err = wc->vendor_err;
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+			   IB_MAD_SNOOP_SEND_COMPLETIONS);
+	ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+	if (queued_send_wr) {
+		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+				   &bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			mad_send_wr = queued_send_wr;
+			wc->status = IB_WC_LOC_QP_OP_ERR;
+			goto retry;
+		}
+	}
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_list_head *mad_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+		mad_send_wr = container_of(mad_list,
+					   struct ib_mad_send_wr_private,
+					   mad_list);
+		mad_send_wr->retry = 1;
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+			      struct ib_wc *wc)
+{
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int ret;
+
+	/* Determine if failure was a send or receive */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	if (mad_list->mad_queue == &qp_info->recv_queue)
+		/*
+		 * Receive errors indicate that the QP has entered the error
+		 * state - error handling/shutdown code will cleanup
+		 */
+		return;
+
+	/*
+	 * Send errors will transition the QP to SQE - move
+	 * QP to RTS and repost flushed work requests
+	 */
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	if (wc->status == IB_WC_WR_FLUSH_ERR) {
+		if (mad_send_wr->retry) {
+			/* Repost send */
+			struct ib_send_wr *bad_send_wr;
+
+			mad_send_wr->retry = 0;
+			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+					&bad_send_wr);
+			if (ret)
+				ib_mad_send_done_handler(port_priv, wc);
+		} else
+			ib_mad_send_done_handler(port_priv, wc);
+	} else {
+		struct ib_qp_attr *attr;
+
+		/* Transition QP to RTS and fail offending send */
+		attr = kmalloc(sizeof *attr, GFP_KERNEL);
+		if (attr) {
+			attr->qp_state = IB_QPS_RTS;
+			attr->cur_qp_state = IB_QPS_SQE;
+			ret = ib_modify_qp(qp_info->qp, attr,
+					   IB_QP_STATE | IB_QP_CUR_STATE);
+			kfree(attr);
+			if (ret)
+				printk(KERN_ERR PFX "mad_error_handler - "
+				       "ib_modify_qp to RTS : %d\n", ret);
+			else
+				mark_sends_for_retry(qp_info);
+		}
+		ib_mad_send_done_handler(port_priv, wc);
+	}
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_wc wc;
+
+	port_priv = container_of(work, struct ib_mad_port_private, work);
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_SEND:
+				ib_mad_send_done_handler(port_priv, &wc);
+				break;
+			case IB_WC_RECV:
+				ib_mad_recv_done_handler(port_priv, &wc);
+				break;
+			default:
+				BUG_ON(1);
+				break;
+			}
+		} else
+			mad_error_handler(port_priv, &wc);
+	}
+}
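
Note the ordering in the handler above: the CQ is re-armed with ib_req_notify_cq() before the poll loop drains it, so a completion that lands while the loop is still running raises a fresh callback instead of being lost. The same pattern in isolation (a sketch; dispatch elided):

static void drain_cq(struct ib_cq *cq)
{
	struct ib_wc wc;

	/* Arm first, poll second: anything that completes during the
	 * drain re-triggers the handler rather than going unnoticed. */
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	while (ib_poll_cq(cq, 1, &wc) == 1)
		;	/* dispatch on wc.opcode and wc.status here */
}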
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	unsigned long flags;
+	struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	struct list_head cancel_list;
+
+	INIT_LIST_HEAD(&cancel_list);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &mad_agent_priv->send_list, agent_list) {
+		if (mad_send_wr->status == IB_WC_SUCCESS) {
+			mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+			mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+		}
+	}
+
+	/* Empty wait list to prevent receives from finding a request */
+	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	/* Report all cancelled requests */
+	mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+	mad_send_wc.vendor_err = 0;
+
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &cancel_list, agent_list) {
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		list_del(&mad_send_wr->agent_list);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+	}
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+	     struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+			    agent_list) {
+		if (&mad_send_wr->send_buf == send_buf)
+			return mad_send_wr;
+	}
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+			    agent_list) {
+		if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+		    &mad_send_wr->send_buf == send_buf)
+			return mad_send_wr;
+	}
+	return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int active;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+	if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		return -EINVAL;
+	}
+
+	active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+	if (!timeout_ms) {
+		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	mad_send_wr->send_buf.timeout_ms = timeout_ms;
+	if (active)
+		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+	else
+		ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   struct ib_mad_send_buf *send_buf)
+{
+	ib_modify_mad(mad_agent, send_buf, 0);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
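
For consumers, the two entry points compose naturally. The helper below is purely illustrative: 'agent' and 'msg' are assumed to come from ib_register_mad_agent() and ib_create_send_mad() elsewhere.

/* Give an outstanding request a short grace period, or flush it now. */
static void wind_down(struct ib_mad_agent *agent,
		      struct ib_mad_send_buf *msg, u32 grace_ms)
{
	if (grace_ms) {
		/* Shrink the response window; -EINVAL means the send
		 * already completed or was cancelled. */
		if (ib_modify_mad(agent, msg, grace_ms))
			return;
	} else {
		/* Flush immediately: the agent's send_handler later
		 * reports status IB_WC_WR_FLUSH_ERR. */
		ib_cancel_mad(agent, msg);
	}
}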
+
+static void local_completions(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_local_private *local;
+	struct ib_mad_agent_private *recv_mad_agent;
+	unsigned long flags;
+	int free_mad;
+	struct ib_wc wc;
+	struct ib_mad_send_wc mad_send_wc;
+
+	mad_agent_priv =
+		container_of(work, struct ib_mad_agent_private, local_work);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->local_list)) {
+		local = list_entry(mad_agent_priv->local_list.next,
+				   struct ib_mad_local_private,
+				   completion_list);
+		list_del(&local->completion_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		free_mad = 0;
+		if (local->mad_priv) {
+			recv_mad_agent = local->recv_mad_agent;
+			if (!recv_mad_agent) {
+				printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+				free_mad = 1;
+				goto local_send_completion;
+			}
+
+			/*
+			 * Defined behavior is to complete response
+			 * before request
+			 */
+			build_smp_wc(recv_mad_agent->agent.qp,
+				     (unsigned long) local->mad_send_wr,
+				     be16_to_cpu(IB_LID_PERMISSIVE),
+				     0, recv_mad_agent->agent.port_num, &wc);
+
+			local->mad_priv->header.recv_wc.wc = &wc;
+			local->mad_priv->header.recv_wc.mad_len =
+						sizeof(struct ib_mad);
+			INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+			list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+				 &local->mad_priv->header.recv_wc.rmpp_list);
+			local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+			local->mad_priv->header.recv_wc.recv_buf.mad =
+						&local->mad_priv->mad.mad;
+			if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+				snoop_recv(recv_mad_agent->qp_info,
+					  &local->mad_priv->header.recv_wc,
+					   IB_MAD_SNOOP_RECVS);
+			recv_mad_agent->agent.recv_handler(
+						&recv_mad_agent->agent,
+						&local->mad_priv->header.recv_wc);
+			spin_lock_irqsave(&recv_mad_agent->lock, flags);
+			atomic_dec(&recv_mad_agent->refcount);
+			spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+		}
+
+local_send_completion:
+		/* Complete send */
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+		if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+			snoop_send(mad_agent_priv->qp_info,
+				   &local->mad_send_wr->send_buf,
+				   &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		atomic_dec(&mad_agent_priv->refcount);
+		if (free_mad)
+			kmem_cache_free(ib_mad_cache, local->mad_priv);
+		kfree(local);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int ret;
+
+	if (!mad_send_wr->retries_left)
+		return -ETIMEDOUT;
+
+	mad_send_wr->retries_left--;
+	mad_send_wr->send_buf.retries++;
+
+	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+	if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
+		ret = ib_retry_rmpp(mad_send_wr);
+		switch (ret) {
+		case IB_RMPP_RESULT_UNHANDLED:
+			ret = ib_send_mad(mad_send_wr);
+			break;
+		case IB_RMPP_RESULT_CONSUMED:
+			ret = 0;
+			break;
+		default:
+			ret = -ECOMM;
+			break;
+		}
+	} else
+		ret = ib_send_mad(mad_send_wr);
+
+	if (!ret) {
+		mad_send_wr->refcount++;
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->send_list);
+	}
+	return ret;
+}
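
retry_send() draws down a fixed per-request budget and re-arms the same timeout on every attempt; there is no backoff. The budget mechanics in a self-contained toy (field names mirror the ones above):

#include <stdio.h>

struct send { int retries_left, retries; };

static int retry(struct send *s)
{
	if (!s->retries_left)
		return -1;		/* maps to -ETIMEDOUT above */
	s->retries_left--;
	s->retries++;			/* visible on the send buffer */
	return 0;			/* the resend would be posted here */
}

int main(void)
{
	struct send s = { 3, 0 };

	while (!retry(&s))
		;
	printf("gave up after %d retries\n", s.retries);	/* 3 */
	return 0;
}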
+
+static void timeout_sends(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+				      timeout_work);
+	mad_send_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->wait_list)) {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_send_wr->timeout, jiffies)) {
+			mod_timer(&mad_agent_priv->timeout_timer,
+				  mad_send_wr->timeout);
+			break;
+		}
+
+		list_del(&mad_send_wr->agent_list);
+		if (mad_send_wr->status == IB_WC_SUCCESS &&
+		    !retry_send(mad_send_wr))
+			continue;
+
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		if (mad_send_wr->status == IB_WC_SUCCESS)
+			mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+		else
+			mad_send_wc.status = mad_send_wr->status;
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		atomic_dec(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+	struct ib_mad_port_private *port_priv = cq->cq_context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	if (!list_empty(&port_priv->port_list))
+		queue_work(port_priv->wq, &port_priv->work);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
+{
+	unsigned long flags;
+	int post, ret;
+	struct ib_mad_private *mad_priv;
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+	/* Initialize common scatter list fields */
+	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+	/* Initialize common receive WR fields */
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+
+	do {
+		/* Allocate and map receive buffer */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				printk(KERN_ERR PFX "No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+						 &mad_priv->grh,
+						 sizeof *mad_priv -
+						   sizeof mad_priv->header,
+						 DMA_FROM_DEVICE);
+		mad_priv->header.mapping = sg_list.addr;
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+		if (ret) {
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			ib_dma_unmap_single(qp_info->port_priv->device,
+					    mad_priv->header.mapping,
+					    sizeof *mad_priv -
+					      sizeof mad_priv->header,
+					    DMA_FROM_DEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+			break;
+		}
+	} while (post);
+
+	return ret;
+}
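
The loop above keeps the receive queue topped up: each pass claims a slot under the lock ('post' stays true while the count is still below max_active) and rolls the claim back if the hardware post fails. A self-contained toy of that control flow, where fake_post() stands in for ib_post_recv() and fails on the eighth buffer:

#include <stdio.h>

struct rq { int count, max_active; };

static int fake_post(int i) { return i == 7 ? -1 : 0; }

static int fill(struct rq *q)
{
	int post, ret, i = 0;

	do {
		post = (++q->count < q->max_active);	/* claim a slot */
		ret = fake_post(i++);
		if (ret) {
			q->count--;			/* roll back */
			break;
		}
	} while (post);
	return ret;
}

int main(void)
{
	struct rq q = { 0, 16 };
	int ret = fill(&q);

	printf("ret=%d, %d buffers posted\n", ret, q.count);	/* -1, 7 */
	return 0;
}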
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv;
+	struct ib_mad_list_head *mad_list;
+
+	if (!qp_info->qp)
+		return;
+
+	while (!list_empty(&qp_info->recv_queue.list)) {
+
+		mad_list = list_entry(qp_info->recv_queue.list.next,
+				      struct ib_mad_list_head, list);
+		mad_priv_hdr = container_of(mad_list,
+					    struct ib_mad_private_header,
+					    mad_list);
+		recv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+
+		/* Remove from posted receive MAD list */
+		list_del(&mad_list->list);
+
+		ib_dma_unmap_single(qp_info->port_priv->device,
+				    recv->header.mapping,
+				    sizeof(struct ib_mad_private) -
+				      sizeof(struct ib_mad_private_header),
+				    DMA_FROM_DEVICE);
+		kmem_cache_free(ib_mad_cache, recv);
+	}
+
+	qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+	int ret, i;
+	struct ib_qp_attr *attr;
+	struct ib_qp *qp;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		qp = port_priv->qp_info[i].qp;
+		if (!qp)
+			continue;
+
+		/*
+		 * PKey index for QP1 is irrelevant but
+		 * one is needed for the Reset to Init transition
+		 */
+		attr->qp_state = IB_QPS_INIT;
+		attr->pkey_index = 0;
+		attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+					     IB_QP_PKEY_INDEX | IB_QP_QKEY);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "INIT: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTR;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTR: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTS;
+		attr->sq_psn = IB_MAD_SEND_Q_PSN;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTS: %d\n", i, ret);
+			goto out;
+		}
+	}
+
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		printk(KERN_ERR PFX "Failed to request completion "
+		       "notification: %d\n", ret);
+		goto out;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		if (!port_priv->qp_info[i].qp)
+			continue;
+
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+			goto out;
+		}
+	}
+out:
+	kfree(attr);
+	return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct ib_mad_qp_info	*qp_info = qp_context;
+
+	/* It's worse than that! He's dead, Jim! */
+	printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+		event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+			   struct ib_mad_queue *mad_queue)
+{
+	mad_queue->qp_info = qp_info;
+	mad_queue->count = 0;
+	spin_lock_init(&mad_queue->lock);
+	INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+			struct ib_mad_qp_info *qp_info)
+{
+	qp_info->port_priv = port_priv;
+	init_mad_queue(qp_info, &qp_info->send_queue);
+	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);
+	spin_lock_init(&qp_info->snoop_lock);
+	qp_info->snoop_table = NULL;
+	qp_info->snoop_table_size = 0;
+	atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+			 enum ib_qp_type qp_type)
+{
+	struct ib_qp_init_attr	qp_init_attr;
+	int ret;
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.send_cq = qp_info->port_priv->cq;
+	qp_init_attr.recv_cq = qp_info->port_priv->cq;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.cap.max_send_wr = mad_sendq_size;
+	qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+	qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+	qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+	qp_init_attr.qp_type = qp_type;
+	qp_init_attr.port_num = qp_info->port_priv->port_num;
+	qp_init_attr.qp_context = qp_info;
+	qp_init_attr.event_handler = qp_event_handler;
+	qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+	if (IS_ERR(qp_info->qp)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
+		       get_spl_qp_index(qp_type));
+		ret = PTR_ERR(qp_info->qp);
+		goto error;
+	}
+	/* Use minimum queue sizes unless the CQ is resized */
+	qp_info->send_queue.max_active = mad_sendq_size;
+	qp_info->recv_queue.max_active = mad_recvq_size;
+	return 0;
+
+error:
+	return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+	if (!qp_info->qp)
+		return;
+
+	ib_destroy_qp(qp_info->qp);
+	kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+			    int port_num)
+{
+	int ret, cq_size;
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+	char name[sizeof "ib_mad123"];
+	int has_smi;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+		return -ENOMEM;
+	}
+
+	port_priv->device = device;
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->reg_lock);
+	INIT_LIST_HEAD(&port_priv->agent_list);
+	init_mad_qp(port_priv, &port_priv->qp_info[0]);
+	init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+	cq_size = mad_sendq_size + mad_recvq_size;
+	has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+	if (has_smi)
+		cq_size *= 2;
+
+	port_priv->cq = ib_create_cq(port_priv->device,
+				     ib_mad_thread_completion_handler,
+				     NULL, port_priv, cq_size, 0);
+	if (IS_ERR(port_priv->cq)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
+		goto error3;
+	}
+
+	port_priv->pd = ib_alloc_pd(device);
+	if (IS_ERR(port_priv->pd)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto error4;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto error5;
+	}
+
+	if (has_smi) {
+		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+		if (ret)
+			goto error6;
+	}
+	ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+	if (ret)
+		goto error7;
+
+	snprintf(name, sizeof name, "ib_mad%d", port_num);
+	port_priv->wq = create_singlethread_workqueue(name);
+	if (!port_priv->wq) {
+		ret = -ENOMEM;
+		goto error8;
+	}
+	INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	ret = ib_mad_port_start(port_priv);
+	if (ret) {
+		printk(KERN_ERR PFX "Couldn't start port\n");
+		goto error9;
+	}
+
+	return 0;
+
+error9:
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+error8:
+	destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+	destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+	ib_dereg_mr(port_priv->mr);
+error5:
+	ib_dealloc_pd(port_priv->pd);
+error4:
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+	kfree(port_priv);
+
+	return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	port_priv = __ib_get_mad_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+		printk(KERN_ERR PFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+	destroy_mad_qp(&port_priv->qp_info[1]);
+	destroy_mad_qp(&port_priv->qp_info[0]);
+	ib_dereg_mr(port_priv->mr);
+	ib_dealloc_pd(port_priv->pd);
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+	/* XXX: Handle deallocation of MAD registration tables */
+
+	kfree(port_priv);
+
+	return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+	int start, end, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		start = 0;
+		end   = 0;
+	} else {
+		start = 1;
+		end   = device->phys_port_cnt;
+	}
+
+	for (i = start; i <= end; i++) {
+		if (ib_mad_port_open(device, i)) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d\n",
+			       device->name, i);
+			goto error;
+		}
+		if (ib_agent_port_open(device, i)) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d "
+			       "for agents\n",
+			       device->name, i);
+			goto error_agent;
+		}
+	}
+	return;
+
+error_agent:
+	if (ib_mad_port_close(device, i))
+		printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+		       device->name, i);
+
+error:
+	i--;
+
+	while (i >= start) {
+		if (ib_agent_port_close(device, i))
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, i);
+		if (ib_mad_port_close(device, i))
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, i);
+		i--;
+	}
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+	int i, num_ports, cur_port;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		num_ports = 1;
+		cur_port = 0;
+	} else {
+		num_ports = device->phys_port_cnt;
+		cur_port = 1;
+	}
+	for (i = 0; i < num_ports; i++, cur_port++) {
+		if (ib_agent_port_close(device, cur_port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, cur_port);
+		if (ib_mad_port_close(device, cur_port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, cur_port);
+	}
+}
+
+static struct ib_client mad_client = {
+	.name   = "mad",
+	.add = ib_mad_init_device,
+	.remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+	int ret;
+
+	mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+	mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+	mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+	mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+	spin_lock_init(&ib_mad_port_list_lock);
+
+	ib_mad_cache = kmem_cache_create("ib_mad",
+					 sizeof(struct ib_mad_private),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!ib_mad_cache) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	INIT_LIST_HEAD(&ib_mad_port_list);
+
+	if (ib_register_client(&mad_client)) {
+		printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+		ret = -EINVAL;
+		goto error2;
+	}
+
+	return 0;
+
+error2:
+	kmem_cache_destroy(ib_mad_cache);
+error1:
+	return ret;
+}
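
The paired min()/max() calls above are simply a clamp of the module tunables into [IB_MAD_QP_MIN_SIZE, IB_MAD_QP_MAX_SIZE], i.e. [64, 8192] given the constants in mad_priv.h below. Equivalently, as a sketch:

#include <stdio.h>

/* min-then-max clamp, matching the order used at module init. */
static int clamp_qsize(int v, int lo, int hi)
{
	v = v < hi ? v : hi;	/* min(v, hi) */
	return v > lo ? v : lo;	/* max(v, lo) */
}

int main(void)
{
	printf("%d %d %d\n", clamp_qsize(16, 64, 8192),
	       clamp_qsize(512, 64, 8192),
	       clamp_qsize(100000, 64, 8192));	/* 64 512 8192 */
	return 0;
}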
+
+static void __exit ib_mad_cleanup_module(void)
+{
+	ib_unregister_client(&mad_client);
+	kmem_cache_destroy(ib_mad_cache);
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
+


Property changes on: trunk/sys/ofed/drivers/infiniband/core/mad.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/mad_priv.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/mad_priv.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/mad_priv.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+
+#define PFX "ib_mad: "
+
+#define IB_MAD_QPS_CORE		2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE	128
+#define IB_MAD_QP_RECV_SIZE	512
+#define IB_MAD_QP_MIN_SIZE	64
+#define IB_MAD_QP_MAX_SIZE	8192
+#define IB_MAD_SEND_REQ_MAX_SG	2
+#define IB_MAD_RECV_REQ_MAX_SG	1
+
+#define IB_MAD_SEND_Q_PSN	0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS		80
+#define MAX_MGMT_VERSION	8
+#define MAX_MGMT_OUI		8
+#define MAX_MGMT_VENDOR_RANGE2	(IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+				IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+	struct list_head list;
+	struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+	struct ib_mad_list_head mad_list;
+	struct ib_mad_recv_wc recv_wc;
+	struct ib_wc wc;
+	u64 mapping;
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+	struct ib_mad_private_header header;
+	struct ib_grh grh;
+	union {
+		struct ib_mad mad;
+		struct ib_rmpp_mad rmpp_mad;
+		struct ib_smp smp;
+	} mad;
+} __attribute__ ((packed));
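
The packed layout puts the host-side header first, followed by the 40-byte GRH and the MAD itself back to back; grh through mad is exactly the region the HCA DMA-writes, which is why the receive path maps 'sizeof *mad_priv - sizeof mad_priv->header' bytes starting at &mad_priv->grh. A toy restatement with stand-in header fields:

#include <stdio.h>
#include <stddef.h>

struct priv {
	struct { void *p1, *p2; } header;	/* host-only bookkeeping */
	unsigned char grh[40];			/* global route header */
	unsigned char mad[256];			/* the MAD itself */
};

int main(void)
{
	printf("DMA region: %zu bytes starting at offset %zu\n",
	       sizeof(struct priv) - sizeof(((struct priv *)0)->header),
	       offsetof(struct priv, grh));
	return 0;
}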
+
+struct ib_rmpp_segment {
+	struct list_head list;
+	u32 num;
+	u8 data[0];
+};
+
+struct ib_mad_agent_private {
+	struct list_head agent_list;
+	struct ib_mad_agent agent;
+	struct ib_mad_reg_req *reg_req;
+	struct ib_mad_qp_info *qp_info;
+
+	spinlock_t lock;
+	struct list_head send_list;
+	struct list_head wait_list;
+	struct list_head done_list;
+	struct work_struct timeout_work;
+	struct timer_list timeout_timer;
+	unsigned long timeout;
+	struct list_head local_list;
+	struct work_struct local_work;
+	struct list_head rmpp_list;
+
+	atomic_t refcount;
+	struct completion comp;
+};
+
+struct ib_mad_snoop_private {
+	struct ib_mad_agent agent;
+	struct ib_mad_qp_info *qp_info;
+	int snoop_index;
+	int mad_snoop_flags;
+	atomic_t refcount;
+	struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+	struct ib_mad_list_head mad_list;
+	struct list_head agent_list;
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf send_buf;
+	u64 header_mapping;
+	u64 payload_mapping;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+	__be64 tid;
+	unsigned long timeout;
+	int max_retries;
+	int retries_left;
+	int retry;
+	int refcount;
+	enum ib_wc_status status;
+
+	/* RMPP control */
+	struct list_head rmpp_list;
+	struct ib_rmpp_segment *last_ack_seg;
+	struct ib_rmpp_segment *cur_seg;
+	int last_ack;
+	int seg_num;
+	int newwin;
+	int pad;
+};
+
+struct ib_mad_local_private {
+	struct list_head completion_list;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_agent_private *recv_mad_agent;
+	struct ib_mad_send_wr_private *mad_send_wr;
+};
+
+struct ib_mad_mgmt_method_table {
+	struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+	u8	oui[MAX_MGMT_OUI][3];
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+	struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+	spinlock_t lock;
+	struct list_head list;
+	int count;
+	int max_active;
+	struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+	struct ib_mad_port_private *port_priv;
+	struct ib_qp *qp;
+	struct ib_mad_queue send_queue;
+	struct ib_mad_queue recv_queue;
+	struct list_head overflow_list;
+	spinlock_t snoop_lock;
+	struct ib_mad_snoop_private **snoop_table;
+	int snoop_table_size;
+	atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+	struct list_head port_list;
+	struct ib_device *device;
+	int port_num;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	struct ib_mr *mr;
+
+	spinlock_t reg_lock;
+	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+	struct list_head agent_list;
+	struct workqueue_struct *wq;
+	struct work_struct work;
+	struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms);
+
+#endif	/* __IB_MAD_PRIV_H__ */
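
struct ib_mad_list_head is embedded in both send requests and receive buffers; the driver stores its address in wc->wr_id and recovers the enclosing structure with container_of(), as ib_mad_send_done_handler() does above. The round trip in a self-contained toy:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mad_list_head { int dummy; };

struct send_wr {
	long other_state[3];			/* stand-in fields */
	struct mad_list_head mad_list;		/* embedded list node */
};

int main(void)
{
	struct send_wr wr;
	/* The driver stashes &wr.mad_list in wr_id as an integer... */
	unsigned long wr_id = (unsigned long)&wr.mad_list;
	/* ...and the completion path recovers the outer structure: */
	struct mad_list_head *ml = (struct mad_list_head *)wr_id;
	struct send_wr *back = container_of(ml, struct send_wr, mad_list);

	printf("round trip ok: %d\n", back == &wr);	/* 1 */
	return 0;
}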


Property changes on: trunk/sys/ofed/drivers/infiniband/core/mad_priv.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,951 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+	RMPP_STATE_ACTIVE,
+	RMPP_STATE_TIMEOUT,
+	RMPP_STATE_COMPLETE,
+	RMPP_STATE_CANCELING
+};
+
+struct mad_rmpp_recv {
+	struct ib_mad_agent_private *agent;
+	struct list_head list;
+	struct delayed_work timeout_work;
+	struct delayed_work cleanup_work;
+	struct completion comp;
+	enum rmpp_state state;
+	spinlock_t lock;
+	atomic_t refcount;
+
+	struct ib_ah *ah;
+	struct ib_mad_recv_wc *rmpp_wc;
+	struct ib_mad_recv_buf *cur_seg_buf;
+	int last_ack;
+	int seg_num;
+	int newwin;
+	int repwin;
+
+	__be64 tid;
+	u32 src_qp;
+	u16 slid;
+	u8 mgmt_class;
+	u8 class_version;
+	u8 method;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	if (atomic_dec_and_test(&rmpp_recv->refcount))
+		complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	deref_rmpp_recv(rmpp_recv);
+	wait_for_completion(&rmpp_recv->comp);
+	ib_destroy_ah(rmpp_recv->ah);
+	kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+	struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+			ib_free_recv_mad(rmpp_recv->rmpp_wc);
+		rmpp_recv->state = RMPP_STATE_CANCELING;
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+		cancel_delayed_work(&rmpp_recv->cleanup_work);
+	}
+
+	flush_workqueue(agent->qp_info->port_priv->wq);
+
+	list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+				 &agent->rmpp_list, list) {
+		list_del(&rmpp_recv->list);
+		destroy_rmpp_recv(rmpp_recv);
+	}
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+		       struct ib_rmpp_mad *data,
+		       struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *ack = msg->mad;
+	unsigned long flags;
+
+	memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+	ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+	ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	rmpp_recv->last_ack = rmpp_recv->seg_num;
+	ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+	ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+		     struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	int ret, hdr_len;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+				 recv_wc->wc->pkey_index, 1, hdr_len,
+				 0, GFP_KERNEL);
+	if (IS_ERR(msg))
+		return;
+
+	format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+	msg->ah = rmpp_recv->ah;
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+						  struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_ah *ah;
+	int hdr_len;
+
+	ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+				  recv_wc->recv_buf.grh, agent->port_num);
+	if (IS_ERR(ah))
+		return (void *) ah;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+				 recv_wc->wc->pkey_index, 1,
+				 hdr_len, 0, GFP_KERNEL);
+	if (IS_ERR(msg))
+		ib_destroy_ah(ah);
+	else {
+		msg->ah = ah;
+		msg->context[0] = ah;
+	}
+
+	return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	msg = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(msg))
+		return;
+
+	rmpp_mad = msg->mad;
+	memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+	rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.seg_num = 0;
+	rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		ib_destroy_ah(msg->ah);
+		ib_free_send_mad(msg);
+	}
+}
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+	if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+		ib_destroy_ah(mad_send_wc->send_buf->ah);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+		      struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	msg = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(msg))
+		return;
+
+	rmpp_mad = msg->mad;
+	memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+	rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+	rmpp_mad->rmpp_hdr.seg_num = 0;
+	rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		ib_destroy_ah(msg->ah);
+		ib_free_send_mad(msg);
+	}
+}
+
+static void recv_timeout_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv =
+		container_of(work, struct mad_rmpp_recv, timeout_work.work);
+	struct ib_mad_recv_wc *rmpp_wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+		return;
+	}
+	rmpp_recv->state = RMPP_STATE_TIMEOUT;
+	list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+	destroy_rmpp_recv(rmpp_recv);
+	ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv =
+		container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+		return;
+	}
+	list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+	destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_hdr *mad_hdr;
+
+	rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+	if (!rmpp_recv)
+		return NULL;
+
+	rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+					     mad_recv_wc->wc,
+					     mad_recv_wc->recv_buf.grh,
+					     agent->agent.port_num);
+	if (IS_ERR(rmpp_recv->ah))
+		goto error;
+
+	rmpp_recv->agent = agent;
+	init_completion(&rmpp_recv->comp);
+	INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+	INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+	spin_lock_init(&rmpp_recv->lock);
+	rmpp_recv->state = RMPP_STATE_ACTIVE;
+	atomic_set(&rmpp_recv->refcount, 1);
+
+	rmpp_recv->rmpp_wc = mad_recv_wc;
+	rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+	rmpp_recv->newwin = 1;
+	rmpp_recv->seg_num = 1;
+	rmpp_recv->last_ack = 0;
+	rmpp_recv->repwin = 1;
+
+	mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	rmpp_recv->tid = mad_hdr->tid;
+	rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+	rmpp_recv->slid = mad_recv_wc->wc->slid;
+	rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+	rmpp_recv->class_version = mad_hdr->class_version;
+	rmpp_recv->method  = mad_hdr->method;
+	return rmpp_recv;
+
+error:	kfree(rmpp_recv);
+	return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+	       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->tid == mad_hdr->tid &&
+		    rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+		    rmpp_recv->slid == mad_recv_wc->wc->slid &&
+		    rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+		    rmpp_recv->class_version == mad_hdr->class_version &&
+		    rmpp_recv->method == mad_hdr->method)
+			return rmpp_recv;
+	}
+	return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+	if (rmpp_recv)
+		atomic_inc(&rmpp_recv->refcount);
+	spin_unlock_irqrestore(&agent->lock, flags);
+	return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct mad_rmpp_recv *rmpp_recv)
+{
+	struct mad_rmpp_recv *cur_rmpp_recv;
+
+	cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+	if (!cur_rmpp_recv)
+		list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+	return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+	return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+	return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+						    struct ib_mad_recv_buf *seg)
+{
+	if (seg->list.next == rmpp_list)
+		return NULL;
+
+	return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+	return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
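
The receiver's ACK window is an eighth of the receive queue depth, floored at one, so a transfer is acknowledged roughly every max_active/8 segments; assuming the queue runs at IB_MAD_QP_RECV_SIZE (512) from mad_priv.h, that is 64 segments per ACK. In isolation:

#include <stdio.h>

/* max_active/8, floored at 1, as in window_size() above. */
static int ack_window(int max_active)
{
	int w = max_active >> 3;
	return w > 1 ? w : 1;
}

int main(void)
{
	printf("%d %d\n", ack_window(512), ack_window(4));	/* 64 1 */
	return 0;
}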
+
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+						  int seg_num)
+{
+	struct ib_mad_recv_buf *seg_buf;
+	int cur_seg_num;
+
+	list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+		cur_seg_num = get_seg_num(seg_buf);
+		if (seg_num > cur_seg_num)
+			return seg_buf;
+		if (seg_num == cur_seg_num)
+			break;
+	}
+	return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+			   struct ib_mad_recv_buf *new_buf)
+{
+	struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+	while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+		rmpp_recv->cur_seg_buf = new_buf;
+		rmpp_recv->seg_num++;
+		new_buf = get_next_seg(rmpp_list, new_buf);
+	}
+}
+
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int hdr_size, data_size, pad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+	hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+	data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+	pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+	if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+		pad = 0;
+
+	return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_mad_recv_wc *rmpp_wc;
+
+	ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+	if (rmpp_recv->seg_num > 1)
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+	/* 10 seconds until we can find the packet lifetime */
+	queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+			   &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+	return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+	      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_recv_buf *prev_buf;
+	struct ib_mad_recv_wc *done_wc;
+	int seg_num;
+	unsigned long flags;
+
+	rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv)
+		goto drop1;
+
+	seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+	    (seg_num > rmpp_recv->newwin))
+		goto drop3;
+
+	if ((seg_num <= rmpp_recv->last_ack) ||
+	    (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+		spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		goto drop2;
+	}
+
+	prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+	if (!prev_buf)
+		goto drop3;
+
+	done_wc = NULL;
+	list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+	if (rmpp_recv->cur_seg_buf == prev_buf) {
+		update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+		if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+			rmpp_recv->state = RMPP_STATE_COMPLETE;
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			done_wc = complete_rmpp(rmpp_recv);
+			goto out;
+		} else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+			rmpp_recv->newwin += window_size(agent);
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			ack_recv(rmpp_recv, mad_recv_wc);
+			goto out;
+		}
+	}
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+	deref_rmpp_recv(rmpp_recv);
+	return done_wc;
+
+drop3:	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2:	deref_rmpp_recv(rmpp_recv);
+drop1:	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+	   struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+
+	rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv) {
+		ib_free_recv_mad(mad_recv_wc);
+		return NULL;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	if (insert_rmpp_recv(agent, rmpp_recv)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* duplicate first MAD */
+		destroy_rmpp_recv(rmpp_recv);
+		return continue_rmpp(agent, mad_recv_wc);
+	}
+	atomic_inc(&rmpp_recv->refcount);
+
+	if (get_last_flag(&mad_recv_wc->recv_buf)) {
+		rmpp_recv->state = RMPP_STATE_COMPLETE;
+		spin_unlock_irqrestore(&agent->lock, flags);
+		complete_rmpp(rmpp_recv);
+	} else {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* 40 seconds until we can find the packet lifetimes */
+		queue_delayed_work(agent->qp_info->port_priv->wq,
+				   &rmpp_recv->timeout_work,
+				   msecs_to_jiffies(40000));
+		rmpp_recv->newwin += window_size(agent);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		mad_recv_wc = NULL;
+	}
+	deref_rmpp_recv(rmpp_recv);
+	return mad_recv_wc;
+}
+
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int timeout;
+	u32 paylen = 0;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+	if (mad_send_wr->seg_num == 1) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+		paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+			 mad_send_wr->pad;
+	}
+
+	if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+		paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+	}
+	rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+	/* 2 seconds for an ACK until we can find the packet lifetime */
+	timeout = mad_send_wr->send_buf.timeout_ms;
+	if (!timeout || timeout > 2000)
+		mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+	return ib_send_mad(mad_send_wr);
+}
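
send_next_seg() advertises the whole payload in the first segment's paylen_newwin and only the residue in the last; every other segment leaves the field zero. Illustrative arithmetic, using a hypothetical 200-byte data area per segment where the code uses IB_MGMT_RMPP_DATA:

#include <stdio.h>

int main(void)
{
	int data_size = 200;	/* hypothetical bytes of data per segment */
	int payload   = 730;	/* total class payload in bytes */
	int seg_count = (payload + data_size - 1) / data_size;	/* 4 */
	int pad       = seg_count * data_size - payload;	/* 70 */

	/* First segment carries the total, last the residue. */
	printf("first paylen = %d\n", seg_count * data_size - pad); /* 730 */
	printf("last  paylen = %d\n", data_size - pad);		     /* 130 */
	return 0;
}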
+
+static void abort_send(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+	if (!mad_send_wr)
+		goto out;	/* Unmatched send */
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+	    (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+		goto out;	/* Send is already done */
+
+	ib_mark_mad_done(mad_send_wr);
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	wc.status = IB_WC_REM_ABORT_ERR;
+	wc.vendor_err = rmpp_status;
+	wc.send_buf = &mad_send_wr->send_buf;
+	ib_mad_complete_send_wr(mad_send_wr, &wc);
+	return;
+out:
+	spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+				   int seg_num)
+{
+	struct list_head *list;
+
+	wr->last_ack = seg_num;
+	list = &wr->last_ack_seg->list;
+	list_for_each_entry(wr->last_ack_seg, list, list)
+		if (wr->last_ack_seg->num == seg_num)
+			break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+			   struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+
+	rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+	if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+		rmpp_recv->repwin = newwin;
+}
+
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+			     struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_rmpp_mad *rmpp_mad;
+	unsigned long flags;
+	int seg_num, newwin, ret;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	if (rmpp_mad->rmpp_hdr.rmpp_status) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		return;
+	}
+
+	seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+	newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+	if (newwin < seg_num) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		return;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+	if (!mad_send_wr) {
+		if (!seg_num)
+			process_ds_ack(agent, mad_recv_wc, newwin);
+		goto out;	/* Unmatched or DS RMPP ACK */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+	    (mad_send_wr->timeout)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;		/* Repeated ACK for DS RMPP transaction */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+	    (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+		goto out;	/* Send is already done */
+
+	if (seg_num > mad_send_wr->send_buf.seg_count ||
+	    seg_num > mad_send_wr->newwin) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		return;
+	}
+
+	if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+		goto out;	/* Old ACK */
+
+	if (seg_num > mad_send_wr->last_ack) {
+		adjust_last_ack(mad_send_wr, seg_num);
+		mad_send_wr->retries_left = mad_send_wr->max_retries;
+	}
+	mad_send_wr->newwin = newwin;
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		/* If no response is expected, the ACK completes the send */
+		if (!mad_send_wr->send_buf.timeout_ms) {
+			struct ib_mad_send_wc wc;
+
+			ib_mark_mad_done(mad_send_wr);
+			spin_unlock_irqrestore(&agent->lock, flags);
+
+			wc.status = IB_WC_SUCCESS;
+			wc.vendor_err = 0;
+			wc.send_buf = &mad_send_wr->send_buf;
+			ib_mad_complete_send_wr(mad_send_wr, &wc);
+			return;
+		}
+		if (mad_send_wr->refcount == 1)
+			ib_reset_mad_timeout(mad_send_wr,
+					     mad_send_wr->send_buf.timeout_ms);
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;
+	} else if (mad_send_wr->refcount == 1 &&
+		   mad_send_wr->seg_num < mad_send_wr->newwin &&
+		   mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+		/* Send failure will just result in a timeout/retry */
+		ret = send_next_seg(mad_send_wr);
+		if (ret)
+			goto out;
+
+		mad_send_wr->refcount++;
+		list_move_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->send_list);
+	}
+out:
+	spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *rmpp_hdr;
+	u8 rmpp_status;
+
+	rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+	if (rmpp_hdr->rmpp_status) {
+		rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+		goto bad;
+	}
+
+	if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+		if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+			rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+			goto bad;
+		}
+		return start_rmpp(agent, mad_recv_wc);
+	} else {
+		if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+			rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+			goto bad;
+		}
+		return continue_rmpp(agent, mad_recv_wc);
+	}
+bad:
+	nack_recv(agent, mad_recv_wc, rmpp_status);
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	} else
+		abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+			       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+	    rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	} else
+		abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return mad_recv_wc;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+		goto out;
+	}
+
+	switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+	case IB_MGMT_RMPP_TYPE_DATA:
+		return process_rmpp_data(agent, mad_recv_wc);
+	case IB_MGMT_RMPP_TYPE_ACK:
+		process_rmpp_ack(agent, mad_recv_wc);
+		break;
+	case IB_MGMT_RMPP_TYPE_STOP:
+		process_rmpp_stop(agent, mad_recv_wc);
+		break;
+	case IB_MGMT_RMPP_TYPE_ABORT:
+		process_rmpp_abort(agent, mad_recv_wc);
+		break;
+	default:
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+		break;
+	}
+out:
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+	struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+	int newwin = 1;
+
+	if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+		goto out;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->tid != mad_hdr->tid ||
+		    rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+		    rmpp_recv->class_version != mad_hdr->class_version ||
+		    (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+			continue;
+
+		if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+			continue;
+
+		if (rmpp_recv->slid == ah_attr.dlid) {
+			newwin = rmpp_recv->repwin;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+out:
+	return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+		mad_send_wr->seg_num = 1;
+		return IB_RMPP_RESULT_INTERNAL;
+	}
+
+	mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+	/* We need to wait for the final ACK even if there isn't a response */
+	mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+	ret = send_next_seg(mad_send_wr);
+	if (!ret)
+		return IB_RMPP_RESULT_CONSUMED;
+	return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+		return IB_RMPP_RESULT_INTERNAL;	 /* ACK, STOP, or ABORT */
+
+	if (mad_send_wc->status != IB_WC_SUCCESS ||
+	    mad_send_wr->status != IB_WC_SUCCESS)
+		return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+	if (!mad_send_wr->timeout)
+		return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		mad_send_wr->timeout =
+			msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+		return IB_RMPP_RESULT_PROCESSED; /* Send done */
+	}
+
+	if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+	    mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+	ret = send_next_seg(mad_send_wr);
+	if (ret) {
+		mad_send_wc->status = IB_WC_GENERAL_ERR;
+		return IB_RMPP_RESULT_PROCESSED;
+	}
+	return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	mad_send_wr->seg_num = mad_send_wr->last_ack;
+	mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+	ret = send_next_seg(mad_send_wr);
+	if (ret)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	return IB_RMPP_RESULT_CONSUMED;
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+	IB_RMPP_RESULT_PROCESSED,
+	IB_RMPP_RESULT_CONSUMED,
+	IB_RMPP_RESULT_INTERNAL,
+	IB_RMPP_RESULT_UNHANDLED
+};
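+
+/*
+ * How the result codes are used (summary inferred from the callers in
+ * mad_rmpp.c, not normative):
+ *   IB_RMPP_RESULT_PROCESSED - RMPP handled the MAD; caller completes it.
+ *   IB_RMPP_RESULT_CONSUMED  - the RMPP layer took ownership of the MAD.
+ *   IB_RMPP_RESULT_INTERNAL  - ACK/STOP/ABORT handled inside the RMPP layer.
+ *   IB_RMPP_RESULT_UNHANDLED - RMPP is not active; process the MAD normally.
+ */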
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif	/* __MAD_RMPP_H__ */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/mad_rmpp.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/multicast.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/multicast.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/multicast.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+static struct ib_client mcast_client = {
+	.name   = "ib_multicast",
+	.add    = mcast_add_one,
+	.remove = mcast_remove_one
+};
+
+static struct ib_sa_client	sa_client;
+static struct workqueue_struct	*mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+	struct mcast_device	*dev;
+	spinlock_t		lock;
+	struct rb_root		table;
+	atomic_t		refcount;
+	struct completion	comp;
+	u8			port_num;
+};
+
+struct mcast_device {
+	struct ib_device	*device;
+	struct ib_event_handler	event_handler;
+	int			start_port;
+	int			end_port;
+	struct mcast_port	port[0];
+};
+
+enum mcast_state {
+	MCAST_JOINING,
+	MCAST_MEMBER,
+	MCAST_ERROR,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
+	MCAST_BUSY,
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+	struct ib_sa_mcmember_rec rec;
+	struct rb_node		node;
+	struct mcast_port	*port;
+	spinlock_t		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	struct list_head	active_list;
+	struct mcast_member	*last_join;
+	int			members[3];
+	atomic_t		refcount;
+	enum mcast_group_state	state;
+	struct ib_sa_query	*query;
+	int			query_id;
+	u16			pkey_index;
+	u8			leave_state;
+	int			retries;
+};
+
+struct mcast_member {
+	struct ib_sa_multicast	multicast;
+	struct ib_sa_client	*client;
+	struct mcast_group	*group;
+	struct list_head	list;
+	enum mcast_state	state;
+	atomic_t		refcount;
+	struct completion	comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+				      union ib_gid *mgid)
+{
+	struct rb_node *node = port->table.rb_node;
+	struct mcast_group *group;
+	int ret;
+
+	while (node) {
+		group = rb_entry(node, struct mcast_group, node);
+		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+		if (!ret)
+			return group;
+
+		if (ret < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+					struct mcast_group *group,
+					int allow_duplicates)
+{
+	struct rb_node **link = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+	struct mcast_group *cur_group;
+	int ret;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+
+		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+		if (ret < 0)
+			link = &(*link)->rb_left;
+		else if (ret > 0)
+			link = &(*link)->rb_right;
+		else if (allow_duplicates)
+			link = &(*link)->rb_left;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
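+
+/*
+ * Note on allow_duplicates (inferred from the callers below): a join that
+ * passes the zero MGID asks the SA to assign one, so several such groups
+ * may coexist in the tree until the SA reply fills in the record and
+ * join_handler() re-inserts the group.
+ */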
+
+static void deref_port(struct mcast_port *port)
+{
+	if (atomic_dec_and_test(&port->refcount))
+		complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+	struct mcast_port *port = group->port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (atomic_dec_and_test(&group->refcount)) {
+		rb_erase(&group->node, &port->table);
+		spin_unlock_irqrestore(&port->lock, flags);
+		kfree(group);
+		deref_port(port);
+	} else
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+	if (atomic_dec_and_test(&member->refcount))
+		complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+	struct mcast_group *group = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&group->lock, flags);
+	list_add_tail(&member->list, &group->pending_list);
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		atomic_inc(&group->refcount);
+		queue_work(mcast_wq, &group->work);
+	}
+	spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member, and
+ * send-only member.  We need to keep track of the number of members of each
+ * type based on their join state.  Adjust the number of members that belong
+ * to the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int i;
+
+	for (i = 0; i < 3; i++, join_state >>= 1)
+		if (join_state & 0x1)
+			group->members[i] += inc;
+}
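+
+/*
+ * For example (assuming the IBA join_state bit layout: bit 0 = full member,
+ * bit 1 = non member, bit 2 = send-only member), a join with join_state =
+ * 0x5 increments members[0] and members[2] and leaves members[1] untouched.
+ */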
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 leave_state = 0;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (!group->members[i])
+			leave_state |= (0x1 << i);
+
+	return leave_state & group->rec.join_state;
+}
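+
+/*
+ * Worked example: with members[] = {0, 2, 0} the loop computes 0x5; if the
+ * group's rec.join_state is 0x3 the function returns 0x1, meaning only the
+ * full-member state must be left while the non-member state still has users.
+ */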
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* MGID must already match */
+
+	if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+				 IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+				 src->mtu, dst->mtu))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->traffic_class != dst->traffic_class)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+				 IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+				 src->rate, dst->rate))
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+				 dst->packet_life_time_selector,
+				 src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+	    src->flow_label != dst->flow_label)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+	    src->hop_limit != dst->hop_limit)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+		return -EINVAL;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+	struct mcast_port *port = group->port;
+	int ret;
+
+	group->last_join = member;
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_MGMT_METHOD_SET,
+				       &member->multicast.rec,
+				       member->multicast.comp_mask,
+				       3000, GFP_KERNEL, join_handler, group,
+				       &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+	struct mcast_port *port = group->port;
+	struct ib_sa_mcmember_rec rec;
+	int ret;
+
+	rec = group->rec;
+	rec.join_state = leave_state;
+	group->leave_state = leave_state;
+
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_SA_METHOD_DELETE, &rec,
+				       IB_SA_MCMEMBER_REC_MGID     |
+				       IB_SA_MCMEMBER_REC_PORT_GID |
+				       IB_SA_MCMEMBER_REC_JOIN_STATE,
+				       3000, GFP_KERNEL, leave_handler,
+				       group, &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+		       u8 join_state)
+{
+	member->state = MCAST_MEMBER;
+	adjust_membership(group, join_state, 1);
+	group->rec.join_state |= join_state;
+	member->multicast.rec = group->rec;
+	member->multicast.rec.join_state = join_state;
+	list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+		     int status)
+{
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+	return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+	struct mcast_member *member;
+	int ret = 0;
+	u16 pkey_index;
+
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
+
+	spin_lock_irq(&group->lock);
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct mcast_member, list);
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		adjust_membership(group, member->multicast.rec.join_state, -1);
+		member->state = MCAST_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->multicast.callback(-ENETRESET,
+						 &member->multicast);
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	group->rec.join_state = 0;
+out:
+	group->state = MCAST_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int status, ret;
+	u8 join_state;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       (group->state != MCAST_BUSY)) {
+
+		if (group->state != MCAST_BUSY) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct mcast_member, list);
+		multicast = &member->multicast;
+		join_state = multicast->rec.join_state;
+		atomic_inc(&member->refcount);
+
+		if (join_state == (group->rec.join_state & join_state)) {
+			status = cmp_rec(&group->rec, &multicast->rec,
+					 multicast->comp_mask);
+			if (!status)
+				join_group(group, member, join_state);
+			else
+				list_del_init(&member->list);
+			spin_unlock_irq(&group->lock);
+			ret = multicast->callback(status, multicast);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_join(group, member);
+			if (!status) {
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	join_state = get_leave_state(group);
+	if (join_state) {
+		group->rec.join_state &= ~join_state;
+		spin_unlock_irq(&group->lock);
+		if (send_leave(group, join_state))
+			goto retest;
+	} else {
+		group->state = MCAST_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+/*
+ * Fail a join request if it is still active, i.e., still at the head of the
+ * pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+	struct mcast_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	member = list_entry(group->pending_list.next,
+			    struct mcast_member, list);
+	if (group->last_join == member) {
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		spin_unlock_irq(&group->lock);
+		ret = member->multicast.callback(status, &member->multicast);
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+	} else
+		spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context)
+{
+	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+	if (status)
+		process_join_error(group, status);
+	else {
+		ib_find_pkey(group->port->dev->device, group->port->port_num,
+			     be16_to_cpu(rec->pkey), &pkey_index);
+
+		spin_lock_irq(&group->port->lock);
+		group->rec = *rec;
+		if (group->state == MCAST_BUSY &&
+		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+			group->pkey_index = pkey_index;
+		if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+			rb_erase(&group->node, &group->port->table);
+			mcast_insert(group->port, group, 1);
+		}
+		spin_unlock_irq(&group->port->lock);
+	}
+	mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context)
+{
+	struct mcast_group *group = context;
+
+	if (status && (group->retries > 0) &&
+	    !send_leave(group, group->leave_state))
+		group->retries--;
+	else
+		mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+					 union ib_gid *mgid, gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	unsigned long flags;
+	int is_mgid0;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		spin_lock_irqsave(&port->lock, flags);
+		group = mcast_find(port, mgid);
+		if (group)
+			goto found;
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->retries = 3;
+	group->port = port;
+	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_WORK(&group->work, mcast_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = mcast_insert(port, group, is_mgid0);
+	if (cur_group) {
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier.  Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+		     struct ib_device *device, u8 port_num,
+		     struct ib_sa_mcmember_rec *rec,
+		     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+		     int (*callback)(int status,
+				     struct ib_sa_multicast *multicast),
+		     void *context)
+{
+	struct mcast_device *dev;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int ret;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kmalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->multicast.rec = *rec;
+	member->multicast.comp_mask = comp_mask;
+	member->multicast.callback = callback;
+	member->multicast.context = context;
+	init_completion(&member->comp);
+	atomic_set(&member->refcount, 1);
+	member->state = MCAST_JOINING;
+
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      &rec->mgid, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the multicast structure in their callback.  They
+	 * could then free the multicast structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	multicast = &member->multicast;
+	queue_join(member);
+	return multicast;
+
+err:
+	ib_sa_client_put(client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
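+
+/*
+ * Minimal caller sketch (hypothetical consumer, not part of this file),
+ * assuming a registered ib_sa_client "my_client" and a populated member
+ * record "rec".  Per the work handler above, a nonzero callback return
+ * makes the core free the multicast entry on the caller's behalf:
+ *
+ *	static int my_join_cb(int status, struct ib_sa_multicast *mc)
+ *	{
+ *		if (status)
+ *			return status;	- join failed; the core frees mc
+ *		use_group(&mc->rec);	- mc->rec holds the SA-returned record
+ *		return 0;
+ *	}
+ *
+ *	mc = ib_sa_join_multicast(&my_client, device, port_num, &rec,
+ *				  comp_mask, GFP_KERNEL, my_join_cb, ctx);
+ *	...
+ *	ib_sa_free_multicast(mc);
+ */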
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+	struct mcast_member *member;
+	struct mcast_group *group;
+
+	member = container_of(multicast, struct mcast_member, multicast);
+	group = member->group;
+
+	spin_lock_irq(&group->lock);
+	if (member->state == MCAST_MEMBER)
+		adjust_membership(group, multicast->rec.join_state, -1);
+
+	list_del_init(&member->list);
+
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(mcast_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	struct mcast_group *group;
+	unsigned long flags;
+	int ret = 0;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+	spin_lock_irqsave(&port->lock, flags);
+	group = mcast_find(port, mgid);
+	if (group)
+		*rec = group->rec;
+	else
+		ret = -EADDRNOTAVAIL;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr)
+{
+	int ret;
+	u16 gid_index;
+	u8 p;
+
+	ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+	if (ret)
+		return ret;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = be16_to_cpu(rec->mlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->port_num = port_num;
+	ah_attr->static_rate = rec->rate;
+
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = rec->mgid;
+
+	ah_attr->grh.sgid_index = (u8) gid_index;
+	ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+	ah_attr->grh.hop_limit = rec->hop_limit;
+	ah_attr->grh.traffic_class = rec->traffic_class;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+			       enum mcast_group_state state)
+{
+	struct mcast_group *group;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	for (node = rb_first(&port->table); node; node = rb_next(node)) {
+		group = rb_entry(node, struct mcast_group, node);
+		spin_lock(&group->lock);
+		if (group->state == MCAST_IDLE) {
+			atomic_inc(&group->refcount);
+			queue_work(mcast_wq, &group->work);
+		}
+		if (group->state != MCAST_GROUP_ERROR)
+			group->state = state;
+		spin_unlock(&group->lock);
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct mcast_device *dev;
+	int index;
+
+	dev = container_of(handler, struct mcast_device, event_handler);
+	if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	index = event->element.port_num - dev->start_port;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+		break;
+	default:
+		break;
+	}
+}
+
+static void mcast_add_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+	int count = 0;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		dev->start_port = dev->end_port = 0;
+	else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			continue;
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		atomic_set(&port->refcount, 1);
+		++count;
+	}
+
+	if (!count) {
+		kfree(dev);
+		return;
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &mcast_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+static void mcast_remove_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(mcast_wq);
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + i) ==
+		    IB_LINK_LAYER_INFINIBAND) {
+			port = &dev->port[i];
+			deref_port(port);
+			wait_for_completion(&port->comp);
+		}
+	}
+
+	kfree(dev);
+}
+
+int mcast_init(void)
+{
+	int ret;
+
+	mcast_wq = create_singlethread_workqueue("ib_mcast");
+	if (!mcast_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&mcast_client);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+	return ret;
+}
+
+void mcast_cleanup(void)
+{
+	ib_unregister_client(&mcast_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/multicast.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/notice.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/notice.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/notice.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void inform_add_one(struct ib_device *device);
+static void inform_remove_one(struct ib_device *device);
+
+static struct ib_client inform_client = {
+	.name   = "ib_notice",
+	.add    = inform_add_one,
+	.remove = inform_remove_one
+};
+
+static struct ib_sa_client	sa_client;
+static struct workqueue_struct	*inform_wq;
+
+struct inform_device;
+
+struct inform_port {
+	struct inform_device	*dev;
+	spinlock_t		lock;
+	struct rb_root		table;
+	atomic_t		refcount;
+	struct completion	comp;
+	u8			port_num;
+};
+
+struct inform_device {
+	struct ib_device	*device;
+	struct ib_event_handler	event_handler;
+	int			start_port;
+	int			end_port;
+	struct inform_port	port[0];
+};
+
+enum inform_state {
+	INFORM_IDLE,
+	INFORM_REGISTERING,
+	INFORM_MEMBER,
+	INFORM_BUSY,
+	INFORM_ERROR
+};
+
+struct inform_member;
+
+struct inform_group {
+	u16			trap_number;
+	struct rb_node		node;
+	struct inform_port	*port;
+	spinlock_t		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	struct list_head	active_list;
+	struct list_head	notice_list;
+	struct inform_member	*last_join;
+	int			members;
+	enum inform_state	join_state; /* State relative to SA */
+	atomic_t		refcount;
+	enum inform_state	state;
+	struct ib_sa_query	*query;
+	int			query_id;
+};
+
+struct inform_member {
+	struct ib_inform_info	info;
+	struct ib_sa_client	*client;
+	struct inform_group	*group;
+	struct list_head	list;
+	enum inform_state	state;
+	atomic_t		refcount;
+	struct completion	comp;
+};
+
+struct inform_notice {
+	struct list_head	list;
+	struct ib_sa_notice	notice;
+};
+
+static void reg_handler(int status, struct ib_sa_inform *inform,
+			 void *context);
+static void unreg_handler(int status, struct ib_sa_inform *inform,
+			  void *context);
+
+static struct inform_group *inform_find(struct inform_port *port,
+					u16 trap_number)
+{
+	struct rb_node *node = port->table.rb_node;
+	struct inform_group *group;
+
+	while (node) {
+		group = rb_entry(node, struct inform_group, node);
+		if (trap_number < group->trap_number)
+			node = node->rb_left;
+		else if (trap_number > group->trap_number)
+			node = node->rb_right;
+		else
+			return group;
+	}
+	return NULL;
+}
+
+static struct inform_group *inform_insert(struct inform_port *port,
+					  struct inform_group *group)
+{
+	struct rb_node **link = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+	struct inform_group *cur_group;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct inform_group, node);
+		if (group->trap_number < cur_group->trap_number)
+			link = &(*link)->rb_left;
+		else if (group->trap_number > cur_group->trap_number)
+			link = &(*link)->rb_right;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+static void deref_port(struct inform_port *port)
+{
+	if (atomic_dec_and_test(&port->refcount))
+		complete(&port->comp);
+}
+
+static void release_group(struct inform_group *group)
+{
+	struct inform_port *port = group->port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (atomic_dec_and_test(&group->refcount)) {
+		rb_erase(&group->node, &port->table);
+		spin_unlock_irqrestore(&port->lock, flags);
+		kfree(group);
+		deref_port(port);
+	} else
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct inform_member *member)
+{
+	if (atomic_dec_and_test(&member->refcount))
+		complete(&member->comp);
+}
+
+static void queue_reg(struct inform_member *member)
+{
+	struct inform_group *group = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&group->lock, flags);
+	list_add(&member->list, &group->pending_list);
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		atomic_inc(&group->refcount);
+		queue_work(inform_wq, &group->work);
+	}
+	spin_unlock_irqrestore(&group->lock, flags);
+}
+
+static int send_reg(struct inform_group *group, struct inform_member *member)
+{
+	struct inform_port *port = group->port;
+	struct ib_sa_inform inform;
+	int ret;
+
+	memset(&inform, 0, sizeof inform);
+	inform.lid_range_begin = cpu_to_be16(0xFFFF);
+	inform.is_generic = 1;
+	inform.subscribe = 1;
+	inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+	inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number);
+	inform.trap.generic.resp_time = 19;
+	inform.trap.generic.producer_type =
+				cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+	group->last_join = member;
+	ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+				     port->port_num, &inform, 3000, GFP_KERNEL,
+				     reg_handler, group, &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static int send_unreg(struct inform_group *group)
+{
+	struct inform_port *port = group->port;
+	struct ib_sa_inform inform;
+	int ret;
+
+	memset(&inform, 0, sizeof inform);
+	inform.lid_range_begin = cpu_to_be16(0xFFFF);
+	inform.is_generic = 1;
+	inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+	inform.trap.generic.trap_num = cpu_to_be16(group->trap_number);
+	inform.trap.generic.qpn = IB_QP1;
+	inform.trap.generic.resp_time = 19;
+	inform.trap.generic.producer_type =
+				cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+	ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+				     port->port_num, &inform, 3000, GFP_KERNEL,
+				     unreg_handler, group, &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static void join_group(struct inform_group *group, struct inform_member *member)
+{
+	member->state = INFORM_MEMBER;
+	group->members++;
+	list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct inform_group *group, struct inform_member *member,
+		     int status)
+{
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+	return member->info.callback(status, &member->info, NULL);
+}
+
+static void process_group_error(struct inform_group *group)
+{
+	struct inform_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct inform_member, list);
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		group->members--;
+		member->state = INFORM_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->info.callback(-ENETRESET, &member->info, NULL);
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+
+	group->join_state = INFORM_IDLE;
+	group->state = INFORM_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Report a notice to all active subscribers.  We use a temporary list to
+ * handle unsubscription requests while the notice is being reported, which
+ * avoids holding the group lock while in the user's callback.
+ */
+static void process_notice(struct inform_group *group,
+			   struct inform_notice *info_notice)
+{
+	struct inform_member *member;
+	struct list_head list;
+	int ret;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irq(&group->lock);
+	list_splice_init(&group->active_list, &list);
+	while (!list_empty(&list)) {
+
+		member = list_entry(list.next, struct inform_member, list);
+		atomic_inc(&member->refcount);
+		list_move(&member->list, &group->active_list);
+		spin_unlock_irq(&group->lock);
+
+		ret = member->info.callback(0, &member->info,
+					    &info_notice->notice);
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+	spin_unlock_irq(&group->lock);
+}
+
+static void inform_work_handler(struct work_struct *work)
+{
+	struct inform_group *group;
+	struct inform_member *member;
+	struct ib_inform_info *info;
+	struct inform_notice *info_notice;
+	int status, ret;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       !list_empty(&group->notice_list) ||
+	       (group->state == INFORM_ERROR)) {
+
+		if (group->state == INFORM_ERROR) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		if (!list_empty(&group->notice_list)) {
+			info_notice = list_entry(group->notice_list.next,
+						 struct inform_notice, list);
+			list_del(&info_notice->list);
+			spin_unlock_irq(&group->lock);
+			process_notice(group, info_notice);
+			kfree(info_notice);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct inform_member, list);
+		info = &member->info;
+		atomic_inc(&member->refcount);
+
+		if (group->join_state == INFORM_MEMBER) {
+			join_group(group, member);
+			spin_unlock_irq(&group->lock);
+			ret = info->callback(0, info, NULL);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_reg(group, member);
+			if (!status) {
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+
+	if (!group->members && (group->join_state == INFORM_MEMBER)) {
+		group->join_state = INFORM_IDLE;
+		spin_unlock_irq(&group->lock);
+		if (send_unreg(group))
+			goto retest;
+	} else {
+		group->state = INFORM_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+/*
+ * Fail a join request if it is still active, i.e., still at the head of the
+ * pending queue.
+ */
+static void process_join_error(struct inform_group *group, int status)
+{
+	struct inform_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	member = list_entry(group->pending_list.next,
+			    struct inform_member, list);
+	if (group->last_join == member) {
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		spin_unlock_irq(&group->lock);
+		ret = member->info.callback(status, &member->info, NULL);
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+	} else
+		spin_unlock_irq(&group->lock);
+}
+
+static void reg_handler(int status, struct ib_sa_inform *inform, void *context)
+{
+	struct inform_group *group = context;
+
+	if (status)
+		process_join_error(group, status);
+	else
+		group->join_state = INFORM_MEMBER;
+
+	inform_work_handler(&group->work);
+}
+
+static void unreg_handler(int status, struct ib_sa_inform *inform,
+			  void *context)
+{
+	struct inform_group *group = context;
+
+	inform_work_handler(&group->work);
+}
+
+int notice_dispatch(struct ib_device *device, u8 port_num,
+		    struct ib_sa_notice *notice)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	struct inform_group *group;
+	struct inform_notice *info_notice;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return 0; /* No one to give notice to. */
+
+	port = &dev->port[port_num - dev->start_port];
+	spin_lock_irq(&port->lock);
+	group = inform_find(port,
+			    __be16_to_cpu(notice->trap.generic.trap_num));
+	if (!group) {
+		spin_unlock_irq(&port->lock);
+		return 0;
+	}
+
+	atomic_inc(&group->refcount);
+	spin_unlock_irq(&port->lock);
+
+	info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL);
+	if (!info_notice) {
+		release_group(group);
+		return -ENOMEM;
+	}
+
+	info_notice->notice = *notice;
+
+	spin_lock_irq(&group->lock);
+	list_add(&info_notice->list, &group->notice_list);
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		spin_unlock_irq(&group->lock);
+		inform_work_handler(&group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	return 0;
+}
+
+static struct inform_group *acquire_group(struct inform_port *port,
+					  u16 trap_number, gfp_t gfp_mask)
+{
+	struct inform_group *group, *cur_group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	group = inform_find(port, trap_number);
+	if (group)
+		goto found;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->port = port;
+	group->trap_number = trap_number;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_LIST_HEAD(&group->notice_list);
+	INIT_WORK(&group->work, inform_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = inform_insert(port, group);
+	if (cur_group) {
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier.  Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_inform_info *
+ib_sa_register_inform_info(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   u16 trap_number, gfp_t gfp_mask,
+			   int (*callback)(int status,
+					   struct ib_inform_info *info,
+					   struct ib_sa_notice *notice),
+			   void *context)
+{
+	struct inform_device *dev;
+	struct inform_member *member;
+	struct ib_inform_info *info;
+	int ret;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kzalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->info.trap_number = trap_number;
+	member->info.callback = callback;
+	member->info.context = context;
+	init_completion(&member->comp);
+	atomic_set(&member->refcount, 1);
+	member->state = INFORM_REGISTERING;
+
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      trap_number, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the info structure in their callback.  They
+	 * could then free the info structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	info = &member->info;
+	queue_reg(member);
+	return info;
+
+err:
+	ib_sa_client_put(member->client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_register_inform_info);
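+
+/*
+ * Minimal caller sketch (hypothetical, not part of this file): one callback
+ * reports both registration status and subsequent notices, and per the work
+ * handler above a nonzero return makes the core unregister the info
+ * structure on the caller's behalf:
+ *
+ *	static int my_notice_cb(int status, struct ib_inform_info *info,
+ *				struct ib_sa_notice *notice)
+ *	{
+ *		if (status)
+ *			return status;	- registration failed
+ *		if (notice)
+ *			handle_trap(notice);	- a trap was reported
+ *		return 0;
+ *	}
+ *
+ *	info = ib_sa_register_inform_info(&my_client, device, port_num,
+ *					  trap_number, GFP_KERNEL,
+ *					  my_notice_cb, ctx);
+ */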
+
+void ib_sa_unregister_inform_info(struct ib_inform_info *info)
+{
+	struct inform_member *member;
+	struct inform_group *group;
+
+	member = container_of(info, struct inform_member, info);
+	group = member->group;
+
+	spin_lock_irq(&group->lock);
+	if (member->state == INFORM_MEMBER)
+		group->members--;
+
+	list_del_init(&member->list);
+
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(inform_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_unregister_inform_info);
+
+static void inform_groups_lost(struct inform_port *port)
+{
+	struct inform_group *group;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	for (node = rb_first(&port->table); node; node = rb_next(node)) {
+		group = rb_entry(node, struct inform_group, node);
+		spin_lock(&group->lock);
+		if (group->state == INFORM_IDLE) {
+			atomic_inc(&group->refcount);
+			queue_work(inform_wq, &group->work);
+		}
+		group->state = INFORM_ERROR;
+		spin_unlock(&group->lock);
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void inform_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct inform_device *dev;
+
+	dev = container_of(handler, struct inform_device, event_handler);
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		inform_groups_lost(&dev->port[event->element.port_num -
+					      dev->start_port]);
+		break;
+	default:
+		break;
+	}
+}
+
+static void inform_add_one(struct ib_device *device)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	int i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		dev->start_port = dev->end_port = 0;
+	else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		atomic_set(&port->refcount, 1);
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &inform_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+static void inform_remove_one(struct ib_device *device)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	int i;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(inform_wq);
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		port = &dev->port[i];
+		deref_port(port);
+		wait_for_completion(&port->comp);
+	}
+
+	kfree(dev);
+}
+
+int notice_init(void)
+{
+	int ret;
+
+	inform_wq = create_singlethread_workqueue("ib_inform");
+	if (!inform_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&inform_client);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(inform_wq);
+	return ret;
+}
+
+void notice_cleanup(void)
+{
+	ib_unregister_client(&inform_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(inform_wq);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/notice.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/packer.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/packer.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/packer.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+	switch (size) {
+	case 1: return                *(u8  *) (structure + offset);
+	case 2: return be16_to_cpup((__be16 *) (structure + offset));
+	case 4: return be32_to_cpup((__be32 *) (structure + offset));
+	case 8: return be64_to_cpup((__be64 *) (structure + offset));
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+		return 0;
+	}
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32 val;
+			__be32 mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+			addr = (__be32 *) buf + desc[i].offset_words;
+			*addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64 val;
+			__be64 mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+			addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+			*addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			if (desc[i].struct_size_bytes)
+				memcpy(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       structure + desc[i].struct_offset_bytes,
+				       desc[i].size_bits / 8);
+			else
+				memset(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       0,
+				       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_pack);
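+
+/*
+ * Illustrative sketch (editorial, not part of this change): packing a
+ * hypothetical two-field record with ib_pack().  "struct example_rec" and
+ * "example_table" are assumptions made up for this note; they mirror the
+ * real descriptor tables in sa_query.c.
+ *
+ *	struct example_rec {
+ *		u32 id;
+ *		u16 lid;
+ *	};
+ *
+ *	static const struct ib_field example_table[] = {
+ *		{ .struct_offset_bytes = offsetof(struct example_rec, id),
+ *		  .struct_size_bytes   = 4,
+ *		  .offset_words        = 0,
+ *		  .offset_bits         = 0,
+ *		  .size_bits           = 32,
+ *		  .field_name          = "example_rec:id" },
+ *		{ .struct_offset_bytes = offsetof(struct example_rec, lid),
+ *		  .struct_size_bytes   = 2,
+ *		  .offset_words        = 1,
+ *		  .offset_bits         = 16,
+ *		  .size_bits           = 16,
+ *		  .field_name          = "example_rec:lid" },
+ *	};
+ *
+ *	u8 buf[8] = { 0 };
+ *	struct example_rec rec = { .id = 7, .lid = 3 };
+ *
+ *	ib_pack(example_table, ARRAY_SIZE(example_table), &rec, buf);
+ *	// buf now holds both fields in network byte order
+ */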
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+	switch (size * 8) {
+	case 8:  *(    u8 *) (structure + offset) = val; break;
+	case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+	case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+	case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	}
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_unpack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (!desc[i].struct_size_bytes)
+			continue;
+
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32  val;
+			u32  mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be32 *) buf + desc[i].offset_words;
+			val = (be32_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64  val;
+			u64  mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+			addr = (__be64 *) buf + desc[i].offset_words;
+			val = (be64_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			memcpy(structure + desc[i].struct_offset_bytes,
+			       buf + desc[i].offset_words * 4 +
+			       desc[i].offset_bits / 8,
+			       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_unpack);
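+
+/*
+ * Illustrative sketch (editorial, not part of this change): ib_unpack()
+ * reverses ib_pack().  Reusing the hypothetical example_table from the
+ * ib_pack() note above:
+ *
+ *	struct example_rec out;
+ *
+ *	ib_unpack(example_table, ARRAY_SIZE(example_table), buf, &out);
+ *	// out.id == 7 and out.lid == 3 after the round trip
+ */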


Property changes on: trunk/sys/ofed/drivers/infiniband/core/packer.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/sa.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/sa.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/sa.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+	atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+	if (atomic_dec_and_test(&client->users))
+		complete(&client->comp);
+}
+
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+			 ib_sa_comp_mask selector_mask,
+			 ib_sa_comp_mask value_mask,
+			 u8 selector, u8 src_value, u8 dst_value);
+
+int ib_sa_pack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_path_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query);
+
+int sa_db_init(void);
+void sa_db_cleanup(void);
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_inform *rec,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					    struct ib_sa_inform *resp,
+					    void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query);
+
+int notice_dispatch(struct ib_device *device, u8 port_num,
+		    struct ib_sa_notice *notice);
+
+int notice_init(void);
+void notice_cleanup(void);
+
+#endif /* SA_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/sa.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/sa_query.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/sa_query.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/sa_query.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1500 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_sa_sm_ah {
+	struct ib_ah        *ah;
+	struct kref          ref;
+	u16		     pkey_index;
+	u8		     src_path_mask;
+};
+
+struct ib_sa_port {
+	struct ib_mad_agent *agent;
+	struct ib_mad_agent *notice_agent;
+	struct ib_sa_sm_ah  *sm_ah;
+	struct work_struct   update_task;
+	spinlock_t           ah_lock;
+	u8                   port_num;
+	struct ib_device    *device;
+};
+
+struct ib_sa_device {
+	int                     start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_client    *client;
+	struct ib_sa_port      *port;
+	struct ib_mad_send_buf *mad_buf;
+	struct ib_sa_sm_ah     *sm_ah;
+	int			id;
+};
+
+struct ib_sa_service_query {
+	void (*callback)(int, struct ib_sa_service_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_inform_query {
+	void (*callback)(int, struct ib_sa_inform *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+	.name   = "sa",
+	.add    = ib_sa_add_one,
+	.remove = ib_sa_remove_one
+};
+
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+static spinlock_t tid_lock;
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_path_rec, field),		\
+	.struct_size_bytes   = sizeof ((struct ib_sa_path_rec *) 0)->field,	\
+	.field_name          = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+	{ PATH_REC_FIELD(service_id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ PATH_REC_FIELD(dgid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(sgid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(dlid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(slid),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(raw_traffic),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 11,
+	  .offset_bits  = 1,
+	  .size_bits    = 3 },
+	{ PATH_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ PATH_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(traffic_class),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(reversible),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ PATH_REC_FIELD(numb_path),
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 7 },
+	{ PATH_REC_FIELD(pkey),
+	  .offset_words = 12,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(qos_class),
+	  .offset_words = 13,
+	  .offset_bits  = 0,
+	  .size_bits    = 12 },
+	{ PATH_REC_FIELD(sl),
+	  .offset_words = 13,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ PATH_REC_FIELD(mtu_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(mtu),
+	  .offset_words = 13,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(rate_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(rate),
+	  .offset_words = 13,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(packet_life_time),
+	  .offset_words = 14,
+	  .offset_bits  = 2,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(preference),
+	  .offset_words = 14,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ RESERVED,
+	  .offset_words = 14,
+	  .offset_bits  = 16,
+	  .size_bits    = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_mcmember_rec *) 0)->field,	\
+	.field_name          = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+	{ MCMEMBER_REC_FIELD(mgid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(port_gid),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(qkey),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ MCMEMBER_REC_FIELD(mlid),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(mtu_selector),
+	  .offset_words = 9,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(mtu),
+	  .offset_words = 9,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(traffic_class),
+	  .offset_words = 9,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(pkey),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(rate_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(rate),
+	  .offset_words = 10,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(packet_life_time),
+	  .offset_words = 10,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(sl),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ MCMEMBER_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(scope),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(join_state),
+	  .offset_words = 12,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(proxy_join),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 23 },
+};
+
+#define SERVICE_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_service_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_service_rec *) 0)->field,	\
+	.field_name          = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+	{ SERVICE_REC_FIELD(id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ SERVICE_REC_FIELD(gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(pkey),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ SERVICE_REC_FIELD(lease),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ SERVICE_REC_FIELD(key),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(name),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 64*8 },
+	{ SERVICE_REC_FIELD(data8),
+	  .offset_words = 28,
+	  .offset_bits  = 0,
+	  .size_bits    = 16*8 },
+	{ SERVICE_REC_FIELD(data16),
+	  .offset_words = 32,
+	  .offset_bits  = 0,
+	  .size_bits    = 8*16 },
+	{ SERVICE_REC_FIELD(data32),
+	  .offset_words = 36,
+	  .offset_bits  = 0,
+	  .size_bits    = 4*32 },
+	{ SERVICE_REC_FIELD(data64),
+	  .offset_words = 40,
+	  .offset_bits  = 0,
+	  .size_bits    = 2*64 },
+};
+
+#define INFORM_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_inform, field), \
+	.struct_size_bytes   = sizeof ((struct ib_sa_inform *) 0)->field, \
+	.field_name          = "sa_inform:" #field
+
+static const struct ib_field inform_table[] = {
+	{ INFORM_FIELD(gid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ INFORM_FIELD(lid_range_begin),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(lid_range_end),
+	  .offset_words = 4,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 5,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(is_generic),
+	  .offset_words = 5,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(subscribe),
+	  .offset_words = 5,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(type),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(trap.generic.trap_num),
+	  .offset_words = 6,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(trap.generic.qpn),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 24 },
+	{ RESERVED,
+	  .offset_words = 7,
+	  .offset_bits  = 24,
+	  .size_bits    = 3 },
+	{ INFORM_FIELD(trap.generic.resp_time),
+	  .offset_words = 7,
+	  .offset_bits  = 27,
+	  .size_bits    = 5 },
+	{ RESERVED,
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(trap.generic.producer_type),
+	  .offset_words = 8,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+};
+
+#define NOTICE_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_notice, field), \
+	.struct_size_bytes   = sizeof ((struct ib_sa_notice *) 0)->field, \
+	.field_name          = "sa_notice:" #field
+
+static const struct ib_field notice_table[] = {
+	{ NOTICE_FIELD(is_generic),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ NOTICE_FIELD(type),
+	  .offset_words = 0,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ NOTICE_FIELD(trap.generic.producer_type),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ NOTICE_FIELD(trap.generic.trap_num),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ NOTICE_FIELD(issuer_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ NOTICE_FIELD(notice_toggle),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ NOTICE_FIELD(notice_count),
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 15 },
+	{ NOTICE_FIELD(data_details),
+	  .offset_words = 2,
+	  .offset_bits  = 16,
+	  .size_bits    = 432 },
+	{ NOTICE_FIELD(issuer_gid),
+	  .offset_words = 16,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+};
+
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+			 ib_sa_comp_mask selector_mask,
+			 ib_sa_comp_mask value_mask,
+			 u8 selector, u8 src_value, u8 dst_value)
+{
+	int err;
+
+	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+		return 0;
+
+	switch (selector) {
+	case IB_SA_GT:
+		err = (src_value <= dst_value);
+		break;
+	case IB_SA_LT:
+		err = (src_value >= dst_value);
+		break;
+	case IB_SA_EQ:
+		err = (src_value != dst_value);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+int ib_sa_pack_attr(void *dst, void *src, int attr_id)
+{
+	switch (attr_id) {
+	case IB_SA_ATTR_PATH_REC:
+		ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id)
+{
+	switch (attr_id) {
+	case IB_SA_ATTR_PATH_REC:
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void free_sm_ah(struct kref *kref)
+{
+	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+	ib_destroy_ah(sm_ah->ah);
+	kfree(sm_ah);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+	struct ib_sa_port *port =
+		container_of(work, struct ib_sa_port, update_task);
+	struct ib_sa_sm_ah *new_ah;
+	struct ib_port_attr port_attr;
+	struct ib_ah_attr   ah_attr;
+
+	if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+		printk(KERN_WARNING "Couldn't query port\n");
+		return;
+	}
+
+	new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+	if (!new_ah) {
+		printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+		return;
+	}
+
+	kref_init(&new_ah->ref);
+	new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+	new_ah->pkey_index = 0;
+	if (ib_find_pkey(port->agent->device, port->port_num,
+			 IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+		printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = port_attr.sm_lid;
+	ah_attr.sl       = port_attr.sm_sl;
+	ah_attr.port_num = port->port_num;
+
+	new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+	if (IS_ERR(new_ah->ah)) {
+		printk(KERN_WARNING "Couldn't create new SM AH\n");
+		kfree(new_ah);
+		return;
+	}
+
+	spin_lock_irq(&port->ah_lock);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
+	port->sm_ah = new_ah;
+	spin_unlock_irq(&port->ah_lock);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE   ||
+	    event->event == IB_EVENT_CLIENT_REREGISTER) {
+		unsigned long flags;
+		struct ib_sa_device *sa_dev =
+			container_of(handler, typeof(*sa_dev), event_handler);
+		struct ib_sa_port *port =
+			&sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+		if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+			return;
+
+		spin_lock_irqsave(&port->ah_lock, flags);
+		if (port->sm_ah)
+			kref_put(&port->sm_ah->ref, free_sm_ah);
+		port->sm_ah = NULL;
+		spin_unlock_irqrestore(&port->ah_lock, flags);
+
+		schedule_work(&sa_dev->port[event->element.port_num -
+					    sa_dev->start_port].update_task);
+	}
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+	atomic_set(&client->users, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+	ib_sa_client_put(client);
+	wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query.  If the id and query don't match up or
+ * the query has already completed, nothing is done.  Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *mad_buf;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	if (idr_find(&query_idr, id) != query) {
+		spin_unlock_irqrestore(&idr_lock, flags);
+		return;
+	}
+	agent = query->port->agent;
+	mad_buf = query->mad_buf;
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
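+
+/*
+ * Illustrative sketch (editorial, not part of this change): the
+ * non-negative id returned by a query entry point pairs with the query
+ * pointer it filled in.  Everything here except the two
+ * ib_sa_cancel_query() arguments is a made-up placeholder.
+ *
+ *	struct ib_sa_query *query;
+ *	int id;
+ *
+ *	id = ib_sa_path_rec_query(&my_client, device, port_num, &rec,
+ *				  comp_mask, 1000, GFP_KERNEL,
+ *				  my_callback, my_context, &query);
+ *	if (id >= 0)
+ *		ib_sa_cancel_query(id, query);	// callback sees -EINTR
+ */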
+
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_sa_port   *port;
+	unsigned long flags;
+	u8 src_path_mask;
+
+	sa_dev = ib_get_client_data(device, &sa_client);
+	if (!sa_dev)
+		return 0x7f;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	spin_lock_irqsave(&port->ah_lock, flags);
+	src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	return src_path_mask;
+}
+
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+	int ret;
+	u16 gid_index;
+	int force_grh;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = be16_to_cpu(rec->dlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+				 get_src_path_mask(device, port_num);
+	ah_attr->port_num = port_num;
+	ah_attr->static_rate = rec->rate;
+
+	force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+	if (rec->hop_limit > 1 || force_grh) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.dgid = rec->dgid;
+
+		ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+					 &gid_index);
+		if (ret)
+			return ret;
+
+		ah_attr->grh.sgid_index    = gid_index;
+		ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
+		ah_attr->grh.hop_limit     = rec->hop_limit;
+		ah_attr->grh.traffic_class = rec->traffic_class;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&query->port->ah_lock, flags);
+	if (!query->port->sm_ah) {
+		spin_unlock_irqrestore(&query->port->ah_lock, flags);
+		return -EAGAIN;
+	}
+	kref_get(&query->port->sm_ah->ref);
+	query->sm_ah = query->port->sm_ah;
+	spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+	query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+					    query->sm_ah->pkey_index,
+					    0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+					    gfp_mask);
+	if (IS_ERR(query->mad_buf)) {
+		kref_put(&query->sm_ah->ref, free_sm_ah);
+		return -ENOMEM;
+	}
+
+	query->mad_buf->ah = query->sm_ah->ah;
+
+	return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+	ib_free_send_mad(query->mad_buf);
+	kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+	unsigned long flags;
+
+	memset(mad, 0, sizeof *mad);
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+	spin_lock_irqsave(&tid_lock, flags);
+	mad->mad_hdr.tid           =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+	spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+	unsigned long flags;
+	int ret, id;
+
+retry:
+	if (!idr_pre_get(&query_idr, gfp_mask))
+		return -ENOMEM;
+	spin_lock_irqsave(&idr_lock, flags);
+	ret = idr_get_new(&query_idr, query, &id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		return ret;
+
+	query->mad_buf->timeout_ms  = timeout_ms;
+	query->mad_buf->context[0] = query;
+	query->id = id;
+
+	ret = ib_post_send_mad(query->mad_buf, NULL);
+	if (ret) {
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&query_idr, id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	}
+
+	/*
+	 * It's not safe to dereference query any more, because the
+	 * send may already have completed and freed the query in
+	 * another context.
+	 */
+	return ret ? ret : id;
+}
+
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_path_rec rec;
+
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_path_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_path_rec_release;
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_GET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_service_query *query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_service_rec rec;
+
+		ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code.  Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+			    struct ib_device *device, u8 port_num, u8 method,
+			    struct ib_sa_service_rec *rec,
+			    ib_sa_comp_mask comp_mask,
+			    int timeout_ms, gfp_t gfp_mask,
+			    void (*callback)(int status,
+					     struct ib_sa_service_rec *resp,
+					     void *context),
+			    void *context,
+			    struct ib_sa_query **sa_query)
+{
+	struct ib_sa_service_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	if (method != IB_MGMT_METHOD_GET &&
+	    method != IB_MGMT_METHOD_SET &&
+	    method != IB_SA_METHOD_DELETE)
+		return -EINVAL;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_service_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+		rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
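+
+/*
+ * Illustrative sketch (editorial, not part of this change): a minimal
+ * asynchronous caller of ib_sa_service_rec_query().  The callback, client
+ * and surrounding variables are hypothetical; the same pattern applies to
+ * the other *_rec_query entry points in this file.
+ *
+ *	static void my_service_cb(int status, struct ib_sa_service_rec *resp,
+ *				  void *context)
+ *	{
+ *		if (!status && resp)
+ *			printk(KERN_INFO "service record resolved\n");
+ *	}
+ *
+ *	id = ib_sa_service_rec_query(&my_client, device, port_num,
+ *				     IB_MGMT_METHOD_GET, &rec, comp_mask,
+ *				     1000, GFP_KERNEL, my_service_cb,
+ *				     NULL, &query);
+ */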
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *query =
+		container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_mcmember_rec rec;
+
+		ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_mcmember_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_mcmember_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+
+static void ib_sa_inform_callback(struct ib_sa_query *sa_query,
+				  int status,
+				  struct ib_sa_mad *mad)
+{
+	struct ib_sa_inform_query *query =
+		container_of(sa_query, struct ib_sa_inform_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_inform rec;
+
+		ib_unpack(inform_table, ARRAY_SIZE(inform_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_inform_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query));
+}
+
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+			      struct ib_device *device, u8 port_num,
+			      struct ib_sa_guidinfo_rec *rec,
+			      ib_sa_comp_mask comp_mask, u8 method,
+			      int timeout_ms, gfp_t gfp_mask,
+			      void (*callback)(int status,
+					       struct ib_sa_guidinfo_rec *resp,
+					       void *context),
+			      void *context,
+			      struct ib_sa_query **sa_query)
+{
+	/*
+	 * Stub function.  It is reached from mad.c via mlx4_ib_init_sriov(),
+	 * which calls mlx4_ib_init_alias_guid_service() in alias_GUID.c,
+	 * which in turn lands here.
+	 */
+	printk(KERN_ERR "ib_sa_guid_info_rec_query() should only be called in the SRIOV flow\n");
+
+	return 0;
+}
+
+/**
+ * ib_sa_informinfo_query - Start an InformInfo registration.
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Inform record to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when notice handler registration completes,
+ * times out or is canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * This function sends inform info to register with SA to receive
+ * in-service notice.
+ * The callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_informinfo_query() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_inform *rec,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					   struct ib_sa_inform *resp,
+					   void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query)
+{
+	struct ib_sa_inform_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback = callback;
+	query->context  = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_inform_callback : NULL;
+	query->sa_query.release  = ib_sa_inform_release;
+	query->sa_query.port     = port;
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_SET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_INFORM_INFO);
+
+	ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data);
+
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+err1:
+	kfree(query);
+	return ret;
+}
+
+static void ib_sa_notice_resp(struct ib_sa_port *port,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *mad_buf;
+	struct ib_sa_mad *mad;
+	int ret;
+	unsigned long flags;
+
+	mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0,
+				     IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+				     GFP_KERNEL);
+	if (IS_ERR(mad_buf))
+		return;
+
+	mad = mad_buf->mad;
+	memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad);
+	mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP;
+
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (!port->sm_ah) {
+		spin_unlock_irqrestore(&port->ah_lock, flags);
+		ib_free_send_mad(mad_buf);
+		return;
+	}
+	kref_get(&port->sm_ah->ref);
+	mad_buf->context[0] = &port->sm_ah->ref;
+	mad_buf->ah = port->sm_ah->ah;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	ret = ib_post_send_mad(mad_buf, NULL);
+	if (ret)
+		goto err;
+
+	return;
+err:
+	kref_put(mad_buf->context[0], free_sm_ah);
+	ib_free_send_mad(mad_buf);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+	unsigned long flags;
+
+	if (query->callback)
+		switch (mad_send_wc->status) {
+		case IB_WC_SUCCESS:
+			/* No callback -- already got recv */
+			break;
+		case IB_WC_RESP_TIMEOUT_ERR:
+			query->callback(query, -ETIMEDOUT, NULL);
+			break;
+		case IB_WC_WR_FLUSH_ERR:
+			query->callback(query, -EINTR, NULL);
+			break;
+		default:
+			query->callback(query, -EIO, NULL);
+			break;
+		}
+
+	spin_lock_irqsave(&idr_lock, flags);
+	idr_remove(&query_idr, query->id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	free_mad(query);
+	ib_sa_client_put(query->client);
+	query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *mad_buf;
+
+	mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
+	query = mad_buf->context[0];
+
+	if (query->callback) {
+		if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+			query->callback(query,
+					mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+					-EINVAL : 0,
+					(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+		else
+			query->callback(query, -EIO, NULL);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void notice_resp_handler(struct ib_mad_agent *agent,
+				struct ib_mad_send_wc *mad_send_wc)
+{
+	kref_put(mad_send_wc->send_buf->context[0], free_sm_ah);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void notice_handler(struct ib_mad_agent *mad_agent,
+			   struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_port *port;
+	struct ib_sa_mad *mad;
+	struct ib_sa_notice notice;
+
+	port = mad_agent->context;
+	mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+	ib_unpack(notice_table, ARRAY_SIZE(notice_table), mad->data, &notice);
+
+	if (!notice_dispatch(port->device, port->port_num, &notice))
+		ib_sa_notice_resp(port, mad_recv_wc);
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_SUBN_ADM,
+		.mgmt_class_version = 2
+	};
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kzalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+		if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		sa_dev->port[i].device = device;
+		set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask);
+		sa_dev->port[i].notice_agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      &reg_req, 0, notice_resp_handler,
+					      notice_handler, &sa_dev->port[i]);
+
+		if (IS_ERR(sa_dev->port[i].notice_agent))
+			goto err;
+
+		INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+	}
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler))
+		goto err;
+
+	for (i = 0; i <= e - s; ++i)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			update_sm_ah(&sa_dev->port[i].update_task);
+
+	return;
+
+err:
+	while (--i >= 0)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+			if (!IS_ERR(sa_dev->port[i].notice_agent))
+				ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
+			if (!IS_ERR(sa_dev->port[i].agent))
+				ib_unregister_mad_agent(sa_dev->port[i].agent);
+		}
+
+	kfree(sa_dev);
+
+	return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int i;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	flush_scheduled_work();
+
+	for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+			ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+			if (sa_dev->port[i].sm_ah)
+				kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+		}
+
+	}
+
+	kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+	int ret;
+
+	spin_lock_init(&idr_lock);
+	spin_lock_init(&tid_lock);
+
+	get_random_bytes(&tid, sizeof tid);
+
+	ret = ib_register_client(&sa_client);
+	if (ret) {
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+		goto err1;
+	}
+
+	ret = mcast_init();
+	if (ret) {
+		printk(KERN_ERR "Couldn't initialize multicast handling\n");
+		goto err2;
+	}
+
+	ret = notice_init();
+	if (ret) {
+		printk(KERN_ERR "Couldn't initialize notice handling\n");
+		goto err3;
+	}
+
+	ret = sa_db_init();
+	if (ret) {
+		printk(KERN_ERR "Couldn't initialize local SA\n");
+		goto err4;
+	}
+
+	return 0;
+err4:
+	notice_cleanup();
+err3:
+	mcast_cleanup();
+err2:
+	ib_unregister_client(&sa_client);
+err1:
+	return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+	sa_db_cleanup();
+	mcast_cleanup();
+	notice_cleanup();
+	ib_unregister_client(&sa_client);
+	idr_destroy(&query_idr);
+}
+
+module_init_order(ib_sa_init, SI_ORDER_SECOND);
+module_exit(ib_sa_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/sa_query.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/smi.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/smi.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/smi.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+				       u8 node_type, int port_num)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 */
+		if (hop_cnt && hop_ptr == 0) {
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:2 */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- Fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+	} else {
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			smp->hop_ptr--;
+			/* C14-13:3 -- SMPs destined for SM shouldn't be here */
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_slid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+		if (hop_ptr == 0)
+			return IB_SMI_HANDLE;
+
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return IB_SMI_DISCARD;
+	}
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 -- sender should have incremented hop_ptr */
+		if (hop_cnt && hop_ptr == 0)
+			return IB_SMI_DISCARD;
+
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+			return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			if (hop_cnt)
+				smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+	} else {
+
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->hop_ptr updated when sending */
+			return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			if (smp->dr_slid == IB_LID_PERMISSIVE) {
+				/* giving SMP to SM - update hop_ptr */
+				smp->hop_ptr--;
+				return IB_SMI_HANDLE;
+			}
+			/* smp->hop_ptr updated when sending */
+			return (node_type == RDMA_NODE_IB_SWITCH ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> give to SM */
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+	}
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt)
+			return IB_SMI_FORWARD;
+
+		/* C14-9:3 -- at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt)
+			return (smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_SEND : IB_SMI_LOCAL);
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		if (hop_ptr == hop_cnt + 1)
+			return IB_SMI_SEND;
+	} else {
+		/* C14-13:2  -- intermediate hop */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+			return IB_SMI_FORWARD;
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1)
+			return (smp->dr_slid != IB_LID_PERMISSIVE ?
+				IB_SMI_SEND : IB_SMI_LOCAL);
+	}
+	return IB_SMI_LOCAL;
+}
+
+/*
+ * Return the forwarding port number: from initial_path for an outgoing SMP,
+ * or from return_path for a returning SMP.
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+	return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+		smp->return_path[smp->hop_ptr-1]);
+}
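
The three routines above form a small dispatch pipeline for directed-route
SMPs. A minimal sketch of how a receive path might chain them; the function
handle_incoming_dr_smp() and its integer return convention are hypothetical,
not part of this commit:

	/* Sketch only: assumes the MAD header was already validated. */
	static int handle_incoming_dr_smp(struct ib_smp *smp, u8 node_type,
					  int port_num, int phys_port_cnt)
	{
		if (smi_handle_dr_smp_recv(smp, node_type, port_num,
					   phys_port_cnt) == IB_SMI_DISCARD)
			return -EINVAL;	/* failed a C14-9/C14-13 check */

		switch (smi_check_forward_dr_smp(smp)) {
		case IB_SMI_LOCAL:
			return 0;	/* complete up to the local SMA/SM */
		case IB_SMI_SEND:
			return 1;	/* re-queue on the send side */
		case IB_SMI_FORWARD:
		default:
			/* a switch relays it out smi_get_fwd_port(smp) */
			return 2;
		}
	}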


Property changes on: trunk/sys/ofed/drivers/infiniband/core/smi.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/smi.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/smi.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/smi.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+	IB_SMI_DISCARD,
+	IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+	IB_SMI_LOCAL,	/* SMP should be completed up the stack */
+	IB_SMI_SEND,	/* received DR SMP should be forwarded to the send queue */
+	IB_SMI_FORWARD	/* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+					      u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+						  struct ib_device *device)
+{
+	/* C14-9:3 -- We're at the end of the DR segment of path */
+	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+	return ((device->process_mad &&
+		!ib_get_smp_direction(smp) &&
+		(smp->hop_ptr == smp->hop_cnt + 1)) ?
+		IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+						   struct ib_device *device)
+{
+	/* C14-13:3 -- We're at the end of the DR segment of path */
+	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+	return ((device->process_mad &&
+		ib_get_smp_direction(smp) &&
+		!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif	/* __SMI_H_ */
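
The inline helpers above give the MAD layer a one-line test for whether a DR
SMP terminates at this node. A hedged sketch of the dispatch decision, where
process_local() stands in for a driver's process_mad hook and is not a real
symbol:

	if (!ib_get_smp_direction(smp)) {
		/* outgoing direction: local iff hop_ptr == hop_cnt + 1 */
		if (smi_check_local_smp(smp, device) == IB_SMI_HANDLE)
			process_local(device, smp);
	} else {
		/* returning direction: local iff hop_ptr == 0 */
		if (smi_check_local_returning_smp(smp, device) == IB_SMI_HANDLE)
			process_local(device, smp);
	}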


Property changes on: trunk/sys/ofed/drivers/infiniband/core/smi.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/sysfs.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/sysfs.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/sysfs.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,981 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
+
+struct ib_port {
+	struct kobject         kobj;
+	struct ib_device      *ibdev;
+	struct attribute_group gid_group;
+	struct attribute_group pkey_group;
+	u8                     port_num;
+};
+
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct ib_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+	struct port_attribute	attr;
+	char			name[8];
+	int			index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+	if (!port_attr->show)
+		return -EIO;
+
+	return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	static const char *state_name[] = {
+		[IB_PORT_NOP]		= "NOP",
+		[IB_PORT_DOWN]		= "DOWN",
+		[IB_PORT_INIT]		= "INIT",
+		[IB_PORT_ARMED]		= "ARMED",
+		[IB_PORT_ACTIVE]	= "ACTIVE",
+		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
+	};
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d: %s\n", attr.state,
+		       attr.state < ARRAY_SIZE(state_name) ?
+		       state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+			char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+				   struct port_attribute *unused,
+				   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+			   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+			     char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+			 char *buf)
+{
+	struct ib_port_attr attr;
+	char *speed = "";
+	int rate;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.active_speed) {
+	case 2: speed = " DDR"; break;
+	case 4: speed = " QDR"; break;
+	}
+
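+	/*
+	 * active_speed is in 2.5 Gb/s-per-lane units (1 = SDR, 2 = DDR,
+	 * 4 = QDR), so 25 * width * speed yields the link rate in tenths
+	 * of a Gb/s; e.g. 4X DDR -> 25 * 4 * 2 = 200 -> "20 Gb/sec (4X DDR)".
+	 */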
+	rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
+	if (rate < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+		       rate / 10, rate % 10 ? ".5" : "",
+		       ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	struct ib_port_attr attr;
+
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.phys_state) {
+	case 1:  return sprintf(buf, "1: Sleep\n");
+	case 2:  return sprintf(buf, "2: Polling\n");
+	case 3:  return sprintf(buf, "3: Disabled\n");
+	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
+	case 5:  return sprintf(buf, "5: LinkUp\n");
+	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
+	case 7:  return sprintf(buf, "7: Phy Test\n");
+	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+	}
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		return sprintf(buf, "%s\n", "IB");
+	case IB_LINK_LAYER_ETHERNET:
+		return sprintf(buf, "%s\n", "Ethernet");
+	default:
+		return sprintf(buf, "%s\n", "Unknown");
+	}
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+	&port_attr_state.attr,
+	&port_attr_lid.attr,
+	&port_attr_lid_mask_count.attr,
+	&port_attr_sm_lid.attr,
+	&port_attr_sm_sl.attr,
+	&port_attr_cap_mask.attr,
+	&port_attr_rate.attr,
+	&port_attr_phys_state.attr,
+	&port_attr_link_layer.attr,
+	NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+			     char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	ssize_t ret;
+	u16 *raw;
+
+	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+	if (ret)
+		return ret;
+
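+	/*
+	 * gid.raw holds the GID in network byte order; reading it as u16
+	 * words and passing them through htons() prints each group in wire
+	 * order on both little- and big-endian hosts.
+	 */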
+	raw = (u16 *)gid.raw;
+	return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n",
+	    htons(raw[0]), htons(raw[1]), htons(raw[2]), htons(raw[3]),
+	    htons(raw[4]), htons(raw[5]), htons(raw[6]), htons(raw[7]));
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	u16 pkey;
+	ssize_t ret;
+
+	ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%04x\n", pkey);
+}
+
+static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr,
+                                char *buf, int c_ext)
+{
+        struct port_table_attribute *tab_attr =
+                container_of(attr, struct port_table_attribute, attr);
+        int offset = tab_attr->index & 0xffff;
+        int width  = (tab_attr->index >> 16) & 0xff;
+        struct ib_mad *in_mad  = NULL;
+        struct ib_mad *out_mad = NULL;
+        ssize_t ret;
+
+        if (!p->ibdev->process_mad)
+                return -ENXIO;
+
+        in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+        out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+        if (!in_mad || !out_mad) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        in_mad->mad_hdr.base_version  = 1;
+        in_mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
+        in_mad->mad_hdr.class_version = 1;
+        in_mad->mad_hdr.method        = IB_MGMT_METHOD_GET;
+        if (c_ext)
+                in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT;
+        else
+                in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS;
+
+        in_mad->data[41] = p->port_num; /* PortSelect field */
+
+        if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+                 p->port_num, NULL, NULL, in_mad, out_mad) &
+             (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+            (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        switch (width) {
+        case 4:
+                ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+                                            (4 - (offset % 8))) & 0xf);
+                break;
+        case 8:
+                ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+                break;
+        case 16:
+                ret = sprintf(buf, "%u\n",
+                              be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+                break;
+        case 32:
+                ret = sprintf(buf, "%u\n",
+                              be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+                break;
+        case 64:
+                ret = sprintf(buf, "%llu\n", (unsigned long long)
+                              be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8)));
+                break;
+        default:
+                ret = 0;
+        }
+
+out:
+        kfree(in_mad);
+        kfree(out_mad);
+
+        return ret;
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset)                 \
+struct port_table_attribute port_pma_attr_##_name = {                   \
+        .attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),        \
+        .index = (_offset) | ((_width) << 16) | ((_counter) << 24)      \
+}
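+
+/*
+ * The .index field packs the PortCounters layout: bits 0-15 hold the bit
+ * offset of the counter within the attribute, bits 16-23 its width in bits,
+ * and bits 24 and up the counter number (carried but not consulted by
+ * get_pma_counters()).  E.g. symbol_error below is the 16-bit field at bit
+ * offset 32, read as a __be16 at out_mad->data[40 + 32 / 8].
+ */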
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+                                char *buf)
+{
+        return get_pma_counters(p, attr, buf, 0);
+}
+
+static PORT_PMA_ATTR(symbol_error                   ,  0, 16,  32);
+static PORT_PMA_ATTR(link_error_recovery            ,  1,  8,  48);
+static PORT_PMA_ATTR(link_downed                    ,  2,  8,  56);
+static PORT_PMA_ATTR(port_rcv_errors                ,  3, 16,  64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors,  4, 16,  80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors   ,  5, 16,  96);
+static PORT_PMA_ATTR(port_xmit_discards             ,  6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors    ,  7,  8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors     ,  8,  8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors    ,  9,  4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10,  4, 156);
+static PORT_PMA_ATTR(VL15_dropped                   , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data                 , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data                  , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets              , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets               , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = {
+        &port_pma_attr_symbol_error.attr.attr,
+        &port_pma_attr_link_error_recovery.attr.attr,
+        &port_pma_attr_link_downed.attr.attr,
+        &port_pma_attr_port_rcv_errors.attr.attr,
+        &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+        &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+        &port_pma_attr_port_xmit_discards.attr.attr,
+        &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+        &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+        &port_pma_attr_local_link_integrity_errors.attr.attr,
+        &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+        &port_pma_attr_VL15_dropped.attr.attr,
+        &port_pma_attr_port_xmit_data.attr.attr,
+        &port_pma_attr_port_rcv_data.attr.attr,
+        &port_pma_attr_port_xmit_packets.attr.attr,
+        &port_pma_attr_port_rcv_packets.attr.attr,
+        NULL
+};
+
+static struct attribute_group pma_group = {
+	.name  = "counters",
+	.attrs  = pma_attrs
+};
+
+#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset)             \
+struct port_table_attribute port_pma_attr_ext_##_name = {               \
+        .attr  = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL),    \
+        .index = (_offset) | ((_width) << 16) | ((_counter) << 24)      \
+}
+
+static ssize_t show_pma_counter_ext(struct ib_port *p,
+                                    struct port_attribute *attr, char *buf)
+{
+        return get_pma_counters(p, attr, buf, 1);
+}
+
+static PORT_PMA_ATTR_EXT(port_xmit_data_64           ,  0, 64,  64);
+static PORT_PMA_ATTR_EXT(port_rcv_data_64            ,  0, 64,  128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets_64        ,  0, 64,  192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets_64         ,  0, 64,  256);
+static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets   ,  0, 64,  320);
+static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets    ,  0, 64,  384);
+static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets ,  0, 64,  448);
+static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets  ,  0, 64,  512);
+
+static struct attribute *pma_attrs_ext[] = {
+        &port_pma_attr_ext_port_xmit_data_64.attr.attr,
+        &port_pma_attr_ext_port_rcv_data_64.attr.attr,
+        &port_pma_attr_ext_port_xmit_packets_64.attr.attr,
+        &port_pma_attr_ext_port_rcv_packets_64.attr.attr,
+        &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr,
+        &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr,
+        &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr,
+        &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr,
+        NULL
+};
+
+static struct attribute_group pma_ext_group = {
+        .name  = "counters_ext",
+        .attrs  = pma_attrs_ext
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+	struct attribute *a;
+	int i;
+
+	for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+		kfree(a);
+
+	kfree(p->gid_group.attrs);
+
+	for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+		kfree(a);
+
+	kfree(p->pkey_group.attrs);
+
+	kfree(p);
+}
+
+static struct kobj_type port_type = {
+	.release       = ib_port_release,
+	.sysfs_ops     = &port_sysfs_ops,
+	.default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct device *device)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	kfree(dev);
+}
+
+#ifdef __linux__
+/* BSD supports this through devfs(5) and devd(8). */
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	if (add_uevent_var(env, "NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+
+	return 0;
+}
+#endif
+
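+/*
+ * Build a NULL-terminated array of 'len' read-only attributes named
+ * "0".."len-1", all sharing one show() routine and distinguished by their
+ * ->index.  The array and its elements are freed either on the error paths
+ * of add_port() or in ib_port_release().
+ */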
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+				  struct port_attribute *, char *buf),
+		  int len)
+{
+	struct attribute **tab_attr;
+	struct port_table_attribute *element;
+	int i;
+
+	tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+	if (!tab_attr)
+		return NULL;
+
+	for (i = 0; i < len; i++) {
+		element = kzalloc(sizeof(struct port_table_attribute),
+				  GFP_KERNEL);
+		if (!element)
+			goto err;
+
+		if (snprintf(element->name, sizeof(element->name),
+			     "%d", i) >= sizeof(element->name)) {
+			kfree(element);
+			goto err;
+		}
+
+		element->attr.attr.name  = element->name;
+		element->attr.attr.mode  = S_IRUGO;
+		element->attr.show       = show;
+		element->index		 = i;
+
+		tab_attr[i] = &element->attr.attr;
+	}
+
+	return tab_attr;
+
+err:
+	while (--i >= 0)
+		kfree(tab_attr[i]);
+	kfree(tab_attr);
+	return NULL;
+}
+
+static int add_port(struct ib_device *device, int port_num,
+                    int (*port_callback)(struct ib_device *,
+                                         u8, struct kobject *))
+{
+	struct ib_port *p;
+	struct ib_port_attr attr;
+	int i;
+	int ret;
+
+	ret = ib_query_port(device, port_num, &attr);
+	if (ret)
+		return ret;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->ibdev      = device;
+	p->port_num   = port_num;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   kobject_get(device->ports_parent),
+				   "%d", port_num);
+	if (ret)
+		goto err_put;
+
+	ret = sysfs_create_group(&p->kobj, &pma_group);
+	if (ret)
+		goto err_put;
+
+	ret = sysfs_create_group(&p->kobj, &pma_ext_group);
+	if (ret)
+		goto err_remove_pma;
+
+	p->gid_group.name  = "gids";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+	if (!p->gid_group.attrs)
+		goto err_remove_pma_ext;
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	p->pkey_group.name  = "pkeys";
+	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+						attr.pkey_tbl_len);
+	if (!p->pkey_group.attrs)
+		goto err_remove_gid;
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	if (port_callback) {
+		ret = port_callback(device, port_num, &p->kobj);
+		if (ret)
+			goto err_remove_pkey;
+	}
+
+	list_add_tail(&p->kobj.entry, &device->port_list);
+
+#ifdef __linux__
+	kobject_uevent(&p->kobj, KOBJ_ADD);
+#endif
+	return 0;
+
+err_remove_pkey:
+	sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+	for (i = 0; i < attr.pkey_tbl_len; ++i)
+		kfree(p->pkey_group.attrs[i]);
+
+	kfree(p->pkey_group.attrs);
+
+err_remove_gid:
+	sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_group.attrs[i]);
+
+	kfree(p->gid_group.attrs);
+
+err_remove_pma_ext:
+	sysfs_remove_group(&p->kobj, &pma_ext_group);
+
+err_remove_pma:
+	sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+	kobject_put(device->ports_parent);
+	kfree(p);
+	return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	switch (dev->node_type) {
+	case RDMA_NODE_IB_CA:	  return sprintf(buf, "%d: CA\n", dev->node_type);
+	case RDMA_NODE_RNIC:	  return sprintf(buf, "%d: RNIC\n", dev->node_type);
+	case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+	case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+	}
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+				   struct device_attribute *dev_attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_device(dev, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_modify desc = {};
+	int ret;
+
+	if (!dev->modify_device)
+		return -EIO;
+
+	memcpy(desc.node_desc, buf, min_t(int, count, 64));
+	ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+
+static struct device_attribute *ib_class_attributes[] = {
+	&dev_attr_node_type,
+	&dev_attr_sys_image_guid,
+	&dev_attr_node_guid,
+	&dev_attr_node_desc
+};
+
+static struct class ib_class = {
+	.name    = "infiniband",
+	.dev_release = ib_device_release,
+#ifdef __linux__
+	.dev_uevent = ib_device_uevent,
+#endif
+};
+
+/* Show a given attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+			    struct device_attribute *attr, char *buf,
+			    unsigned offset)
+{
+	struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev);
+	union rdma_protocol_stats stats;
+	ssize_t ret;
+
+	ret = dev->get_protocol_stats(dev, &stats);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%llu\n",
+		       (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name)						\
+static ssize_t show_##name(struct device *device,			\
+			   struct device_attribute *attr, char *buf)	\
+{									\
+	return show_protocol_stat(device, attr, buf,			\
+				  offsetof(struct iw_protocol_stats, name) / \
+				  sizeof (u64));			\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
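+/*
+ * Each IW_STATS_ENTRY(name) expands to a show_<name>() that returns the u64
+ * at offsetof(struct iw_protocol_stats, name) within the stats union, plus a
+ * read-only dev_attr_<name> referenced in iw_proto_stats_attrs[] below.
+ */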
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+	&dev_attr_ipInReceives.attr,
+	&dev_attr_ipInHdrErrors.attr,
+	&dev_attr_ipInTooBigErrors.attr,
+	&dev_attr_ipInNoRoutes.attr,
+	&dev_attr_ipInAddrErrors.attr,
+	&dev_attr_ipInUnknownProtos.attr,
+	&dev_attr_ipInTruncatedPkts.attr,
+	&dev_attr_ipInDiscards.attr,
+	&dev_attr_ipInDelivers.attr,
+	&dev_attr_ipOutForwDatagrams.attr,
+	&dev_attr_ipOutRequests.attr,
+	&dev_attr_ipOutDiscards.attr,
+	&dev_attr_ipOutNoRoutes.attr,
+	&dev_attr_ipReasmTimeout.attr,
+	&dev_attr_ipReasmReqds.attr,
+	&dev_attr_ipReasmOKs.attr,
+	&dev_attr_ipReasmFails.attr,
+	&dev_attr_ipFragOKs.attr,
+	&dev_attr_ipFragFails.attr,
+	&dev_attr_ipFragCreates.attr,
+	&dev_attr_ipInMcastPkts.attr,
+	&dev_attr_ipOutMcastPkts.attr,
+	&dev_attr_ipInBcastPkts.attr,
+	&dev_attr_ipOutBcastPkts.attr,
+	&dev_attr_tcpRtoAlgorithm.attr,
+	&dev_attr_tcpRtoMin.attr,
+	&dev_attr_tcpRtoMax.attr,
+	&dev_attr_tcpMaxConn.attr,
+	&dev_attr_tcpActiveOpens.attr,
+	&dev_attr_tcpPassiveOpens.attr,
+	&dev_attr_tcpAttemptFails.attr,
+	&dev_attr_tcpEstabResets.attr,
+	&dev_attr_tcpCurrEstab.attr,
+	&dev_attr_tcpInSegs.attr,
+	&dev_attr_tcpOutSegs.attr,
+	&dev_attr_tcpRetransSegs.attr,
+	&dev_attr_tcpInErrs.attr,
+	&dev_attr_tcpOutRsts.attr,
+	NULL
+};
+
+static struct attribute_group iw_stats_group = {
+	.name	= "proto_stats",
+	.attrs	= iw_proto_stats_attrs,
+};
+
+int ib_device_register_sysfs(struct ib_device *device,
+                                int (*port_callback)(struct ib_device *, u8, struct kobject *))
+{
+	struct device *class_dev = &device->dev;
+	int ret;
+	int i;
+
+	class_dev->class      = &ib_class;
+	class_dev->parent     = device->dma_device;
+	dev_set_name(class_dev, "%s", device->name);
+	dev_set_drvdata(class_dev, device);
+
+	INIT_LIST_HEAD(&device->port_list);
+
+	ret = device_register(class_dev);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+		ret = device_create_file(class_dev, ib_class_attributes[i]);
+		if (ret)
+			goto err_unregister;
+	}
+
+	device->ports_parent = kobject_create_and_add("ports",
+					kobject_get(&class_dev->kobj));
+	if (!device->ports_parent) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		ret = add_port(device, 0, port_callback);
+		if (ret)
+			goto err_put;
+	} else {
+		for (i = 1; i <= device->phys_port_cnt; ++i) {
+			ret = add_port(device, i, port_callback);
+			if (ret)
+				goto err_put;
+		}
+	}
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
+	return 0;
+
+err_put:
+	{
+		struct kobject *p, *t;
+		struct ib_port *port;
+
+		list_for_each_entry_safe(p, t, &device->port_list, entry) {
+			list_del(&p->entry);
+			port = container_of(p, struct ib_port, kobj);
+			sysfs_remove_group(p, &pma_group);
+			sysfs_remove_group(p, &port->pkey_group);
+			sysfs_remove_group(p, &port->gid_group);
+			kobject_put(p);
+		}
+	}
+
+	kobject_put(&class_dev->kobj);
+
+err_unregister:
+	device_unregister(class_dev);
+
+err:
+	return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+	struct kobject *p, *t;
+	struct ib_port *port;
+	int i;
+
+	/* Hold kobject until ib_dealloc_device() */
+	kobject_get(&device->dev.kobj);
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+		device_remove_file(&device->dev, ib_class_attributes[i]);
+
+	list_for_each_entry_safe(p, t, &device->port_list, entry) {
+		list_del(&p->entry);
+		port = container_of(p, struct ib_port, kobj);
+		sysfs_remove_group(p, &pma_group);
+		sysfs_remove_group(p, &port->pkey_group);
+		sysfs_remove_group(p, &port->gid_group);
+		kobject_put(p);
+	}
+
+	kobject_put(device->ports_parent);
+	device_unregister(&device->dev);
+}
+
+int ib_sysfs_setup(void)
+{
+	return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+	class_unregister(&ib_class);
+}
+
+/*int ib_sysfs_create_port_files(struct ib_device *device,
+			       int (*create)(struct ib_device *dev, u8 port_num,
+					     struct kobject *kobj))
+{
+	struct kobject *p;
+	struct ib_port *port;
+	int ret = 0;
+
+	list_for_each_entry(p, &device->port_list, entry) {
+		port = container_of(p, struct ib_port, kobj);
+		ret = create(device, port->port_num, &port->kobj);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_sysfs_create_port_files);*/


Property changes on: trunk/sys/ofed/drivers/infiniband/core/sysfs.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
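
For a two-port HCA this registration code yields a sysfs tree along the
following lines (illustrative only: "mlx4_0" is a made-up device name, a
single port 0 replaces ports 1..N on switches, and proto_stats appears only
for RNICs that implement get_protocol_stats):

	/sys/class/infiniband/mlx4_0/
		node_type  node_guid  sys_image_guid  node_desc
		ports/1/
			state  lid  lid_mask_count  sm_lid  sm_sl
			cap_mask  rate  phys_state  link_layer
			gids/0..N-1        (one entry per GID table slot)
			pkeys/0..N-1       (one entry per P_Key table slot)
			counters/          (PMA PortCounters fields)
			counters_ext/      (PMA PortCountersExt fields)
		ports/2/
			...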
Added: trunk/sys/ofed/drivers/infiniband/core/ucm.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/ucm.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/ucm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1344 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+	int			devnum;
+	struct cdev		cdev;
+	struct device		dev;
+	struct ib_device	*ib_dev;
+};
+
+struct ib_ucm_file {
+	struct mutex file_mutex;
+	struct file *filp;
+	struct ib_ucm_device *device;
+
+	struct list_head  ctxs;
+	struct list_head  events;
+	wait_queue_head_t poll_wait;
+};
+
+struct ib_ucm_context {
+	int                 id;
+	struct completion   comp;
+	atomic_t            ref;
+	int		    events_reported;
+
+	struct ib_ucm_file *file;
+	struct ib_cm_id    *cm_id;
+	__u64		   uid;
+
+	struct list_head    events;    /* list of pending events. */
+	struct list_head    file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+	struct ib_ucm_context *ctx;
+	struct list_head file_list; /* member in file event list */
+	struct list_head ctx_list;  /* member in ctx event list */
+
+	struct ib_cm_id *cm_id;
+	struct ib_ucm_event_resp resp;
+	void *data;
+	void *info;
+	int data_len;
+	int info_len;
+};
+
+enum {
+	IB_UCM_MAJOR = 231,
+	IB_UCM_BASE_MINOR = 224,
+	IB_UCM_MAX_DEVICES = 32
+};
+
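+/* Fixed char device region: major 231, minors 224-255 (IB_UCM_MAX_DEVICES). */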
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+static struct ib_client ucm_client = {
+	.name   = "ucm",
+	.add    = ib_ucm_add_one,
+	.remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex);
+static DEFINE_IDR(ctx_id_table);
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+	struct ib_ucm_context *ctx;
+
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	else
+		atomic_inc(&ctx->ref);
+	mutex_unlock(&ctx_id_mutex);
+
+	return ctx;
+}
+
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->ref))
+		complete(&ctx->comp);
+}
+
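+/*
+ * A REQ or SIDR REQ arrives on a listening id and implicitly creates a new
+ * passive-side cm_id; ib_ucm_event() must wrap it in a fresh context, and
+ * unclaimed ids are destroyed in ib_ucm_cleanup_events().
+ */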
+static inline int ib_ucm_new_cm_id(int event)
+{
+	return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
+}
+
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+	struct ib_ucm_event *uevent;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_del(&ctx->file_list);
+	while (!list_empty(&ctx->events)) {
+
+		uevent = list_entry(ctx->events.next,
+				    struct ib_ucm_event, ctx_list);
+		list_del(&uevent->file_list);
+		list_del(&uevent->ctx_list);
+		mutex_unlock(&ctx->file->file_mutex);
+
+		/* clear incoming connections. */
+		if (ib_ucm_new_cm_id(uevent->resp.event))
+			ib_destroy_cm_id(uevent->cm_id);
+
+		kfree(uevent);
+		mutex_lock(&ctx->file->file_mutex);
+	}
+	mutex_unlock(&ctx->file->file_mutex);
+}
+
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+	struct ib_ucm_context *ctx;
+	int result;
+
+	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	ctx->file = file;
+	INIT_LIST_HEAD(&ctx->events);
+
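+	/*
+	 * idr_pre_get() only preallocates; idr_get_new() can still return
+	 * -EAGAIN if another thread consumed the preallocation between the
+	 * two calls, hence the retry loop.
+	 */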
+	do {
+		result = idr_pre_get(&ctx_id_table, GFP_KERNEL);
+		if (!result)
+			goto error;
+
+		mutex_lock(&ctx_id_mutex);
+		result = idr_get_new(&ctx_id_table, ctx, &ctx->id);
+		mutex_unlock(&ctx_id_mutex);
+	} while (result == -EAGAIN);
+
+	if (result)
+		goto error;
+
+	list_add_tail(&ctx->file_list, &file->ctxs);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+				 struct ib_cm_req_event_param *kreq)
+{
+	ureq->remote_ca_guid             = kreq->remote_ca_guid;
+	ureq->remote_qkey                = kreq->remote_qkey;
+	ureq->remote_qpn                 = kreq->remote_qpn;
+	ureq->qp_type                    = kreq->qp_type;
+	ureq->starting_psn               = kreq->starting_psn;
+	ureq->responder_resources        = kreq->responder_resources;
+	ureq->initiator_depth            = kreq->initiator_depth;
+	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
+	ureq->flow_control               = kreq->flow_control;
+	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+	ureq->retry_count                = kreq->retry_count;
+	ureq->rnr_retry_count            = kreq->rnr_retry_count;
+	ureq->srq                        = kreq->srq;
+	ureq->port			 = kreq->port;
+
+	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+	if (kreq->alternate_path)
+		ib_copy_path_rec_to_user(&ureq->alternate_path,
+					 kreq->alternate_path);
+}
+
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+				 struct ib_cm_rep_event_param *krep)
+{
+	urep->remote_ca_guid      = krep->remote_ca_guid;
+	urep->remote_qkey         = krep->remote_qkey;
+	urep->remote_qpn          = krep->remote_qpn;
+	urep->starting_psn        = krep->starting_psn;
+	urep->responder_resources = krep->responder_resources;
+	urep->initiator_depth     = krep->initiator_depth;
+	urep->target_ack_delay    = krep->target_ack_delay;
+	urep->failover_accepted   = krep->failover_accepted;
+	urep->flow_control        = krep->flow_control;
+	urep->rnr_retry_count     = krep->rnr_retry_count;
+	urep->srq                 = krep->srq;
+}
+
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+				      struct ib_cm_sidr_rep_event_param *krep)
+{
+	urep->status = krep->status;
+	urep->qkey   = krep->qkey;
+	urep->qpn    = krep->qpn;
+}
+
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+				struct ib_ucm_event *uvt)
+{
+	void *info = NULL;
+
+	switch (evt->event) {
+	case IB_CM_REQ_RECEIVED:
+		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+				     &evt->param.req_rcvd);
+		uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
+		uvt->resp.present  = IB_UCM_PRES_PRIMARY;
+		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+				      IB_UCM_PRES_ALTERNATE : 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+				     &evt->param.rep_rcvd);
+		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_RTU_RECEIVED:
+		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREP_RECEIVED:
+		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		uvt->resp.u.mra_resp.timeout =
+					evt->param.mra_rcvd.service_timeout;
+		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_REJ_RECEIVED:
+		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.rej_rcvd.ari_length;
+		info	      = evt->param.rej_rcvd.ari;
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+					 evt->param.lap_rcvd.alternate_path);
+		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+		break;
+	case IB_CM_APR_RECEIVED:
+		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.apr_rcvd.info_len;
+		info	      = evt->param.apr_rcvd.apr_info;
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		uvt->resp.u.sidr_req_resp.pkey =
+					evt->param.sidr_req_rcvd.pkey;
+		uvt->resp.u.sidr_req_resp.port =
+					evt->param.sidr_req_rcvd.port;
+		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+					  &evt->param.sidr_rep_rcvd);
+		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+		info	      = evt->param.sidr_rep_rcvd.info;
+		break;
+	default:
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	}
+
+	if (uvt->data_len) {
+		uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+		if (!uvt->data)
+			goto err1;
+
+		uvt->resp.present |= IB_UCM_PRES_DATA;
+	}
+
+	if (uvt->info_len) {
+		uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+		if (!uvt->info)
+			goto err2;
+
+		uvt->resp.present |= IB_UCM_PRES_INFO;
+	}
+	return 0;
+
+err2:
+	kfree(uvt->data);
+err1:
+	return -ENOMEM;
+}
+
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *event)
+{
+	struct ib_ucm_event *uevent;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	ctx = cm_id->context;
+
+	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+	if (!uevent)
+		goto err1;
+
+	uevent->ctx = ctx;
+	uevent->cm_id = cm_id;
+	uevent->resp.uid = ctx->uid;
+	uevent->resp.id = ctx->id;
+	uevent->resp.event = event->event;
+
+	result = ib_ucm_event_process(event, uevent);
+	if (result)
+		goto err2;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_add_tail(&uevent->file_list, &ctx->file->events);
+	list_add_tail(&uevent->ctx_list, &ctx->events);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	if (ctx->file->filp)
+		selwakeup(&ctx->file->filp->f_selinfo);
+	mutex_unlock(&ctx->file->file_mutex);
+	return 0;
+
+err2:
+	kfree(uevent);
+err1:
+	/* A nonzero return tells the CM to destroy the newly created cm_id */
+	return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+			    const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_event_get cmd;
+	struct ib_ucm_event *uevent;
+	int result = 0;
+	DEFINE_WAIT(wait);
+
+	if (out_len < sizeof(struct ib_ucm_event_resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	while (list_empty(&file->events)) {
+		mutex_unlock(&file->file_mutex);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->events)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->file_mutex);
+	}
+
+	uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+	if (ib_ucm_new_cm_id(uevent->resp.event)) {
+		ctx = ib_ucm_ctx_alloc(file);
+		if (!ctx) {
+			result = -ENOMEM;
+			goto done;
+		}
+
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof(uevent->resp))) {
+		result = -EFAULT;
+		goto done;
+	}
+
+	if (uevent->data) {
+		if (cmd.data_len < uevent->data_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.data,
+				 uevent->data, uevent->data_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	if (uevent->info) {
+		if (cmd.info_len < uevent->info_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.info,
+				 uevent->info, uevent->info_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	list_del(&uevent->file_list);
+	list_del(&uevent->ctx_list);
+	uevent->ctx->events_reported++;
+
+	kfree(uevent->data);
+	kfree(uevent->info);
+	kfree(uevent);
+done:
+	mutex_unlock(&file->file_mutex);
+	return result;
+}
+
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct ib_ucm_create_id cmd;
+	struct ib_ucm_create_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	ctx = ib_ucm_ctx_alloc(file);
+	mutex_unlock(&file->file_mutex);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+				     ib_ucm_event_handler, ctx);
+	if (IS_ERR(ctx->cm_id)) {
+		result = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		result = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	ib_destroy_cm_id(ctx->cm_id);
+err1:
+	mutex_lock(&ctx_id_mutex);
+	idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+	kfree(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct ib_ucm_destroy_id cmd;
+	struct ib_ucm_destroy_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, cmd.id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	else
+		idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
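+	/*
+	 * Drop the reference taken in ib_ucm_ctx_alloc(); the completion
+	 * fires once every concurrent ib_ucm_ctx_get() caller has done its
+	 * matching ib_ucm_ctx_put(), so nothing else touches ctx below.
+	 */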
+	ib_ucm_ctx_put(ctx);
+	wait_for_completion(&ctx->comp);
+
+	/* No new events will be generated after destroying the cm_id. */
+	ib_destroy_cm_id(ctx->cm_id);
+	/* Cleanup events not yet reported to the user. */
+	ib_ucm_cleanup_events(ctx);
+
+	resp.events_reported = ctx->events_reported;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	kfree(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+			      const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ib_ucm_attr_id_resp resp;
+	struct ib_ucm_attr_id cmd;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.service_id   = ctx->cm_id->service_id;
+	resp.service_mask = ctx->cm_id->service_mask;
+	resp.local_id     = ctx->cm_id->local_id;
+	resp.remote_id    = ctx->cm_id->remote_id;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct ib_uverbs_qp_attr resp;
+	struct ib_ucm_init_qp_attr cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.qp_attr_mask = 0;
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (result)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+	service_id &= service_mask;
+
+	if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
+	    ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_listen cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+	if (result)
+		goto out;
+
+	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
+			      NULL);
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_notify cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+	void *data;
+
+	*dest = NULL;
+
+	if (!len)
+		return 0;
+
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	if (copy_from_user(data, (void __user *)(unsigned long)src, len)) {
+		kfree(data);
+		return -EFAULT;
+	}
+
+	*dest = data;
+	return 0;
+}
+
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+	struct ib_user_path_rec upath;
+	struct ib_sa_path_rec  *sa_path;
+
+	*path = NULL;
+
+	if (!src)
+		return 0;
+
+	sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
+	if (!sa_path)
+		return -ENOMEM;
+
+	if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+			   sizeof(upath))) {
+
+		kfree(sa_path);
+		return -EFAULT;
+	}
+
+	ib_copy_path_rec_from_user(sa_path, &upath);
+	*path = sa_path;
+	return 0;
+}
+
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_req cmd;
+	int result;
+
+	param.private_data   = NULL;
+	param.primary_path   = NULL;
+	param.alternate_path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+	if (result)
+		goto done;
+
+	param.private_data_len           = cmd.len;
+	param.service_id                 = cmd.sid;
+	param.qp_num                     = cmd.qpn;
+	param.qp_type                    = cmd.qp_type;
+	param.starting_psn               = cmd.psn;
+	param.peer_to_peer               = cmd.peer_to_peer;
+	param.responder_resources        = cmd.responder_resources;
+	param.initiator_depth            = cmd.initiator_depth;
+	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+	param.flow_control               = cmd.flow_control;
+	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
+	param.retry_count                = cmd.retry_count;
+	param.rnr_retry_count            = cmd.rnr_retry_count;
+	param.max_cm_retries             = cmd.max_cm_retries;
+	param.srq                        = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.primary_path);
+	kfree(param.alternate_path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_rep_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_rep cmd;
+	int result;
+
+	param.private_data = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	param.qp_num              = cmd.qpn;
+	param.starting_psn        = cmd.psn;
+	param.private_data_len    = cmd.len;
+	param.responder_resources = cmd.responder_resources;
+	param.initiator_depth     = cmd.initiator_depth;
+	param.failover_accepted   = cmd.failover_accepted;
+	param.flow_control        = cmd.flow_control;
+	param.rnr_retry_count     = cmd.rnr_retry_count;
+	param.srq                 = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		ctx->uid = cmd.uid;
+		result = ib_send_cm_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(param.private_data);
+	return result;
+}
+
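+/*
+ * RTU, DREQ and DREP messages differ only in the ib_cm call used to send
+ * them, so a single helper copies in the private data and dispatches
+ * through a function pointer.
+ */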
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+					const char __user *inbuf, int in_len,
+					int (*func)(struct ib_cm_id *cm_id,
+						    const void *private_data,
+						    u8 private_data_len))
+{
+	struct ib_ucm_private_data cmd;
+	struct ib_ucm_context *ctx;
+	const void *private_data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = func(ctx->cm_id, private_data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(private_data);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+				const char __user *inbuf, int in_len,
+				int (*func)(struct ib_cm_id *cm_id,
+					    int status,
+					    const void *info,
+					    u8 info_len,
+					    const void *data,
+					    u8 data_len))
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_info cmd;
+	const void *data = NULL;
+	const void *info = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+	if (result)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+			      data, cmd.data_len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(data);
+	kfree(info);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_mra cmd;
+	const void *data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(data);
+	return result;
+}
+
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_sa_path_rec *path = NULL;
+	struct ib_ucm_lap cmd;
+	const void *data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&path, cmd.path);
+	if (result)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(data);
+	kfree(path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_sidr_req cmd;
+	int result;
+
+	param.private_data = NULL;
+	param.path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.path, cmd.path);
+	if (result)
+		goto done;
+
+	param.private_data_len = cmd.len;
+	param.service_id       = cmd.sid;
+	param.timeout_ms       = cmd.timeout;
+	param.max_cm_retries   = cmd.max_cm_retries;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_rep_param param;
+	struct ib_ucm_sidr_rep cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	param.info = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data,
+				   cmd.data, cmd.data_len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+	if (result)
+		goto done;
+
+	param.qp_num		= cmd.qpn;
+	param.qkey		= cmd.qkey;
+	param.status		= cmd.status;
+	param.info_length	= cmd.info_len;
+	param.private_data_len	= cmd.data_len;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.info);
+	return result;
+}
+
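+/*
+ * Command dispatch table, indexed by the cmd field of the ib_ucm_cmd_hdr
+ * that userspace prepends to every write(2).  An illustrative LISTEN
+ * request (sketch only; buf and listen_cmd are hypothetical userspace
+ * locals, error handling omitted):
+ *
+ *	struct ib_ucm_cmd_hdr *hdr = (void *) buf;
+ *	hdr->cmd = IB_USER_CM_CMD_LISTEN;
+ *	hdr->in  = sizeof(struct ib_ucm_listen);
+ *	hdr->out = 0;
+ *	memcpy(buf + sizeof(*hdr), &listen_cmd, hdr->in);
+ *	write(fd, buf, sizeof(*hdr) + hdr->in);
+ */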
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len) = {
+	[IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
+	[IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
+	[IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
+	[IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
+	[IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
+	[IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
+	[IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
+	[IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
+	[IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
+	[IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
+	[IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
+	[IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
+	[IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
+	[IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
+	[IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+	[IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+	[IB_USER_CM_CMD_EVENT]	       = ib_ucm_event,
+	[IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
+};
+
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+			    size_t len, loff_t *pos)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_cmd_hdr hdr;
+	ssize_t result;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+					hdr.in, hdr.out);
+	if (!result)
+		result = len;
+
+	return result;
+}
+
+static unsigned int ib_ucm_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (!list_empty(&file->events))
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file;
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&file->events);
+	INIT_LIST_HEAD(&file->ctxs);
+	init_waitqueue_head(&file->poll_wait);
+
+	mutex_init(&file->file_mutex);
+
+	filp->private_data = file;
+	file->filp = filp;
+	file->device = container_of(inode->i_cdev->si_drv1,
+				    struct ib_ucm_device, cdev);
+
+	return 0;
+}
+
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_context *ctx;
+
+	mutex_lock(&file->file_mutex);
+	while (!list_empty(&file->ctxs)) {
+		ctx = list_entry(file->ctxs.next,
+				 struct ib_ucm_context, file_list);
+		mutex_unlock(&file->file_mutex);
+
+		mutex_lock(&ctx_id_mutex);
+		idr_remove(&ctx_id_table, ctx->id);
+		mutex_unlock(&ctx_id_mutex);
+
+		ib_destroy_cm_id(ctx->cm_id);
+		ib_ucm_cleanup_events(ctx);
+		kfree(ctx);
+
+		mutex_lock(&file->file_mutex);
+	}
+	mutex_unlock(&file->file_mutex);
+	kfree(file);
+	return 0;
+}
+
+static void ib_ucm_release_dev(struct device *dev)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+	cdev_del(&ucm_dev->cdev);
+	clear_bit(ucm_dev->devnum, dev_map);
+	kfree(ucm_dev);
+}
+
+static const struct file_operations ucm_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ib_ucm_open,
+	.release = ib_ucm_close,
+	.write 	 = ib_ucm_write,
+	.poll    = ib_ucm_poll,
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+	return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static void ib_ucm_add_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	if (!device->alloc_ucontext ||
+	    rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+	if (!ucm_dev)
+		return;
+
+	ucm_dev->ib_dev = device;
+
+	ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+	if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
+		goto err;
+
+	set_bit(ucm_dev->devnum, dev_map);
+
+	cdev_init(&ucm_dev->cdev, &ucm_fops);
+	ucm_dev->cdev.owner = THIS_MODULE;
+	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+	if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+		goto err;
+
+	ucm_dev->dev.class = &cm_class;
+	ucm_dev->dev.parent = device->dma_device;
+	ucm_dev->dev.devt = ucm_dev->cdev.dev;
+	ucm_dev->dev.release = ib_ucm_release_dev;
+	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+	if (device_register(&ucm_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+		goto err_dev;
+
+	ib_set_client_data(device, &ucm_client, ucm_dev);
+	return;
+
+err_dev:
+	device_unregister(&ucm_dev->dev);
+err_cdev:
+	cdev_del(&ucm_dev->cdev);
+	clear_bit(ucm_dev->devnum, dev_map);
+err:
+	kfree(ucm_dev);
+	return;
+}
+
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+
+	if (!ucm_dev)
+		return;
+
+	device_unregister(&ucm_dev->dev);
+}
+
+static ssize_t show_abi_version(struct class *class,
+				struct class_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ib_ucm_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+				     "infiniband_cm");
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register device number\n");
+		goto error1;
+	}
+
+	ret = class_create_file(&cm_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+		goto error2;
+	}
+
+	ret = ib_register_client(&ucm_client);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register client\n");
+		goto error3;
+	}
+	return 0;
+
+error3:
+	class_remove_file(&cm_class, &class_attr_abi_version);
+error2:
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+error1:
+	return ret;
+}
+
+static void __exit ib_ucm_cleanup(void)
+{
+	ib_unregister_client(&ucm_client);
+	class_remove_file(&cm_class, &class_attr_abi_version);
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	idr_destroy(&ctx_id_table);
+}
+
+module_init_order(ib_ucm_init, SI_ORDER_THIRD);
+module_exit(ib_ucm_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/ucm.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/ucma.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/ucma.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/ucma.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1353 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+
+#include <sys/filio.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	UCMA_MAX_BACKLOG	= 1024
+};
+
+struct ucma_file {
+	struct mutex		mut;
+	struct file		*filp;
+	struct list_head	ctx_list;
+	struct list_head	event_list;
+	wait_queue_head_t	poll_wait;
+};
+
+struct ucma_context {
+	int			id;
+	struct completion	comp;
+	atomic_t		ref;
+	int			events_reported;
+	int			backlog;
+
+	struct ucma_file	*file;
+	struct rdma_cm_id	*cm_id;
+	u64			uid;
+
+	struct list_head	list;
+	struct list_head	mc_list;
+};
+
+struct ucma_multicast {
+	struct ucma_context	*ctx;
+	int			id;
+	int			events_reported;
+
+	u64			uid;
+	struct list_head	list;
+	struct sockaddr_storage	addr;
+};
+
+struct ucma_event {
+	struct ucma_context	*ctx;
+	struct ucma_multicast	*mc;
+	struct list_head	list;
+	struct rdma_cm_id	*cm_id;
+	struct rdma_ucm_event_resp resp;
+};
+
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+static inline struct ucma_context *_ucma_find_context(int id,
+						      struct ucma_file *file)
+{
+	struct ucma_context *ctx;
+
+	ctx = idr_find(&ctx_idr, id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	return ctx;
+}
+
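+/*
+ * Context lookup pins the context with a reference count; the final
+ * ucma_put_ctx() fires ctx->comp, which ucma_destroy_id() waits on, so a
+ * context is never freed while another command still holds it.
+ */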
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+	struct ucma_context *ctx;
+
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(id, file);
+	if (!IS_ERR(ctx))
+		atomic_inc(&ctx->ref);
+	mutex_unlock(&mut);
+	return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->ref))
+		complete(&ctx->comp);
+}
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+	struct ucma_context *ctx;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	INIT_LIST_HEAD(&ctx->mc_list);
+	ctx->file = file;
+
+	do {
+		ret = idr_pre_get(&ctx_idr, GFP_KERNEL);
+		if (!ret)
+			goto error;
+
+		mutex_lock(&mut);
+		ret = idr_get_new(&ctx_idr, ctx, &ctx->id);
+		mutex_unlock(&mut);
+	} while (ret == -EAGAIN);
+
+	if (ret)
+		goto error;
+
+	list_add_tail(&ctx->list, &file->ctx_list);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+static struct ucma_multicast *ucma_alloc_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc;
+	int ret;
+
+	mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+	if (!mc)
+		return NULL;
+
+	do {
+		ret = idr_pre_get(&multicast_idr, GFP_KERNEL);
+		if (!ret)
+			goto error;
+
+		mutex_lock(&mut);
+		ret = idr_get_new(&multicast_idr, mc, &mc->id);
+		mutex_unlock(&mut);
+	} while (ret == -EAGAIN);
+
+	if (ret)
+		goto error;
+
+	mc->ctx = ctx;
+	list_add_tail(&mc->list, &ctx->mc_list);
+	return mc;
+
+error:
+	kfree(mc);
+	return NULL;
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+				 struct rdma_conn_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources = src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+			       struct rdma_ud_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	dst->qp_num = src->qp_num;
+	dst->qkey = src->qkey;
+}
+
+static void ucma_set_event_context(struct ucma_context *ctx,
+				   struct rdma_cm_event *event,
+				   struct ucma_event *uevent)
+{
+	uevent->ctx = ctx;
+	switch (event->event) {
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+		uevent->mc = (struct ucma_multicast *)
+			     event->param.ud.private_data;
+		uevent->resp.uid = uevent->mc->uid;
+		uevent->resp.id = uevent->mc->id;
+		break;
+	default:
+		uevent->resp.uid = ctx->uid;
+		uevent->resp.id = ctx->id;
+		break;
+	}
+}
+
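+/*
+ * rdma_cm event callback: the event is copied into a ucma_event and queued
+ * on the owning file.  A connection request consumes a slot from the
+ * listener's backlog (replenished when userspace retrieves the event), and
+ * events on contexts userspace has not yet claimed (uid == 0) are dropped,
+ * since the eventual accept will simply fail.
+ */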
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	struct ucma_event *uevent;
+	struct ucma_context *ctx = cm_id->context;
+	int ret = 0;
+
+	uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+	if (!uevent)
+		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+	uevent->cm_id = cm_id;
+	ucma_set_event_context(ctx, event, uevent);
+	uevent->resp.event = event->event;
+	uevent->resp.status = event->status;
+	if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB)
+		ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+	else
+		ucma_copy_conn_event(&uevent->resp.param.conn,
+				     &event->param.conn);
+
+	mutex_lock(&ctx->file->mut);
+	if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		if (!ctx->backlog) {
+			ret = -ENOMEM;
+			kfree(uevent);
+			goto out;
+		}
+		ctx->backlog--;
+	} else if (!ctx->uid) {
+		/*
+		 * We ignore events for new connections until userspace has set
+		 * its context.  This can only happen if an error occurs on a
+		 * new connection before the user accepts it.  This is okay,
+		 * since the accept will just fail later.
+		 */
+		kfree(uevent);
+		goto out;
+	}
+
+	list_add_tail(&uevent->list, &ctx->file->event_list);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	if (ctx->file->filp)
+		selwakeup(&ctx->file->filp->f_selinfo);
+out:
+	mutex_unlock(&ctx->file->mut);
+	return ret;
+}
+
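+/*
+ * Blocking read of the next queued event.  O_NONBLOCK returns -EAGAIN when
+ * the queue is empty; otherwise the caller sleeps on poll_wait.  A
+ * connection request additionally allocates a new context, so userspace
+ * gets a fresh id for the incoming connection.
+ */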
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ucma_context *ctx;
+	struct rdma_ucm_get_event cmd;
+	struct ucma_event *uevent;
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	if (out_len < sizeof uevent->resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->mut);
+	while (list_empty(&file->event_list)) {
+		mutex_unlock(&file->mut);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mut);
+	}
+
+	uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+	if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		ctx = ucma_alloc_ctx(file);
+		if (!ctx) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		uevent->ctx->backlog++;
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof uevent->resp)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	list_del(&uevent->list);
+	uevent->ctx->events_reported++;
+	if (uevent->mc)
+		uevent->mc->events_reported++;
+	kfree(uevent);
+done:
+	mutex_unlock(&file->mut);
+	return ret;
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_create_id cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->mut);
+	ctx = ucma_alloc_ctx(file);
+	mutex_unlock(&file->mut);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
+	if (IS_ERR(ctx->cm_id)) {
+		ret = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	rdma_destroy_id(ctx->cm_id);
+err1:
+	mutex_lock(&mut);
+	idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+	kfree(ctx);
+	return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc, *tmp;
+
+	mutex_lock(&mut);
+	list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+		list_del(&mc->list);
+		idr_remove(&multicast_idr, mc->id);
+		kfree(mc);
+	}
+	mutex_unlock(&mut);
+}
+
+static void ucma_cleanup_events(struct ucma_context *ctx)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+		if (uevent->ctx != ctx)
+			continue;
+
+		list_del(&uevent->list);
+
+		/* clear incoming connections. */
+		if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+			rdma_destroy_id(uevent->cm_id);
+
+		kfree(uevent);
+	}
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+		if (uevent->mc != mc)
+			continue;
+
+		list_del(&uevent->list);
+		kfree(uevent);
+	}
+}
+
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+	int events_reported;
+
+	/* No new events will be generated after destroying the id. */
+	rdma_destroy_id(ctx->cm_id);
+
+	ucma_cleanup_multicast(ctx);
+
+	/* Cleanup events not yet reported to the user. */
+	mutex_lock(&ctx->file->mut);
+	ucma_cleanup_events(ctx);
+	list_del(&ctx->list);
+	mutex_unlock(&ctx->file->mut);
+
+	events_reported = ctx->events_reported;
+	kfree(ctx);
+	return events_reported;
+}
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_context *ctx;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(cmd.id, file);
+	if (!IS_ERR(ctx))
+		idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ucma_put_ctx(ctx);
+	wait_for_completion(&ctx->comp);
+	resp.events_reported = ucma_free_ctx(ctx);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_bind_addr cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_addr cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+				(struct sockaddr *) &cmd.dst_addr,
+				cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_route cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
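+/*
+ * Flatten the resolved route into the response.  num_paths == 0 means only
+ * the address has been resolved, so the GIDs and pkey come from the device
+ * address; otherwise the two-path case copies the alternate record and
+ * falls through to copy the primary.
+ */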
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		dev_addr = &route->addr.dev_addr;
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].dgid);
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+				 struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct net_device *dev;
+	u16 vid = 0;
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		dev_addr = &route->addr.dev_addr;
+		dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		if (dev) {
+			vid = rdma_vlan_dev_vlan_id(dev);
+			dev_put(dev);
+		}
+
+		iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
+				    dev_addr->dst_dev_addr, vid);
+		iboe_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_query_route cmd;
+	struct rdma_ucm_query_route_resp resp;
+	struct ucma_context *ctx;
+	struct sockaddr *addr;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	memset(&resp, 0, sizeof resp);
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+	memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	if (!ctx->cm_id->device)
+		goto out;
+
+	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+	resp.port_num = ctx->cm_id->port_num;
+	if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) {
+		switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+			break;
+		default:
+			break;
+		}
+	}
+
+out:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_conn_param *dst,
+				 struct rdma_ucm_conn_param *src)
+{
+	dst->private_data = src->private_data;
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources = src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct rdma_ucm_connect cmd;
+	struct rdma_conn_param conn_param;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	if (!cmd.conn_param.valid)
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+	ret = rdma_connect(ctx->cm_id, &conn_param);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_listen cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ?
+		       cmd.backlog : UCMA_MAX_BACKLOG;
+	ret = rdma_listen(ctx->cm_id, ctx->backlog);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_accept cmd;
+	struct rdma_conn_param conn_param;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (cmd.conn_param.valid) {
+		ctx->uid = cmd.uid;
+		ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+		ret = rdma_accept(ctx->cm_id, &conn_param);
+	} else
+		ret = rdma_accept(ctx->cm_id, NULL);
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_reject cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_disconnect cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_disconnect(ctx->cm_id);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_init_qp_attr cmd;
+	struct ib_uverbs_qp_attr resp;
+	struct ucma_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.qp_attr_mask = 0;
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret = 0;
+
+	switch (optname) {
+	case RDMA_OPTION_ID_TOS:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+	}
+
+	if (!optlen)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+				 int optname, void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (level) {
+	case RDMA_OPTION_ID:
+		ret = ucma_set_option_id(ctx, optname, optval, optlen);
+		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_set_option cmd;
+	struct ucma_context *ctx;
+	void *optval;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	optval = kmalloc(cmd.optlen, GFP_KERNEL);
+	if (!optval) {
+		ret = -ENOMEM;
+		goto out1;
+	}
+
+	if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+			   cmd.optlen)) {
+		ret = -EFAULT;
+		goto out2;
+	}
+
+	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+				    cmd.optlen);
+out2:
+	kfree(optval);
+out1:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_notify cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct rdma_ucm_join_mcast cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	struct ucma_multicast *mc;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	mutex_lock(&file->mut);
+	mc = ucma_alloc_multicast(ctx);
+	if (!mc) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	mc->uid = cmd.uid;
+	memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr);
+	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+	if (ret)
+		goto err2;
+
+	resp.id = mc->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err3;
+	}
+
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return 0;
+
+err3:
+	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+	ucma_cleanup_mc_events(mc);
+err2:
+	mutex_lock(&mut);
+	idr_remove(&multicast_idr, mc->id);
+	mutex_unlock(&mut);
+	list_del(&mc->list);
+	kfree(mc);
+err1:
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_multicast *mc;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&mut);
+	mc = idr_find(&multicast_idr, cmd.id);
+	if (!mc)
+		mc = ERR_PTR(-ENOENT);
+	else if (mc->ctx->file != file)
+		mc = ERR_PTR(-EINVAL);
+	else {
+		idr_remove(&multicast_idr, mc->id);
+		atomic_inc(&mc->ctx->ref);
+	}
+	mutex_unlock(&mut);
+
+	if (IS_ERR(mc)) {
+		ret = PTR_ERR(mc);
+		goto out;
+	}
+
+	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+	mutex_lock(&mc->ctx->file->mut);
+	ucma_cleanup_mc_events(mc);
+	list_del(&mc->list);
+	mutex_unlock(&mc->ctx->file->mut);
+
+	ucma_put_ctx(mc->ctx);
+	resp.events_reported = mc->events_reported;
+	kfree(mc);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire mutexes based on pointer comparison to prevent deadlock. */
+	if (file1 < file2) {
+		mutex_lock(&file1->mut);
+		mutex_lock(&file2->mut);
+	} else {
+		mutex_lock(&file2->mut);
+		mutex_lock(&file1->mut);
+	}
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	if (file1 < file2) {
+		mutex_unlock(&file2->mut);
+		mutex_unlock(&file1->mut);
+	} else {
+		mutex_unlock(&file1->mut);
+		mutex_unlock(&file2->mut);
+	}
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &file->event_list);
+}
+
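+/*
+ * Move an id (and any events already queued for it) from the fd it was
+ * created on to the calling fd.  Both files are locked in pointer order to
+ * avoid deadlock, and the global mutex serializes the ctx->file update.
+ */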
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct file *filp;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	filp = fget(cmd.fd);
+	if (!filp)
+		return -ENOENT;
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(filp->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fd's, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fput(filp);
+	return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len) = {
+	[RDMA_USER_CM_CMD_CREATE_ID]	= ucma_create_id,
+	[RDMA_USER_CM_CMD_DESTROY_ID]	= ucma_destroy_id,
+	[RDMA_USER_CM_CMD_BIND_ADDR]	= ucma_bind_addr,
+	[RDMA_USER_CM_CMD_RESOLVE_ADDR]	= ucma_resolve_addr,
+	[RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route,
+	[RDMA_USER_CM_CMD_QUERY_ROUTE]	= ucma_query_route,
+	[RDMA_USER_CM_CMD_CONNECT]	= ucma_connect,
+	[RDMA_USER_CM_CMD_LISTEN]	= ucma_listen,
+	[RDMA_USER_CM_CMD_ACCEPT]	= ucma_accept,
+	[RDMA_USER_CM_CMD_REJECT]	= ucma_reject,
+	[RDMA_USER_CM_CMD_DISCONNECT]	= ucma_disconnect,
+	[RDMA_USER_CM_CMD_INIT_QP_ATTR]	= ucma_init_qp_attr,
+	[RDMA_USER_CM_CMD_GET_EVENT]	= ucma_get_event,
+	[RDMA_USER_CM_CMD_GET_OPTION]	= NULL,
+	[RDMA_USER_CM_CMD_SET_OPTION]	= ucma_set_option,
+	[RDMA_USER_CM_CMD_NOTIFY]	= ucma_notify,
+	[RDMA_USER_CM_CMD_JOIN_MCAST]	= ucma_join_multicast,
+	[RDMA_USER_CM_CMD_LEAVE_MCAST]	= ucma_leave_multicast,
+	[RDMA_USER_CM_CMD_MIGRATE_ID]	= ucma_migrate_id
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+			  size_t len, loff_t *pos)
+{
+	struct ucma_file *file = filp->private_data;
+	struct rdma_ucm_cmd_hdr hdr;
+	ssize_t ret;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	if (!ucma_cmd_table[hdr.cmd])
+		return -ENOSYS;
+
+	ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+	if (!ret)
+		ret = len;
+
+	return ret;
+}
+
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ucma_file *file = filp->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (!list_empty(&file->event_list))
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file;
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&file->event_list);
+	INIT_LIST_HEAD(&file->ctx_list);
+	init_waitqueue_head(&file->poll_wait);
+	mutex_init(&file->mut);
+
+	filp->private_data = file;
+	file->filp = filp;
+	return 0;
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file = filp->private_data;
+	struct ucma_context *ctx, *tmp;
+
+	mutex_lock(&file->mut);
+	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+		mutex_unlock(&file->mut);
+
+		mutex_lock(&mut);
+		idr_remove(&ctx_idr, ctx->id);
+		mutex_unlock(&mut);
+
+		ucma_free_ctx(ctx);
+		mutex_lock(&file->mut);
+	}
+	mutex_unlock(&file->mut);
+	kfree(file);
+	return 0;
+}
+
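+/*
+ * FreeBSD routes ioctls on the character device here; FIONBIO and FIOASYNC
+ * are accepted as no-ops so toggling non-blocking mode succeeds, and
+ * anything else is rejected with ENOTTY.
+ */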
+static long
+ucma_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+
+	switch (cmd) {
+	case FIONBIO:
+	case FIOASYNC:
+		return (0);
+	default:
+		return (-ENOTTY);
+	}
+}
+
+static const struct file_operations ucma_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ucma_open,
+	.release = ucma_close,
+	.write	 = ucma_write,
+	.unlocked_ioctl = ucma_ioctl,
+	.poll    = ucma_poll,
+};
+
+static struct miscdevice ucma_misc = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "rdma_cm",
+	.fops	= &ucma_fops,
+};
+
+static ssize_t show_abi_version(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ucma_init(void)
+{
+	int ret;
+
+	ret = misc_register(&ucma_misc);
+	if (ret)
+		return ret;
+
+	ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+		goto err;
+	}
+	return 0;
+err:
+	misc_deregister(&ucma_misc);
+	return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+	misc_deregister(&ucma_misc);
+	idr_destroy(&ctx_idr);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/ucma.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/ud_header.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/ud_header.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/ud_header.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
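+/*
+ * Each header type is described by a table of ib_field entries mapping a
+ * struct member to its bit offset and width on the wire; ib_pack() and
+ * ib_unpack() walk these tables to convert between host structs and the
+ * big-endian wire format.  STRUCT_FIELD() fills in the struct offset, size
+ * and a "header:field" debug name for one entry.
+ */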
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+	.field_name          = #header ":" #field
+
+static const struct ib_field lrh_table[]  = {
+	{ STRUCT_FIELD(lrh, virtual_lane),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, link_version),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, service_level),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 4 },
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, link_next_header),
+	  .offset_words = 0,
+	  .offset_bits  = 14,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, destination_lid),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 5 },
+	{ STRUCT_FIELD(lrh, packet_length),
+	  .offset_words = 1,
+	  .offset_bits  = 5,
+	  .size_bits    = 11 },
+	{ STRUCT_FIELD(lrh, source_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field eth_table[]  = {
+	{ STRUCT_FIELD(eth, dmac_h),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, dmac_l),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_h),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_l),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, type),
+	  .offset_words = 3,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field vlan_table[]  = {
+	{ STRUCT_FIELD(vlan, tag),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(vlan, type),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field grh_table[]  = {
+	{ STRUCT_FIELD(grh, ip_version),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(grh, traffic_class),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, flow_label),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 20 },
+	{ STRUCT_FIELD(grh, payload_length),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(grh, next_header),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, hop_limit),
+	  .offset_words = 1,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, source_gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ STRUCT_FIELD(grh, destination_gid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 }
+};
+
+static const struct ib_field bth_table[]  = {
+	{ STRUCT_FIELD(bth, opcode),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, solicited_event),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, mig_req),
+	  .offset_words = 0,
+	  .offset_bits  = 9,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, pad_count),
+	  .offset_words = 0,
+	  .offset_bits  = 10,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(bth, transport_header_version),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(bth, pkey),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, destination_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ STRUCT_FIELD(bth, ack_req),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ STRUCT_FIELD(bth, psn),
+	  .offset_words = 2,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+	{ STRUCT_FIELD(deth, qkey),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(deth, source_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes: Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: specify if the packet is VLAN tagged
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
+ * @header: Structure to initialize
+ */
+void ib_ud_header_init(int		    payload_bytes,
+		       int		    lrh_present,
+		       int		    eth_present,
+		       int		    vlan_present,
+		       int		    grh_present,
+		       int		    immediate_present,
+		       struct ib_ud_header *header)
+{
+	u16 packet_length = 0;
+
+	memset(header, 0, sizeof *header);
+
+	if (lrh_present) {
+		header->lrh.link_version     = 0;
+		header->lrh.link_next_header =
+			grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+		packet_length = IB_LRH_BYTES;
+	}
+
+	if (eth_present) {
+		if (vlan_present) {
+			header->eth.type = cpu_to_be16(ETH_P_8021Q);
+			packet_length += IB_VLAN_BYTES;
+		}
+		packet_length += IB_ETH_BYTES;
+	}
+
+	packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes +
+		4       + /* ICRC     */
+		3;        /* round up */
+	packet_length /= 4;
+	if (grh_present) {
+		packet_length += IB_GRH_BYTES / 4;
+		header->grh.ip_version = 6;
+		header->grh.payload_length =
+			cpu_to_be16((IB_BTH_BYTES  +
+				     IB_DETH_BYTES +
+				     payload_bytes +
+				     4             + /* ICRC     */
+				     3) & ~3);       /* round up */
+		header->grh.next_header     = 0x1b;
+	}
+
+	if (lrh_present)
+		header->lrh.packet_length = cpu_to_be16(packet_length);
+
+	if (immediate_present)
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+	else
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY;
+	header->bth.pad_count                = (4 - payload_bytes) & 3;
+	header->bth.transport_header_version = 0;
+
+	header->lrh_present = lrh_present;
+	header->eth_present = eth_present;
+	header->vlan_present = vlan_present;
+	header->grh_present = grh_present;
+	header->immediate_present = immediate_present;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
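+
+/*
+ * Worked example (illustrative): a local UD send with no GRH and a 0-byte
+ * payload gives packet_length = (IB_LRH_BYTES + IB_BTH_BYTES + IB_DETH_BYTES
+ * + 0 + 4 + 3) / 4 = (8 + 12 + 8 + 7) / 4 = 8 four-byte words, i.e. the
+ * 32-byte LRH+BTH+DETH+ICRC frame, with pad_count = (4 - 0) & 3 = 0.
+ */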
+
+/**
+ * ib_lrh_header_pack - Pack LRH header struct into wire format
+ * @lrh: unpacked LRH header struct
+ * @buf: Buffer to pack into
+ *
+ * ib_lrh_header_pack() packs the LRH header structure @lrh into
+ * wire format in the buffer @buf.
+ */
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf)
+{
+	ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_pack);
+
+/**
+ * ib_lrh_header_unpack - Unpack LRH structure from wire format
+ * @buf: Buffer to unpack from
+ * @lrh: LRH header struct to fill in
+ *
+ * ib_lrh_header_unpack() unpacks the LRH header structure from
+ * wire format (in buf) into @lrh.
+ */
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_unpack);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header: UD header struct
+ * @buf: Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf)
+{
+	int len = 0;
+
+	if (header->lrh_present) {
+		ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+			&header->lrh, buf + len);
+		len += IB_LRH_BYTES;
+	}
+	if (header->eth_present) {
+		ib_pack(eth_table, ARRAY_SIZE(eth_table),
+			&header->eth, buf + len);
+		len += IB_ETH_BYTES;
+	}
+
+	if (header->vlan_present) {
+		ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+			&header->vlan, buf + len);
+		len += IB_VLAN_BYTES;
+	}
+
+	if (header->grh_present) {
+		ib_pack(grh_table, ARRAY_SIZE(grh_table),
+			&header->grh, buf + len);
+		len += IB_GRH_BYTES;
+	}
+
+	ib_pack(bth_table, ARRAY_SIZE(bth_table),
+		&header->bth, buf + len);
+	len += IB_BTH_BYTES;
+
+	ib_pack(deth_table, ARRAY_SIZE(deth_table),
+		&header->deth, buf + len);
+	len += IB_DETH_BYTES;
+
+	if (header->immediate_present) {
+		memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+		len += sizeof header->immediate_data;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @buf: Buffer to unpack from
+ * @header: UD header struct to fill in
+ *
+ * ib_ud_header_unpack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+		  buf, &header->lrh);
+	buf += IB_LRH_BYTES;
+
+	if (header->lrh.link_version != 0) {
+		printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+		       header->lrh.link_version);
+		return -EINVAL;
+	}
+
+	switch (header->lrh.link_next_header) {
+	case IB_LNH_IBA_LOCAL:
+		header->grh_present = 0;
+		break;
+
+	case IB_LNH_IBA_GLOBAL:
+		header->grh_present = 1;
+		ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+			  buf, &header->grh);
+		buf += IB_GRH_BYTES;
+
+		if (header->grh.ip_version != 6) {
+			printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+			       header->grh.ip_version);
+			return -EINVAL;
+		}
+		if (header->grh.next_header != 0x1b) {
+			printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+			       header->grh.next_header);
+			return -EINVAL;
+		}
+		break;
+
+	default:
+		printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+		       header->lrh.link_next_header);
+		return -EINVAL;
+	}
+
+	ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+		  buf, &header->bth);
+	buf += IB_BTH_BYTES;
+
+	switch (header->bth.opcode) {
+	case IB_OPCODE_UD_SEND_ONLY:
+		header->immediate_present = 0;
+		break;
+	case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+		header->immediate_present = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+		       header->bth.opcode);
+		return -EINVAL;
+	}
+
+	if (header->bth.transport_header_version != 0) {
+		printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+		       header->bth.transport_header_version);
+		return -EINVAL;
+	}
+
+	ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+		  buf, &header->deth);
+	buf += IB_DETH_BYTES;
+
+	if (header->immediate_present)
+		memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/ud_header.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/umem.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/umem.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/umem.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#ifdef __linux__
+#include <linux/hugetlb.h>
+#endif
+#include <linux/dma-attrs.h>
+
+#include <sys/priv.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+
+#include <vm/vm.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pageout.h>
+
+#include "uverbs.h"
+
+static bool allow_weak_ordering;
+module_param(allow_weak_ordering, bool, 0444);
+MODULE_PARM_DESC(allow_weak_ordering,  "Allow weak ordering for data registered memory");
+
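+/*
+ * Largest number of page-list entries an ib_umem_chunk can hold while
+ * the chunk itself still fits within a single page.
+ */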
+#define IB_UMEM_MAX_PAGE_CHUNK						\
+	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
+	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
+	  (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
+
+#ifdef __ia64__
+extern int dma_map_sg_hp_wa;
+
+static int dma_map_sg_ia64(struct ib_device *ibdev,
+			   struct scatterlist *sg,
+			   int nents,
+			   enum dma_data_direction dir)
+{
+	int i, rc, j, lents = 0;
+	struct device *dev;
+
+	if (!dma_map_sg_hp_wa)
+		return ib_dma_map_sg(ibdev, sg, nents, dir);
+
+	dev = ibdev->dma_device;
+	for (i = 0; i < nents; ++i) {
+		rc = dma_map_sg(dev, sg + i, 1, dir);
+		if (rc <= 0) {
+			for (j = 0; j < i; ++j)
+				dma_unmap_sg(dev, sg + j, 1, dir);
+
+			return 0;
+		}
+		lents += rc;
+	}
+
+	return lents;
+}
+
+static void dma_unmap_sg_ia64(struct ib_device *ibdev,
+			      struct scatterlist *sg,
+			      int nents,
+			      enum dma_data_direction dir)
+{
+	int i;
+	struct device *dev;
+
+	if (!dma_map_sg_hp_wa)
+		return ib_dma_unmap_sg(ibdev, sg, nents, dir);
+
+	dev = ibdev->dma_device;
+	for (i = 0; i < nents; ++i)
+		dma_unmap_sg(dev, sg + i, 1, dir);
+}
+
+#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
+#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)
+
+#endif
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+#ifdef __linux__
+	struct ib_umem_chunk *chunk, *tmp;
+	int i;
+
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+		for (i = 0; i < chunk->nents; ++i) {
+			struct page *page = sg_page(&chunk->page_list[i]);
+			if (umem->writable && dirty)
+				set_page_dirty_lock(page);
+			put_page(page);
+		}
+		kfree(chunk);
+	}
+#else
+	struct ib_umem_chunk *chunk, *tmp;
+	vm_object_t object;
+	int i;
+
+	object = NULL;
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+		for (i = 0; i < chunk->nents; ++i) {
+			struct page *page = sg_page(&chunk->page_list[i]);
+			if (umem->writable && dirty) {
+				if (object && object != page->object)
+					VM_OBJECT_UNLOCK(object);
+				if (object != page->object) {
+					object = page->object;
+					VM_OBJECT_LOCK(object);
+				}
+				vm_page_dirty(page);
+			}
+		}
+		kfree(chunk);
+	}
+	if (object)
+		VM_OBJECT_UNLOCK(object);
+
+#endif
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync)
+{
+#ifdef __linux__
+	struct ib_umem *umem;
+	struct page **page_list;
+	struct vm_area_struct **vma_list;
+	struct ib_umem_chunk *chunk;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int off;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+	else if (allow_weak_ordering)
+		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = kmalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+	/* We assume the memory is from hugetlb until proved otherwise */
+	umem->hugetlb   = 1;
+
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * if we can't alloc the vma_list, it's not so bad;
+	 * just assume the memory is not hugetlb memory
+	 */
+	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+	if (!vma_list)
+		umem->hugetlb = 0;
+
+	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked     = npages + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	ret = 0;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, !umem->writable, page_list, vma_list);
+
+		if (ret < 0)
+			goto out;
+
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			chunk->attrs = attrs;
+			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+			sg_init_table(chunk->page_list, chunk->nents);
+			for (i = 0; i < chunk->nents; ++i) {
+				if (vma_list &&
+				    !is_vm_hugetlb_page(vma_list[i + off]))
+					umem->hugetlb = 0;
+				sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
+			}
+
+			chunk->nmap = ib_dma_map_sg_attrs(context->device,
+							  &chunk->page_list[0],
+							  chunk->nents,
+							  DMA_BIDIRECTIONAL,
+							  &attrs);
+			if (chunk->nmap <= 0) {
+				for (i = 0; i < chunk->nents; ++i)
+					put_page(sg_page(&chunk->page_list[i]));
+				kfree(chunk);
+
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, &umem->chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		kfree(umem);
+	} else
+		current->mm->locked_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	if (vma_list)
+		free_page((unsigned long) vma_list);
+	free_page((unsigned long) page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+#else
+	struct ib_umem *umem;
+	struct ib_umem_chunk *chunk;
+	struct proc *proc;
+	pmap_t pmap;
+	vm_offset_t end, last, start;
+	vm_size_t npages;
+	int error;
+	int ents;
+	int ret;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	error = priv_check(curthread, PRIV_VM_MLOCK);
+	if (error)
+		return ERR_PTR(-error);
+
+	last = addr + size;
+	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+	if (last < addr || end < addr)
+		return ERR_PTR(-EINVAL);
+	npages = atop(end - start);
+	if (npages > vm_page_max_wired)
+		return ERR_PTR(-ENOMEM);
+	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 * Set this before vm_map_wire() consults it below.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+	proc = curthread->td_proc;
+	PROC_LOCK(proc);
+	if (ptoa(npages +
+	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
+	    lim_cur(proc, RLIMIT_MEMLOCK)) {
+		PROC_UNLOCK(proc);
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+	PROC_UNLOCK(proc);
+	if (npages + cnt.v_wire_count > vm_page_max_wired) {
+		kfree(umem);
+		return ERR_PTR(-EAGAIN);
+	}
+	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
+	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
+	    (umem->writable ? VM_MAP_WIRE_WRITE : 0));
+	if (error != KERN_SUCCESS) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	umem->start	= addr;
+	umem->hugetlb = 0;
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
+	ret = 0;
+	while (npages) {
+		ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
+		chunk = kmalloc(sizeof(*chunk) +
+				(sizeof(struct scatterlist) * ents),
+				GFP_KERNEL);
+		if (!chunk) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		chunk->attrs = attrs;
+		chunk->nents = ents;
+		sg_init_table(&chunk->page_list[0], ents);
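+		/*
+		 * The range was wired by vm_map_wire() above, so each
+		 * virtual page can be translated to its physical page
+		 * with pmap_extract().
+		 */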
+		for (i = 0; i < chunk->nents; ++i) {
+			vm_paddr_t pa;
+
+			pa = pmap_extract(pmap, start);
+			if (pa == 0) {
+				ret = -ENOMEM;
+				kfree(chunk);
+				goto out;
+			}
+			sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
+			    PAGE_SIZE, 0);
+			npages--;
+			start += PAGE_SIZE;
+		}
+
+		chunk->nmap = ib_dma_map_sg_attrs(context->device,
+						  &chunk->page_list[0],
+						  chunk->nents,
+						  DMA_BIDIRECTIONAL,
+						  &attrs);
+		if (chunk->nmap != chunk->nents) {
+			kfree(chunk);
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		list_add_tail(&chunk->list, &umem->chunk_list);
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		kfree(umem);
+	}
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+#endif
+}
+EXPORT_SYMBOL(ib_umem_get);
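+
+/*
+ * Typical caller pattern (sketch; the mr/pd names are illustrative): a
+ * verbs driver pins the user buffer while registering a memory region,
+ * then sizes its translation tables from the pinned pages:
+ *
+ *	mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ *			       access_flags, 0);
+ *	if (IS_ERR(mr->umem))
+ *		goto err;
+ *	n = ib_umem_page_count(mr->umem);
+ */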
+
+#ifdef __linux__
+static void ib_umem_account(struct work_struct *work)
+{
+	struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+	down_write(&umem->mm->mmap_sem);
+	umem->mm->locked_vm -= umem->diff;
+	up_write(&umem->mm->mmap_sem);
+	mmput(umem->mm);
+	kfree(umem);
+}
+#endif
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+#ifdef __linux__
+	struct ib_ucontext *context = umem->context;
+	struct mm_struct *mm;
+	unsigned long diff;
+
+	__ib_umem_release(umem->context->device, umem, 1);
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(umem);
+		return;
+	}
+
+	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+
+	/*
+	 * We may be called with the mm's mmap_sem already held.  This
+	 * can happen when a userspace munmap() is the call that drops
+	 * the last reference to our file and calls our release
+	 * method.  If there are memory regions to destroy, we'll end
+	 * up here and not be able to take the mmap_sem.  In that case
+	 * we defer the vm_locked accounting to the system workqueue.
+	 */
+	if (context->closing) {
+		if (!down_write_trylock(&mm->mmap_sem)) {
+			INIT_WORK(&umem->work, ib_umem_account);
+			umem->mm   = mm;
+			umem->diff = diff;
+
+			schedule_work(&umem->work);
+			return;
+		}
+	} else
+		down_write(&mm->mmap_sem);
+
+	current->mm->locked_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+#else
+	vm_offset_t addr, end, last, start;
+	vm_size_t size;
+	int error;
+
+	__ib_umem_release(umem->context->device, umem, 1);
+	if (umem->context->closing) {
+		kfree(umem);
+		return;
+	}
+	error = priv_check(curthread, PRIV_VM_MUNLOCK);
+	if (error)
+		return;
+	addr = umem->start;
+	size = umem->length;
+	last = addr + size;
+	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+	vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
+	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+
+#endif
+	kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+	struct ib_umem_chunk *chunk;
+	int shift;
+	int i;
+	int n;
+
+	shift = ilog2(umem->page_size);
+
+	n = 0;
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (i = 0; i < chunk->nmap; ++i)
+			n += sg_dma_len(&chunk->page_list[i]) >> shift;
+
+	return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
+
+/**********************************************/
+/*
+ * Stub functions for contiguous pages;
+ * we currently do not support this feature.
+ */
+/**********************************************/
+
+/**
+ * ib_cmem_release_contiguous_pages - release memory allocated by
+ * ib_cmem_alloc_contiguous_pages.
+ * @cmem: cmem struct to release
+ */
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
+{
+}
+EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
+
+/**
+ * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
+ * @context: userspace context to allocate memory for
+ * @total_size: total required size for that allocation.
+ * @page_size_order: order of one contiguous page.
+ */
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order)
+{
+	return NULL;
+}
+EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
+
+/**
+ * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
+ * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
+ * @vma: VMA to inject pages into.
+ */
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma)
+{
+	return 0;
+}
+EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/umem.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/user_mad.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/user_mad.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/user_mad.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1224 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/semaphore.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UMAD_MAX_PORTS  = 64,
+	IB_UMAD_MAX_AGENTS = 32,
+
+	IB_UMAD_MAJOR      = 231,
+	IB_UMAD_MINOR_BASE = 0
+};
+
+/*
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we look up the corresponding struct
+ * ib_umad_port by minor in the umad_port[] table while holding the
+ * port_lock.  If this lookup succeeds, we take a reference on the
+ * ib_umad_port's struct ib_umad_device while still holding the
+ * port_lock; if the lookup fails, we fail the open().  We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we clear all of its
+ * ib_umad_ports from umad_port[] while holding port_lock before
+ * dropping the module's reference to the ib_umad_device.  This is
+ * always safe because any open() calls will either succeed and obtain
+ * a reference before we clear the umad_port[] entries, or fail after
+ * we clear the umad_port[] entries.
+ */
+
+struct ib_umad_port {
+	struct cdev           *cdev;
+	struct device	      *dev;
+
+	struct cdev           *sm_cdev;
+	struct device	      *sm_dev;
+	struct semaphore       sm_sem;
+
+	struct mutex	       file_mutex;
+	struct list_head       file_list;
+
+	struct ib_device      *ib_dev;
+	struct ib_umad_device *umad_dev;
+	int                    dev_num;
+	u8                     port_num;
+};
+
+struct ib_umad_device {
+	int                  start_port, end_port;
+	struct kref          ref;
+	struct ib_umad_port  port[0];
+};
+
+struct ib_umad_file {
+	struct mutex		mutex;
+	struct ib_umad_port    *port;
+	struct file	       *filp;
+	struct list_head	recv_list;
+	struct list_head	send_list;
+	struct list_head	port_list;
+	spinlock_t		send_lock;
+	wait_queue_head_t	recv_wait;
+	struct ib_mad_agent    *agent[IB_UMAD_MAX_AGENTS];
+	int			agents_dead;
+	u8			use_pkey_index;
+	u8			already_used;
+};
+
+struct ib_umad_packet {
+	struct ib_mad_send_buf *msg;
+	struct ib_mad_recv_wc  *recv_wc;
+	struct list_head   list;
+	int		   length;
+	struct ib_user_mad mad;
+};
+
+static struct class *umad_class;
+
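+/* Minor numbers: one umad and one issm device per port. */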
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+
+static DEFINE_SPINLOCK(port_lock);
+static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+	struct ib_umad_device *dev =
+		container_of(ref, struct ib_umad_device, ref);
+
+	kfree(dev);
+}
+
+static int hdr_size(struct ib_umad_file *file)
+{
+	return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+		sizeof (struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+	return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+			struct ib_mad_agent *agent,
+			struct ib_umad_packet *packet)
+{
+	int ret = 1;
+
+	mutex_lock(&file->mutex);
+
+	for (packet->mad.hdr.id = 0;
+	     packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+	     packet->mad.hdr.id++)
+		if (agent == __get_agent(file, packet->mad.hdr.id)) {
+			list_add_tail(&packet->list, &file->recv_list);
+			selwakeup(&file->filp->f_selinfo);
+			wake_up_interruptible(&file->recv_wait);
+			ret = 0;
+			break;
+		}
+
+	mutex_unlock(&file->mutex);
+
+	return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+			 struct ib_umad_packet *packet)
+{
+	spin_lock_irq(&file->send_lock);
+	list_del(&packet->list);
+	spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *send_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+	dequeue_send(file, packet);
+	ib_destroy_ah(packet->msg->ah);
+	ib_free_send_mad(packet->msg);
+
+	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+		packet->length = IB_MGMT_MAD_HDR;
+		packet->mad.hdr.status = ETIMEDOUT;
+		if (!queue_packet(file, agent, packet))
+			return;
+	}
+	kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet;
+
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		goto err1;
+
+	packet = kzalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		goto err1;
+
+	packet->length = mad_recv_wc->mad_len;
+	packet->recv_wc = mad_recv_wc;
+
+	packet->mad.hdr.status	   = 0;
+	packet->mad.hdr.length	   = hdr_size(file) + mad_recv_wc->mad_len;
+	packet->mad.hdr.qpn	   = cpu_to_be32(mad_recv_wc->wc->src_qp);
+	packet->mad.hdr.lid	   = cpu_to_be16(mad_recv_wc->wc->slid);
+	packet->mad.hdr.sl	   = mad_recv_wc->wc->sl;
+	packet->mad.hdr.path_bits  = mad_recv_wc->wc->dlid_path_bits;
+	packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+	packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+	if (packet->mad.hdr.grh_present) {
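+		/*
+		 * Recover the GRH routing fields from the work completion
+		 * so userspace can address a reply.
+		 */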
+		struct ib_ah_attr ah_attr;
+
+		ib_init_ah_from_wc(agent->device, agent->port_num,
+				   mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+				   &ah_attr);
+
+		packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+		packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+		packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+		memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+		packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+	}
+
+	if (queue_packet(file, agent, packet))
+		goto err2;
+	return;
+
+err2:
+	kfree(packet);
+err1:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	struct ib_mad_recv_buf *recv_buf;
+	int left, seg_payload, offset, max_seg_payload;
+
+	/* We need enough room to copy the first (or only) MAD segment. */
+	recv_buf = &packet->recv_wc->recv_buf;
+	if ((packet->length <= sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + packet->length) ||
+	    (packet->length > sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + sizeof (*recv_buf->mad)))
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+	seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+	if (copy_to_user(buf, recv_buf->mad, seg_payload))
+		return -EFAULT;
+
+	if (seg_payload < packet->length) {
+		/*
+		 * Multipacket RMPP MAD message. Copy remainder of message.
+		 * Note that last segment may have a shorter payload.
+		 */
+		if (count < hdr_size(file) + packet->length) {
+			/*
+			 * The buffer is too small, return the first RMPP segment,
+			 * which includes the RMPP message length.
+			 */
+			return -ENOSPC;
+		}
+		offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+		max_seg_payload = sizeof (struct ib_mad) - offset;
+
+		for (left = packet->length - seg_payload, buf += seg_payload;
+		     left; left -= seg_payload, buf += seg_payload) {
+			recv_buf = container_of(recv_buf->list.next,
+						struct ib_mad_recv_buf, list);
+			seg_payload = min(left, max_seg_payload);
+			if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+					 seg_payload))
+				return -EFAULT;
+		}
+	}
+	return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	ssize_t size = hdr_size(file) + packet->length;
+
+	if (count < size)
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+
+	if (copy_to_user(buf, packet->mad.data, packet->length))
+		return -EFAULT;
+
+	return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	ssize_t ret;
+
+	if (count < hdr_size(file))
+		return -EINVAL;
+
+	mutex_lock(&file->mutex);
+
+	while (list_empty(&file->recv_list)) {
+		mutex_unlock(&file->mutex);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->recv_wait,
+					     !list_empty(&file->recv_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mutex);
+	}
+
+	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+	list_del(&packet->list);
+
+	mutex_unlock(&file->mutex);
+
+	if (packet->recv_wc)
+		ret = copy_recv_mad(file, buf, packet, count);
+	else
+		ret = copy_send_mad(file, buf, packet, count);
+
+	if (ret < 0) {
+		/* Requeue packet */
+		mutex_lock(&file->mutex);
+		list_add(&packet->list, &file->recv_list);
+		mutex_unlock(&file->mutex);
+	} else {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+	return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+	int left, seg;
+
+	/* Copy class specific header */
+	if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+	    copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+			   msg->hdr_len - IB_MGMT_RMPP_HDR))
+		return -EFAULT;
+
+	/* All headers are in place.  Copy data segments. */
+	for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+	     seg++, left -= msg->seg_size, buf += msg->seg_size) {
+		if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+				   min(left, msg->seg_size)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+			    struct ib_user_mad_hdr *hdr2)
+{
+	if (!hdr1->grh_present && !hdr2->grh_present)
+		return (hdr1->lid == hdr2->lid);
+
+	if (hdr1->grh_present && hdr2->grh_present)
+		return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+	return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+			struct ib_umad_packet *packet)
+{
+	struct ib_umad_packet *sent_packet;
+	struct ib_mad_hdr *sent_hdr, *hdr;
+
+	hdr = (struct ib_mad_hdr *) packet->mad.data;
+	list_for_each_entry(sent_packet, &file->send_list, list) {
+		sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+		if ((hdr->tid != sent_hdr->tid) ||
+		    (hdr->mgmt_class != sent_hdr->mgmt_class))
+			continue;
+
+		/*
+		 * No need to be overly clever here.  If two new operations have
+		 * the same TID, reject the second as a duplicate.  This is more
+		 * restrictive than required by the spec.
+		 */
+		if (!ib_response_mad((struct ib_mad *) hdr)) {
+			if (!ib_response_mad((struct ib_mad *) sent_hdr))
+				return 1;
+			continue;
+		} else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+			continue;
+
+		if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+			return 1;
+	}
+
+	return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	struct ib_mad_agent *agent;
+	struct ib_ah_attr ah_attr;
+	struct ib_ah *ah;
+	struct ib_rmpp_mad *rmpp_mad;
+	__be64 *tid;
+	int ret, data_len, hdr_len, copy_offset, rmpp_active;
+
+	if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+		return -EINVAL;
+
+	packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+	if (!packet)
+		return -ENOMEM;
+
+	if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	if (packet->mad.hdr.id < 0 ||
+	    packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	buf += hdr_size(file);
+
+	if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	mutex_lock(&file->mutex);
+
+	agent = __get_agent(file, packet->mad.hdr.id);
+	if (!agent) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid          = be16_to_cpu(packet->mad.hdr.lid);
+	ah_attr.sl            = packet->mad.hdr.sl;
+	ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+	ah_attr.port_num      = file->port->port_num;
+	if (packet->mad.hdr.grh_present) {
+		ah_attr.ah_flags = IB_AH_GRH;
+		memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+		ah_attr.grh.sgid_index	   = packet->mad.hdr.gid_index;
+		ah_attr.grh.flow_label 	   = be32_to_cpu(packet->mad.hdr.flow_label);
+		ah_attr.grh.hop_limit  	   = packet->mad.hdr.hop_limit;
+		ah_attr.grh.traffic_class  = packet->mad.hdr.traffic_class;
+	}
+
+	ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto err_up;
+	}
+
+	rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+	hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+	if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
+		copy_offset = IB_MGMT_MAD_HDR;
+		rmpp_active = 0;
+	} else {
+		copy_offset = IB_MGMT_RMPP_HDR;
+		rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+			      IB_MGMT_RMPP_FLAG_ACTIVE;
+	}
+
+	data_len = count - hdr_size(file) - hdr_len;
+	packet->msg = ib_create_send_mad(agent,
+					 be32_to_cpu(packet->mad.hdr.qpn),
+					 packet->mad.hdr.pkey_index, rmpp_active,
+					 hdr_len, data_len, GFP_KERNEL);
+	if (IS_ERR(packet->msg)) {
+		ret = PTR_ERR(packet->msg);
+		goto err_ah;
+	}
+
+	packet->msg->ah 	= ah;
+	packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+	packet->msg->retries 	= packet->mad.hdr.retries;
+	packet->msg->context[0] = packet;
+
+	/* Copy MAD header.  Any RMPP header is already in place. */
+	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+	if (!rmpp_active) {
+		if (copy_from_user(packet->msg->mad + copy_offset,
+				   buf + copy_offset,
+				   hdr_len + data_len - copy_offset)) {
+			ret = -EFAULT;
+			goto err_msg;
+		}
+	} else {
+		ret = copy_rmpp_mad(packet->msg, buf);
+		if (ret)
+			goto err_msg;
+	}
+
+	/*
+	 * Set the high-order part of the transaction ID to make MADs from
+	 * different agents unique, and allow routing responses back to the
+	 * original requestor.
+	 */
+	if (!ib_response_mad(packet->msg->mad)) {
+		tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+		*tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+				   (be64_to_cpup(tid) & 0xffffffff));
+		rmpp_mad->mad_hdr.tid = *tid;
+	}
+
+	spin_lock_irq(&file->send_lock);
+	ret = is_duplicate(file, packet);
+	if (!ret)
+		list_add_tail(&packet->list, &file->send_list);
+	spin_unlock_irq(&file->send_lock);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_msg;
+	}
+
+	ret = ib_post_send_mad(packet->msg, NULL);
+	if (ret)
+		goto err_send;
+
+	mutex_unlock(&file->mutex);
+	return count;
+
+err_send:
+	dequeue_send(file, packet);
+err_msg:
+	ib_free_send_mad(packet->msg);
+err_ah:
+	ib_destroy_ah(ah);
+err_up:
+	mutex_unlock(&file->mutex);
+err:
+	kfree(packet);
+	return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	/* we will always be able to post a MAD send */
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &file->recv_wait, wait);
+
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+			     int compat_method_mask)
+{
+	struct ib_user_mad_reg_req ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent = NULL;
+	int agent_id;
+	int ret;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (!file->port->ib_dev) {
+		ret = -EPIPE;
+		goto out;
+	}
+
+	if (copy_from_user(&ureq, arg, sizeof ureq)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!__get_agent(file, agent_id))
+			goto found;
+
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	if (ureq.mgmt_class) {
+		req.mgmt_class         = ureq.mgmt_class;
+		req.mgmt_class_version = ureq.mgmt_class_version;
+		memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+		if (compat_method_mask) {
+			u32 *umm = (u32 *) ureq.method_mask;
+			int i;
+
+			for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+				req.method_mask[i] =
+					umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+		} else
+			memcpy(req.method_mask, ureq.method_mask,
+			       sizeof req.method_mask);
+	}
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      ureq.mgmt_class ? &req : NULL,
+				      ureq.rmpp_version,
+				      send_handler, recv_handler, file);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		agent = NULL;
+		goto out;
+	}
+
+	if (put_user(agent_id,
+		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!file->already_used) {
+		file->already_used = 1;
+		if (!file->use_pkey_index) {
+			printk(KERN_WARNING "user_mad: process %s did not enable "
+			       "P_Key index support.\n", curproc->p_comm);
+			printk(KERN_WARNING "user_mad:   Documentation/infiniband/user_mad.txt "
+			       "has info on the new ABI.\n");
+		}
+	}
+
+	file->agent[agent_id] = agent;
+	ret = 0;
+
+out:
+	mutex_unlock(&file->mutex);
+
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+	struct ib_mad_agent *agent = NULL;
+	u32 id;
+	int ret = 0;
+
+	if (get_user(id, arg))
+		return -EFAULT;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	agent = file->agent[id];
+	file->agent[id] = NULL;
+
+out:
+	mutex_unlock(&file->mutex);
+
+	if (agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+	int ret = 0;
+
+	mutex_lock(&file->mutex);
+	if (file->already_used)
+		ret = -EINVAL;
+	else
+		file->use_pkey_index = 1;
+	mutex_unlock(&file->mutex);
+
+	return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(filp->private_data);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(filp->private_data);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ *  - umad_port[] accesses are protected by port_lock, the
+ *    ib_umad_port structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - the ioctl method does not affect any global state outside of the
+ *    file structure being operated on;
+ *  - the port is added to umad_port[] as the last part of module
+ *    initialization so the open method will either immediately fail
+ *    with -ENXIO, or all required initialization will be done.
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_umad_file *file;
+	int ret = 0;
+
+	spin_lock(&port_lock);
+	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
+	if (port)
+		kref_get(&port->umad_dev->ref);
+	spin_unlock(&port_lock);
+
+	if (!port)
+		return -ENXIO;
+
+	mutex_lock(&port->file_mutex);
+
+	if (!port->ib_dev) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	file = kzalloc(sizeof *file, GFP_KERNEL);
+	if (!file) {
+		kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_init(&file->mutex);
+	spin_lock_init(&file->send_lock);
+	INIT_LIST_HEAD(&file->recv_list);
+	INIT_LIST_HEAD(&file->send_list);
+	init_waitqueue_head(&file->recv_wait);
+
+	file->port = port;
+	file->filp = filp;
+	filp->private_data = file;
+
+	list_add_tail(&file->port_list, &port->file_list);
+
+out:
+	mutex_unlock(&port->file_mutex);
+	return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_device *dev = file->port->umad_dev;
+	struct ib_umad_packet *packet, *tmp;
+	int already_dead;
+	int i;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	already_dead = file->agents_dead;
+	file->agents_dead = 1;
+
+	list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+
+	list_del(&file->port_list);
+
+	mutex_unlock(&file->mutex);
+
+	if (!already_dead)
+		for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+			if (file->agent[i])
+				ib_unregister_mad_agent(file->agent[i]);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	kfree(file);
+	kref_put(&dev->ref, ib_umad_release_dev);
+
+	return 0;
+}
+
+static const struct file_operations umad_fops = {
+	.owner 	 	= THIS_MODULE,
+	.read 	 	= ib_umad_read,
+	.write 	 	= ib_umad_write,
+	.poll 	 	= ib_umad_poll,
+	.unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl 	= ib_umad_compat_ioctl,
+#endif
+	.open 	 	= ib_umad_open,
+	.release 	= ib_umad_close
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_port_modify props = {
+		.set_port_cap_mask = IB_PORT_SM
+	};
+	int ret;
+
+	spin_lock(&port_lock);
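+	/* issm minors sit IB_UMAD_MAX_PORTS above the umad minors. */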
+	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
+	if (port)
+		kref_get(&port->umad_dev->ref);
+	spin_unlock(&port_lock);
+
+	if (!port)
+		return -ENXIO;
+
+	if (filp->f_flags & O_NONBLOCK) {
+		if (down_trylock(&port->sm_sem)) {
+			ret = -EAGAIN;
+			goto fail;
+		}
+	} else {
+		if (down_interruptible(&port->sm_sem)) {
+			ret = -ERESTARTSYS;
+			goto fail;
+		}
+	}
+
+	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	if (ret) {
+		up(&port->sm_sem);
+		goto fail;
+	}
+
+	filp->private_data = port;
+
+	return 0;
+
+fail:
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+	return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port = filp->private_data;
+	struct ib_port_modify props = {
+		.clr_port_cap_mask = IB_PORT_SM
+	};
+	int ret = 0;
+
+	mutex_lock(&port->file_mutex);
+	if (port->ib_dev)
+		ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	mutex_unlock(&port->file_mutex);
+
+	up(&port->sm_sem);
+
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+
+	return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ib_umad_sm_open,
+	.release = ib_umad_sm_close
+};
+
+static struct ib_client umad_client = {
+	.name   = "umad",
+	.add    = ib_umad_add_one,
+	.remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (!port)
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (!port)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+			     struct ib_umad_port *port)
+{
+	spin_lock(&port_lock);
+	port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+	if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+		spin_unlock(&port_lock);
+		return -1;
+	}
+	set_bit(port->dev_num, dev_map);
+	spin_unlock(&port_lock);
+
+	port->ib_dev   = device;
+	port->port_num = port_num;
+	init_MUTEX(&port->sm_sem);
+	mutex_init(&port->file_mutex);
+	INIT_LIST_HEAD(&port->file_list);
+
+	port->cdev = cdev_alloc();
+	if (!port->cdev)
+		return -1;
+	port->cdev->owner = THIS_MODULE;
+	port->cdev->ops   = &umad_fops;
+	kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
+	if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+		goto err_cdev;
+
+	port->dev = device_create(umad_class, device->dma_device,
+				  port->cdev->dev, port,
+				  "umad%d", port->dev_num);
+	if (IS_ERR(port->dev))
+		goto err_cdev;
+
+	if (device_create_file(port->dev, &dev_attr_ibdev))
+		goto err_dev;
+	if (device_create_file(port->dev, &dev_attr_port))
+		goto err_dev;
+
+	port->sm_cdev = cdev_alloc();
+	if (!port->sm_cdev)
+		goto err_dev;
+	port->sm_cdev->owner = THIS_MODULE;
+	port->sm_cdev->ops   = &umad_sm_fops;
+	kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
+	if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+		goto err_sm_cdev;
+
+	port->sm_dev = device_create(umad_class, device->dma_device,
+				     port->sm_cdev->dev, port,
+				     "issm%d", port->dev_num);
+	if (IS_ERR(port->sm_dev))
+		goto err_sm_cdev;
+
+	if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+		goto err_sm_dev;
+	if (device_create_file(port->sm_dev, &dev_attr_port))
+		goto err_sm_dev;
+
+	spin_lock(&port_lock);
+	umad_port[port->dev_num] = port;
+	spin_unlock(&port_lock);
+
+	return 0;
+
+err_sm_dev:
+	device_destroy(umad_class, port->sm_cdev->dev);
+
+err_sm_cdev:
+	cdev_del(port->sm_cdev);
+
+err_dev:
+	device_destroy(umad_class, port->cdev->dev);
+
+err_cdev:
+	cdev_del(port->cdev);
+	clear_bit(port->dev_num, dev_map);
+
+	return -1;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+	struct ib_umad_file *file;
+	int already_dead;
+	int id;
+
+	dev_set_drvdata(port->dev,    NULL);
+	dev_set_drvdata(port->sm_dev, NULL);
+
+	device_destroy(umad_class, port->cdev->dev);
+	device_destroy(umad_class, port->sm_cdev->dev);
+
+	cdev_del(port->cdev);
+	cdev_del(port->sm_cdev);
+
+	spin_lock(&port_lock);
+	umad_port[port->dev_num] = NULL;
+	spin_unlock(&port_lock);
+
+	mutex_lock(&port->file_mutex);
+
+	port->ib_dev = NULL;
+
+	list_for_each_entry(file, &port->file_list, port_list) {
+		mutex_lock(&file->mutex);
+		already_dead = file->agents_dead;
+		file->agents_dead = 1;
+		mutex_unlock(&file->mutex);
+
+		for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+			if (file->agent[id])
+				ib_unregister_mad_agent(file->agent[id]);
+	}
+
+	mutex_unlock(&port->file_mutex);
+
+	clear_bit(port->dev_num, dev_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
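+	/* Switches expose only management port 0; HCAs use ports 1..N. */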
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	umad_dev = kzalloc(sizeof *umad_dev +
+			   (e - s + 1) * sizeof (struct ib_umad_port),
+			   GFP_KERNEL);
+	if (!umad_dev)
+		return;
+
+	kref_init(&umad_dev->ref);
+
+	umad_dev->start_port = s;
+	umad_dev->end_port   = e;
+
+	for (i = s; i <= e; ++i) {
+		umad_dev->port[i - s].umad_dev = umad_dev;
+
+		if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+			if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
+				goto err;
+	}
+
+	ib_set_client_data(device, &umad_client, umad_dev);
+
+	return;
+
+err:
+	while (--i >= s)
+		if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+			ib_umad_kill_port(&umad_dev->port[i - s]);
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	int i;
+
+	if (!umad_dev)
+		return;
+
+	for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			ib_umad_kill_port(&umad_dev->port[i]);
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+				     "infiniband_mad");
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register device number\n");
+		goto out;
+	}
+
+	umad_class = class_create(THIS_MODULE, "infiniband_mad");
+	if (IS_ERR(umad_class)) {
+		ret = PTR_ERR(umad_class);
+		printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(umad_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = ib_register_client(&umad_client);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+		goto out_class;
+	}
+
+	return 0;
+
+out_class:
+	class_destroy(umad_class);
+
+out_chrdev:
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+	return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+	ib_unregister_client(&umad_client);
+	class_destroy(umad_class);
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/user_mad.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/uverbs.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/uverbs.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/uverbs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one().  Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed.  Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed.  For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed.  For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
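+
+/*
+ * Illustrative pairing for the rules above (a sketch only; the real
+ * call sites live in uverbs_main.c, and the release callback name
+ * below is descriptive, not normative):
+ *
+ *	ib_uverbs_open():         kref_get(&uverbs_dev->ref);
+ *	ib_uverbs_release_file(): kref_put(&uverbs_dev->ref, release_fn);
+ *
+ * Every kref_get() is balanced by exactly one kref_put(); the release
+ * callback frees the structure when the last reference is dropped.
+ */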
+
+struct ib_uverbs_device {
+	struct kref				ref;
+	struct completion			comp;
+	int					devnum;
+	struct cdev			       *cdev;
+	struct device			       *dev;
+	struct ib_device		       *ib_dev;
+	int					num_comp_vectors;
+};
+
+struct ib_uverbs_event_file {
+	struct kref				ref;
+	struct file			       *filp;
+	struct ib_uverbs_file		       *uverbs_file;
+	spinlock_t				lock;
+	wait_queue_head_t			poll_wait;
+	struct fasync_struct		       *async_queue;
+	struct list_head			event_list;
+	int					is_async;
+	int					is_closed;
+};
+
+struct ib_uverbs_file {
+	struct kref				ref;
+	struct mutex				mutex;
+	struct ib_uverbs_device		       *device;
+	struct ib_ucontext		       *ucontext;
+	struct ib_event_handler			event_handler;
+	struct ib_uverbs_event_file	       *async_file;
+};
+
+struct ib_uverbs_event {
+	union {
+		struct ib_uverbs_async_event_desc	async;
+		struct ib_uverbs_comp_event_desc	comp;
+	}					desc;
+	struct list_head			list;
+	struct list_head			obj_list;
+	u32				       *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+	struct list_head	list;
+	union ib_gid 		gid;
+	u16 			lid;
+};
+
+struct ib_uevent_object {
+	struct ib_uobject	uobject;
+	struct list_head	event_list;
+	u32			events_reported;
+};
+
+struct ib_uqp_object {
+	struct ib_uevent_object	uevent;
+	struct list_head 	mcast_list;
+};
+
+struct ib_ucq_object {
+	struct ib_uobject	uobject;
+	struct ib_uverbs_file  *uverbs_file;
+	struct list_head	comp_list;
+	struct list_head	async_list;
+	u32			comp_events_reported;
+	u32			async_events_reported;
+};
+
+struct ib_uxrcd_object {
+	struct ib_uobject	uobject;
+	struct list_head	xrc_reg_qp_list;
+};
+
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrc_domain_idr;
+
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async, int *fd);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			   struct ib_uverbs_event_file *ev_file,
+			   struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event);
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+					void *context_ptr);
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+			    struct ib_xrcd *xrcd);
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 struct ib_xrcd *xrcd, u32 qp_num);
+
+#define IB_UVERBS_DECLARE_CMD(name)					\
+	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 const char __user *buf, int in_len,	\
+				 int out_len)
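+
+/*
+ * For example, IB_UVERBS_DECLARE_CMD(create_cq) expands to:
+ *
+ *	ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ *				    const char __user *buf, int in_len,
+ *				    int out_len);
+ */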
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xrc_srq);
+IB_UVERBS_DECLARE_CMD(open_xrc_domain);
+IB_UVERBS_DECLARE_CMD(close_xrc_domain);
+IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp);
+
+
+#endif /* UVERBS_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/core/uverbs.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/uverbs_cmd.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/uverbs_cmd.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,3023 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/lockdep.h>
+
+#include <asm/uaccess.h>
+#include <asm/fcntl.h>
+
+#include "uverbs.h"
+
+static struct lock_class_key pd_lock_key;
+static struct lock_class_key mr_lock_key;
+static struct lock_class_key cq_lock_key;
+static struct lock_class_key qp_lock_key;
+static struct lock_class_key ah_lock_key;
+static struct lock_class_key srq_lock_key;
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
+	do {								\
+		(udata)->inbuf  = (void __user *) (ibuf);		\
+		(udata)->outbuf = (void __user *) (obuf);		\
+		(udata)->inlen  = (ilen);				\
+		(udata)->outlen = (olen);				\
+	} while (0)
+
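+/*
+ * Each command arrives as a fixed-size cmd struct optionally followed
+ * by a driver-specific payload; the response is written through the
+ * user-supplied cmd.response pointer.  A typical handler therefore
+ * carves up the buffers as:
+ *
+ *	INIT_UDATA(&udata, buf + sizeof cmd,
+ *		   (unsigned long) cmd.response + sizeof resp,
+ *		   in_len - sizeof cmd, out_len - sizeof resp);
+ */
+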
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ *   needs to be held during all idr operations.  When an object is
+ *   looked up, a reference must be taken on the object's kref before
+ *   dropping this lock.
+ *
+ * - Each object also has an rwsem.  This rwsem must be held for
+ *   reading while an operation that uses the object is performed.
+ *   For example, while registering an MR, the associated PD's
+ *   uobject.mutex must be held for reading.  The rwsem must be held
+ *   for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag.  If this flag is not
+ *   set, then lookups of the object will fail even if it is found in
+ *   the idr.  This handles a reader that blocks and does not acquire
+ *   the rwsem until after the object is destroyed.  The destroy
+ *   operation will set the live flag to 0 and then drop the rwsem;
+ *   this will allow the reader to acquire the rwsem, see that the
+ *   live flag is 0, and then drop the rwsem and its reference to
+ *   the object.  The underlying storage will not be freed until the last
+ *   reference to the object is dropped.
+ */
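+
+/*
+ * The resulting reader/destroyer interleaving, in outline:
+ *
+ *	reader				destroyer
+ *	------				---------
+ *	idr_find() + kref_get()
+ *					down_write(&uobj->mutex)
+ *					uobj->live = 0
+ *					up_write(&uobj->mutex)
+ *	down_read(&uobj->mutex)
+ *	live == 0 -> put_uobj_read()
+ *
+ * so a late reader backs out instead of touching a destroyed object.
+ */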
+
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+		      struct ib_ucontext *context, struct lock_class_key *key)
+{
+	uobj->user_handle = user_handle;
+	uobj->context     = context;
+	kref_init(&uobj->ref);
+	init_rwsem(&uobj->mutex);
+	lockdep_set_class(&uobj->mutex, key);
+	uobj->live        = 0;
+}
+
+static void release_uobj(struct kref *kref)
+{
+	kfree(container_of(kref, struct ib_uobject, ref));
+}
+
+static void put_uobj(struct ib_uobject *uobj)
+{
+	kref_put(&uobj->ref, release_uobj);
+}
+
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+	up_read(&uobj->mutex);
+	put_uobj(uobj);
+}
+
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+	up_write(&uobj->mutex);
+	put_uobj(uobj);
+}
+
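+/*
+ * idr_pre_get() preallocates memory for the id layer; idr_get_new()
+ * can still fail with -EAGAIN if a concurrent insertion consumed that
+ * preallocation between the two calls, so preallocate and retry.
+ */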
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	int ret;
+
+retry:
+	if (!idr_pre_get(idr, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&ib_uverbs_idr_lock);
+	ret = idr_get_new(idr, uobj, &uobj->id);
+	spin_unlock(&ib_uverbs_idr_lock);
+
+	if (ret == -EAGAIN)
+		goto retry;
+
+	return ret;
+}
+
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	spin_lock(&ib_uverbs_idr_lock);
+	idr_remove(idr, uobj->id);
+	spin_unlock(&ib_uverbs_idr_lock);
+}
+
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	spin_lock(&ib_uverbs_idr_lock);
+	uobj = idr_find(idr, id);
+	if (uobj) {
+		if (uobj->context == context)
+			kref_get(&uobj->ref);
+		else
+			uobj = NULL;
+	}
+	spin_unlock(&ib_uverbs_idr_lock);
+
+	return uobj;
+}
+
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+					struct ib_ucontext *context, int nested)
+{
+	struct ib_uobject *uobj;
+
+	uobj = __idr_get_uobj(idr, id, context);
+	if (!uobj)
+		return NULL;
+
+	if (nested)
+		down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+	else
+		down_read(&uobj->mutex);
+	if (!uobj->live) {
+		put_uobj_read(uobj);
+		return NULL;
+	}
+
+	return uobj;
+}
+
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	uobj = __idr_get_uobj(idr, id, context);
+	if (!uobj)
+		return NULL;
+
+	down_write(&uobj->mutex);
+	if (!uobj->live) {
+		put_uobj_write(uobj);
+		return NULL;
+	}
+
+	return uobj;
+}
+
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+			  int nested)
+{
+	struct ib_uobject *uobj;
+
+	uobj = idr_read_uobj(idr, id, context, nested);
+	return uobj ? uobj->object : NULL;
+}
+
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+static void put_pd_read(struct ib_pd *pd)
+{
+	put_uobj_read(pd->uobject);
+}
+
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+	return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+static void put_cq_read(struct ib_cq *cq)
+{
+	put_uobj_read(cq->uobject);
+}
+
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+static void put_ah_read(struct ib_ah *ah)
+{
+	put_uobj_read(ah->uobject);
+}
+
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+static void put_qp_read(struct ib_qp *qp)
+{
+	put_uobj_read(qp->uobject);
+}
+
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+static void put_srq_read(struct ib_srq *srq)
+{
+	put_uobj_read(srq->uobject);
+}
+
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle,
+				     struct ib_ucontext *context,
+				     struct ib_uobject **uobj)
+{
+	*uobj = idr_read_uobj(&ib_uverbs_xrc_domain_idr, xrcd_handle,
+			      context, 0);
+	return *uobj ? (*uobj)->object : NULL;
+}
+
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+	put_uobj_read(uobj);
+}
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      const char __user *buf,
+			      int in_len, int out_len)
+{
+	struct ib_uverbs_get_context      cmd;
+	struct ib_uverbs_get_context_resp resp;
+	struct ib_udata                   udata;
+	struct ib_device                 *ibdev = file->device->ib_dev;
+	struct ib_ucontext		 *ucontext;
+	struct file			 *filp;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->mutex);
+
+	if (file->ucontext) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+	if (IS_ERR(ucontext)) {
+		ret = PTR_ERR(ucontext);
+		goto err;
+	}
+
+	ucontext->device = ibdev;
+	INIT_LIST_HEAD(&ucontext->pd_list);
+	INIT_LIST_HEAD(&ucontext->mr_list);
+	INIT_LIST_HEAD(&ucontext->mw_list);
+	INIT_LIST_HEAD(&ucontext->cq_list);
+	INIT_LIST_HEAD(&ucontext->qp_list);
+	INIT_LIST_HEAD(&ucontext->srq_list);
+	INIT_LIST_HEAD(&ucontext->ah_list);
+	INIT_LIST_HEAD(&ucontext->xrcd_list);
+	ucontext->closing = 0;
+
+	resp.num_comp_vectors = file->device->num_comp_vectors;
+
+	filp = ib_uverbs_alloc_event_file(file, 1, &resp.async_fd);
+	if (IS_ERR(filp)) {
+		ret = PTR_ERR(filp);
+		goto err_free;
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_file;
+	}
+
+	file->async_file = filp->private_data;
+
+	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+			      ib_uverbs_event_handler);
+	ret = ib_register_event_handler(&file->event_handler);
+	if (ret)
+		goto err_file;
+
+	kref_get(&file->async_file->ref);
+	kref_get(&file->ref);
+	file->ucontext = ucontext;
+
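+	/*
+	 * Publish the async fd only after every failure point: once
+	 * fd_install() runs, the descriptor is visible to userspace and
+	 * cannot be taken back.
+	 */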
+	fd_install(resp.async_fd, filp);
+
+	mutex_unlock(&file->mutex);
+
+	return in_len;
+
+err_file:
+	put_unused_fd(resp.async_fd);
+	fput(filp);
+
+err_free:
+	ibdev->dealloc_ucontext(ucontext);
+
+err:
+	mutex_unlock(&file->mutex);
+	return ret;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       const char __user *buf,
+			       int in_len, int out_len)
+{
+	struct ib_uverbs_query_device      cmd;
+	struct ib_uverbs_query_device_resp resp;
+	struct ib_device_attr              attr;
+	int                                ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_device(file->device->ib_dev, &attr);
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.fw_ver 		       = attr.fw_ver;
+	resp.node_guid 		       = file->device->ib_dev->node_guid;
+	resp.sys_image_guid 	       = attr.sys_image_guid;
+	resp.max_mr_size 	       = attr.max_mr_size;
+	resp.page_size_cap 	       = attr.page_size_cap;
+	resp.vendor_id 		       = attr.vendor_id;
+	resp.vendor_part_id 	       = attr.vendor_part_id;
+	resp.hw_ver 		       = attr.hw_ver;
+	resp.max_qp 		       = attr.max_qp;
+	resp.max_qp_wr 		       = attr.max_qp_wr;
+	resp.device_cap_flags 	       = attr.device_cap_flags;
+	resp.max_sge 		       = attr.max_sge;
+	resp.max_sge_rd 	       = attr.max_sge_rd;
+	resp.max_cq 		       = attr.max_cq;
+	resp.max_cqe 		       = attr.max_cqe;
+	resp.max_mr 		       = attr.max_mr;
+	resp.max_pd 		       = attr.max_pd;
+	resp.max_qp_rd_atom 	       = attr.max_qp_rd_atom;
+	resp.max_ee_rd_atom 	       = attr.max_ee_rd_atom;
+	resp.max_res_rd_atom 	       = attr.max_res_rd_atom;
+	resp.max_qp_init_rd_atom       = attr.max_qp_init_rd_atom;
+	resp.max_ee_init_rd_atom       = attr.max_ee_init_rd_atom;
+	resp.atomic_cap 	       = attr.atomic_cap;
+	resp.max_ee 		       = attr.max_ee;
+	resp.max_rdd 		       = attr.max_rdd;
+	resp.max_mw 		       = attr.max_mw;
+	resp.max_raw_ipv6_qp 	       = attr.max_raw_ipv6_qp;
+	resp.max_raw_ethy_qp 	       = attr.max_raw_ethy_qp;
+	resp.max_mcast_grp 	       = attr.max_mcast_grp;
+	resp.max_mcast_qp_attach       = attr.max_mcast_qp_attach;
+	resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
+	resp.max_ah 		       = attr.max_ah;
+	resp.max_fmr 		       = attr.max_fmr;
+	resp.max_map_per_fmr 	       = attr.max_map_per_fmr;
+	resp.max_srq 		       = attr.max_srq;
+	resp.max_srq_wr 	       = attr.max_srq_wr;
+	resp.max_srq_sge 	       = attr.max_srq_sge;
+	resp.max_pkeys 		       = attr.max_pkeys;
+	resp.local_ca_ack_delay        = attr.local_ca_ack_delay;
+	resp.phys_port_cnt	       = file->device->ib_dev->phys_port_cnt;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_query_port      cmd;
+	struct ib_uverbs_query_port_resp resp;
+	struct ib_port_attr              attr;
+	int                              ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.state 	     = attr.state;
+	resp.max_mtu 	     = attr.max_mtu;
+	resp.active_mtu      = attr.active_mtu;
+	resp.gid_tbl_len     = attr.gid_tbl_len;
+	resp.port_cap_flags  = attr.port_cap_flags;
+	resp.max_msg_sz      = attr.max_msg_sz;
+	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
+	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
+	resp.pkey_tbl_len    = attr.pkey_tbl_len;
+	resp.lid 	     = attr.lid;
+	resp.sm_lid 	     = attr.sm_lid;
+	resp.lmc 	     = attr.lmc;
+	resp.max_vl_num      = attr.max_vl_num;
+	resp.sm_sl 	     = attr.sm_sl;
+	resp.subnet_timeout  = attr.subnet_timeout;
+	resp.init_type_reply = attr.init_type_reply;
+	resp.active_width    = attr.active_width;
+	resp.active_speed    = attr.active_speed;
+	resp.phys_state      = attr.phys_state;
+	resp.link_layer	     = attr.link_layer;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   const char __user *buf,
+			   int in_len, int out_len)
+{
+	struct ib_uverbs_alloc_pd      cmd;
+	struct ib_uverbs_alloc_pd_resp resp;
+	struct ib_udata                udata;
+	struct ib_uobject             *uobj;
+	struct ib_pd                  *pd;
+	int                            ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+	down_write(&uobj->mutex);
+
+	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+					    file->ucontext, &udata);
+	if (IS_ERR(pd)) {
+		ret = PTR_ERR(pd);
+		goto err;
+	}
+
+	pd->device  = file->device->ib_dev;
+	pd->uobject = uobj;
+	atomic_set(&pd->usecnt, 0);
+
+	uobj->object = pd;
+	ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.pd_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->pd_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+	ib_dealloc_pd(pd);
+
+err:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_dealloc_pd cmd;
+	struct ib_uobject          *uobj;
+	int                         ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	ret = ib_dealloc_pd(uobj->object);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 const char __user *buf, int in_len,
+			 int out_len)
+{
+	struct ib_uverbs_reg_mr      cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_uobject           *uobj;
+	struct ib_pd                *pd;
+	struct ib_mr                *mr;
+	int                          ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+		return -EINVAL;
+
+	/*
+	 * Local write permission is required if remote write or
+	 * remote atomic permission is also requested.
+	 */
+	if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+	    !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
+		return -EINVAL;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+				     cmd.access_flags, &udata, 0);
+	if (IS_ERR(mr)) {
+		ret = PTR_ERR(mr);
+		goto err_put;
+	}
+
+	mr->device  = pd->device;
+	mr->pd      = pd;
+	mr->uobject = uobj;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&mr->usecnt, 0);
+
+	uobj->object = mr;
+	ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+	if (ret)
+		goto err_unreg;
+
+	memset(&resp, 0, sizeof resp);
+	resp.lkey      = mr->lkey;
+	resp.rkey      = mr->rkey;
+	resp.mr_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->mr_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+	ib_dereg_mr(mr);
+
+err_put:
+	put_pd_read(pd);
+
+err_free:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_mr             *mr;
+	struct ib_uobject	 *uobj;
+	int                       ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	mr = uobj->object;
+
+	ret = ib_dereg_mr(mr);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      const char __user *buf, int in_len,
+				      int out_len)
+{
+	struct ib_uverbs_create_comp_channel	   cmd;
+	struct ib_uverbs_create_comp_channel_resp  resp;
+	struct file				  *filp;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	filp = ib_uverbs_alloc_event_file(file, 0, &resp.fd);
+	if (IS_ERR(filp))
+		return PTR_ERR(filp);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		put_unused_fd(resp.fd);
+		fput(filp);
+		return -EFAULT;
+	}
+
+	fd_install(resp.fd, filp);
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_cq      cmd;
+	struct ib_uverbs_create_cq_resp resp;
+	struct ib_udata                 udata;
+	struct ib_ucq_object           *obj;
+	struct ib_uverbs_event_file    *ev_file = NULL;
+	struct ib_cq                   *cq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	if (cmd.comp_vector >= file->device->num_comp_vectors)
+		return -EINVAL;
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key);
+	down_write(&obj->uobject.mutex);
+
+	if (cmd.comp_channel >= 0) {
+		ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+		if (!ev_file) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	obj->uverbs_file	   = file;
+	obj->comp_events_reported  = 0;
+	obj->async_events_reported = 0;
+	INIT_LIST_HEAD(&obj->comp_list);
+	INIT_LIST_HEAD(&obj->async_list);
+
+	cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+					     cmd.comp_vector,
+					     file->ucontext, &udata);
+	if (IS_ERR(cq)) {
+		ret = PTR_ERR(cq);
+		goto err_file;
+	}
+
+	cq->device        = file->device->ib_dev;
+	cq->uobject       = &obj->uobject;
+	cq->comp_handler  = ib_uverbs_comp_handler;
+	cq->event_handler = ib_uverbs_cq_event_handler;
+	cq->cq_context    = ev_file;
+	atomic_set(&cq->usecnt, 0);
+
+	obj->uobject.object = cq;
+	ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+	if (ret)
+		goto err_free;
+
+	memset(&resp, 0, sizeof resp);
+	resp.cq_handle = obj->uobject.id;
+	resp.cqe       = cq->cqe;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+	ib_destroy_cq(cq);
+
+err_file:
+	if (ev_file)
+		ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_resize_cq	cmd;
+	struct ib_uverbs_resize_cq_resp	resp;
+	struct ib_udata                 udata;
+	struct ib_cq			*cq;
+	int				ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+	if (ret)
+		goto out;
+
+	resp.cqe = cq->cqe;
+
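+	/* Note that only the updated cqe count is copied back here. */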
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp.cqe))
+		ret = -EFAULT;
+
+out:
+	put_cq_read(cq);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len,
+			  int out_len)
+{
+	struct ib_uverbs_poll_cq       cmd;
+	struct ib_uverbs_poll_cq_resp *resp;
+	struct ib_cq                  *cq;
+	struct ib_wc                  *wc;
+	int                            ret = 0;
+	int                            i;
+	int                            rsize;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wc = kmalloc(cmd.ne * sizeof *wc, GFP_KERNEL);
+	if (!wc)
+		return -ENOMEM;
+
+	rsize = sizeof *resp + cmd.ne * sizeof(struct ib_uverbs_wc);
+	resp = kmalloc(rsize, GFP_KERNEL);
+	if (!resp) {
+		ret = -ENOMEM;
+		goto out_wc;
+	}
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	resp->count = ib_poll_cq(cq, cmd.ne, wc);
+
+	put_cq_read(cq);
+
+	for (i = 0; i < resp->count; i++) {
+		resp->wc[i].wr_id 	   = wc[i].wr_id;
+		resp->wc[i].status 	   = wc[i].status;
+		resp->wc[i].opcode 	   = wc[i].opcode;
+		resp->wc[i].vendor_err 	   = wc[i].vendor_err;
+		resp->wc[i].byte_len 	   = wc[i].byte_len;
+		resp->wc[i].ex.imm_data    = (__u32 __force) wc[i].ex.imm_data;
+		resp->wc[i].qp_num 	   = wc[i].qp->qp_num;
+		resp->wc[i].src_qp 	   = wc[i].src_qp;
+		resp->wc[i].wc_flags 	   = wc[i].wc_flags;
+		resp->wc[i].pkey_index 	   = wc[i].pkey_index;
+		resp->wc[i].slid 	   = wc[i].slid;
+		resp->wc[i].sl 		   = wc[i].sl;
+		resp->wc[i].dlid_path_bits = wc[i].dlid_path_bits;
+		resp->wc[i].port_num 	   = wc[i].port_num;
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, rsize))
+		ret = -EFAULT;
+
+out:
+	kfree(resp);
+
+out_wc:
+	kfree(wc);
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_req_notify_cq cmd;
+	struct ib_cq                  *cq;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	ib_req_notify_cq(cq, cmd.solicited_only ?
+			 IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+	put_cq_read(cq);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_cq      cmd;
+	struct ib_uverbs_destroy_cq_resp resp;
+	struct ib_uobject		*uobj;
+	struct ib_cq               	*cq;
+	struct ib_ucq_object        	*obj;
+	struct ib_uverbs_event_file	*ev_file;
+	int                        	 ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	cq      = uobj->object;
+	ev_file = cq->cq_context;
+	obj     = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	ret = ib_destroy_cq(cq);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_ucq(file, ev_file, obj);
+
+	memset(&resp, 0, sizeof resp);
+	resp.comp_events_reported  = obj->comp_events_reported;
+	resp.async_events_reported = obj->async_events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_qp      cmd;
+	struct ib_uverbs_create_qp_resp resp;
+	struct ib_udata                 udata;
+	struct ib_uqp_object           *obj;
+	struct ib_pd                   *pd;
+	struct ib_cq                   *scq, *rcq;
+	struct ib_srq                  *srq;
+	struct ib_qp                   *qp;
+	struct ib_qp_init_attr          attr;
+	struct ib_xrcd		       *xrcd;
+	struct ib_uobject	       *xrcd_uobj;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key);
+	down_write(&obj->uevent.uobject.mutex);
+
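+	/* For XRC QPs, the srq_handle field carries the XRC domain handle. */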
+	srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ?
+		idr_read_srq(cmd.srq_handle, file->ucontext) : NULL;
+	xrcd = cmd.qp_type == IB_QPT_XRC ?
+		idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL;
+	pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+	scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0);
+	rcq = cmd.recv_cq_handle == cmd.send_cq_handle ?
+		scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1);
+
+	if (!pd || !scq || !rcq || (cmd.is_srq && !srq) ||
+	    (cmd.qp_type == IB_QPT_XRC && !xrcd)) {
+		ret = -EINVAL;
+		goto err_put;
+	}
+
+	attr.create_flags  = 0;
+	attr.event_handler = ib_uverbs_qp_event_handler;
+	attr.qp_context    = file;
+	attr.send_cq       = scq;
+	attr.recv_cq       = rcq;
+	attr.srq           = srq;
+	attr.sq_sig_type   = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+	attr.qp_type       = cmd.qp_type;
+	attr.xrcd          = xrcd;
+
+	attr.cap.max_send_wr     = cmd.max_send_wr;
+	attr.cap.max_recv_wr     = cmd.max_recv_wr;
+	attr.cap.max_send_sge    = cmd.max_send_sge;
+	attr.cap.max_recv_sge    = cmd.max_recv_sge;
+	attr.cap.max_inline_data = cmd.max_inline_data;
+
+	obj->uevent.events_reported     = 0;
+	INIT_LIST_HEAD(&obj->uevent.event_list);
+	INIT_LIST_HEAD(&obj->mcast_list);
+
+	qp = pd->device->create_qp(pd, &attr, &udata);
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_put;
+	}
+
+	qp->device     	  = pd->device;
+	qp->pd         	  = pd;
+	qp->send_cq    	  = attr.send_cq;
+	qp->recv_cq    	  = attr.recv_cq;
+	qp->srq	       	  = attr.srq;
+	qp->uobject       = &obj->uevent.uobject;
+	qp->event_handler = attr.event_handler;
+	qp->qp_context    = attr.qp_context;
+	qp->qp_type	  = attr.qp_type;
+	qp->xrcd	  = attr.xrcd;
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&attr.send_cq->usecnt);
+	atomic_inc(&attr.recv_cq->usecnt);
+	if (attr.srq)
+		atomic_inc(&attr.srq->usecnt);
+	else if (attr.xrcd)
+		atomic_inc(&attr.xrcd->usecnt);
+
+	obj->uevent.uobject.object = qp;
+	ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn             = qp->qp_num;
+	resp.qp_handle       = obj->uevent.uobject.id;
+	resp.max_recv_sge    = attr.cap.max_recv_sge;
+	resp.max_send_sge    = attr.cap.max_send_sge;
+	resp.max_recv_wr     = attr.cap.max_recv_wr;
+	resp.max_send_wr     = attr.cap.max_send_wr;
+	resp.max_inline_data = attr.cap.max_inline_data;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+	put_cq_read(scq);
+	if (rcq != scq)
+		put_cq_read(rcq);
+	if (srq)
+		put_srq_read(srq);
+	if (xrcd)
+		put_xrcd_read(xrcd_uobj);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uevent.uobject.live = 1;
+
+	up_write(&obj->uevent.uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+	ib_destroy_qp(qp);
+
+err_put:
+	if (pd)
+		put_pd_read(pd);
+	if (scq)
+		put_cq_read(scq);
+	if (rcq && rcq != scq)
+		put_cq_read(rcq);
+	if (srq)
+		put_srq_read(srq);
+	if (xrcd)
+		put_xrcd_read(xrcd_uobj);
+
+	put_uobj_write(&obj->uevent.uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_query_qp      cmd;
+	struct ib_uverbs_query_qp_resp resp;
+	struct ib_qp                   *qp;
+	struct ib_qp_attr              *attr;
+	struct ib_qp_init_attr         *init_attr;
+	int                            ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	attr      = kmalloc(sizeof *attr, GFP_KERNEL);
+	init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+	put_qp_read(qp);
+
+	if (ret)
+		goto out;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.qp_state               = attr->qp_state;
+	resp.cur_qp_state           = attr->cur_qp_state;
+	resp.path_mtu               = attr->path_mtu;
+	resp.path_mig_state         = attr->path_mig_state;
+	resp.qkey                   = attr->qkey;
+	resp.rq_psn                 = attr->rq_psn;
+	resp.sq_psn                 = attr->sq_psn;
+	resp.dest_qp_num            = attr->dest_qp_num;
+	resp.qp_access_flags        = attr->qp_access_flags;
+	resp.pkey_index             = attr->pkey_index;
+	resp.alt_pkey_index         = attr->alt_pkey_index;
+	resp.sq_draining            = attr->sq_draining;
+	resp.max_rd_atomic          = attr->max_rd_atomic;
+	resp.max_dest_rd_atomic     = attr->max_dest_rd_atomic;
+	resp.min_rnr_timer          = attr->min_rnr_timer;
+	resp.port_num               = attr->port_num;
+	resp.timeout                = attr->timeout;
+	resp.retry_cnt              = attr->retry_cnt;
+	resp.rnr_retry              = attr->rnr_retry;
+	resp.alt_port_num           = attr->alt_port_num;
+	resp.alt_timeout            = attr->alt_timeout;
+
+	memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+	resp.dest.flow_label        = attr->ah_attr.grh.flow_label;
+	resp.dest.sgid_index        = attr->ah_attr.grh.sgid_index;
+	resp.dest.hop_limit         = attr->ah_attr.grh.hop_limit;
+	resp.dest.traffic_class     = attr->ah_attr.grh.traffic_class;
+	resp.dest.dlid              = attr->ah_attr.dlid;
+	resp.dest.sl                = attr->ah_attr.sl;
+	resp.dest.src_path_bits     = attr->ah_attr.src_path_bits;
+	resp.dest.static_rate       = attr->ah_attr.static_rate;
+	resp.dest.is_global         = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+	resp.dest.port_num          = attr->ah_attr.port_num;
+
+	memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+	resp.alt_dest.flow_label    = attr->alt_ah_attr.grh.flow_label;
+	resp.alt_dest.sgid_index    = attr->alt_ah_attr.grh.sgid_index;
+	resp.alt_dest.hop_limit     = attr->alt_ah_attr.grh.hop_limit;
+	resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+	resp.alt_dest.dlid          = attr->alt_ah_attr.dlid;
+	resp.alt_dest.sl            = attr->alt_ah_attr.sl;
+	resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+	resp.alt_dest.static_rate   = attr->alt_ah_attr.static_rate;
+	resp.alt_dest.is_global     = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+	resp.alt_dest.port_num      = attr->alt_ah_attr.port_num;
+
+	resp.max_send_wr            = init_attr->cap.max_send_wr;
+	resp.max_recv_wr            = init_attr->cap.max_recv_wr;
+	resp.max_send_sge           = init_attr->cap.max_send_sge;
+	resp.max_recv_sge           = init_attr->cap.max_recv_sge;
+	resp.max_inline_data        = init_attr->cap.max_inline_data;
+	resp.sq_sig_all             = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	kfree(attr);
+	kfree(init_attr);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_modify_qp cmd;
+	struct ib_udata            udata;
+	struct ib_qp              *qp;
+	struct ib_qp_attr         *attr;
+	int                        ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	attr->qp_state 		  = cmd.qp_state;
+	attr->cur_qp_state 	  = cmd.cur_qp_state;
+	attr->path_mtu 		  = cmd.path_mtu;
+	attr->path_mig_state 	  = cmd.path_mig_state;
+	attr->qkey 		  = cmd.qkey;
+	attr->rq_psn 		  = cmd.rq_psn;
+	attr->sq_psn 		  = cmd.sq_psn;
+	attr->dest_qp_num 	  = cmd.dest_qp_num;
+	attr->qp_access_flags 	  = cmd.qp_access_flags;
+	attr->pkey_index 	  = cmd.pkey_index;
+	attr->alt_pkey_index 	  = cmd.alt_pkey_index;
+	attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+	attr->max_rd_atomic 	  = cmd.max_rd_atomic;
+	attr->max_dest_rd_atomic  = cmd.max_dest_rd_atomic;
+	attr->min_rnr_timer 	  = cmd.min_rnr_timer;
+	attr->port_num 		  = cmd.port_num;
+	attr->timeout 		  = cmd.timeout;
+	attr->retry_cnt 	  = cmd.retry_cnt;
+	attr->rnr_retry 	  = cmd.rnr_retry;
+	attr->alt_port_num 	  = cmd.alt_port_num;
+	attr->alt_timeout 	  = cmd.alt_timeout;
+
+	memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+	attr->ah_attr.grh.flow_label        = cmd.dest.flow_label;
+	attr->ah_attr.grh.sgid_index        = cmd.dest.sgid_index;
+	attr->ah_attr.grh.hop_limit         = cmd.dest.hop_limit;
+	attr->ah_attr.grh.traffic_class     = cmd.dest.traffic_class;
+	attr->ah_attr.dlid 	    	    = cmd.dest.dlid;
+	attr->ah_attr.sl   	    	    = cmd.dest.sl;
+	attr->ah_attr.src_path_bits 	    = cmd.dest.src_path_bits;
+	attr->ah_attr.static_rate   	    = cmd.dest.static_rate;
+	attr->ah_attr.ah_flags 	    	    = cmd.dest.is_global ? IB_AH_GRH : 0;
+	attr->ah_attr.port_num 	    	    = cmd.dest.port_num;
+
+	memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+	attr->alt_ah_attr.grh.flow_label    = cmd.alt_dest.flow_label;
+	attr->alt_ah_attr.grh.sgid_index    = cmd.alt_dest.sgid_index;
+	attr->alt_ah_attr.grh.hop_limit     = cmd.alt_dest.hop_limit;
+	attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+	attr->alt_ah_attr.dlid 	    	    = cmd.alt_dest.dlid;
+	attr->alt_ah_attr.sl   	    	    = cmd.alt_dest.sl;
+	attr->alt_ah_attr.src_path_bits     = cmd.alt_dest.src_path_bits;
+	attr->alt_ah_attr.static_rate       = cmd.alt_dest.static_rate;
+	attr->alt_ah_attr.ah_flags 	    = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+	attr->alt_ah_attr.port_num 	    = cmd.alt_dest.port_num;
+
+	ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata);
+
+	put_qp_read(qp);
+
+	if (ret)
+		goto out;
+
+	ret = in_len;
+
+out:
+	kfree(attr);
+
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_qp      cmd;
+	struct ib_uverbs_destroy_qp_resp resp;
+	struct ib_uobject		*uobj;
+	struct ib_qp               	*qp;
+	struct ib_uqp_object        	*obj;
+	int                        	 ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	memset(&resp, 0, sizeof resp);
+
+	uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	qp  = uobj->object;
+	obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+	if (!list_empty(&obj->mcast_list)) {
+		put_uobj_write(uobj);
+		return -EBUSY;
+	}
+
+	ret = ib_destroy_qp(qp);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, &obj->uevent);
+
+	resp.events_reported = obj->uevent.events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_send      cmd;
+	struct ib_uverbs_post_send_resp resp;
+	struct ib_uverbs_send_wr       *user_wr;
+	struct ib_send_wr              *wr = NULL, *last, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	int                             i, sg_ind;
+	int				is_ud;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+	    cmd.sge_count * sizeof (struct ib_uverbs_sge))
+		return -EINVAL;
+
+	if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+		return -EINVAL;
+
+	user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	is_ud = qp->qp_type == IB_QPT_UD;
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < cmd.wr_count; ++i) {
+		if (copy_from_user(user_wr,
+				   buf + sizeof cmd + i * cmd.wqe_size,
+				   cmd.wqe_size)) {
+			ret = -EFAULT;
+			goto out_put;
+		}
+
+		if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto out_put;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+		next->opcode     = user_wr->opcode;
+		next->send_flags = user_wr->send_flags;
+
+		if (is_ud) {
+			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+						     file->ucontext);
+			if (!next->wr.ud.ah) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
+			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+		} else {
+			switch (next->opcode) {
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
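+				/* fall through */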
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+				next->wr.rdma.remote_addr =
+					user_wr->wr.rdma.remote_addr;
+				next->wr.rdma.rkey        =
+					user_wr->wr.rdma.rkey;
+				break;
+			case IB_WR_SEND_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				break;
+			case IB_WR_SEND_WITH_INV:
+				next->ex.invalidate_rkey =
+					user_wr->ex.invalidate_rkey;
+				break;
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				next->wr.atomic.remote_addr =
+					user_wr->wr.atomic.remote_addr;
+				next->wr.atomic.compare_add =
+					user_wr->wr.atomic.compare_add;
+				next->wr.atomic.swap = user_wr->wr.atomic.swap;
+				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + sizeof cmd +
+					   cmd.wr_count * cmd.wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto out_put;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
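+	/*
+	 * On failure, bad_wr points at the first WR that could not be
+	 * posted; report its 1-based position so userspace can tell how
+	 * many WRs were accepted.
+	 */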
+	resp.bad_wr = 0;
+	ret = qp->device->post_send(qp, wr, &bad_wr);
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out_put:
+	put_qp_read(qp);
+
+	while (wr) {
+		if (is_ud && wr->wr.ud.ah)
+			put_ah_read(wr->wr.ud.ah);
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+out:
+	kfree(user_wr);
+
+	return ret ? ret : in_len;
+}
+
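+/*
+ * Pull a chain of receive WRs out of the user buffer.  The expected
+ * layout is wr_count fixed-size (wqe_size) WR structs followed by a
+ * flat array of sge_count struct ib_uverbs_sge entries, which the
+ * per-WR num_sge fields consume in order.
+ */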
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+						    int in_len,
+						    u32 wr_count,
+						    u32 sge_count,
+						    u32 wqe_size)
+{
+	struct ib_uverbs_recv_wr *user_wr;
+	struct ib_recv_wr        *wr = NULL, *last, *next;
+	int                       sg_ind;
+	int                       i;
+	int                       ret;
+
+	if (in_len < wqe_size * wr_count +
+	    sge_count * sizeof (struct ib_uverbs_sge))
+		return ERR_PTR(-EINVAL);
+
+	if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+		return ERR_PTR(-EINVAL);
+
+	user_wr = kmalloc(wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return ERR_PTR(-ENOMEM);
+
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < wr_count; ++i) {
+		if (copy_from_user(user_wr, buf + i * wqe_size,
+				   wqe_size)) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		if (user_wr->num_sge + sg_ind > sge_count) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + wr_count * wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto err;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	kfree(user_wr);
+	return wr;
+
+err:
+	kfree(user_wr);
+
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_recv      cmd;
+	struct ib_uverbs_post_recv_resp resp;
+	struct ib_recv_wr              *wr, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_recv(qp, wr, &bad_wr);
+
+	put_qp_read(qp);
+
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_post_srq_recv      cmd;
+	struct ib_uverbs_post_srq_recv_resp resp;
+	struct ib_recv_wr                  *wr, *next, *bad_wr;
+	struct ib_srq                      *srq;
+	ssize_t                             ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		goto out;
+
+	resp.bad_wr = 0;
+	ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+	put_srq_read(srq);
+
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_ah	 cmd;
+	struct ib_uverbs_create_ah_resp	 resp;
+	struct ib_uobject		*uobj;
+	struct ib_pd			*pd;
+	struct ib_ah			*ah;
+	struct ib_ah_attr		attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	attr.dlid 	       = cmd.attr.dlid;
+	attr.sl 	       = cmd.attr.sl;
+	attr.src_path_bits     = cmd.attr.src_path_bits;
+	attr.static_rate       = cmd.attr.static_rate;
+	attr.ah_flags          = cmd.attr.is_global ? IB_AH_GRH : 0;
+	attr.port_num 	       = cmd.attr.port_num;
+	attr.grh.flow_label    = cmd.attr.grh.flow_label;
+	attr.grh.sgid_index    = cmd.attr.grh.sgid_index;
+	attr.grh.hop_limit     = cmd.attr.grh.hop_limit;
+	attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+	ah = ib_create_ah(pd, &attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto err_put;
+	}
+
+	ah->uobject  = uobj;
+	uobj->object = ah;
+
+	ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+	if (ret)
+		goto err_destroy;
+
+	resp.ah_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->ah_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+	ib_destroy_ah(ah);
+
+err_put:
+	put_pd_read(pd);
+
+err:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_destroy_ah cmd;
+	struct ib_ah		   *ah;
+	struct ib_uobject	   *uobj;
+	int			    ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	ah = uobj->object;
+
+	ret = ib_destroy_ah(ah);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_attach_mcast cmd;
+	struct ib_qp                 *qp;
+	struct ib_uqp_object         *obj;
+	struct ib_uverbs_mcast_entry *mcast;
+	int                           ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
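+	/* Attaching the same (gid, lid) pair twice is a no-op, not an error. */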
+	list_for_each_entry(mcast, &obj->mcast_list, list)
+		if (cmd.mlid == mcast->lid &&
+		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+			ret = 0;
+			goto out_put;
+		}
+
+	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+	if (!mcast) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	mcast->lid = cmd.mlid;
+	memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+	ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+	if (!ret)
+		list_add_tail(&mcast->list, &obj->mcast_list);
+	else
+		kfree(mcast);
+
+out_put:
+	put_qp_read(qp);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_detach_mcast cmd;
+	struct ib_uqp_object         *obj;
+	struct ib_qp                 *qp;
+	struct ib_uverbs_mcast_entry *mcast;
+	int                           ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+	if (ret)
+		goto out_put;
+
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+	list_for_each_entry(mcast, &obj->mcast_list, list)
+		if (cmd.mlid == mcast->lid &&
+		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+			list_del(&mcast->list);
+			kfree(mcast);
+			break;
+		}
+
+out_put:
+	put_qp_read(qp);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_create_srq      cmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata                  udata;
+	struct ib_uevent_object         *obj;
+	struct ib_pd                    *pd;
+	struct ib_srq                   *srq;
+	struct ib_srq_init_attr          attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key);
+	down_write(&obj->uobject.mutex);
+
+	pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	attr.event_handler  = ib_uverbs_srq_event_handler;
+	attr.srq_context    = file;
+	attr.attr.max_wr    = cmd.max_wr;
+	attr.attr.max_sge   = cmd.max_sge;
+	attr.attr.srq_limit = cmd.srq_limit;
+
+	obj->events_reported     = 0;
+	INIT_LIST_HEAD(&obj->event_list);
+
+	srq = pd->device->create_srq(pd, &attr, &udata);
+	if (IS_ERR(srq)) {
+		ret = PTR_ERR(srq);
+		goto err_put;
+	}
+
+	srq->device        = pd->device;
+	srq->pd            = pd;
+	srq->uobject       = &obj->uobject;
+	srq->event_handler = attr.event_handler;
+	srq->srq_context   = attr.srq_context;
+	srq->ext.xrc.cq = NULL;
+	srq->ext.xrc.xrcd = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&srq->usecnt, 0);
+
+	obj->uobject.object = srq;
+	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.srq_handle = obj->uobject.id;
+	resp.max_wr     = attr.attr.max_wr;
+	resp.max_sge    = attr.attr.max_sge;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+
+err_destroy:
+	ib_destroy_srq(srq);
+
+err_put:
+	put_pd_read(pd);
+
+err:
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file,
+				 const char __user *buf, int in_len,
+				 int out_len)
+{
+	struct ib_uverbs_create_xsrq  cmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata			 udata;
+	struct ib_uevent_object		*obj;
+	struct ib_pd			*pd;
+	struct ib_srq			*srq;
+	struct ib_cq			*xrc_cq;
+	struct ib_xrcd			*xrcd;
+	struct ib_srq_init_attr		 attr;
+	struct ib_uobject		*xrcd_uobj;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
+		  &srq_lock_key);
+	down_write(&obj->uobject.mutex);
+
+	pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	xrc_cq  = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!xrc_cq) {
+		ret = -EINVAL;
+		goto err_put_pd;
+	}
+
+	xrcd  = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj);
+	if (!xrcd) {
+		ret = -EINVAL;
+		goto err_put_cq;
+	}
+
+	attr.event_handler  = ib_uverbs_srq_event_handler;
+	attr.srq_context    = file;
+	attr.attr.max_wr    = cmd.max_wr;
+	attr.attr.max_sge   = cmd.max_sge;
+	attr.attr.srq_limit = cmd.srq_limit;
+
+	obj->events_reported     = 0;
+	INIT_LIST_HEAD(&obj->event_list);
+
+	srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, &attr, &udata);
+	if (IS_ERR(srq)) {
+		ret = PTR_ERR(srq);
+		goto err_put;
+	}
+
+	srq->device	   = pd->device;
+	srq->pd		   = pd;
+	srq->uobject	   = &obj->uobject;
+	srq->event_handler = attr.event_handler;
+	srq->srq_context   = attr.srq_context;
+	srq->ext.xrc.cq    = xrc_cq;
+	srq->ext.xrc.xrcd  = xrcd;
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&xrc_cq->usecnt);
+	atomic_inc(&xrcd->usecnt);
+
+	atomic_set(&srq->usecnt, 0);
+
+	obj->uobject.object = srq;
+	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.srq_handle	= obj->uobject.id;
+	resp.max_wr	= attr.attr.max_wr;
+	resp.max_sge	= attr.attr.max_sge;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_xrcd_read(xrcd_uobj);
+	put_cq_read(xrc_cq);
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+
+err_destroy:
+	ib_destroy_srq(srq);
+
+err_put:
+	put_xrcd_read(xrcd_uobj);
+
+err_put_cq:
+	put_cq_read(xrc_cq);
+
+err_put_pd:
+	put_pd_read(pd);
+
+err:
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_modify_srq cmd;
+	struct ib_udata             udata;
+	struct ib_srq              *srq;
+	struct ib_srq_attr          attr;
+	int                         ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	attr.max_wr    = cmd.max_wr;
+	attr.srq_limit = cmd.srq_limit;
+
+	ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+	put_srq_read(srq);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    const char __user *buf,
+			    int in_len, int out_len)
+{
+	struct ib_uverbs_query_srq      cmd;
+	struct ib_uverbs_query_srq_resp resp;
+	struct ib_srq_attr              attr;
+	struct ib_srq                   *srq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	ret = ib_query_srq(srq, &attr);
+
+	put_srq_read(srq);
+
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.max_wr    = attr.max_wr;
+	resp.max_sge   = attr.max_sge;
+	resp.srq_limit = attr.srq_limit;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len,
+			      int out_len)
+{
+	struct ib_uverbs_destroy_srq      cmd;
+	struct ib_uverbs_destroy_srq_resp resp;
+	struct ib_uobject		 *uobj;
+	struct ib_srq               	 *srq;
+	struct ib_uevent_object        	 *obj;
+	int                         	  ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	srq = uobj->object;
+	obj = container_of(uobj, struct ib_uevent_object, uobject);
+
+	ret = ib_destroy_srq(srq);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, obj);
+
+	memset(&resp, 0, sizeof resp);
+	resp.events_reported = obj->events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+	return ret ? ret : in_len;
+}
+
+static struct inode *xrc_file2inode(struct file *f)
+{
+	return f->f_dentry->d_inode;
+}
+
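+/*
+ * An XRC domain can be shared between processes by naming a file: the
+ * rb-tree below maps that file's inode to the ib_xrcd opened for it,
+ * so a later ib_uverbs_open_xrc_domain() on the same inode finds the
+ * existing domain (subject to the O_CREAT/O_EXCL checks) instead of
+ * allocating a new one.
+ */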
+struct xrcd_table_entry {
+	struct rb_node node;
+	struct inode *inode;
+	struct ib_xrcd *xrcd;
+};
+
+static int xrcd_table_insert(struct ib_device *dev,
+			     struct inode *i_n,
+			     struct ib_xrcd *xrcd)
+{
+	struct xrcd_table_entry *entry, *scan;
+	struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
+	struct rb_node *parent = NULL;
+
+	entry = kmalloc(sizeof(struct xrcd_table_entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->inode = i_n;
+	entry->xrcd = xrcd;
+
+	while (*p) {
+		parent = *p;
+		scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+		if (i_n < scan->inode)
+			p = &(*p)->rb_left;
+		else if (i_n > scan->inode)
+			p = &(*p)->rb_right;
+		else {
+			kfree(entry);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&entry->node, parent, p);
+	rb_insert_color(&entry->node, &dev->ib_uverbs_xrcd_table);
+	igrab(i_n);
+	return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev,
+						   struct inode *i_n)
+{
+	struct xrcd_table_entry *scan;
+	struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*p) {
+		parent = *p;
+		scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+		if (i_n < scan->inode)
+			p = &(*p)->rb_left;
+		else if (i_n > scan->inode)
+			p = &(*p)->rb_right;
+		else
+			return scan;
+	}
+	return NULL;
+}
+
+static int find_xrcd(struct ib_device *dev, struct inode *i_n,
+		     struct ib_xrcd **xrcd)
+{
+	struct xrcd_table_entry *entry;
+
+	entry = xrcd_table_search(dev, i_n);
+	if (!entry)
+		return -EINVAL;
+
+	*xrcd = entry->xrcd;
+	return 0;
+}
+
+static void xrcd_table_delete(struct ib_device *dev,
+			      struct inode *i_n)
+{
+	struct xrcd_table_entry *entry = xrcd_table_search(dev, i_n);
+
+	if (entry) {
+		iput(i_n);
+		rb_erase(&entry->node, &dev->ib_uverbs_xrcd_table);
+		kfree(entry);
+	}
+}
+
+ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file,
+				  const char __user *buf, int in_len,
+				  int out_len)
+{
+	struct ib_uverbs_open_xrc_domain cmd;
+	struct ib_uverbs_open_xrc_domain_resp resp;
+	struct ib_udata	udata;
+	struct ib_uobject *uobj;
+	struct ib_uxrcd_object         	*xrcd_uobj;
+	struct ib_xrcd			*xrcd = NULL;
+	struct file			*f = NULL;
+	struct inode			*inode = NULL;
+	int				 ret = 0;
+	int				 new_xrcd = 0;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	if (cmd.fd != (u32) (-1)) {
+		/* look up the file backing this descriptor */
+		f = fget(cmd.fd);
+		if (!f) {
+			ret = -EBADF;
+			goto err_table_mutex_unlock;
+		}
+
+		inode = xrc_file2inode(f);
+		if (!inode) {
+			ret = -EBADF;
+			goto err_table_mutex_unlock;
+		}
+
+		ret = find_xrcd(file->device->ib_dev, inode, &xrcd);
+		if (ret && !(cmd.oflags & O_CREAT)) {
+			/* no xrcd for this inode yet; O_CREAT is required */
+			ret = -EAGAIN;
+			goto err_table_mutex_unlock;
+		}
+
+		if (xrcd && cmd.oflags & O_EXCL) {
+			ret = -EINVAL;
+			goto err_table_mutex_unlock;
+		}
+	}
+
+	xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL);
+	if (!xrcd_uobj) {
+		ret = -ENOMEM;
+		goto err_table_mutex_unlock;
+	}
+
+	uobj = &xrcd_uobj->uobject;
+	init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+	down_write(&uobj->mutex);
+
+	if (!xrcd) {
+		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+							file->ucontext, &udata);
+		if (IS_ERR(xrcd)) {
+			ret = PTR_ERR(xrcd);
+			goto err;
+		}
+		xrcd->uobject = (cmd.fd == -1) ? uobj : NULL;
+		xrcd->inode = inode;
+		xrcd->device  = file->device->ib_dev;
+		atomic_set(&xrcd->usecnt, 0);
+		new_xrcd = 1;
+	}
+
+	uobj->object = xrcd;
+	ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.xrcd_handle = uobj->id;
+
+	if (inode) {
+		if (new_xrcd) {
+			/* create new inode/xrcd table entry */
+			ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd);
+			if (ret)
+				goto err_insert_xrcd;
+		}
+		atomic_inc(&xrcd->usecnt);
+	}
+	if (f)
+		fput(f);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->xrcd_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return in_len;
+
+err_copy:
+	if (inode) {
+		if (new_xrcd)
+			xrcd_table_delete(file->device->ib_dev, inode);
+		atomic_dec(&xrcd->usecnt);
+	}
+
+err_insert_xrcd:
+	idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+err_idr:
+	ib_dealloc_xrcd(xrcd);
+
+err:
+	put_uobj_write(uobj);
+
+err_table_mutex_unlock:
+	if (f)
+		fput(f);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return ret;
+}
+
+ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_close_xrc_domain cmd;
+	struct ib_uobject *uobj, *t_uobj;
+	struct ib_uxrcd_object *xrcd_uobj;
+	struct ib_xrcd *xrcd = NULL;
+	struct inode *inode = NULL;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle,
+			      file->ucontext);
+	if (!uobj) {
+		ret = -EINVAL;
+		goto err_unlock_mutex;
+	}
+
+	mutex_lock(&file->mutex);
+	list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) {
+		struct ib_qp *qp = t_uobj->object;
+		if (qp->xrcd && qp->xrcd == uobj->object) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+	if (!ret) {
+		list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) {
+			struct ib_srq *srq = t_uobj->object;
+			if (srq->ext.xrc.xrcd && srq->ext.xrc.xrcd == uobj->object) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+	}
+	mutex_unlock(&file->mutex);
+	if (ret) {
+		put_uobj_write(uobj);
+		goto err_unlock_mutex;
+	}
+
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) {
+		ret = -EBUSY;
+		put_uobj_write(uobj);
+		goto err_unlock_mutex;
+	}
+
+	xrcd = (struct ib_xrcd *) (uobj->object);
+	inode = xrcd->inode;
+
+	if (inode)
+		atomic_dec(&xrcd->usecnt);
+
+	ret = ib_dealloc_xrcd(uobj->object);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret && !inode)
+		goto err_unlock_mutex;
+
+	if (!ret && inode)
+		xrcd_table_delete(file->device->ib_dev, inode);
+
+	idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return in_len;
+
+err_unlock_mutex:
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+			    struct ib_xrcd *xrcd)
+{
+	struct inode *inode = NULL;
+	int ret = 0;
+
+	inode = xrcd->inode;
+	if (inode)
+		atomic_dec(&xrcd->usecnt);
+
+	ret = ib_dealloc_xrcd(xrcd);
+	if (!ret && inode)
+		xrcd_table_delete(ib_dev, inode);
+}
+
+ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file,
+				    const char __user *buf, int in_len,
+				    int out_len)
+{
+	struct ib_uverbs_create_xrc_rcv_qp	cmd;
+	struct ib_uverbs_create_xrc_rcv_qp_resp resp;
+	struct ib_uxrc_rcv_object      *obj;
+	struct ib_qp_init_attr		init_attr;
+	struct ib_xrcd		       *xrcd;
+	struct ib_uobject	       *uobj;
+	struct ib_uxrcd_object	       *xrcd_uobj;
+	u32				qp_num;
+	int				err;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	obj = kzalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler;
+	init_attr.qp_context	= file;
+	init_attr.srq		= NULL;
+	init_attr.sq_sig_type	=
+		cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+	init_attr.qp_type	= IB_QPT_XRC;
+	init_attr.xrcd	= xrcd;
+
+	init_attr.cap.max_send_wr	= 1;
+	init_attr.cap.max_recv_wr	= 0;
+	init_attr.cap.max_send_sge	= 1;
+	init_attr.cap.max_recv_sge	= 0;
+	init_attr.cap.max_inline_data	= 0;
+
+	err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num);
+	if (err)
+		goto err_put;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn = qp_num;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		err = -EFAULT;
+		goto err_destroy;
+	}
+
+	atomic_inc(&xrcd->usecnt);
+	put_xrcd_read(uobj);
+	obj->qp_num = qp_num;
+	obj->domain_handle = cmd.xrc_domain_handle;
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+	return in_len;
+
+err_destroy:
+	xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+err_put:
+	put_xrcd_read(uobj);
+err_out:
+	kfree(obj);
+	return err;
+}
+
+ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file,
+				    const char __user *buf, int in_len,
+				    int out_len)
+{
+	struct ib_uverbs_modify_xrc_rcv_qp      cmd;
+	struct ib_qp_attr	       *attr;
+	struct ib_xrcd		       *xrcd;
+	struct ib_uobject	       *uobj;
+	int				err;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	attr = kzalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		kfree(attr);
+		return -EINVAL;
+	}
+
+	attr->qp_state		  = cmd.qp_state;
+	attr->cur_qp_state	  = cmd.cur_qp_state;
+	attr->qp_access_flags	  = cmd.qp_access_flags;
+	attr->pkey_index	  = cmd.pkey_index;
+	attr->port_num		  = cmd.port_num;
+	attr->path_mtu		  = cmd.path_mtu;
+	attr->path_mig_state	  = cmd.path_mig_state;
+	attr->qkey		  = cmd.qkey;
+	attr->rq_psn		  = cmd.rq_psn;
+	attr->sq_psn		  = cmd.sq_psn;
+	attr->dest_qp_num	  = cmd.dest_qp_num;
+	attr->alt_pkey_index	  = cmd.alt_pkey_index;
+	attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+	attr->max_rd_atomic	  = cmd.max_rd_atomic;
+	attr->max_dest_rd_atomic  = cmd.max_dest_rd_atomic;
+	attr->min_rnr_timer	  = cmd.min_rnr_timer;
+	attr->timeout		  = cmd.timeout;
+	attr->retry_cnt		  = cmd.retry_cnt;
+	attr->rnr_retry		  = cmd.rnr_retry;
+	attr->alt_port_num	  = cmd.alt_port_num;
+	attr->alt_timeout	  = cmd.alt_timeout;
+
+	memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+	attr->ah_attr.grh.flow_label	    = cmd.dest.flow_label;
+	attr->ah_attr.grh.sgid_index	    = cmd.dest.sgid_index;
+	attr->ah_attr.grh.hop_limit	    = cmd.dest.hop_limit;
+	attr->ah_attr.grh.traffic_class	    = cmd.dest.traffic_class;
+	attr->ah_attr.dlid		    = cmd.dest.dlid;
+	attr->ah_attr.sl		    = cmd.dest.sl;
+	attr->ah_attr.src_path_bits	    = cmd.dest.src_path_bits;
+	attr->ah_attr.static_rate	    = cmd.dest.static_rate;
+	attr->ah_attr.ah_flags		    = cmd.dest.is_global ? IB_AH_GRH : 0;
+	attr->ah_attr.port_num		    = cmd.dest.port_num;
+
+	memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+	attr->alt_ah_attr.grh.flow_label    = cmd.alt_dest.flow_label;
+	attr->alt_ah_attr.grh.sgid_index    = cmd.alt_dest.sgid_index;
+	attr->alt_ah_attr.grh.hop_limit     = cmd.alt_dest.hop_limit;
+	attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+	attr->alt_ah_attr.dlid		    = cmd.alt_dest.dlid;
+	attr->alt_ah_attr.sl		    = cmd.alt_dest.sl;
+	attr->alt_ah_attr.src_path_bits	    = cmd.alt_dest.src_path_bits;
+	attr->alt_ah_attr.static_rate	    = cmd.alt_dest.static_rate;
+	attr->alt_ah_attr.ah_flags	    = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+	attr->alt_ah_attr.port_num	    = cmd.alt_dest.port_num;
+
+	err = xrcd->device->modify_xrc_rcv_qp(xrcd, cmd.qp_num, attr, cmd.attr_mask);
+	put_xrcd_read(uobj);
+	kfree(attr);
+	return err ? err : in_len;
+}
+
+ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_query_xrc_rcv_qp cmd;
+	struct ib_uverbs_query_qp_resp	 resp;
+	struct ib_qp_attr		*attr;
+	struct ib_qp_init_attr		*init_attr;
+	struct ib_xrcd			*xrcd;
+	struct ib_uobject		*uobj;
+	int				 ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	attr      = kmalloc(sizeof *attr, GFP_KERNEL);
+	init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = xrcd->device->query_xrc_rcv_qp(xrcd, cmd.qp_num, attr,
+					     cmd.attr_mask, init_attr);
+
+	put_xrcd_read(uobj);
+
+	if (ret)
+		goto out;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qp_state		    = attr->qp_state;
+	resp.cur_qp_state	    = attr->cur_qp_state;
+	resp.path_mtu		    = attr->path_mtu;
+	resp.path_mig_state	    = attr->path_mig_state;
+	resp.qkey		    = attr->qkey;
+	resp.rq_psn		    = attr->rq_psn;
+	resp.sq_psn		    = attr->sq_psn;
+	resp.dest_qp_num	    = attr->dest_qp_num;
+	resp.qp_access_flags	    = attr->qp_access_flags;
+	resp.pkey_index		    = attr->pkey_index;
+	resp.alt_pkey_index	    = attr->alt_pkey_index;
+	resp.sq_draining	    = attr->sq_draining;
+	resp.max_rd_atomic	    = attr->max_rd_atomic;
+	resp.max_dest_rd_atomic	    = attr->max_dest_rd_atomic;
+	resp.min_rnr_timer	    = attr->min_rnr_timer;
+	resp.port_num		    = attr->port_num;
+	resp.timeout		    = attr->timeout;
+	resp.retry_cnt		    = attr->retry_cnt;
+	resp.rnr_retry		    = attr->rnr_retry;
+	resp.alt_port_num	    = attr->alt_port_num;
+	resp.alt_timeout	    = attr->alt_timeout;
+
+	memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+	resp.dest.flow_label	    = attr->ah_attr.grh.flow_label;
+	resp.dest.sgid_index	    = attr->ah_attr.grh.sgid_index;
+	resp.dest.hop_limit	    = attr->ah_attr.grh.hop_limit;
+	resp.dest.traffic_class	    = attr->ah_attr.grh.traffic_class;
+	resp.dest.dlid		    = attr->ah_attr.dlid;
+	resp.dest.sl		    = attr->ah_attr.sl;
+	resp.dest.src_path_bits	    = attr->ah_attr.src_path_bits;
+	resp.dest.static_rate	    = attr->ah_attr.static_rate;
+	resp.dest.is_global	    = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+	resp.dest.port_num	    = attr->ah_attr.port_num;
+
+	memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+	resp.alt_dest.flow_label    = attr->alt_ah_attr.grh.flow_label;
+	resp.alt_dest.sgid_index    = attr->alt_ah_attr.grh.sgid_index;
+	resp.alt_dest.hop_limit     = attr->alt_ah_attr.grh.hop_limit;
+	resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+	resp.alt_dest.dlid	    = attr->alt_ah_attr.dlid;
+	resp.alt_dest.sl	    = attr->alt_ah_attr.sl;
+	resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+	resp.alt_dest.static_rate   = attr->alt_ah_attr.static_rate;
+	resp.alt_dest.is_global	    = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+	resp.alt_dest.port_num	    = attr->alt_ah_attr.port_num;
+
+	resp.max_send_wr	    = init_attr->cap.max_send_wr;
+	resp.max_recv_wr	    = init_attr->cap.max_recv_wr;
+	resp.max_send_sge	    = init_attr->cap.max_send_sge;
+	resp.max_recv_sge	    = init_attr->cap.max_recv_sge;
+	resp.max_inline_data	    = init_attr->cap.max_inline_data;
+	resp.sq_sig_all		    = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	kfree(attr);
+	kfree(init_attr);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 const char __user *buf, int in_len,
+				 int out_len)
+{
+	struct ib_uverbs_reg_xrc_rcv_qp  cmd;
+	struct ib_uxrc_rcv_object	*qp_obj, *tmp;
+	struct ib_xrcd			*xrcd;
+	struct ib_uobject		*uobj;
+	struct ib_uxrcd_object		*xrcd_uobj;
+	int				 ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL);
+	if (!qp_obj)
+		return -ENOMEM;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+	if (ret)
+		goto err_put;
+
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+		if (cmd.qp_num == tmp->qp_num) {
+			kfree(qp_obj);
+			mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+			put_xrcd_read(uobj);
+			return in_len;
+		}
+	qp_obj->qp_num = cmd.qp_num;
+	qp_obj->domain_handle = cmd.xrc_domain_handle;
+	list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	atomic_inc(&xrcd->usecnt);
+	put_xrcd_read(uobj);
+	return in_len;
+
+err_put:
+	put_xrcd_read(uobj);
+err_out:
+	kfree(qp_obj);
+	return ret;
+}
+
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 struct ib_xrcd *xrcd, u32 qp_num)
+{
+	int err;
+
+	err = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+	if (!err)
+		atomic_dec(&xrcd->usecnt);
+	return err;
+}
+
+ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_unreg_xrc_rcv_qp cmd;
+	struct ib_uxrc_rcv_object *qp_obj, *tmp;
+	struct ib_xrcd *xrcd;
+	struct ib_uobject *uobj;
+	struct ib_uxrcd_object *xrcd_uobj;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd)
+		return -EINVAL;
+
+	ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+	if (ret) {
+		put_xrcd_read(uobj);
+		return -EINVAL;
+	}
+	atomic_dec(&xrcd->usecnt);
+
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+		if (cmd.qp_num == qp_obj->qp_num) {
+			list_del(&qp_obj->list);
+			kfree(qp_obj);
+			break;
+		}
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	put_xrcd_read(uobj);
+	return in_len;
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/core/uverbs_main.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/uverbs_main.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/uverbs_main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1013 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/pci.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define INFINIBANDEVENTFS_MAGIC	0x49426576	/* "IBev" */
+
+enum {
+	IB_UVERBS_MAJOR       = 231,
+	IB_UVERBS_BASE_MINOR  = 192,
+	IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV	MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrc_domain_idr);
+
+static spinlock_t map_lock;
+static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     const char __user *buf, int in_len,
+				     int out_len) = {
+	[IB_USER_VERBS_CMD_GET_CONTEXT]   	= ib_uverbs_get_context,
+	[IB_USER_VERBS_CMD_QUERY_DEVICE]  	= ib_uverbs_query_device,
+	[IB_USER_VERBS_CMD_QUERY_PORT]    	= ib_uverbs_query_port,
+	[IB_USER_VERBS_CMD_ALLOC_PD]      	= ib_uverbs_alloc_pd,
+	[IB_USER_VERBS_CMD_DEALLOC_PD]    	= ib_uverbs_dealloc_pd,
+	[IB_USER_VERBS_CMD_REG_MR]        	= ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_DEREG_MR]      	= ib_uverbs_dereg_mr,
+	[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+	[IB_USER_VERBS_CMD_CREATE_CQ]     	= ib_uverbs_create_cq,
+	[IB_USER_VERBS_CMD_RESIZE_CQ]     	= ib_uverbs_resize_cq,
+	[IB_USER_VERBS_CMD_POLL_CQ]     	= ib_uverbs_poll_cq,
+	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]     	= ib_uverbs_req_notify_cq,
+	[IB_USER_VERBS_CMD_DESTROY_CQ]    	= ib_uverbs_destroy_cq,
+	[IB_USER_VERBS_CMD_CREATE_QP]     	= ib_uverbs_create_qp,
+	[IB_USER_VERBS_CMD_QUERY_QP]     	= ib_uverbs_query_qp,
+	[IB_USER_VERBS_CMD_MODIFY_QP]     	= ib_uverbs_modify_qp,
+	[IB_USER_VERBS_CMD_DESTROY_QP]    	= ib_uverbs_destroy_qp,
+	[IB_USER_VERBS_CMD_POST_SEND]    	= ib_uverbs_post_send,
+	[IB_USER_VERBS_CMD_POST_RECV]    	= ib_uverbs_post_recv,
+	[IB_USER_VERBS_CMD_POST_SRQ_RECV]    	= ib_uverbs_post_srq_recv,
+	[IB_USER_VERBS_CMD_CREATE_AH]    	= ib_uverbs_create_ah,
+	[IB_USER_VERBS_CMD_DESTROY_AH]    	= ib_uverbs_destroy_ah,
+	[IB_USER_VERBS_CMD_ATTACH_MCAST]  	= ib_uverbs_attach_mcast,
+	[IB_USER_VERBS_CMD_DETACH_MCAST]  	= ib_uverbs_detach_mcast,
+	[IB_USER_VERBS_CMD_CREATE_SRQ]    	= ib_uverbs_create_srq,
+	[IB_USER_VERBS_CMD_MODIFY_SRQ]    	= ib_uverbs_modify_srq,
+	[IB_USER_VERBS_CMD_QUERY_SRQ]     	= ib_uverbs_query_srq,
+	[IB_USER_VERBS_CMD_DESTROY_SRQ]   	= ib_uverbs_destroy_srq,
+	[IB_USER_VERBS_CMD_CREATE_XRC_SRQ]	= ib_uverbs_create_xrc_srq,
+	[IB_USER_VERBS_CMD_OPEN_XRCD]	        = ib_uverbs_open_xrc_domain,
+	[IB_USER_VERBS_CMD_CLOSE_XRCD]	        = ib_uverbs_close_xrc_domain,
+	[IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP]	= ib_uverbs_create_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP]	= ib_uverbs_modify_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP]	= ib_uverbs_query_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_REG_XRC_RCV_QP]	= ib_uverbs_reg_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP]	= ib_uverbs_unreg_xrc_rcv_qp,
+};
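+
+/*
+ * ib_uverbs_write() below indexes this table by the command number in
+ * the request header; a command is honored only if its entry here is
+ * non-NULL and the device sets the matching bit in uverbs_cmd_mask.
+ */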
+
+#ifdef __linux__
+/* BSD does not require a fake mountpoint for all files. */
+static struct vfsmount *uverbs_event_mnt;
+#endif
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+	struct ib_uverbs_device *dev =
+		container_of(ref, struct ib_uverbs_device, ref);
+
+	complete(&dev->comp);
+}
+
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+	struct ib_uverbs_event_file *file =
+		container_of(ref, struct ib_uverbs_event_file, ref);
+
+	kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			  struct ib_uverbs_event_file *ev_file,
+			  struct ib_ucq_object *uobj)
+{
+	struct ib_uverbs_event *evt, *tmp;
+
+	if (ev_file) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+			list_del(&evt->list);
+			kfree(evt);
+		}
+		spin_unlock_irq(&ev_file->lock);
+
+		kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+	}
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+		list_del(&evt->list);
+		kfree(evt);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj)
+{
+	struct ib_uverbs_event *evt, *tmp;
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+		list_del(&evt->list);
+		kfree(evt);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+				    struct ib_uqp_object *uobj)
+{
+	struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+	list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+		ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+		list_del(&mcast->list);
+		kfree(mcast);
+	}
+}
+
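+/*
+ * Tear down everything still attached to a closing context.  The
+ * order matters: AHs, QPs, SRQs and CQs are destroyed before MRs, XRC
+ * domains and finally PDs, so that each object is released before
+ * anything it holds a reference on.
+ */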
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+				      struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj, *tmp;
+
+	if (!context)
+		return 0;
+
+	context->closing = 1;
+
+	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+		struct ib_ah *ah = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+		ib_destroy_ah(ah);
+		kfree(uobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+		struct ib_qp *qp = uobj->object;
+		struct ib_uqp_object *uqp =
+			container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+		idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+		ib_uverbs_detach_umcast(qp, uqp);
+		ib_destroy_qp(qp);
+		ib_uverbs_release_uevent(file, &uqp->uevent);
+		kfree(uqp);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+		struct ib_srq *srq = uobj->object;
+		struct ib_uevent_object *uevent =
+			container_of(uobj, struct ib_uevent_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+		ib_destroy_srq(srq);
+		ib_uverbs_release_uevent(file, uevent);
+		kfree(uevent);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+		struct ib_cq *cq = uobj->object;
+		struct ib_uverbs_event_file *ev_file = cq->cq_context;
+		struct ib_ucq_object *ucq =
+			container_of(uobj, struct ib_ucq_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+		ib_destroy_cq(cq);
+		ib_uverbs_release_ucq(file, ev_file, ucq);
+		kfree(ucq);
+	}
+
+	/* XXX Free MWs */
+
+	list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+		struct ib_mr *mr = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+		ib_dereg_mr(mr);
+		kfree(uobj);
+	}
+
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
+		struct ib_xrcd *xrcd = uobj->object;
+		struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1;
+		struct ib_uxrcd_object *xrcd_uobj =
+			container_of(uobj, struct ib_uxrcd_object, uobject);
+
+		list_for_each_entry_safe(xrc_qp_obj, tmp1,
+					 &xrcd_uobj->xrc_reg_qp_list, list) {
+			list_del(&xrc_qp_obj->list);
+			ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd,
+						     xrc_qp_obj->qp_num);
+			kfree(xrc_qp_obj);
+		}
+
+		idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+		ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd);
+		kfree(uobj);
+	}
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+	list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+		struct ib_pd *pd = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+		ib_dealloc_pd(pd);
+		kfree(uobj);
+	}
+
+	return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+	struct ib_uverbs_file *file =
+		container_of(ref, struct ib_uverbs_file, ref);
+
+	module_put(file->device->ib_dev->owner);
+	kref_put(&file->device->ref, ib_uverbs_release_dev);
+
+	kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+				    size_t count, loff_t *pos)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+	struct ib_uverbs_event *event;
+	int eventsz;
+	int ret = 0;
+
+	spin_lock_irq(&file->lock);
+
+	while (list_empty(&file->event_list)) {
+		spin_unlock_irq(&file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list)))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&file->lock);
+	}
+
+	event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+	if (file->is_async)
+		eventsz = sizeof (struct ib_uverbs_async_event_desc);
+	else
+		eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+	if (eventsz > count) {
+		ret   = -EINVAL;
+		event = NULL;
+	} else {
+		list_del(file->event_list.next);
+		if (event->counter) {
+			++(*event->counter);
+			list_del(&event->obj_list);
+		}
+	}
+
+	spin_unlock_irq(&file->lock);
+
+	if (event) {
+		if (copy_to_user(buf, event, eventsz))
+			ret = -EFAULT;
+		else
+			ret = eventsz;
+	}
+
+	kfree(event);
+
+	return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+					 struct poll_table_struct *wait)
+{
+	unsigned int pollflags = 0;
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	file->filp = filp;
+	poll_wait(filp, &file->poll_wait, wait);
+
+	spin_lock_irq(&file->lock);
+	if (!list_empty(&file->event_list))
+		pollflags = POLLIN | POLLRDNORM;
+	spin_unlock_irq(&file->lock);
+
+	return pollflags;
+}
+
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	return fasync_helper(fd, filp, on, &file->async_queue);
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+	struct ib_uverbs_event *entry, *tmp;
+
+	spin_lock_irq(&file->lock);
+	file->is_closed = 1;
+	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
+		if (entry->counter)
+			list_del(&entry->obj_list);
+		kfree(entry);
+	}
+	spin_unlock_irq(&file->lock);
+
+	if (file->is_async) {
+		ib_unregister_event_handler(&file->uverbs_file->event_handler);
+		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+	}
+	kref_put(&file->ref, ib_uverbs_release_event_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read 	 = ib_uverbs_event_read,
+	.poll    = ib_uverbs_event_poll,
+	.release = ib_uverbs_event_close,
+	.fasync  = ib_uverbs_event_fasync
+};
+
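+/*
+ * The completion and async event handlers below may run in interrupt
+ * context, hence the irqsave locking and the GFP_ATOMIC allocations;
+ * if the allocation fails, the event is silently dropped.
+ */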
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct ib_uverbs_event_file    *file = cq_context;
+	struct ib_ucq_object	       *uobj;
+	struct ib_uverbs_event	       *entry;
+	unsigned long			flags;
+
+	if (!file)
+		return;
+
+	spin_lock_irqsave(&file->lock, flags);
+	if (file->is_closed) {
+		spin_unlock_irqrestore(&file->lock, flags);
+		return;
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry) {
+		spin_unlock_irqrestore(&file->lock, flags);
+		return;
+	}
+
+	uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	entry->desc.comp.cq_handle = cq->uobject->user_handle;
+	entry->counter		   = &uobj->comp_events_reported;
+
+	list_add_tail(&entry->list, &file->event_list);
+	list_add_tail(&entry->obj_list, &uobj->comp_list);
+	spin_unlock_irqrestore(&file->lock, flags);
+
+	wake_up_interruptible(&file->poll_wait);
+	if (file->filp)
+		selwakeup(&file->filp->f_selinfo);
+	kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+				    __u64 element, __u64 event,
+				    struct list_head *obj_list,
+				    u32 *counter)
+{
+	struct ib_uverbs_event *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&file->async_file->lock, flags);
+	if (file->async_file->is_closed) {
+		spin_unlock_irqrestore(&file->async_file->lock, flags);
+		return;
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry) {
+		spin_unlock_irqrestore(&file->async_file->lock, flags);
+		return;
+	}
+
+	entry->desc.async.element    = element;
+	entry->desc.async.event_type = event;
+	entry->counter               = counter;
+
+	list_add_tail(&entry->list, &file->async_file->event_list);
+	if (obj_list)
+		list_add_tail(&entry->obj_list, obj_list);
+	spin_unlock_irqrestore(&file->async_file->lock, flags);
+
+	wake_up_interruptible(&file->async_file->poll_wait);
+	if (file->async_file->filp)
+		selwakeup(&file->async_file->filp->f_selinfo);
+	kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+						  struct ib_ucq_object, uobject);
+
+	ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle,
+				event->event, &uobj->async_list,
+				&uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uobj;
+
+	uobj = container_of(event->element.qp->uobject,
+			    struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+				event->event, &uobj->event_list,
+				&uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uobj;
+
+	uobj = container_of(event->element.srq->uobject,
+			    struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+				event->event, &uobj->event_list,
+				&uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event)
+{
+	struct ib_uverbs_file *file =
+		container_of(handler, struct ib_uverbs_file, event_handler);
+
+	ib_uverbs_async_handler(file, event->element.port_num, event->event,
+				NULL, NULL);
+}
+
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+					void *context_ptr)
+{
+	ib_uverbs_async_handler(context_ptr, event->element.xrc_qp_num,
+				event->event, NULL, NULL);
+}
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async, int *fd)
+{
+	struct ib_uverbs_event_file *ev_file;
+	struct file *filp;
+	int ret;
+
+	ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+	if (!ev_file)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ev_file->ref);
+	spin_lock_init(&ev_file->lock);
+	INIT_LIST_HEAD(&ev_file->event_list);
+	init_waitqueue_head(&ev_file->poll_wait);
+	ev_file->uverbs_file = uverbs_file;
+	ev_file->async_queue = NULL;
+	ev_file->is_async    = is_async;
+	ev_file->is_closed   = 0;
+	ev_file->filp	     = NULL;
+
+	*fd = get_unused_fd();
+	if (*fd < 0) {
+		ret = *fd;
+		goto err;
+	}
+
+	/*
+	 * fops_get() can't fail here, because we're coming from a
+	 * system call on a uverbs file, which will already have a
+	 * module reference.
+	 */
+#ifdef __linux__
+	filp = alloc_file(uverbs_event_mnt, dget(uverbs_event_mnt->mnt_root),
+			  FMODE_READ, fops_get(&uverbs_event_fops));
+#else
+	filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops));
+#endif
+	if (!filp) {
+		ret = -ENFILE;
+		goto err_fd;
+	}
+
+	filp->private_data = ev_file;
+
+	return filp;
+
+err_fd:
+	put_unused_fd(*fd);
+
+err:
+	kfree(ev_file);
+	return ERR_PTR(ret);
+}
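+
+/*
+ * ib_uverbs_alloc_event_file() reserves *fd but does not install the
+ * file in the descriptor table; the caller is expected to fd_install()
+ * it only once nothing else can fail.
+ */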
+
+/*
+ * Look up a completion event file by FD.  If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+	struct ib_uverbs_event_file *ev_file = NULL;
+	struct file *filp;
+
+	filp = fget(fd);
+	if (!filp)
+		return NULL;
+
+	if (filp->f_op != &uverbs_event_fops)
+		goto out;
+
+	ev_file = filp->private_data;
+	if (ev_file->is_async) {
+		ev_file = NULL;
+		goto out;
+	}
+
+	kref_get(&ev_file->ref);
+
+out:
+	fput(filp);
+	return ev_file;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_cmd_hdr hdr;
+
+	if (count < sizeof hdr)
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof hdr))
+		return -EFAULT;
+
+	if (hdr.in_words * 4 != count)
+		return -EINVAL;
+
+	if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
+	    !uverbs_cmd_table[hdr.command]		||
+	    !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
+		return -EINVAL;
+
+	if (!file->ucontext &&
+	    hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
+		return -EINVAL;
+
+	return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr,
+					     hdr.in_words * 4, hdr.out_words * 4);
+}
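+
+/*
+ * For reference, a minimal sketch of the userspace side of the write()
+ * protocol handled above (illustrative only, not part of this change;
+ * cmd_fd and srq_handle are assumed to be supplied by the caller).
+ * The request is a struct ib_uverbs_cmd_hdr followed by the command
+ * body, with the response buffer passed by pointer inside the body:
+ *
+ *	struct {
+ *		struct ib_uverbs_cmd_hdr   hdr;
+ *		struct ib_uverbs_query_srq cmd;
+ *	} req;
+ *	struct ib_uverbs_query_srq_resp resp;
+ *
+ *	memset(&req, 0, sizeof req);
+ *	req.hdr.command    = IB_USER_VERBS_CMD_QUERY_SRQ;
+ *	req.hdr.in_words   = sizeof req / 4;
+ *	req.hdr.out_words  = sizeof resp / 4;
+ *	req.cmd.srq_handle = srq_handle;
+ *	req.cmd.response   = (uintptr_t) &resp;
+ *
+ *	if (write(cmd_fd, &req, sizeof req) != sizeof req)
+ *		return -1;
+ *
+ * The kernel side rejects the request unless in_words * 4 matches the
+ * byte count written and the device advertises the command in its
+ * uverbs_cmd_mask.
+ */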
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+
+	if (!file->ucontext)
+		return -ENODEV;
+	else
+		return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ *  - dev_table[] accesses are protected by map_lock, the
+ *    ib_uverbs_device structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - there is no ioctl method to race against;
+ *  - the device is added to dev_table[] as the last part of module
+ *    initialization, so the open method will either fail immediately
+ *    with -ENXIO, or all required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_device *dev;
+	struct ib_uverbs_file *file;
+	int ret;
+
+	spin_lock(&map_lock);
+	dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+	if (dev)
+		kref_get(&dev->ref);
+	spin_unlock(&map_lock);
+
+	if (!dev)
+		return -ENXIO;
+
+	if (!try_module_get(dev->ib_dev->owner)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file) {
+		ret = -ENOMEM;
+		goto err_module;
+	}
+
+	file->device	 = dev;
+	file->ucontext	 = NULL;
+	file->async_file = NULL;
+	kref_init(&file->ref);
+	mutex_init(&file->mutex);
+
+	filp->private_data = file;
+
+	return 0;
+
+err_module:
+	module_put(dev->ib_dev->owner);
+
+err:
+	kref_put(&dev->ref, ib_uverbs_release_dev);
+	return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+
+	ib_uverbs_cleanup_ucontext(file, file->ucontext);
+
+	if (file->async_file)
+		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+
+	kref_put(&file->ref, ib_uverbs_release_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,
+	.mmap    = ib_uverbs_mmap,
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static struct ib_client uverbs_client = {
+	.name   = "uverbs",
+	.add    = ib_uverbs_add_one,
+	.remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+				    struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static ssize_t
+show_dev_device(struct device *device, struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "0x%04x\n",
+	    ((struct pci_dev *)dev->ib_dev->dma_device)->device);
+}
+static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL);
+
+static ssize_t
+show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "0x%04x\n",
+	    ((struct pci_dev *)dev->ib_dev->dma_device)->vendor);
+}
+static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL);
+
+struct attribute *device_attrs[] = {
+	&dev_attr_device.attr,
+	&dev_attr_vendor.attr,
+	NULL
+};
+
+static struct attribute_group device_group = {
+	.name  = "device",
+	.attrs = device_attrs
+};
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev;
+
+	if (!device->alloc_ucontext)
+		return;
+
+	uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+	if (!uverbs_dev)
+		return;
+
+	kref_init(&uverbs_dev->ref);
+	init_completion(&uverbs_dev->comp);
+
+	spin_lock(&map_lock);
+	uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+	if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+		spin_unlock(&map_lock);
+		goto err;
+	}
+	set_bit(uverbs_dev->devnum, dev_map);
+	spin_unlock(&map_lock);
+
+	uverbs_dev->ib_dev           = device;
+	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+	uverbs_dev->cdev = cdev_alloc();
+	if (!uverbs_dev->cdev)
+		goto err;
+	uverbs_dev->cdev->owner = THIS_MODULE;
+	uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
+	if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+		goto err_cdev;
+
+	uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+					uverbs_dev->cdev->dev, uverbs_dev,
+					"uverbs%d", uverbs_dev->devnum);
+	if (IS_ERR(uverbs_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+		goto err_class;
+	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+		goto err_class;
+	if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group))
+		goto err_class;
+
+	spin_lock(&map_lock);
+	dev_table[uverbs_dev->devnum] = uverbs_dev;
+	spin_unlock(&map_lock);
+
+	ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+	return;
+
+err_class:
+	device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+
+err_cdev:
+	cdev_del(uverbs_dev->cdev);
+	clear_bit(uverbs_dev->devnum, dev_map);
+
+err:
+	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uverbs_dev->comp);
+	kfree(uverbs_dev);
+	return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+	if (!uverbs_dev)
+		return;
+
+	sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group);
+	dev_set_drvdata(uverbs_dev->dev, NULL);
+	device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+	cdev_del(uverbs_dev->cdev);
+
+	spin_lock(&map_lock);
+	dev_table[uverbs_dev->devnum] = NULL;
+	spin_unlock(&map_lock);
+
+	clear_bit(uverbs_dev->devnum, dev_map);
+
+	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uverbs_dev->comp);
+	kfree(uverbs_dev);
+}
+#ifdef __linux__
+static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
+			     INFINIBANDEVENTFS_MAGIC, mnt);
+}
+
+static struct file_system_type uverbs_event_fs = {
+	/* No owner field so module can be unloaded */
+	.name    = "infinibandeventfs",
+	.get_sb  = uverbs_event_get_sb,
+	.kill_sb = kill_litter_super
+};
+#endif
+
+static int __init ib_uverbs_init(void)
+{
+	int ret;
+
+	spin_lock_init(&map_lock);
+
+	ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+				     "infiniband_verbs");
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register device number\n");
+		goto out;
+	}
+
+	uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+	if (IS_ERR(uverbs_class)) {
+		ret = PTR_ERR(uverbs_class);
+		printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(uverbs_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+#ifdef __linux__
+	ret = register_filesystem(&uverbs_event_fs);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n");
+		goto out_class;
+	}
+
+	uverbs_event_mnt = kern_mount(&uverbs_event_fs);
+	if (IS_ERR(uverbs_event_mnt)) {
+		ret = PTR_ERR(uverbs_event_mnt);
+		printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n");
+		goto out_fs;
+	}
+#endif
+
+	ret = ib_register_client(&uverbs_client);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register client\n");
+		goto out_mnt;
+	}
+
+	return 0;
+
+out_mnt:
+#ifdef __linux__
+	mntput(uverbs_event_mnt);
+
+out_fs:
+	unregister_filesystem(&uverbs_event_fs);
+#endif
+
+out_class:
+	class_destroy(uverbs_class);
+
+out_chrdev:
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+	return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+	ib_unregister_client(&uverbs_client);
+#ifdef __linux__
+	mntput(uverbs_event_mnt);
+	unregister_filesystem(&uverbs_event_fs);
+#endif
+	class_destroy(uverbs_class);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	idr_destroy(&ib_uverbs_pd_idr);
+	idr_destroy(&ib_uverbs_mr_idr);
+	idr_destroy(&ib_uverbs_mw_idr);
+	idr_destroy(&ib_uverbs_ah_idr);
+	idr_destroy(&ib_uverbs_cq_idr);
+	idr_destroy(&ib_uverbs_qp_idr);
+	idr_destroy(&ib_uverbs_srq_idr);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/uverbs_main.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
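Note on the error handling in ib_uverbs_init() above: it uses the kernel's
usual goto-unwind idiom, where each failure label releases exactly the
resources acquired before the failing step, in reverse order (out_mnt ->
out_fs -> out_class -> out_chrdev). A minimal standalone sketch of the same
pattern, with purely illustrative resource names that are not part of the
driver:

    /* Hypothetical resources; only the unwind structure matters. */
    static int acquire_a(void) { return 0; }
    static int acquire_b(void) { return 0; }
    static void release_a(void) { }

    static int init_example(void)
    {
        int ret;

        ret = acquire_a();
        if (ret)
            goto out;           /* nothing acquired yet */

        ret = acquire_b();
        if (ret)
            goto out_a;         /* undo acquire_a() only */

        return 0;

    out_a:
        release_a();
    out:
        return ret;
    }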
Added: trunk/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/uverbs_marshall.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/uverbs_marshall.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_marshall.h>
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src)
+{
+	memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+	dst->grh.flow_label        = src->grh.flow_label;
+	dst->grh.sgid_index        = src->grh.sgid_index;
+	dst->grh.hop_limit         = src->grh.hop_limit;
+	dst->grh.traffic_class     = src->grh.traffic_class;
+	dst->dlid 	    	   = src->dlid;
+	dst->sl   	    	   = src->sl;
+	dst->src_path_bits 	   = src->src_path_bits;
+	dst->static_rate   	   = src->static_rate;
+	dst->is_global             = src->ah_flags & IB_AH_GRH ? 1 : 0;
+	dst->port_num 	    	   = src->port_num;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src)
+{
+	dst->cur_qp_state	= src->cur_qp_state;
+	dst->path_mtu		= src->path_mtu;
+	dst->path_mig_state	= src->path_mig_state;
+	dst->qkey		= src->qkey;
+	dst->rq_psn		= src->rq_psn;
+	dst->sq_psn		= src->sq_psn;
+	dst->dest_qp_num	= src->dest_qp_num;
+	dst->qp_access_flags	= src->qp_access_flags;
+
+	dst->max_send_wr	= src->cap.max_send_wr;
+	dst->max_recv_wr	= src->cap.max_recv_wr;
+	dst->max_send_sge	= src->cap.max_send_sge;
+	dst->max_recv_sge	= src->cap.max_recv_sge;
+	dst->max_inline_data	= src->cap.max_inline_data;
+
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+	dst->pkey_index		= src->pkey_index;
+	dst->alt_pkey_index	= src->alt_pkey_index;
+	dst->en_sqd_async_notify = src->en_sqd_async_notify;
+	dst->sq_draining	= src->sq_draining;
+	dst->max_rd_atomic	= src->max_rd_atomic;
+	dst->max_dest_rd_atomic	= src->max_dest_rd_atomic;
+	dst->min_rnr_timer	= src->min_rnr_timer;
+	dst->port_num		= src->port_num;
+	dst->timeout		= src->timeout;
+	dst->retry_cnt		= src->retry_cnt;
+	dst->rnr_retry		= src->rnr_retry;
+	dst->alt_port_num	= src->alt_port_num;
+	dst->alt_timeout	= src->alt_timeout;
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src)
+{
+	memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+	memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+
+	dst->dlid		= src->dlid;
+	dst->slid		= src->slid;
+	dst->raw_traffic	= src->raw_traffic;
+	dst->flow_label		= src->flow_label;
+	dst->hop_limit		= src->hop_limit;
+	dst->traffic_class	= src->traffic_class;
+	dst->reversible		= src->reversible;
+	dst->numb_path		= src->numb_path;
+	dst->pkey		= src->pkey;
+	dst->sl			= src->sl;
+	dst->mtu_selector	= src->mtu_selector;
+	dst->mtu		= src->mtu;
+	dst->rate_selector	= src->rate_selector;
+	dst->rate		= src->rate;
+	dst->packet_life_time	= src->packet_life_time;
+	dst->preference		= src->preference;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src)
+{
+	memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+	memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+	dst->dlid		= src->dlid;
+	dst->slid		= src->slid;
+	dst->raw_traffic	= src->raw_traffic;
+	dst->flow_label		= src->flow_label;
+	dst->hop_limit		= src->hop_limit;
+	dst->traffic_class	= src->traffic_class;
+	dst->reversible		= src->reversible;
+	dst->numb_path		= src->numb_path;
+	dst->pkey		= src->pkey;
+	dst->sl			= src->sl;
+	dst->mtu_selector	= src->mtu_selector;
+	dst->mtu		= src->mtu;
+	dst->rate_selector	= src->rate_selector;
+	dst->rate		= src->rate;
+	dst->packet_life_time	= src->packet_life_time;
+	dst->preference		= src->preference;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);


Property changes on: trunk/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
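The marshalling helpers above copy each attribute field by field into a
fixed-layout user-visible structure instead of memcpy'ing the whole kernel
struct, because kernel-side types may contain padding, pointers, or
architecture-dependent widths that must not leak across the user ABI. A
hedged sketch of the idea using made-up record types (not the real
ib_uverbs structures):

    #include <stdint.h>
    #include <string.h>

    struct k_rec {                /* kernel-side layout (illustrative) */
        unsigned long flow;       /* width varies by architecture */
        uint8_t gid[16];
    };

    struct u_rec {                /* fixed-layout user ABI mirror */
        uint32_t flow;
        uint8_t gid[16];
    };

    static void copy_rec_to_user(struct u_rec *dst, const struct k_rec *src)
    {
        dst->flow = (uint32_t)src->flow;   /* explicit, fixed width */
        memcpy(dst->gid, src->gid, sizeof(dst->gid));
    }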
Added: trunk/sys/ofed/drivers/infiniband/core/verbs.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/core/verbs.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/core/verbs.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/string.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+int ib_rate_to_mult(enum ib_rate rate)
+{
+	switch (rate) {
+	case IB_RATE_2_5_GBPS: return  1;
+	case IB_RATE_5_GBPS:   return  2;
+	case IB_RATE_10_GBPS:  return  4;
+	case IB_RATE_20_GBPS:  return  8;
+	case IB_RATE_30_GBPS:  return 12;
+	case IB_RATE_40_GBPS:  return 16;
+	case IB_RATE_60_GBPS:  return 24;
+	case IB_RATE_80_GBPS:  return 32;
+	case IB_RATE_120_GBPS: return 48;
+	default:	       return -1;
+	}
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+enum ib_rate mult_to_ib_rate(int mult)
+{
+	switch (mult) {
+	case 1:  return IB_RATE_2_5_GBPS;
+	case 2:  return IB_RATE_5_GBPS;
+	case 4:  return IB_RATE_10_GBPS;
+	case 8:  return IB_RATE_20_GBPS;
+	case 12: return IB_RATE_30_GBPS;
+	case 16: return IB_RATE_40_GBPS;
+	case 24: return IB_RATE_60_GBPS;
+	case 32: return IB_RATE_80_GBPS;
+	case 48: return IB_RATE_120_GBPS;
+	default: return IB_RATE_PORT_CURRENT;
+	}
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+	switch (node_type) {
+	case RDMA_NODE_IB_CA:
+	case RDMA_NODE_IB_SWITCH:
+	case RDMA_NODE_IB_ROUTER:
+		return RDMA_TRANSPORT_IB;
+	case RDMA_NODE_RNIC:
+		return RDMA_TRANSPORT_IWARP;
+	default:
+		BUG();
+		return 0;
+	}
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+	if (device->get_link_layer)
+		return device->get_link_layer(device, port_num);
+
+	switch (rdma_node_get_transport(device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		return IB_LINK_LAYER_INFINIBAND;
+	case RDMA_TRANSPORT_IWARP:
+		return IB_LINK_LAYER_ETHERNET;
+	default:
+		return IB_LINK_LAYER_UNSPECIFIED;
+	}
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+	struct ib_pd *pd;
+
+	pd = device->alloc_pd(device, NULL, NULL);
+
+	if (!IS_ERR(pd)) {
+		pd->device  = device;
+		pd->uobject = NULL;
+		atomic_set(&pd->usecnt, 0);
+	}
+
+	return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+	if (atomic_read(&pd->usecnt))
+		return -EBUSY;
+
+	return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct ib_ah *ah;
+
+	ah = pd->device->create_ah(pd, ah_attr);
+
+	if (!IS_ERR(ah)) {
+		ah->device  = pd->device;
+		ah->pd      = pd;
+		ah->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+	u32 flow_class;
+	u16 gid_index;
+	int ret;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = wc->slid;
+	ah_attr->sl = wc->sl;
+	ah_attr->src_path_bits = wc->dlid_path_bits;
+	ah_attr->port_num = port_num;
+
+	if (wc->wc_flags & IB_WC_GRH) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.dgid = grh->sgid;
+
+		ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+					 &gid_index);
+		if (ret)
+			return ret;
+
+		ah_attr->grh.sgid_index = (u8) gid_index;
+		flow_class = be32_to_cpu(grh->version_tclass_flow);
+		ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+		ah_attr->grh.hop_limit = 0xFF;
+		ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num)
+{
+	struct ib_ah_attr ah_attr;
+	int ret;
+
+	ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->modify_ah ?
+		ah->device->modify_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->query_ah ?
+		ah->device->query_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = ah->pd;
+	ret = ah->device->destroy_ah(ah);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *srq;
+
+	if (!pd->device->create_srq)
+		return ERR_PTR(-ENOSYS);
+
+	srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+	if (!IS_ERR(srq)) {
+		srq->device    	   = pd->device;
+		srq->pd        	   = pd;
+		srq->uobject       = NULL;
+		srq->event_handler = srq_init_attr->event_handler;
+		srq->srq_context   = srq_init_attr->srq_context;
+		srq->ext.xrc.cq = NULL;
+		srq->ext.xrc.xrcd = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&srq->usecnt, 0);
+	}
+
+	return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
+				 struct ib_cq *xrc_cq,
+				 struct ib_xrcd *xrcd,
+				 struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *srq;
+
+	if (!pd->device->create_xrc_srq)
+		return ERR_PTR(-ENOSYS);
+
+	srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL);
+
+	if (!IS_ERR(srq)) {
+		srq->device	   = pd->device;
+		srq->pd		   = pd;
+		srq->uobject	   = NULL;
+		srq->event_handler = srq_init_attr->event_handler;
+		srq->srq_context   = srq_init_attr->srq_context;
+		srq->ext.xrc.cq	   = xrc_cq;
+		srq->ext.xrc.xrcd	   = xrcd;
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&xrcd->usecnt);
+		atomic_inc(&xrc_cq->usecnt);
+		atomic_set(&srq->usecnt, 0);
+	}
+
+	return srq;
+}
+EXPORT_SYMBOL(ib_create_xrc_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+		  struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask)
+{
+	return srq->device->modify_srq ?
+		srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+		 struct ib_srq_attr *srq_attr)
+{
+	return srq->device->query_srq ?
+		srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+	struct ib_pd *pd;
+	struct ib_cq *xrc_cq;
+	struct ib_xrcd *xrcd;
+	int ret;
+
+	if (atomic_read(&srq->usecnt))
+		return -EBUSY;
+
+	pd = srq->pd;
+	xrc_cq = srq->ext.xrc.cq;
+	xrcd = srq->ext.xrc.xrcd;
+
+	ret = srq->device->destroy_srq(srq);
+	if (!ret) {
+		atomic_dec(&pd->usecnt);
+		if (xrc_cq)
+			atomic_dec(&xrc_cq->usecnt);
+		if (xrcd)
+			atomic_dec(&xrcd->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *qp;
+
+	qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+
+	if (!IS_ERR(qp)) {
+		qp->device     	  = pd->device;
+		qp->pd         	  = pd;
+		qp->send_cq    	  = qp_init_attr->send_cq;
+		qp->recv_cq    	  = qp_init_attr->recv_cq;
+		qp->srq	       	  = qp_init_attr->srq;
+		qp->uobject       = NULL;
+		qp->event_handler = qp_init_attr->event_handler;
+		qp->qp_context    = qp_init_attr->qp_context;
+		qp->qp_type	  = qp_init_attr->qp_type;
+		qp->xrcd	  = qp->qp_type == IB_QPT_XRC ?
+			qp_init_attr->xrcd : NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&qp_init_attr->send_cq->usecnt);
+		atomic_inc(&qp_init_attr->recv_cq->usecnt);
+		if (qp_init_attr->srq)
+			atomic_inc(&qp_init_attr->srq->usecnt);
+		if (qp->qp_type == IB_QPT_XRC)
+			atomic_inc(&qp->xrcd->usecnt);
+	}
+
+	return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {
+	int			valid;
+	enum ib_qp_attr_mask	req_param[IB_QPT_RAW_PACKET + 1];
+	enum ib_qp_attr_mask	opt_param[IB_QPT_RAW_PACKET + 1];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_RAW_PACKET] = IB_QP_PORT,
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+	},
+	[IB_QPS_INIT]  = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_RTR]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN),
+				[IB_QPT_RC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC] = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_RC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_XRC] = (IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX),
+				 [IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+			 }
+		}
+	},
+	[IB_QPS_RTR]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = IB_QP_SQ_PSN,
+				[IB_QPT_UC]  = IB_QP_SQ_PSN,
+				[IB_QPT_RC]  = (IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_XRC] = (IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_SMI] = IB_QP_SQ_PSN,
+				[IB_QPT_GSI] = IB_QP_SQ_PSN,
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_RC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_MIN_RNR_TIMER		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_XRC] = (IB_QP_CUR_STATE		|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_SMI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+			 }
+		}
+	},
+	[IB_QPS_RTS]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC] = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+			}
+		},
+	},
+	[IB_QPS_SQD]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC] = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC] = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_SQE]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 }
+	}
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+	enum ib_qp_attr_mask req_param, opt_param;
+
+	if (cur_state  < 0 || cur_state  > IB_QPS_ERR ||
+	    next_state < 0 || next_state > IB_QPS_ERR)
+		return 0;
+
+	if (mask & IB_QP_CUR_STATE  &&
+	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+		return 0;
+
+	if (!qp_state_table[cur_state][next_state].valid)
+		return 0;
+
+	req_param = qp_state_table[cur_state][next_state].req_param[type];
+	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+	if ((mask & req_param) != req_param)
+		return 0;
+
+	if (mask & ~(req_param | opt_param | IB_QP_STATE))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask)
+{
+	return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr)
+{
+	return qp->device->query_qp ?
+		qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+	struct ib_pd *pd;
+	struct ib_cq *scq, *rcq;
+	struct ib_srq *srq;
+	struct ib_xrcd *xrcd;
+	enum ib_qp_type	qp_type = qp->qp_type;
+	int ret;
+
+	pd  = qp->pd;
+	scq = qp->send_cq;
+	rcq = qp->recv_cq;
+	srq = qp->srq;
+	xrcd = qp->xrcd;
+
+	ret = qp->device->destroy_qp(qp);
+	if (!ret) {
+		atomic_dec(&pd->usecnt);
+		atomic_dec(&scq->usecnt);
+		atomic_dec(&rcq->usecnt);
+		if (srq)
+			atomic_dec(&srq->usecnt);
+		if (qp_type == IB_QPT_XRC)
+			atomic_dec(&xrcd->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector)
+{
+	struct ib_cq *cq;
+
+	cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+	if (!IS_ERR(cq)) {
+		cq->device        = device;
+		cq->uobject       = NULL;
+		cq->comp_handler  = comp_handler;
+		cq->event_handler = event_handler;
+		cq->cq_context    = cq_context;
+		atomic_set(&cq->usecnt, 0);
+	}
+
+	return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	return cq->device->modify_cq ?
+		cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+	if (atomic_read(&cq->usecnt))
+		return -EBUSY;
+
+	return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+	return cq->device->resize_cq ?
+		cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *mr;
+
+	mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->reg_phys_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+				     mr_access_flags, iova_start);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start)
+{
+	struct ib_pd *old_pd;
+	int ret;
+
+	if (!mr->device->rereg_phys_mr)
+		return -ENOSYS;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	old_pd = mr->pd;
+
+	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+					phys_buf_array, num_phys_buf,
+					mr_access_flags, iova_start);
+
+	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+		atomic_dec(&old_pd->usecnt);
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	return mr->device->query_mr ?
+		mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	pd = mr->pd;
+	ret = mr->device->dereg_mr(mr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+	if (!IS_ERR(page_list)) {
+		page_list->device = device;
+		page_list->max_page_list_len = max_page_list_len;
+	}
+
+	return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+	struct ib_mw *mw;
+
+	if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+	mw = pd->device->alloc_mw(pd);
+	if (!IS_ERR(mw)) {
+		mw->device  = pd->device;
+		mw->pd      = pd;
+		mw->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = mw->pd;
+	ret = mw->device->dealloc_mw(mw);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *fmr;
+
+	if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+	fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	if (!IS_ERR(fmr)) {
+		fmr->device = pd->device;
+		fmr->pd     = pd;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *fmr;
+
+	if (list_empty(fmr_list))
+		return 0;
+
+	fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+	return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = fmr->pd;
+	ret = fmr->device->dealloc_fmr(fmr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	if (!qp->device->attach_mcast)
+		return -ENOSYS;
+
+	switch (rdma_node_get_transport(qp->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (qp->qp_type == IB_QPT_RAW_PACKET) {
+			/* In raw Ethernet mgids the 63 MSBs should be 0 */
+			if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
+				return -EINVAL;
+		} else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+			return -EINVAL;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (qp->qp_type != IB_QPT_RAW_PACKET)
+			return -EINVAL;
+		break;
+	}
+	return qp->device->attach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	if (!qp->device->detach_mcast)
+		return -ENOSYS;
+
+	switch (rdma_node_get_transport(qp->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (qp->qp_type == IB_QPT_RAW_PACKET) {
+			/* In raw Ethernet mgids the 63 MSBs should be 0 */
+			if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
+				return -EINVAL;
+		} else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+			return -EINVAL;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (qp->qp_type != IB_QPT_RAW_PACKET)
+			return -EINVAL;
+		break;
+	}
+	return qp->device->detach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	if (atomic_read(&xrcd->usecnt))
+		return -EBUSY;
+
+	return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+	struct ib_xrcd *xrcd;
+
+	if (!device->alloc_xrcd)
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = device->alloc_xrcd(device, NULL, NULL);
+	if (!IS_ERR(xrcd)) {
+		xrcd->device = device;
+		xrcd->inode = NULL;
+		xrcd->uobject = NULL;
+		atomic_set(&xrcd->usecnt, 0);
+	}
+	return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+


Property changes on: trunk/sys/ofed/drivers/infiniband/core/verbs.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
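verbs.c above is the device-independent entry layer: each helper fills in
common bookkeeping (device and pd back-pointers, usecnt reference counts)
and dispatches through the provider's function pointer, returning -ENOSYS
when an optional method is left NULL. qp_state_table encodes the required
and optional attribute masks for every legal QP state transition, which
ib_modify_qp_is_ok() checks before a modify is allowed. A hedged sketch of
how a kernel consumer might drive the helpers added here (error handling
abbreviated; the CQE count and completion vector are arbitrary):

    static int example_setup(struct ib_device *device)
    {
        struct ib_pd *pd;
        struct ib_cq *cq;
        int ret = 0;

        pd = ib_alloc_pd(device);
        if (IS_ERR(pd))
            return PTR_ERR(pd);

        /* no handlers or context, 64 CQEs, completion vector 0 */
        cq = ib_create_cq(device, NULL, NULL, NULL, 64, 0);
        if (IS_ERR(cq)) {
            ret = PTR_ERR(cq);
            goto out_pd;
        }

        ib_destroy_cq(cq);
    out_pd:
        ib_dealloc_pd(pd);
        return ret;
    }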
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,7 @@
 config MLX4_INFINIBAND
 	tristate "Mellanox ConnectX HCA support"
+	depends on NETDEVICES && ETHERNET && PCI
+	select NET_VENDOR_MELLANOX
 	select MLX4_CORE
 	---help---
 	  This driver provides low-level InfiniBand support for


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,4 +1,31 @@
-obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
+# $FreeBSD$
+#.PATH:  ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4
+#.PATH:  ${.CURDIR}/../../../../include/linux
 
-mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
-mlx4_ib-y +=	wc.o
+.include <bsd.own.mk>
+
+KMOD    = mlx4ib
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h
+#SRCS+=  linux_compat.c linux_radix.c
+SRCS+=	ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c 
+SRCS+=	opt_inet.h opt_inet6.h
+
+#CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -I${.CURDIR}/../../../../include
+CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM
+
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	@echo "#define INET 1" > ${.TARGET}
+.endif
+
+.if ${MK_INET6_SUPPORT} != "no"
+opt_inet6.h:
+	@echo "#define INET6 1" > ${.TARGET}
+.endif
+.endif
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -30,25 +30,24 @@
  * SOFTWARE.
  */
 
-#include "mlx4_ib.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
 #include <rdma/ib_addr.h>
-#include <linux/inet.h>
-#include <linux/string.h>
 #include <rdma/ib_cache.h>
 
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "mlx4_ib.h"
+
 int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
 			u8 *mac, int *is_mcast, u8 port)
 {
-	struct mlx4_ib_iboe *iboe = &dev->iboe;
 	struct in6_addr in6;
 
 	*is_mcast = 0;
-	spin_lock(&iboe->lock);
-	if (!iboe->netdevs[port - 1]) {
-		spin_unlock(&iboe->lock);
-		return -EINVAL;
-	}
-	spin_unlock(&iboe->lock);
 
 	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6);
 	if (rdma_link_local_addr(&in6))
@@ -92,15 +91,15 @@
 }
 
 static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
-				   struct mlx4_ib_ah *ah)
+				    struct mlx4_ib_ah *ah)
 {
 	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
 	struct mlx4_dev *dev = ibdev->dev;
+	union ib_gid sgid;
 	u8 mac[6];
 	int err;
 	int is_mcast;
 	u16 vlan_tag;
-	union ib_gid sgid;
 
 	err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num);
 	if (err)
@@ -130,7 +129,7 @@
 		ah->av.ib.dlid = cpu_to_be16(0xc000);
 
 	memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
-	ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29);
 
 	return &ah->ibah;
 }
@@ -147,25 +146,24 @@
 	if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) {
 		if (!(ah_attr->ah_flags & IB_AH_GRH)) {
 			ret = ERR_PTR(-EINVAL);
-			goto out;
 		} else {
-			/* TBD: need to handle the case when we get called
-			in an atomic context and there we might sleep. We
-			don't expect this currently since we're working with
-			link local addresses which we can translate without
-			going to sleep */
+			/*
+			 * TBD: need to handle the case where we get
+			 * called in an atomic context and might
+			 * sleep.  We don't expect this currently
+			 * since we're working with link-local
+			 * addresses which we can translate
+			 * without going to sleep.
+			 */
 			ret = create_iboe_ah(pd, ah_attr, ah);
-			if (IS_ERR(ret))
-				goto out;
-			else
-				return ret;
 		}
+
+		if (IS_ERR(ret))
+			kfree(ah);
+
+		return ret;
 	} else
 		return create_ib_ah(pd, ah_attr, ah); /* never fails */
-
-out:
-	kfree(ah);
-	return ret;
 }
 
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
@@ -202,4 +200,3 @@
 	kfree(to_mah(ah));
 	return 0;
 }
-


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
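The functional change in ah.c above is the service-level shift for
Ethernet address handles: sl_tclass_flowlabel now carries the SL at bit 29
instead of bit 28. A hedged reading: for IBoE the SL maps to a 3-bit
802.1Q priority field in bits 31:29, whereas the InfiniBand AV format uses
a 4-bit SL in bits 31:28. An illustrative sketch of the two packings (the
helper names are invented, not driver functions):

    #include <stdint.h>

    static uint32_t pack_eth_sl(uint32_t sl)
    {
        return (sl & 0x7) << 29;    /* 3-bit priority, bits 31:29 */
    }

    static uint32_t pack_ib_sl(uint32_t sl)
    {
        return (sl & 0xf) << 28;    /* 4-bit SL, bits 31:28 */
    }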
Added: trunk/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/***********************************************************/
+/* This file supports handling of the Alias GUID feature.  */
+/***********************************************************/
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_pack.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/delay.h>
+#include "mlx4_ib.h"
+
+/*
+ * The driver keeps the current state of all GUIDs as they are in the HW.
+ * Whenever an SMP MAD GUIDInfo record is received, the data is cached.
+ */
+
+struct mlx4_alias_guid_work_context {
+	u8 port;
+	struct mlx4_ib_dev     *dev;
+	struct ib_sa_query     *sa_query;
+	struct completion	done;
+	int			query_id;
+	struct list_head	list;
+	int			block_num;
+};
+
+struct mlx4_next_alias_guid_work {
+	u8 port;
+	u8 block_num;
+	struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
+					 u8 port_num, u8 *p_data)
+{
+	int i;
+	u64 guid_indexes;
+	int slave_id;
+	int port_index = port_num - 1;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+
+	guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+				   ports_guid[port_num - 1].
+				   all_rec_per_port[block_num].guid_indexes);
+	pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes);
+
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		/* The location of the specific index runs from bit 4
+		 * through bit 11 */
+		if (test_bit(i + 4, (unsigned long *)&guid_indexes)) {
+			slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i;
+			if (slave_id >= dev->dev->num_slaves) {
+				pr_debug("The last slave: %d\n", slave_id);
+				return;
+			}
+
+			/* cache the guid: */
+			memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id],
+			       &p_data[i * GUID_REC_SIZE],
+			       GUID_REC_SIZE);
+		} else
+			pr_debug("Guid number: %d in block: %d"
+				 " was not updated\n", i, block_num);
+	}
+}
+
+static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index)
+{
+	if (index >= NUM_ALIAS_GUID_PER_PORT) {
+		pr_err("%s: ERROR: asked for index:%d\n", __func__, index);
+		return (__force __be64) -1;
+	}
+	return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index];
+}
+
+
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
+{
+	return IB_SA_COMP_MASK(4 + index);
+}
+
+/*
+ * Whenever a GUID is set or unset (a guid table change), create an event
+ * and notify the relevant slave (the master should also be notified).
+ * If the GUID value is not what we have in the cache, the slave is not
+ * updated; in that case it waits for smp_snoop or the port management
+ * event to call the function and update the slave.
+ * block_num - the index of the block (16 blocks available)
+ * port_num - 1 or 2
+ */
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+					  int block_num, u8 port_num,
+					  u8 *p_data)
+{
+	int i;
+	u64 guid_indexes;
+	int slave_id;
+	enum slave_port_state new_state;
+	enum slave_port_state prev_state;
+	__be64 tmp_cur_ag, form_cache_ag;
+	enum slave_port_gen_event gen_event;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+
+	guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+				   ports_guid[port_num - 1].
+				   all_rec_per_port[block_num].guid_indexes);
+	pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes);
+
+	/*calculate the slaves and notify them*/
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		/* the location of the specific index runs from bits 4..11 */
+		if (!(test_bit(i + 4, (unsigned long *)&guid_indexes)))
+			continue;
+
+		slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i;
+		if (slave_id >= dev->dev->num_slaves)
+			return;
+		tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
+		form_cache_ag = get_cached_alias_guid(dev, port_num,
+					(NUM_ALIAS_GUID_IN_REC * block_num) + i);
+		/*
+		 * Check whether the guid is the same as in the cache.
+		 * If it differs, wait for smp_snoop or the port mgmt
+		 * change event to update the slave on its port state change.
+		 */
+		if (tmp_cur_ag != form_cache_ag)
+			continue;
+		mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+
+		/* Two cases: valid GUID and invalid GUID */
+
+		if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
+			prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num);
+			new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+								  MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+								  &gen_event);
+			pr_debug("slave: %d, port: %d prev_port_state: %d,"
+				 " new_port_state: %d, gen_event: %d\n",
+				 slave_id, port_num, prev_state, new_state, gen_event);
+			if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
+				pr_debug("sending PORT_UP event to slave: %d, port: %d\n",
+					 slave_id, port_num);
+				mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
+							       port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
+			}
+		} else { /* request to invalidate GUID */
+			set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+						      MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+						      &gen_event);
+			pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+				 slave_id, port_num);
+			mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num,
+						       MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+		}
+	}
+}
+
+static void aliasguid_query_handler(int status,
+				    struct ib_sa_guidinfo_rec *guid_rec,
+				    void *context)
+{
+	struct mlx4_ib_dev *dev;
+	struct mlx4_alias_guid_work_context *cb_ctx = context;
+	u8 port_index;
+	int i;
+	struct mlx4_sriov_alias_guid_info_rec_det *rec;
+	unsigned long flags, flags1;
+
+	if (!context)
+		return;
+
+	dev = cb_ctx->dev;
+	port_index = cb_ctx->port - 1;
+	rec = &dev->sriov.alias_guid.ports_guid[port_index].
+		all_rec_per_port[cb_ctx->block_num];
+
+	if (status) {
+		rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+		pr_debug("(port: %d) failed: status = %d\n",
+			 cb_ctx->port, status);
+		goto out;
+	}
+
+	if (guid_rec->block_num != cb_ctx->block_num) {
+		pr_err("block num mismatch: %d != %d\n",
+		       cb_ctx->block_num, guid_rec->block_num);
+		goto out;
+	}
+
+	pr_debug("lid/port: %d/%d, block_num: %d\n",
+		 be16_to_cpu(guid_rec->lid), cb_ctx->port,
+		 guid_rec->block_num);
+
+	rec = &dev->sriov.alias_guid.ports_guid[port_index].
+		all_rec_per_port[guid_rec->block_num];
+
+	rec->status = MLX4_GUID_INFO_STATUS_SET;
+	rec->method = MLX4_GUID_INFO_RECORD_SET;
+
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		__be64 tmp_cur_ag;
+		tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
+		/* Check whether the SM declined to assign one of the records.
+		 * If it did, and the request was not from the sysadmin,
+		 * ask the SM for a new GUID (instead of the driver's request).
+		 */
+		if (tmp_cur_ag == MLX4_NOT_SET_GUID) {
+			mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in "
+				     "block_num: %d was declined by SM, "
+				     "ownership by %d (0 = driver, 1=sysAdmin,"
+				     " 2=None)\n", __func__, i,
+				     guid_rec->block_num, rec->ownership);
+			if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) {
+				/* if it is driver assign, asks for new GUID from SM*/
+				*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+					MLX4_NOT_SET_GUID;
+
+				/* Mark the record as not assigned, and let it
+				 * be sent again in the next scheduled work. */
+				rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+				rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+			}
+		} else {
+			/* Properly assigned record. */
+			/* We save the GUID we just got from the SM in the
+			 * admin_guid so it persists; the next request to the
+			 * SM will ask for the same GUID. */
+			if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN &&
+			    tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) {
+				/* the sysadmin assignment failed.*/
+				mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+					     " admin guid after SysAdmin "
+					     "configuration. "
+					     "Record num %d in block_num:%d "
+					     "was declined by SM, "
+					     "new val(0x%llx) was kept\n",
+					      __func__, i,
+					     guid_rec->block_num,
+					     (long long)be64_to_cpu(*(__be64 *) &
+							 rec->all_recs[i * GUID_REC_SIZE]));
+			} else {
+				memcpy(&rec->all_recs[i * GUID_REC_SIZE],
+				       &guid_rec->guid_info_list[i * GUID_REC_SIZE],
+				       GUID_REC_SIZE);
+			}
+		}
+	}
+	/*
+	 * The function is called here to cover the cases where the SM
+	 * doesn't send an SMP; the driver then notifies the slave from
+	 * the SA response.
+	 */
+	mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num,
+					     cb_ctx->port,
+					     guid_rec->guid_info_list);
+out:
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	if (!dev->sriov.is_going_down)
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
+				   &dev->sriov.alias_guid.ports_guid[port_index].
+				   alias_guid_work, 0);
+	if (cb_ctx->sa_query) {
+		list_del(&cb_ctx->list);
+		kfree(cb_ctx);
+	} else
+		complete(&cb_ctx->done);
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index)
+{
+	int i;
+	u64 cur_admin_val;
+	ib_sa_comp_mask comp_mask = 0;
+
+	dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
+		= MLX4_GUID_INFO_STATUS_IDLE;
+	dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method
+		= MLX4_GUID_INFO_RECORD_SET;
+
+	/* calculate the comp_mask for that record.*/
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		cur_admin_val =
+			*(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+			all_rec_per_port[index].all_recs[GUID_REC_SIZE * i];
+		/*
+		 * Check the admin value: if it is marked for delete (~00LL),
+		 * or it is the first guid of the first record (the hw guid),
+		 * or the record is not owned by the sysadmin and the SM does
+		 * not need to assign GUIDs, don't put it up for assignment.
+		 */
+		if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
+		    (!index && !i) ||
+		    MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
+		    ports_guid[port - 1].all_rec_per_port[index].ownership)
+			continue;
+		comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+	}
+	dev->sriov.alias_guid.ports_guid[port - 1].
+		all_rec_per_port[index].guid_indexes = comp_mask;
+}
+
+static int set_guid_rec(struct ib_device *ibdev,
+			u8 port, int index,
+			struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+	int err;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_guidinfo_rec guid_info_rec;
+	ib_sa_comp_mask comp_mask;
+	struct ib_port_attr attr;
+	struct mlx4_alias_guid_work_context *callback_context;
+	unsigned long resched_delay, flags, flags1;
+	struct list_head *head =
+		&dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
+
+	err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
+	if (err) {
+		pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n",
+			 err, port);
+		return err;
+	}
+	/* Check that the port was configured by the SM; otherwise no need to send */
+	if (attr.state != IB_PORT_ACTIVE) {
+		pr_debug("port %d not active...rescheduling\n", port);
+		resched_delay = 5 * HZ;
+		err = -EAGAIN;
+		goto new_schedule;
+	}
+
+	callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL);
+	if (!callback_context) {
+		err = -ENOMEM;
+		resched_delay = HZ * 5;
+		goto new_schedule;
+	}
+	callback_context->port = port;
+	callback_context->dev = dev;
+	callback_context->block_num = index;
+
+	memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
+
+	guid_info_rec.lid = cpu_to_be16(attr.lid);
+	guid_info_rec.block_num = index;
+
+	memcpy(guid_info_rec.guid_info_list, rec_det->all_recs,
+	       GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC);
+	comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM |
+		rec_det->guid_indexes;
+
+	init_completion(&callback_context->done);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	list_add_tail(&callback_context->list, head);
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+	callback_context->query_id =
+		ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
+					  ibdev, port, &guid_info_rec,
+					  comp_mask, rec_det->method, 1000,
+					  GFP_KERNEL, aliasguid_query_handler,
+					  callback_context,
+					  &callback_context->sa_query);
+	if (callback_context->query_id < 0) {
+		pr_debug("ib_sa_guid_info_rec_query failed, query_id: "
+			 "%d. will reschedule to the next 1 sec.\n",
+			 callback_context->query_id);
+		spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+		list_del(&callback_context->list);
+		kfree(callback_context);
+		spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+		resched_delay = 1 * HZ;
+		err = -EAGAIN;
+		goto new_schedule;
+	}
+	err = 0;
+	goto out;
+
+new_schedule:
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	invalidate_guid_record(dev, port, index);
+	if (!dev->sriov.is_going_down) {
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+				   &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+				   resched_delay);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+
+out:
+	return err;
+}
+
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
+{
+	int i;
+	unsigned long flags, flags1;
+
+	pr_debug("port %d\n", port);
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
+		invalidate_guid_record(dev, port, i);
+
+	if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) {
+		/*
+		 * Make sure no work waits in the queue.  If the work is
+		 * already queued (not on the timer), the cancel will fail;
+		 * that is not a problem because we just want the work started.
+		 */
+		cancel_delayed_work(&dev->sriov.alias_guid.
+				      ports_guid[port - 1].alias_guid_work);
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+				   &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+				   0);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+				     struct mlx4_next_alias_guid_work *rec)
+{
+	int j;
+	unsigned long flags;
+
+	for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+		spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+		if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status ==
+		    MLX4_GUID_INFO_STATUS_IDLE) {
+			memcpy(&rec->rec_det,
+			       &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j],
+			       sizeof (struct mlx4_sriov_alias_guid_info_rec_det));
+			rec->port = port;
+			rec->block_num = j;
+			dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status =
+				MLX4_GUID_INFO_STATUS_PENDING;
+			spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+			return 0;
+		}
+		spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+	}
+	return -ENOENT;
+}
+
+static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port,
+					     int rec_index,
+					     struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+	dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes =
+		rec_det->guid_indexes;
+	memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs,
+	       rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+	dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status =
+		rec_det->status;
+}
+
+static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port)
+{
+	int j;
+	struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+
+	for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) {
+		memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+		rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) |
+			IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 |
+			IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 |
+			IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 |
+			IB_SA_GUIDINFO_REC_GID7;
+		rec_det.status = MLX4_GUID_INFO_STATUS_IDLE;
+		set_administratively_guid_record(dev, port, j, &rec_det);
+	}
+}
+
+static void alias_guid_work(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	int ret = 0;
+	struct mlx4_next_alias_guid_work *rec;
+	struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port =
+		container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det,
+			     alias_guid_work);
+	struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent;
+	struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid,
+						struct mlx4_ib_sriov,
+						alias_guid);
+	struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov);
+
+	rec = kzalloc(sizeof *rec, GFP_KERNEL);
+	if (!rec) {
+		pr_err("alias_guid_work: No Memory\n");
+		return;
+	}
+
+	pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1);
+	ret = get_next_record_to_update(dev, sriov_alias_port->port, rec);
+	if (ret) {
+		pr_debug("No more records to update.\n");
+		goto out;
+	}
+
+	set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num,
+		     &rec->rec_det);
+
+out:
+	kfree(rec);
+}
+
+
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
+{
+	unsigned long flags, flags1;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	if (!dev->sriov.is_going_down) {
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
+			   &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+	int i;
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct mlx4_alias_guid_work_context *cb_ctx;
+	struct mlx4_sriov_alias_guid_port_rec_det *det;
+	struct ib_sa_query *sa_query;
+	unsigned long flags;
+
+	for (i = 0 ; i < dev->num_ports; i++) {
+		cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
+		det = &sriov->alias_guid.ports_guid[i];
+		spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+		while (!list_empty(&det->cb_list)) {
+			cb_ctx = list_entry(det->cb_list.next,
+					    struct mlx4_alias_guid_work_context,
+					    list);
+			sa_query = cb_ctx->sa_query;
+			cb_ctx->sa_query = NULL;
+			list_del(&cb_ctx->list);
+			spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+			ib_sa_cancel_query(cb_ctx->query_id, sa_query);
+			wait_for_completion(&cb_ctx->done);
+			kfree(cb_ctx);
+			spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+		}
+		spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+	}
+	for (i = 0 ; i < dev->num_ports; i++) {
+		flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+		destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+	}
+	ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+	kfree(dev->sriov.alias_guid.sa_client);
+}
+
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+	char alias_wq_name[15];
+	int ret = 0;
+	int i, j, k;
+	union ib_gid gid;
+
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+	dev->sriov.alias_guid.sa_client =
+		kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL);
+	if (!dev->sriov.alias_guid.sa_client)
+		return -ENOMEM;
+
+	ib_sa_register_client(dev->sriov.alias_guid.sa_client);
+
+	spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
+
+	for (i = 1; i <= dev->num_ports; ++i) {
+		if (dev->ib_dev.query_gid(&dev->ib_dev, i, 0, &gid)) {
+			ret = -EFAULT;
+			goto err_unregister;
+		}
+	}
+
+	for (i = 0 ; i < dev->num_ports; i++) {
+		memset(&dev->sriov.alias_guid.ports_guid[i], 0,
+		       sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
+		/* Check if the SM doesn't need to assign the GUIDs */
+		for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+			if (mlx4_ib_sm_guid_assign) {
+				dev->sriov.alias_guid.ports_guid[i].
+					all_rec_per_port[j].
+					ownership = MLX4_GUID_DRIVER_ASSIGN;
+				continue;
+			}
+			dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j].
+					ownership = MLX4_GUID_NONE_ASSIGN;
+			/* mark each value as deleted until the sysadmin
+			 * assigns it a valid value */
+			for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+				*(__be64 *)&dev->sriov.alias_guid.ports_guid[i].
+					all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] =
+						cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+			}
+		}
+		INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
+		/* prepare the records; set them to be allocated by the SM */
+		for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
+			invalidate_guid_record(dev, i + 1, j);
+
+		dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
+		dev->sriov.alias_guid.ports_guid[i].port  = i;
+		if (mlx4_ib_sm_guid_assign)
+			set_all_slaves_guids(dev, i);
+
+		snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
+		dev->sriov.alias_guid.ports_guid[i].wq =
+			create_singlethread_workqueue(alias_wq_name);
+		if (!dev->sriov.alias_guid.ports_guid[i].wq) {
+			ret = -ENOMEM;
+			goto err_thread;
+		}
+		INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work,
+			  alias_guid_work);
+	}
+	return 0;
+
+err_thread:
+	for (--i; i >= 0; i--) {
+		destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+		dev->sriov.alias_guid.ports_guid[i].wq = NULL;
+	}
+
+err_unregister:
+	ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+	kfree(dev->sriov.alias_guid.sa_client);
+	dev->sriov.alias_guid.sa_client = NULL;
+	pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret);
+	return ret;
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
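
The alias GUID machinery added above is driven by a delayed work item:
records sit in MLX4_GUID_INFO_STATUS_IDLE, get_next_record_to_update()
claims one and flips it to PENDING under ag_work_lock, and a failed SA
query invalidates the record and requeues the work one second later. The
following standalone C sketch models just that claim/retry loop; every
name in it (guid_rec, pick_next_idle, sa_set_guid) is illustrative, not
part of the driver.

#include <stdio.h>

#define NUM_RECS 4

enum rec_status { REC_IDLE, REC_PENDING, REC_SET };

struct guid_rec {
	int index;
	enum rec_status status;
};

/* Claim the first IDLE record and flip it to PENDING; -1 when none left. */
static int pick_next_idle(struct guid_rec recs[], int n)
{
	for (int i = 0; i < n; i++) {
		if (recs[i].status == REC_IDLE) {
			recs[i].status = REC_PENDING;
			return i;
		}
	}
	return -1;
}

/* Stand-in for the SA set-GUID query: fails once, on record 2. */
static int sa_set_guid(struct guid_rec *rec)
{
	static int failed_once;

	if (rec->index == 2 && !failed_once) {
		failed_once = 1;
		return -1;
	}
	rec->status = REC_SET;
	return 0;
}

int main(void)
{
	struct guid_rec recs[NUM_RECS];

	for (int i = 0; i < NUM_RECS; i++)
		recs[i] = (struct guid_rec){ .index = i, .status = REC_IDLE };

	/* Each iteration stands in for one run of the delayed work item. */
	for (int pass = 0; ; pass++) {
		int i = pick_next_idle(recs, NUM_RECS);

		if (i < 0) {
			printf("pass %d: no more records to update\n", pass);
			break;
		}
		if (sa_set_guid(&recs[i]) < 0) {
			/* failure: invalidate and "reschedule" (next pass) */
			recs[i].status = REC_IDLE;
			printf("pass %d: record %d failed, rescheduled\n", pass, i);
		} else {
			printf("pass %d: record %d configured\n", pass, i);
		}
	}
	return 0;
}
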
Added: trunk/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/cm.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/cm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT  (5 * HZ)
+
+struct id_map_entry {
+	struct rb_node node;
+
+	u32 sl_cm_id;
+	u32 pv_cm_id;
+	int slave_id;
+	int scheduled_delete;
+	struct mlx4_ib_dev *dev;
+
+	struct list_head list;
+	struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+};
+
+struct cm_req_msg {
+	unsigned char unused[0x60];
+	union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+	msg->local_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+	return be32_to_cpu(msg->local_comm_id);
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+	msg->remote_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+	struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+	return be32_to_cpu(msg->remote_comm_id);
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+	struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+	return msg->primary_path_sgid;
+}
+
+/* The caller must hold the lock */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+	struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+	struct rb_node *node = sl_id_map->rb_node;
+
+	while (node) {
+		struct id_map_entry *id_map_entry =
+			rb_entry(node, struct id_map_entry, node);
+
+		if (id_map_entry->sl_cm_id > sl_cm_id)
+			node = node->rb_left;
+		else if (id_map_entry->sl_cm_id < sl_cm_id)
+			node = node->rb_right;
+		else if (id_map_entry->slave_id > slave_id)
+			node = node->rb_left;
+		else if (id_map_entry->slave_id < slave_id)
+			node = node->rb_right;
+		else
+			return id_map_entry;
+	}
+	return NULL;
+}
+
+static void id_map_ent_timeout(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
+	struct id_map_entry *db_ent, *found_ent;
+	struct mlx4_ib_dev *dev = ent->dev;
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	int pv_id = (int) ent->pv_cm_id;
+
+	spin_lock(&sriov->id_map_lock);
+	db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
+	if (!db_ent)
+		goto out;
+	found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
+	if (found_ent && found_ent == ent)
+		rb_erase(&found_ent->node, sl_id_map);
+	idr_remove(&sriov->pv_id_table, pv_id);
+
+out:
+	list_del(&ent->list);
+	spin_unlock(&sriov->id_map_lock);
+	kfree(ent);
+}
+
+static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id)
+{
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	struct id_map_entry *ent, *found_ent;
+
+	spin_lock(&sriov->id_map_lock);
+	ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+	if (!ent)
+		goto out;
+	found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
+	if (found_ent && found_ent == ent)
+		rb_erase(&found_ent->node, sl_id_map);
+	idr_remove(&sriov->pv_id_table, pv_cm_id);
+out:
+	spin_unlock(&sriov->id_map_lock);
+}
+
+static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new)
+{
+	struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+	struct rb_node **link = &sl_id_map->rb_node, *parent = NULL;
+	struct id_map_entry *ent;
+	int slave_id = new->slave_id;
+	int sl_cm_id = new->sl_cm_id;
+
+	ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+	if (ent) {
+		pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n",
+			 sl_cm_id);
+
+		rb_replace_node(&ent->node, &new->node, sl_id_map);
+		return;
+	}
+
+	/* Go to the bottom of the tree */
+	while (*link) {
+		parent = *link;
+		ent = rb_entry(parent, struct id_map_entry, node);
+
+		if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id))
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, sl_id_map);
+}
+
+static struct id_map_entry *
+id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
+{
+	int ret, id;
+	static int next_id;
+	struct id_map_entry *ent;
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+	ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL);
+	if (!ent) {
+		mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ent->sl_cm_id = sl_cm_id;
+	ent->slave_id = slave_id;
+	ent->scheduled_delete = 0;
+	ent->dev = to_mdev(ibdev);
+	INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
+
+	do {
+		spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
+		ret = idr_get_new_above(&sriov->pv_id_table, ent,
+					next_id, &id);
+		if (!ret) {
+			next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
+			ent->pv_cm_id = (u32)id;
+			sl_id_map_add(ibdev, ent);
+		}
+
+		spin_unlock(&sriov->id_map_lock);
+	} while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL));
+	/* idr_get_new_above() can return -ENOSPC; don't insert in that case. */
+	if (!ret) {
+		spin_lock(&sriov->id_map_lock);
+		list_add_tail(&ent->list, &sriov->cm_list);
+		spin_unlock(&sriov->id_map_lock);
+		return ent;
+	}
+	/* error flow */
+	kfree(ent);
+	mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+	return ERR_PTR(-ENOMEM);
+}
+
+static struct id_map_entry *
+id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id)
+{
+	struct id_map_entry *ent;
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+	spin_lock(&sriov->id_map_lock);
+	if (*pv_cm_id == -1) {
+		ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id);
+		if (ent)
+			*pv_cm_id = (int) ent->pv_cm_id;
+	} else
+		ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+	spin_unlock(&sriov->id_map_lock);
+
+	return ent;
+}
+
+static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
+{
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	unsigned long flags;
+
+	spin_lock(&sriov->id_map_lock);
+	spin_lock_irqsave(&sriov->going_down_lock, flags);
+	/* don't schedule the delete work if the device is going down */
+	if (!sriov->is_going_down) {
+		id->scheduled_delete = 1;
+		schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+	}
+	spin_unlock_irqrestore(&sriov->going_down_lock, flags);
+	spin_unlock(&sriov->id_map_lock);
+}
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+		struct ib_mad *mad)
+{
+	struct id_map_entry *id;
+	u32 sl_cm_id;
+	int pv_cm_id = -1;
+
+	sl_cm_id = get_local_comm_id(mad);
+
+	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+			mad->mad_hdr.attr_id == CM_REP_ATTR_ID) {
+		id = id_map_alloc(ibdev, slave_id, sl_cm_id);
+		if (IS_ERR(id)) {
+			mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
+				__func__, slave_id, sl_cm_id);
+			return PTR_ERR(id);
+		}
+	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) {
+		return 0;
+	} else {
+		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+	}
+
+	if (!id) {
+		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
+			 slave_id, sl_cm_id);
+		return -EINVAL;
+	}
+
+	set_local_comm_id(mad, id->pv_cm_id);
+
+	if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+		schedule_delayed(ibdev, id);
+	else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID)
+		id_map_find_del(ibdev, pv_cm_id);
+
+	return 0;
+}
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+			     struct ib_mad *mad, int is_eth)
+{
+	u32 pv_cm_id;
+	struct id_map_entry *id;
+
+	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) {
+		union ib_gid gid;
+
+		if (is_eth)
+			return 0;
+
+		gid = gid_from_req_msg(ibdev, mad);
+		*slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
+		if (*slave < 0) {
+			mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
+					(long long)gid.global.interface_id);
+			return -ENOENT;
+		}
+		return 0;
+	}
+
+	pv_cm_id = get_remote_comm_id(mad);
+	id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
+
+	if (!id) {
+		pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+		return -ENOENT;
+	}
+
+	if (!is_eth)
+		*slave = id->slave_id;
+	set_remote_comm_id(mad, id->sl_cm_id);
+
+	if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+		schedule_delayed(ibdev, id);
+	else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+			mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) {
+		id_map_find_del(ibdev, (int) pv_cm_id);
+	}
+
+	return 0;
+}
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
+{
+	spin_lock_init(&dev->sriov.id_map_lock);
+	INIT_LIST_HEAD(&dev->sriov.cm_list);
+	dev->sriov.sl_id_map = RB_ROOT;
+	idr_init(&dev->sriov.pv_id_table);
+	idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL);
+}
+
+/* slave = -1 ==> all slaves */
+/* TBD -- call paravirt clean for single slave.  Need for slave RESET event */
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
+{
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	struct list_head lh;
+	struct rb_node *nd;
+	int need_flush = 1;
+	struct id_map_entry *map, *tmp_map;
+	/* cancel all delayed work queue entries */
+	INIT_LIST_HEAD(&lh);
+	spin_lock(&sriov->id_map_lock);
+	list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+		if (slave < 0 || slave == map->slave_id) {
+			if (map->scheduled_delete)
+				need_flush &= !!cancel_delayed_work(&map->timeout);
+		}
+	}
+
+	spin_unlock(&sriov->id_map_lock);
+
+	if (!need_flush)
+		flush_scheduled_work(); /* make sure all timers were flushed */
+
+	/* now, remove all leftover entries from the databases */
+	spin_lock(&sriov->id_map_lock);
+	if (slave < 0) {
+		while (rb_first(sl_id_map)) {
+			struct id_map_entry *ent =
+				rb_entry(rb_first(sl_id_map),
+					 struct id_map_entry, node);
+
+			rb_erase(&ent->node, sl_id_map);
+			idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+		}
+		list_splice_init(&dev->sriov.cm_list, &lh);
+	} else {
+		/* first, move nodes belonging to slave to db remove list */
+		nd = rb_first(sl_id_map);
+		while (nd) {
+			struct id_map_entry *ent =
+				rb_entry(nd, struct id_map_entry, node);
+			nd = rb_next(nd);
+			if (ent->slave_id == slave)
+				list_move_tail(&ent->list, &lh);
+		}
+		/* remove those nodes from databases */
+		list_for_each_entry_safe(map, tmp_map, &lh, list) {
+			rb_erase(&map->node, sl_id_map);
+			idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+		}
+
+		/* add remaining nodes from cm_list */
+		list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+			if (slave == map->slave_id)
+				list_move_tail(&map->list, &lh);
+		}
+	}
+
+	spin_unlock(&sriov->id_map_lock);
+
+	/* free any map entries left behind due to cancel_delayed_work above */
+	list_for_each_entry_safe(map, tmp_map, &lh, list) {
+		list_del(&map->list);
+		kfree(map);
+	}
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
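
cm.c above keeps two indexes over the same id_map_entry objects: an
rb-tree keyed by (slave_id, sl_cm_id) for the multiplex direction, and an
idr keyed by pv_cm_id for the demux direction, both guarded by
id_map_lock. The sketch below models that pair of lookups in plain
userspace C, with a fixed array standing in for the idr and a linear scan
standing in for the tree; every name in it is illustrative only.

#include <stdio.h>

struct id_map {
	int slave_id;
	unsigned sl_cm_id;   /* slave-local communication id */
	unsigned pv_cm_id;   /* paravirtual id handed to the wire */
	int used;
};

#define MAX_IDS 8
static struct id_map table[MAX_IDS];   /* index == pv_cm_id, like the idr */

/* Allocate a pv_cm_id for a (slave, sl_cm_id) pair. */
static int id_map_alloc(int slave_id, unsigned sl_cm_id)
{
	for (int id = 0; id < MAX_IDS; id++) {
		if (!table[id].used) {
			table[id] = (struct id_map){ slave_id, sl_cm_id,
						     (unsigned)id, 1 };
			return id;
		}
	}
	return -1;   /* no more space, like -ENOSPC from the idr */
}

/* Reverse lookup, as id_map_find_by_sl_id() does with the rb-tree. */
static struct id_map *find_by_sl_id(int slave_id, unsigned sl_cm_id)
{
	for (int id = 0; id < MAX_IDS; id++)
		if (table[id].used && table[id].slave_id == slave_id &&
		    table[id].sl_cm_id == sl_cm_id)
			return &table[id];
	return NULL;
}

int main(void)
{
	int pv = id_map_alloc(3, 0x1234);            /* REQ from slave 3 */
	printf("slave 3 / sl_cm_id 0x1234 -> pv_cm_id %d\n", pv);

	struct id_map *e = find_by_sl_id(3, 0x1234); /* multiplex path */
	if (e)
		printf("found pv_cm_id %u for the slave pair\n", e->pv_cm_id);

	/* demux path: wire id back to the slave-local pair */
	printf("demux of pv_cm_id %d -> slave %d, sl_cm_id 0x%x\n",
	       pv, table[pv].slave_id, table[pv].sl_cm_id);
	return 0;
}
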
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -33,7 +33,7 @@
 
 #include <linux/mlx4/cq.h>
 #include <linux/mlx4/qp.h>
-#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -40,6 +40,7 @@
 
 /* Which firmware version adds support for Resize CQ */
 #define MLX4_FW_VER_RESIZE_CQ  mlx4_fw_ver(2, 5, 0)
+#define MLX4_FW_VER_IGNORE_OVERRUN_CQ mlx4_fw_ver(2, 7, 8200)
 
 static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
 {
@@ -53,7 +54,7 @@
 	struct ib_cq *ibcq;
 
 	if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
-		printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+		pr_warn("Unexpected event type %d "
 		       "on CQ %06x\n", type, cq->cqn);
 		return;
 	}
@@ -69,7 +70,7 @@
 
 static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
 {
-	return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
+	return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
 }
 
 static void *get_cqe(struct mlx4_ib_cq *cq, int n)
@@ -80,8 +81,9 @@
 static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
 {
 	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
 
-	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
 }
 
@@ -102,12 +104,13 @@
 {
 	int err;
 
-	err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe),
+	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
 			     PAGE_SIZE * 2, &buf->buf);
 
 	if (err)
 		goto out;
 
+	buf->entry_size = dev->dev->caps.cqe_size;
 	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
 				    &buf->mtt);
 	if (err)
@@ -123,8 +126,7 @@
 	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
 
 err_buf:
-	mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe),
-			      &buf->buf);
+	mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
 
 out:
 	return err;
@@ -132,7 +134,7 @@
 
 static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
 {
-	mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf);
+	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
 }
 
 static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
@@ -140,14 +142,19 @@
 			       u64 buf_addr, int cqe)
 {
 	int err;
+	int cqe_size = dev->dev->caps.cqe_size;
+	int shift;
+	int n;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe),
+	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
 
-	err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem),
-			    ilog2((*umem)->page_size), &buf->mtt);
+	n = ib_umem_page_count(*umem);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n);
+	err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt);
+
 	if (err)
 		goto err_buf;
 
@@ -175,12 +182,10 @@
 	struct mlx4_uar *uar;
 	int err;
 
-	if (entries < 1 || entries > dev->dev->caps.max_cqes) {
-		mlx4_ib_dbg("invalid num of entries: %d", entries);
+	if (entries < 1 || entries > dev->dev->caps.max_cqes)
 		return ERR_PTR(-EINVAL);
-	}
 
-	cq = kzalloc(sizeof *cq, GFP_KERNEL);
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
 	if (!cq)
 		return ERR_PTR(-ENOMEM);
 
@@ -227,10 +232,11 @@
 		uar = &dev->priv_uar;
 	}
 
+	if (dev->eq_table)
+		vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+
 	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-			    cq->db.dma, &cq->mcq,
-			    vector == IB_CQ_VECTOR_LEAST_ATTACHED ?
-			    MLX4_LEAST_ATTACHED_VECTOR : vector, 0);
+			    cq->db.dma, &cq->mcq, vector, 0, 0);
 	if (err)
 		goto err_dbmap;
 
@@ -335,16 +341,23 @@
 {
 	struct mlx4_cqe *cqe, *new_cqe;
 	int i;
+	int cqe_size = cq->buf.entry_size;
+	int cqe_inc = cqe_size == 64 ? 1 : 0;
 
 	i = cq->mcq.cons_index;
 	cqe = get_cqe(cq, i & cq->ibcq.cqe);
+	cqe += cqe_inc;
+
 	while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
 		new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
 					   (i + 1) & cq->resize_buf->cqe);
-		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe));
+		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
+		new_cqe += cqe_inc;
+
 		new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
 			(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
 		cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+		cqe += cqe_inc;
 	}
 	++cq->mcq.cons_index;
 }
@@ -409,7 +422,7 @@
 	} else {
 		struct mlx4_ib_cq_buf tmp_buf;
 		int tmp_cqe = 0;
- 
+
 		spin_lock_irq(&cq->lock);
 		if (cq->resize_buf) {
 			mlx4_ib_cq_resize_copy_cqes(cq);
@@ -445,9 +458,21 @@
 
 out:
 	mutex_unlock(&cq->resize_mutex);
+
 	return err;
 }
 
+int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+
+	if (dev->dev->caps.fw_ver < MLX4_FW_VER_IGNORE_OVERRUN_CQ)
+		return -ENOSYS;
+
+	return mlx4_cq_ignore_overrun(dev->dev, &cq->mcq);
+}
+
 int mlx4_ib_destroy_cq(struct ib_cq *cq)
 {
 	struct mlx4_ib_dev *dev = to_mdev(cq->device);
@@ -473,7 +498,7 @@
 {
 	__be32 *buf = cqe;
 
-	printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
 	       be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
 	       be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
 	       be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
@@ -483,7 +508,7 @@
 				     struct ib_wc *wc)
 {
 	if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
-		printk(KERN_DEBUG "local QP operation err "
+		pr_debug("local QP operation err "
 		       "(QPN %06x, WQE index %x, vendor syndrome %02x, "
 		       "opcode = %02x)\n",
 		       be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
@@ -554,6 +579,26 @@
 		checksum == cpu_to_be16(0xffff);
 }
 
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+			   unsigned tail, struct mlx4_cqe *cqe)
+{
+	struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+	ib_dma_sync_single_for_cpu(qp->ibqp.device,
+				   qp->sqp_proxy_rcv[tail].map,
+				   sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				   DMA_FROM_DEVICE);
+	hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+	wc->pkey_index	= be16_to_cpu(hdr->tun.pkey_index);
+	wc->slid	= be16_to_cpu(hdr->tun.slid_mac_47_32);
+	wc->sl		= (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+	wc->src_qp	= be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
+	wc->wc_flags   |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+	wc->dlid_path_bits = 0;
+
+	return 0;
+}
+
 static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
 			    struct mlx4_ib_qp **cur_qp,
 			    struct ib_wc *wc)
@@ -562,12 +607,11 @@
 	struct mlx4_qp *mqp;
 	struct mlx4_ib_wq *wq;
 	struct mlx4_ib_srq *srq;
-	struct mlx4_srq *msrq;
 	int is_send;
 	int is_error;
 	u32 g_mlpath_rqpn;
-	int is_xrc_recv = 0;
 	u16 wqe_ctr;
+	unsigned tail = 0;
 
 repoll:
 	cqe = next_cqe_sw(cq);
@@ -574,6 +618,9 @@
 	if (!cqe)
 		return -EAGAIN;
 
+	if (cq->buf.entry_size == 64)
+		cqe++;
+
 	++cq->mcq.cons_index;
 
 	/*
@@ -588,7 +635,7 @@
 
 	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
 		     is_send)) {
-		printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+		pr_warn("Completion for NOP opcode detected!\n");
 		return -EINVAL;
 	}
 
@@ -608,24 +655,7 @@
 		goto repoll;
 	}
 
-	if ((be32_to_cpu(cqe->vlan_my_qpn) & (1 << 23)) && !is_send) {
-		 /*
-		  * We do not have to take the XRC SRQ table lock here,
-		  * because CQs will be locked while XRC SRQs are removed
-		  * from the table.
-		  */
-		 msrq = __mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
-					 be32_to_cpu(cqe->g_mlpath_rqpn) &
-					 0xffffff);
-		 if (unlikely(!msrq)) {
-			 printk(KERN_WARNING "CQ %06x with entry for unknown "
-				"XRC SRQ %06x\n", cq->mcq.cqn,
-				be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff);
-			 return -EINVAL;
-		 }
-		 is_xrc_recv = 1;
-		 srq = to_mibsrq(msrq);
-	} else if (!*cur_qp ||
+	if (!*cur_qp ||
 	    (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
 		/*
 		 * We do not have to take the QP table lock here,
@@ -635,7 +665,7 @@
 		mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
 				       be32_to_cpu(cqe->vlan_my_qpn));
 		if (unlikely(!mqp)) {
-			printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+			pr_warn("CQ %06x with entry for unknown QPN %06x\n",
 			       cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
 			return -EINVAL;
 		}
@@ -643,7 +673,7 @@
 		*cur_qp = to_mibqp(mqp);
 	}
 
-	wc->qp = is_xrc_recv ? NULL: &(*cur_qp)->ibqp;
+	wc->qp = &(*cur_qp)->ibqp;
 
 	if (is_send) {
 		wq = &(*cur_qp)->sq;
@@ -653,10 +683,6 @@
 		}
 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
-	} else if (is_xrc_recv) {
-		wqe_ctr = be16_to_cpu(cqe->wqe_index);
-		wc->wr_id = srq->wrid[wqe_ctr];
-		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
 	} else if ((*cur_qp)->ibqp.srq) {
 		srq = to_msrq((*cur_qp)->ibqp.srq);
 		wqe_ctr = be16_to_cpu(cqe->wqe_index);
@@ -664,7 +690,8 @@
 		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
 	} else {
 		wq	  = &(*cur_qp)->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		tail	  = wq->tail & (wq->wqe_cnt - 1);
+		wc->wr_id = wq->wrid[tail];
 		++wq->tail;
 	}
 
@@ -747,14 +774,26 @@
 			break;
 		}
 
+		if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+			if ((*cur_qp)->mlx4_ib_qp_type &
+			    (MLX4_IB_QPT_PROXY_SMI_OWNER |
+			     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+				return use_tunnel_data(*cur_qp, cq, wc, tail, cqe);
+		}
+
 		wc->slid	   = be16_to_cpu(cqe->rlid);
-		wc->sl		   = be16_to_cpu(cqe->sl_vid) >> 12;
 		g_mlpath_rqpn	   = be32_to_cpu(cqe->g_mlpath_rqpn);
 		wc->src_qp	   = g_mlpath_rqpn & 0xffffff;
 		wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
 		wc->wc_flags	  |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
 		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
-		wc->csum_ok	   = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum);
+		wc->wc_flags	  |= mlx4_ib_ipoib_csum_ok(cqe->status,
+					cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
+		if (rdma_port_get_link_layer(wc->qp->device,
+				(*cur_qp)->port) == IB_LINK_LAYER_ETHERNET)
+			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 13;
+		else
+			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 12;
 	}
 
 	return 0;
@@ -776,8 +815,7 @@
 			break;
 	}
 
-	if (npolled)
-		mlx4_cq_set_ci(&cq->mcq);
+	mlx4_cq_set_ci(&cq->mcq);
 
 	spin_unlock_irqrestore(&cq->lock, flags);
 
@@ -804,11 +842,8 @@
 	int nfreed = 0;
 	struct mlx4_cqe *cqe, *dest;
 	u8 owner_bit;
-	int is_xrc_srq = 0;
+	int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;
 
-	if (srq && srq->ibsrq.xrc_cq)
-		is_xrc_srq = 1;
-
 	/*
 	 * First we need to find the current producer index, so we
 	 * know where to start cleaning from.  It doesn't matter if HW
@@ -826,15 +861,16 @@
 	 */
 	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
 		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
-		if (((be32_to_cpu(cqe->vlan_my_qpn) & 0xffffff) == qpn) ||
-		    (is_xrc_srq &&
-		     (be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff) ==
-		      srq->msrq.srqn)) {
+		cqe += cqe_inc;
+
+		if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
 			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
 				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
 			++nfreed;
 		} else if (nfreed) {
 			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest += cqe_inc;
+
 			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
 			memcpy(dest, cqe, sizeof *cqe);
 			dest->owner_sr_opcode = owner_bit |


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
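
The cq.c hunks above replace the hard-coded sizeof(struct mlx4_cqe) with
the device-reported caps.cqe_size, and compensate with the cqe_inc /
"cqe + 1" adjustments because, with 64-byte strides, the 32-byte CQE of
interest lives in the second half of each slot. A reduced standalone
sketch of that offset arithmetic follows; the 32-byte struct is a
stand-in, not the real mlx4 layout.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct cqe {            /* 32-byte stand-in for struct mlx4_cqe */
	uint8_t data[31];
	uint8_t owner_sr_opcode;
};

static struct cqe *get_cqe_from_buf(void *buf, int entry_size, int n)
{
	struct cqe *c = (struct cqe *)((char *)buf + (size_t)n * entry_size);

	/* With 64-byte entries the meaningful CQE is the second 32 bytes. */
	return entry_size == 64 ? c + 1 : c;
}

int main(void)
{
	for (int entry_size = 32; entry_size <= 64; entry_size += 32) {
		void *buf = calloc(4, entry_size);
		struct cqe *c = get_cqe_from_buf(buf, entry_size, 2);

		printf("entry_size %2d: cqe #2 at offset %td\n",
		       entry_size, (char *)c - (char *)buf);
		free(buf);
	}
	return 0;
}
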
Index: trunk/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -32,8 +32,13 @@
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
 
+#include <linux/random.h>
 #include <linux/mlx4/cmd.h>
+#include <linux/gfp.h>
+#include <rdma/ib_pma.h>
 
 #include "mlx4_ib.h"
 
@@ -42,7 +47,62 @@
 	MLX4_IB_VENDOR_CLASS2 = 0xa
 };
 
-int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a)  (((a) >>  MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8	   /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
+struct mlx4_mad_rcv_buf {
+	struct ib_grh grh;
+	u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+	u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+	struct ib_grh grh;
+	struct mlx4_ib_tunnel_header hdr;
+	struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+	struct mlx4_rcv_tunnel_hdr hdr;
+	struct ib_grh grh;
+	struct ib_mad mad;
+} __packed;
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+				int block, u32 change_bitmap);
+
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI	((u64) (((u64)IB_OPENIB_OUI) << 40))
+	return cpu_to_be64(NODE_GUID_HI | random());
+}
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+	return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+		cpu_to_be64(0xff00000000000000LL);
+}
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 		 void *in_mad, void *response_mad)
 {
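
The MLX4_TUN_* macros in the hunk above pack three fields into the 64-bit
work-request id: the buffer index in the low 32 bits, the proxy QP number
in bits 32-33, and a receive flag in bit 34. A standalone sketch of that
encoding (macro names shortened, otherwise the same arithmetic):

#include <stdio.h>
#include <stdint.h>

#define TUN_SEND_WRID_SHIFT 34
#define TUN_QPN_SHIFT       32
#define TUN_WRID_RECV       (((uint64_t)1) << TUN_SEND_WRID_SHIFT)
#define TUN_SET_WRID_QPN(a) (((uint64_t)((a) & 0x3)) << TUN_QPN_SHIFT)
#define TUN_IS_RECV(a)      (((a) >> TUN_SEND_WRID_SHIFT) & 0x1)
#define TUN_WRID_QPN(a)     (((a) >> TUN_QPN_SHIFT) & 0x3)

int main(void)
{
	/* buffer index 17, tunnel QP 1, marked as a receive completion */
	uint64_t wr_id = 17 | TUN_SET_WRID_QPN(1) | TUN_WRID_RECV;

	printf("buf index %u, qpn %u, is_recv %u\n",
	       (uint32_t)wr_id, (unsigned)TUN_WRID_QPN(wr_id),
	       (unsigned)TUN_IS_RECV(wr_id));
	return 0;
}
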
@@ -69,10 +129,13 @@
 	 * Key check traps can't be generated unless we have in_wc to
 	 * tell us where to send the trap.
 	 */
-	if (ignore_mkey || !in_wc)
+	if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
 		op_modifier |= 0x1;
-	if (ignore_bkey || !in_wc)
+	if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
 		op_modifier |= 0x2;
+	if (mlx4_is_mfunc(dev->dev) &&
+	    (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+		op_modifier |= 0x8;
 
 	if (in_wc) {
 		struct {
@@ -105,9 +168,10 @@
 		in_modifier |= in_wc->slid << 16;
 	}
 
-	err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
-			   in_modifier, op_modifier,
-			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+	err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier,
+			   mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED);
 
 	if (!err)
 		memcpy(response_mad, outmailbox->buf, 256);
@@ -122,6 +186,7 @@
 {
 	struct ib_ah *new_ah;
 	struct ib_ah_attr ah_attr;
+	unsigned long flags;
 
 	if (!dev->send_agent[port_num - 1][0])
 		return;
@@ -136,53 +201,134 @@
 	if (IS_ERR(new_ah))
 		return;
 
-	spin_lock(&dev->sm_lock);
+	spin_lock_irqsave(&dev->sm_lock, flags);
 	if (dev->sm_ah[port_num - 1])
 		ib_destroy_ah(dev->sm_ah[port_num - 1]);
 	dev->sm_ah[port_num - 1] = new_ah;
-	spin_unlock(&dev->sm_lock);
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
 }
 
 /*
- * Snoop SM MADs for port info and P_Key table sets, so we can
- * synthesize LID change and P_Key change events.
+ * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can
+ * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
  */
 static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
-				u16 prev_lid)
+		      u16 prev_lid)
 {
-	struct ib_event event;
+	struct ib_port_info *pinfo;
+	u16 lid;
+	__be16 *base;
+	u32 bn, pkey_change_bitmap;
+	int i;
 
+
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
 	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
-	    mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
-		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
-			struct ib_port_info *pinfo =
-				(struct ib_port_info *) ((struct ib_smp *) mad)->data;
-			u16 lid = be16_to_cpu(pinfo->lid);
+	    mad->mad_hdr.method == IB_MGMT_METHOD_SET)
+		switch (mad->mad_hdr.attr_id) {
+		case IB_SMP_ATTR_PORT_INFO:
+			pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+			lid = be16_to_cpu(pinfo->lid);
 
-			update_sm_ah(to_mdev(ibdev), port_num,
+			update_sm_ah(dev, port_num,
 				     be16_to_cpu(pinfo->sm_lid),
 				     pinfo->neighbormtu_mastersmsl & 0xf);
 
-			event.device	       = ibdev;
-			event.element.port_num = port_num;
+			if (pinfo->clientrereg_resv_subnetto & 0x80)
+				handle_client_rereg_event(dev, port_num);
 
-			if (pinfo->clientrereg_resv_subnetto & 0x80) {
-				event.event    = IB_EVENT_CLIENT_REREGISTER;
-				ib_dispatch_event(&event);
+			if (prev_lid != lid)
+				handle_lid_change_event(dev, port_num);
+			break;
+
+		case IB_SMP_ATTR_PKEY_TABLE:
+			if (!mlx4_is_mfunc(dev->dev)) {
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_PKEY_CHANGE);
+				break;
 			}
 
-			if (prev_lid != lid) {
-				event.event    = IB_EVENT_LID_CHANGE;
-				ib_dispatch_event(&event);
+			/* at this point, we are running in the master.
+			 * Slaves do not receive SMPs.
+			 */
+			bn  = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF;
+			base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+			pkey_change_bitmap = 0;
+			for (i = 0; i < 32; i++) {
+				pr_debug("PKEY[%d] = x%x\n",
+					 i + bn*32, be16_to_cpu(base[i]));
+				if (be16_to_cpu(base[i]) !=
+				    dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) {
+					pkey_change_bitmap |= (1 << i);
+					dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] =
+						be16_to_cpu(base[i]);
+				}
 			}
+			pr_debug("PKEY Change event: port=%d, "
+				 "block=0x%x, change_bitmap=0x%x\n",
+				 port_num, bn, pkey_change_bitmap);
+
+			if (pkey_change_bitmap) {
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_PKEY_CHANGE);
+				if (!dev->sriov.is_going_down)
+					__propagate_pkey_ev(dev, port_num, bn,
+							    pkey_change_bitmap);
+			}
+			break;
+
+		case IB_SMP_ATTR_GUID_INFO:
+			/* paravirtualized master's guid is guid 0 -- does not change */
+			if (!mlx4_is_master(dev->dev))
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_GID_CHANGE);
+			/* if master, notify relevant slaves */
+			if (mlx4_is_master(dev->dev) &&
+			    !dev->sriov.is_going_down) {
+				bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod);
+				mlx4_ib_update_cache_on_guid_change(dev, bn, port_num,
+								    (u8 *)(&((struct ib_smp *)mad)->data));
+				mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num,
+								     (u8 *)(&((struct ib_smp *)mad)->data));
+			}
+			break;
+
+		default:
+			break;
 		}
+}
 
-		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
-			event.device	       = ibdev;
-			event.event	       = IB_EVENT_PKEY_CHANGE;
-			event.element.port_num = port_num;
-			ib_dispatch_event(&event);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+				int block, u32 change_bitmap)
+{
+	int i, ix, slave, err;
+	int have_event = 0;
+
+	for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) {
+		if (slave == mlx4_master_func_num(dev->dev))
+			continue;
+		if (!mlx4_is_slave_active(dev->dev, slave))
+			continue;
+
+		have_event = 0;
+		for (i = 0; i < 32; i++) {
+			if (!(change_bitmap & (1 << i)))
+				continue;
+			for (ix = 0;
+			     ix < dev->dev->caps.pkey_table_len[port_num]; ix++) {
+				if (dev->pkeys.virt2phys_pkey[slave][port_num - 1]
+				    [ix] == i + 32 * block) {
+					err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num);
+					pr_debug("propagate_pkey_ev: slave %d,"
+						 " port %d, ix %d (%d)\n",
+						 slave, port_num, ix, err);
+					have_event = 1;
+					break;
+				}
+			}
+			if (have_event)
+				break;
 		}
 	}
 }
@@ -190,13 +336,15 @@
 static void node_desc_override(struct ib_device *dev,
 			       struct ib_mad *mad)
 {
+	unsigned long flags;
+
 	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
 	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
 	    mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
 	    mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
-		spin_lock(&to_mdev(dev)->sm_lock);
+		spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags);
 		memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
-		spin_unlock(&to_mdev(dev)->sm_lock);
+		spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags);
 	}
 }
 
@@ -206,10 +354,13 @@
 	struct ib_mad_send_buf *send_buf;
 	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
 	int ret;
+	unsigned long flags;
 
 	if (agent) {
 		send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
 					      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+		if (IS_ERR(send_buf))
+			return;
 		/*
 		 * We rely here on the fact that MLX QPs don't use the
 		 * address handle after the send is posted (this is
@@ -216,13 +367,13 @@
 		 * wrong following the IB spec strictly, but we know
 		 * it's OK for our devices).
 		 */
-		spin_lock(&dev->sm_lock);
+		spin_lock_irqsave(&dev->sm_lock, flags);
 		memcpy(send_buf->mad, mad, sizeof *mad);
 		if ((send_buf->ah = dev->sm_ah[port_num - 1]))
 			ret = ib_post_send_mad(send_buf, NULL);
 		else
 			ret = -EINVAL;
-		spin_unlock(&dev->sm_lock);
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
 
 		if (ret)
 			ib_free_send_mad(send_buf);
@@ -229,24 +380,331 @@
 	}
 }
 
-static int is_vendor_id(__be16 attr_id)
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
+							     struct ib_sa_mad *sa_mad)
 {
-	return (attr_id & IB_SMP_ATTR_VENDOR_MASK) == IB_SMP_ATTR_VENDOR_MASK;
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
 
-static int supported_vendor_id(__be16 attr_id)
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
 {
-	return 1;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int i;
+
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+		if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+			return i;
+	}
+	return -1;
 }
 
+
+static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave,
+				   u8 port, u16 pkey, u16 *ix)
+{
+	int i, ret;
+	u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF;
+	u16 slot_pkey;
+
+	if (slave == mlx4_master_func_num(dev->dev))
+		return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix);
+
+	unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+
+	for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) {
+		if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix)
+			continue;
+
+		pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i];
+
+		ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey);
+		if (ret)
+			continue;
+		if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) {
+			if (slot_pkey & 0x8000) {
+				*ix = (u16) pkey_ix;
+				return 0;
+			} else {
+				/* take first partial pkey index found */
+				if (partial_ix == 0xFF)
+					partial_ix = pkey_ix;
+			}
+		}
+	}
+
+	if (partial_ix < 0xFF) {
+		*ix = (u16) partial_ix;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+			  enum ib_qp_type dest_qpt, struct ib_wc *wc,
+			  struct ib_grh *grh, struct ib_mad *mad)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+	struct mlx4_ib_demux_pv_ctx *tun_ctx;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct mlx4_rcv_tunnel_mad *tun_mad;
+	struct ib_ah_attr attr;
+	struct ib_ah *ah;
+	struct ib_qp *src_qp = NULL;
+	unsigned tun_tx_ix = 0;
+	int dqpn;
+	int ret = 0;
+	u16 tun_pkey_ix;
+	u16 cached_pkey;
+	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+	if (dest_qpt > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+	/* check if proxy qp created */
+	if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+		return -EAGAIN;
+
+	/* QP0 forwarding only for Dom0 */
+	if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave))
+		return -EINVAL;
+
+	if (!dest_qpt)
+		tun_qp = &tun_ctx->qp[0];
+	else
+		tun_qp = &tun_ctx->qp[1];
+
+	/* compute P_Key index to put in tunnel header for slave */
+	if (dest_qpt) {
+		u16 pkey_ix;
+		ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey);
+		if (ret)
+			return -EINVAL;
+
+		ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix);
+		if (ret)
+			return -EINVAL;
+		tun_pkey_ix = pkey_ix;
+	} else
+		tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+
+	dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1;
+
+	/* get tunnel tx data buf for slave */
+	src_qp = tun_qp->qp;
+
+	/* Create an AH. We just need an empty one with the port number for the
+	 * post send; the driver will set the force-loopback bit in post_send. */
+	memset(&attr, 0, sizeof attr);
+	attr.port_num = port;
+	if (is_eth) {
+		memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16);
+		attr.ah_flags = IB_AH_GRH;
+	}
+	ah = ib_create_ah(tun_ctx->pd, &attr);
+	if (IS_ERR(ah))
+		return -ENOMEM;
+
+	/* allocate a tunnel tx buf index once the failure-return checks above have passed */
+	spin_lock(&tun_qp->tx_lock);
+	if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >=
+	    (MLX4_NUM_TUNNEL_BUFS - 1))
+		ret = -EAGAIN;
+	else
+		tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+	spin_unlock(&tun_qp->tx_lock);
+	if (ret)
+		goto out;
+
+	tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
+	if (tun_qp->tx_ring[tun_tx_ix].ah)
+		ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+	tun_qp->tx_ring[tun_tx_ix].ah = ah;
+	ib_dma_sync_single_for_cpu(&dev->ib_dev,
+				   tun_qp->tx_ring[tun_tx_ix].buf.map,
+				   sizeof (struct mlx4_rcv_tunnel_mad),
+				   DMA_TO_DEVICE);
+
+	/* copy over to tunnel buffer */
+	if (grh)
+		memcpy(&tun_mad->grh, grh, sizeof *grh);
+	memcpy(&tun_mad->mad, mad, sizeof *mad);
+
+	/* adjust tunnel data */
+	tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
+	tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+	tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
+	tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
+	tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+
+	ib_dma_sync_single_for_device(&dev->ib_dev,
+				      tun_qp->tx_ring[tun_tx_ix].buf.map,
+				      sizeof (struct mlx4_rcv_tunnel_mad),
+				      DMA_TO_DEVICE);
+
+	list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
+	list.length = sizeof (struct mlx4_rcv_tunnel_mad);
+	list.lkey = tun_ctx->mr->lkey;
+
+	wr.wr.ud.ah = ah;
+	wr.wr.ud.port_num = port;
+	wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+	wr.wr.ud.remote_qpn = dqpn;
+	wr.next = NULL;
+	wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt);
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(src_qp, &wr, &bad_wr);
+out:
+	if (ret)
+		ib_destroy_ah(ah);
+	return ret;
+}
+
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
+			struct ib_wc *wc, struct ib_grh *grh,
+			struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int err;
+	int slave;
+	u8 *slave_id;
+	int is_eth = 0;
+
+	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+		is_eth = 0;
+	else
+		is_eth = 1;
+
+	if (is_eth) {
+		if (!(wc->wc_flags & IB_WC_GRH)) {
+			mlx4_ib_warn(ibdev, "RoCE grh not present.\n");
+			return -EINVAL;
+		}
+		if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) {
+			mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n");
+			return -EINVAL;
+		}
+		if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) {
+			mlx4_ib_warn(ibdev, "failed matching grh\n");
+			return -ENOENT;
+		}
+		if (slave >= dev->dev->caps.sqp_demux) {
+			mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+				     slave, dev->dev->caps.sqp_demux);
+			return -ENOENT;
+		}
+
+		if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth))
+			return 0;
+
+		err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+		if (err)
+			pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+				 slave, err);
+		return 0;
+	}
+
+	/* Initially assume that this mad is for us */
+	slave = mlx4_master_func_num(dev->dev);
+
+	/* See if the slave id is encoded in a response mad */
+	if (mad->mad_hdr.method & 0x80) {
+		slave_id = (u8 *) &mad->mad_hdr.tid;
+		slave = *slave_id;
+		if (slave != 255) /* 255 indicates the dom0 */
+			*slave_id = 0; /* remap tid */
+	}
+
+	/* If a grh is present, we demux according to it */
+	if (wc->wc_flags & IB_WC_GRH) {
+		slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id);
+		if (slave < 0) {
+			mlx4_ib_warn(ibdev, "failed matching grh\n");
+			return -ENOENT;
+		}
+	}
+	/* Class-specific handling */
+	switch (mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_ADM:
+		if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
+					     (struct ib_sa_mad *) mad))
+			return 0;
+		break;
+	case IB_MGMT_CLASS_CM:
+		if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth))
+			return 0;
+		break;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+		if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+			return 0;
+		break;
+	default:
+		/* Drop unsupported classes for slaves in tunnel mode */
+		if (slave != mlx4_master_func_num(dev->dev)) {
+			pr_debug("dropping unsupported ingress mad from class:%d "
+				 "for slave:%d\n", mad->mad_hdr.mgmt_class, slave);
+			return 0;
+		}
+	}
+	/* make sure a slave value of 255 was not left unhandled */
+	if (slave >= dev->dev->caps.sqp_demux) {
+		mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+			     slave, dev->dev->caps.sqp_demux);
+		return -ENOENT;
+	}
+
+	err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+	if (err)
+		pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+			 slave, err);
+	return 0;
+}
+
 static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                          struct ib_wc *in_wc, struct ib_grh *in_grh,
-                          struct ib_mad *in_mad, struct ib_mad *out_mad)
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	u16 slid, prev_lid = 0;
 	int err;
 	struct ib_port_attr pattr;
 
+	if (in_wc && in_wc->qp->qp_num) {
+		pr_debug("received MAD: slid:%d sqpn:%d "
+			"dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
+			in_wc->slid, in_wc->src_qp,
+			in_wc->dlid_path_bits,
+			in_wc->qp->qp_num,
+			in_wc->wc_flags,
+			in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+			be16_to_cpu(in_mad->mad_hdr.attr_id));
+		if (in_wc->wc_flags & IB_WC_GRH) {
+			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
+				 (long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix),
+				 (long long)
+				 be64_to_cpu(in_grh->sgid.global.interface_id));
+			pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
+				 (long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix),
+				 (long long)be64_to_cpu(in_grh->dgid.global.interface_id));
+		}
+	}
+
 	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
 
 	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
@@ -262,12 +720,9 @@
 			return IB_MAD_RESULT_SUCCESS;
 
 		/*
-		 * Don't process SMInfo queries or vendor-specific
-		 * MADs -- the SMA can't handle them.
+		 * Don't process SMInfo queries -- the SMA can't handle them.
 		 */
-		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
-		    (is_vendor_id(in_mad->mad_hdr.attr_id) &&
-		    !supported_vendor_id(in_mad->mad_hdr.attr_id)))
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
 			return IB_MAD_RESULT_SUCCESS;
 	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
 		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
@@ -287,15 +742,19 @@
 		prev_lid = pattr.lid;
 
 	err = mlx4_MAD_IFC(to_mdev(ibdev),
-			   mad_flags & IB_MAD_IGNORE_MKEY,
-			   mad_flags & IB_MAD_IGNORE_BKEY,
+			   (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+			   (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+			   MLX4_MAD_IFC_NET_VIEW,
 			   port_num, in_wc, in_grh, in_mad, out_mad);
 	if (err)
 		return IB_MAD_RESULT_FAILURE;
 
 	if (!out_mad->mad_hdr.status) {
-		smp_snoop(ibdev, port_num, in_mad, prev_lid);
-		node_desc_override(ibdev, out_mad);
+		if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
+			smp_snoop(ibdev, port_num, in_mad, prev_lid);
+		/* slaves get node desc from FW */
+		if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+			node_desc_override(ibdev, out_mad);
 	}
 
 	/* set return bit in status of directed route responses */
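
The edit_counter_ext()/edit_counter() helpers in the hunk below translate
the hardware octet counters into PMA PortCounters fields, which the PMA
defines in units of 32-bit words, hence the sums shifted right by 2. A
small worked sketch of that conversion, with made-up counter values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical per-class octet counters read from hardware. */
	uint64_t tx_unicast = 1000, tx_multicast = 256,
		 tx_broadcast = 64, tx_dropped = 12;

	uint64_t octets = tx_unicast + tx_multicast + tx_broadcast + tx_dropped;
	uint32_t port_xmit_data = (uint32_t)(octets >> 2); /* 32-bit words */

	printf("%llu octets -> PortXmitData %u (x4 = %u octets)\n",
	       (unsigned long long)octets, port_xmit_data, port_xmit_data * 4);
	return 0;
}
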
@@ -309,60 +768,224 @@
 	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 }
 
-static __be32 be64_to_be32(__be64 b64)
+static void edit_counter_ext(struct mlx4_if_stat_extended *cnt, void *counters,
+			     __be16 attr_id)
 {
-	return cpu_to_be32(be64_to_cpu(b64) & 0xffffffff);
+	switch (attr_id) {
+	case IB_PMA_PORT_COUNTERS:
+	{
+		struct ib_pma_portcounters *pma_cnt =
+				(struct ib_pma_portcounters *)counters;
+		pma_cnt->port_xmit_data =
+			cpu_to_be32((be64_to_cpu(cnt->counters[0].
+						 IfTxUnicastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxMulticastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxBroadcastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxDroppedOctets)) >> 2);
+		pma_cnt->port_rcv_data  =
+			cpu_to_be32((be64_to_cpu(cnt->counters[0].
+						 IfRxUnicastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxMulticastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxBroadcastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxNoBufferOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxErrorOctets)) >> 2);
+		pma_cnt->port_xmit_packets =
+			cpu_to_be32(be64_to_cpu(cnt->counters[0].
+						IfTxUnicastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxBroadcastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxDroppedFrames));
+		pma_cnt->port_rcv_packets  =
+			cpu_to_be32(be64_to_cpu(cnt->counters[0].
+						IfRxUnicastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxBroadcastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxNoBufferFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxErrorFrames));
+		pma_cnt->port_rcv_errors = cpu_to_be32(be64_to_cpu(cnt->
+						       counters[0].
+						       IfRxErrorFrames));
+		break;
+	}
+
+	case IB_PMA_PORT_COUNTERS_EXT:
+	{
+		struct ib_pma_portcounters_ext *pma_cnt_ext =
+				(struct ib_pma_portcounters_ext *)counters;
+
+		pma_cnt_ext->port_xmit_data =
+			cpu_to_be64((be64_to_cpu(cnt->counters[0].
+						 IfTxUnicastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxMulticastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxBroadcastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfTxDroppedOctets)) >> 2);
+		pma_cnt_ext->port_rcv_data  =
+			cpu_to_be64((be64_to_cpu(cnt->counters[0].
+						 IfRxUnicastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxMulticastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxBroadcastOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxNoBufferOctets) +
+				     be64_to_cpu(cnt->counters[0].
+						 IfRxErrorOctets)) >> 2);
+		pma_cnt_ext->port_xmit_packets =
+			cpu_to_be64(be64_to_cpu(cnt->counters[0].
+						IfTxUnicastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxBroadcastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxDroppedFrames));
+		pma_cnt_ext->port_rcv_packets  =
+			cpu_to_be64(be64_to_cpu(cnt->counters[0].
+						IfRxUnicastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxBroadcastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxNoBufferFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfRxErrorFrames));
+		pma_cnt_ext->port_unicast_xmit_packets = cnt->counters[0].
+						IfTxUnicastFrames;
+		pma_cnt_ext->port_unicast_rcv_packets = cnt->counters[0].
+						IfRxUnicastFrames;
+		pma_cnt_ext->port_multicast_xmit_packets =
+			cpu_to_be64(be64_to_cpu(cnt->counters[0].
+						IfTxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxBroadcastFrames));
+		pma_cnt_ext->port_multicast_rcv_packets =
+			cpu_to_be64(be64_to_cpu(cnt->counters[0].
+						IfTxMulticastFrames) +
+				    be64_to_cpu(cnt->counters[0].
+						IfTxBroadcastFrames));
+
+		break;
+	}
+
+	default:
+		pr_warn("Unsupported attr_id 0x%x\n", attr_id);
+		break;
+	}
+
 }
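
The repeated ">> 2" above is not arbitrary: the PMA port_xmit_data/port_rcv_data fields count 32-bit words, so the summed octet counters are divided by four before being stored. A hedged one-liner making the unit conversion explicit (the helper name is ours):

	/* PMA data counters are in dword (32-bit word) units, hence the
	 * shift: e.g. 4096 octets on the wire are reported as 1024. */
	static u32 octets_to_pma_dwords(u64 octets)
	{
		return (u32)(octets >> 2);	/* truncated into the 32-bit field */
	}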
 
-static void edit_counters(struct mlx4_counters *cnt, void *data)
+static void edit_counter(struct mlx4_if_stat_basic *cnt, void *counters,
+			 __be16	attr_id)
 {
-	*(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_bytes);
-	*(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_bytes);
-	*(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_frames);
-	*(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_frames);
-}
+	switch (attr_id) {
+	case IB_PMA_PORT_COUNTERS:
+	{
+		struct ib_pma_portcounters *pma_cnt =
+				(struct ib_pma_portcounters *) counters;
+		pma_cnt->port_xmit_data =
+			cpu_to_be32(be64_to_cpu(
+				    cnt->counters[0].IfTxOctets) >> 2);
+		pma_cnt->port_rcv_data  =
+			cpu_to_be32(be64_to_cpu(
+				    cnt->counters[0].IfRxOctets) >> 2);
+		pma_cnt->port_xmit_packets =
+			cpu_to_be32(be64_to_cpu(cnt->counters[0].IfTxFrames));
+		pma_cnt->port_rcv_packets  =
+			cpu_to_be32(be64_to_cpu(cnt->counters[0].IfRxFrames));
+		break;
+	}
+	case IB_PMA_PORT_COUNTERS_EXT:
+	{
+		struct ib_pma_portcounters_ext *pma_cnt_ext =
+				(struct ib_pma_portcounters_ext *) counters;
 
-static void edit_ext_counters(struct mlx4_counters_ext *cnt, void *data)
-{
-	*(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_uni_bytes);
-	*(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_uni_bytes);
-	*(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_uni_frames);
-	*(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_uni_frames);
-	*(__be32 *)(data + 40 + 8) = be64_to_be32(cnt->rx_err_frames);
+		pma_cnt_ext->port_xmit_data =
+			cpu_to_be64((be64_to_cpu(cnt->counters[0].
+						 IfTxOctets) >> 2));
+		pma_cnt_ext->port_rcv_data  =
+			cpu_to_be64((be64_to_cpu(cnt->counters[0].
+						 IfRxOctets) >> 2));
+		pma_cnt_ext->port_xmit_packets = cnt->counters[0].IfTxFrames;
+		pma_cnt_ext->port_rcv_packets  = cnt->counters[0].IfRxFrames;
+		break;
+	}
+	default:
+		pr_warn("Unsupported attr_id 0x%x\n", attr_id);
+		break;
+	}
 }
 
-static int rdmaoe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                              struct ib_wc *in_wc, struct ib_grh *in_grh,
-                              struct ib_mad *in_mad, struct ib_mad *out_mad)
+int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index,
+		       union mlx4_counter *counter, u8 clear)
 {
 	struct mlx4_cmd_mailbox *mailbox;
-	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 	int err;
-	u32 inmod = dev->counters[port_num - 1] & 0xffff;
-	int mode;
+	u32 inmod = counter_index | ((clear & 1) << 31);
 
-        if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
-		return -EINVAL;
-
 	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
 	if (IS_ERR(mailbox))
 		return IB_MAD_RESULT_FAILURE;
 
 	err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
-			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C);
-	if (err)
+			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_WRAPPED);
+	if (!err)
+		memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1));
+
+	mlx4_free_cmd_mailbox(dev->dev, mailbox);
+
+	return err;
+}
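
mlx4_ib_query_if_stat packs two things into the command input modifier: the counter index in the low bits and a clear-on-read request in bit 31. A usage sketch under the types this patch introduces (kernel context assumed, error handling abbreviated):

	/* Read the counter block for `index' without clearing it;
	 * passing 1 as the last argument would set bit 31 of inmod
	 * and zero the counters after the read. */
	static void dump_counter_mode(struct mlx4_ib_dev *dev, u32 index)
	{
		char buf[MLX4_IF_STAT_SZ(1)];
		union mlx4_counter *cnt = (union mlx4_counter *)buf;

		if (!mlx4_ib_query_if_stat(dev, index, cnt, 0))
			pr_info("counter %u mode 0x%x\n", index,
				cnt->control.cnt_mode & 0xf);
	}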
+
+static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int err;
+	u32 counter_index = dev->counters[port_num - 1] & 0xffff;
+	u8 mode;
+	char				counter_buf[MLX4_IF_STAT_SZ(1)];
+	union  mlx4_counter		*counter = (union mlx4_counter *)
+						   counter_buf;
+
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+		return -EINVAL;
+
+	if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) {
 		err = IB_MAD_RESULT_FAILURE;
-	else {
+	} else {
 		memset(out_mad->data, 0, sizeof out_mad->data);
-		mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf;
-		switch (mode) {
+		mode = counter->control.cnt_mode & 0xFF;
+		err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+		switch (mode & 0xf) {
 		case 0:
-			edit_counters(mailbox->buf, out_mad->data);
-			err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+			edit_counter((void *)counter,
+				     (void *)(out_mad->data + 40),
+				     in_mad->mad_hdr.attr_id);
 			break;
 		case 1:
-			edit_ext_counters(mailbox->buf, out_mad->data);
-			err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+			edit_counter_ext((void *)counter,
+					 (void *)(out_mad->data + 40),
+					 in_mad->mad_hdr.attr_id);
 			break;
 		default:
 			err = IB_MAD_RESULT_FAILURE;
@@ -369,12 +992,11 @@
 		}
 	}
 
-	mlx4_free_cmd_mailbox(dev->dev, mailbox);
 
 	return err;
 }
 
-int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			struct ib_wc *in_wc, struct ib_grh *in_grh,
 			struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
@@ -383,7 +1005,7 @@
 		return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
 				      in_grh, in_mad, out_mad);
 	case IB_LINK_LAYER_ETHERNET:
-		return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc,
+		return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
 					  in_grh, in_mad, out_mad);
 	default:
 		return -EINVAL;
@@ -393,6 +1015,8 @@
 static void send_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_send_wc *mad_send_wc)
 {
+	if (mad_send_wc->send_buf->context[0])
+		ib_destroy_ah(mad_send_wc->send_buf->context[0]);
 	ib_free_send_mad(mad_send_wc->send_buf);
 }
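
The new send_handler lines rely on a convention used elsewhere in this file: the sender stashes the address handle it created in send_buf->context[0] so the completion path can destroy it. A sketch of the posting side under that assumption (fragment, not taken from the patch):

	/* Posting side of the context[0] convention: `ah' comes from
	 * ib_create_ah(), `send_buf' from ib_create_send_mad(). */
	send_buf->context[0] = ah;
	ret = ib_post_send_mad(send_buf, NULL);
	if (ret) {
		/* on failure the handler never runs, so clean up here */
		ib_destroy_ah(ah);
		ib_free_send_mad(send_buf);
	}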
 
@@ -450,3 +1074,1221 @@
 			ib_destroy_ah(dev->sm_ah[p]);
 	}
 }
+
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+	mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
+
+	if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+		mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+					    MLX4_EQ_PORT_INFO_LID_CHANGE_MASK, 0, 0);
+}
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+	/* re-configure the alias-guid and mcg's */
+	if (mlx4_is_master(dev->dev)) {
+		mlx4_ib_invalidate_all_guid_record(dev, port_num);
+
+		if (!dev->sriov.is_going_down) {
+			mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+			mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+						    MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK, 0, 0);
+		}
+	}
+	mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
+static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+			      struct mlx4_eqe *eqe)
+{
+	__propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe),
+			    GET_MASK_FROM_EQE(eqe));
+}
+
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num,
+				      u32 guid_tbl_blk_num, u32 change_bitmap)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad  = NULL;
+	u16 i;
+
+	if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev))
+		return;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad) {
+		mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n");
+		goto out;
+	}
+
+	guid_tbl_blk_num  *= 4;
+
+	for (i = 0; i < 4; i++) {
+		if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff)))
+			continue;
+		memset(in_mad, 0, sizeof *in_mad);
+		memset(out_mad, 0, sizeof *out_mad);
+
+		in_mad->base_version  = 1;
+		in_mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+		in_mad->class_version = 1;
+		in_mad->method        = IB_MGMT_METHOD_GET;
+		in_mad->attr_id       = IB_SMP_ATTR_GUID_INFO;
+		in_mad->attr_mod      = cpu_to_be32(guid_tbl_blk_num + i);
+
+		if (mlx4_MAD_IFC(dev,
+				 MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW,
+				 port_num, NULL, NULL, in_mad, out_mad)) {
+			mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n");
+			goto out;
+		}
+
+		mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i,
+						    port_num,
+						    (u8 *)(&((struct ib_smp *)out_mad)->data));
+		mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i,
+						     port_num,
+						     (u8 *)(&((struct ib_smp *)out_mad)->data));
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return;
+}
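
To make the indexing in handle_slaves_guid_change concrete: the EQE block pointer is in units of four GUID-info blocks, and the 32-bit change bitmap carries one byte per sub-block. A worked example with hypothetical values:

	/* Example: EQE block pointer 3, change_bitmap 0x00ff0000.
	 * guid_tbl_blk_num *= 4 gives first GUID-info block 12;
	 * byte i of the bitmap gates block 12 + i:
	 *   i = 0: (bitmap >> 0)  & 0xff == 0    -> skip block 12
	 *   i = 1: (bitmap >> 8)  & 0xff == 0    -> skip block 13
	 *   i = 2: (bitmap >> 16) & 0xff == 0xff -> query block 14
	 *   i = 3: (bitmap >> 24) & 0xff == 0    -> skip block 15
	 * A zero bitmap means "unknown", so all four blocks are queried. */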
+
+void handle_port_mgmt_change_event(struct work_struct *work)
+{
+	struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+	struct mlx4_ib_dev *dev = ew->ib_dev;
+	struct mlx4_eqe *eqe = &(ew->ib_eqe);
+	u8 port = eqe->event.port_mgmt_change.port;
+	u32 changed_attr;
+	u32 tbl_block;
+	u32 change_bitmap;
+
+	switch (eqe->subtype) {
+	case MLX4_DEV_PMC_SUBTYPE_PORT_INFO:
+		changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr);
+
+		/* Update the SM ah - This should be done before handling
+		   the other changed attributes so that MADs can be sent to the SM */
+		if (changed_attr & MSTR_SM_CHANGE_MASK) {
+			u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
+			u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
+			update_sm_ah(dev, port, lid, sl);
+		}
+
+		/* Check if it is a lid change event */
+		if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK)
+			handle_lid_change_event(dev, port);
+
+		/* Generate GUID changed event */
+		if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
+			mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+			/*if master, notify all slaves*/
+			if (mlx4_is_master(dev->dev))
+				mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+							    MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK, 0, 0);
+		}
+
+		if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
+			handle_client_rereg_event(dev, port);
+		break;
+
+	case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
+		mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE);
+		if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+			propagate_pkey_ev(dev, port, eqe);
+		break;
+	case MLX4_DEV_PMC_SUBTYPE_GUID_INFO:
+		/* paravirtualized master's guid is guid 0 -- does not change */
+		if (!mlx4_is_master(dev->dev))
+			mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+		/*if master, notify relevant slaves*/
+		else if (!dev->sriov.is_going_down) {
+			tbl_block = GET_BLK_PTR_FROM_EQE(eqe);
+			change_bitmap = GET_MASK_FROM_EQE(eqe);
+			handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
+		}
+		break;
+	default:
+		pr_warn("Unsupported subtype 0x%x for "
+			"Port Management Change event\n", eqe->subtype);
+	}
+
+	kfree(ew);
+}
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+			    enum ib_event_type type)
+{
+	struct ib_event event;
+
+	event.device		= &dev->ib_dev;
+	event.element.port_num	= port_num;
+	event.event		= type;
+
+	ib_dispatch_event(&event);
+}
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+	unsigned long flags;
+	struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+		queue_work(ctx->wq, &ctx->work);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+				  struct mlx4_ib_demux_pv_qp *tun_qp,
+				  int index)
+{
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	int size;
+
+	size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+		sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf);
+
+	sg_list.addr = tun_qp->ring[index].map;
+	sg_list.length = size;
+	sg_list.lkey = ctx->mr->lkey;
+
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+	recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+		MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+	ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+				      size, DMA_FROM_DEVICE);
+	return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
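
The 64-bit wr_id built above multiplexes three fields that the completion workers later pull apart. A decoding sketch using the same macros (`index' and `tun_qp' as in the function above):

	/* wr_id layout used by the PV QP code (macros from this file):
	 * low bits = ring slot, then proxy QP type, then an RX marker. */
	u64 wr_id = (u64)index | MLX4_TUN_WRID_RECV |
		    MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);

	int slot    = wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);  /* which buffer */
	int qpt     = MLX4_TUN_WRID_QPN(wr_id);            /* SMI or GSI   */
	int is_recv = MLX4_TUN_IS_RECV(wr_id);             /* RX vs TX WC  */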
+
+static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
+		int slave, struct ib_sa_mad *sa_mad)
+{
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+	int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+	return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+			 u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+	struct mlx4_ib_demux_pv_ctx *sqp_ctx;
+	struct mlx4_ib_demux_pv_qp *sqp;
+	struct mlx4_mad_snd_buf *sqp_mad;
+	struct ib_ah *ah;
+	struct ib_qp *send_qp = NULL;
+	unsigned wire_tx_ix = 0;
+	int ret = 0;
+	u16 wire_pkey_ix;
+	int src_qpnum;
+	u8 sgid_index;
+
+
+	sqp_ctx = dev->sriov.sqps[port-1];
+
+	/* check if proxy qp created */
+	if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
+		return -EAGAIN;
+
+	/* QP0 forwarding only for Dom0 */
+	if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave))
+		return -EINVAL;
+
+	if (dest_qpt == IB_QPT_SMI) {
+		src_qpnum = 0;
+		sqp = &sqp_ctx->qp[0];
+		wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+	} else {
+		src_qpnum = 1;
+		sqp = &sqp_ctx->qp[1];
+		wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index];
+	}
+
+	send_qp = sqp->qp;
+
+	/* create ah */
+	sgid_index = attr->grh.sgid_index;
+	attr->grh.sgid_index = 0;
+	ah = ib_create_ah(sqp_ctx->pd, attr);
+	if (IS_ERR(ah))
+		return -ENOMEM;
+	attr->grh.sgid_index = sgid_index;
+	to_mah(ah)->av.ib.gid_index = sgid_index;
+	/* get rid of force-loopback bit */
+	to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
+	spin_lock(&sqp->tx_lock);
+	if (sqp->tx_ix_head - sqp->tx_ix_tail >=
+	    (MLX4_NUM_TUNNEL_BUFS - 1))
+		ret = -EAGAIN;
+	else
+		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+	spin_unlock(&sqp->tx_lock);
+	if (ret)
+		goto out;
+
+	sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
+	if (sqp->tx_ring[wire_tx_ix].ah)
+		ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+	sqp->tx_ring[wire_tx_ix].ah = ah;
+	ib_dma_sync_single_for_cpu(&dev->ib_dev,
+				   sqp->tx_ring[wire_tx_ix].buf.map,
+				   sizeof (struct mlx4_mad_snd_buf),
+				   DMA_TO_DEVICE);
+
+	memcpy(&sqp_mad->payload, mad, sizeof *mad);
+
+	ib_dma_sync_single_for_device(&dev->ib_dev,
+				      sqp->tx_ring[wire_tx_ix].buf.map,
+				      sizeof (struct mlx4_mad_snd_buf),
+				      DMA_TO_DEVICE);
+
+	list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
+	list.length = sizeof (struct mlx4_mad_snd_buf);
+	list.lkey = sqp_ctx->mr->lkey;
+
+	wr.wr.ud.ah = ah;
+	wr.wr.ud.port_num = port;
+	wr.wr.ud.pkey_index = wire_pkey_ix;
+	wr.wr.ud.remote_qkey = qkey;
+	wr.wr.ud.remote_qpn = remote_qpn;
+	wr.next = NULL;
+	wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum);
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(send_qp, &wr, &bad_wr);
+out:
+	if (ret)
+		ib_destroy_ah(ah);
+	return ret;
+}
+
+static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port)
+{
+	int gids;
+	int vfs;
+
+	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+		return slave;
+
+	gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	vfs = dev->dev->num_vfs;
+
+	if (slave == 0)
+		return 0;
+	if (slave <= gids % vfs)
+		return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1);
+
+	return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1));
+}
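
get_slave_base_gid_ix splits the non-PF RoCE GID slots evenly across VFs, with the first (gids % vfs) slaves getting one extra slot. Assuming MLX4_ROCE_MAX_GIDS = 128 and MLX4_ROCE_PF_GIDS = 16 (the values we expect here, so gids = 112), a worked example with vfs = 5:

	/* gids % vfs = 2, gids / vfs = 22:
	 *   slave 1 -> base 16, slave 2 -> base 39       (23 slots each)
	 *   slave 3 -> 62, slave 4 -> 84, slave 5 -> 106 (22 slots each)
	 * Total coverage: 16 PF + 2*23 + 3*22 = 128 slots, no overlap. */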
+
+static int get_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
+			       struct ib_ah_attr *ah_attr)
+{
+	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) {
+		ah_attr->grh.sgid_index = slave;
+		return 0;
+	}
+	ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port);
+	return 0;
+}
+
+static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+	struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)];
+	int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);
+	struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr;
+	struct mlx4_ib_ah ah;
+	struct ib_ah_attr ah_attr;
+	u8 *slave_id;
+	int slave;
+
+	/* Get slave that sent this packet */
+	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+	    wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX ||
+	    (wc->src_qp & 0x1) != ctx->port - 1 ||
+	    wc->src_qp & 0x4) {
+		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp);
+		return;
+	}
+	slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
+	if (slave != ctx->slave) {
+		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+			     "belongs to another slave\n", wc->src_qp);
+		return;
+	}
+	if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) {
+		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+			     "non-master trying to send QP0 packets\n", wc->src_qp);
+		return;
+	}
+
+	/* Map transaction ID */
+	ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
+				   sizeof (struct mlx4_tunnel_mad),
+				   DMA_FROM_DEVICE);
+	switch (tunnel->mad.mad_hdr.method) {
+	case IB_MGMT_METHOD_SET:
+	case IB_MGMT_METHOD_GET:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_SA_METHOD_DELETE:
+	case IB_SA_METHOD_GET_MULTI:
+	case IB_SA_METHOD_GET_TRACE_TBL:
+		slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
+		if (*slave_id) {
+			mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
+				     "class:%d slave:%d\n", *slave_id,
+				     tunnel->mad.mad_hdr.mgmt_class, slave);
+			return;
+		} else
+			*slave_id = slave;
+	default:
+		/* nothing */;
+	}
+
+	/* Class-specific handling */
+	switch (tunnel->mad.mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_ADM:
+		if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
+			      (struct ib_sa_mad *) &tunnel->mad))
+			return;
+		break;
+	case IB_MGMT_CLASS_CM:
+		if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
+			      (struct ib_mad *) &tunnel->mad))
+			return;
+		break;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+		if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+		    tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+			return;
+		break;
+	default:
+		/* Drop unsupported classes for slaves in tunnel mode */
+		if (slave != mlx4_master_func_num(dev->dev)) {
+			mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
+				     "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
+			return;
+		}
+	}
+
+	/* We are using standard ib_core services to send the mad, so generate a
+	 * standard address handle by decoding the tunnelled mlx4_ah fields */
+	memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
+	ah.ibah.device = ctx->ib_dev;
+	mlx4_ib_query_ah(&ah.ibah, &ah_attr);
+	if (ah_attr.ah_flags & IB_AH_GRH)
+		if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr))
+			return;
+
+	mlx4_ib_send_to_wire(dev, slave, ctx->port,
+			     is_proxy_qp0(dev, wc->src_qp, slave) ?
+			     IB_QPT_SMI : IB_QPT_GSI,
+			     be16_to_cpu(tunnel->hdr.pkey_index),
+			     be32_to_cpu(tunnel->hdr.remote_qpn),
+			     be32_to_cpu(tunnel->hdr.qkey),
+			     &ah_attr, &tunnel->mad);
+}
+
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+				 enum ib_qp_type qp_type, int is_tun)
+{
+	int i;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	int rx_buf_size, tx_buf_size;
+
+	if (qp_type > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_qp = &ctx->qp[qp_type];
+
+	tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS,
+			       GFP_KERNEL);
+	if (!tun_qp->ring)
+		return -ENOMEM;
+
+	tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+				  sizeof (struct mlx4_ib_tun_tx_buf),
+				  GFP_KERNEL);
+	if (!tun_qp->tx_ring) {
+		kfree(tun_qp->ring);
+		tun_qp->ring = NULL;
+		return -ENOMEM;
+	}
+
+	if (is_tun) {
+		rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+		tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+	} else {
+		rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
+		if (!tun_qp->ring[i].addr)
+			goto err;
+		tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev,
+							tun_qp->ring[i].addr,
+							rx_buf_size,
+							DMA_FROM_DEVICE);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		tun_qp->tx_ring[i].buf.addr =
+			kmalloc(tx_buf_size, GFP_KERNEL);
+		if (!tun_qp->tx_ring[i].buf.addr)
+			goto tx_err;
+		tun_qp->tx_ring[i].buf.map =
+			ib_dma_map_single(ctx->ib_dev,
+					  tun_qp->tx_ring[i].buf.addr,
+					  tx_buf_size,
+					  DMA_TO_DEVICE);
+		tun_qp->tx_ring[i].ah = NULL;
+	}
+	spin_lock_init(&tun_qp->tx_lock);
+	tun_qp->tx_ix_head = 0;
+	tun_qp->tx_ix_tail = 0;
+	tun_qp->proxy_qpt = qp_type;
+
+	return 0;
+
+tx_err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+				    tx_buf_size, DMA_TO_DEVICE);
+		kfree(tun_qp->tx_ring[i].buf.addr);
+	}
+	kfree(tun_qp->tx_ring);
+	tun_qp->tx_ring = NULL;
+	i = MLX4_NUM_TUNNEL_BUFS;
+err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+				    rx_buf_size, DMA_FROM_DEVICE);
+		kfree(tun_qp->ring[i].addr);
+	}
+	kfree(tun_qp->ring);
+	tun_qp->ring = NULL;
+	return -ENOMEM;
+}
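
The error unwinding above is easy to misread, so spelling it out:

	/* Unwind note: on a TX allocation failure, tx_err frees TX
	 * entries 0..i-1, then i is reset to MLX4_NUM_TUNNEL_BUFS
	 * before falling into err:, which therefore unmaps every RX
	 * entry (the whole RX ring was mapped before the TX loop
	 * started).  On an RX failure, err: frees only RX entries
	 * 0..i-1. */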
+
+static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+				     enum ib_qp_type qp_type, int is_tun)
+{
+	int i;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	int rx_buf_size, tx_buf_size;
+
+	if (qp_type > IB_QPT_GSI)
+		return;
+
+	tun_qp = &ctx->qp[qp_type];
+	if (is_tun) {
+		rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+		tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+	} else {
+		rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+	}
+
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+				    rx_buf_size, DMA_FROM_DEVICE);
+		kfree(tun_qp->ring[i].addr);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+				    tx_buf_size, DMA_TO_DEVICE);
+		kfree(tun_qp->tx_ring[i].buf.addr);
+		if (tun_qp->tx_ring[i].ah)
+			ib_destroy_ah(tun_qp->tx_ring[i].ah);
+	}
+	kfree(tun_qp->tx_ring);
+	kfree(tun_qp->ring);
+}
+
+static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct ib_wc wc;
+	int ret;
+	ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+	ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+		tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_RECV:
+				mlx4_ib_multiplex_mad(ctx, &wc);
+				ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp,
+							     wc.wr_id &
+							     (MLX4_NUM_TUNNEL_BUFS - 1));
+				if (ret)
+					pr_err("Failed reposting tunnel "
+					       "buf:%lld\n", (long long)wc.wr_id);
+				break;
+			case IB_WC_SEND:
+				pr_debug("received tunnel send completion:"
+					 "wrid=0x%llx, status=0x%x\n",
+					 (long long)wc.wr_id, wc.status);
+				ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&tun_qp->tx_lock);
+				tun_qp->tx_ix_tail++;
+				spin_unlock(&tun_qp->tx_lock);
+
+				break;
+			default:
+				break;
+			}
+		} else  {
+			pr_debug("mlx4_ib: completion error in tunnel: %d."
+				 " status = %d, wrid = 0x%llx\n",
+				 ctx->slave, wc.status, (long long)wc.wr_id);
+			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+				ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&tun_qp->tx_lock);
+				tun_qp->tx_ix_tail++;
+				spin_unlock(&tun_qp->tx_lock);
+			}
+		}
+	}
+}
+
+static void pv_qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct mlx4_ib_demux_pv_ctx *sqp = qp_context;
+
+	/* It's worse than that! He's dead, Jim! */
+	pr_err("Fatal error (%d) on a MAD QP on port %d\n",
+	       event->event, sqp->port);
+}
+
+static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
+			    enum ib_qp_type qp_type, int create_tun)
+{
+	int i, ret;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
+	struct ib_qp_attr attr;
+	int qp_attr_mask_INIT;
+
+	if (qp_type > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_qp = &ctx->qp[qp_type];
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.init_attr.send_cq = ctx->cq;
+	qp_init_attr.init_attr.recv_cq = ctx->cq;
+	qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_send_sge = 1;
+	qp_init_attr.init_attr.cap.max_recv_sge = 1;
+	if (create_tun) {
+		qp_init_attr.init_attr.qp_type = IB_QPT_UD;
+		qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP;
+		qp_init_attr.port = ctx->port;
+		qp_init_attr.slave = ctx->slave;
+		qp_init_attr.proxy_qp_type = qp_type;
+		qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX |
+			   IB_QP_QKEY | IB_QP_PORT;
+	} else {
+		qp_init_attr.init_attr.qp_type = qp_type;
+		qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP;
+		qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
+	}
+	qp_init_attr.init_attr.port_num = ctx->port;
+	qp_init_attr.init_attr.qp_context = ctx;
+	qp_init_attr.init_attr.event_handler = pv_qp_event_handler;
+	tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr);
+	if (IS_ERR(tun_qp->qp)) {
+		ret = PTR_ERR(tun_qp->qp);
+		tun_qp->qp = NULL;
+		pr_err("Couldn't create %s QP (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		return ret;
+	}
+
+	memset(&attr, 0, sizeof attr);
+	attr.qp_state = IB_QPS_INIT;
+	attr.pkey_index =
+		to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
+	attr.qkey = IB_QP1_QKEY;
+	attr.port_num = ctx->port;
+	ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to INIT (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+	attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to RTR (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+	attr.qp_state = IB_QPS_RTS;
+	attr.sq_psn = 0;
+	ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to RTS (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
+		if (ret) {
+			pr_err(" mlx4_ib_post_pv_buf error"
+			       " (err = %d, i = %d)\n", ret, i);
+			goto err_qp;
+		}
+	}
+	return 0;
+
+err_qp:
+	ib_destroy_qp(tun_qp->qp);
+	tun_qp->qp = NULL;
+	return ret;
+}
+
+/*
+ * IB MAD completion callback for real SQPs
+ */
+static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+	struct mlx4_ib_demux_pv_qp *sqp;
+	struct ib_wc wc;
+	struct ib_grh *grh;
+	struct ib_mad *mad;
+
+	ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+	ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+	while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+		sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_SEND:
+				ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&sqp->tx_lock);
+				sqp->tx_ix_tail++;
+				spin_unlock(&sqp->tx_lock);
+				break;
+			case IB_WC_RECV:
+				mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
+						(sqp->ring[wc.wr_id &
+						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+				grh = &(((struct mlx4_mad_rcv_buf *)
+						(sqp->ring[wc.wr_id &
+						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+				mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
+				if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
+							   (MLX4_NUM_TUNNEL_BUFS - 1)))
+					pr_err("Failed reposting SQP "
+					       "buf:%lld\n", (long long)wc.wr_id);
+				break;
+			default:
+				BUG_ON(1);
+				break;
+			}
+		} else  {
+			pr_debug("mlx4_ib: completion error in tunnel: %d."
+				 " status = %d, wrid = 0x%llx\n",
+				 ctx->slave, wc.status, (long long)wc.wr_id);
+			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+				ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&sqp->tx_lock);
+				sqp->tx_ix_tail++;
+				spin_unlock(&sqp->tx_lock);
+			}
+		}
+	}
+}
+
+static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port,
+			       struct mlx4_ib_demux_pv_ctx **ret_ctx)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+
+	*ret_ctx = NULL;
+	ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL);
+	if (!ctx) {
+		pr_err("failed allocating pv resource context "
+		       "for port %d, slave %d\n", port, slave);
+		return -ENOMEM;
+	}
+
+	ctx->ib_dev = &dev->ib_dev;
+	ctx->port = port;
+	ctx->slave = slave;
+	*ret_ctx = ctx;
+	return 0;
+}
+
+static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port)
+{
+	if (dev->sriov.demux[port - 1].tun[slave]) {
+		kfree(dev->sriov.demux[port - 1].tun[slave]);
+		dev->sriov.demux[port - 1].tun[slave] = NULL;
+	}
+}
+
+static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
+			       int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
+{
+	int ret, cq_size;
+
+	if (ctx->state != DEMUX_PV_STATE_DOWN)
+		return -EEXIST;
+
+	ctx->state = DEMUX_PV_STATE_STARTING;
+	/* have QP0 only on port owner, and only if link layer is IB */
+	if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) &&
+	    rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND)
+		ctx->has_smi = 1;
+
+	if (ctx->has_smi) {
+		ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun);
+		if (ret) {
+			pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret);
+			goto err_out;
+		}
+	}
+
+	ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun);
+	if (ret) {
+		pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret);
+		goto err_out_qp0;
+	}
+
+	cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+	if (ctx->has_smi)
+		cq_size *= 2;
+
+	ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+			       NULL, ctx, cq_size, 0);
+	if (IS_ERR(ctx->cq)) {
+		ret = PTR_ERR(ctx->cq);
+		pr_err("Couldn't create tunnel CQ (%d)\n", ret);
+		goto err_buf;
+	}
+
+	ctx->pd = ib_alloc_pd(ctx->ib_dev);
+	if (IS_ERR(ctx->pd)) {
+		ret = PTR_ERR(ctx->pd);
+		pr_err("Couldn't create tunnel PD (%d)\n", ret);
+		goto err_cq;
+	}
+
+	ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(ctx->mr)) {
+		ret = PTR_ERR(ctx->mr);
+		pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
+		goto err_pd;
+	}
+
+	if (ctx->has_smi) {
+		ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
+		if (ret) {
+			pr_err("Couldn't create %s QP0 (%d)\n",
+			       create_tun ? "tunnel for" : "",  ret);
+			goto err_mr;
+		}
+	}
+
+	ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun);
+	if (ret) {
+		pr_err("Couldn't create %s QP1 (%d)\n",
+		       create_tun ? "tunnel for" : "",  ret);
+		goto err_qp0;
+	}
+
+	if (create_tun)
+		INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker);
+	else
+		INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
+
+	ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+
+	ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		pr_err("Couldn't arm tunnel cq (%d)\n", ret);
+		goto err_wq;
+	}
+	ctx->state = DEMUX_PV_STATE_ACTIVE;
+	return 0;
+
+err_wq:
+	ctx->wq = NULL;
+	ib_destroy_qp(ctx->qp[1].qp);
+	ctx->qp[1].qp = NULL;
+
+
+err_qp0:
+	if (ctx->has_smi)
+		ib_destroy_qp(ctx->qp[0].qp);
+	ctx->qp[0].qp = NULL;
+
+err_mr:
+	ib_dereg_mr(ctx->mr);
+	ctx->mr = NULL;
+
+err_pd:
+	ib_dealloc_pd(ctx->pd);
+	ctx->pd = NULL;
+
+err_cq:
+	ib_destroy_cq(ctx->cq);
+	ctx->cq = NULL;
+
+err_buf:
+	mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun);
+
+err_out_qp0:
+	if (ctx->has_smi)
+		mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun);
+err_out:
+	ctx->state = DEMUX_PV_STATE_DOWN;
+	return ret;
+}
+
+static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
+				 struct mlx4_ib_demux_pv_ctx *ctx, int flush)
+{
+	if (!ctx)
+		return;
+	if (ctx->state > DEMUX_PV_STATE_DOWN) {
+		ctx->state = DEMUX_PV_STATE_DOWNING;
+		if (flush)
+			flush_workqueue(ctx->wq);
+		if (ctx->has_smi) {
+			ib_destroy_qp(ctx->qp[0].qp);
+			ctx->qp[0].qp = NULL;
+			mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1);
+		}
+		ib_destroy_qp(ctx->qp[1].qp);
+		ctx->qp[1].qp = NULL;
+		mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
+		ib_dereg_mr(ctx->mr);
+		ctx->mr = NULL;
+		ib_dealloc_pd(ctx->pd);
+		ctx->pd = NULL;
+		ib_destroy_cq(ctx->cq);
+		ctx->cq = NULL;
+		ctx->state = DEMUX_PV_STATE_DOWN;
+	}
+}
+
+static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
+				  int port, int do_init)
+{
+	int ret = 0;
+
+	if (!do_init) {
+		clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
+		/* for master, destroy real sqp resources */
+		if (slave == mlx4_master_func_num(dev->dev))
+			destroy_pv_resources(dev, slave, port,
+					     dev->sriov.sqps[port - 1], 1);
+		/* destroy the tunnel qp resources */
+		destroy_pv_resources(dev, slave, port,
+				     dev->sriov.demux[port - 1].tun[slave], 1);
+		return 0;
+	}
+
+	/* create the tunnel qp resources */
+	ret = create_pv_resources(&dev->ib_dev, slave, port, 1,
+				  dev->sriov.demux[port - 1].tun[slave]);
+
+	/* for master, create the real sqp resources */
+	if (!ret && slave == mlx4_master_func_num(dev->dev))
+		ret = create_pv_resources(&dev->ib_dev, slave, port, 0,
+					  dev->sriov.sqps[port - 1]);
+	return ret;
+}
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work)
+{
+	struct mlx4_ib_demux_work *dmxw;
+
+	dmxw = container_of(work, struct mlx4_ib_demux_work, work);
+	mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port,
+			       dmxw->do_init);
+	kfree(dmxw);
+	return;
+}
+
+static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
+				       struct mlx4_ib_demux_ctx *ctx,
+				       int port)
+{
+	char name[12];
+	int ret = 0;
+	int i;
+
+	ctx->tun = kcalloc(dev->dev->caps.sqp_demux,
+			   sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL);
+	if (!ctx->tun)
+		return -ENOMEM;
+
+	ctx->dev = dev;
+	ctx->port = port;
+	ctx->ib_dev = &dev->ib_dev;
+
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+		ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
+		if (ret) {
+			ret = -ENOMEM;
+			goto err_mcg;
+		}
+	}
+
+	ret = mlx4_ib_mcg_port_init(ctx);
+	if (ret) {
+		pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+		goto err_mcg;
+	}
+
+	snprintf(name, sizeof name, "mlx4_ibt%d", port);
+	ctx->wq = create_singlethread_workqueue(name);
+	if (!ctx->wq) {
+		pr_err("Failed to create tunnelling WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_wq;
+	}
+
+	snprintf(name, sizeof name, "mlx4_ibud%d", port);
+	ctx->ud_wq = create_singlethread_workqueue(name);
+	if (!ctx->ud_wq) {
+		pr_err("Failed to create up/down WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_udwq;
+	}
+
+	return 0;
+
+err_udwq:
+	destroy_workqueue(ctx->wq);
+	ctx->wq = NULL;
+
+err_wq:
+	mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++)
+		free_pv_object(dev, i, port);
+	kfree(ctx->tun);
+	ctx->tun = NULL;
+	return ret;
+}
+
+static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
+{
+	if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) {
+		sqp_ctx->state = DEMUX_PV_STATE_DOWNING;
+		flush_workqueue(sqp_ctx->wq);
+		if (sqp_ctx->has_smi) {
+			ib_destroy_qp(sqp_ctx->qp[0].qp);
+			sqp_ctx->qp[0].qp = NULL;
+			mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0);
+		}
+		ib_destroy_qp(sqp_ctx->qp[1].qp);
+		sqp_ctx->qp[1].qp = NULL;
+		mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
+		ib_dereg_mr(sqp_ctx->mr);
+		sqp_ctx->mr = NULL;
+		ib_dealloc_pd(sqp_ctx->pd);
+		sqp_ctx->pd = NULL;
+		ib_destroy_cq(sqp_ctx->cq);
+		sqp_ctx->cq = NULL;
+		sqp_ctx->state = DEMUX_PV_STATE_DOWN;
+	}
+}
+
+static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
+{
+	int i;
+	if (ctx) {
+		struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+		mlx4_ib_mcg_port_cleanup(ctx, 1);
+		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+			if (!ctx->tun[i])
+				continue;
+			if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN)
+				ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
+		}
+		flush_workqueue(ctx->wq);
+		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+			destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
+			free_pv_object(dev, i, ctx->port);
+		}
+		kfree(ctx->tun);
+		destroy_workqueue(ctx->ud_wq);
+		destroy_workqueue(ctx->wq);
+	}
+}
+
+static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init)
+{
+	int i;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+	/* initialize or tear down tunnel QPs for the master */
+	for (i = 0; i < dev->dev->caps.num_ports; i++)
+		mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init);
+	return;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
+{
+	int i = 0;
+	int err;
+
+	if (!mlx4_is_mfunc(dev->dev))
+		return 0;
+
+	dev->sriov.is_going_down = 0;
+	spin_lock_init(&dev->sriov.going_down_lock);
+	mlx4_ib_cm_paravirt_init(dev);
+
+	mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");
+
+	if (mlx4_is_slave(dev->dev)) {
+		mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
+		return 0;
+	}
+
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+		if (i == mlx4_master_func_num(dev->dev))
+			mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
+		else
+			mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
+	}
+
+	err = mlx4_ib_init_alias_guid_service(dev);
+	if (err) {
+		mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
+		goto paravirt_err;
+	}
+	err = mlx4_ib_device_register_sysfs(dev);
+	if (err) {
+		mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+		goto sysfs_err;
+	}
+
+	mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
+		     dev->dev->caps.sqp_demux);
+	for (i = 0; i < dev->num_ports; i++) {
+		union ib_gid gid;
+		err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
+		if (err)
+			goto demux_err;
+		dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+		err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
+				      &dev->sriov.sqps[i]);
+		if (err)
+			goto demux_err;
+		err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
+		if (err)
+			goto demux_err;
+	}
+	mlx4_ib_master_tunnels(dev, 1);
+	return 0;
+
+demux_err:
+	while (i > 0) {
+		free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+		mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+		--i;
+	}
+	mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
+	mlx4_ib_destroy_alias_guid_service(dev);
+
+paravirt_err:
+	mlx4_ib_cm_paravirt_clean(dev, -1);
+
+	return err;
+}
+
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
+{
+	int i;
+	unsigned long flags;
+
+	if (!mlx4_is_mfunc(dev->dev))
+		return;
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	dev->sriov.is_going_down = 1;
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+	if (mlx4_is_master(dev->dev)) {
+		for (i = 0; i < dev->num_ports; i++) {
+			flush_workqueue(dev->sriov.demux[i].ud_wq);
+			mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]);
+			kfree(dev->sriov.sqps[i]);
+			dev->sriov.sqps[i] = NULL;
+			mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+		}
+
+		mlx4_ib_cm_paravirt_clean(dev, -1);
+		mlx4_ib_destroy_alias_guid_service(dev);
+		mlx4_ib_device_unregister_sysfs(dev);
+	}
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -32,12 +32,19 @@
  */
 
 #include <linux/module.h>
-#include <linux/init.h>
+
+#ifdef __linux__
+#include <linux/proc_fs.h>
+#endif
+
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
-#include <linux/rtnetlink.h>
 #include <linux/if_vlan.h>
+#include <linux/bitops.h>
+#include <linux/if_ether.h>
+#include <linux/fs.h>
 
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
@@ -45,45 +52,63 @@
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
-
+#include <linux/sched.h>
 #include "mlx4_ib.h"
 #include "user.h"
 #include "wc.h"
 
 #define DRV_NAME	MLX4_IB_DRV_NAME
-#define DRV_VERSION	"1.0-ofed1.5.2"
-#define DRV_RELDATE	"August 4, 2010"
+#define DRV_VERSION	"1.0"
+#define DRV_RELDATE	"April 4, 2008"
 
+#define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib"
+#define MLX4_IB_MRS_PROC_DIR_NAME "mrs"
+
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
-#ifdef CONFIG_MLX4_DEBUG
+int mlx4_ib_sm_guid_assign = 1;
 
-int mlx4_ib_debug_level = 0;
-module_param_named(debug_level, mlx4_ib_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#ifdef __linux__
+struct proc_dir_entry *mlx4_mrs_dir_entry;
+static struct proc_dir_entry *mlx4_ib_driver_dir_entry;
+#endif
 
-#endif /* CONFIG_MLX4_DEBUG */
+module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
+MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)");
 
+static char dev_assign_str[512];
+//module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644);
+MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to "
+		 "IB device numbers following the  pattern: "
+		 "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)."
+		 " Max supported devices - 32");
+
 static const char mlx4_ib_version[] =
 	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
-static void *get_ibdev(struct mlx4_dev *dev, void *ctx, u8 port)
-{
-       struct mlx4_ib_dev *mlxibdev = ctx;
-       return &mlxibdev->ib_dev;
-}
-
 struct update_gid_work {
-	struct work_struct work;
-	union ib_gid gids[128];
-	int port;
-	struct mlx4_ib_dev *dev;
+	struct work_struct	work;
+	union ib_gid		gids[128];
+	struct mlx4_ib_dev     *dev;
+	int			port;
 };
 
+struct dev_rec {
+	int	bus;
+	int	dev;
+	int	func;
+	int	nr;
+};
+
+#define MAX_DR 32
+static struct dev_rec dr[MAX_DR];
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+
 static struct workqueue_struct *wq;
 
 static void init_query_mad(struct ib_smp *mad)
@@ -112,7 +137,8 @@
 	init_query_mad(in_mad);
 	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
 
-	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+			   1, NULL, NULL, in_mad, out_mad);
 	if (err)
 		goto out;
 
@@ -123,7 +149,9 @@
 		IB_DEVICE_PORT_ACTIVE_EVENT		|
 		IB_DEVICE_SYS_IMAGE_GUID		|
 		IB_DEVICE_RC_RNR_NAK_GEN		|
-		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK	|
+		IB_DEVICE_SHARED_MR;
+
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
@@ -144,42 +172,45 @@
 		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
 		props->device_cap_flags |= IB_DEVICE_XRC;
-	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)
-		props->max_raw_ethy_qp = dev->ib_dev.phys_port_cnt;
 
+	props->device_cap_flags |= IB_DEVICE_QPG;
+	if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
+		props->device_cap_flags |= IB_DEVICE_UD_RSS;
+		props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz;
+	}
 	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
 		0xffffff;
-	props->vendor_part_id	   = be16_to_cpup((__be16 *) (out_mad->data + 30));
+	props->vendor_part_id	   = dev->dev->pdev->device;
 	props->hw_ver		   = be32_to_cpup((__be32 *) (out_mad->data + 32));
 	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
 
 	props->max_mr_size	   = ~0ull;
 	props->page_size_cap	   = dev->dev->caps.page_size_cap;
-	props->max_qp		   = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+	props->max_qp		   = dev->dev->quotas.qp;
 	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
 	props->max_sge		   = min(dev->dev->caps.max_sq_sg,
 					 dev->dev->caps.max_rq_sg);
-	props->max_cq		   = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+	props->max_cq		   = dev->dev->quotas.cq;
 	props->max_cqe		   = dev->dev->caps.max_cqes;
-	props->max_mr		   = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+	props->max_mr		   = dev->dev->quotas.mpt;
 	props->max_pd		   = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
 	props->max_qp_rd_atom	   = dev->dev->caps.max_qp_dest_rdma;
 	props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
 	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
-	props->max_srq		   = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+	props->max_srq		   = dev->dev->quotas.srq;
 	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes - 1;
 	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;
-	props->max_fast_reg_page_list_len = MAX_FAST_REG_PAGES;
+	props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES;
 	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
 	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
 		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
-	props->masked_atomic_cap   = IB_ATOMIC_HCA;
+	props->masked_atomic_cap   = props->atomic_cap;
 	props->max_pkeys	   = dev->dev->caps.pkey_table_len[1];
 	props->max_mcast_grp	   = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
 	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 					   props->max_mcast_grp;
-	props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+	props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
 
 out:
 	kfree(in_mad);
@@ -197,10 +228,33 @@
 		IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
 }
 
-static void ib_link_query_port(struct ib_device *ibdev, u8 port,
-			       struct ib_port_attr *props,
-			       struct ib_smp *out_mad)
+static int ib_link_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props, int netw_view)
 {
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int ext_active_speed;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+				in_mad, out_mad);
+	if (err)
+		goto out;
+
+
 	props->lid		= be16_to_cpup((__be16 *) (out_mad->data + 16));
 	props->lmc		= out_mad->data[34] & 0x7;
 	props->sm_lid		= be16_to_cpup((__be16 *) (out_mad->data + 18));
@@ -208,7 +262,10 @@
 	props->state		= out_mad->data[32] & 0xf;
 	props->phys_state	= out_mad->data[33] >> 4;
 	props->port_cap_flags	= be32_to_cpup((__be32 *) (out_mad->data + 20));
-	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len[port];
+	if (netw_view)
+		props->gid_tbl_len = out_mad->data[50];
+	else
+		props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
 	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
 	props->pkey_tbl_len	= to_mdev(ibdev)->dev->caps.pkey_table_len[port];
 	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
@@ -220,39 +277,46 @@
 	props->subnet_timeout	= out_mad->data[51] & 0x1f;
 	props->max_vl_num	= out_mad->data[37] >> 4;
 	props->init_type_reply	= out_mad->data[41] >> 4;
-	props->link_layer	= IB_LINK_LAYER_INFINIBAND;
-}
 
-#ifdef notyet
-static int eth_to_ib_width(int w)
-{
-	switch (w) {
-	case 4:
-		return IB_WIDTH_4X;
-	case 8:
-	case 16:
-		return IB_WIDTH_8X;
-	case 32:
-		return IB_WIDTH_12X;
-	default:
-		return IB_WIDTH_1X;
+	/* Check if extended speeds (EDR/FDR/...) are supported */
+	if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+		ext_active_speed = out_mad->data[62] >> 4;
+
+		switch (ext_active_speed) {
+		case 1:
+			props->active_speed = IB_SPEED_FDR;
+			break;
+		case 2:
+			props->active_speed = IB_SPEED_EDR;
+			break;
+		}
 	}
-}
 
-static int eth_to_ib_speed(int s)
-{
-	switch (s) {
-	case 256:
-		return 1;
-	case 512:
-		return 2;
-	case 1024:
-		return 4;
-	default:
-		return 1;
+	/* If reported active speed is QDR, check if is FDR-10 */
+	if (props->active_speed == IB_SPEED_QDR) {
+		init_query_mad(in_mad);
+		in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO;
+		in_mad->attr_mod = cpu_to_be32(port);
+
+		err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port,
+				   NULL, NULL, in_mad, out_mad);
+		if (err)
+			goto out;
+
+		/* Checking LinkSpeedActive for FDR-10 */
+		if (out_mad->data[15] & 0x1)
+			props->active_speed = IB_SPEED_FDR10;
 	}
+
+	/* Avoid wrong speed value returned by FW if the IB link is down. */
+	if (props->state == IB_PORT_DOWN)
+		 props->active_speed = IB_SPEED_SDR;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
 }
-#endif
 
 static u8 state_to_phys_state(enum ib_port_state state)
 {
@@ -260,26 +324,39 @@
 }
 
 static int eth_link_query_port(struct ib_device *ibdev, u8 port,
-			       struct ib_port_attr *props,
-			       struct ib_smp *out_mad)
+			       struct ib_port_attr *props, int netw_view)
 {
-	struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe;
+
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	struct mlx4_ib_iboe *iboe = &mdev->iboe;
 	struct net_device *ndev;
 	enum ib_mtu tmp;
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = 0;
 
-	props->active_width	= IB_WIDTH_4X;
-	props->active_speed	= 1;
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	props->active_width	=  (((u8 *)mailbox->buf)[5] == 0x40) ?
+						IB_WIDTH_4X : IB_WIDTH_1X;
+	props->active_speed	= IB_SPEED_QDR;
 	props->port_cap_flags	= IB_PORT_CM_SUP;
-	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len[port];
-	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
+	if (netw_view)
+		props->gid_tbl_len = MLX4_ROCE_MAX_GIDS;
+	else
+		props->gid_tbl_len   = mdev->dev->caps.gid_table_len[port];
+
+	props->max_msg_sz	= mdev->dev->caps.max_msg_sz;
 	props->pkey_tbl_len	= 1;
-	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
-	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
-	props->max_mtu		= IB_MTU_2048;
-	props->subnet_timeout	= 0;
-	props->max_vl_num	= out_mad->data[37] >> 4;
-	props->init_type_reply	= 0;
-	props->link_layer	= IB_LINK_LAYER_ETHERNET;
+	props->max_mtu		= IB_MTU_4096;
+	props->max_vl_num	= 2;
 	props->state		= IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 	props->active_mtu	= IB_MTU_256;
@@ -286,62 +363,51 @@
 	spin_lock(&iboe->lock);
 	ndev = iboe->netdevs[port - 1];
 	if (!ndev)
-		goto out;
+		goto out_unlock;
 
-#ifdef __linux__
-	tmp = iboe_get_mtu(ndev->mtu);
-#else
 	tmp = iboe_get_mtu(ndev->if_mtu);
-#endif
 	props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
-	props->state		= netif_carrier_ok(ndev) &&  netif_oper_up(ndev) ?
+
+	props->state		= (netif_running(ndev) && netif_carrier_ok(ndev)) ?
 					IB_PORT_ACTIVE : IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
-
+out_unlock:
+	spin_unlock(&iboe->lock);
 out:
-	spin_unlock(&iboe->lock);
-	return 0;
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
 }
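
The 0x40 test above decodes the link-width byte returned by QUERY_PORT; as implemented in this patch, byte 5 of the mailbox equal to 0x40 means 4X and anything else is reported as 1X:

	/* Width decode as done above (mailbox layout per this patch): */
	u8 width_byte = ((u8 *)mailbox->buf)[5];
	props->active_width = (width_byte == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X;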
 
-static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
-			      struct ib_port_attr *props)
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			 struct ib_port_attr *props, int netw_view)
 {
-	struct ib_smp *in_mad  = NULL;
-	struct ib_smp *out_mad = NULL;
-	int err = -ENOMEM;
+	int err;
 
-	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
-	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
-	if (!in_mad || !out_mad)
-		goto out;
-
 	memset(props, 0, sizeof *props);
 
-	init_query_mad(in_mad);
-	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
-	in_mad->attr_mod = cpu_to_be32(port);
+	err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
+		ib_link_query_port(ibdev, port, props, netw_view) :
+				eth_link_query_port(ibdev, port, props, netw_view);
 
-	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
-	if (err)
-		goto out;
-
-	mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
-		ib_link_query_port(ibdev, port, props, out_mad) :
-		eth_link_query_port(ibdev, port, props, out_mad);
-
-out:
-	kfree(in_mad);
-	kfree(out_mad);
-
 	return err;
 }
 
-static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
-			       union ib_gid *gid)
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props)
 {
+	/* returns host view */
+	return __mlx4_ib_query_port(ibdev, port, props, 0);
+}
+
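/*
 * A minimal sketch of exercising the new netw_view argument (illustrative
 * only, not from the patch; ibdev and port are assumed to be in scope):
 *
 *	struct ib_port_attr attr;
 *	int err;
 *
 *	err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
 *	if (!err)
 *		pr_info("port %d: gid_tbl_len %d\n", port, attr.gid_tbl_len);
 *
 * netw_view = 1 requests the network's view of the port (e.g. the full
 * RoCE GID table) instead of the host's paravirtualized slice, which is
 * what the mlx4_ib_query_port() wrapper above always asks for.
 */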
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			union ib_gid *gid, int netw_view)
+{
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
 	int err = -ENOMEM;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int clear = 0;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
 
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
 	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -352,17 +418,30 @@
 	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
 	in_mad->attr_mod = cpu_to_be32(port);
 
-	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (mlx4_is_mfunc(dev->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad);
 	if (err)
 		goto out;
 
 	memcpy(gid->raw, out_mad->data + 8, 8);
 
+	if (mlx4_is_mfunc(dev->dev) && !netw_view) {
+		if (index) {
+			/* For any index > 0, return the null guid */
+			err = 0;
+			clear = 1;
+			goto out;
+		}
+	}
+
 	init_query_mad(in_mad);
 	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
 	in_mad->attr_mod = cpu_to_be32(index / 8);
 
-	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, port,
+			   NULL, NULL, in_mad, out_mad);
 	if (err)
 		goto out;
 
@@ -369,6 +448,8 @@
 	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
 
 out:
+	if (clear)
+		memset(gid->raw + 8, 0, 8);
 	kfree(in_mad);
 	kfree(out_mad);
 	return err;
@@ -375,7 +456,7 @@
 }
 
 static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
-			    union ib_gid *gid)
+			  union ib_gid *gid)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 
@@ -388,16 +469,17 @@
 			     union ib_gid *gid)
 {
 	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
-		return __mlx4_ib_query_gid(ibdev, port, index, gid);
+		return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
 	else
 		return iboe_query_gid(ibdev, port, index, gid);
 }
 
-static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-			      u16 *pkey)
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			 u16 *pkey, int netw_view)
 {
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
 	int err = -ENOMEM;
 
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -409,7 +491,11 @@
 	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
 	in_mad->attr_mod = cpu_to_be32(index / 32);
 
-	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+			   in_mad, out_mad);
 	if (err)
 		goto out;
 
@@ -421,11 +507,16 @@
 	return err;
 }
 
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
+}
+
 static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
 				 struct ib_device_modify *props)
 {
 	struct mlx4_cmd_mailbox *mailbox;
-	int err;
+	unsigned long flags;
 
 	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
 		return -EOPNOTSUPP;
@@ -433,12 +524,16 @@
 	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
 		return 0;
 
-	spin_lock(&to_mdev(ibdev)->sm_lock);
+	if (mlx4_is_slave(to_mdev(ibdev)->dev))
+		return -EOPNOTSUPP;
+
+	spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
 	memcpy(ibdev->node_desc, props->node_desc, 64);
-	spin_unlock(&to_mdev(ibdev)->sm_lock);
+	spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
 
-	/* if possible, pass node desc to FW, so it can generate
-	 * a 144 trap. If cmd fails, just ignore.
+	/*
+	 * If possible, pass node desc to FW, so it can generate
+	 * a 144 trap.  If cmd fails, just ignore.
 	 */
 	mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
 	if (IS_ERR(mailbox))
@@ -446,10 +541,8 @@
 
 	memset(mailbox->buf, 0, 256);
 	memcpy(mailbox->buf, props->node_desc, 64);
-	err = mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
-		       MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A);
-	if (err)
-		mlx4_ib_dbg("SET_NODE command failed (%d)", err);
+	mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+		 MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 
 	mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
 
@@ -478,7 +571,7 @@
 	}
 
 	err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 
 	mlx4_free_cmd_mailbox(dev->dev, mailbox);
 	return err;
@@ -514,6 +607,7 @@
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
 	struct mlx4_ib_alloc_ucontext_resp resp;
 	int err;
 
@@ -520,17 +614,29 @@
 	if (!dev->ib_active)
 		return ERR_PTR(-EAGAIN);
 
-	resp.qp_tab_size      = dev->dev->caps.num_qps;
-
-	if (mlx4_wc_enabled()) {
-		resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
-		resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
+		if (mlx4_wc_enabled()) {
+			resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
+			resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+		} else {
+			resp_v3.bf_reg_size      = 0;
+			resp_v3.bf_regs_per_page = 0;
+		}
 	} else {
-		resp.bf_reg_size      = 0;
-		resp.bf_regs_per_page = 0;
+		resp.dev_caps	      = dev->dev->caps.userspace_caps;
+		resp.qp_tab_size      = dev->dev->caps.num_qps;
+		if (mlx4_wc_enabled()) {
+			resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
+			resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+		} else {
+			resp.bf_reg_size      = 0;
+			resp.bf_regs_per_page = 0;
+		}
+		resp.cqe_size	      = dev->dev->caps.cqe_size;
 	}
 
-	context = kzalloc(sizeof *context, GFP_KERNEL);
+	context = kmalloc(sizeof *context, GFP_KERNEL);
 	if (!context)
 		return ERR_PTR(-ENOMEM);
 
@@ -543,7 +649,11 @@
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	err = ib_copy_to_udata(udata, &resp, sizeof resp);
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
+	else
+		err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+
 	if (err) {
 		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
 		kfree(context);
@@ -562,15 +672,75 @@
 
 	return 0;
 }
+#ifdef __linux__
+static unsigned long mlx4_ib_get_unmapped_area(struct file *file,
+			unsigned long addr,
+			unsigned long len, unsigned long pgoff,
+			unsigned long flags)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned long start_addr;
+	unsigned long page_size_order;
+	unsigned long  command;
 
+	mm = current->mm;
+	if (addr)
+		return current->mm->get_unmapped_area(file, addr, len,
+						pgoff, flags);
+
+	/* The last 8 bits hold the command; the rest is data for that command */
+	command = pgoff & MLX4_IB_MMAP_CMD_MASK;
+	if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES)
+		return current->mm->get_unmapped_area(file, addr, len,
+						pgoff, flags);
+
+	page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS;
+	/* code is based on the huge-pages get_unmapped_area code */
+	start_addr = mm->free_area_cache;
+
+	if (len <= mm->cached_hole_size)
+		start_addr = TASK_UNMAPPED_BASE;
+
+full_search:
+	addr = ALIGN(start_addr, 1 << page_size_order);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr) {
+			/*
+			 * Start a new search - just in case we missed
+			 * some holes.
+			 */
+			if (start_addr != TASK_UNMAPPED_BASE) {
+				start_addr = TASK_UNMAPPED_BASE;
+				goto full_search;
+			}
+			return -ENOMEM;
+		}
+
+		if (!vma || addr + len <= vma->vm_start)
+			return addr;
+		addr = ALIGN(vma->vm_end, 1 << page_size_order);
+	}
+}
+#endif
+
 static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
 	struct mlx4_ib_dev *dev = to_mdev(context->device);
+	int err;
 
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
-		return -EINVAL;
+	/* The last 8 bits hold the command; the rest is data for that command */
+	unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK;
 
-	if (vma->vm_pgoff == 0) {
+	if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) {
+		/* compatibility handling for commands 0 & 1 */
+		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+			return -EINVAL;
+	}
+	if (command == MLX4_IB_MMAP_UAR_PAGE) {
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 		if (io_remap_pfn_range(vma, vma->vm_start,
@@ -577,7 +747,8 @@
 				       to_mucontext(context)->uar.pfn,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
-	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+	} else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE &&
+			dev->dev->caps.bf_reg_size != 0) {
 		vma->vm_page_prot = pgprot_wc(vma->vm_page_prot);
 
 		if (io_remap_pfn_range(vma, vma->vm_start,
@@ -585,6 +756,25 @@
 				       dev->dev->caps.num_uars,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
+	} else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) {
+		/* Getting contiguous physical pages */
+		unsigned long total_size = vma->vm_end - vma->vm_start;
+		unsigned long page_size_order = (vma->vm_pgoff) >>
+						MLX4_IB_MMAP_CMD_BITS;
+		struct ib_cmem *ib_cmem;
+		ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size,
+							page_size_order);
+		if (IS_ERR(ib_cmem)) {
+			err = PTR_ERR(ib_cmem);
+			return err;
+		}
+
+		err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma);
+		if (err) {
+			ib_cmem_release_contiguous_pages(ib_cmem);
+			return err;
+		}
+		return 0;
 	} else
 		return -EINVAL;
 
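A sketch of the userspace side of this encoding (illustrative only), under
the assumption suggested by the comments above that the command occupies the
low MLX4_IB_MMAP_CMD_BITS bits of the page offset and the page-size order
sits above them; uverbs_fd, len and page_size_order are hypothetical:

	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* pack <page_size_order, command> into the mmap page offset */
	unsigned long pgoff = (page_size_order << MLX4_IB_MMAP_CMD_BITS) |
			      MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES;

	/* the mmap offset is in bytes, so scale by the page size */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			 uverbs_fd, (off_t)pgoff * sysconf(_SC_PAGESIZE));
	if (buf == MAP_FAILED)
		perror("mmap");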
@@ -598,7 +788,7 @@
 	struct mlx4_ib_pd *pd;
 	int err;
 
-	pd = kzalloc(sizeof *pd, GFP_KERNEL);
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
 	if (!pd)
 		return ERR_PTR(-ENOMEM);
 
@@ -626,11 +816,62 @@
 	return 0;
 }
 
+static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata)
+{
+	struct mlx4_ib_xrcd *xrcd;
+	int err;
+
+	if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
+	if (!xrcd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn);
+	if (err)
+		goto err1;
+
+	xrcd->pd = ib_alloc_pd(ibdev);
+	if (IS_ERR(xrcd->pd)) {
+		err = PTR_ERR(xrcd->pd);
+		goto err2;
+	}
+
+	xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0);
+	if (IS_ERR(xrcd->cq)) {
+		err = PTR_ERR(xrcd->cq);
+		goto err3;
+	}
+
+	return &xrcd->ibxrcd;
+
+err3:
+	ib_dealloc_pd(xrcd->pd);
+err2:
+	mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn);
+err1:
+	kfree(xrcd);
+	return ERR_PTR(err);
+}
+
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	ib_destroy_cq(to_mxrcd(xrcd)->cq);
+	ib_dealloc_pd(to_mxrcd(xrcd)->pd);
+	mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
+	kfree(xrcd);
+
+	return 0;
+}
+
 static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
 {
 	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
-	struct gid_entry *ge;
+	struct mlx4_ib_gid_entry *ge;
 
 	ge = kzalloc(sizeof *ge, GFP_KERNEL);
 	if (!ge)
@@ -658,11 +899,13 @@
 
 	if (!mqp->port)
 		return 0;
+
 	spin_lock(&mdev->iboe.lock);
 	ndev = mdev->iboe.netdevs[mqp->port - 1];
 	if (ndev)
 		dev_hold(ndev);
 	spin_unlock(&mdev->iboe.lock);
+
 	if (ndev) {
 		rdma_get_mcast_mac((struct in6_addr *)gid, mac);
 		rtnl_lock();
@@ -675,38 +918,270 @@
 	return ret;
 }
 
+struct mlx4_ib_steering {
+	struct list_head list;
+	u64 reg_id;
+	union ib_gid gid;
+};
+
 static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 	int err;
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	u64 reg_id;
+	struct mlx4_ib_steering *ib_steering = NULL;
 
-	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags &
-				MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
-				(ibqp->qp_type == IB_QPT_RAW_ETH) ?
-				MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+		if (!ib_steering)
+			return -ENOMEM;
+	}
+
+	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
+				    !!(mqp->flags &
+				       MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+				    MLX4_PROT_IB_IPV6, &reg_id);
 	if (err)
-		return err;
+		goto err_malloc;
 
 	err = add_gid_entry(ibqp, gid);
 	if (err)
 		goto err_add;
 
+	if (ib_steering) {
+		memcpy(ib_steering->gid.raw, gid->raw, 16);
+		ib_steering->reg_id = reg_id;
+		mutex_lock(&mqp->mutex);
+		list_add(&ib_steering->list, &mqp->steering_rules);
+		mutex_unlock(&mqp->mutex);
+	}
 	return 0;
 
 err_add:
 	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
-				(ibqp->qp_type == IB_QPT_RAW_ETH) ?
-				MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+			      MLX4_PROT_IB_IPV6, reg_id);
+err_malloc:
+	kfree(ib_steering);
+
 	return err;
 }
 
-static struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+enum {
+	IBV_FLOW_L4_NONE = 0,
+	IBV_FLOW_L4_OTHER = 3,
+	IBV_FLOW_L4_UDP = 5,
+	IBV_FLOW_L4_TCP = 6
+};
+
+struct mlx4_cm_steering {
+	struct list_head list;
+	u64 reg_id;
+	struct ib_flow_spec spec;
+};
+
+static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec,
+				  struct list_head *rule_list_h)
 {
-	struct gid_entry *ge;
-	struct gid_entry *tmp;
-	struct gid_entry *ret = NULL;
+	struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4;
+	u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
 
+	spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL);
+	if (!spec_l2)
+		return -ENOMEM;
+
+	switch (flow_spec->type) {
+	case IB_FLOW_ETH:
+		spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN);
+		memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
+		spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype;
+		if (flow_spec->l2_id.eth.vlan_present) {
+			spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan;
+			spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff);
+		}
+		break;
+	case IB_FLOW_IB_UC:
+		spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB;
+		if (flow_spec->l2_id.ib_uc.qpn) {
+			spec_l2->ib.l3_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn);
+			spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff);
+		}
+		break;
+	case IB_FLOW_IB_MC_IPV4:
+	case IB_FLOW_IB_MC_IPV6:
+		spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB;
+		memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16);
+		memset(spec_l2->ib.dst_gid_msk, 0xff, 16);
+		break;
+	}
+
+	list_add_tail(&spec_l2->list, rule_list_h);
+
+	if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) ||
+	    flow_spec->type != IB_FLOW_ETH) {
+		spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
+		if (!spec_l3)
+			return -ENOMEM;
+
+		spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+		spec_l3->ipv4.src_ip = flow_spec->src_ip;
+		if (flow_spec->type != IB_FLOW_IB_MC_IPV4 &&
+		    flow_spec->type != IB_FLOW_IB_MC_IPV6)
+			spec_l3->ipv4.dst_ip = flow_spec->dst_ip;
+
+		if (spec_l3->ipv4.src_ip)
+			spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK;
+		if (spec_l3->ipv4.dst_ip)
+			spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK;
+
+		list_add_tail(&spec_l3->list, rule_list_h);
+	}
+
+	if (flow_spec->l4_protocol) {
+		spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL);
+		if (!spec_l4)
+			return -ENOMEM;
+
+		spec_l4->tcp_udp.src_port = flow_spec->src_port;
+		spec_l4->tcp_udp.dst_port = flow_spec->dst_port;
+		if (spec_l4->tcp_udp.src_port)
+			spec_l4->tcp_udp.src_port_msk =
+						MLX4_BE_SHORT_MASK;
+		if (spec_l4->tcp_udp.dst_port)
+			spec_l4->tcp_udp.dst_port_msk =
+						MLX4_BE_SHORT_MASK;
+
+		switch (flow_spec->l4_protocol) {
+		case IBV_FLOW_L4_UDP:
+			spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
+			break;
+		case IBV_FLOW_L4_TCP:
+			spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
+			break;
+		default:
+			dev_err(dev->dma_device,
+				"Unsupported l4 protocol.\n");
+			kfree(spec_l4);
+			return -EPROTONOSUPPORT;
+		}
+		list_add_tail(&spec_l4->list, rule_list_h);
+	}
+	return 0;
+}
+
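/*
 * A worked sketch of the translation above (illustrative, not from the
 * patch; ibdev and err are assumed in scope, 4791 is just an example port,
 * and a real caller would also fill l2_id.eth.mac):
 *
 *	struct ib_flow_spec spec = {
 *		.type                = IB_FLOW_ETH,
 *		.l2_id.eth.ethertype = cpu_to_be16(ETH_P_IP),
 *		.l4_protocol         = IBV_FLOW_L4_TCP,
 *		.dst_port            = cpu_to_be16(4791),
 *	};
 *	LIST_HEAD(rules);
 *
 *	err = flow_spec_to_net_rule(ibdev, &spec, &rules);
 *
 * On success 'rules' holds three mlx4_spec_list entries, in order:
 * MLX4_NET_TRANS_RULE_ID_ETH, then _IPV4 (src/dst IP are zero here, so
 * their masks stay clear), then _TCP with dst_port matched under
 * MLX4_BE_SHORT_MASK.
 */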
+static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev,
+				 struct mlx4_ib_qp *mqp,
+				 struct ib_flow_spec *flow_spec,
+				 int priority, int lock_qp)
+{
+	u64 reg_id = 0;
+	int err = 0;
+	struct mlx4_cm_steering *cm_flow;
+	struct mlx4_spec_list *spec, *tmp_spec;
+
+	struct mlx4_net_trans_rule rule =
+	{	.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+	};
+
+	rule.promisc_mode = flow_spec->rule_type;
+	rule.port = mqp->port;
+	rule.qpn = mqp->mqp.qpn;
+	INIT_LIST_HEAD(&rule.list);
+
+	cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL);
+	if (!cm_flow)
+		return -ENOMEM;
+
+	if (rule.promisc_mode == MLX4_FS_REGULAR) {
+		rule.allow_loopback = !flow_spec->block_mc_loopback;
+		rule.priority = MLX4_DOMAIN_UVERBS | priority;
+		err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec,
+					    &rule.list);
+		if (err)
+			goto free_list;
+	}
+
+	err = mlx4_flow_attach(mdev->dev, &rule, &reg_id);
+	if (err)
+		goto free_list;
+
+	memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec));
+	cm_flow->reg_id = reg_id;
+
+	if (lock_qp)
+		mutex_lock(&mqp->mutex);
+	list_add(&cm_flow->list, &mqp->rules_list);
+	if (lock_qp)
+		mutex_unlock(&mqp->mutex);
+
+free_list:
+	list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) {
+		list_del(&spec->list);
+		kfree(spec);
+	}
+	if (err) {
+		kfree(cm_flow);
+		dev_err(mdev->ib_dev.dma_device,
+			"Fail to attach flow steering rule\n");
+	}
+	return err;
+}
+
+static int __mlx4_ib_flow_detach(struct mlx4_ib_dev *mdev,
+				 struct mlx4_ib_qp *mqp,
+				 struct ib_flow_spec *spec, int priority,
+				 int lock_qp)
+{
+	struct mlx4_cm_steering *cm_flow;
+	int ret;
+
+	if (lock_qp)
+		mutex_lock(&mqp->mutex);
+	list_for_each_entry(cm_flow, &mqp->rules_list, list) {
+		if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) {
+			list_del(&cm_flow->list);
+			break;
+		}
+	}
+	if (lock_qp)
+		mutex_unlock(&mqp->mutex);
+
+	if (&cm_flow->list == &mqp->rules_list) {
+		dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. "
+			"Steering rule is left attached\n");
+		return -EINVAL;
+	}
+
+	ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id);
+
+	kfree(cm_flow);
+	return ret;
+}
+
+static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec,
+			       int priority)
+{
+	return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp),
+				     flow_spec, priority, 1);
+}
+
+static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec,
+			       int priority)
+{
+	return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp),
+				     spec, priority, 1);
+}
+
+static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+	struct mlx4_ib_gid_entry *ge;
+	struct mlx4_ib_gid_entry *tmp;
+	struct mlx4_ib_gid_entry *ret = NULL;
+
 	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
 		if (!memcmp(raw, ge->gid.raw, 16)) {
 			ret = ge;
@@ -724,11 +1199,31 @@
 	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
 	u8 mac[6];
 	struct net_device *ndev;
-	struct gid_entry *ge;
+	struct mlx4_ib_gid_entry *ge;
+	u64 reg_id = 0;
 
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		struct mlx4_ib_steering *ib_steering;
+
+		mutex_lock(&mqp->mutex);
+		list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
+			if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
+				list_del(&ib_steering->list);
+				break;
+			}
+		}
+		mutex_unlock(&mqp->mutex);
+		if (&ib_steering->list == &mqp->steering_rules) {
+			pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
+			return -EINVAL;
+		}
+		reg_id = ib_steering->reg_id;
+		kfree(ib_steering);
+	}
+
 	err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
-				(ibqp->qp_type == IB_QPT_RAW_ETH) ?
-				MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+				    MLX4_PROT_IB_IPV6, reg_id);
 	if (err)
 		return err;
 
@@ -750,7 +1245,7 @@
 		list_del(&ge->list);
 		kfree(ge);
 	} else
-		printk(KERN_WARNING "could not find mgid entry\n");
+		pr_warn("could not find mgid entry\n");
 
 	mutex_unlock(&mqp->mutex);
 
@@ -757,84 +1252,11 @@
 	return 0;
 }
 
-static void mlx4_dummy_comp_handler(struct ib_cq *cq, void *cq_context)
-{
-}
-
-static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_ucontext *context,
-					  struct ib_udata *udata)
-{
-	struct mlx4_ib_xrcd *xrcd;
-	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
-	struct ib_pd *pd;
-	struct ib_cq *cq;
-	int err;
-
-	if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return ERR_PTR(-ENOSYS);
-
-	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
-	if (!xrcd)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx4_xrcd_alloc(mdev->dev, &xrcd->xrcdn);
-	if (err)
-		goto err_xrcd;
-
-	pd = mlx4_ib_alloc_pd(ibdev, NULL, NULL);
-	if (IS_ERR(pd)) {
-		err = PTR_ERR(pd);
-		goto err_pd;
-	}
-	pd->device  = ibdev;
-
-	cq = mlx4_ib_create_cq(ibdev, 1, 0, NULL, NULL);
-	if (IS_ERR(cq)) {
-		err = PTR_ERR(cq);
-		goto err_cq;
-	}
-	cq->device        = ibdev;
-	cq->comp_handler  = mlx4_dummy_comp_handler;
-
-	if (context)
-		if (ib_copy_to_udata(udata, &xrcd->xrcdn, sizeof(__u32))) {
-			err = -EFAULT;
-			goto err_copy;
-		}
-
-	xrcd->cq = cq;
-	xrcd->pd = pd;
-	return &xrcd->ibxrcd;
-
-err_copy:
-	mlx4_ib_destroy_cq(cq);
-err_cq:
-	mlx4_ib_dealloc_pd(pd);
-err_pd:
-	mlx4_xrcd_free(mdev->dev, xrcd->xrcdn);
-err_xrcd:
-	kfree(xrcd);
-	return ERR_PTR(err);
-}
-
-static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
-{
-	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
-
-	mlx4_ib_destroy_cq(mxrcd->cq);
-	mlx4_ib_dealloc_pd(mxrcd->pd);
-	mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
-	kfree(xrcd);
-
-	return 0;
-}
-
-
 static int init_node_data(struct mlx4_ib_dev *dev)
 {
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
 	int err = -ENOMEM;
 
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -844,8 +1266,10 @@
 
 	init_query_mad(in_mad);
 	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+	if (mlx4_is_master(dev->dev))
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
 
-	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
 	if (err)
 		goto out;
 
@@ -853,7 +1277,7 @@
 
 	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
 
-	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
 	if (err)
 		goto out;
 
@@ -913,136 +1337,6 @@
 	&dev_attr_board_id
 };
 
-/*
- * create show function and a device_attribute struct pointing to
- * the function for _name
- */
-#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod)		\
-static ssize_t show_rprt_##_name(struct device *dev,		\
-				 struct device_attribute *attr,	\
-				 char *buf){			\
-	return show_diag_rprt(dev, buf, _offset, _op_mod);	\
-}								\
-static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL);
-
-#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3
-
-static size_t show_diag_rprt(struct device *device, char *buf,
-                              u32 offset, u8 op_modifier)
-{
-	size_t ret;
-	u32 counter_offset = offset;
-	u32 diag_counter = 0;
-	struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
-					       ib_dev.dev);
-
-	ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier,
-				       &counter_offset, &diag_counter);
-	if (ret)
-		return ret;
-
-	return sprintf(buf,"%d\n", diag_counter);
-}
-
-static ssize_t clear_diag_counters(struct device *device,
-				   struct device_attribute *attr,
-				   const char *buf, size_t length)
-{
-	size_t ret;
-	struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
-					       ib_dev.dev);
-
-	ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS,
-				       NULL, NULL);
-	if (ret)
-		return ret;
-
-	return length;
-}
-
-DEVICE_DIAG_RPRT_ATTR(rq_num_lle	, 0x00, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_lle	, 0x04, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe	, 0x08, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe 	, 0x0C, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_leeoe	, 0x10, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_leeoe	, 0x14, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_lpe	, 0x18, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_lpe	, 0x1C, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe	, 0x20, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe	, 0x24, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe	, 0x2C, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_bre	, 0x34, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_lae	, 0x38, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rire	, 0x44, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_rire	, 0x48, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rae	, 0x4C, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_rae	, 0x50, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_roe	, 0x54, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_tree	, 0x5C, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rree	, 0x64, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_rnr	, 0x68, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rnr	, 0x6C, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rabrte	, 0x7C, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_ieecne	, 0x84, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_ieecse	, 0x8C, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_oos	, 0x100, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_oos	, 0x104, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_mce	, 0x108, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_rsync	, 0x110, 2);
-DEVICE_DIAG_RPRT_ATTR(sq_num_rsync	, 0x114, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd	, 0x118, 2);
-DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd	, 0x120, 2);
-DEVICE_DIAG_RPRT_ATTR(num_cqovf		, 0x1A0, 2);
-DEVICE_DIAG_RPRT_ATTR(num_eqovf		, 0x1A4, 2);
-DEVICE_DIAG_RPRT_ATTR(num_baddb		, 0x1A8, 2);
-
-static DEVICE_ATTR(clear_diag, S_IWUGO, NULL, clear_diag_counters);
-
-static struct attribute *diag_rprt_attrs[] = {
-	&dev_attr_rq_num_lle.attr,
-	&dev_attr_sq_num_lle.attr,
-	&dev_attr_rq_num_lqpoe.attr,
-	&dev_attr_sq_num_lqpoe.attr,
-	&dev_attr_rq_num_leeoe.attr,
-	&dev_attr_sq_num_leeoe.attr,
-	&dev_attr_rq_num_lpe.attr,
-	&dev_attr_sq_num_lpe.attr,
-	&dev_attr_rq_num_wrfe.attr,
-	&dev_attr_sq_num_wrfe.attr,
-	&dev_attr_sq_num_mwbe.attr,
-	&dev_attr_sq_num_bre.attr,
-	&dev_attr_rq_num_lae.attr,
-	&dev_attr_sq_num_rire.attr,
-	&dev_attr_rq_num_rire.attr,
-	&dev_attr_sq_num_rae.attr,
-	&dev_attr_rq_num_rae.attr,
-	&dev_attr_sq_num_roe.attr,
-	&dev_attr_sq_num_tree.attr,
-	&dev_attr_sq_num_rree.attr,
-	&dev_attr_rq_num_rnr.attr,
-	&dev_attr_sq_num_rnr.attr,
-	&dev_attr_sq_num_rabrte.attr,
-	&dev_attr_sq_num_ieecne.attr,
-	&dev_attr_sq_num_ieecse.attr,
-	&dev_attr_rq_num_oos.attr,
-	&dev_attr_sq_num_oos.attr,
-	&dev_attr_rq_num_mce.attr,
-	&dev_attr_rq_num_rsync.attr,
-	&dev_attr_sq_num_rsync.attr,
-	&dev_attr_rq_num_udsdprd.attr,
-	&dev_attr_rq_num_ucsdprd.attr,
-	&dev_attr_num_cqovf.attr,
-	&dev_attr_num_eqovf.attr,
-	&dev_attr_num_baddb.attr,
-	&dev_attr_clear_diag.attr,
-	NULL
-};
-
-struct attribute_group diag_counters_group = {
-	.name  = "diag_counters",
-	.attrs  = diag_rprt_attrs
-};
-
 static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev)
 {
 #ifdef __linux__
@@ -1049,8 +1343,8 @@
 	memcpy(eui, dev->dev_addr, 3);
 	memcpy(eui + 5, dev->dev_addr + 3, 3);
 #else
-	memcpy(eui, IF_LLADDR(dev), 3);
-	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
+	memcpy(eui, IF_LLADDR(dev), 3);
+	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
 #endif
 	if (vlan_id < 0x1000) {
 		eui[3] = vlan_id >> 8;
@@ -1069,11 +1363,10 @@
 	union ib_gid *gids;
 	int err;
 	struct mlx4_dev	*dev = gw->dev->dev;
-	struct ib_event event;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox)) {
-		printk(KERN_WARNING "update gid table failed %ld\n", PTR_ERR(mailbox));
+		pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
 		return;
 	}
 
@@ -1081,15 +1374,13 @@
 	memcpy(gids, gw->gids, sizeof gw->gids);
 
 	err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B);
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
 	if (err)
-		printk(KERN_WARNING "set port command failed\n");
+		pr_warn("set port command failed\n");
 	else {
 		memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids);
-		event.device = &gw->dev->ib_dev;
-		event.element.port_num = gw->port;
-		event.event    = IB_EVENT_GID_CHANGE;
-		ib_dispatch_event(&event);
+		mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE);
 	}
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
@@ -1096,10 +1387,6 @@
 	kfree(gw);
 }
 
-enum {
-	MLX4_MAX_EFF_VLANS = 128 - MLX4_VLAN_REGULAR,
-};
-
 static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear)
 {
 	struct net_device *ndev = dev->iboe.netdevs[port - 1];
@@ -1107,11 +1394,11 @@
 	struct net_device *tmp;
 	int i;
 	u8 *hits;
-	int ret;
 	union ib_gid gid;
-	int tofree;
+	int index_free;
 	int found;
 	int need_update = 0;
+	int max_gids;
 	u16 vid;
 
 	work = kzalloc(sizeof *work, GFP_ATOMIC);
@@ -1118,18 +1405,20 @@
 	if (!work)
 		return -ENOMEM;
 
-	hits = kzalloc(MLX4_MAX_EFF_VLANS + 1, GFP_ATOMIC);
+	hits = kzalloc(128, GFP_ATOMIC);
 	if (!hits) {
-		ret = -ENOMEM;
-		goto out;
+		kfree(work);
+		return -ENOMEM;
 	}
 
+	max_gids = dev->dev->caps.gid_table_len[port];
+
 #ifdef __linux__
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, tmp) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, tmp) {
 #else
-	IFNET_RLOCK();
-	TAILQ_FOREACH(tmp, &V_ifnet, if_link) {
+	IFNET_RLOCK();
+	TAILQ_FOREACH(tmp, &V_ifnet, if_link) {
 #endif
 		if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) {
 			gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
@@ -1136,11 +1425,11 @@
 			vid = rdma_vlan_dev_vlan_id(tmp);
 			mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev);
 			found = 0;
-			tofree = -1;
-			for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) {
-				if (tofree < 0 &&
+			index_free = -1;
+			for (i = 0; i < max_gids; ++i) {
+				if (index_free < 0 &&
 				    !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
-					tofree = i;
+					index_free = i;
 				if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) {
 					hits[i] = 1;
 					found = 1;
@@ -1149,26 +1438,30 @@
 			}
 
 			if (!found) {
-				if (tmp == ndev  && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) {
+				if (tmp == ndev &&
+				    (memcmp(&dev->iboe.gid_table[port - 1][0],
+					    &gid, sizeof gid) ||
+				     !memcmp(&dev->iboe.gid_table[port - 1][0],
+					     &zgid, sizeof gid))) {
 					dev->iboe.gid_table[port - 1][0] = gid;
 					++need_update;
 					hits[0] = 1;
-				} else if (tofree >= 0) {
-					dev->iboe.gid_table[port - 1][tofree] = gid;
-					hits[tofree] = 1;
+				} else if (index_free >= 0) {
+					dev->iboe.gid_table[port - 1][index_free] = gid;
+					hits[index_free] = 1;
 					++need_update;
 				}
 			}
 		}
-#ifdef __linux__
-	}
-	read_unlock(&dev_base_lock);
+#ifdef __linux__
+	}
+	rcu_read_unlock();
 #else
-	}
-	IFNET_RUNLOCK();
+	}
+	IFNET_RUNLOCK();
 #endif
 
-	for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i)
+	for (i = 0; i < max_gids; ++i)
 		if (!hits[i]) {
 			if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
 				++need_update;
@@ -1175,7 +1468,6 @@
 			dev->iboe.gid_table[port - 1][i] = zgid;
 		}
 
-
 	if (need_update) {
 		memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids);
 		INIT_WORK(&work->work, update_gids_task);
@@ -1187,10 +1479,6 @@
 
 	kfree(hits);
 	return 0;
-
-out:
-	kfree(work);
-	return ret;
 }
 
 static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event)
@@ -1239,7 +1527,8 @@
 	spin_lock(&iboe->lock);
 	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
 		oldnd = iboe->netdevs[port - 1];
-		iboe->netdevs[port - 1] = mlx4_get_prot_dev(ibdev->dev, MLX4_PROT_EN, port);
+		iboe->netdevs[port - 1] =
+			mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
 		if (oldnd != iboe->netdevs[port - 1]) {
 			if (iboe->netdevs[port - 1])
 				netdev_added(ibdev, port);
@@ -1260,20 +1549,328 @@
 	return NOTIFY_DONE;
 }
 
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+	int port;
+	int slave;
+	int i;
+
+	if (mlx4_is_master(ibdev->dev)) {
+		for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) {
+			for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+				for (i = 0;
+				     i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+				     ++i) {
+					ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] =
+					/* master has the identity virt2phys pkey mapping */
+						(slave == mlx4_master_func_num(ibdev->dev) || !i) ? i :
+							ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+					mlx4_sync_pkey_table(ibdev->dev, slave, port, i,
+							     ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]);
+				}
+			}
+		}
+		/* initialize pkey cache */
+		for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+			for (i = 0;
+			     i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+			     ++i)
+				ibdev->pkeys.phys_pkey_cache[port-1][i] =
+					(i) ? 0 : 0xFFFF;
+		}
+	}
+}
+
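/*
 * A worked reading of the mapping above, assuming a table length of 128:
 * the master function keeps the identity mapping (virt i -> phys i), while
 * every other slave keeps only virt 0 -> phys 0 and has all remaining
 * virtual indices pointed at phys slot 127, the table's last entry.  The
 * physical pkey cache is then seeded with 0xFFFF at index 0 and 0
 * elsewhere, so only the default pkey starts out valid.
 */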
+static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+	char name[32];
+	int eq_per_port = 0;
+	int added_eqs = 0;
+	int total_eqs = 0;
+	int i, j, eq;
+
+	/* Legacy mode or comp_pool is not large enough */
+	if (dev->caps.comp_pool == 0 ||
+	    dev->caps.num_ports > dev->caps.comp_pool)
+		return;
+
+	eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/
+					dev->caps.num_ports);
+
+	/* Init eq table */
+	added_eqs = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		added_eqs += eq_per_port;
+
+	total_eqs = dev->caps.num_comp_vectors + added_eqs;
+
+	ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
+	if (!ibdev->eq_table)
+		return;
+
+	ibdev->eq_added = added_eqs;
+
+	eq = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
+		for (j = 0; j < eq_per_port; j++) {
+			snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j,
+			    pci_get_domain(dev->pdev->dev.bsddev),
+			    pci_get_bus(dev->pdev->dev.bsddev),
+			    PCI_SLOT(dev->pdev->devfn),
+			    PCI_FUNC(dev->pdev->devfn));
+
+			/* Set IRQ for specific name (per ring) */
+			if (mlx4_assign_eq(dev, name,
+					   &ibdev->eq_table[eq])) {
+				/* Use legacy (same as mlx4_en driver) */
+				pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
+				ibdev->eq_table[eq] =
+					(eq % dev->caps.num_comp_vectors);
+			}
+			eq++;
+		}
+	}
+
+	/* Fill the rest of the vector with legacy EQs */
+	for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
+		ibdev->eq_table[eq++] = i;
+
+	/* Advertise the new number of EQs to clients */
+	ibdev->ib_dev.num_comp_vectors = total_eqs;
+}
+
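/*
 * Worked numbers for the sizing above: with, say, comp_pool = 24 and two
 * IB ports, eq_per_port = rounddown_pow_of_two(24 / 2) = 8, so
 * added_eqs = 16 and total_eqs = num_comp_vectors + 16.  The tail of
 * eq_table is filled with the legacy vectors 0 .. num_comp_vectors - 1,
 * and clients are then advertised total_eqs completion vectors.
 */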
+static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+	int i;
+
+	/* no additional eqs were added */
+	if (!ibdev->eq_table)
+		return;
+
+	/* Reset the advertised EQ number */
+	ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+
+	/* Free only the added eqs */
+	for (i = 0; i < ibdev->eq_added; i++) {
+		/* Don't free legacy eqs if used */
+		if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
+			continue;
+		mlx4_release_eq(dev, ibdev->eq_table[i]);
+	}
+
+	kfree(ibdev->eq_table);
+}
+
+/*
+ * create show function and a device_attribute struct pointing to
+ * the function for _name
+ */
+#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod)		\
+static ssize_t show_rprt_##_name(struct device *dev,		\
+				 struct device_attribute *attr,	\
+				 char *buf){			\
+	return show_diag_rprt(dev, buf, _offset, _op_mod);	\
+}								\
+static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL);
+
+#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3
+
+static size_t show_diag_rprt(struct device *device, char *buf,
+			     u32 offset, u8 op_modifier)
+{
+	size_t ret;
+	u32 counter_offset = offset;
+	u32 diag_counter = 0;
+	struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
+					       ib_dev.dev);
+
+	ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier,
+				       &counter_offset, &diag_counter);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", diag_counter);
+}
+
+static ssize_t clear_diag_counters(struct device *device,
+				   struct device_attribute *attr,
+				   const char *buf, size_t length)
+{
+	size_t ret;
+	struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
+					       ib_dev.dev);
+
+	ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS,
+				       NULL, NULL);
+	if (ret)
+		return ret;
+
+	return length;
+}
+
+DEVICE_DIAG_RPRT_ATTR(rq_num_lle	, 0x00, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lle	, 0x04, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe	, 0x08, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe	, 0x0C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lpe	, 0x18, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lpe	, 0x1C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe	, 0x20, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe	, 0x24, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe	, 0x2C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_bre	, 0x34, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lae	, 0x38, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rire	, 0x44, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rire	, 0x48, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rae	, 0x4C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rae	, 0x50, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_roe	, 0x54, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_tree	, 0x5C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rree	, 0x64, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rnr	, 0x68, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rnr	, 0x6C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_oos	, 0x100, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_oos	, 0x104, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_mce	, 0x108, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd	, 0x118, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd	, 0x120, 2);
+DEVICE_DIAG_RPRT_ATTR(num_cqovf		, 0x1A0, 2);
+DEVICE_DIAG_RPRT_ATTR(num_eqovf		, 0x1A4, 2);
+DEVICE_DIAG_RPRT_ATTR(num_baddb		, 0x1A8, 2);
+
+static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters);
+
+static struct attribute *diag_rprt_attrs[] = {
+	&dev_attr_rq_num_lle.attr,
+	&dev_attr_sq_num_lle.attr,
+	&dev_attr_rq_num_lqpoe.attr,
+	&dev_attr_sq_num_lqpoe.attr,
+	&dev_attr_rq_num_lpe.attr,
+	&dev_attr_sq_num_lpe.attr,
+	&dev_attr_rq_num_wrfe.attr,
+	&dev_attr_sq_num_wrfe.attr,
+	&dev_attr_sq_num_mwbe.attr,
+	&dev_attr_sq_num_bre.attr,
+	&dev_attr_rq_num_lae.attr,
+	&dev_attr_sq_num_rire.attr,
+	&dev_attr_rq_num_rire.attr,
+	&dev_attr_sq_num_rae.attr,
+	&dev_attr_rq_num_rae.attr,
+	&dev_attr_sq_num_roe.attr,
+	&dev_attr_sq_num_tree.attr,
+	&dev_attr_sq_num_rree.attr,
+	&dev_attr_rq_num_rnr.attr,
+	&dev_attr_sq_num_rnr.attr,
+	&dev_attr_rq_num_oos.attr,
+	&dev_attr_sq_num_oos.attr,
+	&dev_attr_rq_num_mce.attr,
+	&dev_attr_rq_num_udsdprd.attr,
+	&dev_attr_rq_num_ucsdprd.attr,
+	&dev_attr_num_cqovf.attr,
+	&dev_attr_num_eqovf.attr,
+	&dev_attr_num_baddb.attr,
+	&dev_attr_clear_diag.attr,
+	NULL
+};
+
+static struct attribute_group diag_counters_group = {
+	.name  = "diag_counters",
+	.attrs  = diag_rprt_attrs
+};
+
+#ifdef __linux__
+static int mlx4_ib_proc_init(void)
+{
+	/*
+	 * Create the procfs directories /proc/drivers/mlx4_ib/ and
+	 * /proc/drivers/mlx4_ib/mrs for later use by the driver.
+	 */
+	int err;
+
+	mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME,
+				NULL);
+	if (!mlx4_ib_driver_dir_entry) {
+		pr_err("mlx4_ib_proc_init has failed for %s\n",
+		       MLX4_IB_DRIVER_PROC_DIR_NAME);
+		err = -ENODEV;
+		goto error;
+	}
+
+	mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME,
+					mlx4_ib_driver_dir_entry);
+	if (!mlx4_mrs_dir_entry) {
+		pr_err("mlx4_ib_proc_init has failed for %s\n",
+		       MLX4_IB_MRS_PROC_DIR_NAME);
+		err = -ENODEV;
+		goto remove_entry;
+	}
+
+	return 0;
+
+remove_entry:
+	remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME,
+				NULL);
+error:
+	return err;
+}
+#endif
+
+static void init_dev_assign(void)
+{
+	int bus, slot, fn, ib_idx;
+	char *p = dev_assign_str, *t;
+	char curr_val[32] = {0};
+	int ret;
+	int j, i = 0;
+
+	memset(dr, 0, sizeof dr);
+
+	if (dev_assign_str[0] == 0)
+		return;
+
+	while (strlen(p)) {
+		ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx);
+		if (ret != 4 || ib_idx < 0)
+			goto err;
+
+		for (j = 0; j < i; j++)
+			if (dr[j].nr == ib_idx)
+				goto err;
+
+		dr[i].bus = bus;
+		dr[i].dev = slot;
+		dr[i].func = fn;
+		dr[i].nr = ib_idx;
+
+		t = strchr(p, ',');
+		sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx);
+		if ((!t) && strlen(p) == strlen(curr_val))
+			return;
+
+		if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str)
+			goto err;
+
+		++i;
+		if (i >= MAX_DR)
+			goto err;
+
+		p = t + 1;
+	}
+
+	return;
+err:
+	memset(dr, 0, sizeof dr);
+	printk(KERN_WARNING "mlx4_ib: The value of the 'dev_assign_str' "
+			    "parameter is invalid; it is discarded.\n");
+}
+
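/*
 * For reference, the format parsed above is a comma-separated list of
 * "bus:slot.func-index" tokens in hex, per the sscanf pattern.  A
 * hypothetical value such as "03:00.0-0,03:00.1-1" would pin PCI function
 * 03:00.0 to mlx4_ib device index 0 and 03:00.1 to index 1.
 */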
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
-	static int mlx4_ib_version_printed;
 	struct mlx4_ib_dev *ibdev;
 	int num_ports = 0;
-	int i;
+	int i, j;
 	int err;
 	struct mlx4_ib_iboe *iboe;
-	int k;
 
-	if (!mlx4_ib_version_printed) {
-		printk(KERN_INFO "%s", mlx4_ib_version);
-		++mlx4_ib_version_printed;
-	}
+	printk(KERN_INFO "%s", mlx4_ib_version);
 
 	mlx4_foreach_ib_transport_port(i, dev)
 		num_ports++;
@@ -1296,9 +1893,12 @@
 	if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
 		goto err_pd;
 
-	ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT,
+		PAGE_SIZE);
+
 	if (!ibdev->priv_uar.map)
 		goto err_uar;
+
 	MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
 
 	ibdev->dev = dev;
@@ -1312,7 +1912,11 @@
 	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
 
-	ibdev->ib_dev.uverbs_abi_ver	= MLX4_IB_UVERBS_ABI_VERSION;
+	if (dev->caps.userspace_caps)
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+	else
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
 	ibdev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -1334,6 +1938,11 @@
 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
+		(1ull << IB_USER_VERBS_CMD_OPEN_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_FLOW)		|
+		(1ull << IB_USER_VERBS_CMD_DETACH_FLOW)		|
 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
 
 	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
@@ -1346,6 +1955,9 @@
 	ibdev->ib_dev.alloc_ucontext	= mlx4_ib_alloc_ucontext;
 	ibdev->ib_dev.dealloc_ucontext	= mlx4_ib_dealloc_ucontext;
 	ibdev->ib_dev.mmap		= mlx4_ib_mmap;
+#ifdef __linux__
+	ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area;
+#endif
 	ibdev->ib_dev.alloc_pd		= mlx4_ib_alloc_pd;
 	ibdev->ib_dev.dealloc_pd	= mlx4_ib_dealloc_pd;
 	ibdev->ib_dev.create_ah		= mlx4_ib_create_ah;
@@ -1376,87 +1988,139 @@
 	ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
 	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
+	ibdev->ib_dev.attach_flow	= mlx4_ib_flow_attach;
+	ibdev->ib_dev.detach_flow	= mlx4_ib_flow_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
 
-	ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
-	ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
-	ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
-	ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
+	if (!mlx4_is_slave(ibdev->dev)) {
+		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
+		ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
+		ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
+		ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
+	}
+
 	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
-		ibdev->ib_dev.create_xrc_srq = mlx4_ib_create_xrc_srq;
 		ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
 		ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
-		ibdev->ib_dev.create_xrc_rcv_qp = mlx4_ib_create_xrc_rcv_qp;
-		ibdev->ib_dev.modify_xrc_rcv_qp = mlx4_ib_modify_xrc_rcv_qp;
-		ibdev->ib_dev.query_xrc_rcv_qp = mlx4_ib_query_xrc_rcv_qp;
-		ibdev->ib_dev.reg_xrc_rcv_qp = mlx4_ib_reg_xrc_rcv_qp;
-		ibdev->ib_dev.unreg_xrc_rcv_qp = mlx4_ib_unreg_xrc_rcv_qp;
 		ibdev->ib_dev.uverbs_cmd_mask |=
-			(1ull << IB_USER_VERBS_CMD_CREATE_XRC_SRQ)	|
-			(1ull << IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN)	|
-			(1ull << IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN)	|
-			(1ull << IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP)	|
-			(1ull << IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP)	|
-			(1ull << IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP)	|
-			(1ull << IB_USER_VERBS_CMD_REG_XRC_RCV_QP)	|
-			(1ull << IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP);
+			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
 	}
 
+	mlx4_ib_alloc_eqs(dev, ibdev);
 
 	spin_lock_init(&iboe->lock);
+
 	if (init_node_data(ibdev))
 		goto err_map;
 
-	for (k = 0; k < ibdev->num_ports; ++k) {
-		err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[k]);
-		if (err)
-			ibdev->counters[k] = -1;
-		else
-			mlx4_set_iboe_counter(dev, ibdev->counters[k], k + 1);
+	for (i = 0; i < ibdev->num_ports; ++i) {
+		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+						IB_LINK_LAYER_ETHERNET) {
+			err = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i]);
+			if (err)
+				ibdev->counters[i] = -1;
+		} else
+			ibdev->counters[i] = -1;
 	}
 
 	spin_lock_init(&ibdev->sm_lock);
 	mutex_init(&ibdev->cap_mask_mutex);
-	mutex_init(&ibdev->xrc_reg_mutex);
 
-	if (ib_register_device(&ibdev->ib_dev))
-		goto err_counter;
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    !mlx4_is_slave(dev)) {
+		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
+		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
+					    MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0);
+		if (err)
+			goto err_counter;
 
+		ibdev->ib_uc_qpns_bitmap =
+			kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) *
+				sizeof(long),
+				GFP_KERNEL);
+		if (!ibdev->ib_uc_qpns_bitmap) {
+			dev_err(&dev->pdev->dev, "bit map alloc failed\n");
+			goto err_steer_qp_release;
+		}
+
+		bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count);
+
+		err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base,
+				ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1);
+		if (err)
+			goto err_steer_free_bitmap;
+	}
+
+	if (ib_register_device(&ibdev->ib_dev, NULL))
+		goto err_steer_free_bitmap;
+
 	if (mlx4_ib_mad_init(ibdev))
 		goto err_reg;
+
+	if (mlx4_ib_init_sriov(ibdev))
+		goto err_mad;
+
 	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) {
 		iboe->nb.notifier_call = mlx4_ib_netdev_event;
 		err = register_netdevice_notifier(&iboe->nb);
 		if (err)
-			goto err_reg;
+			goto err_sriov;
 	}
-	for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) {
+
+	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
 		if (device_create_file(&ibdev->ib_dev.dev,
-				       mlx4_class_attributes[i]))
+				       mlx4_class_attributes[j]))
 			goto err_notif;
 	}
-
-	if(sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
+	if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
 		goto err_notif;
 
-	ibdev->ib_active = 1;
+	ibdev->ib_active = true;
 
+	if (mlx4_is_mfunc(ibdev->dev))
+		init_pkeys(ibdev);
+
+	/* create paravirt contexts for any VFs which are active */
+	if (mlx4_is_master(ibdev->dev)) {
+		for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+			if (j == mlx4_master_func_num(ibdev->dev))
+				continue;
+			if (mlx4_is_slave_active(ibdev->dev, j))
+				do_slave_init(ibdev, j, 1);
+		}
+	}
 	return ibdev;
 
 err_notif:
 	if (unregister_netdevice_notifier(&ibdev->iboe.nb))
-		printk(KERN_WARNING "failure unregistering notifier\n");
+		pr_warn("failure unregistering notifier\n");
 	flush_workqueue(wq);
 
+err_sriov:
+	mlx4_ib_close_sriov(ibdev);
+
+err_mad:
+	mlx4_ib_mad_cleanup(ibdev);
+
 err_reg:
 	ib_unregister_device(&ibdev->ib_dev);
 
+err_steer_free_bitmap:
+	kfree(ibdev->ib_uc_qpns_bitmap);
+
+err_steer_qp_release:
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+				ibdev->steer_qpn_count);
 err_counter:
-	for (; k; --k)
-		mlx4_counter_free(ibdev->dev, ibdev->counters[k - 1]);
+	for (; i; --i)
+		if (ibdev->counters[i - 1] != -1)
+			mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1]);
 
 err_map:
 	iounmap(ibdev->priv_uar.map);
+	mlx4_ib_free_eqs(dev, ibdev);
 
 err_uar:
 	mlx4_uar_free(dev, &ibdev->priv_uar);
@@ -1470,73 +2134,215 @@
 	return NULL;
 }
 
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn)
+{
+	int offset;
+
+	WARN_ON(!dev->ib_uc_qpns_bitmap);
+
+	offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap,
+					 dev->steer_qpn_count,
+					 get_count_order(count));
+	if (offset < 0)
+		return offset;
+
+	*qpn = dev->steer_qpn_base + offset;
+	return 0;
+}
+
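/*
 * Worked example: a request for count = 5 asks bitmap_find_free_region()
 * for a region of order get_count_order(5) = 3, i.e. 8 contiguous,
 * naturally aligned QPNs, and *qpn is set to steer_qpn_base + offset.
 * The matching mlx4_ib_steer_qp_free() call must pass the same count so
 * the released region has the same order.
 */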
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
+{
+	if (!qpn ||
+	    dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return;
+
+	BUG_ON(qpn < dev->steer_qpn_base);
+
+	bitmap_release_region(dev->ib_uc_qpns_bitmap,
+			qpn - dev->steer_qpn_base, get_count_order(count));
+}
+
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+			 int is_attach)
+{
+	struct ib_flow_spec spec = {
+		.type = IB_FLOW_IB_UC,
+		.l2_id.ib_uc.qpn  = mqp->ibqp.qp_num,
+	};
+
+	return is_attach ?
+		__mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0) :
+		__mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0);
+}
+
 static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 {
 	struct mlx4_ib_dev *ibdev = ibdev_ptr;
-	int p;
-	int k;
+	int p, j;
 
+	mlx4_ib_close_sriov(ibdev);
 	sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group);
+	mlx4_ib_mad_cleanup(ibdev);
 
-	mlx4_ib_mad_cleanup(ibdev);
+	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
+		device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]);
+	}
+
 	ib_unregister_device(&ibdev->ib_dev);
-	for (k = 0; k < ibdev->num_ports; ++k)
-		mlx4_counter_free(ibdev->dev, ibdev->counters[k]);
 
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+				ibdev->steer_qpn_count);
+		kfree(ibdev->ib_uc_qpns_bitmap);
+	}
+
 	if (ibdev->iboe.nb.notifier_call) {
-		unregister_netdevice_notifier(&ibdev->iboe.nb);
-		flush_workqueue(wq);
+		if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+			pr_warn("failure unregistering notifier\n");
 		ibdev->iboe.nb.notifier_call = NULL;
 	}
 	iounmap(ibdev->priv_uar.map);
-
+	for (p = 0; p < ibdev->num_ports; ++p)
+		if (ibdev->counters[p] != -1)
+			mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p]);
 	mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
 		mlx4_CLOSE_PORT(dev, p);
 
+	mlx4_ib_free_eqs(dev, ibdev);
+
 	mlx4_uar_free(dev, &ibdev->priv_uar);
 	mlx4_pd_free(dev, ibdev->priv_pdn);
 	ib_dealloc_device(&ibdev->ib_dev);
 }
 
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
+{
+	struct mlx4_ib_demux_work **dm = NULL;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	unsigned long flags;
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC);
+	if (!dm) {
+		pr_err("failed to allocate memory for tunneling qp update\n");
+		goto out;
+	}
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);
+		if (!dm[i]) {
+			pr_err("failed to allocate memory for tunneling qp update work struct\n");
+			for (i = 0; i < dev->caps.num_ports; i++) {
+				if (dm[i])
+					kfree(dm[i]);
+			}
+			goto out;
+		}
+	}
+	/* initialize or tear down tunnel QPs for the slave */
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
+		dm[i]->port = i + 1;
+		dm[i]->slave = slave;
+		dm[i]->do_init = do_init;
+		dm[i]->dev = ibdev;
+		spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags);
+		if (!ibdev->sriov.is_going_down)
+			queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work);
+		spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+	}
+out:
+	kfree(dm);
+	return;
+}
+
 static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
-			  enum mlx4_dev_event event, int port)
+			  enum mlx4_dev_event event, unsigned long param)
 {
 	struct ib_event ibev;
 	struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+	struct mlx4_eqe *eqe = NULL;
+	struct ib_event_work *ew;
+	int p = 0;
 
-	if (port > ibdev->num_ports)
-		return;
+	if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
+		eqe = (struct mlx4_eqe *)param;
+	else
+		p = (int) param;
 
 	switch (event) {
 	case MLX4_DEV_EVENT_PORT_UP:
+		if (p > ibdev->num_ports)
+			return;
+		if (mlx4_is_master(dev) &&
+		    rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
+			IB_LINK_LAYER_INFINIBAND) {
+			mlx4_ib_invalidate_all_guid_record(ibdev, p);
+		}
+		mlx4_ib_info((struct ib_device *) ibdev_ptr,
+			     "Port %d logical link is up\n", p);
 		ibev.event = IB_EVENT_PORT_ACTIVE;
 		break;
 
 	case MLX4_DEV_EVENT_PORT_DOWN:
+		if (p > ibdev->num_ports)
+			return;
+		mlx4_ib_info((struct ib_device *) ibdev_ptr,
+			     "Port %d logical link is down\n", p);
 		ibev.event = IB_EVENT_PORT_ERR;
 		break;
 
 	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
-		ibdev->ib_active = 0;
+		ibdev->ib_active = false;
 		ibev.event = IB_EVENT_DEVICE_FATAL;
 		break;
 
+	case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
+		ew = kmalloc(sizeof *ew, GFP_ATOMIC);
+		if (!ew) {
+			pr_err("failed to allocate memory for events work\n");
+			break;
+		}
+
+		INIT_WORK(&ew->work, handle_port_mgmt_change_event);
+		memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
+		ew->ib_dev = ibdev;
+		/* need to queue only for port owner, which uses GEN_EQE */
+		if (mlx4_is_master(dev))
+			queue_work(wq, &ew->work);
+		else
+			handle_port_mgmt_change_event(&ew->work);
+		return;
+
+	case MLX4_DEV_EVENT_SLAVE_INIT:
+		/* here, p is the slave id */
+		do_slave_init(ibdev, p, 1);
+		return;
+
+	case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+		/* here, p is the slave id */
+		do_slave_init(ibdev, p, 0);
+		return;
+
 	default:
 		return;
 	}
 
 	ibev.device	      = ibdev_ptr;
-	ibev.element.port_num = port;
+	ibev.element.port_num = (u8) p;
 
 	ib_dispatch_event(&ibev);
 }
 
 static struct mlx4_interface mlx4_ib_interface = {
-	.add	= mlx4_ib_add,
-	.remove	= mlx4_ib_remove,
-       .event  = mlx4_ib_event,
-       .get_prot_dev = get_ibdev,
-       .protocol     = MLX4_PROT_IB,
+	.add		= mlx4_ib_add,
+	.remove		= mlx4_ib_remove,
+	.event		= mlx4_ib_event,
+	.protocol	= MLX4_PROT_IB_IPV6
 };
 
 static int __init mlx4_ib_init(void)
@@ -1547,19 +2353,52 @@
 	if (!wq)
 		return -ENOMEM;
 
+#ifdef __linux__
+	err = mlx4_ib_proc_init();
+	if (err)
+		goto clean_wq;
+#endif
+
+	err = mlx4_ib_mcg_init();
+	if (err)
+		goto clean_proc;
+
+	init_dev_assign();
+
 	err = mlx4_register_interface(&mlx4_ib_interface);
-	if (err) {
-		destroy_workqueue(wq);
-		return err;
-	}
+	if (err)
+		goto clean_mcg;
 
 	return 0;
+
+clean_mcg:
+	mlx4_ib_mcg_destroy();
+
+clean_proc:
+#ifdef __linux__
+	remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME,
+			  mlx4_ib_driver_dir_entry);
+	remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL);
+
+clean_wq:
+#endif
+	destroy_workqueue(wq);
+	return err;
 }
 
 static void __exit mlx4_ib_cleanup(void)
 {
 	mlx4_unregister_interface(&mlx4_ib_interface);
+	mlx4_ib_mcg_destroy();
 	destroy_workqueue(wq);
+
+	/* Remove proc entries */
+#ifdef __linux__
+	remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME,
+				mlx4_ib_driver_dir_entry);
+	remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL);
+#endif
+
 }
 
 module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE);
@@ -1572,9 +2411,11 @@
 {
         return (0);
 }
+
 static moduledata_t mlx4ib_mod = {
         .name = "mlx4ib",
         .evhand = mlx4ib_evhand,
 };
-DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY);
 MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1);
+MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1);


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/main.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Added: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,1254 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/delay.h>
+
+#include "mlx4_ib.h"
+
+#define MAX_VFS		80
+#define MAX_PEND_REQS_PER_FUNC 4
+#define MAD_TIMEOUT_MS	2000
+
+#define mcg_warn(fmt, arg...)	pr_warn("MCG WARNING: " fmt, ##arg)
+#define mcg_error(fmt, arg...)	pr_err(fmt, ##arg)
+#define mcg_warn_group(group, format, arg...) \
+	pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+	(group)->name, (group)->demux->port, ## arg)
+
+#define mcg_error_group(group, format, arg...) \
+	pr_err("  %16s: " format, (group)->name, ## arg)
+
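+/* the all-zero MGID: a join sent with MGID 0 asks the SM to assign an MGID */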
+static union ib_gid mgid0;
+
+static struct workqueue_struct *clean_wq;
+
+enum mcast_state {
+	MCAST_NOT_MEMBER = 0,
+	MCAST_MEMBER,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
+	MCAST_JOIN_SENT,
+	MCAST_LEAVE_SENT,
+	MCAST_RESP_READY
+};
+
+struct mcast_member {
+	enum mcast_state state;
+	uint8_t			join_state;
+	int			num_pend_reqs;
+	struct list_head	pending;
+};
+
+struct ib_sa_mcmember_data {
+	union ib_gid	mgid;
+	union ib_gid	port_gid;
+	__be32		qkey;
+	__be16		mlid;
+	u8		mtusel_mtu;
+	u8		tclass;
+	__be16		pkey;
+	u8		ratesel_rate;
+	u8		lifetmsel_lifetm;
+	__be32		sl_flowlabel_hoplimit;
+	u8		scope_join_state;
+	u8		proxy_join;
+	u8		reserved[2];
+};
+
+struct mcast_group {
+	struct ib_sa_mcmember_data rec;
+	struct rb_node		node;
+	struct list_head	mgid0_list;
+	struct mlx4_ib_demux_ctx *demux;
+	struct mcast_member	func[MAX_VFS];
+	struct mutex		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	int			members[3];
+	enum mcast_group_state	state;
+	enum mcast_group_state	prev_state;
+	struct ib_sa_mad	response_sa_mad;
+	__be64			last_req_tid;
+
+	char			name[33]; /* MGID string */
+	struct device_attribute	dentry;
+
+	/* refcount is the reference count for the following:
+	   1. Each queued request
+	   2. Each invocation of the worker thread
+	   3. Membership of the port at the SA
+	*/
+	atomic_t		refcount;
+
+	/* delayed work to clean pending SM request */
+	struct delayed_work	timeout_work;
+	struct list_head	cleanup_list;
+};
+
+struct mcast_req {
+	int			func;
+	struct ib_sa_mad	sa_mad;
+	struct list_head	group_list;
+	struct list_head	func_list;
+	struct mcast_group	*group;
+	int			clean;
+};
+
+
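+/*
+ * Drop a reference that is expected to stay above zero: the caller still
+ * holds another reference (e.g. via queued work), so reaching zero here
+ * indicates a refcount bug and is only warned about, not acted upon.
+ */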
+#define safe_atomic_dec(ref) \
+	do { \
+		if (atomic_dec_and_test(ref)) \
+			mcg_warn_group(group, "did not expect to reach zero\n"); \
+	} while (0)
+
+static const char *get_state_string(enum mcast_group_state state)
+{
+	switch (state) {
+	case MCAST_IDLE:
+		return "MCAST_IDLE";
+	case MCAST_JOIN_SENT:
+		return "MCAST_JOIN_SENT";
+	case MCAST_LEAVE_SENT:
+		return "MCAST_LEAVE_SENT";
+	case MCAST_RESP_READY:
+		return "MCAST_RESP_READY";
+	}
+	return "Invalid State";
+}
+
+static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
+				      union ib_gid *mgid)
+{
+	struct rb_node *node = ctx->mcg_table.rb_node;
+	struct mcast_group *group;
+	int ret;
+
+	while (node) {
+		group = rb_entry(node, struct mcast_group, node);
+		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+		if (!ret)
+			return group;
+
+		if (ret < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
+					struct mcast_group *group)
+{
+	struct rb_node **link = &ctx->mcg_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct mcast_group *cur_group;
+	int ret;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+
+		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+		if (ret < 0)
+			link = &(*link)->rb_left;
+		else if (ret > 0)
+			link = &(*link)->rb_right;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &ctx->mcg_table);
+	return NULL;
+}
+
+static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = ctx->dev;
+	struct ib_ah_attr	ah_attr;
+
+	spin_lock(&dev->sm_lock);
+	if (!dev->sm_ah[ctx->port - 1]) {
+		/* port is not yet Active, sm_ah not ready */
+		spin_unlock(&dev->sm_lock);
+		return -EAGAIN;
+	}
+	mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+	spin_unlock(&dev->sm_lock);
+	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port,
+				    IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad);
+}
+
+static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
+			     struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = ctx->dev;
+	struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
+	struct ib_wc wc;
+	struct ib_ah_attr ah_attr;
+
+	/* Our agent might not yet be registered when mads start to arrive */
+	if (!agent)
+		return -EAGAIN;
+
+	ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+
+	if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index))
+		return -EINVAL;
+	wc.sl = 0;
+	wc.dlid_path_bits = 0;
+	wc.port_num = ctx->port;
+	wc.slid = ah_attr.dlid;  /* opensm lid */
+	wc.src_qp = 1;
+	return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
+}
+
+static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
+	int ret;
+
+	/* start from the MAD request exactly as it arrived from the VF */
+	memcpy(&mad, sa_mad, sizeof mad);
+
+	/* fix port GID to be the real one (slave 0) */
+	sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
+
+	/* assign our own TID */
+	mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+	group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+
+	ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+	/* set timeout handler */
+	if (!ret) {
+		/* calls mlx4_ib_mcg_timeout_handler */
+		queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+				msecs_to_jiffies(MAD_TIMEOUT_MS));
+	}
+
+	return ret;
+}
+
+static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+	int ret;
+
+	memset(&mad, 0, sizeof mad);
+	mad.mad_hdr.base_version = 1;
+	mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mad.mad_hdr.class_version = 2;
+	mad.mad_hdr.method = IB_SA_METHOD_DELETE;
+	mad.mad_hdr.status = cpu_to_be16(0);
+	mad.mad_hdr.class_specific = cpu_to_be16(0);
+	mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+	group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+	mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad.mad_hdr.attr_mod = cpu_to_be32(0);
+	mad.sa_hdr.sm_key = 0x0;
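+	/* attr_offset is in 8-byte units, per the SA header definition */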
+	mad.sa_hdr.attr_offset = cpu_to_be16(7);
+	mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
+		IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+	*sa_data = group->rec;
+	sa_data->scope_join_state = join_state;
+
+	ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+	if (ret)
+		group->state = MCAST_IDLE;
+
+	/* set timeout handler */
+	if (!ret) {
+		/* calls mlx4_ib_mcg_timeout_handler */
+		queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+				msecs_to_jiffies(MAD_TIMEOUT_MS));
+	}
+
+	return ret;
+}
+
+static int send_reply_to_slave(int slave, struct mcast_group *group,
+		struct ib_sa_mad *req_sa_mad, u16 status)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+	struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
+	int ret;
+
+	memset(&mad, 0, sizeof mad);
+	mad.mad_hdr.base_version = 1;
+	mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mad.mad_hdr.class_version = 2;
+	mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+	mad.mad_hdr.status = cpu_to_be16(status);
+	mad.mad_hdr.class_specific = cpu_to_be16(0);
+	mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
+	*(u8 *)&mad.mad_hdr.tid = 0; /* clear the slave-id byte of the TID */
+	mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad.mad_hdr.attr_mod = cpu_to_be32(0);
+	mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
+	mad.sa_hdr.attr_offset = cpu_to_be16(7);
+	mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
+
+	*sa_data = group->rec;
+
+	/* reconstruct VF's requested join_state and port_gid */
+	sa_data->scope_join_state &= 0xf0;
+	sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
+	memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
+
+	ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
+	return ret;
+}
+
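+/*
+ * Compare src against dst under the selector encoded in dst's top two bits
+ * (GT/LT/EQ). Returns 0 on match, or when the comp_mask does not cover the
+ * field; nonzero on mismatch.
+ */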
+static int check_selector(ib_sa_comp_mask comp_mask,
+			  ib_sa_comp_mask selector_mask,
+			  ib_sa_comp_mask value_mask,
+			  u8 src_value, u8 dst_value)
+{
+	int err;
+	u8 selector = dst_value >> 6;
+	dst_value &= 0x3f;
+	src_value &= 0x3f;
+
+	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+		return 0;
+
+	switch (selector) {
+	case IB_SA_GT:
+		err = (src_value <= dst_value);
+		break;
+	case IB_SA_LT:
+		err = (src_value >= dst_value);
+		break;
+	case IB_SA_EQ:
+		err = (src_value != dst_value);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static u16 cmp_rec(struct ib_sa_mcmember_data *src,
+		   struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
+{
+	/* src is group record, dst is request record */
+	/* MGID must already match */
+	/* Port_GID we always replace to our Port_GID, so it is a match */
+
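+	/* sl_flowlabel_hoplimit packs SL[31:28], FlowLabel[27:8], HopLimit[7:0] */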
+#define MAD_STATUS_REQ_INVALID 0x0200
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+				 IB_SA_MCMEMBER_REC_MTU,
+				 src->mtusel_mtu, dst->mtusel_mtu))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->tclass != dst->tclass)
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+				 IB_SA_MCMEMBER_REC_RATE,
+				 src->ratesel_rate, dst->ratesel_rate))
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+				 src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
+			(src->scope_join_state & 0xf0) !=
+			(dst->scope_join_state & 0xf0))
+		return MAD_STATUS_REQ_INVALID;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+/* release the group; return 1 if this was the last release and the group
+ * was destroyed. Timeout work is canceled synchronously. */
+static int release_group(struct mcast_group *group, int from_timeout_handler)
+{
+	struct mlx4_ib_demux_ctx *ctx = group->demux;
+	int nzgroup;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	mutex_lock(&group->lock);
+	if (atomic_dec_and_test(&group->refcount)) {
+		if (!from_timeout_handler) {
+			if (group->state != MCAST_IDLE &&
+			    !cancel_delayed_work(&group->timeout_work)) {
+				atomic_inc(&group->refcount);
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				return 0;
+			}
+		}
+
+		nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
+		if (nzgroup)
+			del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+		if (!list_empty(&group->pending_list))
+			mcg_warn_group(group, "releasing a group with non empty pending list\n");
+		if (nzgroup)
+			rb_erase(&group->node, &ctx->mcg_table);
+		list_del_init(&group->mgid0_list);
+		mutex_unlock(&group->lock);
+		mutex_unlock(&ctx->mcg_table_lock);
+		kfree(group);
+		return 1;
+	} else {
+		mutex_unlock(&group->lock);
+		mutex_unlock(&ctx->mcg_table_lock);
+	}
+	return 0;
+}
+
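+/* join_state bits, per the IBTA MCMemberRecord: 0 = Full, 1 = NonMember, 2 = SendOnlyNonMember */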
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int i;
+
+	for (i = 0; i < 3; i++, join_state >>= 1)
+		if (join_state & 0x1)
+			group->members[i] += inc;
+}
+
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 leave_state = 0;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (!group->members[i])
+			leave_state |= (1 << i);
+
+	return leave_state & (group->rec.scope_join_state & 7);
+}
+
+static int join_group(struct mcast_group *group, int slave, u8 join_mask)
+{
+	int ret = 0;
+	u8 join_state;
+
+	/* remove bits that slave is already member of, and adjust */
+	join_state = join_mask & (~group->func[slave].join_state);
+	adjust_membership(group, join_state, 1);
+	group->func[slave].join_state |= join_state;
+	if (group->func[slave].state != MCAST_MEMBER && join_state) {
+		group->func[slave].state = MCAST_MEMBER;
+		ret = 1;
+	}
+	return ret;
+}
+
+static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
+{
+	int ret = 0;
+
+	adjust_membership(group, leave_state, -1);
+	group->func[slave].join_state &= ~leave_state;
+	if (!group->func[slave].join_state) {
+		group->func[slave].state = MCAST_NOT_MEMBER;
+		ret = 1;
+	}
+	return ret;
+}
+
+static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
+{
+	if (group->func[slave].state != MCAST_MEMBER)
+		return MAD_STATUS_REQ_INVALID;
+
+	/* make sure we're not deleting unset bits */
+	if (~group->func[slave].join_state & leave_mask)
+		return MAD_STATUS_REQ_INVALID;
+
+	if (!leave_mask)
+		return MAD_STATUS_REQ_INVALID;
+
+	return 0;
+}
+
+static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mcast_group *group;
+	struct mcast_req *req = NULL;
+
+	group = container_of(delay, typeof(*group), timeout_work);
+
+	mutex_lock(&group->lock);
+	if (group->state == MCAST_JOIN_SENT) {
+		if (!list_empty(&group->pending_list)) {
+			req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			--group->func[req->func].num_pend_reqs;
+			mutex_unlock(&group->lock);
+			kfree(req);
+			if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
+				if (release_group(group, 1))
+					return;
+			} else {
+				kfree(group);
+				return;
+			}
+			mutex_lock(&group->lock);
+		} else
+			mcg_warn_group(group, "DRIVER BUG\n");
+	} else if (group->state == MCAST_LEAVE_SENT) {
+		if (group->rec.scope_join_state & 7)
+			group->rec.scope_join_state &= 0xf8;
+		group->state = MCAST_IDLE;
+		mutex_unlock(&group->lock);
+		if (release_group(group, 1))
+			return;
+		mutex_lock(&group->lock);
+	} else
+		mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
+	group->state = MCAST_IDLE;
+	atomic_inc(&group->refcount);
+	queue_work(group->demux->mcg_wq, &group->work);
+	safe_atomic_dec(&group->refcount);
+
+	mutex_unlock(&group->lock);
+}
+
+static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
+			    struct mcast_req *req)
+{
+	u16 status;
+
+	if (req->clean)
+		leave_mask = group->func[req->func].join_state;
+
+	status = check_leave(group, req->func, leave_mask);
+	if (!status)
+		leave_group(group, req->func, leave_mask);
+
+	if (!req->clean)
+		send_reply_to_slave(req->func, group, &req->sa_mad, status);
+	--group->func[req->func].num_pend_reqs;
+	list_del(&req->group_list);
+	list_del(&req->func_list);
+	kfree(req);
+	return 1;
+}
+
+static int handle_join_req(struct mcast_group *group, u8 join_mask,
+			   struct mcast_req *req)
+{
+	u8 group_join_state = group->rec.scope_join_state & 7;
+	int ref = 0;
+	u16 status;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+
+	if (join_mask == (group_join_state & join_mask)) {
+		/* port's membership need not change */
+		status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
+		if (!status)
+			join_group(group, req->func, join_mask);
+
+		--group->func[req->func].num_pend_reqs;
+		send_reply_to_slave(req->func, group, &req->sa_mad, status);
+		list_del(&req->group_list);
+		list_del(&req->func_list);
+		kfree(req);
+		++ref;
+	} else {
+		/* port's membership needs to be updated */
+		group->prev_state = group->state;
+		if (send_join_to_wire(group, &req->sa_mad)) {
+			--group->func[req->func].num_pend_reqs;
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			kfree(req);
+			ref = 1;
+			group->state = group->prev_state;
+		} else
+			group->state = MCAST_JOIN_SENT;
+	}
+
+	return ref;
+}
+
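+/*
+ * Group work handler: consumes a pending SM response (if any), then drains
+ * queued join/leave requests while the group is idle. rc counts how many
+ * group references to drop on exit - one per completed request plus one for
+ * the work item itself.
+ */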
+static void mlx4_ib_mcg_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_req *req = NULL;
+	struct ib_sa_mcmember_data *sa_data;
+	u8 req_join_state;
+	int rc = 1; /* release_count - this is for the scheduled work */
+	u16 status;
+	u8 method;
+
+	group = container_of(work, typeof(*group), work);
+
+	mutex_lock(&group->lock);
+
+	/* First, let's see if a response from SM is waiting regarding this group.
+	 * If so, we need to update the group's REC. If this is a bad response, we
+	 * may need to send a bad response to a VF waiting for it. If VF is waiting
+	 * and this is a good response, the VF will be answered later in this func. */
+	if (group->state == MCAST_RESP_READY) {
+		/* cancels mlx4_ib_mcg_timeout_handler */
+		cancel_delayed_work(&group->timeout_work);
+		status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
+		method = group->response_sa_mad.mad_hdr.method;
+		if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
+			mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
+				(long long unsigned int)be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
+				(long long unsigned int)be64_to_cpu(group->last_req_tid));
+			group->state = group->prev_state;
+			goto process_requests;
+		}
+		if (status) {
+			if (!list_empty(&group->pending_list))
+				req = list_first_entry(&group->pending_list,
+						struct mcast_req, group_list);
+			if (method == IB_MGMT_METHOD_GET_RESP) {
+				if (req) {
+					send_reply_to_slave(req->func, group, &req->sa_mad, status);
+					--group->func[req->func].num_pend_reqs;
+					list_del(&req->group_list);
+					list_del(&req->func_list);
+					kfree(req);
+					++rc;
+				} else
+					mcg_warn_group(group, "no request for failed join\n");
+			} else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
+				++rc;
+		} else {
+			u8 resp_join_state;
+			u8 cur_join_state;
+
+			resp_join_state = ((struct ib_sa_mcmember_data *)
+						group->response_sa_mad.data)->scope_join_state & 7;
+			cur_join_state = group->rec.scope_join_state & 7;
+
+			if (method == IB_MGMT_METHOD_GET_RESP) {
+				/* successful join */
+				if (!cur_join_state && resp_join_state)
+					--rc;
+			} else if (!resp_join_state)
+				++rc;
+			memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
+		}
+		group->state = MCAST_IDLE;
+	}
+
+process_requests:
+	/* We should now go over pending join/leave requests, as long as we are idle. */
+	while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
+		req = list_first_entry(&group->pending_list, struct mcast_req,
+				       group_list);
+		sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+		req_join_state = sa_data->scope_join_state & 0x7;
+
+		/* For a leave request, we will immediately answer the VF, and
+		 * update our internal counters. The actual leave will be sent
+		 * to SM later, if at all needed. We dequeue the request now. */
+		if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
+			rc += handle_leave_req(group, req_join_state, req);
+		else
+			rc += handle_join_req(group, req_join_state, req);
+	}
+
+	/* Handle leaves */
+	if (group->state == MCAST_IDLE) {
+		req_join_state = get_leave_state(group);
+		if (req_join_state) {
+			group->rec.scope_join_state &= ~req_join_state;
+			group->prev_state = group->state;
+			if (send_leave_to_wire(group, req_join_state)) {
+				group->state = group->prev_state;
+				++rc;
+			} else
+				group->state = MCAST_LEAVE_SENT;
+		}
+	}
+
+	if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
+		goto process_requests;
+	mutex_unlock(&group->lock);
+
+	while (rc--)
+		release_group(group, 0);
+}
+
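+/*
+ * A join sent with MGID 0 asks the SM to assign an MGID. Match the response
+ * by TID, move the group from the mgid0 list into the rb-tree under the
+ * newly assigned MGID, or tear it down if the SM reply carried MGID 0 too.
+ */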
+static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
+						       __be64 tid,
+						       union ib_gid *new_mgid)
+{
+	struct mcast_group *group = NULL, *cur_group;
+	struct mcast_req *req;
+	struct list_head *pos;
+	struct list_head *n;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
+		group = list_entry(pos, struct mcast_group, mgid0_list);
+		mutex_lock(&group->lock);
+		if (group->last_req_tid == tid) {
+			if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
+				group->rec.mgid = *new_mgid;
+				sprintf(group->name, "%016llx%016llx",
+						(long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+						(long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id));
+				list_del_init(&group->mgid0_list);
+				cur_group = mcast_insert(ctx, group);
+				if (cur_group) {
+					/* A race between our code and the SM: silently clean the new one */
+					req = list_first_entry(&group->pending_list,
+							       struct mcast_req, group_list);
+					--group->func[req->func].num_pend_reqs;
+					list_del(&req->group_list);
+					list_del(&req->func_list);
+					kfree(req);
+					mutex_unlock(&group->lock);
+					mutex_unlock(&ctx->mcg_table_lock);
+					release_group(group, 0);
+					return NULL;
+				}
+
+				atomic_inc(&group->refcount);
+				add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				return group;
+			} else {
+				struct mcast_req *tmp1, *tmp2;
+
+				list_del(&group->mgid0_list);
+				if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
+					cancel_delayed_work_sync(&group->timeout_work);
+
+				list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
+					list_del(&tmp1->group_list);
+					kfree(tmp1);
+				}
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				kfree(group);
+				return NULL;
+			}
+		}
+		mutex_unlock(&group->lock);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+
+	return NULL;
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+		struct device_attribute *attr, char *buf);
+
+static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
+					 union ib_gid *mgid, int create,
+					 gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	int is_mgid0;
+	int i;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		group = mcast_find(ctx, mgid);
+		if (group)
+			goto found;
+	}
+
+	if (!create)
+		return ERR_PTR(-ENOENT);
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	group->demux = ctx;
+	group->rec.mgid = *mgid;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->mgid0_list);
+	for (i = 0; i < MAX_VFS; ++i)
+		INIT_LIST_HEAD(&group->func[i].pending);
+	INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
+	INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
+	mutex_init(&group->lock);
+	sprintf(group->name, "%016llx%016llx",
+			(long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+			(long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id));
+	sysfs_attr_init(&group->dentry.attr);
+	group->dentry.show = sysfs_show_group;
+	group->dentry.store = NULL;
+	group->dentry.attr.name = group->name;
+	group->dentry.attr.mode = 0400;
+	group->state = MCAST_IDLE;
+
+	if (is_mgid0) {
+		list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
+		goto found;
+	}
+
+	cur_group = mcast_insert(ctx, group);
+	if (cur_group) {
+		mcg_warn("group just showed up %s - confused\n", cur_group->name);
+		kfree(group);
+		return ERR_PTR(-EINVAL);
+	}
+
+	add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+
+found:
+	atomic_inc(&group->refcount);
+	return group;
+}
+
+static void queue_req(struct mcast_req *req)
+{
+	struct mcast_group *group = req->group;
+
+	atomic_inc(&group->refcount); /* for the request */
+	atomic_inc(&group->refcount); /* for scheduling the work */
+	list_add_tail(&req->group_list, &group->pending_list);
+	list_add_tail(&req->func_list, &group->func[req->func].pending);
+	/* calls mlx4_ib_mcg_work_handler */
+	queue_work(group->demux->mcg_wq, &group->work);
+	safe_atomic_dec(&group->refcount);
+}
+
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+			      struct ib_sa_mad *mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
+	struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+	struct mcast_group *group;
+
+	switch (mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET_RESP:
+	case IB_SA_METHOD_DELETE_RESP:
+		mutex_lock(&ctx->mcg_table_lock);
+		group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL);
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (IS_ERR(group)) {
+			if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
+				__be64 tid = mad->mad_hdr.tid;
+				*(u8 *)(&tid) = (u8)slave; /* the group stored the TID with the slave id in its first byte */
+				group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
+			} else
+				group = NULL;
+		}
+
+		if (!group)
+			return 1;
+
+		mutex_lock(&group->lock);
+		group->response_sa_mad = *mad;
+		group->prev_state = group->state;
+		group->state = MCAST_RESP_READY;
+		/* calls mlx4_ib_mcg_work_handler */
+		atomic_inc(&group->refcount);
+		queue_work(ctx->mcg_wq, &group->work);
+		safe_atomic_dec(&group->refcount);
+		mutex_unlock(&group->lock);
+		release_group(group, 0);
+		return 1; /* consumed */
+	case IB_MGMT_METHOD_SET:
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_SA_METHOD_GET_TABLE_RESP:
+	case IB_SA_METHOD_DELETE:
+		return 0; /* not consumed, pass-through to guest over tunnel */
+	default:
+		mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
+			port, mad->mad_hdr.method);
+		return 1; /* consumed */
+	}
+}
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
+				  int slave, struct ib_sa_mad *sa_mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
+	struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+	struct mcast_group *group;
+	struct mcast_req *req;
+	int may_create = 0;
+
+	if (ctx->flushing)
+		return -EAGAIN;
+
+	switch (sa_mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_SET:
+		may_create = 1;
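+		/* fall through - SET and DELETE share the request path */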
+	case IB_SA_METHOD_DELETE:
+		req = kzalloc(sizeof *req, GFP_KERNEL);
+		if (!req)
+			return -ENOMEM;
+
+		req->func = slave;
+		req->sa_mad = *sa_mad;
+
+		mutex_lock(&ctx->mcg_table_lock);
+		group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL);
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (IS_ERR(group)) {
+			kfree(req);
+			return PTR_ERR(group);
+		}
+		mutex_lock(&group->lock);
+		if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
+			mutex_unlock(&group->lock);
+			mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+				       port, slave, MAX_PEND_REQS_PER_FUNC);
+			release_group(group, 0);
+			kfree(req);
+			return -ENOMEM;
+		}
+		++group->func[slave].num_pend_reqs;
+		req->group = group;
+		queue_req(req);
+		mutex_unlock(&group->lock);
+		release_group(group, 0);
+		return 1; /* consumed */
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_MGMT_METHOD_GET_RESP:
+	case IB_SA_METHOD_GET_TABLE_RESP:
+	case IB_SA_METHOD_DELETE_RESP:
+		return 0; /* not consumed, pass-through */
+	default:
+		mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
+			port, slave, sa_mad->mad_hdr.method);
+		return 1; /* consumed */
+	}
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct mcast_group *group =
+		container_of(attr, struct mcast_group, dentry);
+	struct mcast_req *req = NULL;
+	char pending_str[40];
+	char state_str[40];
+	ssize_t len = 0;
+	int f;
+
+	if (group->state == MCAST_IDLE)
+		sprintf(state_str, "%s", get_state_string(group->state));
+	else
+		sprintf(state_str, "%s(TID=0x%llx)",
+				get_state_string(group->state),
+				(long long unsigned int)be64_to_cpu(group->last_req_tid));
+	if (list_empty(&group->pending_list)) {
+		sprintf(pending_str, "No");
+	} else {
+		req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+		sprintf(pending_str, "Yes(TID=0x%llx)",
+				(long long unsigned int)be64_to_cpu(req->sa_mad.mad_hdr.tid));
+	}
+	len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s     ",
+			group->rec.scope_join_state & 0xf,
+			group->members[2], group->members[1], group->members[0],
+			atomic_read(&group->refcount),
+			pending_str,
+			state_str);
+	for (f = 0; f < MAX_VFS; ++f)
+		if (group->func[f].state == MCAST_MEMBER)
+			len += sprintf(buf + len, "%d[%1x] ",
+					f, group->func[f].join_state);
+
+	len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x "
+		"%4x %4x %2x %2x)\n",
+		be16_to_cpu(group->rec.pkey),
+		be32_to_cpu(group->rec.qkey),
+		(group->rec.mtusel_mtu & 0xc0) >> 6,
+		group->rec.mtusel_mtu & 0x3f,
+		group->rec.tclass,
+		(group->rec.ratesel_rate & 0xc0) >> 6,
+		group->rec.ratesel_rate & 0x3f,
+		(be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28,
+		(be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8,
+		be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff,
+		group->rec.proxy_join);
+
+	return len;
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
+{
+	char name[20];
+
+	atomic_set(&ctx->tid, 0);
+	sprintf(name, "mlx4_ib_mcg%d", ctx->port);
+	ctx->mcg_wq = create_singlethread_workqueue(name);
+	if (!ctx->mcg_wq)
+		return -ENOMEM;
+
+	mutex_init(&ctx->mcg_table_lock);
+	ctx->mcg_table = RB_ROOT;
+	INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
+	ctx->flushing = 0;
+
+	return 0;
+}
+
+static void force_clean_group(struct mcast_group *group)
+{
+	struct mcast_req *req, *tmp;
+
+	list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
+		list_del(&req->group_list);
+		kfree(req);
+	}
+	del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
+	rb_erase(&group->node, &group->demux->mcg_table);
+	kfree(group);
+}
+
+static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+	int i;
+	struct rb_node *p;
+	struct mcast_group *group;
+	unsigned long end;
+	int count;
+
+	for (i = 0; i < MAX_VFS; ++i)
+		clean_vf_mcast(ctx, i);
+
+	end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+	do {
+		count = 0;
+		mutex_lock(&ctx->mcg_table_lock);
+		for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
+			++count;
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (!count)
+			break;
+
+		msleep(1);
+	} while (time_after(end, jiffies));
+
+	flush_workqueue(ctx->mcg_wq);
+	if (destroy_wq)
+		destroy_workqueue(ctx->mcg_wq);
+
+	mutex_lock(&ctx->mcg_table_lock);
+	while ((p = rb_first(&ctx->mcg_table)) != NULL) {
+		group = rb_entry(p, struct mcast_group, node);
+		if (atomic_read(&group->refcount))
+			mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group);
+
+		force_clean_group(group);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+}
+
+struct clean_work {
+	struct work_struct work;
+	struct mlx4_ib_demux_ctx *ctx;
+	int destroy_wq;
+};
+
+static void mcg_clean_task(struct work_struct *work)
+{
+	struct clean_work *cw = container_of(work, struct clean_work, work);
+
+	_mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
+	cw->ctx->flushing = 0;
+	kfree(cw);
+}
+
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+	struct clean_work *work;
+
+	if (ctx->flushing)
+		return;
+
+	ctx->flushing = 1;
+
+	if (destroy_wq) {
+		_mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
+		ctx->flushing = 0;
+		return;
+	}
+
+	work = kmalloc(sizeof *work, GFP_KERNEL);
+	if (!work) {
+		ctx->flushing = 0;
+		mcg_warn("failed allocating work for cleanup\n");
+		return;
+	}
+
+	work->ctx = ctx;
+	work->destroy_wq = destroy_wq;
+	INIT_WORK(&work->work, mcg_clean_task);
+	queue_work(clean_wq, &work->work);
+}
+
+static void build_leave_mad(struct mcast_req *req)
+{
+	struct ib_sa_mad *mad = &req->sa_mad;
+
+	mad->mad_hdr.method = IB_SA_METHOD_DELETE;
+}
+
+
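+/*
+ * Drop a VF's queued requests. If the group's in-flight request belongs to
+ * this VF and its timeout can still be canceled, reset the state machine;
+ * otherwise leave that request for the timeout handler to reap.
+ */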
+static void clear_pending_reqs(struct mcast_group *group, int vf)
+{
+	struct mcast_req *req, *tmp, *group_first = NULL;
+	int clear;
+	int pend = 0;
+
+	if (!list_empty(&group->pending_list))
+		group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+
+	list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
+		clear = 1;
+		if (group_first == req &&
+		    (group->state == MCAST_JOIN_SENT ||
+		     group->state == MCAST_LEAVE_SENT)) {
+			clear = cancel_delayed_work(&group->timeout_work);
+			pend = !clear;
+			group->state = MCAST_IDLE;
+		}
+		if (clear) {
+			--group->func[vf].num_pend_reqs;
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			kfree(req);
+			atomic_dec(&group->refcount);
+		}
+	}
+
+	if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
+		mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
+			       list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
+	}
+}
+
+static int push_deleting_req(struct mcast_group *group, int slave)
+{
+	struct mcast_req *req;
+	struct mcast_req *pend_req;
+
+	if (!group->func[slave].join_state)
+		return 0;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req) {
+		mcg_warn_group(group, "failed allocation - may leave stall groups\n");
+		return -ENOMEM;
+	}
+
+	if (!list_empty(&group->func[slave].pending)) {
+		pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
+		if (pend_req->clean) {
+			kfree(req);
+			return 0;
+		}
+	}
+
+	req->clean = 1;
+	req->func = slave;
+	req->group = group;
+	++group->func[slave].num_pend_reqs;
+	build_leave_mad(req);
+	queue_req(req);
+	return 0;
+}
+
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
+{
+	struct mcast_group *group;
+	struct rb_node *p;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
+		group = rb_entry(p, struct mcast_group, node);
+		mutex_lock(&group->lock);
+		if (atomic_read(&group->refcount)) {
+			/* clear pending requests of this VF */
+			clear_pending_reqs(group, slave);
+			push_deleting_req(group, slave);
+		}
+		mutex_unlock(&group->lock);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+}
+
+
+int mlx4_ib_mcg_init(void)
+{
+	clean_wq = create_singlethread_workqueue("mlx4_ib_mcg");
+	if (!clean_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_ib_mcg_destroy(void)
+{
+	destroy_workqueue(clean_wq);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -37,39 +37,52 @@
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/idr.h>
+#include <linux/notifier.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
+#include <linux/rbtree.h>
 
-
 #define MLX4_IB_DRV_NAME	"mlx4_ib"
 
-#ifdef CONFIG_MLX4_DEBUG
-extern int mlx4_ib_debug_level;
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt)	"<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__
 
-#define mlx4_ib_dbg(format, arg...) 		\
-	do {					\
-		if (mlx4_ib_debug_level) 	\
-			printk(KERN_DEBUG "<" MLX4_IB_DRV_NAME "> %s: " format "\n",\
-			__func__, ## arg);	\
-	} while (0)
+#define mlx4_ib_warn(ibdev, format, arg...) \
+	dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
 
-#else /* CONFIG_MLX4_DEBUG */
+#define mlx4_ib_info(ibdev, format, arg...) \
+	dev_info((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
 
-#define mlx4_ib_dbg(format, arg...) do {} while (0)
-
-#endif /* CONFIG_MLX4_DEBUG */
-
 enum {
-	MLX4_IB_SQ_MIN_WQE_SHIFT = 6
+	MLX4_IB_SQ_MIN_WQE_SHIFT = 6,
+	MLX4_IB_MAX_HEADROOM	 = 2048
 };
 
-#define MLX4_IB_SQ_HEADROOM(shift) ((2048 >> (shift)) + 1)
-#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+#define MLX4_IB_SQ_HEADROOM(shift)	((MLX4_IB_MAX_HEADROOM >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE		(MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
 
+/* module param to indicate whether the SM assigns the alias_GUID */
+extern int mlx4_ib_sm_guid_assign;
+#ifdef __linux__
+extern struct proc_dir_entry *mlx4_mrs_dir_entry;
+#endif
+
+#define MLX4_IB_UC_STEER_QPN_ALIGN 1
+#define MLX4_IB_UC_MAX_NUM_QPS     (256 * 1024)
+
+
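+/* the low MLX4_IB_MMAP_CMD_BITS bits of the mmap offset encode an enum mlx4_ib_mmap_cmd value */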
+#define MLX4_IB_MMAP_CMD_MASK 0xFF
+#define MLX4_IB_MMAP_CMD_BITS 8
+
 struct mlx4_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct mlx4_uar		uar;
@@ -83,15 +96,16 @@
 };
 
 struct mlx4_ib_xrcd {
-	struct ib_xrcd	ibxrcd;
-	u32		xrcdn;
-	struct ib_pd	*pd;
-	struct ib_cq	*cq;
+	struct ib_xrcd		ibxrcd;
+	u32			xrcdn;
+	struct ib_pd	       *pd;
+	struct ib_cq	       *cq;
 };
 
 struct mlx4_ib_cq_buf {
 	struct mlx4_buf		buf;
 	struct mlx4_mtt		mtt;
+	int			entry_size;
 };
 
 struct mlx4_ib_cq_resize {
@@ -99,6 +113,11 @@
 	int			cqe;
 };
 
+struct mlx4_shared_mr_info {
+	int mr_id;
+	struct ib_umem	       *umem;
+};
+
 struct mlx4_ib_cq {
 	struct ib_cq		ibcq;
 	struct mlx4_cq		mcq;
@@ -115,6 +134,7 @@
 	struct ib_mr		ibmr;
 	struct mlx4_mr		mmr;
 	struct ib_umem	       *umem;
+	struct mlx4_shared_mr_info	*smr_info;
 };
 
 struct mlx4_ib_fast_reg_page_list {
@@ -141,12 +161,14 @@
 };
 
 enum mlx4_ib_qp_flags {
-	MLX4_IB_QP_LSO				= 1 << 0,
-	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
-	MLX4_IB_XRC_RCV				= 1 << 2,
+	MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
+	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+	MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
+	MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
+	MLX4_IB_SRIOV_SQP = 1 << 31,
 };
 
-struct gid_entry {
+struct mlx4_ib_gid_entry {
 	struct list_head	list;
 	union ib_gid		gid;
 	int			added;
@@ -153,6 +175,113 @@
 	u8			port;
 };
 
+enum mlx4_ib_mmap_cmd {
+	MLX4_IB_MMAP_UAR_PAGE		= 0,
+	MLX4_IB_MMAP_BLUE_FLAME_PAGE	= 1,
+	MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES	= 2,
+};
+
+enum mlx4_ib_qp_type {
+	/*
+	 * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+	 * here (and in that order) since the MAD layer uses them as
+	 * indices into a 2-entry table.
+	 */
+	MLX4_IB_QPT_SMI = IB_QPT_SMI,
+	MLX4_IB_QPT_GSI = IB_QPT_GSI,
+
+	MLX4_IB_QPT_RC = IB_QPT_RC,
+	MLX4_IB_QPT_UC = IB_QPT_UC,
+	MLX4_IB_QPT_UD = IB_QPT_UD,
+	MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6,
+	MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE,
+	MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET,
+	MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI,
+	MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT,
+
+	MLX4_IB_QPT_PROXY_SMI_OWNER	= 1 << 16,
+	MLX4_IB_QPT_PROXY_SMI		= 1 << 17,
+	MLX4_IB_QPT_PROXY_GSI		= 1 << 18,
+	MLX4_IB_QPT_TUN_SMI_OWNER	= 1 << 19,
+	MLX4_IB_QPT_TUN_SMI		= 1 << 20,
+	MLX4_IB_QPT_TUN_GSI		= 1 << 21,
+};
+
+#define MLX4_IB_QPT_ANY_SRIOV	(MLX4_IB_QPT_PROXY_SMI_OWNER | \
+	MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \
+	MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI)
+
+enum mlx4_ib_mad_ifc_flags {
+	MLX4_MAD_IFC_IGNORE_MKEY	= 1,
+	MLX4_MAD_IFC_IGNORE_BKEY	= 2,
+	MLX4_MAD_IFC_IGNORE_KEYS	= (MLX4_MAD_IFC_IGNORE_MKEY |
+					   MLX4_MAD_IFC_IGNORE_BKEY),
+	MLX4_MAD_IFC_NET_VIEW		= 4,
+};
+
+enum {
+	MLX4_NUM_TUNNEL_BUFS		= 256,
+};
+
+struct mlx4_ib_tunnel_header {
+	struct mlx4_av av;
+	__be32 remote_qpn;
+	__be32 qkey;
+	__be16 vlan;
+	u8 mac[6];
+	__be16 pkey_index;
+	u8 reserved[6];
+};
+
+struct mlx4_ib_buf {
+	void *addr;
+	dma_addr_t map;
+};
+
+struct mlx4_rcv_tunnel_hdr {
+	__be32 flags_src_qp; /* flags[6:5] is defined for VLANs:
+			      * 0x0 - no vlan was in the packet
+			      * 0x01 - C-VLAN was in the packet */
+	u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */
+	u8 reserved;
+	__be16 pkey_index;
+	__be16 sl_vid;
+	__be16 slid_mac_47_32;
+	__be32 mac_31_0;
+};
+
+struct mlx4_ib_proxy_sqp_hdr {
+	struct ib_grh grh;
+	struct mlx4_rcv_tunnel_hdr tun;
+}  __packed;
+
+struct mlx4_roce_smac_vlan_info {
+	u64 smac;
+	int smac_index;
+	int smac_port;
+	u64 candidate_smac;
+	int candidate_smac_index;
+	int candidate_smac_port;
+	u16 vid;
+	int vlan_index;
+	int vlan_port;
+	u16 candidate_vid;
+	int candidate_vlan_index;
+	int candidate_vlan_port;
+	int update_vid;
+};
+
+struct mlx4_ib_qpg_data {
+	unsigned long *tss_bitmap;
+	unsigned long *rss_bitmap;
+	struct mlx4_ib_qp *qpg_parent;
+	int tss_qpn_base;
+	int rss_qpn_base;
+	u32 tss_child_count;
+	u32 rss_child_count;
+	u32 qpg_tss_mask_sz;
+};
+
 struct mlx4_ib_qp {
 	struct ib_qp		ibqp;
 	struct mlx4_qp		mqp;
@@ -168,14 +297,13 @@
 	int			sq_spare_wqes;
 	struct mlx4_ib_wq	sq;
 
+	enum mlx4_ib_qp_type	mlx4_ib_qp_type;
 	struct ib_umem	       *umem;
 	struct mlx4_mtt		mtt;
 	int			buf_size;
 	struct mutex		mutex;
+	u16			xrcdn;
 	u32			flags;
-	struct list_head	xrc_reg_list;
-	spinlock_t		xrc_reg_list_lock;
-	u16			xrcdn;
 	u8			port;
 	u8			alt_port;
 	u8			atomic_rd_en;
@@ -183,9 +311,16 @@
 	u8			sq_no_prefetch;
 	u8			state;
 	int			mlx_type;
+	enum ib_qpg_type	qpg_type;
+	struct mlx4_ib_qpg_data *qpg_data;
 	struct list_head	gid_list;
-	int			max_inline_data;
-	struct mlx4_bf		bf;
+	struct list_head	steering_rules;
+	struct mlx4_ib_buf	*sqp_proxy_rcv;
+	struct mlx4_roce_smac_vlan_info pri;
+	struct mlx4_roce_smac_vlan_info alt;
+	struct list_head	rules_list;
+	int                     max_inline_data;
+	struct mlx4_bf          bf;
 };
 
 struct mlx4_ib_srq {
@@ -208,6 +343,138 @@
 	union mlx4_ext_av       av;
 };
 
+/****************************************/
+/* alias guid support */
+/****************************************/
+#define NUM_PORT_ALIAS_GUID		2
+#define NUM_ALIAS_GUID_IN_REC		8
+#define NUM_ALIAS_GUID_REC_IN_PORT	16
+#define GUID_REC_SIZE			8
+#define NUM_ALIAS_GUID_PER_PORT		128
+#define MLX4_NOT_SET_GUID		(0x00LL)
+#define MLX4_GUID_FOR_DELETE_VAL	(~(0x00LL))
+
+enum mlx4_guid_alias_rec_status {
+	MLX4_GUID_INFO_STATUS_IDLE,
+	MLX4_GUID_INFO_STATUS_SET,
+	MLX4_GUID_INFO_STATUS_PENDING,
+};
+
+enum mlx4_guid_alias_rec_ownership {
+	MLX4_GUID_DRIVER_ASSIGN,
+	MLX4_GUID_SYSADMIN_ASSIGN,
+	MLX4_GUID_NONE_ASSIGN, /* initial state of each record */
+};
+
+enum mlx4_guid_alias_rec_method {
+	MLX4_GUID_INFO_RECORD_SET	= IB_MGMT_METHOD_SET,
+	MLX4_GUID_INFO_RECORD_DELETE	= IB_SA_METHOD_DELETE,
+};
+
+struct mlx4_sriov_alias_guid_info_rec_det {
+	u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
+	ib_sa_comp_mask guid_indexes; /* indicates which of the 8 records are valid */
+	enum mlx4_guid_alias_rec_status status; /* administrative status of the record */
+	u8 method; /* set or delete */
+	enum mlx4_guid_alias_rec_ownership ownership; /* indicates who assigned this alias_guid record */
+};
+
+struct mlx4_sriov_alias_guid_port_rec_det {
+	struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
+	struct workqueue_struct *wq;
+	struct delayed_work alias_guid_work;
+	u8 port;
+	struct mlx4_sriov_alias_guid *parent;
+	struct list_head cb_list;
+};
+
+struct mlx4_sriov_alias_guid {
+	struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS];
+	spinlock_t ag_work_lock;
+	struct ib_sa_client *sa_client;
+};
+
+struct mlx4_ib_demux_work {
+	struct work_struct	work;
+	struct mlx4_ib_dev     *dev;
+	int			slave;
+	int			do_init;
+	u8			port;
+
+};
+
+struct mlx4_ib_tun_tx_buf {
+	struct mlx4_ib_buf buf;
+	struct ib_ah *ah;
+};
+
+struct mlx4_ib_demux_pv_qp {
+	struct ib_qp *qp;
+	enum ib_qp_type proxy_qpt;
+	struct mlx4_ib_buf *ring;
+	struct mlx4_ib_tun_tx_buf *tx_ring;
+	spinlock_t tx_lock;
+	unsigned tx_ix_head;
+	unsigned tx_ix_tail;
+};
+
+enum mlx4_ib_demux_pv_state {
+	DEMUX_PV_STATE_DOWN,
+	DEMUX_PV_STATE_STARTING,
+	DEMUX_PV_STATE_ACTIVE,
+	DEMUX_PV_STATE_DOWNING,
+};
+
+struct mlx4_ib_demux_pv_ctx {
+	int port;
+	int slave;
+	enum mlx4_ib_demux_pv_state state;
+	int has_smi;
+	struct ib_device *ib_dev;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	struct ib_mr *mr;
+	struct work_struct work;
+	struct workqueue_struct *wq;
+	struct mlx4_ib_demux_pv_qp qp[2];
+};
+
+struct mlx4_ib_demux_ctx {
+	struct ib_device *ib_dev;
+	int port;
+	struct workqueue_struct *wq;
+	struct workqueue_struct *ud_wq;
+	spinlock_t ud_lock;
+	__be64 subnet_prefix;
+	__be64 guid_cache[128];
+	struct mlx4_ib_dev *dev;
+	/* the following lock protects both mcg_table and mcg_mgid0_list */
+	struct mutex		mcg_table_lock;
+	struct rb_root		mcg_table;
+	struct list_head	mcg_mgid0_list;
+	struct workqueue_struct	*mcg_wq;
+	struct mlx4_ib_demux_pv_ctx **tun;
+	atomic_t tid;
+	int    flushing; /* flushing the work queue */
+};
+
+struct mlx4_ib_sriov {
+	struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS];
+	struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS];
+	/* take this spinlock with the "irq" variants, because it
+	 * may be acquired from interrupt context. */
+	spinlock_t going_down_lock;
+	int is_going_down;
+
+	struct mlx4_sriov_alias_guid alias_guid;
+
+	/* CM paravirtualization fields */
+	struct list_head cm_list;
+	spinlock_t id_map_lock;
+	struct rb_root sl_id_map;
+	struct idr pv_id_table;
+};
+
 struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
@@ -215,6 +482,42 @@
 	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
 };
 
+struct pkey_mgt {
+	u8			virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	u16			phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	struct list_head	pkey_port_list[MLX4_MFUNC_MAX];
+	struct kobject	       *device_parent[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_ib_iov_sysfs_attr {
+	void *ctx;
+	struct kobject *kobj;
+	unsigned long data;
+	u32 entry_num;
+	char name[15];
+	struct device_attribute dentry;
+	struct device *dev;
+};
+
+struct mlx4_ib_iov_sysfs_attr_ar {
+	struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1];
+};
+
+struct mlx4_ib_iov_port {
+	char name[100];
+	u8 num;
+	struct mlx4_ib_dev *dev;
+	struct list_head list;
+	struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar;
+	struct ib_port_attr attr;
+	struct kobject	*cur_port;
+	struct kobject	*admin_alias_parent;
+	struct kobject	*gids_parent;
+	struct kobject	*pkeys_parent;
+	struct kobject	*mcgs_parent;
+	struct mlx4_ib_iov_sysfs_attr mcg_dentry;
+};
+
 struct mlx4_ib_dev {
 	struct ib_device	ib_dev;
 	struct mlx4_dev	       *dev;
@@ -226,14 +529,37 @@
 	struct ib_mad_agent    *send_agent[MLX4_MAX_PORTS][2];
 	struct ib_ah	       *sm_ah[MLX4_MAX_PORTS];
 	spinlock_t		sm_lock;
+	struct mlx4_ib_sriov	sriov;
 
 	struct mutex		cap_mask_mutex;
-	struct mutex		xrc_reg_mutex;
-	int			ib_active;
+	bool			ib_active;
 	struct mlx4_ib_iboe	iboe;
 	int			counters[MLX4_MAX_PORTS];
+	int		       *eq_table;
+	int			eq_added;
+	struct kobject	       *iov_parent;
+	struct kobject	       *ports_parent;
+	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];
+	struct mlx4_ib_iov_port	iov_ports[MLX4_MAX_PORTS];
+	struct pkey_mgt		pkeys;
+	unsigned long *ib_uc_qpns_bitmap;
+	int steer_qpn_count;
+	int steer_qpn_base;
 };
 
+struct ib_event_work {
+	struct work_struct	work;
+	struct mlx4_ib_dev	*ib_dev;
+	struct mlx4_eqe		ib_eqe;
+};
+
+struct mlx4_ib_qp_tunnel_init_attr {
+	struct ib_qp_init_attr init_attr;
+	int slave;
+	enum ib_qp_type proxy_qp_type;
+	u8 port;
+};
+
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
 {
 	return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
@@ -303,6 +629,9 @@
 	return container_of(ibah, struct mlx4_ib_ah, ibah);
 }
 
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
+
 int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
 			struct mlx4_db *db);
 void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
@@ -310,9 +639,12 @@
 struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem);
+int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+						u64 start_va,
+						int *num_of_mtts);
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
-				  struct ib_udata *udata);
+				  struct ib_udata *udata, int mr_id);
 int mlx4_ib_dereg_mr(struct ib_mr *mr);
 struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
 					int max_page_list_len);
@@ -322,6 +654,7 @@
 
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq);
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
 				struct ib_ucontext *context,
 				struct ib_udata *udata);
@@ -338,11 +671,6 @@
 struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 				  struct ib_srq_init_attr *init_attr,
 				  struct ib_udata *udata);
-struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
-				      struct ib_cq *xrc_cq,
-				      struct ib_xrcd *xrcd,
-				      struct ib_srq_init_attr *init_attr,
-				      struct ib_udata *udata);
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
@@ -364,7 +692,7 @@
 int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		      struct ib_recv_wr **bad_wr);
 
-int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 		 void *in_mad, void *response_mad);
 int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
@@ -379,20 +707,20 @@
 			 u64 iova);
 int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
 int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
-int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
-			      u32 *qp_num);
-int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
-			      struct ib_qp_attr *attr, int attr_mask);
-int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
-			     struct ib_qp_attr *attr, int attr_mask,
-			     struct ib_qp_init_attr *init_attr);
-int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
-int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			 struct ib_port_attr *props, int netw_view);
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			 u16 *pkey, int netw_view);
 
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			union ib_gid *gid, int netw_view);
 
 int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
 			u8 *mac, int *is_mcast, u8 port);
 
+int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index,
+		       union mlx4_counter *counter, u8 clear);
+
 static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 {
 	u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
@@ -403,7 +731,73 @@
 	return !!(ah->av.ib.g_slid & 0x80);
 }
 
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+				  struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+			      struct ib_sa_mad *mad);
+
 int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 		   union ib_gid *gid);
 
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+			    enum ib_event_type type);
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work);
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+			  enum ib_qp_type qpt, struct ib_wc *wc,
+			  struct ib_grh *grh, struct ib_mad *mad);
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+			 u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad);
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+		struct ib_mad *mad, int is_eth);
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+		struct ib_mad *mad);
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id);
+
+/* alias guid support */
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port);
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
+
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+					  int block_num,
+					  u8 port_num, u8 *p_data);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
+					 int block_num, u8 port_num,
+					 u8 *p_data);
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+			    struct attribute *attr);
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+			     struct attribute *attr);
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device);
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
+
+__be64 mlx4_ib_gen_node_guid(void);
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+			 int is_attach);
+
 #endif /* MLX4_IB_H */


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -31,6 +31,15 @@
  * SOFTWARE.
  */
 
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+#ifdef __linux__
+#include <linux/proc_fs.h>
+#include <linux/cred.h>
+#endif
+
 #include "mlx4_ib.h"
 
 static u32 convert_access(int acc)
@@ -41,13 +50,67 @@
 	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
 	       MLX4_PERM_LOCAL_READ;
 }
+#ifdef __linux__
+static ssize_t shared_mr_proc_read(struct file *file,
+			  char __user *buffer,
+			  size_t len,
+			  loff_t *offset)
+{
 
+	return -ENOSYS;
+
+}
+
+static ssize_t shared_mr_proc_write(struct file *file,
+			   const char __user *buffer,
+			   size_t len,
+			   loff_t *offset)
+{
+
+	return -ENOSYS;
+}
+
+static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+
+	struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
+	struct mlx4_shared_mr_info *smr_info =
+		(struct mlx4_shared_mr_info *)pde->data;
+
+	/* Prevent any mapping not on start of area */
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	return ib_umem_map_to_vma(smr_info->umem,
+					vma);
+
+}
+
+static const struct file_operations shared_mr_proc_ops = {
+	.owner	= THIS_MODULE,
+	.read	= shared_mr_proc_read,
+	.write	= shared_mr_proc_write,
+	.mmap	= shared_mr_mmap
+};
+
+static mode_t convert_shared_access(int acc)
+{
+
+	return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR       : 0) |
+	       (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
+	       (acc & IB_ACCESS_SHARED_MR_GROUP_READ   ? S_IRGRP  : 0) |
+	       (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE   ? S_IWGRP  : 0) |
+	       (acc & IB_ACCESS_SHARED_MR_OTHER_READ   ? S_IROTH  : 0) |
+	       (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE   ? S_IWOTH  : 0);
+
+}
+#endif
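
convert_shared_access() maps the shared-MR access flags onto POSIX permission bits, so the proc entry created for a shared MR carries a mode that mirrors the requested sharing. A hedged sketch of the same mapping with made-up flag values (the real IB_ACCESS_SHARED_MR_* constants are defined elsewhere in this tree):

#include <stdio.h>
#include <sys/stat.h>

/* hypothetical flag values, for illustration only */
#define SHARED_MR_USER_READ	(1 << 0)
#define SHARED_MR_USER_WRITE	(1 << 1)
#define SHARED_MR_GROUP_READ	(1 << 2)

static mode_t to_mode(int acc)
{
	return (acc & SHARED_MR_USER_READ  ? S_IRUSR : 0) |
	       (acc & SHARED_MR_USER_WRITE ? S_IWUSR : 0) |
	       (acc & SHARED_MR_GROUP_READ ? S_IRGRP : 0);
}

int main(void)
{
	/* rw for owner, r for group -> prints 640 */
	printf("%o\n", (unsigned)to_mode(SHARED_MR_USER_READ |
					 SHARED_MR_USER_WRITE |
					 SHARED_MR_GROUP_READ));
	return 0;
}
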
 struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
 {
 	struct mlx4_ib_mr *mr;
 	int err;
 
-	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
@@ -74,118 +137,350 @@
 	return ERR_PTR(err);
 }
 
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+						struct mlx4_mtt *mtt,
+						u64 mtt_size,
+						u64 mtt_shift,
+						u64 len,
+						u64 cur_start_addr,
+						u64 *pages,
+						int *start_index,
+						int *npages)
+{
+	int k;
+	int err = 0;
+	u64 mtt_entries;
+	u64 cur_end_addr = cur_start_addr + len;
+	u64 cur_end_addr_aligned = 0;
+
+	len += (cur_start_addr & (mtt_size-1ULL));
+	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+	len += (cur_end_addr_aligned - cur_end_addr);
+	if (len & (mtt_size-1ULL)) {
+		WARN(1,
+		     "write_block: len %llx is not aligned to mtt_size %llx\n",
+		     (unsigned long long)len, (unsigned long long)mtt_size);
+		return -EINVAL;
+	}
+
+
+	mtt_entries = (len >> mtt_shift);
+
+	/* Align the MTT start address to the mtt_size.
+	   Required to handle cases when the MR starts in the middle of an
+	   MTT record.  This was not required in the old code, since the
+	   physical addresses provided by the DMA subsystem were page
+	   aligned, which was also the MTT size.
+	*/
+	cur_start_addr = round_down(cur_start_addr, mtt_size);
+	/* A new block is started ...*/
+	for (k = 0; k < mtt_entries; ++k) {
+		pages[*npages] = cur_start_addr + (mtt_size * k);
+		(*npages)++;
+		/*
+		 * Be friendly to mlx4_write_mtt() and
+		 * pass it chunks of appropriate size.
+		 */
+		if (*npages == PAGE_SIZE / sizeof(u64)) {
+			err = mlx4_write_mtt(dev->dev,
+					mtt, *start_index,
+					*npages, pages);
+			if (err)
+				return err;
+
+			(*start_index) += *npages;
+			*npages = 0;
+		}
+	}
+
+	return 0;
+}
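
mlx4_ib_umem_write_mtt_block() accumulates MTT entries into a page-sized array and flushes them to mlx4_write_mtt() whenever PAGE_SIZE/sizeof(u64) entries are pending, so the firmware command never sees an oversized chunk. A userspace sketch of that batching pattern, with a stand-in flush function (illustrative, not the driver API):

#include <stdint.h>
#include <stdio.h>

#define BATCH (4096 / sizeof(uint64_t))	/* one page worth of entries */

/* stand-in for mlx4_write_mtt(): just report the flushed range */
static int flush(int start, int n, const uint64_t *entries)
{
	printf("write entries [%d, %d)\n", start, start + n);
	return 0;
}

int main(void)
{
	static uint64_t pages[BATCH];
	int start = 0, n = 0;

	for (int k = 0; k < 3000; ++k) {
		pages[n++] = 0x100000ULL + 4096ULL * k;
		if (n == BATCH) {	/* batch full: flush and restart */
			flush(start, n, pages);
			start += n;
			n = 0;
		}
	}
	if (n)				/* flush the tail */
		flush(start, n, pages);
	return 0;
}
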
+
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem)
 {
 	u64 *pages;
 	struct ib_umem_chunk *chunk;
-	int i, j, k;
-	int n;
-	int len;
+	int j;
+	u64 len = 0;
 	int err = 0;
+	u64 mtt_size;
+	u64 cur_start_addr = 0;
+	u64 mtt_shift;
+	int start_index = 0;
+	int npages = 0;
 
 	pages = (u64 *) __get_free_page(GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
-	i = n = 0;
+	mtt_shift = mtt->page_shift;
+	mtt_size = 1ULL << mtt_shift;
 
 	list_for_each_entry(chunk, &umem->chunk_list, list)
 		for (j = 0; j < chunk->nmap; ++j) {
-			len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift;
-			for (k = 0; k < len; ++k) {
-				pages[i++] = sg_dma_address(&chunk->page_list[j]) +
-					umem->page_size * k;
-				/*
-				 * Be friendly to mlx4_write_mtt() and
-				 * pass it chunks of appropriate size.
-				 */
-				if (i == PAGE_SIZE / sizeof (u64)) {
-					err = mlx4_write_mtt(dev->dev, mtt, n,
-							     i, pages);
-					if (err)
-						goto out;
-					n += i;
-					i = 0;
-				}
+			if (cur_start_addr + len ==
+			    sg_dma_address(&chunk->page_list[j])) {
+				/* still the same block */
+				len += sg_dma_len(&chunk->page_list[j]);
+				continue;
 			}
+			/* A new block is started ...*/
+			/* If len is misaligned, write an extra mtt entry to
+			    cover the misaligned area (round up the division)
+			*/
+			err = mlx4_ib_umem_write_mtt_block(dev,
+						mtt, mtt_size, mtt_shift,
+						len, cur_start_addr,
+						pages,
+						&start_index,
+						&npages);
+			if (err)
+				goto out;
+
+			cur_start_addr =
+				sg_dma_address(&chunk->page_list[j]);
+			len = sg_dma_len(&chunk->page_list[j]);
 		}
 
-	if (i)
-		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+	/* Handle the last block */
+	if (len > 0) {
+	/*  If len is misaligned, write an extra mtt entry to cover
+	     the misaligned area (round up the division)
+	*/
+		err = mlx4_ib_umem_write_mtt_block(dev,
+						mtt, mtt_size, mtt_shift,
+						len, cur_start_addr,
+						pages,
+						&start_index,
+						&npages);
+		if (err)
+			goto out;
+	}
 
+
+	if (npages)
+		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
+
 out:
 	free_page((unsigned long) pages);
 	return err;
 }
 
-static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr,
-				  u64 start, u64 virt_addr, int access_flags)
+static inline u64 alignment_of(u64 ptr)
 {
-#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__)
-	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	return ilog2(ptr & (~(ptr-1)));
+}
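
alignment_of() isolates the lowest set bit of a value with ptr & ~(ptr - 1) and returns its position, i.e. the largest power-of-two alignment the value satisfies. A standalone check of that identity (userspace sketch; __builtin_ctzll substitutes for the kernel's ilog2, and the argument must be nonzero):

#include <stdint.h>
#include <stdio.h>

static unsigned alignment_of(uint64_t ptr)
{
	/* 0x3000 -> lowest set bit 0x1000 -> position 12 */
	return __builtin_ctzll(ptr & (~(ptr - 1)));
}

int main(void)
{
	printf("%u\n", alignment_of(0x3000));	/* 12: 4K aligned */
	printf("%u\n", alignment_of(0x200000));	/* 21: 2M aligned */
	return 0;
}
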
+
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+						u64 current_block_end,
+						u64 block_shift)
+{
+	/* Check whether the new block is aligned at least as well as
+	     the previous block.
+	     A block address must start with zeros up to the entity size.
+	*/
+	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+		/* It is not as well aligned as the
+		previous block - reduce the mtt size
+		accordingly.
+		Here we take the lowest set bit.
+		*/
+		block_shift = alignment_of(next_block_start);
+
+	/*  Check whether the end of the previous block is aligned
+	     as well as the start of the block.
+	*/
+	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+		/* It is not as well aligned as
+		the start of the block - reduce the
+		mtt size accordingly.
+		*/
+		block_shift = alignment_of(current_block_end);
+
+	return block_shift;
+}
+
+/* Calculate the optimal mtt size based on contiguous pages.
+ * The function also returns the number of pages that are not aligned to
+ * the calculated mtt_size, to be added to the total number of pages.  For
+ * that we check the first and last chunk lengths, and if either is not
+ * aligned to mtt_size we increment the non_aligned_pages count.
+ * All chunks in the middle are already handled as part of the mtt shift
+ * calculation, for both their start and end addresses.
+ */
+int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+						u64 start_va,
+						int *num_of_mtts)
+{
 	struct ib_umem_chunk *chunk;
-	unsigned dsize;
-	dma_addr_t daddr;
-	unsigned cur_size = 0;
-	dma_addr_t uninitialized_var(cur_addr);
-	int n;
-	struct ib_umem	*umem = mr->umem;
-	u64 *arr;
-	int err = 0;
-	int i;
-	int j = 0;
-	int off = start & (HPAGE_SIZE - 1);
+	int j;
+	u64 block_shift = MLX4_MAX_MTT_SHIFT;
+	u64 current_block_len = 0;
+	u64 current_block_start = 0;
+	u64 misalignment_bits;
+	u64 first_block_start = 0;
+	u64 last_block_end = 0;
+	u64 total_len = 0;
+	u64 last_block_aligned_end = 0;
+	u64 min_shift = ilog2(umem->page_size);
 
-	n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE);
-	arr = kmalloc(n * sizeof *arr, GFP_KERNEL);
-	if (!arr)
-		return -ENOMEM;
+	list_for_each_entry(chunk, &umem->chunk_list, list) {
+		/* Initialization - save the first chunk start as
+		    the current_block_start - block means contiguous pages.
+		*/
+		if (current_block_len == 0 && current_block_start == 0) {
+			first_block_start = current_block_start =
+				sg_dma_address(&chunk->page_list[0]);
+			/* Find the bits that are different between
+			    the physical address and the virtual
+			    address for the start of the MR.
+			*/
+			/* umem_get aligned the start_va to a page
+			   boundary. Therefore, we need to align the
+			   start va to the same boundary */
+			/* misalignment_bits is needed to handle the
+			   case of a single memory region. In this
+			   case, the rest of the logic will not reduce
+			   the block size.  If we use a block size
+			   which is bigger than the alignment of the
+			   misalignment bits, we might use the virtual
+			   page number instead of the physical page
+			   number, resulting in access to the wrong
+			   data. */
+			misalignment_bits =
+			(start_va & (~(((u64)(umem->page_size))-1ULL)))
+						^ current_block_start;
+			block_shift = min(alignment_of(misalignment_bits),
+					  block_shift);
+		}
 
-	list_for_each_entry(chunk, &umem->chunk_list, list)
-		for (i = 0; i < chunk->nmap; ++i) {
-			daddr = sg_dma_address(&chunk->page_list[i]);
-			dsize = sg_dma_len(&chunk->page_list[i]);
-			if (!cur_size) {
-				cur_addr = daddr;
-				cur_size = dsize;
-			} else if (cur_addr + cur_size != daddr) {
-				err = -EINVAL;
-				goto out;
-			} else
-				cur_size += dsize;
+		/* Go over the scatter entries in the current chunk, check
+		     if they continue the previous scatter entry.
+		*/
+		for (j = 0; j < chunk->nmap; ++j) {
+			u64 next_block_start =
+				sg_dma_address(&chunk->page_list[j]);
+			u64 current_block_end = current_block_start
+				+ current_block_len;
+			/* If we have a split (non-contiguous) between two blocks */
+			if (current_block_end != next_block_start) {
+				block_shift = mlx4_ib_umem_calc_block_mtt(
+						next_block_start,
+						current_block_end,
+						block_shift);
 
-			if (cur_size > HPAGE_SIZE) {
-				err = -EINVAL;
-				goto out;
-			} else if (cur_size == HPAGE_SIZE) {
-				cur_size = 0;
-				arr[j++] = cur_addr;
+				/* If we reached the minimum shift for 4k
+				     page we stop the loop.
+				*/
+				if (block_shift <= min_shift)
+					goto end;
+
+				/* If not saved yet, we are in the first block:
+				     save the length of the first block to
+				     calculate the non_aligned_pages number
+				     at the end.
+				*/
+				total_len += current_block_len;
+
+				/* Start a new block */
+				current_block_start = next_block_start;
+				current_block_len =
+					sg_dma_len(&chunk->page_list[j]);
+				continue;
 			}
+			/* The scatter entry is another part of
+			     the current block: increase the block size.
+			     An entry in the scatter list can be larger than
+			     4K (a page), since the DMA mapping may merge
+			     some blocks together.
+			*/
+			current_block_len +=
+				sg_dma_len(&chunk->page_list[j]);
 		}
+	}
 
-	if (cur_size) {
-		arr[j++] = cur_addr;
+	/* Account for the last block in the total len */
+	total_len += current_block_len;
+	/* Add to the first block the misalignment that it suffers from.*/
+	total_len += (first_block_start & ((1ULL<<block_shift)-1ULL));
+	last_block_end = current_block_start+current_block_len;
+	last_block_aligned_end = round_up(last_block_end, 1<<block_shift);
+	total_len += (last_block_aligned_end - last_block_end);
+
+	WARN((total_len & ((1ULL<<block_shift)-1ULL)),
+		"misaligned total length detected (%llu, %llu)!",
+		(unsigned long long)total_len, (unsigned long long)block_shift);
+
+	*num_of_mtts = total_len >> block_shift;
+end:
+	if (block_shift < min_shift) {
+		/* If the shift is less than the minimum,
+		     WARN and return the minimum shift.
+		*/
+		WARN(1,
+		"mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
+		(long long)block_shift);
+
+		block_shift = min_shift;
 	}
+	return block_shift;
+}
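
The sizing pass above starts from the maximum shift and lowers it whenever a block boundary (the start of a new block or the end of the previous one) is less aligned than the current candidate, clamping at the page shift. A hedged sketch of the same reduction over a synthetic block list (values and constants invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define MAX_SHIFT 31
#define MIN_SHIFT 12	/* 4K pages */

static unsigned align_of(uint64_t v)
{
	return v ? (unsigned)__builtin_ctzll(v) : MAX_SHIFT;
}

int main(void)
{
	/* synthetic (start, len) blocks with a gap between them */
	uint64_t starts[] = { 0x100000, 0x235000 };
	uint64_t lens[]   = { 0x20000,  0x3000 };
	unsigned shift = MAX_SHIFT;

	for (int i = 1; i < 2; ++i) {
		uint64_t prev_end = starts[i - 1] + lens[i - 1];

		if (align_of(starts[i]) < shift)
			shift = align_of(starts[i]);
		if (align_of(prev_end) < shift)
			shift = align_of(prev_end);
	}
	if (shift < MIN_SHIFT)
		shift = MIN_SHIFT;
	printf("block_shift = %u\n", shift);	/* 12, limited by 0x235000 */
	return 0;
}
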
 
-	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length,
-			    convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr);
-	if (err)
-		goto out;
+#ifdef __linux__
+static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
+{
+	struct proc_dir_entry *mr_proc_entry;
+	mode_t mode = S_IFREG;
+	char name_buff[16];
 
-	err = mlx4_write_mtt(dev->dev, &mr->mmr.mtt, 0, n, arr);
+	mode |= convert_shared_access(access_flags);
+	sprintf(name_buff, "%X", mr_id);
+	mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
+	mr->smr_info->mr_id = mr_id;
+	mr->smr_info->umem = mr->umem;
 
-out:
-	kfree(arr);
-	return err;
-#else
-	return -ENOSYS;
-#endif
+	mr_proc_entry = proc_create_data(name_buff, mode,
+				mlx4_mrs_dir_entry,
+				&shared_mr_proc_ops,
+				mr->smr_info);
+
+	if (!mr_proc_entry) {
+		pr_err("prepare_shared_mr failed via proc\n");
+		kfree(mr->smr_info);
+		return -ENODEV;
+	}
+
+	current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
+	mr_proc_entry->size = mr->umem->length;
+	return 0;
+
 }
+static int is_shared_mr(int access_flags)
+{
+	/* We should check whether IB_ACCESS_SHARED_MR_USER_READ or
+	other shared bits were turned on.
+	*/
+	return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
+				IB_ACCESS_SHARED_MR_USER_WRITE |
+				IB_ACCESS_SHARED_MR_GROUP_READ |
+				IB_ACCESS_SHARED_MR_GROUP_WRITE |
+				IB_ACCESS_SHARED_MR_OTHER_READ |
+				IB_ACCESS_SHARED_MR_OTHER_WRITE));
 
+}
+#endif
+
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
-				  struct ib_udata *udata)
+				  struct ib_udata *udata,
+				  int mr_id)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_mr *mr;
@@ -193,38 +488,49 @@
 	int err;
 	int n;
 
-	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
 	mr->umem = ib_umem_get(pd->uobject->context, start, length,
-			       access_flags, 0);
+			access_flags, 0);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err_free;
 	}
 
-	if (!mr->umem->hugetlb ||
-	    handle_hugetlb_user_mr(pd, mr, start, virt_addr, access_flags)) {
-		n = ib_umem_page_count(mr->umem);
-		shift = ilog2(mr->umem->page_size);
+	n = ib_umem_page_count(mr->umem);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
+		&n);
+	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+			 convert_access(access_flags), n, shift, &mr->mmr);
+	if (err)
+		goto err_umem;
 
-		err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
-				    convert_access(access_flags), n, shift, &mr->mmr);
-		if (err)
-			goto err_umem;
+	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+	if (err)
+		goto err_mr;
 
-		err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
-		if (err)
-			goto err_mr;
-	}
-
 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
 	if (err)
 		goto err_mr;
 
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+#ifdef __linux__
+	/* Check whether MR should be shared */
+	if (is_shared_mr(access_flags)) {
+	/* start address and length must be aligned to the page size in
+	    order to map a full page and prevent leakage of data */
+		if (mr->umem->offset || (length & ~PAGE_MASK)) {
+			err = -EINVAL;
+			goto err_mr;
+		}
 
+		err = prepare_shared_mr(mr, access_flags, mr_id);
+		if (err)
+			goto err_mr;
+	}
+#endif
 	return &mr->ibmr;
 
 err_mr:
@@ -239,13 +545,36 @@
 	return ERR_PTR(err);
 }
 
+
 int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
 {
 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
 
 	mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+	if (mr->smr_info) {
+		/* When the master/parent shared mr is deregistered, it can
+		no longer be shared - its mr_id will be returned to the
+		kernel as part of ib_uverbs_dereg_mr and may be allocated
+		again as part of another reg_mr.
+		*/
+		char name_buff[16];
+
+		sprintf(name_buff, "%X", mr->smr_info->mr_id);
+		/* remove_proc_entry() checks internally that no operation
+		    was started on that procfs file; if one is in progress,
+		    the current process waits until it completes.
+		    That's why no sync mechanism is needed when we release
+		    the shared umem below.
+		*/
+#ifdef __linux__
+		remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
+		kfree(mr->smr_info);
+#endif
+	}
+
 	if (mr->umem)
 		ib_umem_release(mr->umem);
+
 	kfree(mr);
 
 	return 0;
@@ -258,7 +587,7 @@
 	struct mlx4_ib_mr *mr;
 	int err;
 
-	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
@@ -291,7 +620,7 @@
 	struct mlx4_ib_fast_reg_page_list *mfrpl;
 	int size = page_list_len * sizeof (u64);
 
-	if (page_list_len > MAX_FAST_REG_PAGES)
+	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
 		return ERR_PTR(-EINVAL);
 
 	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
@@ -403,7 +732,7 @@
 
 	err = mlx4_SYNC_TPT(mdev);
 	if (err)
-		printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when "
+		pr_warn("SYNC_TPT error %d when "
 		       "unmapping FMRs\n", err);
 
 	return 0;


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/qp.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/qp.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -32,15 +32,23 @@
  */
 
 #include <linux/log2.h>
+#include <linux/slab.h>
 #include <linux/netdevice.h>
+#include <linux/bitops.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
 
 #include <linux/mlx4/qp.h>
+#include <linux/mlx4/driver.h>
 #include <linux/io.h>
 
+#ifndef __linux__
+#define asm __asm
+#endif
+
 #include "mlx4_ib.h"
 #include "user.h"
 
@@ -52,27 +60,24 @@
 	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
 	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
 	MLX4_IB_LINK_TYPE_IB		= 0,
-	MLX4_IB_LINK_TYPE_ETH		= 1,
+	MLX4_IB_LINK_TYPE_ETH		= 1
 };
 
 enum {
 	/*
-	 * Largest possible UD header: send with GRH and immediate data.
-	 * 4 bytes added to accommodate for eth header instead of lrh
+	 * Largest possible UD header: send with GRH and immediate
+	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
+	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
+	 * biggest case)
 	 */
-	MLX4_IB_UD_HEADER_SIZE		= 76,
-	MLX4_IB_MAX_RAW_ETY_HDR_SIZE	= 12
+	MLX4_IB_UD_HEADER_SIZE		= 82,
+	MLX4_IB_LSO_HEADER_SPARE	= 128,
 };
 
 enum {
-	MLX4_IBOE_ETHERTYPE = 0x8915
+	MLX4_IB_IBOE_ETHERTYPE		= 0x8915
 };
 
-struct mlx4_ib_xrc_reg_entry {
-	struct list_head list;
-	void *context;
-};
-
 struct mlx4_ib_sqp {
 	struct mlx4_ib_qp	qp;
 	int			pkey_index;
@@ -83,9 +88,15 @@
 };
 
 enum {
-	MLX4_IB_MIN_SQ_STRIDE = 6
+	MLX4_IB_MIN_SQ_STRIDE	= 6,
+	MLX4_IB_CACHE_LINE_SIZE	= 64,
 };
 
+enum {
+	MLX4_RAW_QP_MTU		= 7,
+	MLX4_RAW_QP_MSGMAX	= 31,
+};
+
 static const __be32 mlx4_ib_opcode[] = {
 	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
 	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
@@ -104,32 +115,77 @@
 
 #ifndef wc_wmb
 	#if defined(__i386__)
-		#define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+		#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
 	#elif defined(__x86_64__)
-		#define wc_wmb() __asm volatile("sfence" ::: "memory")
+		#define wc_wmb() asm volatile("sfence" ::: "memory")
 	#elif defined(__ia64__)
-		#define wc_wmb() __asm volatile("fwb" ::: "memory")
+		#define wc_wmb() asm volatile("fwb" ::: "memory")
 	#else
 		#define wc_wmb() wmb()
 	#endif
 #endif
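
wc_wmb() orders write-combining stores (such as BlueFlame doorbell copies) ahead of whatever follows; on x86-64 that is a plain sfence, and architectures without a cheaper primitive fall back to wmb(). A userspace approximation of the same fallback chain (a sketch, not the kernel macro):

#if defined(__x86_64__)
#define wc_store_fence()	__asm__ volatile("sfence" ::: "memory")
#elif defined(__i386__)
#define wc_store_fence()	__asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory")
#else
#define wc_store_fence()	__sync_synchronize()	/* full barrier fallback */
#endif

int main(void)
{
	wc_store_fence();
	return 0;
}
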
 
-
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 {
 	return container_of(mqp, struct mlx4_ib_sqp, qp);
 }
 
+static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+
+	return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+	       qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+		8 * MLX4_MFUNC_MAX;
+}
+
 static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 {
-	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
-		qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+	int proxy_sqp = 0;
+	int real_sqp = 0;
+	int i;
+	/* PPF or Native -- real SQP */
+	real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
+	if (real_sqp)
+		return 1;
+	/* VF or PF -- proxy SQP */
+	if (mlx4_is_mfunc(dev->dev)) {
+		for (i = 0; i < dev->dev->caps.num_ports; i++) {
+			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] ||
+			    qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) {
+				proxy_sqp = 1;
+				break;
+			}
+		}
+	}
+	return proxy_sqp;
 }
 
+/* used for INIT/CLOSE port logic */
 static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 {
-	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
-		qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+	int proxy_qp0 = 0;
+	int real_qp0 = 0;
+	int i;
+	/* PPF or Native -- real QP0 */
+	real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
+	if (real_qp0)
+		return 1;
+	/* VF or PF -- proxy QP0 */
+	if (mlx4_is_mfunc(dev->dev)) {
+		for (i = 0; i < dev->dev->caps.num_ports; i++) {
+			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) {
+				proxy_qp0 = 1;
+				break;
+			}
+		}
+	}
+	return proxy_qp0;
 }
 
 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
@@ -207,7 +263,7 @@
 	/* Pad the remainder of the WQE with an inline data segment. */
 	if (size > s) {
 		inl = wqe + s;
-		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+		inl->byte_count = cpu_to_be32(1U << 31 | (size - s - sizeof *inl));
 	}
 	ctrl->srcrb_flags = 0;
 	ctrl->fence_size = size / 16;
@@ -237,10 +293,7 @@
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
 {
 	struct ib_event event;
-	struct mlx4_ib_qp *mqp = to_mibqp(qp);
-	struct ib_qp *ibqp = &mqp->ibqp;
-	struct mlx4_ib_xrc_reg_entry *ctx_entry;
-	unsigned long flags;
+	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
 
 	if (type == MLX4_EVENT_TYPE_PATH_MIG)
 		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
@@ -247,6 +300,7 @@
 
 	if (ibqp->event_handler) {
 		event.device     = ibqp->device;
+		event.element.qp = ibqp;
 		switch (type) {
 		case MLX4_EVENT_TYPE_PATH_MIG:
 			event.event = IB_EVENT_PATH_MIG;
@@ -273,27 +327,16 @@
 			event.event = IB_EVENT_QP_ACCESS_ERR;
 			break;
 		default:
-			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+			pr_warn("Unexpected event type %d "
 			       "on QP %06x\n", type, qp->qpn);
 			return;
 		}
 
-		if (unlikely(ibqp->qp_type == IB_QPT_XRC &&
-			     mqp->flags & MLX4_IB_XRC_RCV)) {
-			event.event |= IB_XRC_QP_EVENT_FLAG;
-			event.element.xrc_qp_num = ibqp->qp_num;
-			spin_lock_irqsave(&mqp->xrc_reg_list_lock, flags);
-			list_for_each_entry(ctx_entry, &mqp->xrc_reg_list, list)
-				ibqp->event_handler(&event, ctx_entry->context);
-			spin_unlock_irqrestore(&mqp->xrc_reg_list_lock, flags);
-			return;
-		}
-		event.element.qp = ibqp;
 		ibqp->event_handler(&event, ibqp->qp_context);
 	}
 }
 
-static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
 {
 	/*
 	 * UD WQEs must have a datagram segment.
@@ -302,20 +345,29 @@
 	 * header and space for the ICRC).
 	 */
 	switch (type) {
-	case IB_QPT_UD:
+	case MLX4_IB_QPT_UD:
 		return sizeof (struct mlx4_wqe_ctrl_seg) +
 			sizeof (struct mlx4_wqe_datagram_seg) +
-			((flags & MLX4_IB_QP_LSO) ? 128 : 0);
-	case IB_QPT_UC:
+			((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
+	case MLX4_IB_QPT_PROXY_SMI_OWNER:
+	case MLX4_IB_QPT_PROXY_SMI:
+	case MLX4_IB_QPT_PROXY_GSI:
 		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg) + 64;
+	case MLX4_IB_QPT_TUN_SMI_OWNER:
+	case MLX4_IB_QPT_TUN_GSI:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg);
+
+	case MLX4_IB_QPT_UC:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
 			sizeof (struct mlx4_wqe_raddr_seg);
-	case IB_QPT_XRC:
-	case IB_QPT_RC:
+	case MLX4_IB_QPT_RC:
 		return sizeof (struct mlx4_wqe_ctrl_seg) +
-			sizeof (struct mlx4_wqe_atomic_seg) +
+			sizeof (struct mlx4_wqe_masked_atomic_seg) +
 			sizeof (struct mlx4_wqe_raddr_seg);
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
+	case MLX4_IB_QPT_SMI:
+	case MLX4_IB_QPT_GSI:
 		return sizeof (struct mlx4_wqe_ctrl_seg) +
 			ALIGN(MLX4_IB_UD_HEADER_SIZE +
 			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
@@ -325,12 +377,6 @@
 			ALIGN(4 +
 			      sizeof (struct mlx4_wqe_inline_seg),
 			      sizeof (struct mlx4_wqe_data_seg));
-	case IB_QPT_RAW_ETY:
-		return sizeof(struct mlx4_wqe_ctrl_seg) +
-			ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE +
-			      sizeof(struct mlx4_wqe_inline_seg),
-			      sizeof(struct mlx4_wqe_data_seg));
-
 	default:
 		return sizeof (struct mlx4_wqe_ctrl_seg);
 	}
@@ -337,32 +383,22 @@
 }
 
 static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-		       int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp)
+		       int is_user, int has_rq, struct mlx4_ib_qp *qp)
 {
 	/* Sanity check RQ size before proceeding */
 	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
-	    cap->max_recv_sge >
-		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) {
-		mlx4_ib_dbg("Requested RQ size (sge or wr) too large");
+	    cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
 		return -EINVAL;
-	}
 
-	if (has_srq_or_is_xrc) {
-		/* QPs attached to an SRQ should have no RQ */
-		if (cap->max_recv_wr) {
-			mlx4_ib_dbg("non-zero RQ size for QP using SRQ");
+	if (!has_rq) {
+		if (cap->max_recv_wr)
 			return -EINVAL;
-		}
 
 		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
 	} else {
 		/* HW requires >= 1 RQ entry with >= 1 gather entry */
-		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
-			mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's "
-				    "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr,
-				    cap->max_recv_sge);
+		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
 			return -EINVAL;
-		}
 
 		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
 		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
@@ -378,45 +414,33 @@
 			min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
 		cap->max_recv_sge = min(qp->rq.max_gs,
 					min(dev->dev->caps.max_sq_sg,
-				    	dev->dev->caps.max_rq_sg));
+					    dev->dev->caps.max_rq_sg));
 	}
-	/* We don't support inline sends for kernel QPs (yet) */
 
-
 	return 0;
 }
 
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
 {
 	int s;
 
 	/* Sanity check SQ size before proceeding */
-	if (cap->max_send_wr	 > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
-	    cap->max_send_sge	 >
-		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+	if (cap->max_send_wr  > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+	    cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
 	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
-	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) {
-		mlx4_ib_dbg("Requested SQ resources exceed device maxima");
+	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
 		return -EINVAL;
-	}
 
 	/*
 	 * For MLX transport we need 2 extra S/G entries:
 	 * one for the header and one for the checksum at the end
 	 */
-	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
-	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) {
-		mlx4_ib_dbg("No space for SQP hdr/csum sge's");
+	if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+	     type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
 		return -EINVAL;
-	}
 
-	if (type == IB_QPT_RAW_ETY &&
-	    cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) {
-		mlx4_ib_dbg("No space for RAW ETY hdr");
-		return -EINVAL;
-	}
-
 	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
 		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
 		send_wqe_overhead(type, qp->flags);
@@ -434,7 +458,7 @@
 	 * anymore, so we do this only if selective signaling is off.
 	 *
 	 * Further, on 32-bit platforms, we can't use vmap() to make
-	 * the QP buffer virtually contigious.  Thus we have to use
+	 * the QP buffer virtually contiguous.  Thus we have to use
 	 * constant-sized WRs to make sure a WR is always fully within
 	 * a single page-sized chunk.
 	 *
@@ -457,7 +481,9 @@
 	 */
 	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
 	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
-	    type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY)
+	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+		      MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
 		qp->sq.wqe_shift = ilog2(64);
 	else
 		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
@@ -516,10 +542,8 @@
 	if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes	 ||
 	    ucmd->log_sq_stride >
 		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
-	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) {
-		mlx4_ib_dbg("Requested max wqes or wqe stride exceeds max");
+	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
 		return -EINVAL;
-	}
 
 	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
 	qp->sq.wqe_shift = ucmd->log_sq_stride;
@@ -530,30 +554,412 @@
 	return 0;
 }
 
+static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+	int i;
+
+	qp->sqp_proxy_rcv =
+		kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt,
+			GFP_KERNEL);
+	if (!qp->sqp_proxy_rcv)
+		return -ENOMEM;
+	for (i = 0; i < qp->rq.wqe_cnt; i++) {
+		qp->sqp_proxy_rcv[i].addr =
+			kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				GFP_KERNEL);
+		if (!qp->sqp_proxy_rcv[i].addr)
+			goto err;
+		qp->sqp_proxy_rcv[i].map =
+			ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
+					  sizeof (struct mlx4_ib_proxy_sqp_hdr),
+					  DMA_FROM_DEVICE);
+	}
+	return 0;
+
+err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				    DMA_FROM_DEVICE);
+		kfree(qp->sqp_proxy_rcv[i].addr);
+	}
+	kfree(qp->sqp_proxy_rcv);
+	qp->sqp_proxy_rcv = NULL;
+	return -ENOMEM;
+}
+
+static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+	int i;
+
+	for (i = 0; i < qp->rq.wqe_cnt; i++) {
+		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				    DMA_FROM_DEVICE);
+		kfree(qp->sqp_proxy_rcv[i].addr);
+	}
+	kfree(qp->sqp_proxy_rcv);
+}
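
alloc_proxy_bufs() uses the classic partial-allocation rollback: when element i fails, it unwinds elements [0, i) in reverse before freeing the array, and free_proxy_bufs() repeats the same walk for the fully built case. A generic userspace sketch of the pattern (names are illustrative):

#include <stdlib.h>

/* allocate n buffers; on failure roll back everything done so far */
static void **alloc_all(int n, size_t sz)
{
	void **v = calloc(n, sizeof(*v));
	int i;

	if (v == NULL)
		return NULL;
	for (i = 0; i < n; i++) {
		v[i] = malloc(sz);
		if (v[i] == NULL)
			goto err;
	}
	return v;

err:
	while (i > 0)		/* unwind in reverse order */
		free(v[--i]);
	free(v);
	return NULL;
}

int main(void)
{
	void **v = alloc_all(8, 64);

	if (v != NULL) {
		for (int i = 0; i < 8; i++)
			free(v[i]);
		free(v);
	}
	return 0;
}
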
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+		return 0;
+
+	return !attr->srq;
+}
+
+#ifdef __linux__
+static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
+			   struct ib_qp_init_attr *attr, int *qpn)
+{
+	struct mlx4_ib_qpg_data *qpg_data;
+	int tss_num, rss_num;
+	int tss_align_num, rss_align_num;
+	int tss_base, rss_base = 0;
+	int err;
+
+	/* Parent is part of the TSS range (in SW TSS ARP is sent via parent) */
+	tss_num = 1 + attr->parent_attrib.tss_child_count;
+	tss_align_num = roundup_pow_of_two(tss_num);
+	rss_num = attr->parent_attrib.rss_child_count;
+	rss_align_num = roundup_pow_of_two(rss_num);
+
+	if (rss_num > 1) {
+		/* RSS is requested */
+		if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
+			return -ENOSYS;
+		if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
+			return -EINVAL;
+		/* We must work with power of two */
+		attr->parent_attrib.rss_child_count = rss_align_num;
+	}
+
+	qpg_data = kzalloc(sizeof *qpg_data, GFP_KERNEL);
+	if (!qpg_data)
+		return -ENOMEM;
+
+	if (pqp->flags & MLX4_IB_QP_NETIF)
+		err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base);
+	else
+		err = mlx4_qp_reserve_range(dev->dev, tss_align_num,
+				tss_align_num, &tss_base, 1);
+	if (err)
+		goto err1;
+
+	if (tss_num > 1) {
+		u32 alloc = BITS_TO_LONGS(tss_align_num) * sizeof(long);
+		qpg_data->tss_bitmap = kzalloc(alloc, GFP_KERNEL);
+		if (qpg_data->tss_bitmap == NULL) {
+			err = -ENOMEM;
+			goto err2;
+		}
+		bitmap_fill(qpg_data->tss_bitmap, tss_num);
+		/* Note parent takes first index */
+		clear_bit(0, qpg_data->tss_bitmap);
+	}
+
+	if (rss_num > 1) {
+		u32 alloc = BITS_TO_LONGS(rss_align_num) * sizeof(long);
+		err = mlx4_qp_reserve_range(dev->dev, rss_align_num,
+					    1, &rss_base, 0);
+		if (err)
+			goto err3;
+		qpg_data->rss_bitmap = kzalloc(alloc, GFP_KERNEL);
+		if (qpg_data->rss_bitmap == NULL) {
+			err = -ENOMEM;
+			goto err4;
+		}
+		bitmap_fill(qpg_data->rss_bitmap, rss_align_num);
+	}
+
+	qpg_data->tss_child_count = attr->parent_attrib.tss_child_count;
+	qpg_data->rss_child_count = attr->parent_attrib.rss_child_count;
+	qpg_data->qpg_parent = pqp;
+	qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num);
+	qpg_data->tss_qpn_base = tss_base;
+	qpg_data->rss_qpn_base = rss_base;
+
+	pqp->qpg_data = qpg_data;
+	*qpn = tss_base;
+
+	return 0;
+
+err4:
+	mlx4_qp_release_range(dev->dev, rss_base, rss_align_num);
+
+err3:
+	if (tss_num > 1)
+		kfree(qpg_data->tss_bitmap);
+
+err2:
+	if (pqp->flags & MLX4_IB_QP_NETIF)
+		mlx4_ib_steer_qp_free(dev, tss_base, tss_align_num);
+	else
+		mlx4_qp_release_range(dev->dev, tss_base, tss_align_num);
+
+err1:
+	kfree(qpg_data);
+	return err;
+}
+
+static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp)
+{
+	struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data;
+	int align_num;
+
+	if (qpg_data->tss_child_count > 1)
+		kfree(qpg_data->tss_bitmap);
+
+	align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count);
+	if (pqp->flags & MLX4_IB_QP_NETIF)
+		mlx4_ib_steer_qp_free(dev, qpg_data->tss_qpn_base, align_num);
+	else
+		mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num);
+
+	if (qpg_data->rss_child_count > 1) {
+		kfree(qpg_data->rss_bitmap);
+		align_num = roundup_pow_of_two(qpg_data->rss_child_count);
+		mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base,
+					align_num);
+	}
+
+	kfree(qpg_data);
+}
+
+static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr,
+			 struct mlx4_ib_qp *pqp, int *qpn)
+{
+	struct mlx4_ib_qp *mqp = to_mqp(init_attr->qpg_parent);
+	struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
+	u32 idx, old;
+
+	switch (init_attr->qpg_type) {
+	case IB_QPG_CHILD_TX:
+		if (qpg_data->tss_child_count == 0)
+			return -EINVAL;
+		do {
+			/* Parent took index 0 */
+			idx = find_first_bit(qpg_data->tss_bitmap,
+					     qpg_data->tss_child_count + 1);
+			if (idx >= qpg_data->tss_child_count + 1)
+				return -ENOMEM;
+			old = test_and_clear_bit(idx, qpg_data->tss_bitmap);
+		} while (old == 0);
+		idx += qpg_data->tss_qpn_base;
+		break;
+	case IB_QPG_CHILD_RX:
+		if (qpg_data->rss_child_count == 0)
+			return -EINVAL;
+		do {
+			idx = find_first_bit(qpg_data->rss_bitmap,
+					     qpg_data->rss_child_count);
+			if (idx >= qpg_data->rss_child_count)
+				return -ENOMEM;
+			old = test_and_clear_bit(idx, qpg_data->rss_bitmap);
+		} while (old == 0);
+		idx += qpg_data->rss_qpn_base;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pqp->qpg_data = qpg_data;
+	*qpn = idx;
+
+	return 0;
+}
+
+static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn)
+{
+	struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
+
+	switch (mqp->qpg_type) {
+	case IB_QPG_CHILD_TX:
+		/* Do range check */
+		qpn -= qpg_data->tss_qpn_base;
+		set_bit(qpn, qpg_data->tss_bitmap);
+		break;
+	case IB_QPG_CHILD_RX:
+		qpn -= qpg_data->rss_qpn_base;
+		set_bit(qpn, qpg_data->rss_bitmap);
+		break;
+	default:
+		/* error */
+		pr_warn("wrong qpg type (%d)\n", mqp->qpg_type);
+		break;
+	}
+}
+#endif
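
alloc_qpg_qpn() hands out child QPN indices from a bitmap with a find-first-bit / test_and_clear_bit loop, so two racing callers can never claim the same slot. A single-word userspace sketch of the same idea using GCC atomics (the kernel works over an array of longs instead):

#include <stdint.h>
#include <stdio.h>

static uint64_t bitmap = 0xFFULL;	/* 8 slots; a set bit means free */

static int alloc_idx(void)
{
	for (;;) {
		uint64_t v = __atomic_load_n(&bitmap, __ATOMIC_RELAXED);
		uint64_t bit;
		int idx;

		if (v == 0)
			return -1;		/* exhausted */
		idx = __builtin_ctzll(v);	/* first free slot */
		bit = 1ULL << idx;
		/* claim it; retry if another thread got there first */
		if (__atomic_fetch_and(&bitmap, ~bit, __ATOMIC_ACQ_REL) & bit)
			return idx;
	}
}

static void free_idx(int idx)
{
	__atomic_fetch_or(&bitmap, 1ULL << idx, __ATOMIC_RELEASE);
}

int main(void)
{
	int a = alloc_idx(), b = alloc_idx();

	printf("%d %d\n", a, b);	/* 0 1 */
	free_idx(b);
	free_idx(a);
	return 0;
}
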
+
+static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			    struct ib_qp_init_attr *attr, int *qpn)
+{
+	int err = 0;
+
+	switch (attr->qpg_type) {
+	case IB_QPG_NONE:
+		/* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
+		 * BlueFlame setup flow wrongly causes VLAN insertion. */
+		if (attr->qp_type == IB_QPT_RAW_PACKET) {
+			err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 1);
+		} else {
+			if (qp->flags & MLX4_IB_QP_NETIF)
+				err = mlx4_ib_steer_qp_alloc(dev, 1, qpn);
+			else
+				err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 0);
+		}
+		break;
+	case IB_QPG_PARENT:
+#ifdef __linux__
+		err = init_qpg_parent(dev, qp, attr, qpn);
+#endif
+		break;
+	case IB_QPG_CHILD_TX:
+	case IB_QPG_CHILD_RX:
+#ifdef __linux__
+		err = alloc_qpg_qpn(attr, qp, qpn);
+#endif
+		break;
+	default:
+		qp->qpg_type = IB_QPG_NONE;
+		err = -EINVAL;
+		break;
+	}
+	if (err)
+		return err;
+	qp->qpg_type = attr->qpg_type;
+	return 0;
+}
+
+static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			enum ib_qpg_type qpg_type, int qpn)
+{
+	switch (qpg_type) {
+	case IB_QPG_NONE:
+		if (qp->flags & MLX4_IB_QP_NETIF)
+			mlx4_ib_steer_qp_free(dev, qpn, 1);
+		else
+			mlx4_qp_release_range(dev->dev, qpn, 1);
+		break;
+	case IB_QPG_PARENT:
+#ifdef __linux__
+		free_qpg_parent(dev, qp);
+#endif
+		break;
+	case IB_QPG_CHILD_TX:
+	case IB_QPG_CHILD_RX:
+#ifdef __linux__
+		free_qpg_qpn(qp, qpn);
+#endif
+		break;
+	default:
+		break;
+	}
+}
+
+/* Revert allocation on create_qp_common */
+static void unalloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			       struct ib_qp_init_attr *attr, int qpn)
+{
+	free_qpn_common(dev, qp, attr->qpg_type, qpn);
+}
+
+static void release_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn);
+}
+
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
-			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp)
 {
 	int qpn;
 	int err;
+	struct mlx4_ib_sqp *sqp;
+	struct mlx4_ib_qp *qp;
+	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
 
+#ifndef __linux__
+	init_attr->qpg_type = IB_QPG_NONE;
+#endif
+
+	/* When tunneling special qps, we use a plain UD qp */
+	if (sqpn) {
+		if (mlx4_is_mfunc(dev->dev) &&
+		    (!mlx4_is_master(dev->dev) ||
+		     !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
+			if (init_attr->qp_type == IB_QPT_GSI)
+				qp_type = MLX4_IB_QPT_PROXY_GSI;
+			else if (mlx4_is_master(dev->dev))
+				qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+			else
+				qp_type = MLX4_IB_QPT_PROXY_SMI;
+		}
+		qpn = sqpn;
+		/* add extra sg entry for tunneling */
+		init_attr->cap.max_recv_sge++;
+	} else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
+		struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
+			container_of(init_attr,
+				     struct mlx4_ib_qp_tunnel_init_attr, init_attr);
+		if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
+		     tnl_init->proxy_qp_type != IB_QPT_GSI)   ||
+		    !mlx4_is_master(dev->dev))
+			return -EINVAL;
+		if (tnl_init->proxy_qp_type == IB_QPT_GSI)
+			qp_type = MLX4_IB_QPT_TUN_GSI;
+		else if (tnl_init->slave == mlx4_master_func_num(dev->dev))
+			qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
+		else
+			qp_type = MLX4_IB_QPT_TUN_SMI;
+		/* we are definitely in the PPF here, since we are creating
+		 * tunnel QPs. base_tunnel_sqpn is therefore valid. */
+		qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
+			+ tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
+		sqpn = qpn;
+	}
+
+	if (!*caller_qp) {
+		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
+		    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+				MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+			sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL);
+			if (!sqp)
+				return -ENOMEM;
+			qp = &sqp->qp;
+			qp->pri.vid = qp->alt.vid = 0xFFFF;
+		} else {
+			qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL);
+			if (!qp)
+				return -ENOMEM;
+			qp->pri.vid = qp->alt.vid = 0xFFFF;
+		}
+	} else
+		qp = *caller_qp;
+
+	qp->mlx4_ib_qp_type = qp_type;
+
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
-	spin_lock_init(&qp->xrc_reg_list_lock);
 	INIT_LIST_HEAD(&qp->gid_list);
+	INIT_LIST_HEAD(&qp->steering_rules);
+	INIT_LIST_HEAD(&qp->rules_list);
 
 	qp->state	 = IB_QPS_RESET;
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
-	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
-			  !!init_attr->srq || !!init_attr->xrc_domain , qp);
+	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
 	if (err)
 		goto err;
 
 	if (pd->uobject) {
 		struct mlx4_ib_create_qp ucmd;
+		int shift;
+		int n;
 
 		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
 			err = -EFAULT;
@@ -570,30 +976,25 @@
 				       qp->buf_size, 0, 0);
 		if (IS_ERR(qp->umem)) {
 			err = PTR_ERR(qp->umem);
-			mlx4_ib_dbg("ib_umem_get error (%d)", err);
 			goto err;
 		}
 
-		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
-				    ilog2(qp->umem->page_size), &qp->mtt);
-		if (err) {
-			mlx4_ib_dbg("mlx4_mtt_init error (%d)", err);
+		n = ib_umem_page_count(qp->umem);
+		shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
+		err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
+
+		if (err)
 			goto err_buf;
-		}
 
 		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
-		if (err) {
-			mlx4_ib_dbg("mlx4_ib_umem_write_mtt error (%d)", err);
+		if (err)
 			goto err_mtt;
-		}
 
-		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+		if (qp_has_rq(init_attr)) {
 			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
 						  ucmd.db_addr, &qp->db);
-			if (err) {
-				mlx4_ib_dbg("mlx4_ib_db_map_user error (%d)", err);
+			if (err)
 				goto err_mtt;
-			}
 		}
 	} else {
 		qp->sq_no_prefetch = 0;
@@ -604,11 +1005,17 @@
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
 			qp->flags |= MLX4_IB_QP_LSO;
 
-		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+		if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP &&
+		    dev->dev->caps.steering_mode ==
+		    MLX4_STEERING_MODE_DEVICE_MANAGED &&
+		    !mlx4_is_mfunc(dev->dev))
+			qp->flags |= MLX4_IB_QP_NETIF;
+
+		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
 		if (err)
 			goto err;
 
-		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+		if (qp_has_rq(init_attr)) {
 			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
 			if (err)
 				goto err;
@@ -617,9 +1024,10 @@
 		}
 
 		if (qp->max_inline_data) {
-			err = mlx4_bf_alloc(dev->dev, &qp->bf);
+			err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
 			if (err) {
-				mlx4_ib_dbg("failed to allocate blue flame register (%d)", err);
+				pr_debug("failed to allocate blue flame"
+					 " register (%d)", err);
 				qp->bf.uar = &dev->priv_uar;
 			}
 		} else
@@ -632,16 +1040,12 @@
 
 		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
 				    &qp->mtt);
-		if (err) {
-			mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err);
+		if (err)
 			goto err_buf;
-		}
 
 		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
-		if (err) {
-			mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err);
+		if (err)
 			goto err_mtt;
-		}
 
 		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
 		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
@@ -653,11 +1057,17 @@
 	}
 
 	if (sqpn) {
-		qpn = sqpn;
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+			if (alloc_proxy_bufs(pd->device, qp)) {
+				err = -ENOMEM;
+				goto err_wrid;
+			}
+		}
 	} else {
-		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+		err = alloc_qpn_common(dev, qp, init_attr, &qpn);
 		if (err)
-			goto err_wrid;
+			goto err_proxy;
 	}
 
 	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
@@ -664,7 +1074,7 @@
 	if (err)
 		goto err_qpn;
 
-	if (init_attr->qp_type == IB_QPT_XRC)
+	if (init_attr->qp_type == IB_QPT_XRC_TGT)
 		qp->mqp.qpn |= (1 << 23);
 
 	/*
@@ -675,18 +1085,20 @@
 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 
 	qp->mqp.event = mlx4_ib_qp_event;
-
+	if (!*caller_qp)
+		*caller_qp = qp;
 	return 0;
 
 err_qpn:
-	if (!sqpn)
-		mlx4_qp_release_range(dev->dev, qpn, 1);
+	unalloc_qpn_common(dev, qp, init_attr, qpn);
 
+err_proxy:
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+		free_proxy_bufs(pd->device, qp);
 err_wrid:
 	if (pd->uobject) {
-		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
-			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
-					      &qp->db);
+		if (qp_has_rq(init_attr))
+			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
 	} else {
 		kfree(qp->sq.wrid);
 		kfree(qp->rq.wrid);
@@ -702,7 +1114,7 @@
 		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
 
 err_db:
-	if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
+	if (!pd->uobject && qp_has_rq(init_attr))
 		mlx4_db_free(dev->dev, &qp->db);
 
 	if (qp->max_inline_data)
@@ -709,6 +1121,8 @@
 		mlx4_bf_free(dev->dev, &qp->bf);
 
 err:
+	if (!*caller_qp)
+		kfree(qp);
 	return err;
 }
 
@@ -727,10 +1141,12 @@
 }
 
 static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
 {
-	if (send_cq == recv_cq)
+	if (send_cq == recv_cq) {
 		spin_lock_irq(&send_cq->lock);
-	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		(void) __acquire(&recv_cq->lock);
+	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
 		spin_lock_irq(&send_cq->lock);
 		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
 	} else {
@@ -740,10 +1156,12 @@
 }
 
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+	__releases(&send_cq->lock) __releases(&recv_cq->lock)
 {
-	if (send_cq == recv_cq)
+	if (send_cq == recv_cq) {
+		(void) __release(&recv_cq->lock);
 		spin_unlock_irq(&send_cq->lock);
-	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
 		spin_unlock(&recv_cq->lock);
 		spin_unlock_irq(&send_cq->lock);
 	} else {
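
mlx4_ib_lock_cqs()/mlx4_ib_unlock_cqs() impose a total order on the two CQ locks - lower cqn first - so paths that lock the same (send, recv) pair in either argument order cannot deadlock, and the send == recv case takes the lock once while annotating the second acquisition for sparse. A pthread sketch of the ordering rule (annotations and IRQ handling omitted):

#include <pthread.h>

struct cq {
	int cqn;
	pthread_mutex_t lock;
};

/* always acquire the CQ with the lower cqn first */
static void lock_pair(struct cq *send, struct cq *recv)
{
	if (send == recv) {
		pthread_mutex_lock(&send->lock);
	} else if (send->cqn < recv->cqn) {
		pthread_mutex_lock(&send->lock);
		pthread_mutex_lock(&recv->lock);
	} else {
		pthread_mutex_lock(&recv->lock);
		pthread_mutex_lock(&send->lock);
	}
}

/* release in the reverse order of acquisition */
static void unlock_pair(struct cq *send, struct cq *recv)
{
	if (send == recv) {
		pthread_mutex_unlock(&send->lock);
	} else if (send->cqn < recv->cqn) {
		pthread_mutex_unlock(&recv->lock);
		pthread_mutex_unlock(&send->lock);
	} else {
		pthread_mutex_unlock(&send->lock);
		pthread_mutex_unlock(&recv->lock);
	}
}

int main(void)
{
	struct cq a = { 1, PTHREAD_MUTEX_INITIALIZER };
	struct cq b = { 2, PTHREAD_MUTEX_INITIALIZER };

	lock_pair(&a, &b);
	unlock_pair(&a, &b);
	return 0;
}
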
@@ -754,7 +1172,7 @@
 
 static void del_gid_entries(struct mlx4_ib_qp *qp)
 {
-	struct gid_entry *ge, *tmp;
+	struct mlx4_ib_gid_entry *ge, *tmp;
 
 	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
 		list_del(&ge->list);
@@ -762,19 +1180,66 @@
 	}
 }
 
+static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
+{
+	if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
+		return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
+	else
+		return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx4_ib_qp *qp,
+		    struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
+{
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_XRC_TGT:
+		*send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
+		*recv_cq = *send_cq;
+		break;
+	case IB_QPT_XRC_INI:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = *send_cq;
+		break;
+	default:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = to_mcq(qp->ibqp.recv_cq);
+		break;
+	}
+}
+
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 			      int is_user)
 {
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 
-	if (qp->state != IB_QPS_RESET)
+	if (qp->state != IB_QPS_RESET) {
 		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
 				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
-			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+			pr_warn("modify QP %06x to RESET failed.\n",
 			       qp->mqp.qpn);
+		if (qp->pri.smac) {
+			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = 0;
+		}
+		if (qp->alt.smac) {
+			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = 0;
+		}
+		if (qp->pri.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+			qp->pri.vid = 0xFFFF;
+			qp->pri.candidate_vid = 0xFFFF;
+			qp->pri.update_vid = 0;
+		}
+		if (qp->alt.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+			qp->alt.vid = 0xFFFF;
+			qp->alt.candidate_vid = 0xFFFF;
+			qp->alt.update_vid = 0;
+		}
+	}
 
-	send_cq = to_mcq(qp->ibqp.send_cq);
-	recv_cq = to_mcq(qp->ibqp.recv_cq);
+	get_cqs(qp, &send_cq, &recv_cq);
 
 	mlx4_ib_lock_cqs(send_cq, recv_cq);
 
@@ -791,13 +1256,13 @@
 
 	mlx4_qp_free(dev->dev, &qp->mqp);
 
-	if (!is_sqp(dev, qp))
-		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp))
+		release_qpn_common(dev, qp);
 
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 	if (is_user) {
-		if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+		if (qp->rq.wqe_cnt)
 			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
 					      &qp->db);
 		ib_umem_release(qp->umem);
@@ -804,10 +1269,14 @@
 	} else {
 		kfree(qp->sq.wrid);
 		kfree(qp->rq.wrid);
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+			free_proxy_bufs(&dev->ib_dev, qp);
 		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
 		if (qp->max_inline_data)
 			mlx4_bf_free(dev->dev, &qp->bf);
-		if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+
+		if (qp->rq.wqe_cnt)
 			mlx4_db_free(dev->dev, &qp->db);
 	}
 
@@ -814,83 +1283,178 @@
 	del_gid_entries(qp);
 }
 
+static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+{
+	/* Native or PPF */
+	if (!mlx4_is_mfunc(dev->dev) ||
+	    (mlx4_is_master(dev->dev) &&
+	     attr->create_flags & MLX4_IB_SRIOV_SQP)) {
+		return  dev->dev->phys_caps.base_sqpn +
+			(attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+			attr->port_num - 1;
+	}
+	/* PF or VF -- creating proxies */
+	if (attr->qp_type == IB_QPT_SMI)
+		return dev->dev->caps.qp0_proxy[attr->port_num - 1];
+	else
+		return dev->dev->caps.qp1_proxy[attr->port_num - 1];
+}
+
+#ifdef __linux__
+static int check_qpg_attr(struct mlx4_ib_dev *dev,
+			  struct ib_qp_init_attr *attr)
+{
+	if (attr->qpg_type == IB_QPG_NONE)
+		return 0;
+
+	if (attr->qp_type != IB_QPT_UD)
+		return -EINVAL;
+
+	if (attr->qpg_type == IB_QPG_PARENT) {
+		if (attr->parent_attrib.tss_child_count == 1)
+			return -EINVAL; /* Doesn't make sense */
+		if (attr->parent_attrib.rss_child_count == 1)
+			return -EINVAL; /* Doesn't make sense */
+		if ((attr->parent_attrib.tss_child_count == 0) &&
+			(attr->parent_attrib.rss_child_count == 0))
+			/* Should be called with IP_QPG_NONE */
+			/* Should be called with IB_QPG_NONE */
+		if (attr->parent_attrib.rss_child_count > 1) {
+			int rss_align_num;
+			if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
+				return -ENOSYS;
+			rss_align_num = roundup_pow_of_two(
+					attr->parent_attrib.rss_child_count);
+			if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
+				return -EINVAL;
+		}
+	} else {
+		struct mlx4_ib_qpg_data *qpg_data;
+		if (attr->qpg_parent == NULL)
+			return -EINVAL;
+		if (IS_ERR(attr->qpg_parent))
+			return -EINVAL;
+		qpg_data = to_mqp(attr->qpg_parent)->qpg_data;
+		if (qpg_data == NULL)
+			return -EINVAL;
+		if (attr->qpg_type == IB_QPG_CHILD_TX &&
+		    !qpg_data->tss_child_count)
+			return -EINVAL;
+		if (attr->qpg_type == IB_QPG_CHILD_RX &&
+		    !qpg_data->rss_child_count)
+			return -EINVAL;
+	}
+	return 0;
+}
+#endif
+
+#define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END)   \
+							& ~(IB_QP_CREATE_RESERVED_START - 1))
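
RESERVED_FLAGS_MASK keeps exactly the bits in [IB_QP_CREATE_RESERVED_START, IB_QP_CREATE_RESERVED_END]: ((END - 1) | END) sets every bit up to and including END, and the final & ~(START - 1) clears everything below START. A worked check with assumed values START = 1 << 26 and END = 1 << 27 (illustrative only; the real enum values may differ):

#include <stdio.h>

#define START	(1u << 26)	/* assumed value, for illustration */
#define END	(1u << 27)

int main(void)
{
	unsigned mask = ((END - 1) | END) & ~(START - 1);

	printf("0x%08x\n", mask);	/* 0x0c000000: bits 26 and 27 */
	return 0;
}
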
+
+static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_flags)
+{
+	enum mlx4_ib_qp_flags mlx4_ib_qp_flags = 0;
+
+	if (ib_qp_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+		mlx4_ib_qp_flags |= MLX4_IB_QP_LSO;
+
+	if (ib_qp_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+		mlx4_ib_qp_flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+	if (ib_qp_flags & IB_QP_CREATE_NETIF_QP)
+		mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF;
+
+	/* reserved flags */
+	mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK);
+
+	return mlx4_ib_qp_flags;
+}
+
 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *init_attr,
 				struct ib_udata *udata)
 {
-	struct mlx4_ib_dev *dev = to_mdev(pd->device);
-	struct mlx4_ib_sqp *sqp;
-	struct mlx4_ib_qp *qp;
+	struct mlx4_ib_qp *qp = NULL;
 	int err;
+	u16 xrcdn = 0;
+	enum mlx4_ib_qp_flags mlx4_qp_flags = to_mlx4_ib_qp_flags(init_attr->create_flags);
+	struct ib_device *device;
 
+	/* see ib_core::ib_create_qp for the same handling */
+	device = pd ? pd->device : init_attr->xrcd->device;
 	/*
-	 * We only support LSO and multicast loopback blocking, and
-	 * only for kernel UD QPs.
+	 * We only support LSO, vendor flag1, and multicast loopback blocking,
+	 * and only for kernel UD QPs.
 	 */
-	if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
-					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+	if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO |
+					MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
+					MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP |
+					MLX4_IB_QP_NETIF))
 		return ERR_PTR(-EINVAL);
 
+	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+		if (init_attr->qp_type != IB_QPT_UD)
+			return ERR_PTR(-EINVAL);
+	}
+
 	if (init_attr->create_flags &&
-	    (pd->uobject || init_attr->qp_type != IB_QPT_UD))
+	    (udata ||
+	     ((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) &&
+	      init_attr->qp_type != IB_QPT_UD) ||
+	     ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) &&
+	      init_attr->qp_type > IB_QPT_GSI)))
 		return ERR_PTR(-EINVAL);
 
+#ifdef __linux__
+	err = check_qpg_attr(to_mdev(device), init_attr);
+	if (err)
+		return ERR_PTR(err);
+#endif
+
 	switch (init_attr->qp_type) {
-	case IB_QPT_XRC:
-		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+	case IB_QPT_XRC_TGT:
+		pd = to_mxrcd(init_attr->xrcd)->pd;
+		xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+		init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
+		/* fall through */
+	case IB_QPT_XRC_INI:
+		if (!(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
 			return ERR_PTR(-ENOSYS);
+		init_attr->recv_cq = init_attr->send_cq;
+		/* fall through */
 	case IB_QPT_RC:
 	case IB_QPT_UC:
-	case IB_QPT_UD:
-	case IB_QPT_RAW_ETH:
-	{
+	case IB_QPT_RAW_PACKET:
 		qp = kzalloc(sizeof *qp, GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
-
-		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+		qp->pri.vid = qp->alt.vid = 0xFFFF;
+		/* fall through */
+	case IB_QPT_UD:
+	{
+		err = create_qp_common(to_mdev(device), pd, init_attr, udata, 0, &qp);
 		if (err) {
 			kfree(qp);
 			return ERR_PTR(err);
 		}
 
-		if (init_attr->qp_type == IB_QPT_XRC)
-			qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
-		else
-			qp->xrcdn = 0;
-
 		qp->ibqp.qp_num = qp->mqp.qpn;
+		qp->xrcdn = xrcdn;
 
 		break;
 	}
-	case IB_QPT_RAW_ETY:
-		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY))
-			return ERR_PTR(-ENOSYS);
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	{
 		/* Userspace is not allowed to create special QPs: */
-		if (pd->uobject) {
-			mlx4_ib_dbg("Userspace is not allowed to create special QPs");
+		if (udata)
 			return ERR_PTR(-EINVAL);
-		}
 
-		sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
-		if (!sqp)
-			return ERR_PTR(-ENOMEM);
-
-		qp = &sqp->qp;
-
-		err = create_qp_common(dev, pd, init_attr, udata,
-				       dev->dev->caps.sqp_start +
-				       (init_attr->qp_type == IB_QPT_RAW_ETY ? 4 :
-				       (init_attr->qp_type == IB_QPT_SMI ? 0 : 2)) +
-				       init_attr->port_num - 1,
-				       qp);
-		if (err) {
-			kfree(sqp);
+		err = create_qp_common(to_mdev(device), pd, init_attr, udata,
+				       get_sqp_num(to_mdev(device), init_attr),
+				       &qp);
+		if (err)
 			return ERR_PTR(err);
-		}
 
 		qp->port	= init_attr->port_num;
 		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
@@ -898,8 +1462,7 @@
 		break;
 	}
 	default:
-		mlx4_ib_dbg("Invalid QP type requested for create_qp (%d)",
-			    init_attr->qp_type);
+		/* Don't support raw QPs */
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -910,11 +1473,13 @@
 {
 	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
+	struct mlx4_ib_pd *pd;
 
 	if (is_qp0(dev, mqp))
 		mlx4_CLOSE_PORT(dev->dev, mqp->port);
 
-	destroy_qp_common(dev, mqp, !!qp->pd->uobject);
+	pd = get_pd(mqp);
+	destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
 
 	if (is_sqp(dev, mqp))
 		kfree(to_msqp(mqp));
@@ -924,18 +1489,27 @@
 	return 0;
 }
 
-static int to_mlx4_st(enum ib_qp_type type)
+static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
 {
 	switch (type) {
-	case IB_QPT_RC:		return MLX4_QP_ST_RC;
-	case IB_QPT_UC:		return MLX4_QP_ST_UC;
-	case IB_QPT_UD:		return MLX4_QP_ST_UD;
-	case IB_QPT_XRC:	return MLX4_QP_ST_XRC;
-	case IB_QPT_RAW_ETY:
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
-	case IB_QPT_RAW_ETH:	return MLX4_QP_ST_MLX;
-	default:		return -1;
+	case MLX4_IB_QPT_RC:		return MLX4_QP_ST_RC;
+	case MLX4_IB_QPT_UC:		return MLX4_QP_ST_UC;
+	case MLX4_IB_QPT_UD:		return MLX4_QP_ST_UD;
+	case MLX4_IB_QPT_XRC_INI:
+	case MLX4_IB_QPT_XRC_TGT:	return MLX4_QP_ST_XRC;
+	case MLX4_IB_QPT_SMI:
+	case MLX4_IB_QPT_GSI:
+	case MLX4_IB_QPT_RAW_PACKET:	return MLX4_QP_ST_MLX;
+
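+	/* proxy and tunnel QP types are only meaningful when the device
+	 * is multi-function (SR-IOV) */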
+	case MLX4_IB_QPT_PROXY_SMI_OWNER:
+	case MLX4_IB_QPT_TUN_SMI_OWNER:	return (mlx4_is_mfunc(dev->dev) ?
+						MLX4_QP_ST_MLX : -1);
+	case MLX4_IB_QPT_PROXY_SMI:
+	case MLX4_IB_QPT_TUN_SMI:
+	case MLX4_IB_QPT_PROXY_GSI:
+	case MLX4_IB_QPT_TUN_GSI:	return (mlx4_is_mfunc(dev->dev) ?
+						MLX4_QP_ST_UD : -1);
+	default:			return -1;
 	}
 }
 
@@ -986,8 +1560,10 @@
 }
 
 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
-			 struct mlx4_qp_path *path, u8 port)
+			 struct mlx4_ib_qp *qp, struct mlx4_qp_path *path,
+			 u8 port, int is_primary)
 {
+	struct net_device *ndev;
 	int err;
 	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
 		IB_LINK_LAYER_ETHERNET;
@@ -995,6 +1571,10 @@
 	int is_mcast;
 	u16 vlan_tag;
 	int vidx;
+	int smac_index;
+	u64 u64_mac;
+	u8 *smac;
+	struct mlx4_roce_smac_vlan_info *smac_info;
 
 	path->grh_mylmc     = ah->src_path_bits & 0x7f;
 	path->rlid	    = cpu_to_be16(ah->dlid);
@@ -1008,7 +1588,7 @@
 
 	if (ah->ah_flags & IB_AH_GRH) {
 		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
-			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+			pr_err("sgid_index (%u) too large. max is %d\n",
 			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
 			return -1;
 		}
@@ -1023,29 +1603,96 @@
 	}
 
 	if (is_eth) {
-		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
-			((port - 1) << 6) | ((ah->sl & 0x7) << 3) | ((ah->sl & 8) >> 1);
-
 		if (!(ah->ah_flags & IB_AH_GRH))
 			return -1;
 
+		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+			((port - 1) << 6) | ((ah->sl & 7) << 3);
+
+		if (is_primary)
+			smac_info = &qp->pri;
+		else
+			smac_info = &qp->alt;
+
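+		/* A new vid is only recorded as a candidate here; it is
+		 * committed (and any previous vid unregistered) after the
+		 * modify-qp command succeeds, in the update_vid handling
+		 * at the end of __mlx4_ib_modify_qp. */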
+		vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
+		if (vlan_tag < 0x1000) {
+			if (smac_info->vid < 0x1000) {
+				/* both valid vlan ids */
+				if (smac_info->vid != vlan_tag) {
+					/* different VIDs.  unreg old and reg new */
+					err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+					if (err)
+						return err;
+					smac_info->candidate_vid = vlan_tag;
+					smac_info->candidate_vlan_index = vidx;
+					smac_info->candidate_vlan_port = port;
+					smac_info->update_vid = 1;
+					path->vlan_index = vidx;
+					path->fl = 1 << 6;
+				} else {
+					path->vlan_index = smac_info->vlan_index;
+					path->fl = 1 << 6;
+				}
+			} else {
+				/* no current vlan tag in qp */
+				err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+				if (err)
+					return err;
+				smac_info->candidate_vid = vlan_tag;
+				smac_info->candidate_vlan_index = vidx;
+				smac_info->candidate_vlan_port = port;
+				smac_info->update_vid = 1;
+				path->vlan_index = vidx;
+				path->fl = 1 << 6;
+			}
+		} else {
+			/* no vlan in the new path; if one is currently
+			 * registered, unregister it once modify-qp succeeds */
+			if (smac_info->vid < 0x1000) {
+				smac_info->candidate_vid = 0xFFFF;
+				smac_info->update_vid = 1;
+			}
+		}
+
 		err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
 		if (err)
 			return err;
 
+		/* Get smac_index for RoCE use.
+		 * If no smac was yet assigned, register one.
+		 * If one was already assigned, but the new mac differs,
+		 * unregister the old one and register the new one.
+		 */
+		spin_lock(&dev->iboe.lock);
+		ndev = dev->iboe.netdevs[port - 1];
+		if (ndev) {
+#ifdef __linux__
+			smac = ndev->dev_addr; /* fixme: cache this value */
+#else
+			smac = IF_LLADDR(ndev); /* fixme: cache this value */
+#endif
+			u64_mac = mlx4_mac_to_u64(smac);
+		} else
+			u64_mac = dev->dev->caps.def_mac[port];
+		spin_unlock(&dev->iboe.lock);
+
+		if (!smac_info->smac || smac_info->smac != u64_mac) {
+			/* register candidate now, unreg if needed, after success */
+			smac_index = mlx4_register_mac(dev->dev, port, u64_mac);
+			if (smac_index >= 0) {
+				smac_info->candidate_smac_index = smac_index;
+				smac_info->candidate_smac = u64_mac;
+				smac_info->candidate_smac_port = port;
+			} else
+				return -EINVAL;
+		} else
+			smac_index = smac_info->smac_index;
+
 		memcpy(path->dmac, mac, 6);
 		path->ackto = MLX4_IB_LINK_TYPE_ETH;
-		/* use index 0 into MAC table for IBoE */
-		path->grh_mylmc &= 0x80;
+		/* use the smac index from the MAC table for IBoE */
+		path->grh_mylmc = (u8) smac_index | 0x80;
 
-		vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
-		if (vlan_tag < 0x1000) {
-			if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx))
-				return -ENOENT;
-
-			path->vlan_index = vidx;
-			path->fl = 1 << 6;
-		}
 	} else
 		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
 			((port - 1) << 6) | ((ah->sl & 0xf) << 2);
@@ -1055,7 +1702,7 @@
 
 static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 {
-	struct gid_entry *ge, *tmp;
+	struct mlx4_ib_gid_entry *ge, *tmp;
 
 	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
 		if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
@@ -1065,6 +1712,38 @@
 	}
 }
 
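+/* Resolve the source MAC index for an Ethernet UD QP.  A newly
+ * registered MAC is stored as a candidate and committed once the
+ * modify-qp command succeeds. */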
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+				    struct mlx4_qp_context *context)
+{
+	struct net_device *ndev;
+	u64 u64_mac;
+	u8 *smac;
+	int smac_index;
+
+	ndev = dev->iboe.netdevs[qp->port - 1];
+	if (ndev) {
+#ifdef __linux__
+		smac = ndev->dev_addr; /* fixme: cache this value */
+#else
+		smac = IF_LLADDR(ndev); /* fixme: cache this value */
+#endif
+		u64_mac = mlx4_mac_to_u64(smac);
+	} else
+		u64_mac = dev->dev->caps.def_mac[qp->port];
+
+	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
+	if (!qp->pri.smac) {
+		smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
+		if (smac_index >= 0) {
+			qp->pri.candidate_smac_index = smac_index;
+			qp->pri.candidate_smac = u64_mac;
+			qp->pri.candidate_smac_port = qp->port;
+			context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
+		} else
+			return -ENOENT;
+	}
+	return 0;
+}
+
 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -1071,10 +1750,14 @@
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_ib_pd *pd;
+	struct mlx4_ib_cq *send_cq, *recv_cq;
 	struct mlx4_qp_context *context;
 	enum mlx4_qp_optpar optpar = 0;
 	int sqd_event;
+	int steer_qp = 0;
 	int err = -EINVAL;
+	int is_eth = -1;
 
 	context = kzalloc(sizeof *context, GFP_KERNEL);
 	if (!context)
@@ -1081,7 +1764,7 @@
 		return -ENOMEM;
 
 	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
-				     (to_mlx4_st(ibqp->qp_type) << 16));
+				     (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
 
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
 		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
@@ -1099,11 +1782,11 @@
 			break;
 		}
 	}
-	if (ibqp->qp_type == IB_QPT_RAW_ETH)
-		context->mtu_msgmax = 0xff;
-	else if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
-	    ibqp->qp_type == IB_QPT_RAW_ETY)
+
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+	else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
 	else if (ibqp->qp_type == IB_QPT_UD) {
 		if (qp->flags & MLX4_IB_QP_LSO)
 			context->mtu_msgmax = (IB_MTU_4096 << 5) |
@@ -1112,7 +1795,7 @@
 			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
 	} else if (attr_mask & IB_QP_PATH_MTU) {
 		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
-			printk(KERN_ERR "path MTU (%u) is invalid\n",
+			pr_err("path MTU (%u) is invalid\n",
 			       attr->path_mtu);
 			goto out;
 		}
@@ -1130,8 +1813,8 @@
 
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
-		if (ibqp->qp_type == IB_QPT_XRC)
-			context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+		context->param3 |= cpu_to_be32(1 << 30);
 	}
 
 	if (qp->ibqp.uobject)
@@ -1150,24 +1833,33 @@
 		}
 	}
 
-	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR &&
-	    dev->counters[qp->port - 1] != -1) {
-		context->pri_path.counter_index = dev->counters[qp->port - 1];
-		optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		if (dev->counters[qp->port - 1] != -1) {
+			context->pri_path.counter_index =
+						dev->counters[qp->port - 1];
+			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+		} else
+			context->pri_path.counter_index = 0xff;
+
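+		/* attach flow-steering rules when a NETIF QP moves to RTR;
+		 * this is rolled back in the error path below */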
+		if (qp->flags & MLX4_IB_QP_NETIF &&
+		    (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) {
+			mlx4_ib_steer_qp_reg(dev, qp, 1);
+			steer_qp = 1;
+		}
 	}
 
 	if (attr_mask & IB_QP_PKEY_INDEX) {
+		if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+			context->pri_path.disable_pkey_check = 0x40;
 		context->pri_path.pkey_index = attr->pkey_index;
 		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
 	}
 
 	if (attr_mask & IB_QP_AV) {
-		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
-				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
-			mlx4_ib_dbg("qpn 0x%x: could not set pri path params",
-				    ibqp->qp_num);
+		if (mlx4_set_path(dev, &attr->ah_attr, qp, &context->pri_path,
+				  attr_mask & IB_QP_PORT ?
+				  attr->port_num : qp->port, 1))
 			goto out;
-		}
 
 		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
 			   MLX4_QP_OPTPAR_SCHED_QUEUE);
@@ -1174,31 +1866,22 @@
 	}
 
 	if (attr_mask & IB_QP_TIMEOUT) {
-		context->pri_path.ackto |= (attr->timeout << 3);
+		context->pri_path.ackto |= attr->timeout << 3;
 		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
 	}
 
 	if (attr_mask & IB_QP_ALT_PATH) {
 		if (attr->alt_port_num == 0 ||
-		    attr->alt_port_num > dev->num_ports) {
-			mlx4_ib_dbg("qpn 0x%x: invalid alternate port num (%d)",
-				    ibqp->qp_num, attr->alt_port_num);
+		    attr->alt_port_num > dev->dev->caps.num_ports)
 			goto out;
-		}
 
 		if (attr->alt_pkey_index >=
-		    dev->dev->caps.pkey_table_len[attr->alt_port_num]) {
-			mlx4_ib_dbg("qpn 0x%x: invalid alt pkey index (0x%x)",
-				    ibqp->qp_num, attr->alt_pkey_index);
+		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
 			goto out;
-		}
 
-		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
-				  attr->alt_port_num)) {
-			mlx4_ib_dbg("qpn 0x%x: could not set alt path params",
-				    ibqp->qp_num);
+		if (mlx4_set_path(dev, &attr->alt_ah_attr, qp, &context->alt_path,
+				  attr->alt_port_num, 0))
 			goto out;
-		}
 
 		context->alt_path.pkey_index = attr->alt_pkey_index;
 		context->alt_path.ackto = attr->alt_timeout << 3;
@@ -1205,8 +1888,12 @@
 		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
 	}
 
-	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
-	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+	pd = get_pd(qp);
+	get_cqs(qp, &send_cq, &recv_cq);
+	context->pd       = cpu_to_be32(pd->pdn);
+	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
+	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
+	context->params1  = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
 
 	/* Set "fast registration enabled" for all kernel QPs */
 	if (!qp->ibqp.uobject)
@@ -1232,8 +1919,6 @@
 	if (attr_mask & IB_QP_SQ_PSN)
 		context->next_send_psn = cpu_to_be32(attr->sq_psn);
 
-	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
-
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
 		if (attr->max_dest_rd_atomic)
 			context->params2 |=
@@ -1246,6 +1931,18 @@
 		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
 	}
 
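+	/* IB_M_EXT_CLASS_* are vendor extended-verbs attributes; per the
+	 * MLX4_QP_BIT_COLL_* names they map to collective-offload
+	 * master/sync bits */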
+	if (attr_mask & IB_M_EXT_CLASS_1)
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
+
+	/* for now we also enable sqe on send */
+	if (attr_mask & IB_M_EXT_CLASS_2) {
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ);
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
+	}
+
+	if (attr_mask & IB_M_EXT_CLASS_3)
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ);
+
 	if (ibqp->srq)
 		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
 
@@ -1256,10 +1953,24 @@
 	if (attr_mask & IB_QP_RQ_PSN)
 		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
 
-	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
-
+	/* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
 	if (attr_mask & IB_QP_QKEY) {
-		context->qkey = cpu_to_be32(attr->qkey);
+		if (qp->mlx4_ib_qp_type &
+		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
+			context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+		else {
+			if (mlx4_is_mfunc(dev->dev) &&
+			    !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
+			    (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
+			    MLX4_RESERVED_QKEY_BASE) {
+				pr_err("Cannot use reserved QKEY"
+				       " 0x%x (range 0xffff0000..0xffffffff"
+				       " is reserved)\n", attr->qkey);
+				err = -EINVAL;
+				goto out;
+			}
+			context->qkey = cpu_to_be32(attr->qkey);
+		}
 		optpar |= MLX4_QP_OPTPAR_Q_KEY;
 	}
 
@@ -1266,20 +1977,41 @@
 	if (ibqp->srq)
 		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
 
-	if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC &&
-	    cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
 		context->db_rec_addr = cpu_to_be64(qp->db.dma);
 
 	if (cur_state == IB_QPS_INIT &&
 	    new_state == IB_QPS_RTR  &&
 	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
-	     ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_ETY ||
-		ibqp->qp_type == IB_QPT_RAW_ETH)) {
+	     ibqp->qp_type == IB_QPT_UD ||
+	     ibqp->qp_type == IB_QPT_RAW_PACKET)) {
 		context->pri_path.sched_queue = (qp->port - 1) << 6;
-		if (is_qp0(dev, qp))
+		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+		    qp->mlx4_ib_qp_type &
+		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
 			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
-		else
+			if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
+				context->pri_path.fl = 0x80;
+		} else {
+			if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+				context->pri_path.fl = 0x80;
 			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+		}
+		is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+			IB_LINK_LAYER_ETHERNET;
+		if (is_eth) {
+			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
+				context->pri_path.feup = 1 << 7; /* don't fsm */
+			/* handle smac_index */
+			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
+				err = handle_eth_ud_smac_index(dev, qp, context);
+				if (err)
+					return -EINVAL;
+			}
+		}
 	}
 
 	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
@@ -1291,6 +2023,43 @@
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
 		context->rlkey |= (1 << 4);
 
+	if ((attr_mask & IB_QP_GROUP_RSS) &&
+		(qp->qpg_data->rss_child_count > 1)) {
+		struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data;
+		void *rss_context_base = &context->pri_path;
+		struct mlx4_rss_context *rss_context =
+			(struct mlx4_rss_context *) (rss_context_base
+					+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH);
+
+		context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
+
+		/* This should be tbl_sz_base_qpn */
+		rss_context->base_qpn = cpu_to_be32(qpg_data->rss_qpn_base |
+				(ilog2(qpg_data->rss_child_count) << 24));
+		rss_context->default_qpn = cpu_to_be32(qpg_data->rss_qpn_base);
+		/* This should be flags_hash_fn */
+		rss_context->flags = MLX4_RSS_TCP_IPV6 |
+				     MLX4_RSS_TCP_IPV4;
+		if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS) {
+			rss_context->base_qpn_udp = rss_context->default_qpn;
+			rss_context->flags |= MLX4_RSS_IPV6 |
+					MLX4_RSS_IPV4     |
+					MLX4_RSS_UDP_IPV6 |
+					MLX4_RSS_UDP_IPV4;
+		}
+		if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) {
+			static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B,
+				0x1983A2FC, 0x943E1ADB, 0xD9389E6B, 0xD1039C2C,
+				0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
+			rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+			memcpy(rss_context->rss_key, rsskey,
+				sizeof(rss_context->rss_key));
+		} else {
+			rss_context->hash_fn = MLX4_RSS_HASH_XOR;
+			memset(rss_context->rss_key, 0,
+				sizeof(rss_context->rss_key));
+		}
+	}
 	/*
 	 * Before passing a kernel QP to the HW, make sure that the
 	 * ownership bits of the send queue are set and the SQ
@@ -1333,6 +2102,29 @@
 	if (is_sqp(dev, qp))
 		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
 
+	/* Set 'ignore_cq_overrun' bits for collectives offload */
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) {
+			err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq);
+			if (err) {
+				pr_err("Failed to set ignore CQ "
+				       "overrun for QP 0x%x's send CQ\n",
+				       ibqp->qp_num);
+				goto out;
+			}
+
+			if (ibqp->recv_cq != ibqp->send_cq) {
+				err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq);
+				if (err) {
+					pr_err("Failed to set ignore "
+					       "CQ overrun for QP 0x%x's recv "
+					       "CQ\n", ibqp->qp_num);
+					goto out;
+				}
+			}
+		}
+	}
+
 	/*
 	 * If we moved QP0 to RTR, bring the IB link up; if we moved
 	 * QP0 to RESET or ERROR, bring the link back down.
@@ -1340,7 +2132,7 @@
 	if (is_qp0(dev, qp)) {
 		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
 			if (mlx4_INIT_PORT(dev->dev, qp->port))
-				printk(KERN_WARNING "INIT_PORT failed for port %d\n",
+				pr_warn("INIT_PORT failed for port %d\n",
 				       qp->port);
 
 		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
@@ -1352,23 +2144,120 @@
 	 * If we moved a kernel QP to RESET, clean up all old CQ
 	 * entries and reinitialize the QP.
 	 */
-	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
-		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
-				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
-		if (ibqp->send_cq != ibqp->recv_cq)
-			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+	if (new_state == IB_QPS_RESET) {
+		if (!ibqp->uobject) {
+			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+					 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+			if (send_cq != recv_cq)
+				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
 
-		qp->rq.head = 0;
-		qp->rq.tail = 0;
-		qp->sq.head = 0;
-		qp->sq.tail = 0;
-		qp->sq_next_wqe = 0;
-		if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC)
-			*qp->db.db  = 0;
+			qp->rq.head = 0;
+			qp->rq.tail = 0;
+			qp->sq.head = 0;
+			qp->sq.tail = 0;
+			qp->sq_next_wqe = 0;
+			if (qp->rq.wqe_cnt)
+				*qp->db.db  = 0;
+
+			if (qp->flags & MLX4_IB_QP_NETIF &&
+			    (qp->qpg_type == IB_QPG_NONE ||
+			     qp->qpg_type == IB_QPG_PARENT))
+				mlx4_ib_steer_qp_reg(dev, qp, 0);
+		}
+		if (qp->pri.smac) {
+			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = 0;
+		}
+		if (qp->alt.smac) {
+			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = 0;
+		}
+		if (qp->pri.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+			qp->pri.vid = 0xFFFF;
+			qp->pri.candidate_vid = 0xFFFF;
+			qp->pri.update_vid = 0;
+		}
+
+		if (qp->alt.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+			qp->alt.vid = 0xFFFF;
+			qp->alt.candidate_vid = 0xFFFF;
+			qp->alt.update_vid = 0;
+		}
 	}
 
 out:
+	if (err && steer_qp)
+		mlx4_ib_steer_qp_reg(dev, qp, 0);
 	kfree(context);
+	if (qp->pri.candidate_smac) {
+		if (err)
+			mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
+		else {
+			if (qp->pri.smac)
+				mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = qp->pri.candidate_smac;
+			qp->pri.smac_index = qp->pri.candidate_smac_index;
+			qp->pri.smac_port = qp->pri.candidate_smac_port;
+		}
+		qp->pri.candidate_smac = 0;
+		qp->pri.candidate_smac_index = 0;
+		qp->pri.candidate_smac_port = 0;
+	}
+	if (qp->alt.candidate_smac) {
+		if (err)
+			mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
+		else {
+			if (qp->alt.smac)
+				mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = qp->alt.candidate_smac;
+			qp->alt.smac_index = qp->alt.candidate_smac_index;
+			qp->alt.smac_port = qp->alt.candidate_smac_port;
+		}
+		qp->alt.candidate_smac = 0;
+		qp->alt.candidate_smac_index = 0;
+		qp->alt.candidate_smac_port = 0;
+	}
+
+	if (qp->pri.update_vid) {
+		if (err) {
+			if (qp->pri.candidate_vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
+						     qp->pri.candidate_vid);
+		} else {
+			if (qp->pri.vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
+						     qp->pri.vid);
+			qp->pri.vid = qp->pri.candidate_vid;
+			qp->pri.vlan_port = qp->pri.candidate_vlan_port;
+			qp->pri.vlan_index = qp->pri.candidate_vlan_index;
+		}
+		qp->pri.candidate_vid = 0xFFFF;
+		qp->pri.update_vid = 0;
+	}
+
+	if (qp->alt.update_vid) {
+		if (err) {
+			if (qp->alt.candidate_vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
+						     qp->alt.candidate_vid);
+		} else {
+			if (qp->alt.vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
+						     qp->alt.vid);
+			qp->alt.vid = qp->alt.candidate_vid;
+			qp->alt.vlan_port = qp->alt.candidate_vlan_port;
+			qp->alt.vlan_index = qp->alt.candidate_vlan_index;
+		}
+		qp->alt.candidate_vid = 0xFFFF;
+		qp->alt.update_vid = 0;
+	}
+
 	return err;
 }
 
@@ -1385,40 +2274,43 @@
 	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
-		mlx4_ib_dbg("qpn 0x%x: invalid attribute mask specified "
-			    "for transition %d to %d. qp_type %d, attr_mask 0x%x",
-			    ibqp->qp_num, cur_state, new_state,
-			    ibqp->qp_type, attr_mask);
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask & ~IB_M_QP_MOD_VEND_MASK)) {
+		pr_debug("qpn 0x%x: invalid attribute mask specified "
+			 "for transition %d to %d. qp_type %d,"
+			 " attr_mask 0x%x\n",
+			 ibqp->qp_num, cur_state, new_state,
+			 ibqp->qp_type, attr_mask);
 		goto out;
 	}
 
-	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type != IB_QPT_RAW_ETH) &&
-	    (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
-		mlx4_ib_dbg("qpn 0x%x: invalid port number (%d) specified "
-			    "for transition %d to %d. qp_type %d",
-			    ibqp->qp_num, attr->port_num, cur_state,
-			    new_state, ibqp->qp_type);
+	if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) {
+		pr_err("extended verbs are not supported by %s\n",
+		       dev->ib_dev.name);
 		goto out;
 	}
 
-	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_ETH) &&
-		(rdma_port_get_link_layer(&dev->ib_dev, attr->port_num)
-				!= IB_LINK_LAYER_ETHERNET)) {
-		mlx4_ib_dbg("qpn 0x%x: invalid port (%d) specified (not RDMAoE)"
-			    "for transition %d to %d. qp_type %d",
-			    ibqp->qp_num, attr->port_num, cur_state,
-			    new_state, ibqp->qp_type);
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+		pr_debug("qpn 0x%x: invalid port number (%d) specified "
+			 "for transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->port_num, cur_state,
+			 new_state, ibqp->qp_type);
 		goto out;
 	}
 
+	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
+	    (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
+	     IB_LINK_LAYER_ETHERNET))
+		goto out;
+
 	if (attr_mask & IB_QP_PKEY_INDEX) {
 		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
 		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
-			mlx4_ib_dbg("qpn 0x%x: invalid pkey index (%d) specified "
-				    "for transition %d to %d. qp_type %d",
-				    ibqp->qp_num, attr->pkey_index, cur_state,
-				    new_state, ibqp->qp_type);
+			pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
+				 "for transition %d to %d. qp_type %d\n",
+				 ibqp->qp_num, attr->pkey_index, cur_state,
+				 new_state, ibqp->qp_type);
 			goto out;
 		}
 	}
@@ -1425,19 +2317,19 @@
 
 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
 	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
-		mlx4_ib_dbg("qpn 0x%x: max_rd_atomic (%d) too large. "
-			    "Transition %d to %d. qp_type %d",
-			    ibqp->qp_num, attr->max_rd_atomic, cur_state,
-			    new_state, ibqp->qp_type);
+		pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
+			 "Transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->max_rd_atomic, cur_state,
+			 new_state, ibqp->qp_type);
 		goto out;
 	}
 
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
 	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
-		mlx4_ib_dbg("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
-			    "Transition %d to %d. qp_type %d",
-			    ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
-			    new_state, ibqp->qp_type);
+		pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+			 "Transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+			 new_state, ibqp->qp_type);
 		goto out;
 	}
 
@@ -1453,46 +2345,111 @@
 	return err;
 }
 
-static int build_raw_ety_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
-			    void *wqe, unsigned *mlx_seg_len)
+static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
+				  struct ib_send_wr *wr,
+				  void *wqe, unsigned *mlx_seg_len)
 {
-	int payload = 0;
-	int header_size, packet_length;
+	struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
+	struct ib_device *ib_dev = &mdev->ib_dev;
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
 	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
-	u32 *lrh = wqe + sizeof *mlx + sizeof *inl;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	u16 pkey;
+	u32 qkey;
+	int send_size;
+	int header_size;
+	int spc;
 	int i;
 
-	/* Only IB_WR_SEND is supported */
 	if (wr->opcode != IB_WR_SEND)
 		return -EINVAL;
 
+	send_size = 0;
+
 	for (i = 0; i < wr->num_sge; ++i)
-		payload += wr->sg_list[i].length;
+		send_size += wr->sg_list[i].length;
 
-	header_size = IB_LRH_BYTES + 4; /* LRH + RAW_HEADER (32 bits) */
+	/* for proxy-qp0 sends, need to add in size of tunnel header */
+	/* for tunnel-qp0 sends, tunnel header is already in s/g list */
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+		send_size += sizeof (struct mlx4_ib_tunnel_header);
 
-	/* headers + payload and round up */
-	packet_length = (header_size + payload + 3) / 4;
+	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
 
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+		sqp->ud_header.lrh.service_level =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+		sqp->ud_header.lrh.destination_lid =
+			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+		sqp->ud_header.lrh.source_lid =
+			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+	}
+
 	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
-	mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_ICRC |
-				  (wr->wr.raw_ety.lrh->service_level << 8));
+	/* force loopback */
+	mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
 
-	mlx->rlid = wr->wr.raw_ety.lrh->destination_lid;
+	sqp->ud_header.lrh.virtual_lane    = 0;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+		sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	else
+		sqp->ud_header.bth.destination_qpn =
+			cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
 
-	wr->wr.raw_ety.lrh->packet_length = cpu_to_be16(packet_length);
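+	/* the BTH PSN is a 24-bit field */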
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+		return -EINVAL;
+	sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
 
-	ib_lrh_header_pack(wr->wr.raw_ety.lrh, lrh);
-	lrh += IB_LRH_BYTES / 4;	/* LRH size is a dword multiple */
-	*lrh = cpu_to_be32(wr->wr.raw_ety.eth_type);
+	sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+	sqp->ud_header.immediate_present = 0;
 
-	inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
 
+	/*
+	 * Inline data segments may not cross a 64 byte boundary.  If
+	 * our UD header is bigger than the space available up to the
+	 * next 64 byte boundary in the WQE, use two inline data
+	 * segments to hold the UD header.
+	 */
+	spc = MLX4_INLINE_ALIGN -
+	      ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (header_size <= spc) {
+		inl->byte_count = cpu_to_be32(1U << 31 | header_size);
+		memcpy(inl + 1, sqp->header_buf, header_size);
+		i = 1;
+	} else {
+		inl->byte_count = cpu_to_be32(1U << 31 | spc);
+		memcpy(inl + 1, sqp->header_buf, spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+		/*
+		 * Need a barrier here to make sure all the data is
+		 * visible before the byte_count field is set.
+		 * Otherwise the HCA prefetcher could grab the 64-byte
+		 * chunk with this inline segment and get a valid (!=
+		 * 0xffffffff) byte count but stale data, and end up
+		 * generating a packet with bad headers.
+		 *
+		 * The first inline segment's byte_count field doesn't
+		 * need a barrier, because it comes after a
+		 * control/MLX segment and therefore is at an offset
+		 * of 16 mod 64.
+		 */
+		wmb();
+		inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
+		i = 2;
+	}
+
 	*mlx_seg_len =
-		ALIGN(sizeof(struct mlx4_wqe_inline_seg) + header_size, 16);
-
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
 	return 0;
 }
 
@@ -1499,23 +2456,23 @@
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe, unsigned *mlx_seg_len)
 {
-	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+	struct ib_device *ib_dev = sqp->qp.ibqp.device;
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl = wqe;
 	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
 	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	union ib_gid sgid;
 	u16 pkey;
 	int send_size;
 	int header_size;
 	int spc;
 	int i;
-	union ib_gid sgid;
 	int is_eth;
+	int is_vlan = 0;
 	int is_grh;
-	int is_vlan = 0;
-	int err;
-	u16 vlan;
+	u16 vlan = 0;
+	int err = 0;
 
-	vlan = 0;
 	send_size = 0;
 	for (i = 0; i < wr->num_sge; ++i)
 		send_size += wr->sg_list[i].length;
@@ -1522,16 +2479,29 @@
 
 	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
 	is_grh = mlx4_ib_ah_grh_present(ah);
-	err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
-				ah->av.ib.gid_index, &sgid);
-	if (err)
-		return err;
 	if (is_eth) {
-		is_vlan = rdma_get_vlan_id(&sgid) < 0x1000;
+		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+			/* When multi-function is enabled, the ib_core gid
+			 * indexes don't necessarily match the hw ones, so
+			 * we must use our own cache */
+			err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
+							   be32_to_cpu(ah->av.ib.port_pd) >> 24,
+							   ah->av.ib.gid_index, &sgid.raw[0]);
+			if (err)
+				return err;
+		} else  {
+			err = ib_get_cached_gid(ib_dev,
+						be32_to_cpu(ah->av.ib.port_pd) >> 24,
+						ah->av.ib.gid_index, &sgid);
+			if (err)
+				return err;
+		}
+
 		vlan = rdma_get_vlan_id(&sgid);
+		is_vlan = vlan < 0x1000;
 	}
+	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
 
-	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
 	if (!is_eth) {
 		sqp->ud_header.lrh.service_level =
 			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
@@ -1545,8 +2515,25 @@
 		sqp->ud_header.grh.flow_label    =
 			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
 		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
-		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
-				  ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+		if (is_eth)
+			memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
+		else {
+			if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+				/* When multi-function is enabled, the ib_core gid
+				 * indexes don't necessarily match the hw ones, so
+				 * we must use our own cache */
+				sqp->ud_header.grh.source_gid.global.subnet_prefix =
+					to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+							subnet_prefix;
+				sqp->ud_header.grh.source_gid.global.interface_id =
+					to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+						guid_cache[ah->av.ib.gid_index];
+			} else
+				ib_get_cached_gid(ib_dev,
+						  be32_to_cpu(ah->av.ib.port_pd) >> 24,
+						  ah->av.ib.gid_index,
+						  &sqp->ud_header.grh.source_gid);
+		}
 		memcpy(sqp->ud_header.grh.destination_gid.raw,
 		       ah->av.ib.dgid, 16);
 	}
@@ -1558,16 +2545,18 @@
 					  (sqp->ud_header.lrh.destination_lid ==
 					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
 					  (sqp->ud_header.lrh.service_level << 8));
+		if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
+			mlx->flags |= cpu_to_be32(0x1); /* force loopback */
 		mlx->rlid = sqp->ud_header.lrh.destination_lid;
 	}
 
 	switch (wr->opcode) {
 	case IB_WR_SEND:
-		sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
 		sqp->ud_header.immediate_present = 0;
 		break;
 	case IB_WR_SEND_WITH_IMM:
-		sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
 		sqp->ud_header.immediate_present = 1;
 		sqp->ud_header.immediate_data    = wr->ex.imm_data;
 		break;
@@ -1576,24 +2565,26 @@
 	}
 
 	if (is_eth) {
-		u8 *smac;
+		u8 smac[6];
+		struct in6_addr in6;
 
+		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+
+		mlx->sched_prio = cpu_to_be16(pcp);
+
 		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
-#ifdef __linux__
-		smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */
-#else
-		smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); /* fixme: cache this value */
-#endif
+		/* FIXME: cache smac value? */
+		memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
+		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
+		memcpy(&in6, sgid.raw, sizeof(in6));
+		rdma_get_ll_mac(&in6, smac);
 		memcpy(sqp->ud_header.eth.smac_h, smac, 6);
 		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
 			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
-		if (!is_vlan)
-			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
-		else {
-			u16 pcp;
-
-			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
-			pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
+		if (!is_vlan) {
+			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+		} else {
+			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
 			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
 		}
 	} else {
@@ -1616,16 +2607,16 @@
 	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
 
 	if (0) {
-		printk(KERN_ERR "built UD header of size %d:\n", header_size);
+		pr_err("built UD header of size %d:\n", header_size);
 		for (i = 0; i < header_size / 4; ++i) {
 			if (i % 8 == 0)
-				printk("  [%02x] ", i * 4);
-			printk(" %08x",
-			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+				pr_err("  [%02x] ", i * 4);
+			pr_cont(" %08x",
+				be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
 			if ((i + 1) % 8 == 0)
-				printk("\n");
+				pr_cont("\n");
 		}
-		printk("\n");
+		pr_err("\n");
 	}
 
 	/*
@@ -1635,13 +2626,13 @@
 	 * segments to hold the UD header.
 	 */
 	spc = MLX4_INLINE_ALIGN -
-	      ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
 	if (header_size <= spc) {
-		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+		inl->byte_count = cpu_to_be32(1U << 31 | header_size);
 		memcpy(inl + 1, sqp->header_buf, header_size);
 		i = 1;
 	} else {
-		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		inl->byte_count = cpu_to_be32(1U << 31 | spc);
 		memcpy(inl + 1, sqp->header_buf, spc);
 
 		inl = (void *) (inl + 1) + spc;
@@ -1660,12 +2651,12 @@
 		 * of 16 mod 64.
 		 */
 		wmb();
-		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+		inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
 		i = 2;
 	}
 
 	*mlx_seg_len =
-	ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
 	return 0;
 }
 
@@ -1688,10 +2679,10 @@
 
 static __be32 convert_access(int acc)
 {
-	return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC)       : 0) |
-	       (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ)  : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC)		: 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE)	: 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ)	: 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  		: 0) |
 		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
 }
 
@@ -1718,10 +2709,12 @@
 
 static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
 {
-	iseg->flags	= 0;
-	iseg->mem_key	= cpu_to_be32(rkey);
-	iseg->guest_id	= 0;
-	iseg->pa	= 0;
+	iseg->mem_key = cpu_to_be32(rkey);
+
+	iseg->reserved1    = 0;
+	iseg->reserved2    = 0;
+	iseg->reserved3[0] = 0;
+	iseg->reserved3[1] = 0;
 }
 
 static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
@@ -1757,7 +2750,7 @@
 }
 
 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
-			     struct ib_send_wr *wr, __be16 *vlan)
+			     struct ib_send_wr *wr)
 {
 	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
 	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
@@ -1764,9 +2757,65 @@
 	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
 	dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
 	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
-	*vlan = dseg->vlan;
 }
 
+static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
+				    struct mlx4_wqe_datagram_seg *dseg,
+				    struct ib_send_wr *wr, enum ib_qp_type qpt)
+{
+	union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
+	struct mlx4_av sqp_av = {0};
+	int port = *((u8 *) &av->ib.port_pd) & 0x3;
+
+	/* force loopback */
+	sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
+	sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
+	sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
+			cpu_to_be32(0xf0000000);
+
+	memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
+	/* This function used only for sending on QP1 proxies */
+	dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+	/* Use QKEY from the QP context, which is set by master */
+	dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+}
+
+static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len)
+{
+	struct mlx4_wqe_inline_seg *inl = wqe;
+	struct mlx4_ib_tunnel_header hdr;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	int spc;
+	int i;
+
+	memcpy(&hdr.av, &ah->av, sizeof hdr.av);
+	hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
+	hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+
+	spc = MLX4_INLINE_ALIGN -
+		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (sizeof (hdr) <= spc) {
+		memcpy(inl + 1, &hdr, sizeof (hdr));
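+		/* make the header bytes visible before byte_count is set
+		 * (same barrier reasoning as in build_mlx_header) */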
+		wmb();
+		inl->byte_count = cpu_to_be32(1U << 31 | sizeof (hdr));
+		i = 1;
+	} else {
+		memcpy(inl + 1, &hdr, spc);
+		wmb();
+		inl->byte_count = cpu_to_be32(1U << 31 | spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
+		wmb();
+		inl->byte_count = cpu_to_be32(1U << 31 | (sizeof (hdr) - spc));
+		i = 2;
+	}
+
+	*mlx_seg_len =
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
+}
+
 static void set_mlx_icrc_seg(void *dseg)
 {
 	u32 *t = dseg;
@@ -1814,11 +2863,12 @@
 
 static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
 			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
-			 __be32 *lso_hdr_sz, int *blh)
+			 __be32 *lso_hdr_sz, __be32 *blh)
 {
 	unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
 
-	*blh = unlikely(halign > 64) ? 1 : 0;
+	if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
+		*blh = cpu_to_be32(1 << 6);
 
 	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
 		     wr->num_sge > qp->sq.max_gs - (halign >> 4)))
@@ -1847,6 +2897,13 @@
 	}
 }
 
+static void add_zero_len_inline(void *wqe)
+{
+	struct mlx4_wqe_inline_seg *inl = wqe;
+	memset(wqe, 0, 16);
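+	/* inline flag with a zero byte count: a zero-length inline segment
+	 * used purely for alignment */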
+	inl->byte_count = cpu_to_be32(1U << 31);
+}
+
 static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
 			   void *wqe, int *sz)
 {
@@ -1923,7 +2980,8 @@
  * implementations may use move-string-buffer assembler instructions,
  * which do not guarantee order of copying.
  */
-static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src,
+				unsigned bytecnt)
 {
 	__iowrite64_copy(dst, src, bytecnt / 8);
 }
@@ -1933,7 +2991,7 @@
 {
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	void *wqe;
-	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl);
 	struct mlx4_wqe_data_seg *dseg;
 	unsigned long flags;
 	int nreq;
@@ -1945,12 +3003,9 @@
 	__be32 dummy;
 	__be32 *lso_wqe;
 	__be32 uninitialized_var(lso_hdr_sz);
+	__be32 blh;
 	int i;
-	int blh = 0;
-	__be16 vlan = 0;
 	int inl = 0;
-
-	ctrl = NULL;
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
 	ind = qp->sq_next_wqe;
@@ -1957,9 +3012,9 @@
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		lso_wqe = &dummy;
+		blh = 0;
 
 		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
-			mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
 			err = -ENOMEM;
 			*bad_wr = wr;
 			goto out;
@@ -1966,8 +3021,6 @@
 		}
 
 		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
-			mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
-				    ibqp->qp_num, wr->num_sge);
 			err = -EINVAL;
 			*bad_wr = wr;
 			goto out;
@@ -1992,13 +3045,9 @@
 		wqe += sizeof *ctrl;
 		size = sizeof *ctrl / 16;
 
-		switch (ibqp->qp_type) {
-		case IB_QPT_XRC:
-			ctrl->srcrb_flags |=
-				cpu_to_be32(wr->xrc_remote_srq_num << 8);
-			/* fall thru */
-		case IB_QPT_RC:
-		case IB_QPT_UC:
+		switch (qp->mlx4_ib_qp_type) {
+		case MLX4_IB_QPT_RC:
+		case MLX4_IB_QPT_UC:
 			switch (wr->opcode) {
 			case IB_WR_ATOMIC_CMP_AND_SWP:
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
@@ -2059,10 +3108,28 @@
 			}
 			break;
 
-		case IB_QPT_UD:
-			set_datagram_seg(wqe, wr, &vlan);
+		case MLX4_IB_QPT_TUN_SMI_OWNER:
+			err =  build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+		case MLX4_IB_QPT_TUN_SMI:
+		case MLX4_IB_QPT_TUN_GSI:
+			/* this is a UD qp used in MAD responses to slaves. */
+			set_datagram_seg(wqe, wr);
+			/* set the forced-loopback bit in the data seg av */
+			*(__be32 *) wqe |= cpu_to_be32(0x80000000);
 			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
 			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+			break;
+		case MLX4_IB_QPT_UD:
+			set_datagram_seg(wqe, wr);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
 
 			if (wr->opcode == IB_WR_LSO) {
 				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
@@ -2076,9 +3143,13 @@
 			}
 			break;
 
-		case IB_QPT_SMI:
-		case IB_QPT_GSI:
-			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+		case MLX4_IB_QPT_PROXY_SMI_OWNER:
+			if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) {
+				err = -ENOSYS;
+				*bad_wr = wr;
+				goto out;
+			}
+			err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
 			if (unlikely(err)) {
 				*bad_wr = wr;
 				goto out;
@@ -2085,11 +3156,35 @@
 			}
 			wqe  += seglen;
 			size += seglen / 16;
+			/* to start tunnel header on a cache-line boundary */
+			add_zero_len_inline(wqe);
+			wqe += 16;
+			size++;
+			build_tunnel_header(wr, wqe, &seglen);
+			wqe  += seglen;
+			size += seglen / 16;
 			break;
+		case MLX4_IB_QPT_PROXY_SMI:
+			/* don't allow QP0 sends on guests */
+			err = -ENOSYS;
+			*bad_wr = wr;
+			goto out;
+		case MLX4_IB_QPT_PROXY_GSI:
+			/* If we are tunneling special qps, this is a UD qp.
+			 * In this case we first add a UD segment targeting
+			 * the tunnel qp, and then add a header with address
+			 * information */
+			set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+			build_tunnel_header(wr, wqe, &seglen);
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
 
-		case IB_QPT_RAW_ETY:
-			err = build_raw_ety_header(to_msqp(qp), wr, ctrl,
-						   &seglen);
+		case MLX4_IB_QPT_SMI:
+		case MLX4_IB_QPT_GSI:
+			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
 			if (unlikely(err)) {
 				*bad_wr = wr;
 				goto out;
@@ -2108,13 +3203,14 @@
 		 * cacheline.  This avoids issues with WQE
 		 * prefetching.
 		 */
-
 		dseg = wqe;
 		dseg += wr->num_sge - 1;
 
 		/* Add one more inline data segment for ICRC for MLX sends */
-		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
-			     qp->ibqp.qp_type == IB_QPT_GSI)) {
+		if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+			     qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
+			     qp->mlx4_ib_qp_type &
+			     (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
 			set_mlx_icrc_seg(dseg + 1);
 			size += sizeof (struct mlx4_wqe_data_seg) / 16;
 		}
@@ -2127,7 +3223,8 @@
 				size += sz;
 			}
 		} else {
-			size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+			size += wr->num_sge *
+				(sizeof(struct mlx4_wqe_data_seg) / 16);
 			for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
 				set_data_seg(dseg, wr->sg_list + i);
 		}
@@ -2139,15 +3236,9 @@
 		 */
 		wmb();
 		*lso_wqe = lso_hdr_sz;
-
 		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
 				    MLX4_WQE_CTRL_FENCE : 0) | size;
 
-		if (vlan) {
-			ctrl->ins_vlan = 1 << 6;
-			ctrl->vlan_tag = vlan;
-		}
-
 		/*
 		 * Make sure descriptor is fully written before
 		 * setting ownership bit (because HW can start
@@ -2155,14 +3246,14 @@
 		 */
 		wmb();
 
-		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+		if (wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+			*bad_wr = wr;
 			err = -EINVAL;
 			goto out;
 		}
 
 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
-			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) |
-			(blh ? cpu_to_be32(1 << 6) : 0);
+			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
 
 		stamp = ind + qp->sq_spare_wqes;
 		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
@@ -2185,6 +3276,9 @@
 out:
 	if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
 		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+		/*
+		 * The bits of vlan_tag that doorbell_qpn occupies were set
+		 * to 0 above as part of vlan tag initialization, so |= is
+		 * correct here.
+		 */
 		*(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
 		/*
 		 * Make sure that descriptor is written to memory
@@ -2239,8 +3333,10 @@
 	int err = 0;
 	int nreq;
 	int ind;
+	int max_gs;
 	int i;
 
+	max_gs = qp->rq.max_gs;
 	spin_lock_irqsave(&qp->rq.lock, flags);
 
 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
@@ -2247,7 +3343,6 @@
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
-			mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
 			err = -ENOMEM;
 			*bad_wr = wr;
 			goto out;
@@ -2254,8 +3349,6 @@
 		}
 
 		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
-			mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
-				    ibqp->qp_num, wr->num_sge);
 			err = -EINVAL;
 			*bad_wr = wr;
 			goto out;
@@ -2263,10 +3356,25 @@
 
 		scat = get_recv_wqe(qp, ind);
 
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+			ib_dma_sync_single_for_device(ibqp->device,
+						      qp->sqp_proxy_rcv[ind].map,
+						      sizeof (struct mlx4_ib_proxy_sqp_hdr),
+						      DMA_FROM_DEVICE);
+			scat->byte_count =
+				cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
+			/* use dma lkey from upper layer entry */
+			scat->lkey = cpu_to_be32(wr->sg_list->lkey);
+			scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
+			scat++;
+			max_gs--;
+		}
+
 		for (i = 0; i < wr->num_sge; ++i)
 			__set_data_seg(scat + i, wr->sg_list + i);
 
-		if (i < qp->rq.max_gs) {
+		if (i < max_gs) {
 			scat[i].byte_count = 0;
 			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
 			scat[i].addr       = 0;
@@ -2334,10 +3442,10 @@
 	return ib_flags;
 }
 
-static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr,
-			  struct mlx4_qp_path *path)
+static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+				struct mlx4_qp_path *path)
 {
-	struct mlx4_dev *dev = ib_dev->dev;
+	struct mlx4_dev *dev = ibdev->dev;
 	int is_eth;
 
 	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
@@ -2346,7 +3454,7 @@
 	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
 		return;
 
-	is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) ==
+	is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) ==
 		IB_LINK_LAYER_ETHERNET;
 	if (is_eth)
 		ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
@@ -2355,7 +3463,6 @@
 		ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
 
 	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
-
 	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
 	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
 	ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
@@ -2407,8 +3514,7 @@
 	qp_attr->qp_access_flags     =
 		to_ib_qp_access_flags(be32_to_cpu(context.params2));
 
-	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
-	    qp->ibqp.qp_type == IB_QPT_XRC) {
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
 		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
 		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
 		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
@@ -2463,308 +3569,21 @@
 	if (qp->flags & MLX4_IB_QP_LSO)
 		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
 
-out:
-	mutex_unlock(&qp->mutex);
-	return err;
-}
+	if (qp->flags & MLX4_IB_QP_NETIF)
+		qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
 
-int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
-			      u32 *qp_num)
-{
-	struct mlx4_ib_dev *dev = to_mdev(init_attr->xrc_domain->device);
-	struct mlx4_ib_xrcd *xrcd = to_mxrcd(init_attr->xrc_domain);
-	struct mlx4_ib_qp *qp;
-	struct ib_qp *ibqp;
-	struct mlx4_ib_xrc_reg_entry *ctx_entry;
-	unsigned long flags;
-	int err;
+	qp_init_attr->sq_sig_type =
+		qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
+		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
 
-	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return -ENOSYS;
+	qp_init_attr->qpg_type = ibqp->qpg_type;
+	if (ibqp->qpg_type == IB_QPG_PARENT)
+		qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz;
+	else
+		qp_init_attr->cap.qpg_tss_mask_sz = 0;
 
-	if (init_attr->qp_type != IB_QPT_XRC)
-		return -EINVAL;
-
-	ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
-	if (!ctx_entry)
-		return -ENOMEM;
-
-	qp = kzalloc(sizeof *qp, GFP_KERNEL);
-	if (!qp) {
-		kfree(ctx_entry);
-		return -ENOMEM;
-	}
-	mutex_lock(&dev->xrc_reg_mutex);
-	qp->flags = MLX4_IB_XRC_RCV;
-	qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
-	INIT_LIST_HEAD(&qp->xrc_reg_list);
-	err = create_qp_common(dev, xrcd->pd, init_attr, NULL, 0, qp);
-	if (err) {
-		mutex_unlock(&dev->xrc_reg_mutex);
-		kfree(ctx_entry);
-		kfree(qp);
-		return err;
-	}
-
-	ibqp = &qp->ibqp;
-	/* set the ibpq attributes which will be used by the mlx4 module */
-	ibqp->qp_num = qp->mqp.qpn;
-	ibqp->device = init_attr->xrc_domain->device;
-	ibqp->pd = xrcd->pd;
-	ibqp->send_cq = ibqp->recv_cq = xrcd->cq;
-	ibqp->event_handler = init_attr->event_handler;
-	ibqp->qp_context = init_attr->qp_context;
-	ibqp->qp_type = init_attr->qp_type;
-	ibqp->xrcd = init_attr->xrc_domain;
-
-	mutex_lock(&qp->mutex);
-	ctx_entry->context = init_attr->qp_context;
-	spin_lock_irqsave(&qp->xrc_reg_list_lock, flags);
-	list_add_tail(&ctx_entry->list, &qp->xrc_reg_list);
-	spin_unlock_irqrestore(&qp->xrc_reg_list_lock, flags);
+out:
 	mutex_unlock(&qp->mutex);
-	mutex_unlock(&dev->xrc_reg_mutex);
-	*qp_num = qp->mqp.qpn;
-	return 0;
-}
-
-int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
-			      struct ib_qp_attr *attr, int attr_mask)
-{
-	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
-	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
-	struct mlx4_qp *mqp;
-	struct mlx4_ib_qp *mibqp;
-	int err = -EINVAL;
-
-	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return -ENOSYS;
-
-	mutex_lock(&dev->xrc_reg_mutex);
-	mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
-	if (unlikely(!mqp)) {
-		printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
-		       "unknown QPN %06x\n", qp_num);
-		goto err_out;
-	}
-
-	mibqp = to_mibqp(mqp);
-
-	if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !mibqp->ibqp.xrcd ||
-	    xrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
-		goto err_out;
-
-	err = mlx4_ib_modify_qp(&mibqp->ibqp, attr, attr_mask, NULL);
-	mutex_unlock(&dev->xrc_reg_mutex);
 	return err;
-
-err_out:
-	mutex_unlock(&dev->xrc_reg_mutex);
-	return err;
 }
 
-int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
-			     struct ib_qp_attr *qp_attr, int qp_attr_mask,
-			     struct ib_qp_init_attr *qp_init_attr)
-{
-	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
-	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
-	struct mlx4_ib_qp *qp;
-	struct mlx4_qp *mqp;
-	struct mlx4_qp_context context;
-	int mlx4_state;
-	int err = -EINVAL;
-
-	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return -ENOSYS;
-
-	mutex_lock(&dev->xrc_reg_mutex);
-	mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
-	if (unlikely(!mqp)) {
-		printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
-		       "unknown QPN %06x\n", qp_num);
-		goto err_out;
-	}
-
-	qp = to_mibqp(mqp);
-	if (!(qp->flags & MLX4_IB_XRC_RCV) || !(qp->ibqp.xrcd) ||
-	    xrcd->xrcdn != to_mxrcd(qp->ibqp.xrcd)->xrcdn)
-		goto err_out;
-
-	if (qp->state == IB_QPS_RESET) {
-		qp_attr->qp_state = IB_QPS_RESET;
-		goto done;
-	}
-
-	err = mlx4_qp_query(dev->dev, mqp, &context);
-	if (err)
-		goto err_out;
-
-	mlx4_state = be32_to_cpu(context.flags) >> 28;
-
-	qp_attr->qp_state = to_ib_qp_state(mlx4_state);
-	qp_attr->path_mtu = context.mtu_msgmax >> 5;
-	qp_attr->path_mig_state =
-		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
-	qp_attr->qkey = be32_to_cpu(context.qkey);
-	qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
-	qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
-	qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
-	qp_attr->qp_access_flags =
-		to_ib_qp_access_flags(be32_to_cpu(context.params2));
-
-	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
-	    qp->ibqp.qp_type == IB_QPT_XRC) {
-		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
-		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr,
-			      &context.alt_path);
-		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
-		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
-	}
-
-	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
-	if (qp_attr->qp_state == IB_QPS_INIT)
-		qp_attr->port_num = qp->port;
-	else
-		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
-
-	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
-	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
-
-	qp_attr->max_rd_atomic =
-		1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
-
-	qp_attr->max_dest_rd_atomic =
-		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
-	qp_attr->min_rnr_timer =
-		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
-	qp_attr->timeout = context.pri_path.ackto >> 3;
-	qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
-	qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
-	qp_attr->alt_timeout = context.alt_path.ackto >> 3;
-
-done:
-	qp_attr->cur_qp_state	     = qp_attr->qp_state;
-	qp_attr->cap.max_recv_wr     = 0;
-	qp_attr->cap.max_recv_sge    = 0;
-	qp_attr->cap.max_send_wr     = 0;
-	qp_attr->cap.max_send_sge    = 0;
-	qp_attr->cap.max_inline_data = 0;
-	qp_init_attr->cap	     = qp_attr->cap;
-
-	mutex_unlock(&dev->xrc_reg_mutex);
-	return 0;
-
-err_out:
-	mutex_unlock(&dev->xrc_reg_mutex);
-	return err;
-}
-
-int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
-{
-
-	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
-
-	struct mlx4_qp *mqp;
-	struct mlx4_ib_qp *mibqp;
-	struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
-	unsigned long flags;
-	int err = -EINVAL;
-
-	mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
-	if (unlikely(!mqp)) {
-		printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
-		       "unknown QPN %06x\n", qp_num);
-		goto err_out;
-	}
-
-	mibqp = to_mibqp(mqp);
-
-	if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !(mibqp->ibqp.xrcd) ||
-	    mxrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
-		goto err_out;
-
-	ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
-	if (!ctx_entry) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-
-	mutex_lock(&mibqp->mutex);
-	list_for_each_entry(tmp, &mibqp->xrc_reg_list, list)
-		if (tmp->context == context) {
-			mutex_unlock(&mibqp->mutex);
-			kfree(ctx_entry);
-			mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-			return 0;
-		}
-
-	ctx_entry->context = context;
-	spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);
-	list_add_tail(&ctx_entry->list, &mibqp->xrc_reg_list);
-	spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
-	mutex_unlock(&mibqp->mutex);
-	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	return 0;
-
-err_out:
-	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	return err;
-}
-
-int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
-{
-
-	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
-
-	struct mlx4_qp *mqp;
-	struct mlx4_ib_qp *mibqp;
-	struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
-	unsigned long flags;
-	int found = 0;
-	int err = -EINVAL;
-
-	mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
-	if (unlikely(!mqp)) {
-		printk(KERN_WARNING "mlx4_ib_unreg_xrc_rcv_qp: "
-		       "unknown QPN %06x\n", qp_num);
-		goto err_out;
-	}
-
-	mibqp = to_mibqp(mqp);
-
-	if (!(mibqp->flags & MLX4_IB_XRC_RCV) ||
-	    mxrcd->xrcdn != (mibqp->xrcdn & 0xffff))
-		goto err_out;
-
-	mutex_lock(&mibqp->mutex);
-	spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);
-	list_for_each_entry_safe(ctx_entry, tmp, &mibqp->xrc_reg_list, list)
-		if (ctx_entry->context == context) {
-			found = 1;
-			list_del(&ctx_entry->list);
-			spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
-			kfree(ctx_entry);
-			break;
-		}
-
-	if (!found)
-		spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
-	mutex_unlock(&mibqp->mutex);
-	if (!found)
-		goto err_out;
-
-	/* destroy the QP if the registration list is empty */
-	if (list_empty(&mibqp->xrc_reg_list))
-		mlx4_ib_destroy_qp(&mibqp->ibqp);
-
-	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	return 0;
-
-err_out:
-	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
-	return err;
-}
-


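The hunk above removes mlx4's private XRC receive-QP entry points (create/modify/query/reg/unreg); after the sync the XRC domain travels with the SRQ instead, so qp.c no longer needs them. What the deleted reg/unreg pair enforced was a simple lifetime rule: each consumer context registers on a per-QP list, and the QP is destroyed only when the last registration is removed. A minimal sketch of that rule follows; rcv_reg, reg_list and destroy_rcv_qp() are illustrative names, not driver API.

#include <linux/list.h>
#include <linux/slab.h>

struct rcv_reg {
	struct list_head list;
	void *context;			/* consumer that registered interest */
};

static void destroy_rcv_qp(void);	/* stand-in for mlx4_ib_destroy_qp() */

static int unreg_consumer(struct list_head *reg_list, void *context)
{
	struct rcv_reg *e, *tmp;

	list_for_each_entry_safe(e, tmp, reg_list, list) {
		if (e->context != context)
			continue;
		list_del(&e->list);
		kfree(e);
		/* last registered consumer gone: tear the QP down */
		if (list_empty(reg_list))
			destroy_rcv_qp();
		return 0;
	}
	return -EINVAL;			/* context was never registered */
}
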
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/srq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/srq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -33,6 +33,7 @@
 
 #include <linux/mlx4/qp.h>
 #include <linux/mlx4/srq.h>
+#include <linux/slab.h>
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -58,7 +59,7 @@
 			event.event = IB_EVENT_SRQ_ERR;
 			break;
 		default:
-			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+			pr_warn("Unexpected event type %d "
 			       "on SRQ %06x\n", type, srq->srqn);
 			return;
 		}
@@ -67,17 +68,16 @@
 	}
 }
 
-struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
-				      struct ib_cq *xrc_cq,
-				      struct ib_xrcd *xrcd,
-				      struct ib_srq_init_attr *init_attr,
-				      struct ib_udata *udata)
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_srq *srq;
 	struct mlx4_wqe_srq_next_seg *next;
-	u32	cqn;
-	u16	xrcdn;
+	struct mlx4_wqe_data_seg *scatter;
+	u32 cqn;
+	u16 xrcdn;
 	int desc_size;
 	int buf_size;
 	int err;
@@ -85,14 +85,10 @@
 
 	/* Sanity check SRQ size before proceeding */
 	if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
-	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge) {
-		mlx4_ib_dbg("a size param is out of range. "
-			    "max_wr = 0x%x, max_sge = 0x%x",
-			    init_attr->attr.max_wr, init_attr->attr.max_sge);
+	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
 		return ERR_PTR(-EINVAL);
-	}
 
-	srq = kzalloc(sizeof *srq, GFP_KERNEL);
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
 	if (!srq)
 		return ERR_PTR(-ENOMEM);
 
@@ -138,8 +134,6 @@
 		if (err)
 			goto err_mtt;
 	} else {
-		struct mlx4_wqe_data_seg *scatter;
-
 		err = mlx4_db_alloc(dev->dev, &srq->db, 0);
 		if (err)
 			goto err_srq;
@@ -182,10 +176,11 @@
 		}
 	}
 
-	cqn = xrc_cq ? (u32) (to_mcq(xrc_cq)->mcq.cqn) : 0;
-	xrcdn = xrcd ? (u16) (to_mxrcd(xrcd)->xrcdn) :
+	cqn = (init_attr->srq_type == IB_SRQT_XRC) ?
+		to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0;
+	xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
+		to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
 		(u16) dev->dev->caps.reserved_xrcds;
-
 	err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
 			     srq->db.dma, &srq->msrq);
 	if (err)
@@ -192,14 +187,13 @@
 		goto err_wrid;
 
 	srq->msrq.event = mlx4_ib_srq_event;
+	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
 
-	if (pd->uobject) {
+	if (pd->uobject)
 		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
 			err = -EFAULT;
 			goto err_wrid;
 		}
-	} else
-		srq->ibsrq.xrc_srq_num = srq->msrq.srqn;
 
 	init_attr->attr.max_wr = srq->msrq.max - 1;
 
@@ -238,16 +232,12 @@
 	int ret;
 
 	/* We don't support resizing SRQs (yet?) */
-	if (attr_mask & IB_SRQ_MAX_WR) {
-		mlx4_ib_dbg("resize not yet supported");
+	if (attr_mask & IB_SRQ_MAX_WR)
 		return -EINVAL;
-	}
 
 	if (attr_mask & IB_SRQ_LIMIT) {
-		if (attr->srq_limit >= srq->msrq.max){
-			mlx4_ib_dbg("limit (0x%x) too high", attr->srq_limit);
+		if (attr->srq_limit >= srq->msrq.max)
 			return -EINVAL;
-		}
 
 		mutex_lock(&srq->mutex);
 		ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
@@ -260,13 +250,6 @@
 	return 0;
 }
 
-struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *init_attr,
-				  struct ib_udata *udata)
-{
-	return mlx4_ib_create_xrc_srq(pd, NULL, NULL, init_attr, udata);
-}
-
 int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
@@ -289,19 +272,7 @@
 {
 	struct mlx4_ib_dev *dev = to_mdev(srq->device);
 	struct mlx4_ib_srq *msrq = to_msrq(srq);
-	struct mlx4_ib_cq *cq;
 
-	mlx4_srq_invalidate(dev->dev, &msrq->msrq);
-
-	if (srq->xrc_cq && !srq->uobject) {
-		cq = to_mcq(srq->xrc_cq);
-		spin_lock_irq(&cq->lock);
-		__mlx4_ib_cq_clean(cq, -1, msrq);
-		mlx4_srq_remove(dev->dev, &msrq->msrq);
-		spin_unlock_irq(&cq->lock);
-	} else
-		mlx4_srq_remove(dev->dev, &msrq->msrq);
-
 	mlx4_srq_free(dev->dev, &msrq->msrq);
 	mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
 
@@ -349,8 +320,6 @@
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
-			mlx4_ib_dbg("srq num 0x%x: num s/g entries too large (%d)",
-				    srq->msrq.srqn, wr->num_sge);
 			err = -EINVAL;
 			*bad_wr = wr;
 			break;
@@ -357,8 +326,6 @@
 		}
 
 		if (unlikely(srq->head == srq->tail)) {
-			mlx4_ib_dbg("srq num 0x%x: No entries available to post.",
-				    srq->msrq.srqn);
 			err = -ENOMEM;
 			*bad_wr = wr;
 			break;


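With this change mlx4_ib_create_srq no longer takes separate xrc_cq/xrcd arguments: when init_attr->srq_type is IB_SRQT_XRC it pulls both from init_attr->ext.xrc, and the thin wrapper at the bottom of the old file goes away. A hedged caller-side sketch of the new interface; my_pd, my_xrcd, my_cq and my_srq_event are placeholders for objects the consumer already owns:

struct ib_srq_init_attr init_attr = {
	.event_handler	= my_srq_event,
	.srq_type	= IB_SRQT_XRC,
	.attr		= {
		.max_wr		= 256,	/* must stay below max_srq_wqes */
		.max_sge	= 1,
	},
};
struct ib_srq *srq;

init_attr.ext.xrc.xrcd = my_xrcd;	/* e.g. from ib_alloc_xrcd() */
init_attr.ext.xrc.cq = my_cq;
srq = ib_create_srq(my_pd, &init_attr);
if (IS_ERR(srq))
	return PTR_ERR(srq);
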
Added: trunk/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*#include "core_priv.h"*/
+#include "mlx4_ib.h"
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+
+#include <rdma/ib_mad.h>
+/* show_admin_alias_guid returns the administratively assigned value of that GUID.
+ * Values returned in the buf parameter string:
+ *	0			- requests opensm to assign a value.
+ *	ffffffffffffffff	- delete this entry.
+ *	other			- value assigned by the administrator.
+ */
+static ssize_t show_admin_alias_guid(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	int record_num;/*0-15*/
+	int guid_index_in_rec; /*0 - 7*/
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+
+	record_num = mlx4_ib_iov_dentry->entry_num / 8;
+	guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+
+	return sprintf(buf, "%llx\n", (long long)
+		       be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
+				   ports_guid[port->num - 1].
+				   all_rec_per_port[record_num].
+				   all_recs[8 * guid_index_in_rec]));
+}
+
+/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
+ * Values in buf parameter string:
+ *	0			- requests opensm to assign a value.
+ *	0xffffffffffffffff	- delete this entry.
+ *	other			- guid value assigned by the administrator.
+ */
+static ssize_t store_admin_alias_guid(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int record_num;/*0-15*/
+	int guid_index_in_rec; /*0 - 7*/
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	u64 sysadmin_ag_val;
+
+	record_num = mlx4_ib_iov_dentry->entry_num / 8;
+	guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+	if (0 == record_num && 0 == guid_index_in_rec) {
+		pr_err("GUID 0 block 0 is RO\n");
+		return count;
+	}
+	sscanf(buf, "%llx", &sysadmin_ag_val);
+	*(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
+		all_rec_per_port[record_num].
+		all_recs[GUID_REC_SIZE * guid_index_in_rec] =
+			cpu_to_be64(sysadmin_ag_val);
+
+	/* Change the state to be pending for update */
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
+		= MLX4_GUID_INFO_STATUS_IDLE;
+
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+		= MLX4_GUID_INFO_RECORD_SET;
+
+	switch (sysadmin_ag_val) {
+	case MLX4_GUID_FOR_DELETE_VAL:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+			= MLX4_GUID_INFO_RECORD_DELETE;
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_SYSADMIN_ASSIGN;
+		break;
+	/* The sysadmin requests the SM to re-assign */
+	case MLX4_NOT_SET_GUID:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_DRIVER_ASSIGN;
+		break;
+	/* The sysadmin requests a specific value.*/
+	default:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_SYSADMIN_ASSIGN;
+		break;
+	}
+
+	/* set the record index */
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
+		= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+	mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+
+	return count;
+}
+
+static ssize_t show_port_gid(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	union ib_gid gid;
+	ssize_t ret;
+
+	ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num,
+				  mlx4_ib_iov_dentry->entry_num, &gid, 1);
+	if (ret)
+		return ret;
+	ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		      be16_to_cpu(((__be16 *) gid.raw)[0]),
+		      be16_to_cpu(((__be16 *) gid.raw)[1]),
+		      be16_to_cpu(((__be16 *) gid.raw)[2]),
+		      be16_to_cpu(((__be16 *) gid.raw)[3]),
+		      be16_to_cpu(((__be16 *) gid.raw)[4]),
+		      be16_to_cpu(((__be16 *) gid.raw)[5]),
+		      be16_to_cpu(((__be16 *) gid.raw)[6]),
+		      be16_to_cpu(((__be16 *) gid.raw)[7]));
+	return ret;
+}
+
+static ssize_t show_phys_port_pkey(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	u16 pkey;
+	ssize_t ret;
+
+	ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num,
+				   mlx4_ib_iov_dentry->entry_num, &pkey, 1);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define DENTRY_REMOVE(_dentry)						\
+do {									\
+	sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr);	\
+} while (0)
+
+static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry,
+			      char *_name, struct kobject *_kobj,
+			      ssize_t (*show)(struct device *dev,
+					      struct device_attribute *attr,
+					      char *buf),
+			      ssize_t (*store)(struct device *dev,
+					       struct device_attribute *attr,
+					       const char *buf, size_t count)
+			      )
+{
+	int ret = 0;
+	struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry;
+
+	vdentry->ctx = _ctx;
+	vdentry->dentry.show = show;
+	vdentry->dentry.store = store;
+	sysfs_attr_init(&vdentry->dentry.attr);
+	vdentry->dentry.attr.name = vdentry->name;
+	vdentry->dentry.attr.mode = 0;
+	vdentry->kobj = _kobj;
+	snprintf(vdentry->name, 15, "%s", _name);
+
+	if (vdentry->dentry.store)
+		vdentry->dentry.attr.mode |= S_IWUSR;
+
+	if (vdentry->dentry.show)
+		vdentry->dentry.attr.mode |= S_IRUGO;
+
+	ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr);
+	if (ret) {
+		pr_err("failed to create %s\n", vdentry->dentry.attr.name);
+		vdentry->ctx = NULL;
+		return ret;
+	}
+
+	return ret;
+}
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+		struct attribute *attr)
+{
+	struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+	int ret;
+
+	ret = sysfs_create_file(port->mcgs_parent, attr);
+	if (ret)
+		pr_err("failed to create %s\n", attr->name);
+
+	return ret;
+}
+
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+		struct attribute *attr)
+{
+	struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+
+	sysfs_remove_file(port->mcgs_parent, attr);
+}
+
+static int add_port_entries(struct mlx4_ib_dev *device, int port_num)
+{
+	int i;
+	char buff[10];
+	struct mlx4_ib_iov_port *port = NULL;
+	int ret = 0;
+	struct ib_port_attr attr;
+
+	/* get the physical gid and pkey table sizes.*/
+	ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1);
+	if (ret)
+		goto err;
+
+	port = &device->iov_ports[port_num - 1];
+	port->dev = device;
+	port->num = port_num;
+	/* Directory structure:
+	 * iov -
+	 *   port num -
+	 *	admin_guids
+	 *	gids (operational)
+	 *	mcg_table
+	 */
+	port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar),
+				 GFP_KERNEL);
+	if (!port->dentr_ar) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	sprintf(buff, "%d", port_num);
+	port->cur_port = kobject_create_and_add(buff,
+				 kobject_get(device->ports_parent));
+	if (!port->cur_port) {
+		ret = -ENOMEM;
+		goto kobj_create_err;
+	}
+	/* admin GUIDs */
+	port->admin_alias_parent = kobject_create_and_add("admin_guids",
+						  kobject_get(port->cur_port));
+	if (!port->admin_alias_parent) {
+		ret = -ENOMEM;
+		goto err_admin_guids;
+	}
+	for (i = 0; i < attr.gid_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[i].entry_num = i;
+		ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i],
+					  buff, port->admin_alias_parent,
+					  show_admin_alias_guid, store_admin_alias_guid);
+		if (ret)
+			goto err_admin_alias_parent;
+	}
+
+	/* gids subdirectory (operational gids) */
+	port->gids_parent = kobject_create_and_add("gids",
+						  kobject_get(port->cur_port));
+	if (!port->gids_parent) {
+		ret = -ENOMEM;
+		goto err_gids;
+	}
+
+	for (i = 0; i < attr.gid_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i;
+		ret = create_sysfs_entry(port,
+					 &port->dentr_ar->dentries[attr.gid_tbl_len + i],
+					 buff,
+					 port->gids_parent, show_port_gid, NULL);
+		if (ret)
+			goto err_gids_parent;
+	}
+
+	/* physical port pkey table */
+	port->pkeys_parent =
+		kobject_create_and_add("pkeys", kobject_get(port->cur_port));
+	if (!port->pkeys_parent) {
+		ret = -ENOMEM;
+		goto err_pkeys;
+	}
+
+	for (i = 0; i < attr.pkey_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i;
+		ret = create_sysfs_entry(port,
+					 &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i],
+					 buff, port->pkeys_parent,
+					 show_phys_port_pkey, NULL);
+		if (ret)
+			goto err_pkeys_parent;
+	}
+
+	/* MCGs table */
+	port->mcgs_parent =
+		kobject_create_and_add("mcgs", kobject_get(port->cur_port));
+	if (!port->mcgs_parent) {
+		ret = -ENOMEM;
+		goto err_mcgs;
+	}
+	return 0;
+
+err_mcgs:
+	kobject_put(port->cur_port);
+
+err_pkeys_parent:
+	kobject_put(port->pkeys_parent);
+
+err_pkeys:
+	kobject_put(port->cur_port);
+
+err_gids_parent:
+	kobject_put(port->gids_parent);
+
+err_gids:
+	kobject_put(port->cur_port);
+
+err_admin_alias_parent:
+	kobject_put(port->admin_alias_parent);
+
+err_admin_guids:
+	kobject_put(port->cur_port);
+	kobject_put(port->cur_port); /* once more for create_and_add buff */
+
+kobj_create_err:
+	kobject_put(device->ports_parent);
+	kfree(port->dentr_ar);
+
+err:
+	pr_err("add_port_entries FAILED: for port:%d, error: %d\n",
+	       port_num, ret);
+	return ret;
+}
+
+static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
+{
+	char base_name[9];
+
+	/* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
+	strlcpy(name, pci_name(dev->dev->pdev), max);
+	strncpy(base_name, name, 8); /*till xxxx:yy:*/
+	base_name[8] = '\0';
+	/* Without ARI only the last 3 bits encode the function, so a
+	 * function number of 8 or more spills over into the device number;
+	 * the final component is therefore taken modulo 8. */
+	sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+}
+
+struct mlx4_port {
+	struct kobject         kobj;
+	struct mlx4_ib_dev    *dev;
+	struct attribute_group pkey_group;
+	struct attribute_group gid_group;
+	u8                     port_num;
+	int		       slave;
+};
+
+
+static void mlx4_port_release(struct kobject *kobj)
+{
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+	struct attribute *a;
+	int i;
+
+	for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+		kfree(a);
+	kfree(p->pkey_group.attrs);
+	for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+		kfree(a);
+	kfree(p->gid_group.attrs);
+	kfree(p);
+}
+
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+	if (!port_attr->show)
+		return -EIO;
+	return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t size)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+	if (!port_attr->store)
+		return -EIO;
+	return port_attr->store(p, port_attr, buf, size);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show,
+	.store = port_attr_store,
+};
+
+static struct kobj_type port_type = {
+	.release    = mlx4_port_release,
+	.sysfs_ops  = &port_sysfs_ops,
+};
+
+struct port_table_attribute {
+	struct port_attribute	attr;
+	char			name[8];
+	int			index;
+};
+
+static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	ssize_t ret = -ENODEV;
+
+	if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >=
+	    (p->dev->dev->caps.pkey_table_len[p->port_num]))
+		ret = sprintf(buf, "none\n");
+	else
+		ret = sprintf(buf, "%d\n",
+			      p->dev->pkeys.virt2phys_pkey[p->slave]
+			      [p->port_num - 1][tab_attr->index]);
+	return ret;
+}
+
+static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	int idx;
+	int err;
+
+	/* do not allow remapping Dom0 virtual pkey table */
+	if (p->slave == mlx4_master_func_num(p->dev->dev))
+		return -EINVAL;
+
+	if (!strncasecmp(buf, "no", 2))
+		idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1;
+	else if (sscanf(buf, "%i", &idx) != 1 ||
+		 idx >= p->dev->dev->caps.pkey_table_len[p->port_num] ||
+		 idx < 0)
+		return -EINVAL;
+
+	p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1]
+				    [tab_attr->index] = idx;
+	mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num,
+			     tab_attr->index, idx);
+	err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num);
+	if (err) {
+		pr_err("mlx4_gen_pkey_eqe failed for slave %d,"
+		       " port %d, index %d\n", p->slave, p->port_num, idx);
+		return err;
+	}
+	return count;
+}
+
+static ssize_t show_port_gid_idx(struct mlx4_port *p,
+				 struct port_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", p->slave);
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct mlx4_port *,
+				  struct port_attribute *, char *buf),
+		  ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+				   const char *buf, size_t count),
+		  int len)
+{
+	struct attribute **tab_attr;
+	struct port_table_attribute *element;
+	int i;
+
+	tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL);
+	if (!tab_attr)
+		return NULL;
+
+	for (i = 0; i < len; i++) {
+		element = kzalloc(sizeof (struct port_table_attribute),
+				  GFP_KERNEL);
+		if (!element)
+			goto err;
+		if (snprintf(element->name, sizeof (element->name),
+			     "%d", i) >= sizeof (element->name)) {
+			kfree(element);
+			goto err;
+		}
+		sysfs_attr_init(&element->attr.attr);
+		element->attr.attr.name  = element->name;
+		if (store) {
+			element->attr.attr.mode  = S_IWUSR | S_IRUGO;
+			element->attr.store	 = store;
+		} else
+			element->attr.attr.mode  = S_IRUGO;
+
+		element->attr.show       = show;
+		element->index		 = i;
+		tab_attr[i] = &element->attr.attr;
+	}
+	return tab_attr;
+
+err:
+	while (--i >= 0)
+		kfree(tab_attr[i]);
+	kfree(tab_attr);
+	return NULL;
+}
+
+static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
+{
+	struct mlx4_port *p;
+	int i;
+	int ret;
+	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
+			IB_LINK_LAYER_ETHERNET;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->dev = dev;
+	p->port_num = port_num;
+	p->slave = slave;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   kobject_get(dev->dev_ports_parent[slave]),
+				   "%d", port_num);
+	if (ret)
+		goto err_alloc;
+
+	p->pkey_group.name  = "pkey_idx";
+	if (is_eth)
+		p->pkey_group.attrs =
+			alloc_group_attrs(show_port_pkey, NULL,
+					  dev->dev->caps.pkey_table_len[port_num]);
+	else
+		p->pkey_group.attrs =
+			alloc_group_attrs(show_port_pkey, store_port_pkey,
+					  dev->dev->caps.pkey_table_len[port_num]);
+	if (!p->pkey_group.attrs) {
+		ret = -ENOMEM;
+		goto err_alloc;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	p->gid_group.name  = "gid_idx";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
+	if (!p->gid_group.attrs) {
+		ret = -ENOMEM;
+		goto err_free_pkey;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
+	return 0;
+
+err_free_gid:
+	kfree(p->gid_group.attrs[0]);
+	kfree(p->gid_group.attrs);
+
+err_free_pkey:
+	for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i)
+		kfree(p->pkey_group.attrs[i]);
+	kfree(p->pkey_group.attrs);
+
+err_alloc:
+	kobject_put(dev->dev_ports_parent[slave]);
+	kfree(p);
+	return ret;
+}
+
+static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)
+{
+	char name[32];
+	int err;
+	int port;
+	struct kobject *p, *t;
+	struct mlx4_port *mport;
+
+	get_name(dev, name, slave, sizeof name);
+
+	dev->pkeys.device_parent[slave] =
+		kobject_create_and_add(name, kobject_get(dev->iov_parent));
+
+	if (!dev->pkeys.device_parent[slave]) {
+		err = -ENOMEM;
+		goto fail_dev;
+	}
+
+	INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]);
+
+	dev->dev_ports_parent[slave] =
+		kobject_create_and_add("ports",
+				       kobject_get(dev->pkeys.device_parent[slave]));
+
+	if (!dev->dev_ports_parent[slave]) {
+		err = -ENOMEM;
+		goto err_ports;
+	}
+
+	for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+		err = add_port(dev, port, slave);
+		if (err)
+			goto err_add;
+	}
+	return 0;
+
+err_add:
+	list_for_each_entry_safe(p, t,
+				 &dev->pkeys.pkey_port_list[slave],
+				 entry) {
+		list_del(&p->entry);
+		mport = container_of(p, struct mlx4_port, kobj);
+		sysfs_remove_group(p, &mport->pkey_group);
+		sysfs_remove_group(p, &mport->gid_group);
+		kobject_put(p);
+	}
+	kobject_put(dev->dev_ports_parent[slave]);
+
+err_ports:
+	kobject_put(dev->pkeys.device_parent[slave]);
+	/* extra put for the device_parent create_and_add */
+	kobject_put(dev->pkeys.device_parent[slave]);
+
+fail_dev:
+	kobject_put(dev->iov_parent);
+	return err;
+}
+
+static int register_pkey_tree(struct mlx4_ib_dev *device)
+{
+	int i;
+
+	if (!mlx4_is_master(device->dev))
+		return 0;
+
+	for (i = 0; i <= device->dev->num_vfs; ++i)
+		register_one_pkey_tree(device, i);
+
+	return 0;
+}
+
+static void unregister_pkey_tree(struct mlx4_ib_dev *device)
+{
+	int slave;
+	struct kobject *p, *t;
+	struct mlx4_port *port;
+
+	if (!mlx4_is_master(device->dev))
+		return;
+
+	for (slave = device->dev->num_vfs; slave >= 0; --slave) {
+		list_for_each_entry_safe(p, t,
+					 &device->pkeys.pkey_port_list[slave],
+					 entry) {
+			list_del(&p->entry);
+			port = container_of(p, struct mlx4_port, kobj);
+			sysfs_remove_group(p, &port->pkey_group);
+			sysfs_remove_group(p, &port->gid_group);
+			kobject_put(p);
+			kobject_put(device->dev_ports_parent[slave]);
+		}
+		kobject_put(device->dev_ports_parent[slave]);
+		kobject_put(device->pkeys.device_parent[slave]);
+		kobject_put(device->pkeys.device_parent[slave]);
+		kobject_put(device->iov_parent);
+	}
+}
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev)
+{
+	int i;
+	int ret = 0;
+
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+
+	dev->iov_parent =
+		kobject_create_and_add("iov",
+				       kobject_get(dev->ib_dev.ports_parent->parent));
+	if (!dev->iov_parent) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	dev->ports_parent =
+		kobject_create_and_add("ports",
+				       kobject_get(dev->iov_parent));
+	if (!dev->ports_parent) {
+		ret = -ENOMEM;
+		goto err_ports;
+	}
+
+	for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) {
+		ret = add_port_entries(dev, i);
+		if (ret)
+			goto err_add_entries;
+	}
+
+	ret = register_pkey_tree(dev);
+	if (ret)
+		goto err_add_entries;
+	return 0;
+
+err_add_entries:
+	kobject_put(dev->ports_parent);
+
+err_ports:
+	kobject_put(dev->iov_parent);
+err:
+	kobject_put(dev->ib_dev.ports_parent->parent);
+	pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
+	return ret;
+}
+
+static void unregister_alias_guid_tree(struct mlx4_ib_dev *device)
+{
+	struct mlx4_ib_iov_port *p;
+	int i;
+
+	if (!mlx4_is_master(device->dev))
+		return;
+
+	for (i = 0; i < device->dev->caps.num_ports; i++) {
+		p = &device->iov_ports[i];
+		kobject_put(p->admin_alias_parent);
+		kobject_put(p->gids_parent);
+		kobject_put(p->pkeys_parent);
+		kobject_put(p->mcgs_parent);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->dev->ports_parent);
+		kfree(p->dentr_ar);
+	}
+}
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device)
+{
+	unregister_alias_guid_tree(device);
+	unregister_pkey_tree(device);
+	kobject_put(device->ports_parent);
+	kobject_put(device->iov_parent);
+	kobject_put(device->iov_parent);
+	kobject_put(device->ib_dev.ports_parent->parent);
+}


Property changes on: trunk/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
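
Most of the new sysfs.c is built on one idiom: a struct attribute embedded in a larger per-entry structure, with the show/store handlers recovering the outer structure through container_of (port_attr_show above is the canonical instance). A free-standing sketch of the same idiom, with illustrative type names:

#include <linux/kobject.h>
#include <linux/sysfs.h>

struct table_attr {
	struct attribute attr;	/* what sysfs hands back to the handler */
	int index;		/* per-entry payload */
};

static ssize_t table_attr_show(struct kobject *kobj, struct attribute *attr,
			       char *buf)
{
	/* recover the containing entry from the embedded attribute */
	struct table_attr *ta = container_of(attr, struct table_attr, attr);

	return sprintf(buf, "%d\n", ta->index);
}
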
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/user.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/user.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/user.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -40,8 +40,10 @@
  * Increment this value if any changes that break userspace ABI
  * compatibility are made.
  */
-#define MLX4_IB_UVERBS_ABI_VERSION	3
 
+#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION	3
+#define MLX4_IB_UVERBS_ABI_VERSION		4
+
 /*
  * Make sure that all structs defined in this file remain laid out so
  * that they pack the same way on 32-bit and 64-bit architectures (to
@@ -50,10 +52,18 @@
  * instead.
  */
 
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};
+
 struct mlx4_ib_alloc_ucontext_resp {
+	__u32	dev_caps;
 	__u32	qp_tab_size;
 	__u16	bf_reg_size;
 	__u16	bf_regs_per_page;
+	__u32	cqe_size;
 };
 
 struct mlx4_ib_alloc_pd_resp {


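Bumping MLX4_IB_UVERBS_ABI_VERSION to 4 while keeping the old layout around as mlx4_ib_alloc_ucontext_resp_v3 lets the kernel keep serving userspace built against ABI 3. A hedged sketch of how the provider would key the udata response off the negotiated version; the resp/resp_v3 locals and the surrounding alloc_ucontext body are illustrative:

struct mlx4_ib_alloc_ucontext_resp resp;	/* ABI 4: adds dev_caps, cqe_size */
struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;	/* legacy ABI 3 layout */
int err;

if (ibdev->ib_dev.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
	err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
else
	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
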
Modified: trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mlx4/wc.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -71,4 +71,3 @@
 }
 
 #endif
-


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -32,7 +32,6 @@
 
 #include <linux/errno.h>
 #include <linux/slab.h>
-#include <linux/bitmap.h>
 
 #include "mthca_dev.h"
 


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1808,7 +1808,7 @@
 	case IB_QPT_RAW_IPV6:
 		op_mod = 2;
 		break;
-	case IB_QPT_RAW_ETY:
+	case IB_QPT_RAW_ETHERTYPE:
 		op_mod = 3;
 		break;
 	default:


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -34,7 +34,7 @@
 #ifndef MTHCA_CONFIG_REG_H
 #define MTHCA_CONFIG_REG_H
 
-#include <asm/page.h>
+#include <linux/page.h>
 
 #define MTHCA_HCR_BASE         0x80680
 #define MTHCA_HCR_SIZE         0x0001c


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -33,7 +33,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
@@ -1325,7 +1324,7 @@
 	if (log_mtts_per_seg == 0)
 		log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
 	if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) {
-		printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %ld\n",
+		printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n",
 		       log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8));
 		log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
 	}


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -36,7 +36,7 @@
 #include <linux/scatterlist.h>
 #include <linux/sched.h>
 
-#include <asm/page.h>
+#include <linux/page.h>
 
 #include "mthca_memfree.h"
 #include "mthca_dev.h"
@@ -448,6 +448,7 @@
 		page * MTHCA_ICM_PAGE_SIZE;
 }
 
+
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/pmap.h>


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -40,6 +40,7 @@
 
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/fs.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
@@ -1006,7 +1007,7 @@
 }
 
 static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				       u64 virt, int acc, struct ib_udata *udata)
+				       u64 virt, int acc, struct ib_udata *udata, int mr_id)
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
 	struct ib_umem_chunk *chunk;
@@ -1402,7 +1403,7 @@
 
 	mutex_init(&dev->cap_mask_mutex);
 
-	ret = ib_register_device(&dev->ib_dev);
+	ret = ib_register_device(&dev->ib_dev, NULL);
 	if (ret)
 		return ret;
 


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -30,7 +30,6 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -44,9 +43,13 @@
 	int i;
 	int err = 0;
 	u32 *hca_header    = NULL;
+#ifdef __linux__
 	u32 *bridge_header = NULL;
+#endif
 	struct pci_dev *bridge = NULL;
+#ifdef __linux__
 	int bridge_pcix_cap = 0;
+#endif
 	int hca_pcie_cap = 0;
 	int hca_pcix_cap = 0;
 
@@ -196,6 +199,7 @@
 	}
 
 good:
+#ifdef __linux__
 	/* Now restore the PCI headers */
 	if (bridge) {
 		if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8,
@@ -236,6 +240,7 @@
 			goto out;
 		}
 	}
+#endif
 
 	if (hca_pcix_cap) {
 		if (pci_write_config_dword(mdev->pdev, hca_pcix_cap,
@@ -290,8 +295,8 @@
 #ifdef __linux__
 	if (bridge)
 		pci_dev_put(bridge);
+	kfree(bridge_header);
 #endif
-	kfree(bridge_header);
 	kfree(hca_header);
 
 	return err;
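
The pattern this hunk enforces: state that exists only under a conditional build must be allocated and freed under the same guard, which is why the stray kfree(bridge_header) moves inside #ifdef __linux__. A minimal sketch of the pairing; do_reset_sketch is illustrative, not driver code:

#include <linux/slab.h>

static int do_reset_sketch(void)
{
#ifdef __linux__
	u32 *bridge_header = kzalloc(256, GFP_KERNEL);

	if (!bridge_header)
		return -ENOMEM;
#endif
	/* ... platform-independent reset work goes here ... */
#ifdef __linux__
	kfree(bridge_header);	/* freed under the guard it was allocated under */
#endif
	return 0;
}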


Modified: trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -30,7 +30,7 @@
  * SOFTWARE.
  */
 
-#include <asm/page.h>		/* PAGE_SHIFT */
+#include <linux/page.h>
 
 #include "mthca_dev.h"
 #include "mthca_memfree.h"


Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -67,8 +67,6 @@
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_var.h>
-#include <netinet/ip_fw.h>
-#include <netinet/ipfw/ip_fw_private.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
@@ -111,7 +109,8 @@
 	IPOIB_ENCAP_LEN		  = 4,
 	IPOIB_HEADER_LEN	  = IPOIB_ENCAP_LEN + INFINIBAND_ALEN,
 	IPOIB_UD_MAX_MTU	  = 4 * 1024,
-	IPOIB_UD_RX_SG		  = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE),
+//	IPOIB_UD_RX_SG		  = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE),
+	IPOIB_UD_RX_SG		  = 2,
 	IPOIB_UD_TX_SG		  = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2,
 	IPOIB_CM_MAX_MTU	  = (64 * 1024),
 	IPOIB_CM_TX_SG		  = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2,
@@ -320,6 +319,8 @@
 
 	unsigned long flags;
 
+	int gone;
+
 	struct mutex vlan_mutex;
 
 	struct rb_root  path_tree;
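
The new gone flag gives the FreeBSD glue a cheap answer to a detach race: the detach
path in ipoib_main.c (further down) sets it before if_detach(), and the ioctl handler
refuses with ENXIO once it is set, so a request landing mid-teardown cannot touch a
dying softc. The shape of the pattern, with hypothetical stand-in names (struct softc,
dev_ioctl, dev_detach):

    #include <errno.h>

    struct softc {
            int gone;               /* set once detach has started */
    };

    static int
    dev_ioctl(struct softc *sc)
    {
            if (sc == NULL || sc->gone != 0)
                    return (ENXIO); /* device is going away */
            /* ... normal ioctl handling ... */
            return (0);
    }

    static void
    dev_detach(struct softc *sc)
    {
            sc->gone = 1;           /* later ioctls now fail fast */
            /* ... bpfdetach(), if_detach(), if_free() ... */
    }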


Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -481,6 +481,8 @@
 	int has_srq;
 	u_short proto;
 
+	CURVNET_SET_QUIET(dev->if_vnet);
+
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
 
@@ -496,7 +498,7 @@
 		} else
 			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
 				   wr_id, ipoib_recvq_size);
-		return;
+		goto done;
 	}
 
 	p = wc->qp->qp_context;
@@ -520,7 +522,7 @@
 				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
 				spin_unlock(&priv->lock);
 			}
-			return;
+			goto done;
 		}
 	}
 
@@ -579,6 +581,9 @@
 				   "for buf %d\n", wr_id);
 		}
 	}
+done:
+	CURVNET_RESTORE();
+	return;
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
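
The CURVNET_SET_QUIET()/CURVNET_RESTORE() bracket added above is needed because the
completion handler runs from an InfiniBand work-queue context where no vnet is implied,
and anything that hands packets to the network stack needs curvnet set; the companion
change turns every early return into goto done so the restore cannot be skipped.
Roughly, as a kernel-only sketch (CURVNET_* are the real FreeBSD macros, the handler
body is elided):

    #include <sys/param.h>
    #include <net/if.h>
    #include <net/if_var.h>
    #include <net/vnet.h>

    static void
    handle_completion(struct ifnet *ifp, int bad_wrid)
    {
            CURVNET_SET_QUIET(ifp->if_vnet);

            if (bad_wrid)
                    goto done;      /* early exit still restores */

            /* ... hand received mbufs up the stack ... */
    done:
            CURVNET_RESTORE();
    }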

Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -31,7 +31,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/ethtool.h>
 #include <linux/netdevice.h>
 
 #include "ipoib.h"

Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -40,7 +40,6 @@
 
 #include <linux/module.h>
 
-#include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
@@ -259,6 +258,10 @@
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
+	/* check if detaching */
+	if (priv == NULL || priv->gone != 0)
+		return (ENXIO);
+
 	switch (command) {
 	case SIOCSIFFLAGS:
 		if (ifp->if_flags & IFF_UP) {
@@ -795,6 +798,7 @@
 
 	dev = priv->dev;
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		priv->gone = 1;
 		bpfdetach(dev);
 		if_detach(dev);
 		if_free(dev);
@@ -1073,6 +1077,8 @@
 		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
 			continue;
 
+		ipoib_stop(priv);
+
 		ib_unregister_event_handler(&priv->event_handler);
 
 		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */
@@ -1537,3 +1543,20 @@
 
 module_init(ipoib_init_module);
 module_exit(ipoib_cleanup_module);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+ipoib_evhand(module_t mod, int event, void *arg)
+{
+	return (0);
+}
+
+static moduledata_t ipoib_mod = {
+	.name = "ipoib",
+	.evhand = ipoib_evhand,
+};
+
+DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
+
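
The trailing block turns ipoib into a first-class FreeBSD kernel module: the event
handler accepts every event, DECLARE_MODULE() registers the module, and MODULE_DEPEND()
makes the loader pull in ibcore first. The #undef MODULE_VERSION is there because the
OFED Linux-compat headers define a macro of that name that collides with the one in
<sys/module.h>. A minimal sketch of the same boilerplate for a hypothetical "foo"
module:

    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/module.h>

    static int
    foo_evhand(module_t mod, int event, void *arg)
    {
            switch (event) {
            case MOD_LOAD:          /* allocate/attach here */
            case MOD_UNLOAD:        /* release here */
            default:
                    return (0);     /* accept everything */
            }
    }

    static moduledata_t foo_mod = {
            .name = "foo",
            .evhand = foo_evhand,
    };

    DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
    MODULE_DEPEND(foo, ibcore, 1, 1, 1);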

Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -723,7 +723,7 @@
 void ipoib_mcast_restart(struct ipoib_dev_priv *priv)
 {
 	struct ifnet *dev = priv->dev;
-	struct ifmultiaddr *ifma;;
+	struct ifmultiaddr *ifma;
 	struct ipoib_mcast *mcast, *tmcast;
 	LIST_HEAD(remove_list);
 	struct ib_sa_mcmember_rec rec;

Modified: trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -32,7 +32,6 @@
  */
 
 #include "ipoib.h"
-#include <linux/ethtool.h>
 
 int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey)
 {

Modified: trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -702,6 +702,7 @@
 void sdp_rx_comp_full(struct sdp_sock *ssk);
 
 /* sdp_zcopy.c */
+struct kiocb;
 int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov);
 int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah);
 void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack);
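
The lone struct kiocb; above is a forward declaration: the prototype only passes a
pointer, and C can form a pointer to an incomplete type, so sdp.h no longer has to
include whichever header defines the full struct. In miniature (illustrative names):

    struct kiocb;                           /* incomplete: tag only */

    int takes_a_kiocb(struct kiocb *iocb);  /* pointer param: OK */

    /* struct kiocb k; would NOT compile here: size unknown until
     * the full definition is in scope. */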


Modified: trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -29,7 +29,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
- * $Id: sdp_bcopy.c,v 1.1.1.1 2012-07-21 15:17:36 laffer1 Exp $
+ * $Id$
  */
 #include "sdp.h"
 

Modified: trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -29,7 +29,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
- * $Id: sdp_cma.c,v 1.1.1.1 2012-07-21 15:17:36 laffer1 Exp $
+ * $Id$
  */
 #include "sdp.h"
 

Modified: trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -85,7 +85,7 @@
 #define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
 #define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
 
-MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
+static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
 
 static void sdp_stop_keepalive_timer(struct socket *so);
 
@@ -1267,7 +1267,7 @@
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sb->sb_cc == 0 &&
-	    ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
@@ -1297,7 +1297,7 @@
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
-	    ((sb->sb_flags & SS_NBIO) ||
+	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sb->sb_cc >= sb->sb_lowat ||
 	     sb->sb_cc >= uio->uio_resid ||
@@ -1878,7 +1878,7 @@
 	return (error);
 }
 
-SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");
+static SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");
 
 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
     CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
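
The two sb_flags to so_state hunks above are a real bugfix, not churn: SS_NBIO is a
socket-state bit and lives in so->so_state, while sb->sb_flags holds SB_* bits, so the
old test compared unrelated flag namespaces and a non-blocking SDP socket could block
in the receive path. Reduced to its essence (kernel-context sketch; struct socket,
SS_NBIO, and the MSG_* flags are the real FreeBSD definitions):

    #include <sys/param.h>
    #include <sys/socket.h>
    #include <sys/socketvar.h>

    static int
    wants_nonblock(struct socket *so, int flags)
    {
            /*
             * Wrong: (sb->sb_flags & SS_NBIO) tests an SB_* field
             * against an SS_* bit. Right:
             */
            return ((so->so_state & SS_NBIO) ||
                (flags & (MSG_DONTWAIT | MSG_NBIO)));
    }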

Modified: trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -590,7 +590,7 @@
 	if (unlikely(!ssk->poll_cq)) {
 		struct rdma_cm_id *id = ssk->id;
 		if (id && id->qp)
-			rdma_notify(id, RDMA_CM_EVENT_ESTABLISHED);
+			rdma_notify(id, IB_EVENT_COMM_EST);
 		goto out;
 	}
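
The rdma_notify() fix above corrects an enum mix-up: the function's second argument is
an enum ib_event_type, and RDMA_CM_EVENT_ESTABLISHED comes from the unrelated
rdma_cm_event enum (it only compiled because both are integer enums).
IB_EVENT_COMM_EST is the intended communication-established event. In isolation:

    #include <rdma/rdma_cm.h>

    static void
    notify_established(struct rdma_cm_id *id)
    {
            if (id != NULL && id->qp != NULL)
                    (void) rdma_notify(id, IB_EVENT_COMM_EST);
    }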
 

Modified: trunk/sys/ofed/drivers/infiniband/util/madeye.c
===================================================================
--- trunk/sys/ofed/drivers/infiniband/util/madeye.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/infiniband/util/madeye.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -30,7 +30,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
- * $Id: madeye.c,v 1.1.1.1 2012-07-21 15:17:36 laffer1 Exp $
+ * $Id$
  */
 #include <linux/module.h>
 #include <linux/device.h>

Modified: trunk/sys/ofed/drivers/net/mlx4/Makefile
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/Makefile	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/Makefile	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,9 +1,33 @@
-obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
+# $FreeBSD$
+#.PATH:  ${.CURDIR}/../../ofed/drivers/net/mlx4:${.CURDIR}/../../ofed/include/linux
+.PATH:  ${.CURDIR}/../../../../../include/linux
 
-mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
-		mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o xrcd.o
+.include <bsd.own.mk>
 
-obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
 
-mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
-		en_resources.o en_netdev.o en_frag.o en_selftest.o
+KMOD    = mlx4
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h 
+SRCS+=  alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c linux_compat.c linux_radix.c
+SRCS+=  pd.c port.c profile.c qp.c reset.c sense.c srq.c resource_tracker.c sys_tune.c
+SRCS+=  opt_inet.h opt_inet6.h
+
+
+#CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4
+#CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -I${.CURDIR}/../../../../../include
+
+.if !defined(KERNBUILDDIR)
+.if ${MK_INET_SUPPORT} != "no"
+opt_inet.h:
+	@echo "#define INET 1" > ${.TARGET}
+.endif
+
+.if ${MK_INET6_SUPPORT} != "no"
+opt_inet6.h:
+	@echo "#define INET6 1" > ${.TARGET}
+.endif
+.endif
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions

Modified: trunk/sys/ofed/drivers/net/mlx4/alloc.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/alloc.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/alloc.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/bitmap.h>
+#include <linux/module.h>
 #include <linux/dma-mapping.h>
 #include <linux/vmalloc.h>
 
@@ -70,14 +70,14 @@
 	return obj;
 }
 
-void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr)
 {
-	mlx4_bitmap_free_range(bitmap, obj, 1);
+	mlx4_bitmap_free_range(bitmap, obj, 1, use_rr);
 }
 
 static unsigned long find_aligned_range(unsigned long *bitmap,
 					u32 start, u32 nbits,
-					int len, int align)
+					int len, int align, u32 skip_mask)
 {
 	unsigned long end, i;
 
@@ -84,7 +84,8 @@
 again:
 	start = ALIGN(start, align);
 
-	while ((start < nbits) && test_bit(start, bitmap))
+	while ((start < nbits) && (test_bit(start, bitmap) ||
+				   (start & skip_mask)))
 		start += align;
 
 	if (start >= nbits)
@@ -95,7 +96,7 @@
 		return -1;
 
 	for (i = start + 1; i < end; i++) {
-		if (test_bit(i, bitmap)) {
+		if (test_bit(i, bitmap) || ((u32)i & skip_mask)) {
 			start = i + 1;
 			goto again;
 		}
@@ -104,27 +105,27 @@
 	return start;
 }
 
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask)
 {
-	u32 obj, i;
+	u32 obj;
 
-	if (likely(cnt == 1 && align == 1))
+	if (likely(cnt == 1 && align == 1 && !skip_mask))
 		return mlx4_bitmap_alloc(bitmap);
 
 	spin_lock(&bitmap->lock);
 
 	obj = find_aligned_range(bitmap->table, bitmap->last,
-				 bitmap->max, cnt, align);
+				bitmap->max, cnt, align, skip_mask);
 	if (obj >= bitmap->max) {
 		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
 				& bitmap->mask;
 		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
-					 cnt, align);
+						cnt, align, skip_mask);
 	}
 
 	if (obj < bitmap->max) {
-		for (i = 0; i < cnt; i++)
-			set_bit(obj + i, bitmap->table);
+		bitmap_set(bitmap->table, obj, cnt);
 		if (obj == bitmap->last) {
 			bitmap->last = (obj + cnt);
 			if (bitmap->last >= bitmap->max)
@@ -147,18 +148,18 @@
 	return bitmap->avail;
 }
 
-void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr)
 {
-	u32 i;
-
 	obj &= bitmap->max + bitmap->reserved_top - 1;
 
 	spin_lock(&bitmap->lock);
-	for (i = 0; i < cnt; i++)
-		clear_bit(obj + i, bitmap->table);
-	bitmap->last = min(bitmap->last, obj);
-	bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
-			& bitmap->mask;
+	if (!use_rr) {
+		bitmap->last = min(bitmap->last, obj);
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+	}
+	bitmap_clear(bitmap->table, obj, cnt);
 	bitmap->avail += cnt;
 	spin_unlock(&bitmap->lock);
 }
@@ -166,12 +167,17 @@
 int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
 		     u32 reserved_bot, u32 reserved_top)
 {
-	int i;
+	/* sanity check */
+	if (num <= (u64)reserved_top + reserved_bot)
+		return -EINVAL;
 
 	/* num must be a power of 2 */
 	if (num != roundup_pow_of_two(num))
 		return -EINVAL;
 
+	if (reserved_bot + reserved_top >= num)
+		return -EINVAL;
+
 	bitmap->last = 0;
 	bitmap->top  = 0;
 	bitmap->max  = num - reserved_top;
@@ -184,8 +190,7 @@
 	if (!bitmap->table)
 		return -ENOMEM;
 
-	for (i = 0; i < reserved_bot; ++i)
-		set_bit(i, bitmap->table);
+	bitmap_set(bitmap->table, 0, reserved_bot);
 
 	return 0;
 }
@@ -207,7 +212,6 @@
 {
 	dma_addr_t t;
 
-	buf->direct.buf = NULL;
 	if (size <= max_direct) {
 		buf->nbufs        = 1;
 		buf->npages       = 1;
@@ -229,11 +233,10 @@
 		int i;
 
 		buf->direct.buf  = NULL;
-		buf->direct.map  = 0;
 		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 		buf->npages      = buf->nbufs;
 		buf->page_shift  = PAGE_SHIFT;
-		buf->page_list   = kzalloc(buf->nbufs * sizeof *buf->page_list,
+		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
 					   GFP_KERNEL);
 		if (!buf->page_list)
 			return -ENOMEM;
@@ -291,7 +294,6 @@
 						  buf->page_list[i].map);
 		kfree(buf->page_list);
 	}
-	buf->direct.buf = NULL;
 }
 EXPORT_SYMBOL_GPL(mlx4_buf_free);
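
Two allocator behaviors arrive together in this file: find_aligned_range() learns a
skip_mask that disqualifies any bit index with the masked bits set, and
mlx4_bitmap_free_range() gains use_rr, which leaves bitmap->last untouched on free so
allocation keeps walking round-robin instead of instantly recycling a just-freed entry
(the open-coded set/clear loops also become bitmap_set()/bitmap_clear()). A small
userland model of the masked, aligned first-fit search (simplified: no wraparound, no
locking; align must be nonzero):

    #include <limits.h>

    #define BITS_PER_LONG   (CHAR_BIT * sizeof(unsigned long))

    static int
    test_bit_model(const unsigned long *map, unsigned int i)
    {
            return ((map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1UL) != 0;
    }

    /* First fit of cnt clear bits at align alignment, skipping any
     * bit index that intersects skip_mask. Returns -1 if none. */
    long
    find_aligned_range_model(const unsigned long *map, unsigned int nbits,
        unsigned int cnt, unsigned int align, unsigned int skip_mask)
    {
            unsigned int start, i;

            for (start = 0; start + cnt <= nbits; start += align) {
                    for (i = 0; i < cnt; i++)
                            if (((start + i) & skip_mask) ||
                                test_bit_model(map, start + i))
                                    break;
                    if (i == cnt)
                            return (start);
            }
            return (-1);
    }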
 

Modified: trunk/sys/ofed/drivers/net/mlx4/catas.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/catas.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/catas.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -32,10 +32,13 @@
  */
 
 #include <linux/workqueue.h>
+#include <linux/module.h>
 
+#include <asm/byteorder.h>
+
 #include "mlx4.h"
 
-#define	MLX4_CATAS_POLL_INTERVAL	(5 * HZ)
+#define 	MLX4_CATAS_POLL_INTERVAL	(5 * HZ)
 
 static DEFINE_SPINLOCK(catas_lock);
 
@@ -45,7 +48,8 @@
 static int internal_err_reset = 1;
 module_param(internal_err_reset, int, 0644);
 MODULE_PARM_DESC(internal_err_reset,
-		 "Reset device on internal errors if non-zero (default 1)");
+		 "Reset device on internal errors if non-zero"
+		 " (default 1, in SRIOV mode default is 0)");
 
 static void dump_err_buf(struct mlx4_dev *dev)
 {
@@ -65,16 +69,21 @@
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	if (readl(priv->catas_err.map)) {
-		dump_err_buf(dev);
+		/* If the device is off-line, we cannot try to recover it */
+		if (pci_channel_offline(dev->pdev))
+			mod_timer(&priv->catas_err.timer,
+				  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+		else {
+			dump_err_buf(dev);
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 
-		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+			if (internal_err_reset) {
+				spin_lock(&catas_lock);
+				list_add(&priv->catas_err.list, &catas_list);
+				spin_unlock(&catas_lock);
 
-		if (internal_err_reset) {
-			spin_lock(&catas_lock);
-			list_add(&priv->catas_err.list, &catas_list);
-			spin_unlock(&catas_lock);
-
-			queue_work(mlx4_wq, &catas_work);
+				queue_work(mlx4_wq, &catas_work);
+			}
 		}
 	} else
 		mod_timer(&priv->catas_err.timer,
@@ -89,9 +98,6 @@
 	LIST_HEAD(tlist);
 	int ret;
 
-	if (!mutex_trylock(&drv_mutex))
-		return;
-
 	spin_lock_irq(&catas_lock);
 	list_splice_init(&catas_list, &tlist);
 	spin_unlock_irq(&catas_lock);
@@ -99,24 +105,31 @@
 	list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
 		struct pci_dev *pdev = priv->dev.pdev;
 
+		/* If the device is off-line, we cannot reset it */
+		if (pci_channel_offline(pdev))
+			continue;
+
 		ret = mlx4_restart_one(priv->dev.pdev);
 		/* 'priv' now is not valid */
 		if (ret)
-			printk(KERN_ERR "mlx4 %s: Reset failed (%d)\n",
-				pci_name(pdev), ret);
+			pr_err("mlx4 %s: Reset failed (%d)\n",
+			       pci_name(pdev), ret);
 		else {
 			dev  = pci_get_drvdata(pdev);
 			mlx4_dbg(dev, "Reset succeeded\n");
 		}
 	}
-	mutex_unlock(&drv_mutex);
 }
 
 void mlx4_start_catas_poll(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	unsigned long addr;
+	phys_addr_t addr;
 
+	/* If we are in SRIOV, the default of the module param must be 0 */
+	if (mlx4_is_mfunc(dev))
+		internal_err_reset = 0;
+
 	INIT_LIST_HEAD(&priv->catas_err.list);
 	init_timer(&priv->catas_err.timer);
 	priv->catas_err.map = NULL;
@@ -126,8 +139,8 @@
 
 	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
 	if (!priv->catas_err.map) {
-		mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n",
-			  addr);
+		mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
+			  (unsigned long long) addr);
 		return;
 	}
 
@@ -144,11 +157,13 @@
 
 	del_timer_sync(&priv->catas_err.timer);
 
-	if (priv->catas_err.map)
+	if (priv->catas_err.map) {
 		iounmap(priv->catas_err.map);
+		priv->catas_err.map = NULL;
+	}
 
 	spin_lock_irq(&catas_lock);
-	list_del(&priv->catas_err.list);
+	list_del_init(&priv->catas_err.list);
 	spin_unlock_irq(&catas_lock);
 }
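
The list_del_init() swap at the end is subtle but deliberate: the entry is only queued
on catas_list when an internal error actually fires, yet mlx4_stop_catas_poll() always
unlinks it. After list_del_init() the node points at itself again, so a later unlink of
a node that was never re-queued is harmless, where plain list_del() would leave stale
pointers; the same motive explains NULLing catas_err.map after iounmap(). The pointer
wiring, modeled with the same next/prev scheme as the kernel's list.h:

    struct node {
            struct node *next, *prev;
    };

    static void
    node_init(struct node *n)
    {
            n->next = n->prev = n;          /* empty list: self-pointing */
    }

    static void
    node_del_init(struct node *n)
    {
            n->next->prev = n->prev;        /* unlink ... */
            n->prev->next = n->next;
            node_init(n);                   /* ... and stay deletable */
    }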
 

Modified: trunk/sys/ofed/drivers/net/mlx4/cmd.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/cmd.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/cmd.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -33,17 +33,28 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/errno.h>
 
 #include <linux/mlx4/cmd.h>
+#include <linux/mlx4/device.h>
+#include <linux/semaphore.h>
+#include <rdma/ib_smi.h>
 
 #include <asm/io.h>
+#include <linux/ktime.h>
 
 #include "mlx4.h"
+#include "fw.h"
 
 #define CMD_POLL_TOKEN 0xffff
+#define INBOX_MASK	0xffffffffffffff00ULL
 
+#define CMD_CHAN_VER 1
+#define CMD_CHAN_IF_REV 1
+
 enum {
 	/* command completed successfully: */
 	CMD_STAT_OK		= 0x00,
@@ -102,6 +113,14 @@
 	GO_BIT_TIMEOUT_MSECS	= 10000
 };
 
+enum mlx4_vlan_transition {
+	MLX4_VLAN_TRANSITION_VST_VST = 0,
+	MLX4_VLAN_TRANSITION_VST_VGT = 1,
+	MLX4_VLAN_TRANSITION_VGT_VST = 2,
+	MLX4_VLAN_TRANSITION_VGT_VGT = 3,
+};
+
+
 struct mlx4_cmd_context {
 	struct completion	done;
 	int			result;
@@ -111,6 +130,9 @@
 	u8			fw_status;
 };
 
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr);
+
 static int mlx4_status_to_errno(u8 status)
 {
 	static const int trans_table[] = {
@@ -141,33 +163,348 @@
 	return trans_table[status];
 }
 
+static const char *cmd_to_str(u16 cmd)
+{
+	switch (cmd) {
+	case MLX4_CMD_SYS_EN:		return "SYS_EN";
+	case MLX4_CMD_SYS_DIS:		return "SYS_DIS";
+	case MLX4_CMD_MAP_FA:		return "MAP_FA";
+	case MLX4_CMD_UNMAP_FA:		return "UNMAP_FA";
+	case MLX4_CMD_RUN_FW:		return "RUN_FW";
+	case MLX4_CMD_MOD_STAT_CFG:	return "MOD_STAT_CFG";
+	case MLX4_CMD_QUERY_DEV_CAP:	return "QUERY_DEV_CAP";
+	case MLX4_CMD_QUERY_FW:		return "QUERY_FW";
+	case MLX4_CMD_ENABLE_LAM:	return "ENABLE_LAM";
+	case MLX4_CMD_DISABLE_LAM:	return "DISABLE_LAM";
+	case MLX4_CMD_QUERY_DDR:	return "QUERY_DDR";
+	case MLX4_CMD_QUERY_ADAPTER:	return "QUERY_ADAPTER";
+	case MLX4_CMD_INIT_HCA:		return "INIT_HCA";
+	case MLX4_CMD_CLOSE_HCA:	return "CLOSE_HCA";
+	case MLX4_CMD_INIT_PORT:	return "INIT_PORT";
+	case MLX4_CMD_CLOSE_PORT:	return "CLOSE_PORT";
+	case MLX4_CMD_QUERY_HCA:	return "QUERY_HCA";
+	case MLX4_CMD_QUERY_PORT:	return "QUERY_PORT";
+	case MLX4_CMD_SENSE_PORT:	return "SENSE_PORT";
+	case MLX4_CMD_HW_HEALTH_CHECK:  return "HW_HEALTH_CHECK";
+	case MLX4_CMD_SET_PORT:		return "SET_PORT";
+	case MLX4_CMD_SET_NODE:		return "SET_NODE";
+	case MLX4_CMD_QUERY_FUNC:	return "QUERY_FUNC";
+	case MLX4_CMD_MAP_ICM:		return "MAP_ICM";
+	case MLX4_CMD_UNMAP_ICM:	return "UNMAP_ICM";
+	case MLX4_CMD_MAP_ICM_AUX:	return "MAP_ICM_AUX";
+	case MLX4_CMD_UNMAP_ICM_AUX:	return "UNMAP_ICM_AUX";
+	case MLX4_CMD_SET_ICM_SIZE:	return "SET_ICM_SIZE";
+		/* master notifies fw on finish for slave's FLR */
+	case MLX4_CMD_INFORM_FLR_DONE:	return "INFORM_FLR_DONE";
+	case MLX4_CMD_GET_OP_REQ:	return "GET_OP_REQ";
+
+		/* TPT commands */
+	case MLX4_CMD_SW2HW_MPT:	return "SW2HW_MPT";
+	case MLX4_CMD_QUERY_MPT:	return "QUERY_MPT";
+	case MLX4_CMD_HW2SW_MPT:	return "HW2SW_MPT";
+	case MLX4_CMD_READ_MTT:		return "READ_MTT";
+	case MLX4_CMD_WRITE_MTT:	return "WRITE_MTT";
+	case MLX4_CMD_SYNC_TPT:		return "SYNC_TPT";
+
+		/* EQ commands */
+	case MLX4_CMD_MAP_EQ:		return "MAP_EQ";
+	case MLX4_CMD_SW2HW_EQ:		return "SW2HW_EQ";
+	case MLX4_CMD_HW2SW_EQ:		return "HW2SW_EQ";
+	case MLX4_CMD_QUERY_EQ:		return "QUERY_EQ";
+
+		/* CQ commands */
+	case MLX4_CMD_SW2HW_CQ:		return "SW2HW_CQ";
+	case MLX4_CMD_HW2SW_CQ:		return "HW2SW_CQ";
+	case MLX4_CMD_QUERY_CQ:		return "QUERY_CQ";
+	case MLX4_CMD_MODIFY_CQ:	return "MODIFY_CQ";
+
+		/* SRQ commands */
+	case MLX4_CMD_SW2HW_SRQ:	return "SW2HW_SRQ";
+	case MLX4_CMD_HW2SW_SRQ:	return "HW2SW_SRQ";
+	case MLX4_CMD_QUERY_SRQ:	return "QUERY_SRQ";
+	case MLX4_CMD_ARM_SRQ:		return "ARM_SRQ";
+
+		/* QP/EE commands */
+	case MLX4_CMD_RST2INIT_QP:	return "RST2INIT_QP";
+	case MLX4_CMD_INIT2RTR_QP:	return "INIT2RTR_QP";
+	case MLX4_CMD_RTR2RTS_QP:	return "RTR2RTS_QP";
+	case MLX4_CMD_RTS2RTS_QP:	return "RTS2RTS_QP";
+	case MLX4_CMD_SQERR2RTS_QP:	return "SQERR2RTS_QP";
+	case MLX4_CMD_2ERR_QP:		return "2ERR_QP";
+	case MLX4_CMD_RTS2SQD_QP:	return "RTS2SQD_QP";
+	case MLX4_CMD_SQD2SQD_QP:	return "SQD2SQD_QP";
+	case MLX4_CMD_SQD2RTS_QP:	return "SQD2RTS_QP";
+	case MLX4_CMD_2RST_QP:		return "2RST_QP";
+	case MLX4_CMD_QUERY_QP:		return "QUERY_QP";
+	case MLX4_CMD_INIT2INIT_QP:	return "INIT2INIT_QP";
+	case MLX4_CMD_SUSPEND_QP:	return "SUSPEND_QP";
+	case MLX4_CMD_UNSUSPEND_QP:	return "UNSUSPEND_QP";
+		/* special QP and management commands */
+	case MLX4_CMD_CONF_SPECIAL_QP:	return "CONF_SPECIAL_QP";
+	case MLX4_CMD_MAD_IFC:		return "MAD_IFC";
+
+		/* multicast commands */
+	case MLX4_CMD_READ_MCG:		return "READ_MCG";
+	case MLX4_CMD_WRITE_MCG:	return "WRITE_MCG";
+	case MLX4_CMD_MGID_HASH:	return "MGID_HASH";
+
+		/* miscellaneous commands */
+	case MLX4_CMD_DIAG_RPRT:	return "DIAG_RPRT";
+	case MLX4_CMD_NOP:		return "NOP";
+	case MLX4_CMD_ACCESS_MEM:	return "ACCESS_MEM";
+	case MLX4_CMD_SET_VEP:		return "SET_VEP";
+
+		/* Ethernet specific commands */
+	case MLX4_CMD_SET_VLAN_FLTR:	return "SET_VLAN_FLTR";
+	case MLX4_CMD_SET_MCAST_FLTR:	return "SET_MCAST_FLTR";
+	case MLX4_CMD_DUMP_ETH_STATS:	return "DUMP_ETH_STATS";
+
+		/* Communication channel commands */
+	case MLX4_CMD_ARM_COMM_CHANNEL:	return "ARM_COMM_CHANNEL";
+	case MLX4_CMD_GEN_EQE:		return "GEN_EQE";
+
+		/* virtual commands */
+	case MLX4_CMD_ALLOC_RES:	return "ALLOC_RES";
+	case MLX4_CMD_FREE_RES:		return "FREE_RES";
+	case MLX4_CMD_MCAST_ATTACH:	return "MCAST_ATTACH";
+	case MLX4_CMD_UCAST_ATTACH:	return "UCAST_ATTACH";
+	case MLX4_CMD_PROMISC:		return "PROMISC";
+	case MLX4_CMD_QUERY_FUNC_CAP:	return "QUERY_FUNC_CAP";
+	case MLX4_CMD_QP_ATTACH:	return "QP_ATTACH";
+
+		/* debug commands */
+	case MLX4_CMD_QUERY_DEBUG_MSG:	return "QUERY_DEBUG_MSG";
+	case MLX4_CMD_SET_DEBUG_MSG:	return "SET_DEBUG_MSG";
+
+		/* statistics commands */
+	case MLX4_CMD_QUERY_IF_STAT:	return "QUERY_IF_STAT";
+	case MLX4_CMD_SET_IF_STAT:	return "SET_IF_STAT";
+
+		/* register/delete flow steering network rules */
+	case MLX4_QP_FLOW_STEERING_ATTACH:	return "QP_FLOW_STEERING_ATTACH";
+	case MLX4_QP_FLOW_STEERING_DETACH:	return "QP_FLOW_STEERING_DETACH";
+	case MLX4_FLOW_STEERING_IB_UC_QP_RANGE:	return "FLOW_STEERING_IB_UC_QP_RANGE";
+	default: return "OTHER";
+	}
+}
+
+static u8 mlx4_errno_to_status(int errno)
+{
+	switch (errno) {
+	case -EPERM:
+		return CMD_STAT_BAD_OP;
+	case -EINVAL:
+		return CMD_STAT_BAD_PARAM;
+	case -ENXIO:
+		return CMD_STAT_BAD_SYS_STATE;
+	case -EBUSY:
+		return CMD_STAT_RESOURCE_BUSY;
+	case -ENOMEM:
+		return CMD_STAT_EXCEED_LIM;
+	case -ENFILE:
+		return CMD_STAT_ICM_ERROR;
+	default:
+		return CMD_STAT_INTERNAL_ERR;
+	}
+}
+
+static int comm_pending(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 status = readl(&priv->mfunc.comm->slave_read);
+
+	return (swab32(status) >> 31) != priv->cmd.comm_toggle;
+}
+
+static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 val;
+
+	priv->cmd.comm_toggle ^= 1;
+	val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31);
+	__raw_writel((__force u32) cpu_to_be32(val),
+		     &priv->mfunc.comm->slave_write);
+	mmiowb();
+}
+
+static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
+		       unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long end;
+	int err = 0;
+	int ret_from_pending = 0;
+
+	/* First, verify that the master reports correct status */
+	if (comm_pending(dev)) {
+		mlx4_warn(dev, "Communication channel is not idle. "
+			  "My toggle is %d (cmd:0x%x)\n",
+			  priv->cmd.comm_toggle, cmd);
+		return -EAGAIN;
+	}
+
+	/* Write command */
+	down(&priv->cmd.poll_sem);
+	mlx4_comm_cmd_post(dev, cmd, param);
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+	ret_from_pending = comm_pending(dev);
+	if (ret_from_pending) {
+		/* Check if the slave is trying to boot in the middle of the
+		 * FLR process. The only non-zero result of the RESET command
+		 * is MLX4_DELAY_RESET_SLAVE. */
+		if (cmd == MLX4_COMM_CMD_RESET) {
+			mlx4_warn(dev, "Got slave FLRed from Communication"
+				  " channel (ret:0x%x)\n", ret_from_pending);
+			err = MLX4_DELAY_RESET_SLAVE;
+		} else {
+			mlx4_warn(dev, "Communication channel timed out\n");
+			err = -ETIMEDOUT;
+		}
+	}
+
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op,
+			      u16 param, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	unsigned long end;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+	if (comm_pending(dev)) {
+		mlx4_warn(dev, "mlx4_comm_cmd_wait: Comm channel "
+			  "is not idle. My toggle is %d (op: 0x%x)\n",
+			  mlx4_priv(dev)->cmd.comm_toggle, op);
+		up(&cmd->event_sem);
+		return -EAGAIN;
+	}
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	init_completion(&context->done);
+
+	mlx4_comm_cmd_post(dev, op, param);
+
+	/* In slave, wait unconditionally for completion */
+	wait_for_completion(&context->done);
+
+	err = context->result;
+	if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) {
+		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+			 op, context->fw_status);
+		goto out;
+	}
+
+out:
+	/* wait for comm channel ready
+	 * this is necessary for prevention the race
+	 * when switching between event to polling mode
+	 */
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  unsigned long timeout)
+{
+	if (mlx4_priv(dev)->cmd.use_events)
+		return mlx4_comm_cmd_wait(dev, cmd, param, timeout);
+	return mlx4_comm_cmd_poll(dev, cmd, param, timeout);
+}
+
 static int cmd_pending(struct mlx4_dev *dev)
 {
-	u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+	u32 status;
 
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+
 	return (status & swab32(1 << HCR_GO_BIT)) ||
 		(mlx4_priv(dev)->cmd.toggle ==
 		 !!(status & swab32(1 << HCR_T_BIT)));
 }
 
-static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
-			 u32 in_modifier, u8 op_modifier, u16 op, u16 token,
-			 int event)
+static int get_status(struct mlx4_dev *dev, u32 *status, int *go_bit,
+		      int *t_bit)
 {
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	*status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+	*t_bit = !!(*status & swab32(1 << HCR_T_BIT));
+	*go_bit = !!(*status & swab32(1 << HCR_GO_BIT));
+
+	return 0;
+}
+
+static int mlx4_cmd_post(struct mlx4_dev *dev, struct timespec *ts1,
+			 u64 in_param, u64 out_param, u32 in_modifier,
+			 u8 op_modifier, u16 op, u16 token, int event)
+{
 	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
 	u32 __iomem *hcr = cmd->hcr;
 	int ret = -EAGAIN;
 	unsigned long end;
+	int err, go_bit = 0, t_bit = 0;
+	u32 status = 0;
 
 	mutex_lock(&cmd->hcr_mutex);
 
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		ret = -EIO;
+		goto out;
+	}
+
 	end = jiffies;
 	if (event)
 		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
 
 	while (cmd_pending(dev)) {
-		if (time_after_eq(jiffies, end))
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			ret = -EIO;
 			goto out;
+		}
+
+		if (time_after_eq(jiffies, end)) {
+			mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
+			goto out;
+		}
 		cond_resched();
 	}
 
@@ -184,6 +521,9 @@
 	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
 	__raw_writel((__force u32) cpu_to_be32(token << 16),		  hcr + 5);
 
+	if (ts1)
+		ktime_get_ts(ts1);
+
 	/* __raw_writel may not order writes. */
 	wmb();
 
@@ -191,7 +531,7 @@
 					       (cmd->toggle << HCR_T_BIT)	|
 					       (event ? (1 << HCR_E_BIT) : 0)	|
 					       (op_modifier << HCR_OPMOD_SHIFT) |
-					       op),			  hcr + 6);
+					       op), hcr + 6);
 
 	/*
 	 * Make sure that our HCR writes don't get mixed in with
@@ -204,10 +544,78 @@
 	ret = 0;
 
 out:
+	if (ret) {
+		err = get_status(dev, &status, &go_bit, &t_bit);
+		mlx4_warn(dev, "Could not post command %s (0x%x): ret=%d, "
+			  "in_param=0x%llx, in_mod=0x%x, op_mod=0x%x, "
+			  "get_status err=%d, status_reg=0x%x, go_bit=%d, "
+			  "t_bit=%d, toggle=0x%x\n", cmd_to_str(op), op, ret,
+			  (unsigned long long) in_param, in_modifier, op_modifier, err, status,
+			  go_bit, t_bit, cmd->toggle);
+	}
 	mutex_unlock(&cmd->hcr_mutex);
 	return ret;
 }
 
+static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			  int out_is_imm, u32 in_modifier, u8 op_modifier,
+			  u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr;
+	int ret;
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+
+	vhcr->in_param = cpu_to_be64(in_param);
+	vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0;
+	vhcr->in_modifier = cpu_to_be32(in_modifier);
+	vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff));
+	vhcr->token = cpu_to_be16(CMD_POLL_TOKEN);
+	vhcr->status = 0;
+	vhcr->flags = !!(priv->cmd.use_events) << 6;
+
+	if (mlx4_is_master(dev)) {
+		ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while "
+						 "output mailbox is NULL for "
+						 "command 0x%x\n", op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		}
+	} else {
+		ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0,
+				    MLX4_COMM_TIME + timeout);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while "
+						 "output mailbox is NULL for "
+						 "command 0x%x\n", op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		} else
+			mlx4_err(dev, "failed execution of VHCR_POST command "
+				 "opcode %s (0x%x)\n", cmd_to_str(op), op);
+	}
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return ret;
+}
+
 static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 			 int out_is_imm, u32 in_modifier, u8 op_modifier,
 			 u16 op, unsigned long timeout)
@@ -220,16 +628,37 @@
 
 	down(&priv->cmd.poll_sem);
 
-	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		err = -EIO;
+		goto out;
+	}
+
+	err = mlx4_cmd_post(dev, NULL, in_param, out_param ? *out_param : 0,
 			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
 	if (err)
 		goto out;
 
 	end = msecs_to_jiffies(timeout) + jiffies;
-	while (cmd_pending(dev) && time_before(jiffies, end))
+	while (cmd_pending(dev) && time_before(jiffies, end)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			err = -EIO;
+			goto out;
+		}
+
 		cond_resched();
+	}
 
 	if (cmd_pending(dev)) {
+		mlx4_warn(dev, "command %s (0x%x) timed out (go bit not cleared)\n",
+			  cmd_to_str(op), op);
 		err = -ETIMEDOUT;
 		goto out;
 	}
@@ -240,13 +669,12 @@
 					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
 			(u64) be32_to_cpu((__force __be32)
 					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
-	stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
+	stat = be32_to_cpu((__force __be32)
+			   __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
 	err = mlx4_status_to_errno(stat);
-	if (err) {
-		if (op != MLX4_CMD_SET_NODE || stat != CMD_STAT_BAD_OP)
-			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
-				 op, stat);
-	}
+	if (err)
+		mlx4_err(dev, "command %s (0x%x) failed: fw status = 0x%x\n",
+			 cmd_to_str(op), op, stat);
 
 out:
 	up(&priv->cmd.poll_sem);
@@ -277,7 +705,15 @@
 	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
 	struct mlx4_cmd_context *context;
 	int err = 0;
+	int go_bit = 0, t_bit = 0, stat_err;
+	u32 status = 0;
+	struct timespec	ts1, ts2;
+	ktime_t t1, t2, delta;
+	s64 ds;
 
+	if (out_is_imm && !out_param)
+		return -EINVAL;
+
 	down(&cmd->event_sem);
 
 	spin_lock(&cmd->context_lock);
@@ -289,19 +725,51 @@
 
 	init_completion(&context->done);
 
-	mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
-		      in_modifier, op_modifier, op, context->token, 1);
+	err = mlx4_cmd_post(dev, &ts1, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, context->token, 1);
+	if (err)
+		goto out;
 
-	if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) {
+	if (!wait_for_completion_timeout(&context->done,
+					 msecs_to_jiffies(timeout))) {
+		stat_err = get_status(dev, &status, &go_bit, &t_bit);
+		mlx4_warn(dev, "command %s (0x%x) timed out: in_param=0x%llx, "
+			  "in_mod=0x%x, op_mod=0x%x, get_status err=%d, "
+			  "status_reg=0x%x, go_bit=%d, t_bit=%d, toggle=0x%x\n",
+			  cmd_to_str(op), op, (unsigned long long) in_param, in_modifier,
+			  op_modifier, stat_err, status, go_bit, t_bit,
+			  mlx4_priv(dev)->cmd.toggle);
 		err = -EBUSY;
 		goto out;
 	}
+	if (mlx4_debug_level & MLX4_DEBUG_MASK_CMD_TIME) {
+		ktime_get_ts(&ts2);
+		t1 = timespec_to_ktime(ts1);
+		t2 = timespec_to_ktime(ts2);
+		delta = ktime_sub(t2, t1);
+		ds = ktime_to_ns(delta);
+		pr_info("mlx4: fw exec time for %s is %lld nsec\n", cmd_to_str(op), (long long) ds);
+	}
 
 	err = context->result;
 	if (err) {
-		if (op != MLX4_CMD_SET_NODE || context->fw_status != CMD_STAT_BAD_OP)
-			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
-				 op, context->fw_status);
+		mlx4_err(dev, "command %s (0x%x) failed: in_param=0x%llx, "
+			 "in_mod=0x%x, op_mod=0x%x, fw status = 0x%x\n",
+			 cmd_to_str(op), op, (unsigned long long) in_param, in_modifier,
+			 op_modifier, context->fw_status);
+
+		switch(context->fw_status) {
+		case CMD_STAT_BAD_PARAM:
+			mlx4_err(dev, "Parameter is not supported, "
+			    "parameter is out of range\n");
+			break;
+		case CMD_STAT_EXCEED_LIM:
+			mlx4_err(dev, "Required capability exceeded "
+			    "device limits\n");
+			break;
+		default:
+			break;
+		}
 		goto out;
 	}
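
The ts1/ts2 plumbing above is opt-in command profiling: when the
MLX4_DEBUG_MASK_CMD_TIME debug bit is set, ts1 is captured inside mlx4_cmd_post() just
before the doorbell write, ts2 after the completion arrives, and the difference is
printed in nanoseconds. The same measurement in portable userland C, with
clock_gettime() standing in for ktime_get_ts():

    #include <stdio.h>
    #include <time.h>

    long long
    elapsed_ns(void (*fw_exec)(void))
    {
            struct timespec ts1, ts2;

            clock_gettime(CLOCK_MONOTONIC, &ts1);   /* before posting */
            fw_exec();                              /* post + wait */
            clock_gettime(CLOCK_MONOTONIC, &ts2);   /* on completion */

            return ((ts2.tv_sec - ts1.tv_sec) * 1000000000LL +
                (ts2.tv_nsec - ts1.tv_nsec));
    }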
 
@@ -320,42 +788,1537 @@
 
 int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 	       int out_is_imm, u32 in_modifier, u8 op_modifier,
-	       u16 op, unsigned long timeout)
+	       u16 op, unsigned long timeout, int native)
 {
-	if (mlx4_priv(dev)->cmd.use_events && !cold)
-		return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm,
-				     in_modifier, op_modifier, op, timeout);
-	else
-		return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm,
-				     in_modifier, op_modifier, op, timeout);
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+		if (mlx4_priv(dev)->cmd.use_events)
+			return mlx4_cmd_wait(dev, in_param, out_param,
+					     out_is_imm, in_modifier,
+					     op_modifier, op, timeout);
+		else
+			return mlx4_cmd_poll(dev, in_param, out_param,
+					     out_is_imm, in_modifier,
+					     op_modifier, op, timeout);
+	}
+	return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm,
+			      in_modifier, op_modifier, op, timeout);
 }
 EXPORT_SYMBOL_GPL(__mlx4_cmd);
 
+
+static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr,
+			   int slave, u64 slave_addr,
+			   int size, int is_read)
+{
+	u64 in_param;
+	u64 out_param;
+
+	if ((slave_addr & 0xfff) | (master_addr & 0xfff) |
+	    (slave & ~0x7f) | (size & 0xff)) {
+		mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx "
+			      "master_addr:0x%llx slave_id:%d size:%d\n",
+			      (unsigned long long) slave_addr, (unsigned long long) master_addr, slave, size);
+		return -EINVAL;
+	}
+
+	if (is_read) {
+		in_param = (u64) slave | slave_addr;
+		out_param = (u64) dev->caps.function | master_addr;
+	} else {
+		in_param = (u64) dev->caps.function | master_addr;
+		out_param = (u64) slave | slave_addr;
+	}
+
+	return mlx4_cmd_imm(dev, in_param, &out_param, size, 0,
+			    MLX4_CMD_ACCESS_MEM,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf);
+	struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf);
+	int err;
+	int i;
+
+	if (index & 0x1f)
+		return -EINVAL;
+
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	for (i = 0; i < 32; ++i)
+		pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]);
+
+	return err;
+}
+
+static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) {
+		err = query_pkey_block(dev, port, i, table + i, inbox, outbox);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+#define PORT_CAPABILITY_LOCATION_IN_SMP 20
+#define PORT_STATE_OFFSET 32
+
+static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf)
+{
+	if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP)
+		return IB_PORT_ACTIVE;
+	else
+		return IB_PORT_DOWN;
+}
+
+static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct ib_smp *smp = inbox->buf;
+	u32 index;
+	u8 port;
+	u16 *table;
+	int err;
+	int vidx, pidx;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct ib_smp *outsmp = outbox->buf;
+	__be16 *outtab = (__be16 *)(outsmp->data);
+	__be32 slave_cap_mask;
+	__be64 slave_node_guid;
+	port = vhcr->in_modifier;
+
+	if (smp->base_version == 1 &&
+	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    smp->class_version == 1) {
+		if (smp->method	== IB_MGMT_METHOD_GET) {
+			if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+				index = be32_to_cpu(smp->attr_mod);
+				if (port < 1 || port > dev->caps.num_ports)
+					return -EINVAL;
+				table = kcalloc(dev->caps.pkey_table_len[port], sizeof *table, GFP_KERNEL);
+				if (!table)
+					return -ENOMEM;
+				/* need to get the full pkey table because the paravirtualized
+				 * pkeys may be scattered among several pkey blocks.
+				 */
+				err = get_full_pkey_table(dev, port, table, inbox, outbox);
+				if (!err) {
+					for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) {
+						pidx = priv->virt2phys_pkey[slave][port - 1][vidx];
+						outtab[vidx % 32] = cpu_to_be16(table[pidx]);
+					}
+				}
+				kfree(table);
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) {
+				/* get the slave-specific caps */
+				/* do the command */
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					    vhcr->in_modifier, vhcr->op_modifier,
+					    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				/* modify the response for slaves */
+				if (!err && slave != mlx4_master_func_num(dev)) {
+					u8 *state = outsmp->data + PORT_STATE_OFFSET;
+
+					*state = (*state & 0xf0) | vf_port_state(dev, port, slave);
+					slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+					memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4);
+				}
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) {
+				/* compute slave's gid block */
+				smp->attr_mod = cpu_to_be32(slave / 8);
+				/* execute cmd */
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					     vhcr->in_modifier, vhcr->op_modifier,
+					     vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				if (!err) {
+					/* if needed, move slave gid to index 0 */
+					if (slave % 8)
+						memcpy(outsmp->data,
+						       outsmp->data + (slave % 8) * 8, 8);
+					/* delete all other gids */
+					memset(outsmp->data + 8, 0, 56);
+				}
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) {
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					     vhcr->in_modifier, vhcr->op_modifier,
+					     vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				if (!err) {
+					slave_node_guid =  mlx4_get_slave_node_guid(dev, slave);
+					memcpy(outsmp->data + 12, &slave_node_guid, 8);
+				}
+				return err;
+			}
+		}
+	}
+	if (slave != mlx4_master_func_num(dev) &&
+	    ((smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) ||
+	     (smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	      smp->method == IB_MGMT_METHOD_SET))) {
+		mlx4_err(dev, "slave %d is trying to execute a Subnet MGMT MAD, "
+			 "class 0x%x, method 0x%x for attr 0x%x. Rejecting\n",
+			 slave, smp->mgmt_class, smp->method,
+			 be16_to_cpu(smp->attr_id));
+		return -EPERM;
+	}
+	/* default */
+	return mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+				    vhcr->in_modifier, vhcr->op_modifier,
+				    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+}
+
+static int MLX4_CMD_DIAG_RPRT_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	return -EPERM;
+}
+
+static int MLX4_CMD_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	return -EPERM;
+}
+
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	u64 in_param;
+	u64 out_param;
+	int err;
+
+	in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param;
+	out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param;
+	if (cmd->encode_slave_id) {
+		in_param &= 0xffffffffffffff00ll;
+		in_param |= slave;
+	}
+
+	err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm,
+			 vhcr->in_modifier, vhcr->op_modifier, vhcr->op,
+			 MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (cmd->out_is_imm)
+		vhcr->out_param = out_param;
+
+	return err;
+}
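
mlx4_DMA_wrapper() above is the generic bridge for wrapped commands: it substitutes the
master-side mailbox DMA addresses for the slave's, optionally stamps the slave id into
the low byte of in_param (encode_slave_id), and replays the command natively. The
cmd_info[] table that follows maps each opcode to such a wrapper plus flags describing
its mailboxes; the dispatch idea, stripped to its bones with hypothetical names:

    #include <stddef.h>

    struct cmd_desc {
            unsigned short opcode;
            int has_inbox, has_outbox, out_is_imm;
            int (*wrapper)(void *ctx);      /* NULL: no special handling */
    };

    static int
    query_wrapper(void *ctx)
    {
            return (0);                     /* filter/rewrite, then run */
    }

    static const struct cmd_desc cmd_table[] = {
            { .opcode = 0x04, .has_outbox = 1, .wrapper = query_wrapper },
            { .opcode = 0x31, .has_inbox = 1, .wrapper = NULL },
    };

    static const struct cmd_desc *
    cmd_lookup(unsigned short op)
    {
            size_t i;

            for (i = 0; i < sizeof(cmd_table) / sizeof(cmd_table[0]); i++)
                    if (cmd_table[i].opcode == op)
                            return (&cmd_table[i]);
            return (NULL);
    }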
+
+static struct mlx4_cmd_info cmd_info[] = {
+	{
+		.opcode = MLX4_CMD_QUERY_FW,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FW_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_HCA,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_DEV_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_DEV_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_FUNC_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FUNC_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_ADAPTER,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_INIT_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_CLOSE_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm  = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CLOSE_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_PORT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_PORT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MAP_EQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAP_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_EQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW_HEALTH_CHECK,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_DIAG_RPRT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.skip_err_print = true,
+		.verify = NULL,
+		.wrapper = MLX4_CMD_DIAG_RPRT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_NOP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_ALLOC_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ALLOC_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_FREE_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_FREE_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_MPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_MPT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_MPT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_READ_MTT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_WRITE_MTT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_WRITE_MTT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SYNC_TPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_EQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_EQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_CQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_CQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MODIFY_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MODIFY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_SRQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_SRQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ARM_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ARM_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RST2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_RST2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2RTR_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2RTR_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTS2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQERR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQERR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2ERR_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2SQD_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2SQD_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2SQD_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2RST_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_2RST_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_QP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UNSUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UPDATE_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.skip_err_print = true,
+		.verify = NULL,
+		.wrapper = MLX4_CMD_UPDATE_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_CONF_SPECIAL_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL, /* XXX verify: only demux can do this */
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_MAD_IFC,
+		.has_inbox = true,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAD_IFC_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_IF_STAT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_IF_STAT_wrapper
+	},
+	/* Native multicast commands are not available for guests */
+	{
+		.opcode = MLX4_CMD_QP_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_PROMISC,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_PROMISC_wrapper
+	},
+	/* Ethernet specific commands */
+	{
+		.opcode = MLX4_CMD_SET_VLAN_FLTR,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_VLAN_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_MCAST_FLTR,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_MCAST_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_DUMP_ETH_STATS,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_DUMP_ETH_STATS_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INFORM_FLR_DONE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	/* flow steering commands */
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_DETACH,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper
+	},
+	/* wol commands */
+	{
+		.opcode = MLX4_CMD_MOD_STAT_CFG,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.skip_err_print = true,
+		.verify = NULL,
+		.wrapper = mlx4_MOD_STAT_CFG_wrapper
+	},
+};
+
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_info *cmd = NULL;
+	struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? in_vhcr : priv->mfunc.vhcr;
+	struct mlx4_vhcr *vhcr;
+	struct mlx4_cmd_mailbox *inbox = NULL;
+	struct mlx4_cmd_mailbox *outbox = NULL;
+	u64 in_param;
+	u64 out_param;
+	int ret = 0;
+	int i;
+	int err = 0;
+
+	/* Create sw representation of Virtual HCR */
+	vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL);
+	if (!vhcr)
+		return -ENOMEM;
+
+	/* DMA in the vHCR */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr_cmd),
+					    MLX4_ACCESS_MEM_ALIGN), 1);
+		if (ret) {
+			mlx4_err(dev, "%s:Failed reading vhcr"
+				 "ret: 0x%x\n", __func__, ret);
+			kfree(vhcr);
+			return ret;
+		}
+	}
+
+	/* Fill SW VHCR fields */
+	vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param);
+	vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param);
+	vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier);
+	vhcr->token = be16_to_cpu(vhcr_cmd->token);
+	vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff;
+	vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12);
+	vhcr->e_bit = vhcr_cmd->flags & (1 << 6);
+
+	/* Lookup command */
+	for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) {
+		if (vhcr->op == cmd_info[i].opcode) {
+			cmd = &cmd_info[i];
+			break;
+		}
+	}
+	if (!cmd) {
+		mlx4_err(dev, "unparavirt command: %s (0x%x) accepted from slave:%d\n",
+			 cmd_to_str(vhcr->op), vhcr->op, slave);
+		vhcr_cmd->status = CMD_STAT_BAD_PARAM;
+		goto out_status;
+	}
+
+	/* Read inbox */
+	if (cmd->has_inbox) {
+		vhcr->in_param &= INBOX_MASK;
+		inbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(inbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			inbox = NULL;
+			goto out_status;
+		}
+
+		if (mlx4_ACCESS_MEM(dev, inbox->dma, slave,
+				    vhcr->in_param,
+				    MLX4_MAILBOX_SIZE, 1)) {
+			mlx4_err(dev, "%s: Failed reading inbox for cmd %s (0x%x)\n",
+				 __func__, cmd_to_str(cmd->opcode), cmd->opcode);
+			vhcr_cmd->status = CMD_STAT_INTERNAL_ERR;
+			goto out_status;
+		}
+	}
+
+	/* Apply permission and bound checks if applicable */
+	if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) {
+		mlx4_warn(dev, "Command %s (0x%x) from slave: %d failed protection "
+			  "checks for resource_id: %d\n", cmd_to_str(vhcr->op),
+			  vhcr->op, slave, vhcr->in_modifier);
+		vhcr_cmd->status = CMD_STAT_BAD_OP;
+		goto out_status;
+	}
+
+	/* Allocate outbox */
+	if (cmd->has_outbox) {
+		outbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(outbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			outbox = NULL;
+			goto out_status;
+		}
+	}
+
+	/* Execute the command! */
+	if (cmd->wrapper) {
+		err = cmd->wrapper(dev, slave, vhcr, inbox, outbox,
+				   cmd);
+		if (cmd->out_is_imm)
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+	} else {
+		in_param = cmd->has_inbox ? (u64) inbox->dma :
+			vhcr->in_param;
+		out_param = cmd->has_outbox ? (u64) outbox->dma :
+			vhcr->out_param;
+		err = __mlx4_cmd(dev, in_param, &out_param,
+				 cmd->out_is_imm, vhcr->in_modifier,
+				 vhcr->op_modifier, vhcr->op,
+				 MLX4_CMD_TIME_CLASS_A,
+				 MLX4_CMD_NATIVE);
+
+		if (cmd->out_is_imm) {
+			vhcr->out_param = out_param;
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+		}
+	}
+
+	if (err) {
+		if (!cmd->skip_err_print)
+			mlx4_warn(dev, "vhcr command %s (0x%x) slave:%d "
+				  "in_param 0x%llx in_mod=0x%x, op_mod=0x%x "
+				  "failed with error:%d, status %d\n",
+				  cmd_to_str(vhcr->op), vhcr->op, slave,
+				  (unsigned long long) vhcr->in_param, vhcr->in_modifier,
+				  vhcr->op_modifier, vhcr->errno, err);
+		vhcr_cmd->status = mlx4_errno_to_status(err);
+		goto out_status;
+	}
+
+
+	/* Write outbox if command completed successfully */
+	if (cmd->has_outbox && !vhcr_cmd->status) {
+		ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave,
+				      vhcr->out_param,
+				      MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED);
+		if (ret) {
+			/* If we failed to write back the outbox after the
+			 * command was successfully executed, we must fail this
+			 * slave, as it is now in an undefined state */
+			mlx4_err(dev, "%s: Failed writing outbox\n", __func__);
+			goto out;
+		}
+	}
+
+out_status:
+	/* DMA back vhcr result */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr),
+					    MLX4_ACCESS_MEM_ALIGN),
+				      MLX4_CMD_WRAPPED);
+		if (ret)
+			mlx4_err(dev, "%s:Failed writing vhcr result\n",
+				 __func__);
+		else if (vhcr->e_bit &&
+			 mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe))
+				mlx4_warn(dev, "Failed to generate command completion "
+					  "eqe for slave %d\n", slave);
+	}
+
+out:
+	kfree(vhcr);
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+
+static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv,
+					    int slave, int port)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vf_immed_vlan_work *work;
+	int err;
+	int admin_vlan_ix = NO_INDX;
+
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if (vp_oper->state.default_vlan == vp_admin->default_vlan &&
+	    vp_oper->state.default_qos == vp_admin->default_qos)
+		return 0;
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (vp_oper->state.default_vlan != vp_admin->default_vlan) {
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						   vp_admin->default_vlan,
+						   &admin_vlan_ix);
+			if (err) {
+				mlx4_warn((&priv->dev),
+					  "No vlan resources slave %d, port %d\n",
+					  slave, port);
+				kfree(work);
+				return err;
+			}
+		} else {
+			admin_vlan_ix = NO_INDX;
+		}
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN;
+		mlx4_dbg((&(priv->dev)),
+			 "alloc vlan %d idx  %d slave %d port %d\n",
+			 (int)(vp_admin->default_vlan),
+			 admin_vlan_ix, slave, port);
+	}
+
+	/* save original vlan ix and vlan id */
+	work->orig_vlan_id = vp_oper->state.default_vlan;
+	work->orig_vlan_ix = vp_oper->vlan_idx;
+
+	/* handle new qos */
+	if (vp_oper->state.default_qos != vp_admin->default_qos)
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_QOS;
+
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN)
+		vp_oper->vlan_idx = admin_vlan_ix;
+
+	vp_oper->state.default_vlan = vp_admin->default_vlan;
+	vp_oper->state.default_qos = vp_admin->default_qos;
+
+	/* iterate over QPs owned by this slave, using UPDATE_QP */
+	work->port = port;
+	work->slave = slave;
+	work->qos = vp_oper->state.default_qos;
+	work->vlan_id = vp_oper->state.default_vlan;
+	work->vlan_ix = vp_oper->vlan_idx;
+	work->priv = priv;
+	INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler);
+	queue_work(priv->mfunc.master.comm_wq, &work->work);
+
+	return 0;
+}
+
+
+static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port, err;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vport_oper_state *vp_oper;
+
+	for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+		vp_oper->state = *vp_admin;
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						 vp_admin->default_vlan, &(vp_oper->vlan_idx));
+			if (err) {
+				vp_oper->vlan_idx = NO_INDX;
+				mlx4_warn((&priv->dev),
+					  "No vlan resorces slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg((&(priv->dev)), "alloc vlan %d idx  %d slave %d port %d\n",
+				 (int)(vp_oper->state.default_vlan),
+				 vp_oper->vlan_idx, slave, port);
+		}
+		if (vp_admin->spoofchk) {
+			vp_oper->mac_idx = __mlx4_register_mac(&priv->dev,
+							       port,
+							       vp_admin->mac);
+			if (0 > vp_oper->mac_idx) {
+				err = vp_oper->mac_idx;
+				vp_oper->mac_idx = NO_INDX;
+				mlx4_warn((&priv->dev),
+					  "No mac resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg((&(priv->dev)), "alloc mac %llx idx  %d slave %d port %d\n",
+				 (unsigned long long) vp_oper->state.mac, vp_oper->mac_idx, slave, port);
+		}
+	}
+	return 0;
+}
+
+static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port;
+	struct mlx4_vport_oper_state *vp_oper;
+
+	for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		if (NO_INDX != vp_oper->vlan_idx) {
+			__mlx4_unregister_vlan(&priv->dev,
+					       port, vp_oper->state.default_vlan);
+			vp_oper->vlan_idx = NO_INDX;
+		}
+		if (NO_INDX != vp_oper->mac_idx) {
+			__mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac);
+			vp_oper->mac_idx = NO_INDX;
+		}
+	}
+	return;
+}
+
+static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd,
+			       u16 param, u8 toggle)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	u32 reply;
+	u8 is_going_down = 0;
+	int i;
+	unsigned long flags;
+
+	slave_state[slave].comm_toggle ^= 1;
+	reply = (u32) slave_state[slave].comm_toggle << 31;
+	if (toggle != slave_state[slave].comm_toggle) {
+		mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER"
+			  "STATE COMPROMISIED ***\n", toggle, slave);
+		goto reset_slave;
+	}
+	if (cmd == MLX4_COMM_CMD_RESET) {
+		mlx4_warn(dev, "Received reset from slave:%d\n", slave);
+		slave_state[slave].active = false;
+		slave_state[slave].old_vlan_api = false;
+		mlx4_master_deactivate_admin_state(priv, slave);
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) {
+				slave_state[slave].event_eq[i].eqn = -1;
+				slave_state[slave].event_eq[i].token = 0;
+		}
+		/* check if we are in the middle of the FLR process;
+		 * if so, return "retry" status to the slave */
+		if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd)
+			goto inform_slave_state;
+
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave);
+
+		/* write the version in the event field */
+		reply |= mlx4_comm_get_version();
+
+		goto reset_slave;
+	}
+	/* command from slave in the middle of FLR */
+	if (cmd != MLX4_COMM_CMD_RESET &&
+	    MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) {
+		mlx4_warn(dev, "slave:%d is Trying to run cmd (0x%x) "
+			  "in the middle of FLR\n", slave, cmd);
+		return;
+	}
+
+	switch (cmd) {
+	case MLX4_COMM_CMD_VHCR0:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma = ((u64) param) << 48;
+		priv->mfunc.master.slave_state[slave].cookie = 0;
+		break;
+	case MLX4_COMM_CMD_VHCR1:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 32;
+		break;
+	case MLX4_COMM_CMD_VHCR2:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 16;
+		break;
+	case MLX4_COMM_CMD_VHCR_EN:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= param;
+		if (mlx4_master_activate_admin_state(priv, slave))
+				goto reset_slave;
+		slave_state[slave].active = true;
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave);
+		break;
+	case MLX4_COMM_CMD_VHCR_POST:
+		if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) &&
+		    (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST))
+			goto reset_slave;
+
+		mutex_lock(&priv->cmd.slave_cmd_mutex);
+		if (mlx4_master_process_vhcr(dev, slave, NULL)) {
+			mlx4_err(dev, "Failed processing vhcr for slave: %d,"
+				 " resetting slave.\n", slave);
+			mutex_unlock(&priv->cmd.slave_cmd_mutex);
+			goto reset_slave;
+		}
+		mutex_unlock(&priv->cmd.slave_cmd_mutex);
+		break;
+	default:
+		mlx4_warn(dev, "Bad comm cmd: %d from slave: %d\n", cmd, slave);
+		goto reset_slave;
+	}
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = cmd;
+	else
+		is_going_down = 1;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	if (is_going_down) {
+		mlx4_warn(dev, "Slave is going down aborting command (%d)"
+			  " executing from slave: %d\n",
+			  cmd, slave);
+		return;
+	}
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	mmiowb();
+
+	return;
+
+reset_slave:
+	/* cleanup any slave resources */
+	mlx4_delete_all_resources_for_slave(dev, slave);
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	/* with the slave in the middle of FLR, no need to clean resources again */
+inform_slave_state:
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	wmb();
+}
+
+/* master command processing */
+void mlx4_master_comm_channel(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work,
+			     struct mlx4_mfunc_master_ctx,
+			     comm_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	__be32 *bit_vec;
+	u32 comm_cmd;
+	u32 vec;
+	int i, j, slave;
+	int toggle;
+	int served = 0;
+	int reported = 0;
+	u32 slt;
+
+	bit_vec = master->comm_arm_bit_vector;
+	for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) {
+		vec = be32_to_cpu(bit_vec[i]);
+		for (j = 0; j < 32; j++) {
+			if (!(vec & (1 << j)))
+				continue;
+			++reported;
+			slave = (i * 32) + j;
+			comm_cmd = swab32(readl(
+					  &mfunc->comm[slave].slave_write));
+			slt = swab32(readl(&mfunc->comm[slave].slave_read))
+				     >> 31;
+			toggle = comm_cmd >> 31;
+			if (toggle != slt) {
+				if (master->slave_state[slave].comm_toggle
+				    != slt) {
+					mlx4_info(dev, "slave %d out of sync."
+						  " read toggle %d, state toggle %d. "
+						  "Resynching.\n", slave, slt,
+						  master->slave_state[slave].comm_toggle);
+					master->slave_state[slave].comm_toggle =
+						slt;
+				}
+				mlx4_master_do_cmd(dev, slave,
+						   comm_cmd >> 16 & 0xff,
+						   comm_cmd & 0xffff, toggle);
+				++served;
+			} else
+				mlx4_err(dev, "slave %d out of sync."
+				  " read toggle %d, write toggle %d.\n", slave, slt,
+				  toggle);
+		}
+	}
+
+	if (reported && reported != served)
+		mlx4_warn(dev, "Got command event with bitmask from %d slaves"
+			  " but %d were served\n",
+			  reported, served);
+}
+/* master command processing */
+void mlx4_master_arm_comm_channel(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work,
+			     struct mlx4_mfunc_master_ctx,
+			     arm_comm_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+
+	if (mlx4_ARM_COMM_CHANNEL(dev))
+		mlx4_warn(dev, "Failed to arm comm channel events\n");
+}
+
+static int sync_toggles(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int wr_toggle;
+	int rd_toggle;
+	unsigned long end;
+
+	wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31;
+	end = jiffies + msecs_to_jiffies(5000);
+
+	while (time_before(jiffies, end)) {
+		rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31;
+		if (rd_toggle == wr_toggle) {
+			priv->cmd.comm_toggle = rd_toggle;
+			return 0;
+		}
+
+		cond_resched();
+	}
+
+	/*
+	 * We could reach here if, for example, the previous VM using this
+	 * function misbehaved and left the channel in an unsynced state. We
+	 * should fix this here and give this VM a chance to use a properly
+	 * synced channel.
+	 */
+	mlx4_warn(dev, "recovering from previously mis-behaved VM\n");
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read);
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write);
+	priv->cmd.comm_toggle = 0;
+
+	return 0;
+}
+
+int mlx4_multi_func_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
+	int i, j, err, port;
+
+	if (mlx4_is_master(dev))
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) +
+			priv->fw.comm_base, MLX4_COMM_PAGESIZE);
+	else
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->pdev, 2) +
+			MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE);
+	if (!priv->mfunc.comm) {
+		mlx4_err(dev, "Couldn't map communication vector.\n");
+		goto err_vhcr;
+	}
+
+	if (mlx4_is_master(dev)) {
+		priv->mfunc.master.slave_state =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_slave_state), GFP_KERNEL);
+		if (!priv->mfunc.master.slave_state)
+			goto err_comm;
+
+		priv->mfunc.master.vf_admin =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_vf_admin_state), GFP_KERNEL);
+		if (!priv->mfunc.master.vf_admin)
+			goto err_comm_admin;
+
+		priv->mfunc.master.vf_oper =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_vf_oper_state), GFP_KERNEL);
+		if (!priv->mfunc.master.vf_oper)
+			goto err_comm_oper;
+
+		for (i = 0; i < dev->num_slaves; ++i) {
+			s_state = &priv->mfunc.master.slave_state[i];
+			s_state->last_cmd = MLX4_COMM_CMD_RESET;
+			mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]);
+			for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j)
+				s_state->event_eq[j].eqn = -1;
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_write);
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_read);
+			mmiowb();
+			for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+				s_state->vlan_filter[port] =
+					kzalloc(sizeof(struct mlx4_vlan_fltr),
+						GFP_KERNEL);
+				if (!s_state->vlan_filter[port]) {
+					if (--port)
+						kfree(s_state->vlan_filter[port]);
+					goto err_slaves;
+				}
+				INIT_LIST_HEAD(&s_state->mcast_filters[port]);
+				priv->mfunc.master.vf_admin[i].vport[port].default_vlan = MLX4_VGT;
+				priv->mfunc.master.vf_oper[i].vport[port].state.default_vlan = MLX4_VGT;
+				priv->mfunc.master.vf_oper[i].vport[port].vlan_idx = NO_INDX;
+				priv->mfunc.master.vf_oper[i].vport[port].mac_idx = NO_INDX;
+			}
+			spin_lock_init(&s_state->lock);
+		}
+
+		memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size);
+		priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD;
+		INIT_WORK(&priv->mfunc.master.comm_work,
+			  mlx4_master_comm_channel);
+		INIT_WORK(&priv->mfunc.master.arm_comm_work,
+			  mlx4_master_arm_comm_channel);
+		INIT_WORK(&priv->mfunc.master.slave_event_work,
+			  mlx4_gen_slave_eqe);
+		INIT_WORK(&priv->mfunc.master.slave_flr_event_work,
+			  mlx4_master_handle_slave_flr);
+		spin_lock_init(&priv->mfunc.master.slave_state_lock);
+		spin_lock_init(&priv->mfunc.master.slave_eq.event_lock);
+		priv->mfunc.master.comm_wq =
+			create_singlethread_workqueue("mlx4_comm");
+		if (!priv->mfunc.master.comm_wq)
+			goto err_slaves;
+
+		if (mlx4_init_resource_tracker(dev))
+			goto err_thread;
+
+		err = mlx4_ARM_COMM_CHANNEL(dev);
+		if (err) {
+			mlx4_err(dev, " Failed to arm comm channel eq: %x\n",
+				 err);
+			goto err_resource;
+		}
+
+	} else {
+		err = sync_toggles(dev);
+		if (err) {
+			mlx4_err(dev, "Couldn't sync toggles\n");
+			goto err_comm;
+		}
+	}
+	return 0;
+
+err_resource:
+	mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL);
+err_thread:
+	flush_workqueue(priv->mfunc.master.comm_wq);
+	destroy_workqueue(priv->mfunc.master.comm_wq);
+err_slaves:
+	while (--i) {
+		for (port = 1; port <= MLX4_MAX_PORTS; port++)
+			kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+	}
+	kfree(priv->mfunc.master.vf_oper);
+err_comm_oper:
+	kfree(priv->mfunc.master.vf_admin);
+err_comm_admin:
+	kfree(priv->mfunc.master.slave_state);
+err_comm:
+	iounmap(priv->mfunc.comm);
+err_vhcr:
+	dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+					     priv->mfunc.vhcr,
+					     priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+	return -ENOMEM;
+}
+
 int mlx4_cmd_init(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	mutex_init(&priv->cmd.hcr_mutex);
+	mutex_init(&priv->cmd.slave_cmd_mutex);
 	sema_init(&priv->cmd.poll_sem, 1);
 	priv->cmd.use_events = 0;
 	priv->cmd.toggle     = 1;
 
-	priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE,
-				MLX4_HCR_SIZE);
-	if (!priv->cmd.hcr) {
-		mlx4_err(dev, "Couldn't map command register.");
-		return -ENOMEM;
+	priv->cmd.hcr = NULL;
+	priv->mfunc.vhcr = NULL;
+
+	if (!mlx4_is_slave(dev)) {
+		priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) +
+					MLX4_HCR_BASE, MLX4_HCR_SIZE);
+		if (!priv->cmd.hcr) {
+			mlx4_err(dev, "Couldn't map command register.\n");
+			return -ENOMEM;
+		}
 	}
 
+	if (mlx4_is_mfunc(dev)) {
+		priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE,
+						      &priv->mfunc.vhcr_dma,
+						      GFP_KERNEL);
+		if (!priv->mfunc.vhcr) {
+			mlx4_err(dev, "Couldn't allocate VHCR.\n");
+			goto err_hcr;
+		}
+	}
+
 	priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
 					 MLX4_MAILBOX_SIZE,
 					 MLX4_MAILBOX_SIZE, 0);
-	if (!priv->cmd.pool) {
+	if (!priv->cmd.pool)
+		goto err_vhcr;
+
+	return 0;
+
+err_vhcr:
+	if (mlx4_is_mfunc(dev))
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  priv->mfunc.vhcr, priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+
+err_hcr:
+	if (!mlx4_is_slave(dev))
 		iounmap(priv->cmd.hcr);
-		return -ENOMEM;
+	return -ENOMEM;
+}
+
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, port;
+
+	if (mlx4_is_master(dev)) {
+		flush_workqueue(priv->mfunc.master.comm_wq);
+		destroy_workqueue(priv->mfunc.master.comm_wq);
+		for (i = 0; i < dev->num_slaves; i++) {
+			for (port = 1; port <= MLX4_MAX_PORTS; port++)
+				kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+		}
+		kfree(priv->mfunc.master.slave_state);
+		kfree(priv->mfunc.master.vf_admin);
+		kfree(priv->mfunc.master.vf_oper);
 	}
 
-	return 0;
+	iounmap(priv->mfunc.comm);
 }
 
 void mlx4_cmd_cleanup(struct mlx4_dev *dev)
@@ -363,7 +2326,13 @@
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	pci_pool_destroy(priv->cmd.pool);
-	iounmap(priv->cmd.hcr);
+
+	if (!mlx4_is_slave(dev))
+		iounmap(priv->cmd.hcr);
+	if (mlx4_is_mfunc(dev))
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  priv->mfunc.vhcr, priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
 }
 
 /*
@@ -374,6 +2343,7 @@
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int i;
+	int err = 0;
 
 	priv->cmd.context = kmalloc(priv->cmd.max_cmds *
 				   sizeof (struct mlx4_cmd_context),
@@ -398,11 +2368,10 @@
 		; /* nothing */
 	--priv->cmd.token_mask;
 
+	down(&priv->cmd.poll_sem);
 	priv->cmd.use_events = 1;
 
-	down(&priv->cmd.poll_sem);
-
-	return 0;
+	return err;
 }
 
 /*
@@ -438,11 +2407,14 @@
 		return ERR_PTR(-ENOMEM);
 	}
 
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+
 	return mailbox;
 }
 EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
 
-void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox)
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev,
+			   struct mlx4_cmd_mailbox *mailbox)
 {
 	if (!mailbox)
 		return;
@@ -451,3 +2423,184 @@
 	kfree(mailbox);
 }
 EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
+
+u32 mlx4_comm_get_version(void)
+{
+	 return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER;
+}
+
+static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf)
+{
+	if ((vf < 0) || (vf >= dev->num_vfs)) {
+		mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs);
+		return -EINVAL;
+	}
+	return (vf+1);
+}
+
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->mac = mlx4_mac_to_u64(mac);
+	mlx4_info(dev, "default mac on vf %d port %d to %llX will take afect only after vf restart\n",
+		  vf, port, (unsigned long long) s_info->mac);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_mac);
+
+int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_oper_state *vf_oper;
+	struct mlx4_vport_state *vf_admin;
+	int slave;
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VLAN_CONTROL))
+		return -EPROTONOSUPPORT;
+
+	if ((vlan > 4095) || (qos > 7))
+		return -EINVAL;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+	vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+
+	if ((0 == vlan) && (0 == qos))
+		vf_admin->default_vlan = MLX4_VGT;
+	else
+		vf_admin->default_vlan = vlan;
+	vf_admin->default_qos = qos;
+
+	if (priv->mfunc.master.slave_state[slave].active &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) {
+		mlx4_info(dev, "updating vf %d port %d config params immediately\n",
+			  vf, port);
+		mlx4_master_immediate_activate_vlan_qos(priv, slave, port);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan);
+
+/* mlx4_get_slave_default_vlan -
+ * return true if VST (default vlan);
+ * if VST, fill vlan & qos (if not NULL) */
+bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+
+	priv = mlx4_priv(dev);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		if (vlan)
+			*vlan = vp_oper->state.default_vlan;
+		if (qos)
+			*qos = vp_oper->state.default_qos;
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan);
+
+int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->spoofchk = setting;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk);
+
+int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	struct mlx4_vport_oper_state *vp_oper;
+	int slave;
+	u8 link_stat_event;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	switch (link_state) {
+	case IFLA_VF_LINK_STATE_AUTO:
+		/* get current link state */
+		if (!priv->sense.do_sense_port[port])
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+		else
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+		break;
+
+	case IFLA_VF_LINK_STATE_ENABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+		break;
+
+	case IFLA_VF_LINK_STATE_DISABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+		break;
+
+	default:
+		mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n",
+			  link_state, slave, port);
+		return -EINVAL;
+	}
+	/* update the admin & oper state on the link state */
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	s_info->link_state = link_state;
+	vp_oper->state.link_state = link_state;
+
+	/* send event */
+	mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state);
+
+int mlx4_get_vf_link_state(struct mlx4_dev *dev, int port, int vf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	return s_info->link_state;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vf_link_state);
+

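[Illustrative sketch, not part of the diff: how the 64-bit VHCR DMA
address is rebuilt from the four 16-bit comm-channel parameters,
mirroring the shift sequence in mlx4_master_do_cmd() above; the helper
name is hypothetical.]

#include <stdint.h>

/* VHCR0 carries bits 63:48, VHCR1 bits 47:32, VHCR2 bits 31:16, and
 * VHCR_EN the low 16 bits (its arrival also activates the slave). */
static uint64_t
vhcr_dma_assemble(uint16_t vhcr0, uint16_t vhcr1, uint16_t vhcr2,
    uint16_t vhcr_en)
{
	uint64_t dma;

	dma  = (uint64_t)vhcr0 << 48;
	dma |= (uint64_t)vhcr1 << 32;
	dma |= (uint64_t)vhcr2 << 16;
	dma |= vhcr_en;
	return (dma);
}
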
Modified: trunk/sys/ofed/drivers/net/mlx4/cq.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/cq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/cq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,7 +2,7 @@
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -34,9 +34,8 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/hardirq.h>
-
+#include <linux/module.h>
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/cq.h>
 
@@ -43,27 +42,6 @@
 #include "mlx4.h"
 #include "icm.h"
 
-struct mlx4_cq_context {
-	__be32			flags;
-	u16			reserved1[3];
-	__be16			page_offset;
-	__be32			logsize_usrpage;
-	__be16			cq_period;
-	__be16			cq_max_count;
-	u8			reserved2[3];
-	u8			comp_eqn;
-	u8			log_page_size;
-	u8			reserved3[2];
-	u8			mtt_base_addr_h;
-	__be32			mtt_base_addr_l;
-	__be32			last_notified_index;
-	__be32			solicit_producer_index;
-	__be32			consumer_index;
-	__be32			producer_index;
-	u32			reserved4[2];
-	__be64			db_rec_addr;
-};
-
 #define MLX4_CQ_STATUS_OK		( 0 << 28)
 #define MLX4_CQ_STATUS_OVERFLOW		( 9 << 28)
 #define MLX4_CQ_STATUS_WRITE_FAIL	(10 << 28)
@@ -75,10 +53,18 @@
 
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
 {
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
 	struct mlx4_cq *cq;
 
+	read_lock(&cq_table->cq_table_lock);
+
 	cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
 			       cqn & (dev->caps.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+
+	read_unlock(&cq_table->cq_table_lock);
+
 	if (!cq) {
 		mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
 		return;
@@ -87,6 +73,9 @@
 	++cq->arm_sn;
 
 	cq->comp(cq);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
 }
 
 void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
@@ -94,13 +83,13 @@
 	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
 	struct mlx4_cq *cq;
 
-	spin_lock(&cq_table->lock);
+	read_lock(&cq_table->cq_table_lock);
 
 	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
 	if (cq)
 		atomic_inc(&cq->refcount);
 
-	spin_unlock(&cq_table->lock);
+	read_unlock(&cq_table->cq_table_lock);
 
 	if (!cq) {
 		mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
@@ -116,8 +105,9 @@
 static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			 int cq_num)
 {
-	return mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ,
-			MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd(dev, mailbox->dma, cq_num, 0,
+			MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
@@ -124,15 +114,15 @@
 			 int cq_num, u32 opmod)
 {
 	return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ,
-			MLX4_CMD_TIME_CLASS_A);
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			 int cq_num)
 {
-	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, cq_num,
-			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
-			    MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+			    cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 }
 
 int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
@@ -187,26 +177,105 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_cq_resize);
 
-static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv)
+int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq)
 {
-	int i;
-	int index = 0;
-	int min = priv->eq_table.eq[0].load;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	int err;
 
-	for (i = 1; i < priv->dev.caps.num_comp_vectors; i++) {
-		if (priv->eq_table.eq[i].load < min) {
-			index = i;
-			min = priv->eq_table.eq[i].load;
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	memset(cq_context, 0, sizeof *cq_context);
+
+	cq_context->flags |= cpu_to_be32(MLX4_CQ_FLAG_OI);
+
+	err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 3);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_ignore_overrun);
+
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	*cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+	if (*cqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &cq_table->table, *cqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn);
+	if (err)
+		goto err_put;
+	return 0;
+
+err_put:
+	mlx4_table_put(dev, &cq_table->table, *cqn);
+
+err_out:
+	mlx4_bitmap_free(&cq_table->bitmap, *cqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+		else {
+			*cqn = get_param_l(&out_param);
+			return 0;
 		}
 	}
+	return __mlx4_cq_alloc_icm(dev, cqn);
+}
 
-	return index;
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+
+	mlx4_table_put(dev, &cq_table->cmpt_table, cqn);
+	mlx4_table_put(dev, &cq_table->table, cqn);
+	mlx4_bitmap_free(&cq_table->bitmap, cqn, MLX4_NO_RR);
 }
 
-int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
-		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
-		  unsigned vector, int collapsed)
+static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
 {
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, cqn);
+		err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed freeing cq:%d\n", cqn);
+	} else
+		__mlx4_cq_free_icm(dev, cqn);
+}
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
+		  struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec,
+		  struct mlx4_cq *cq, unsigned vector, int collapsed,
+		  int timestamp_en)
+{
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cq_table *cq_table = &priv->cq_table;
 	struct mlx4_cmd_mailbox *mailbox;
@@ -214,29 +283,20 @@
 	u64 mtt_addr;
 	int err;
 
-	cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ?
-		mlx4_find_least_loaded_vector(priv) : vector;
-
-	if (cq->vector >= dev->caps.num_comp_vectors)
+	if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool)
 		return -EINVAL;
 
-	cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
-	if (cq->cqn == -1)
-		return -ENOMEM;
+	cq->vector = vector;
 
-	err = mlx4_table_get(dev, &cq_table->table, cq->cqn);
+	err = mlx4_cq_alloc_icm(dev, &cq->cqn);
 	if (err)
-		goto err_out;
+		return err;
 
-	err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn);
-	if (err)
-		goto err_put;
-
 	spin_lock_irq(&cq_table->lock);
 	err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
 	spin_unlock_irq(&cq_table->lock);
 	if (err)
-		goto err_cmpt_put;
+		goto err_icm;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox)) {
@@ -248,8 +308,11 @@
 	memset(cq_context, 0, sizeof *cq_context);
 
 	cq_context->flags	    = cpu_to_be32(!!collapsed << 18);
+	if (timestamp_en)
+		cq_context->flags  |= cpu_to_be32(1 << 19);
+
 	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
-	cq_context->comp_eqn	    = priv->eq_table.eq[cq->vector].eqn;
+	cq_context->comp_eqn	    = priv->eq_table.eq[vector].eqn;
 	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 
 	mtt_addr = mlx4_mtt_addr(dev, mtt);
@@ -262,7 +325,6 @@
 	if (err)
 		goto err_radix;
 
-	priv->eq_table.eq[cq->vector].load++;
 	cq->cons_index = 0;
 	cq->arm_sn     = 1;
 	cq->uar        = uar;
@@ -269,6 +331,9 @@
 	atomic_set(&cq->refcount, 1);
 	init_completion(&cq->free);
 
+	cq->eqn = priv->eq_table.eq[cq->vector].eqn;
+	cq->irq = priv->eq_table.eq[cq->vector].irq;
+
 	return 0;
 
 err_radix:
@@ -276,15 +341,9 @@
 	radix_tree_delete(&cq_table->tree, cq->cqn);
 	spin_unlock_irq(&cq_table->lock);
 
-err_cmpt_put:
-	mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+err_icm:
+	mlx4_cq_free_icm(dev, cq->cqn);
 
-err_put:
-	mlx4_table_put(dev, &cq_table->table, cq->cqn);
-
-err_out:
-	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
-
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
@@ -300,7 +359,6 @@
 		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
 
 	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
-	priv->eq_table.eq[cq->vector].load--;
 
 	spin_lock_irq(&cq_table->lock);
 	radix_tree_delete(&cq_table->tree, cq->cqn);
@@ -310,8 +368,7 @@
 		complete(&cq->free);
 	wait_for_completion(&cq->free);
 
-	mlx4_table_put(dev, &cq_table->table, cq->cqn);
-	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+	mlx4_cq_free_icm(dev, cq->cqn);
 }
 EXPORT_SYMBOL_GPL(mlx4_cq_free);
 
@@ -321,7 +378,10 @@
 	int err;
 
 	spin_lock_init(&cq_table->lock);
+	rwlock_init(&cq_table->cq_table_lock);
 	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
 
 	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
 			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
@@ -333,6 +393,8 @@
 
 void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
 {
+	if (mlx4_is_slave(dev))
+		return;
 	/* Nothing to do to clean up radix_tree */
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap);
 }

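[Illustrative sketch, not part of the diff: the lookup-then-reference
idiom this change brings to mlx4_cq_completion()/mlx4_cq_event(), with
simplified types; cq_table_lock and cq_lookup() are stand-ins for the
new rwlock and the radix-tree lookup.]

/* Pin the CQ before dropping the lock so a concurrent mlx4_cq_free()
 * cannot tear it down while the completion handler runs; the free path
 * drops its own reference and then wait_for_completion(&cq->free). */
static void cq_completion(u32 cqn)
{
	struct mlx4_cq *cq;

	read_lock(&cq_table_lock);
	cq = cq_lookup(cqn);			/* hypothetical lookup */
	if (cq)
		atomic_inc(&cq->refcount);	/* pin while in use */
	read_unlock(&cq_table_lock);

	if (!cq)
		return;				/* bogus CQN, nothing to do */

	cq->comp(cq);				/* run the handler */

	if (atomic_dec_and_test(&cq->refcount))
		complete(&cq->free);		/* last ref: wake the freer */
}
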
Modified: trunk/sys/ofed/drivers/net/mlx4/en_cq.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_cq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_cq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,12 +31,13 @@
  *
  */
 
-#include "mlx4_en.h"
-
 #include <linux/mlx4/cq.h>
 #include <linux/mlx4/qp.h>
 #include <linux/mlx4/cmd.h>
 
+#include "mlx4_en.h"
+
+
 static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
 {
 	return;
@@ -44,52 +45,72 @@
 
 
 int mlx4_en_create_cq(struct mlx4_en_priv *priv,
-		      struct mlx4_en_cq *cq,
-		      int entries, int ring, enum cq_type mode)
+		      struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode,
+		      int node)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
 	int err;
 
+	cq = kzalloc_node(sizeof(struct mlx4_en_cq), GFP_KERNEL, node);
+	if (!cq) {
+		cq = kzalloc(sizeof(struct mlx4_en_cq), GFP_KERNEL);
+		if (!cq) {
+			en_err(priv, "Failed to allocate CW struture\n");
+			return -ENOMEM;
+		}
+	}
+
 	cq->size = entries;
+	cq->buf_size = cq->size * mdev->dev->caps.cqe_size;
+
 	cq->tq = taskqueue_create_fast("mlx4_en_que", M_NOWAIT,
-	    taskqueue_thread_enqueue, &cq->tq);
-	if (mode == RX) {
-		cq->buf_size = cq->size * sizeof(struct mlx4_cqe);
-		cq->vector   = (ring + priv->port) %
-				mdev->dev->caps.num_comp_vectors;
+	    taskqueue_thread_enqueue, &cq->tq);
+	if (mode == RX) {
 		TASK_INIT(&cq->cq_task, 0, mlx4_en_rx_que, cq);
 		taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s rx cq",
-		    if_name(priv->dev));
+				if_name(priv->dev));
+
 	} else {
-		cq->buf_size = sizeof(struct mlx4_cqe);
-		cq->vector   = MLX4_LEAST_ATTACHED_VECTOR;
 		TASK_INIT(&cq->cq_task, 0, mlx4_en_tx_que, cq);
 		taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s tx cq",
-		    if_name(priv->dev));
+				if_name(priv->dev));
 	}
 
 	cq->ring = ring;
 	cq->is_tx = mode;
-	mtx_init(&cq->lock.m, "mlx4 cq", NULL, MTX_DEF);
+	spin_lock_init(&cq->lock);
 
 	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
 				cq->buf_size, 2 * PAGE_SIZE);
 	if (err)
-		return err;
+		goto err_cq;
 
 	err = mlx4_en_map_buffer(&cq->wqres.buf);
 	if (err)
-		mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
-	else
-		cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf;
+		goto err_res;
 
+	cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf;
+	*pcq = cq;
+
+	return 0;
+
+err_res:
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+err_cq:
+	kfree(cq);
 	return err;
 }
 
-int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
-	int err;
+	int err = 0;
+	char name[25];
+	int timestamp_en = 0;
 
 	cq->dev = mdev->pndev[priv->port];
 	cq->mcq.set_ci_db  = cq->wqres.db.db;
@@ -98,11 +119,40 @@
 	*cq->mcq.arm_db    = 0;
 	memset(cq->buf, 0, cq->buf_size);
 
+	if (cq->is_tx == RX) {
+		if (mdev->dev->caps.comp_pool) {
+			if (!cq->vector) {
+				sprintf(name, "%s-%d", if_name(priv->dev),
+					cq->ring);
+				/* Set IRQ for specific name (per ring) */
+				if (mlx4_assign_eq(mdev->dev, name, &cq->vector)) {
+					cq->vector = (cq->ring + 1 + priv->port)
+					    % mdev->dev->caps.num_comp_vectors;
+					mlx4_warn(mdev, "Failed Assigning an EQ to "
+						  "%s ,Falling back to legacy EQ's\n",
+						  name);
+				}
+			}
+		} else {
+			cq->vector = (cq->ring + 1 + priv->port) %
+				mdev->dev->caps.num_comp_vectors;
+		}
+	} else {
+		struct mlx4_en_cq *rx_cq;
+		/*
+		 * For TX we use the same irq per
+		 * ring we assigned for the RX
+		 */
+		cq_idx = cq_idx % priv->rx_ring_num;
+		rx_cq = priv->rx_cq[cq_idx];
+		cq->vector = rx_cq->vector;
+	}
+
 	if (!cq->is_tx)
-		cq->size = priv->rx_ring[cq->ring].actual_size;
-
-	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar,
-			    cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx);
+		cq->size = priv->rx_ring[cq->ring]->actual_size;
+	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt,
+			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
+			    cq->vector, 0, timestamp_en);
 	if (err)
 		return err;
 
@@ -109,39 +159,43 @@
 	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
 	cq->mcq.event = mlx4_en_cq_event;
 
-	if (cq->is_tx) {
-		init_timer(&cq->timer);
-		cq->timer.function = mlx4_en_poll_tx_cq;
-		cq->timer.data = (unsigned long) cq;
-	}
+	if (cq->is_tx) {
+		init_timer(&cq->timer);
+		cq->timer.function = mlx4_en_poll_tx_cq;
+		cq->timer.data = (unsigned long) cq;
+	}
 
+
 	return 0;
 }
 
-void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq = *pcq;
 
 	taskqueue_drain(cq->tq, &cq->cq_task);
 	taskqueue_free(cq->tq);
 	mlx4_en_unmap_buffer(&cq->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
-	cq->buf_size = 0;
-	cq->buf = NULL;
-	mtx_destroy(&cq->lock.m);
+	if (priv->mdev->dev->caps.comp_pool && cq->vector)
+		mlx4_release_eq(priv->mdev->dev, cq->vector);
+	kfree(cq);
+	*pcq = NULL;
 }
 
 void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_dev *mdev = priv->mdev;
 
-	taskqueue_drain(cq->tq, &cq->cq_task);
-	if (cq->is_tx)
-		del_timer(&cq->timer);
+	taskqueue_drain(cq->tq, &cq->cq_task);
+	if (cq->is_tx)
+		del_timer(&cq->timer);
 
-	mlx4_cq_free(mdev->dev, &cq->mcq);
+	mlx4_cq_free(mdev->dev, &cq->mcq);
 }
 
+
 /* Set rx cq moderation parameters */
 int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
 {

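[Illustrative sketch, not part of the diff: how a caller drives the
reworked en_cq API now that mlx4_en_create_cq() allocates the structure
itself and returns it through a pointer-to-pointer; 'priv', 'entries',
'ring' and 'node' are assumed from the caller's context and error
handling is trimmed.]

	struct mlx4_en_cq *cq;
	int err;

	/* create: allocates the struct (NUMA-local when possible) and
	 * the hardware queue resources */
	err = mlx4_en_create_cq(priv, &cq, entries, ring, RX, node);
	if (err)
		return (err);

	/* activate: picks an EQ vector and calls mlx4_cq_alloc() */
	err = mlx4_en_activate_cq(priv, cq, ring);
	if (err) {
		mlx4_en_destroy_cq(priv, &cq);	/* frees and NULLs cq */
		return (err);
	}
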
Modified: trunk/sys/ofed/drivers/net/mlx4/en_main.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/netdevice.h>
+#include <linux/slab.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/device.h>
@@ -41,15 +42,8 @@
 
 #include "mlx4_en.h"
 
-MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
-MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")");
+/* Mellanox ConnectX HCA Ethernet driver */
 
-static const char mlx4_en_version[] =
-	DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
-	DRV_VERSION " (" DRV_RELDATE ")\n";
-
 #define MLX4_EN_PARM_INT(X, def_val, desc) \
 	static unsigned int X = def_val;\
 	module_param(X , uint, 0444); \
@@ -60,21 +54,10 @@
  * Device scope module parameters
  */
 
-
-/* Enable RSS TCP traffic */
-MLX4_EN_PARM_INT(tcp_rss, 1,
-		 "Enable RSS for incomming TCP traffic or disabled (0)");
 /* Enable RSS UDP traffic */
 MLX4_EN_PARM_INT(udp_rss, 1,
-		 "Enable RSS for incomming UDP traffic or disabled (0)");
+		 "Enable RSS for incoming UDP traffic");
 
-/* Number of LRO sessions per Rx ring (rounded up to a power of two) */
-MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS,
-		 "Number of LRO sessions per ring or disabled (0)");
-
-/* Allow reassembly of fragmented IP packets */
-MLX4_EN_PARM_INT(ip_reasm, 1, "Allow reassembly of fragmented IP packets (!0)");
-
 /* Priority pausing */
 MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
 			   " Per priority bit mask");
@@ -81,19 +64,23 @@
 MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
 			   " Per priority bit mask");
 
+#define MAX_PFC_TX	0xff
+#define MAX_PFC_RX	0xff
+
+
 static int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
 {
 	struct mlx4_en_profile *params = &mdev->profile;
 	int i;
 
-	params->tcp_rss = tcp_rss;
 	params->udp_rss = udp_rss;
-	if (params->udp_rss && !mdev->dev->caps.udp_rss) {
+	params->num_tx_rings_p_up = min_t(int, mp_ncpus,
+			MLX4_EN_MAX_TX_RING_P_UP);
+	if (params->udp_rss && !(mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UDP_RSS)) {
 		mlx4_warn(mdev, "UDP RSS is not supported on this device.\n");
 		params->udp_rss = 0;
 	}
-	params->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS);
-	params->ip_reasm = ip_reasm;
 	for (i = 1; i <= MLX4_MAX_PORTS; i++) {
 		params->prof[i].rx_pause = 1;
 		params->prof[i].rx_ppp = pfcrx;
@@ -101,14 +88,15 @@
 		params->prof[i].tx_ppp = pfctx;
 		params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
 		params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
-		params->prof[i].tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 +
-			(!!pfcrx) * MLX4_EN_NUM_PPP_RINGS;
+		params->prof[i].tx_ring_num = params->num_tx_rings_p_up *
+			MLX4_EN_NUM_UP;
+		params->prof[i].rss_rings = 0;
 	}
 
 	return 0;
 }
 
-static void *get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
+static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
 {
 	struct mlx4_en_dev *endev = ctx;
 
@@ -116,18 +104,17 @@
 }
 
 static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
-			  enum mlx4_dev_event event, int port)
+			  enum mlx4_dev_event event, unsigned long port)
 {
 	struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr;
 	struct mlx4_en_priv *priv;
 
-	if (!mdev->pndev[port])
-		return;
-
-	priv = netdev_priv(mdev->pndev[port]);
 	switch (event) {
 	case MLX4_DEV_EVENT_PORT_UP:
 	case MLX4_DEV_EVENT_PORT_DOWN:
+		if (!mdev->pndev[port])
+			return;
+		priv = netdev_priv(mdev->pndev[port]);
 		/* To prevent races, we poll the link state in a separate
 		  task rather than changing it here */
 		priv->link_state = event;
@@ -138,8 +125,15 @@
 		mlx4_err(mdev, "Internal error detected, restarting device\n");
 		break;
 
+	case MLX4_DEV_EVENT_SLAVE_INIT:
+	case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+		break;
 	default:
-		mlx4_warn(mdev, "Unhandled event: %d\n", event);
+		if (port < 1 || port > dev->caps.num_ports ||
+		    !mdev->pndev[port])
+			return;
+		mlx4_warn(mdev, "Unhandled event %d for port %d\n", event,
+			  (int) port);
 	}
 }
 
@@ -146,7 +140,7 @@
 static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
 {
 	struct mlx4_en_dev *mdev = endev_ptr;
-	int i;
+	int i, ret;
 
 	mutex_lock(&mdev->state_lock);
 	mdev->device_up = false;
@@ -158,26 +152,21 @@
 
 	flush_workqueue(mdev->workqueue);
 	destroy_workqueue(mdev->workqueue);
-	mlx4_mr_free(dev, &mdev->mr);
+	ret = mlx4_mr_free(dev, &mdev->mr);
+	if (ret)
+		mlx4_err(mdev, "Error deregistering MR. The system may have become unstable.");
+	iounmap(mdev->uar_map);
 	mlx4_uar_free(dev, &mdev->priv_uar);
 	mlx4_pd_free(dev, mdev->priv_pdn);
-	sx_destroy(&mdev->state_lock.sx);
-	mtx_destroy(&mdev->uar_lock.m);
 	kfree(mdev);
 }
 
 static void *mlx4_en_add(struct mlx4_dev *dev)
 {
-	static int mlx4_en_version_printed;
 	struct mlx4_en_dev *mdev;
 	int i;
 	int err;
 
-	if (!mlx4_en_version_printed) {
-		printk(KERN_INFO "%s", mlx4_en_version);
-		mlx4_en_version_printed++;
-	}
-
 	mdev = kzalloc(sizeof *mdev, GFP_KERNEL);
 	if (!mdev) {
 		dev_err(&dev->pdev->dev, "Device struct alloc failed, "
@@ -192,10 +181,11 @@
 	if (mlx4_uar_alloc(dev, &mdev->priv_uar))
 		goto err_pd;
 
-	mtx_init(&mdev->uar_lock.m, "mlx4 uar", NULL, MTX_DEF);
-	mdev->uar_map = ioremap(mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT,
+				PAGE_SIZE);
 	if (!mdev->uar_map)
 		goto err_uar;
+	spin_lock_init(&mdev->uar_lock);
 
 	mdev->dev = dev;
 	mdev->dma_device = &(dev->pdev->dev);
@@ -211,7 +201,7 @@
 			 MLX4_PERM_LOCAL_WRITE |  MLX4_PERM_LOCAL_READ,
 			 0, 0, &mdev->mr)) {
 		mlx4_err(mdev, "Failed allocating memory region\n");
-		goto err_uar;
+		goto err_map;
 	}
 	if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
 		mlx4_err(mdev, "Failed enabling memory region\n");
@@ -225,21 +215,24 @@
 		goto err_mr;
 	}
 
-	/* Configure wich ports to start according to module parameters */
+	/* Configure which ports to start according to module parameters */
 	mdev->port_cnt = 0;
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
 		mdev->port_cnt++;
 
-	/* If we did not receive an explicit number of Rx rings, default to
-	 * the number of completion vectors populated by the mlx4_core */
+
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		mlx4_info(mdev, "Using %d tx rings for port:%d\n",
-			  mdev->profile.prof[i].tx_ring_num, i);
-		mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(
-			min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS));
-
-		mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n",
-			  mdev->profile.prof[i].rx_ring_num, i);
+		if (!dev->caps.comp_pool) {
+			mdev->profile.prof[i].rx_ring_num =
+				rounddown_pow_of_two(max_t(int, MIN_RX_RINGS,
+							   min_t(int,
+								 dev->caps.num_comp_vectors,
+								 DEF_RX_RINGS)));
+		} else {
+			mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(
+				min_t(int, dev->caps.comp_pool /
+				      dev->caps.num_ports, MAX_MSIX_P_PORT));
+		}
 	}
 
 	/* Create our own workqueue for reset/multicast tasks
@@ -253,7 +246,7 @@
 
 	/* At this stage all non-port specific tasks are complete:
 	 * mark the card state as up */
-	sx_init(&mdev->state_lock.sx, "mlxen state");
+	mutex_init(&mdev->state_lock);
 	mdev->device_up = true;
 
 	/* Setup ports */
@@ -261,32 +254,20 @@
 	/* Create a netdev for each port */
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
 		mlx4_info(mdev, "Activating port:%d\n", i);
-		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) {
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
 			mdev->pndev[i] = NULL;
-			goto err_free_netdev;
-		}
 	}
+
 	return mdev;
 
-
-err_free_netdev:
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		if (mdev->pndev[i])
-			mlx4_en_destroy_netdev(mdev->pndev[i]);
-	}
-
-	mutex_lock(&mdev->state_lock);
-	mdev->device_up = false;
-	mutex_unlock(&mdev->state_lock);
-	flush_workqueue(mdev->workqueue);
-
-	/* Stop event queue before we drop down to release shared SW state */
-	destroy_workqueue(mdev->workqueue);
-
 err_mr:
-	mlx4_mr_free(dev, &mdev->mr);
+	err = mlx4_mr_free(dev, &mdev->mr);
+	if (err)
+		mlx4_err(mdev, "Error deregistering MR. The system may have become unstable.");
+err_map:
+	if (mdev->uar_map)
+		iounmap(mdev->uar_map);
 err_uar:
-	mtx_destroy(&mdev->uar_lock.m);
 	mlx4_uar_free(dev, &mdev->priv_uar);
 err_pd:
 	mlx4_pd_free(dev, mdev->priv_pdn);
@@ -296,67 +277,42 @@
 	return NULL;
 }
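
The RX ring count chosen in mlx4_en_add() above reduces to two clamped,
power-of-two-rounded formulas: without a dedicated completion-vector pool the
EQ count is clamped between MIN_RX_RINGS and DEF_RX_RINGS; with a pool it is
the per-port share of the pool capped at MAX_MSIX_P_PORT. A hedged
stand-alone sketch (all three constants are assumed values):

/* rx_rings_sketch.c - ring-count selection from mlx4_en_add(). */
#include <stdio.h>

#define MIN_RX_RINGS    4	/* assumed */
#define DEF_RX_RINGS    8	/* assumed */
#define MAX_MSIX_P_PORT 8	/* assumed */

static int rounddown_pow_of_two(int n)
{
	int p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	int num_comp_vectors = 6, comp_pool = 0, num_ports = 2;
	int rings;

	if (comp_pool == 0)
		/* no dedicated pool: clamp the EQ count, round down */
		rings = rounddown_pow_of_two(
		    clamp_int(num_comp_vectors, MIN_RX_RINGS, DEF_RX_RINGS));
	else
		/* dedicated pool: split it across ports, cap per port */
		rings = rounddown_pow_of_two(
		    comp_pool / num_ports < MAX_MSIX_P_PORT ?
		    comp_pool / num_ports : MAX_MSIX_P_PORT);

	printf("%d rx rings\n", rings);	/* 4 for these inputs */
	return 0;
}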
 
-enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev)
+static struct mlx4_interface mlx4_en_interface = {
+	.add		= mlx4_en_add,
+	.remove		= mlx4_en_remove,
+	.event		= mlx4_en_event,
+	.get_dev	= mlx4_en_get_netdev,
+	.protocol	= MLX4_PROT_ETH,
+};
+
+static void mlx4_en_verify_params(void)
 {
-	struct mlx4_en_dev *mdev = endev_ptr;
-	struct net_device *netdev = int_dev;
-	int p;
-	
-	for (p = 1; p <= MLX4_MAX_PORTS; ++p)
-		if (mdev->pndev[p] == netdev)
-			return p;
+	if (pfctx > MAX_PFC_TX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - "
+			"should be in range 0-0x%x, will be changed to default (0)\n",
+			pfctx, MAX_PFC_TX);
+		pfctx = 0;
+	}
 
-	return MLX4_QUERY_NOT_MINE;
+	if (pfcrx > MAX_PFC_RX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - "
+			"should be in range 0-0x%x, will be changed to default (0)\n",
+			pfcrx, MAX_PFC_RX);
+		pfcrx = 0;
+	}
 }
 
-#if 0
-static struct pci_device_id mlx4_en_pci_table[] = {
-	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
-	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
-	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
-	{ PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
-	{ PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
-	{ PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
-	{ PCI_VDEVICE(MELLANOX, 0x1000) },
-	{ PCI_VDEVICE(MELLANOX, 0x1001) },
-	{ PCI_VDEVICE(MELLANOX, 0x1002) },
-	{ PCI_VDEVICE(MELLANOX, 0x1003) },
-	{ PCI_VDEVICE(MELLANOX, 0x1004) },
-	{ PCI_VDEVICE(MELLANOX, 0x1005) },
-	{ PCI_VDEVICE(MELLANOX, 0x1006) },
-	{ PCI_VDEVICE(MELLANOX, 0x1007) },
-	{ PCI_VDEVICE(MELLANOX, 0x1008) },
-	{ PCI_VDEVICE(MELLANOX, 0x1009) },
-	{ PCI_VDEVICE(MELLANOX, 0x100a) },
-	{ PCI_VDEVICE(MELLANOX, 0x100b) },
-	{ PCI_VDEVICE(MELLANOX, 0x100c) },
-	{ PCI_VDEVICE(MELLANOX, 0x100d) },
-	{ PCI_VDEVICE(MELLANOX, 0x100e) },
-	{ PCI_VDEVICE(MELLANOX, 0x100f) },
-	{ 0, }
-};
 
-MODULE_DEVICE_TABLE(pci, mlx4_en_pci_table);
-#endif
-
-static struct mlx4_interface mlx4_en_interface = {
-	.add	= mlx4_en_add,
-	.remove	= mlx4_en_remove,
-	.event	= mlx4_en_event,
-	.query  = mlx4_en_query,
-	.get_prot_dev	= get_netdev,
-	.protocol	= MLX4_PROT_EN,
-};
-
 static int __init mlx4_en_init(void)
 {
+#ifdef CONFIG_DEBUG_FS
+	int err;
+#endif
+
+	mlx4_en_verify_params();
+
+#ifdef CONFIG_DEBUG_FS
+	err = mlx4_en_register_debugfs();
+	if (err)
+		pr_err("Failed to register debugfs\n");
+#endif
 	return mlx4_register_interface(&mlx4_en_interface);
 }
 
@@ -363,6 +319,9 @@
 static void __exit mlx4_en_cleanup(void)
 {
 	mlx4_unregister_interface(&mlx4_en_interface);
+#ifdef CONFIG_DEBUG_FS
+	mlx4_en_unregister_debugfs();
+#endif
 }
 
 module_init(mlx4_en_init);
@@ -379,5 +338,5 @@
         .name = "mlxen",
 	.evhand = mlxen_evhand,
 };
-DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_SMP, SI_ORDER_ANY);
+DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY);
 MODULE_DEPEND(mlxen, mlx4, 1, 1, 1);

Modified: trunk/sys/ofed/drivers/net/mlx4/en_netdev.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_netdev.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_netdev.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,201 +31,816 @@
  *
  */
 
-#include "mlx4_en.h"
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#include <net/busy_poll.h>
+#endif
 
+#include <linux/list.h>
+#include <linux/if_ether.h>
+
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/cq.h>
 
-#include <linux/delay.h>
-#include <net/ethernet.h>
-#include <net/if_vlan_var.h>
 #include <sys/sockio.h>
+#include <sys/sysctl.h>
 
+#include "mlx4_en.h"
+#include "en_port.h"
+
 static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv);
+static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv);
+static int mlx4_en_unit;
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+/* must be called with local_bh_disable()d */
+static int mlx4_en_low_latency_recv(struct napi_struct *napi)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring];
+	int done;
+
+	if (!priv->port_up)
+		return LL_FLUSH_FAILED;
+
+	if (!mlx4_en_cq_lock_poll(cq))
+		return LL_FLUSH_BUSY;
+
+	done = mlx4_en_process_rx_cq(dev, cq, 4);
+#ifdef LL_EXTENDED_STATS
+	if (done)
+		rx_ring->cleaned += done;
+	else
+		rx_ring->misses++;
+#endif
+
+	mlx4_en_cq_unlock_poll(cq);
+
+	return done;
+}
+#endif	/* CONFIG_NET_RX_BUSY_POLL */
+
+#ifdef CONFIG_RFS_ACCEL
+
+struct mlx4_en_filter {
+	struct list_head next;
+	struct work_struct work;
+
+	u8     ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+
+	int rxq_index;
+	struct mlx4_en_priv *priv;
+	u32 flow_id;			/* RFS infrastructure id */
+	int id;				/* mlx4_en driver id */
+	u64 reg_id;			/* Flow steering API id */
+	u8 activated;			/* Used to prevent expiry before filter
+					 * is attached
+					 */
+	struct hlist_node filter_chain;
+};
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv);
+
+static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto)
+{
+	switch (ip_proto) {
+	case IPPROTO_UDP:
+		return MLX4_NET_TRANS_RULE_ID_UDP;
+	case IPPROTO_TCP:
+		return MLX4_NET_TRANS_RULE_ID_TCP;
+	default:
+		return -EPROTONOSUPPORT;
+	}
+}
+
+static void mlx4_en_filter_work(struct work_struct *work)
+{
+	struct mlx4_en_filter *filter = container_of(work,
+						     struct mlx4_en_filter,
+						     work);
+	struct mlx4_en_priv *priv = filter->priv;
+	struct mlx4_spec_list spec_tcp_udp = {
+		.id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto),
+		{
+			.tcp_udp = {
+				.dst_port = filter->dst_port,
+				.dst_port_msk = (__force __be16)-1,
+				.src_port = filter->src_port,
+				.src_port_msk = (__force __be16)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_ip = {
+		.id = MLX4_NET_TRANS_RULE_ID_IPV4,
+		{
+			.ipv4 = {
+				.dst_ip = filter->dst_ip,
+				.dst_ip_msk = (__force __be32)-1,
+				.src_ip = filter->src_ip,
+				.src_ip_msk = (__force __be32)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_eth = {
+		.id = MLX4_NET_TRANS_RULE_ID_ETH,
+	};
+	struct mlx4_net_trans_rule rule = {
+		.list = LIST_HEAD_INIT(rule.list),
+		.queue_mode = MLX4_NET_TRANS_Q_LIFO,
+		.exclusive = 1,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+		.port = priv->port,
+		.priority = MLX4_DOMAIN_RFS,
+	};
+	int rc;
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	if (spec_tcp_udp.id < 0) {
+		en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n",
+			filter->ip_proto);
+		goto ignore;
+	}
+	list_add_tail(&spec_eth.list, &rule.list);
+	list_add_tail(&spec_ip.list, &rule.list);
+	list_add_tail(&spec_tcp_udp.list, &rule.list);
+
+	rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn;
+	memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN);
+	memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	filter->activated = 0;
+
+	if (filter->reg_id) {
+		rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+		if (rc && rc != -ENOENT)
+			en_err(priv, "Error detaching flow. rc = %d\n", rc);
+	}
+
+	rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id);
+	if (rc)
+		en_err(priv, "Error attaching flow. err = %d\n", rc);
+
+ignore:
+	mlx4_en_filter_rfs_expire(priv);
+
+	filter->activated = 1;
+}
+
+static inline struct hlist_head *
+filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		   __be16 src_port, __be16 dst_port)
+{
+	unsigned long l;
+	int bucket_idx;
+
+	l = (__force unsigned long)src_port |
+	    ((__force unsigned long)dst_port << 2);
+	l ^= (__force unsigned long)(src_ip ^ dst_ip);
+
+	bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT);
+
+	return &priv->filter_hash[bucket_idx];
+}
+
+static struct mlx4_en_filter *
+mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip,
+		     __be32 dst_ip, u8 ip_proto, __be16 src_port,
+		     __be16 dst_port, u32 flow_id)
+{
+	struct mlx4_en_filter *filter = NULL;
+
+	filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC);
+	if (!filter)
+		return NULL;
+
+	filter->priv = priv;
+	filter->rxq_index = rxq_index;
+	INIT_WORK(&filter->work, mlx4_en_filter_work);
+
+	filter->src_ip = src_ip;
+	filter->dst_ip = dst_ip;
+	filter->ip_proto = ip_proto;
+	filter->src_port = src_port;
+	filter->dst_port = dst_port;
+
+	filter->flow_id = flow_id;
+
+	filter->id = priv->last_filter_id++ % RPS_NO_FILTER;
+
+	list_add_tail(&filter->next, &priv->filters);
+	hlist_add_head(&filter->filter_chain,
+		       filter_hash_bucket(priv, src_ip, dst_ip, src_port,
+					  dst_port));
+
+	return filter;
+}
+
+static void mlx4_en_filter_free(struct mlx4_en_filter *filter)
+{
+	struct mlx4_en_priv *priv = filter->priv;
+	int rc;
+
+	list_del(&filter->next);
+
+	rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+	if (rc && rc != -ENOENT)
+		en_err(priv, "Error detaching flow. rc = %d\n", rc);
+
+	kfree(filter);
+}
+
+static inline struct mlx4_en_filter *
+mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		    u8 ip_proto, __be16 src_port, __be16 dst_port)
+{
+	struct hlist_node *elem;
+	struct mlx4_en_filter *filter;
+	struct mlx4_en_filter *ret = NULL;
+
+	hlist_for_each_entry(filter, elem,
+			     filter_hash_bucket(priv, src_ip, dst_ip,
+						src_port, dst_port),
+			     filter_chain) {
+		if (filter->src_ip == src_ip &&
+		    filter->dst_ip == dst_ip &&
+		    filter->ip_proto == ip_proto &&
+		    filter->src_port == src_port &&
+		    filter->dst_port == dst_port) {
+			ret = filter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id)
+{
+	struct mlx4_en_priv *priv = netdev_priv(net_dev);
+	struct mlx4_en_filter *filter;
+	const struct iphdr *ip;
+	const __be16 *ports;
+	u8 ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	int nhoff = skb_network_offset(skb);
+	int ret = 0;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EPROTONOSUPPORT;
+
+	ip = (const struct iphdr *)(skb->data + nhoff);
+	if (ip_is_fragment(ip))
+		return -EPROTONOSUPPORT;
+
+	if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP))
+		return -EPROTONOSUPPORT;
+	ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+
+	ip_proto = ip->protocol;
+	src_ip = ip->saddr;
+	dst_ip = ip->daddr;
+	src_port = ports[0];
+	dst_port = ports[1];
+
+	spin_lock_bh(&priv->filters_lock);
+	filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto,
+				     src_port, dst_port);
+	if (filter) {
+		if (filter->rxq_index == rxq_index)
+			goto out;
+
+		filter->rxq_index = rxq_index;
+	} else {
+		filter = mlx4_en_filter_alloc(priv, rxq_index,
+					      src_ip, dst_ip, ip_proto,
+					      src_port, dst_port, flow_id);
+		if (!filter) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	queue_work(priv->mdev->workqueue, &filter->work);
+
+out:
+	ret = filter->id;
+err:
+	spin_unlock_bh(&priv->filters_lock);
+
+	return ret;
+}
+
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *rx_ring)
+{
+	struct mlx4_en_filter *filter, *tmp;
+	LIST_HEAD(del_list);
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		list_move(&filter->next, &del_list);
+		hlist_del(&filter->filter_chain);
+	}
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next) {
+		cancel_work_sync(&filter->work);
+		mlx4_en_filter_free(filter);
+	}
+}
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL;
+	LIST_HEAD(del_list);
+	int i = 0;
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		if (i > MLX4_EN_FILTER_EXPIRY_QUOTA)
+			break;
+
+		if (filter->activated &&
+		    !work_pending(&filter->work) &&
+		    rps_may_expire_flow(priv->dev,
+					filter->rxq_index, filter->flow_id,
+					filter->id)) {
+			list_move(&filter->next, &del_list);
+			hlist_del(&filter->filter_chain);
+		} else
+			last_filter = filter;
+
+		i++;
+	}
+
+	if (last_filter && (&last_filter->next != priv->filters.next))
+		list_move(&priv->filters, &last_filter->next);
+
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next)
+		mlx4_en_filter_free(filter);
+}
+#endif
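
The RFS path above keys its filter table on the flow 4-tuple: the two ports
are folded together, XORed with the IP pair, and hashed down to
MLX4_EN_FILTER_HASH_SHIFT bits to pick a bucket. A hedged user-space sketch
of that bucket selection, where hash_long() is approximated with the
kernel's 64-bit golden-ratio multiplier and the shift value is an assumption:

/* rfs_hash_sketch.c - bucket selection from filter_hash_bucket(). */
#include <stdio.h>
#include <stdint.h>

#define MLX4_EN_FILTER_HASH_SHIFT 4	/* assumed: 16 buckets */

static unsigned int hash_long(unsigned long val, unsigned int bits)
{
	/* 64-bit golden-ratio multiplier, as in the kernel's hash.h */
	return (unsigned int)((val * 0x61C8864680B583EBull) >> (64 - bits));
}

int main(void)
{
	uint32_t src_ip = 0x0a000001, dst_ip = 0x0a000002;	/* 10.0.0.1/2 */
	uint16_t src_port = 12345, dst_port = 80;
	unsigned long l;

	l = (unsigned long)src_port | ((unsigned long)dst_port << 2);
	l ^= (unsigned long)(src_ip ^ dst_ip);

	printf("bucket %u of %u\n",
	       hash_long(l, MLX4_EN_FILTER_HASH_SHIFT),
	       1u << MLX4_EN_FILTER_HASH_SHIFT);
	return 0;
}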
+
 static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
 	int idx;
-	u8 field;
 
-	if ((vid == 0) || (vid > 4095))    /* Invalid */
+	if (arg != priv)
 		return;
+
 	en_dbg(HW, priv, "adding VLAN:%d\n", vid);
-	idx = vid >> 5;
-	field = 1 << (vid & 0x1f);
-	spin_lock(&priv->vlan_lock);
-	priv->vlgrp_modified = true;
-	if (priv->vlan_unregister[idx] & field)
-		priv->vlan_unregister[idx] &= ~field;
-	else
-		priv->vlan_register[idx] |= field;
-	priv->vlans[idx] |= field;
-	spin_unlock(&priv->vlan_lock);
+
+	set_bit(vid, priv->active_vlans);
+
+	/* Add VID to port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err)
+			en_err(priv, "Failed configuring VLAN filter\n");
+	}
+	if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx))
+		en_dbg(HW, priv, "failed adding vlan %d\n", vid);
+	mutex_unlock(&mdev->state_lock);
+
 }
 
 static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	int idx;
-	u8 field;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
 
-	if ((vid == 0) || (vid > 4095))    /* Invalid */
+	if (arg != priv)
 		return;
+
 	en_dbg(HW, priv, "Killing VID:%d\n", vid);
-	idx = vid >> 5;
-	field = 1 << (vid & 0x1f);
-	spin_lock(&priv->vlan_lock);
-	priv->vlgrp_modified = true;
-	if (priv->vlan_register[idx] & field)
-		priv->vlan_register[idx] &= ~field;
-	else
-		priv->vlan_unregister[idx] |= field;
-	priv->vlans[idx] &= ~field;
-	spin_unlock(&priv->vlan_lock);
+
+	clear_bit(vid, priv->active_vlans);
+
+	/* Remove VID from port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	mlx4_unregister_vlan(mdev->dev, priv->port, vid);
+
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err)
+			en_err(priv, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+
 }
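
Both VLAN handlers above now record the VID in a 4096-bit bitmap
(active_vlans) before reprogramming the port filter. A self-contained sketch
of that bookkeeping, using a plain array of longs in place of the kernel
bitmap API:

/* vlan_bitmap_sketch.c - the active-VLAN bookkeeping, user space. */
#include <stdio.h>
#include <limits.h>

#define VLAN_N_VID 4096
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static unsigned long active_vlans[VLAN_N_VID / BITS_PER_LONG];

static void set_bit(int nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void clear_bit(int nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static int test_bit(int nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	set_bit(100, active_vlans);	/* rx_add_vid path */
	clear_bit(100, active_vlans);	/* rx_kill_vid path */
	printf("vid 100 active: %d\n", test_bit(100, active_vlans));
	return 0;
}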
 
-u64 mlx4_en_mac_to_u64(u8 *addr)
+static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv,
+				unsigned char *mac, int *qpn, u64 *reg_id)
 {
-	u64 mac = 0;
-	int i;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int err;
 
-	for (i = 0; i < ETHER_ADDR_LEN; i++) {
-		mac <<= 8;
-		mac |= addr[i];
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = *qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH);
+		break;
 	}
-	return mac;
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		struct mlx4_spec_list spec_eth = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.allow_loopback = 1,
+			.promisc_mode = MLX4_FS_REGULAR,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.port = priv->port;
+		rule.qpn = *qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN);
+		memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+		list_add_tail(&spec_eth.list, &rule.list);
+
+		err = mlx4_flow_attach(dev, &rule, reg_id);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+	if (err)
+		en_warn(priv, "Failed Attaching Unicast\n");
+
+	return err;
 }
 
-static int mlx4_en_cache_mclist(struct net_device *dev, u64 **mcaddrp)
+static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv,
+				     unsigned char *mac, int qpn, u64 reg_id)
 {
-	struct ifmultiaddr *ifma;;
-	u64 *mcaddr;
-	int cnt;
-	int i;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
 
-	*mcaddrp = NULL;
-restart:
-	cnt = 0;
-	if_maddr_rlock(dev);
-	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
-		if (ifma->ifma_addr->sa_family != AF_LINK)
-			continue;
-		if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
-		    ETHER_ADDR_LEN)
-			continue;
-		cnt++;
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH);
+		break;
 	}
-	if_maddr_runlock(dev);
-	if (cnt == 0)
-		return (0);
-	mcaddr = kmalloc(sizeof(u64) * cnt, GFP_KERNEL);
-	if (mcaddr == NULL)
-		return (0);
-	i = 0;
-	if_maddr_rlock(dev);
-	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
-		if (ifma->ifma_addr->sa_family != AF_LINK)
-			continue;
-		if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
-		    ETHER_ADDR_LEN)
-			continue;
-		/* Make sure the list didn't grow. */
-		if (i == cnt) {
-			if_maddr_runlock(dev);
-			kfree(mcaddr);
-			goto restart;
-		}
-		mcaddr[i++] = mlx4_en_mac_to_u64(
-		    LLADDR((struct sockaddr_dl *)ifma->ifma_addr));
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		mlx4_flow_detach(dev, reg_id);
+		break;
 	}
-	if_maddr_runlock(dev);
-	*mcaddrp = mcaddr;
-	return (i);
+	default:
+		en_err(priv, "Invalid steering mode.\n");
+	}
 }
 
-
-static void mlx4_en_set_multicast(struct net_device *dev)
+static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	struct mlx4_mac_entry *entry;
+	int index = 0;
+	int err = 0;
+	u64 reg_id;
+	int *qpn = &priv->base_qpn;
+	u64 mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev));
 
-	if (!priv->port_up)
-		return;
+	en_dbg(DRV, priv, "Registering MAC: %pM for adding\n",
+	       IF_LLADDR(priv->dev));
+	index = mlx4_register_mac(dev, priv->port, mac);
+	if (index < 0) {
+		err = index;
+		en_err(priv, "Failed adding MAC: %pM\n",
+		       IF_LLADDR(priv->dev));
+		return err;
+	}
 
-	queue_work(priv->mdev->workqueue, &priv->mcast_task);
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		int base_qpn = mlx4_get_base_qpn(dev, priv->port);
+		*qpn = base_qpn + index;
+		return 0;
+	}
+
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0);
+	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
+	if (err) {
+		en_err(priv, "Failed to reserve qp for mac registration\n");
+		goto qp_err;
+	}
+
+	err = mlx4_en_uc_steer_add(priv, IF_LLADDR(priv->dev), qpn, &reg_id);
+	if (err)
+		goto steer_err;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
+	memcpy(entry->mac, IF_LLADDR(priv->dev), sizeof(entry->mac));
+	entry->reg_id = reg_id;
+
+	hlist_add_head(&entry->hlist,
+			   &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]);
+
+	return 0;
+
+alloc_err:
+	mlx4_en_uc_steer_release(priv, IF_LLADDR(priv->dev), *qpn, reg_id);
+
+steer_err:
+	mlx4_qp_release_range(dev, *qpn, 1);
+
+qp_err:
+	mlx4_unregister_mac(dev, priv->port, mac);
+	return err;
 }
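
mlx4_en_get_qp() above follows the usual goto-unwind ladder: each acquisition
gets a label, and a failure at step N jumps to the label that releases steps
N-1 and below in reverse order. A compact stand-alone sketch of the same
shape (all the helpers here are stand-ins, not the mlx4 API):

/* qp_setup_sketch.c - the register/reserve/attach ladder, with stubs
 * so the unwind order is visible. */
#include <stdio.h>

static int register_mac(void)    { puts("register mac");    return 0; }
static int reserve_qp(void)      { puts("reserve qp");      return 0; }
static int attach_steering(void) { puts("attach steering"); return 0; }
static int add_hash_entry(void)  { puts("hash insert");     return -1; }

int main(void)
{
	int err;

	if ((err = register_mac()) != 0)
		return err;
	if ((err = reserve_qp()) != 0)
		goto qp_err;
	if ((err = attach_steering()) != 0)
		goto steer_err;
	if ((err = add_hash_entry()) != 0)
		goto alloc_err;
	return 0;

alloc_err:
	puts("undo: detach steering");
steer_err:
	puts("undo: release qp");
qp_err:
	puts("undo: unregister mac");
	return err;
}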
 
-static void mlx4_en_do_set_multicast(struct work_struct *work)
+static void mlx4_en_put_qp(struct mlx4_en_priv *priv)
 {
-	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
-						 mcast_task);
-	struct net_device *dev = priv->dev;
 	struct mlx4_en_dev *mdev = priv->mdev;
-	int err;
+	struct mlx4_dev *dev = mdev->dev;
+	int qpn = priv->base_qpn;
+	u64 mac;
 
-	mutex_lock(&mdev->state_lock);
-	if (!mdev->device_up) {
-		en_dbg(HW, priv, "Card is not up, "
-				 "ignoring multicast change.\n");
-		goto out;
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev));
+		en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n",
+		       IF_LLADDR(priv->dev));
+		mlx4_unregister_mac(dev, priv->port, mac);
+	} else {
+		struct mlx4_mac_entry *entry;
+		struct hlist_node *n, *tmp;
+		struct hlist_head *bucket;
+		unsigned int i;
+
+		for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) {
+			bucket = &priv->mac_hash[i];
+			hlist_for_each_entry_safe(entry, n, tmp, bucket, hlist) {
+				mac = mlx4_mac_to_u64(entry->mac);
+				en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n",
+				       entry->mac);
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 qpn, entry->reg_id);
+
+				mlx4_unregister_mac(dev, priv->port, mac);
+				hlist_del(&entry->hlist);
+				kfree(entry);
+			}
+		}
+
+		en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n",
+		       priv->port, qpn);
+		mlx4_qp_release_range(dev, qpn, 1);
+		priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC;
 	}
-	if (!priv->port_up) {
-		en_dbg(HW, priv, "Port is down, "
-				 "ignoring  multicast change.\n");
-		goto out;
+}
+
+static void mlx4_en_clear_list(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_mc_list *tmp, *mc_to_del;
+
+	list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) {
+		list_del(&mc_to_del->list);
+		kfree(mc_to_del);
 	}
+}
 
-	/*
-	 * Promsicuous mode: disable all filters
+static void mlx4_en_cache_mclist(struct net_device *dev)
+{
+	struct ifmultiaddr *ifma;
+	struct mlx4_en_mc_list *tmp;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if_maddr_rlock(dev);
+	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+		if (ifma->ifma_addr->sa_family != AF_LINK)
+			continue;
+		if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
+		    ETHER_ADDR_LEN)
+			continue;
+		/* Snapshot this address; the cached list is replayed
+		 * later while holding the state lock. */
+		tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC);
+		if (tmp == NULL) {
+			en_err(priv, "Failed to allocate multicast list\n");
+			break;
+		}
+		memcpy(tmp->addr,
+		    LLADDR((struct sockaddr_dl *)ifma->ifma_addr), ETH_ALEN);
+		list_add_tail(&tmp->list, &priv->mc_list);
+	}
+	if_maddr_runlock(dev);
+}
+
+static void update_mclist_flags(struct mlx4_en_priv *priv,
+				struct list_head *dst,
+				struct list_head *src)
+{
+	struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc;
+	bool found;
+
+	/* Find all the entries that should be removed from dst,
+	 * These are the entries that are not found in src
 	 */
+	list_for_each_entry(dst_tmp, dst, list) {
+		found = false;
+		list_for_each_entry(src_tmp, src, list) {
+			if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) {
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			dst_tmp->action = MCLIST_REM;
+	}
 
-	if (dev->if_flags & IFF_PROMISC) {
-		if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
-			priv->flags |= MLX4_EN_FLAG_PROMISC;
+	/* Add entries that exist in src but not in dst
+	 * mark them as need to add
+	 */
+	list_for_each_entry(src_tmp, src, list) {
+		found = false;
+		list_for_each_entry(dst_tmp, dst, list) {
+			if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) {
+				dst_tmp->action = MCLIST_NONE;
+				found = true;
+				break;
+			}
+		}
+		if (!found) {
+			new_mc = kmalloc(sizeof(struct mlx4_en_mc_list),
+					 GFP_KERNEL);
+			if (!new_mc) {
+				en_err(priv, "Failed to allocate current multicast list\n");
+				return;
+			}
+			memcpy(new_mc, src_tmp,
+			       sizeof(struct mlx4_en_mc_list));
+			new_mc->action = MCLIST_ADD;
+			list_add_tail(&new_mc->list, dst);
+		}
+	}
+}
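
update_mclist_flags() above is a two-pass set difference: pass one marks
entries present in dst but missing from src as MCLIST_REM, pass two appends
entries present only in src as MCLIST_ADD (entries in both become
MCLIST_NONE). The same algorithm on plain arrays, as a hedged sketch:

/* mclist_diff_sketch.c - the two-pass list diff on plain arrays.
 * MCLIST_* values are local stand-ins. */
#include <stdio.h>
#include <string.h>

enum { MCLIST_NONE, MCLIST_REM, MCLIST_ADD };

struct mc { char addr[6]; int action; };

int main(void)
{
	/* dst = addresses currently programmed, src = newly cached */
	struct mc dst[2] = { { "AA", 0 }, { "BB", 0 } };
	struct mc src[2] = { { "BB", 0 }, { "CC", 0 } };
	int i, j, found;

	/* pass 1: in dst but not src -> remove */
	for (i = 0; i < 2; i++) {
		for (found = j = 0; j < 2; j++)
			if (!memcmp(dst[i].addr, src[j].addr, 6))
				found = 1;
		if (!found)
			dst[i].action = MCLIST_REM;
	}
	/* pass 2: in src but not dst -> add (appended in the driver) */
	for (j = 0; j < 2; j++) {
		for (found = i = 0; i < 2; i++)
			if (!memcmp(dst[i].addr, src[j].addr, 6))
				found = 1;
		if (!found)
			printf("%s: MCLIST_ADD\n", src[j].addr);
	}
	for (i = 0; i < 2; i++)
		printf("%s: %s\n", dst[i].addr,
		    dst[i].action == MCLIST_REM ? "MCLIST_REM" : "MCLIST_NONE");
	return 0;
}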
 
-			/* Enable promiscouos mode */
-			err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
-						     priv->base_qpn, 1);
+static void mlx4_en_set_rx_mode(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!priv->port_up)
+		return;
+
+	queue_work(priv->mdev->workqueue, &priv->rx_mode_task);
+}
+
+static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv,
+				     struct mlx4_en_dev *mdev)
+{
+	int err = 0;
+	if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+		priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+		/* Enable promiscuous mode */
+		switch (mdev->dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			err = mlx4_flow_steer_promisc_add(mdev->dev,
+							  priv->port,
+							  priv->base_qpn,
+							  MLX4_FS_ALL_DEFAULT);
 			if (err)
-				en_err(priv, "Failed enabling "
-					     "promiscous mode\n");
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			break;
 
-			/* Disable port multicast filter (unconditionally) */
-			err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
-						  0, MLX4_MCAST_DISABLE);
+		case MLX4_STEERING_MODE_B0:
+			err = mlx4_unicast_promisc_add(mdev->dev,
+						       priv->base_qpn,
+						       priv->port);
 			if (err)
-				en_err(priv, "Failed disabling "
-					     "multicast filter\n");
+				en_err(priv, "Failed enabling unicast promiscuous mode\n");
 
-			/* Disable port VLAN filter */
-			err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL);
+			/* Add the default qp number as multicast
+			 * promisc
+			 */
+			if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				if (err)
+					en_err(priv, "Failed enabling multicast promiscuous mode\n");
+				priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			}
+			break;
+
+		case MLX4_STEERING_MODE_A0:
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+						     priv->port,
+						     priv->base_qpn,
+						     1);
 			if (err)
-				en_err(priv, "Failed disabling VLAN filter\n");
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			break;
 		}
-		goto out;
+
+		/* Disable port multicast filter (unconditionally) */
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
 	}
+}
 
-	/*
-	 * Not in promiscous mode
-	 */
+static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv,
+				       struct mlx4_en_dev *mdev)
+{
+	int err = 0;
 
-	if (priv->flags & MLX4_EN_FLAG_PROMISC) {
-		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+	priv->flags &= ~MLX4_EN_FLAG_PROMISC;
 
-		/* Disable promiscouos mode */
-		err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
-					     priv->base_qpn, 0);
+	/* Disable promiscuous mode */
+	switch (mdev->dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		err = mlx4_flow_steer_promisc_remove(mdev->dev,
+						     priv->port,
+						     MLX4_FS_ALL_DEFAULT);
 		if (err)
-			en_err(priv, "Failed disabling promiscous mode\n");
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		break;
 
-		/* Enable port VLAN filter */
-		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlans);
+	case MLX4_STEERING_MODE_B0:
+		err = mlx4_unicast_promisc_remove(mdev->dev,
+						  priv->base_qpn,
+						  priv->port);
 		if (err)
-			en_err(priv, "Failed enabling VLAN filter\n");
+			en_err(priv, "Failed disabling unicast promiscuous mode\n");
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			err = mlx4_multicast_promisc_remove(mdev->dev,
+							    priv->base_qpn,
+							    priv->port);
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+		break;
+
+	case MLX4_STEERING_MODE_A0:
+		err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+					     priv->port,
+					     priv->base_qpn, 0);
+		if (err)
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		break;
 	}
+}
 
+static void mlx4_en_do_multicast(struct mlx4_en_priv *priv,
+				 struct net_device *dev,
+				 struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_mc_list *mclist, *tmp;
+	u8 mc_list[16] = {0};
+	int err = 0;
+	u64 mcast_addr = 0;
+
+
 	/* Enable/disable the multicast filter according to IFF_ALLMULTI */
 	if (dev->if_flags & IFF_ALLMULTI) {
 		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
@@ -232,11 +847,54 @@
 					  0, MLX4_MCAST_DISABLE);
 		if (err)
 			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Add the default qp number as multicast promisc */
+		if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_add(mdev->dev,
+								  priv->port,
+								  priv->base_qpn,
+								  MLX4_FS_MC_DEFAULT);
+				break;
+
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed entering multicast promisc mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+		}
 	} else {
-		u64 *mcaddr;
-		int mccount;
-		int i;
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_remove(mdev->dev,
+								     priv->port,
+								     MLX4_FS_MC_DEFAULT);
+				break;
 
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_remove(mdev->dev,
+								    priv->base_qpn,
+								    priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+
 		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
 					  0, MLX4_MCAST_DISABLE);
 		if (err)
@@ -248,17 +906,96 @@
 
 		/* Update multicast list - we cache all addresses so they won't
 		 * change while HW is updated holding the command semaphore */
-		mccount = mlx4_en_cache_mclist(dev, &mcaddr);
-		for (i = 0; i < mccount; i++)
+		mlx4_en_cache_mclist(dev);
+		list_for_each_entry(mclist, &priv->mc_list, list) {
+			mcast_addr = mlx4_mac_to_u64(mclist->addr);
 			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
-					    mcaddr[i], 0, MLX4_MCAST_CONFIG);
+					mcast_addr, 0, MLX4_MCAST_CONFIG);
+		}
 		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
 					  0, MLX4_MCAST_ENABLE);
 		if (err)
 			en_err(priv, "Failed enabling multicast filter\n");
 
-		kfree(mcaddr);
+		update_mclist_flags(priv, &priv->curr_list, &priv->mc_list);
+		list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+			if (mclist->action == MCLIST_REM) {
+				/* detach this address and delete from list */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_detach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list,
+							    MLX4_PROT_ETH,
+							    mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to detach multicast address\n");
+
+				/* remove from list */
+				list_del(&mclist->list);
+				kfree(mclist);
+			} else if (mclist->action == MCLIST_ADD) {
+				/* attach the address */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				/* needed for B0 steering support */
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_attach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list,
+							    priv->port, 0,
+							    MLX4_PROT_ETH,
+							    &mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to attach multicast address\n");
+
+			}
+		}
 	}
+}
+
+static void mlx4_en_do_set_rx_mode(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 rx_mode_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up) {
+		en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n");
+		goto out;
+	}
+	if (!priv->port_up) {
+		en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n");
+		goto out;
+	}
+	if (!mlx4_en_QUERY_PORT(mdev, priv->port)) {
+		if (priv->port_state.link_state) {
+			priv->last_link_state = MLX4_DEV_EVENT_PORT_UP;
+			/* update netif baudrate */
+			priv->dev->if_baudrate =
+			    IF_Mbps(priv->port_state.link_speed);
+			/* Important note: the following call to
+			 * if_link_state_change is needed for the interface-up
+			 * scenario (start port, link state change) */
+			if_link_state_change(priv->dev, LINK_STATE_UP);
+			en_dbg(HW, priv, "Link Up\n");
+		}
+	}
+
+	/* Promiscuous mode: disable all filters */
+	if ((dev->if_flags & IFF_PROMISC) ||
+	    (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) {
+		mlx4_en_set_promisc_mode(priv, mdev);
+		goto out;
+	}
+
+	/* Not in promiscuous mode */
+	if (priv->flags & MLX4_EN_FLAG_PROMISC)
+		mlx4_en_clear_promisc_mode(priv, mdev);
+
+	mlx4_en_do_multicast(priv, dev, mdev);
 out:
 	mutex_unlock(&mdev->state_lock);
 }
@@ -272,7 +1009,7 @@
 	int i;
 
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		cq = &priv->rx_cq[i];
+		cq = priv->rx_cq[i];
 		spin_lock_irqsave(&cq->lock, flags);
 		napi_synchronize(&cq->napi);
 		mlx4_en_process_rx_cq(dev, cq, 0);
@@ -283,18 +1020,18 @@
 
 static void mlx4_en_watchdog_timeout(void *arg)
 {
-	struct mlx4_en_priv *priv = arg;
-	struct mlx4_en_dev *mdev = priv->mdev;
+        struct mlx4_en_priv *priv = arg;
+        struct mlx4_en_dev *mdev = priv->mdev;
 
-	en_dbg(DRV, priv, "Scheduling watchdog\n");
-	queue_work(mdev->workqueue, &priv->watchdog_task);
-	if (priv->port_up)
-		callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
-		    mlx4_en_watchdog_timeout, priv);
+        en_dbg(DRV, priv, "Scheduling watchdog\n");
+        queue_work(mdev->workqueue, &priv->watchdog_task);
+        if (priv->port_up)
+                callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+                                mlx4_en_watchdog_timeout, priv);
 }
 
 
-/* XXX This clears user settings in too many cases. */
+
 static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
 {
 	struct mlx4_en_cq *cq;
@@ -301,7 +1038,7 @@
 	int i;
 
 	/* If we haven't received a specific coalescing setting
-	 * (module param), we set the moderation paramters as follows:
+	 * (module param), we set the moderation parameters as follows:
 	 * - moder_cnt is set to the number of mtu sized packets to
 	 *   satisfy our coalescing target.
 	 * - moder_time is set to a fixed value.
@@ -308,21 +1045,26 @@
 	 */
 	priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1;
 	priv->rx_usecs = MLX4_EN_RX_COAL_TIME;
-	en_dbg(INTR, priv, "Default coalesing params for mtu:%ld - "
-			   "rx_frames:%d rx_usecs:%d\n",
-		 priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs);
+	priv->tx_frames = MLX4_EN_TX_COAL_PKTS;
+	priv->tx_usecs = MLX4_EN_TX_COAL_TIME;
+	en_dbg(INTR, priv, "Default coalesing params for mtu: %u - "
+	       "rx_frames:%d rx_usecs:%d\n",
+	       (unsigned)priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs);
 
 	/* Setup cq moderation params */
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		cq = &priv->rx_cq[i];
+		cq = priv->rx_cq[i];
 		cq->moder_cnt = priv->rx_frames;
 		cq->moder_time = priv->rx_usecs;
+		priv->last_moder_time[i] = MLX4_EN_AUTO_CONF;
+		priv->last_moder_packets[i] = 0;
+		priv->last_moder_bytes[i] = 0;
 	}
 
 	for (i = 0; i < priv->tx_ring_num; i++) {
-		cq = &priv->tx_cq[i];
-		cq->moder_cnt = MLX4_EN_TX_COAL_PKTS;
-		cq->moder_time = MLX4_EN_TX_COAL_TIME;
+		cq = priv->tx_cq[i];
+		cq->moder_cnt = priv->tx_frames;
+		cq->moder_time = priv->tx_usecs;
 	}
 
 	/* Reset auto-moderation params */
@@ -332,11 +1074,8 @@
 	priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
 	priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
 	priv->adaptive_rx_coal = 1;
-	priv->last_moder_time = MLX4_EN_AUTO_CONF;
 	priv->last_moder_jiffies = 0;
-	priv->last_moder_packets = 0;
 	priv->last_moder_tx_packets = 0;
-	priv->last_moder_bytes = 0;
 }
 
 static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
@@ -348,45 +1087,31 @@
 	unsigned long avg_pkt_size;
 	unsigned long rx_packets;
 	unsigned long rx_bytes;
-	unsigned long tx_packets;
-	unsigned long tx_pkt_diff;
 	unsigned long rx_pkt_diff;
 	int moder_time;
-	int i, err;
+	int ring, err;
 
 	if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
 		return;
 
-	spin_lock(&priv->stats_lock);
-	rx_packets = priv->dev->if_ipackets;
-	rx_bytes = priv->dev->if_ibytes;
-	tx_packets = priv->dev->if_opackets;
-	spin_unlock(&priv->stats_lock);
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+                spin_lock(&priv->stats_lock);
+		rx_packets = priv->rx_ring[ring]->packets;
+		rx_bytes = priv->rx_ring[ring]->bytes;
+		spin_unlock(&priv->stats_lock);
 
-	if (!priv->last_moder_jiffies || !period)
-		goto out;
+		rx_pkt_diff = ((unsigned long) (rx_packets -
+				priv->last_moder_packets[ring]));
+		packets = rx_pkt_diff;
+		rate = packets * HZ / period;
+		avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
+				priv->last_moder_bytes[ring])) / packets : 0;
 
-	tx_pkt_diff = ((unsigned long) (tx_packets -
-					priv->last_moder_tx_packets));
-	rx_pkt_diff = ((unsigned long) (rx_packets -
-					priv->last_moder_packets));
-	packets = max(tx_pkt_diff, rx_pkt_diff);
-	rate = packets * HZ / period;
-	avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
-				 priv->last_moder_bytes)) / packets : 0;
-
-	/* Apply auto-moderation only when packet rate exceeds a rate that
-	 * it matters */
-	if (rate > MLX4_EN_RX_RATE_THRESH) {
-		/* If tx and rx packet rates are not balanced, assume that
-		 * traffic is mainly BW bound and apply maximum moderation.
-		 * Otherwise, moderate according to packet rate */
-		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
-		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
-			moder_time = priv->rx_usecs_high;
-		} else {
-			if (rate < priv->pkt_rate_low ||
-			    avg_pkt_size < MLX4_EN_AVG_PKT_SMALL)
+		/* Apply auto-moderation only when the packet rate
+		 * exceeds a rate at which it matters */
+		if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) &&
+		    avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) {
+			if (rate < priv->pkt_rate_low)
 				moder_time = priv->rx_usecs_low;
 			else if (rate > priv->pkt_rate_high)
 				moder_time = priv->rx_usecs_high;
@@ -395,82 +1120,26 @@
 					(priv->rx_usecs_high - priv->rx_usecs_low) /
 					(priv->pkt_rate_high - priv->pkt_rate_low) +
 					priv->rx_usecs_low;
+		} else {
+			moder_time = priv->rx_usecs_low;
 		}
-	} else {
-		/* When packet rate is low, use default moderation rather than
-		 * 0 to prevent interrupt storms if traffic suddenly increases */
-		moder_time = priv->rx_usecs;
-	}
 
-	en_dbg(INTR, priv, "tx rate:%lu rx_rate:%lu\n",
-	       tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
-
-	en_dbg(INTR, priv, "Rx moder_time changed from:%d to %d period:%lu "
-	       "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
-		 priv->last_moder_time, moder_time, period, packets,
-		 avg_pkt_size, rate);
-
-	if (moder_time != priv->last_moder_time) {
-		priv->last_moder_time = moder_time;
-		for (i = 0; i < priv->rx_ring_num; i++) {
-			cq = &priv->rx_cq[i];
+		if (moder_time != priv->last_moder_time[ring]) {
+			priv->last_moder_time[ring] = moder_time;
+			cq = priv->rx_cq[ring];
 			cq->moder_time = moder_time;
 			err = mlx4_en_set_cq_moder(priv, cq);
-			if (err) {
-				en_err(priv, "Failed modifying moderation for cq:%d\n", i);
-				break;
-			}
+			if (err)
+				en_err(priv, "Failed modifying moderation for cq:%d\n",
+				       ring);
 		}
+		priv->last_moder_packets[ring] = rx_packets;
+		priv->last_moder_bytes[ring] = rx_bytes;
 	}
 
-out:
-	priv->last_moder_packets = rx_packets;
-	priv->last_moder_tx_packets = tx_packets;
-	priv->last_moder_bytes = rx_bytes;
 	priv->last_moder_jiffies = jiffies;
 }
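
Per ring, the reworked auto-moderation above linearly interpolates the CQ
moderation time between rx_usecs_low and rx_usecs_high as the measured
packet rate moves between pkt_rate_low and pkt_rate_high. A small sketch of
that interpolation with assumed sample anchors (not the driver defaults):

/* moder_sketch.c - the per-ring interpolation from
 * mlx4_en_auto_moderation(). */
#include <stdio.h>

int main(void)
{
	unsigned long pkt_rate_low = 400000, pkt_rate_high = 450000;
	int rx_usecs_low = 0, rx_usecs_high = 128;
	unsigned long rate = 425000;	/* packets/sec on this ring */
	int moder_time;

	if (rate < pkt_rate_low)
		moder_time = rx_usecs_low;
	else if (rate > pkt_rate_high)
		moder_time = rx_usecs_high;
	else
		moder_time = (rate - pkt_rate_low) *
		    (rx_usecs_high - rx_usecs_low) /
		    (pkt_rate_high - pkt_rate_low) + rx_usecs_low;

	printf("moder_time = %d usecs\n", moder_time);	/* 64 here */
	return 0;
}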
 
-static void mlx4_en_handle_vlans(struct mlx4_en_priv *priv)
-{
-	u8 vlan_register[VLAN_FLTR_SIZE];
-	u8 vlan_unregister[VLAN_FLTR_SIZE];
-	int i, j, idx;
-	u16 vid;
-
-	/* cache the vlan data for processing 
-	 * done under lock to avoid changes during work */
-	spin_lock(&priv->vlan_lock);
-	for (i = 0; i < VLAN_FLTR_SIZE; i++) {
-		vlan_register[i] = priv->vlan_register[i];
-		priv->vlan_register[i] = 0;
-		vlan_unregister[i] = priv->vlan_unregister[i];
-		priv->vlan_unregister[i] = 0;
-	}
-	priv->vlgrp_modified = false;
-	spin_unlock(&priv->vlan_lock);
-
-	/* Configure the vlan filter 
-	 * The vlgrp is updated with all the vids that need to be allowed */
-	if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlans))
-		en_err(priv, "Failed configuring VLAN filter\n");
-
-	/* Configure the VLAN table */
-	for (i = 0; i < VLAN_FLTR_SIZE; i++) {
-		for (j = 0; j < 32; j++) {
-			vid = (i << 5) + j;
-			if (vlan_register[i] & (1 << j))
-				if (mlx4_register_vlan(priv->mdev->dev, priv->port, vid, &idx))
-					en_dbg(HW, priv, "failed registering vlan %d\n", vid);
-			if (vlan_unregister[i] & (1 << j)) {
-				if (!mlx4_find_cached_vlan(priv->mdev->dev, priv->port, vid, &idx))
-					mlx4_unregister_vlan(priv->mdev->dev, priv->port, idx);
-				else
-					en_dbg(HW, priv, "could not find vid %d in cache\n", vid);
-			}
-		}
-	}
-}
-
 static void mlx4_en_do_get_stats(struct work_struct *work)
 {
 	struct delayed_work *delay = to_delayed_work(work);
@@ -479,16 +1148,12 @@
 	struct mlx4_en_dev *mdev = priv->mdev;
 	int err;
 
-	err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
-	if (err)
-		en_dbg(HW, priv, "Could not update stats \n");
-
-
 	mutex_lock(&mdev->state_lock);
 	if (mdev->device_up) {
 		if (priv->port_up) {
-			if (priv->vlgrp_modified)
-				mlx4_en_handle_vlans(priv);
+                        err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+			if (err)
+				en_dbg(HW, priv, "Could not update stats\n");
 
 			mlx4_en_auto_moderation(priv);
 		}
@@ -495,10 +1160,23 @@
 
 		queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
 	}
-	if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) {
-		panic("mlx4_en_do_get_stats: Unexpected mac removed for %d\n",
-		    priv->port);
-		mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0;
+	mutex_unlock(&mdev->state_lock);
+}
+
+/* mlx4_en_service_task - Run tasks that need to be done periodically
+ */
+static void mlx4_en_service_task(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 service_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		queue_delayed_work(mdev->workqueue, &priv->service_task,
+				   SERVICE_TASK_DELAY);
 	}
 	mutex_unlock(&mdev->state_lock);
 }
@@ -515,8 +1193,22 @@
 	 * report to system log */
 	if (priv->last_link_state != linkstate) {
 		if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) {
+			en_info(priv, "Link Down\n");
 			if_link_state_change(priv->dev, LINK_STATE_DOWN);
-		} else {
+			/* update netif baudrate */
+			priv->dev->if_baudrate = 0;
+
+		/* Make sure the port is up before notifying the OS.
+		 * This is tricky since we get here on INIT_PORT and
+		 * in that case we can't tell the OS the port is up.
+		 * To solve this there is a call to if_link_state_change
+		 * in set_rx_mode.
+		 */
+		} else if (priv->port_up && (linkstate == MLX4_DEV_EVENT_PORT_UP)){
+			if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+				en_info(priv, "Query port failed\n");
+			priv->dev->if_baudrate =
+			    IF_Mbps(priv->port_state.link_speed);
 			en_info(priv, "Link Up\n");
 			if_link_state_change(priv->dev, LINK_STATE_UP);
 		}
@@ -532,21 +1224,26 @@
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_cq *cq;
 	struct mlx4_en_tx_ring *tx_ring;
-	u64 config;
 	int rx_index = 0;
 	int tx_index = 0;
 	int err = 0;
 	int i;
 	int j;
+	u8 mc_list[16] = {0};
 
+
 	if (priv->port_up) {
 		en_dbg(DRV, priv, "start port called while port already up\n");
 		return 0;
 	}
 
+	INIT_LIST_HEAD(&priv->mc_list);
+	INIT_LIST_HEAD(&priv->curr_list);
+	INIT_LIST_HEAD(&priv->ethtool_list);
+
 	/* Calculate Rx buf size */
 	dev->if_mtu = min(dev->if_mtu, priv->max_mtu);
-	mlx4_en_calc_rx_buf(dev);
+        mlx4_en_calc_rx_buf(dev);
 	en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size);
 
 	/* Configure rx cq's and rings */
@@ -555,11 +1252,11 @@
 		en_err(priv, "Failed to activate RX rings\n");
 		return err;
 	}
-
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		cq = &priv->rx_cq[i];
+		cq = priv->rx_cq[i];
 
-		err = mlx4_en_activate_cq(priv, cq);
+		mlx4_en_cq_init_lock(cq);
+		err = mlx4_en_activate_cq(priv, cq, i);
 		if (err) {
 			en_err(priv, "Failed activating Rx CQ\n");
 			goto cq_err;
@@ -573,23 +1270,43 @@
 			goto cq_err;
 		}
 		mlx4_en_arm_cq(priv, cq);
-		priv->rx_ring[i].cqn = cq->mcq.cqn;
+		priv->rx_ring[i]->cqn = cq->mcq.cqn;
 		++rx_index;
 	}
 
+	/* Set qp number */
+	en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port);
+	err = mlx4_en_get_qp(priv);
+	if (err) {
+		en_err(priv, "Failed getting eth qp\n");
+		goto cq_err;
+	}
+	mdev->mac_removed[priv->port] = 0;
+
+	/* gets default allocated counter index from func cap */
+	/* or sink counter index if no resources */
+	priv->counter_index = mdev->dev->caps.def_counter_index[priv->port - 1];
+
+	en_dbg(DRV, priv, "%s: default counter index %d for port %d\n",
+	       __func__, priv->counter_index, priv->port);
+
 	err = mlx4_en_config_rss_steer(priv);
 	if (err) {
 		en_err(priv, "Failed configuring rss steering\n");
-		goto cq_err;
+		goto mac_err;
 	}
 
+	err = mlx4_en_create_drop_qp(priv);
+	if (err)
+		goto rss_err;
+
 	/* Configure tx cq's and rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
 		/* Configure cq */
-		cq = &priv->tx_cq[i];
-		err = mlx4_en_activate_cq(priv, cq);
+		cq = priv->tx_cq[i];
+		err = mlx4_en_activate_cq(priv, cq, i);
 		if (err) {
-			en_err(priv, "Failed allocating Tx CQ\n");
+			en_err(priv, "Failed activating Tx CQ\n");
 			goto tx_err;
 		}
 		err = mlx4_en_set_cq_moder(priv, cq);
@@ -602,13 +1319,19 @@
 		cq->buf->wqe_index = cpu_to_be16(0xffff);
 
 		/* Configure ring */
-		tx_ring = &priv->tx_ring[i];
-		err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn);
+		tx_ring = priv->tx_ring[i];
+
+		err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn,
+					       i / priv->num_tx_rings_p_up);
 		if (err) {
-			en_err(priv, "Failed allocating Tx ring\n");
+			en_err(priv, "Failed activating Tx ring %d\n", i);
 			mlx4_en_deactivate_cq(priv, cq);
 			goto tx_err;
 		}
+
+		/* Arm CQ for TX completions */
+		mlx4_en_arm_cq(priv, cq);
+
 		/* Set initial ownership of all Tx TXBBs to SW (1) */
 		for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
 			*((u32 *) (tx_ring->buf + j)) = 0xffffffff;
@@ -617,14 +1340,14 @@
 
 	/* Configure port */
 	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
-				    priv->rx_mb_size + ETHER_CRC_LEN,
+				    priv->rx_mb_size,
 				    priv->prof->tx_pause,
 				    priv->prof->tx_ppp,
 				    priv->prof->rx_pause,
 				    priv->prof->rx_ppp);
 	if (err) {
-		en_err(priv, "Failed setting port general configurations "
-			     "for port %d, with error %d\n", priv->port, err);
+		en_err(priv, "Failed setting port general configurations for port %d, with error %d\n",
+		       priv->port, err);
 		goto tx_err;
 	}
 	/* Set default qp number */
@@ -633,16 +1356,6 @@
 		en_err(priv, "Failed setting default qp numbers\n");
 		goto tx_err;
 	}
-	/* Set port mac number */
-	en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port);
-	err = mlx4_register_mac(mdev->dev, priv->port,
-				mlx4_en_mac_to_u64(IF_LLADDR(dev)),
-				&priv->mac_index);
-	if (err) {
-		en_err(priv, "Failed setting port mac\n");
-		goto tx_err;
-	}
-	mdev->mac_removed[priv->port] = 0;
 
 	/* Init port */
 	en_dbg(HW, priv, "Initializing port\n");
@@ -649,71 +1362,54 @@
 	err = mlx4_INIT_PORT(mdev->dev, priv->port);
 	if (err) {
 		en_err(priv, "Failed Initializing port\n");
-		goto mac_err;
+		goto tx_err;
 	}
 
-	/* Set the various hardware offload abilities */
-	dev->if_hwassist = 0;
-	if (dev->if_capenable & IFCAP_TSO4)
-		dev->if_hwassist |= CSUM_TSO;
-	if (dev->if_capenable & IFCAP_TXCSUM)
-		dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
-	if (dev->if_capenable & IFCAP_RXCSUM)
-		priv->rx_csum = 1;
-	else
-		priv->rx_csum = 0;
+	/* Attach rx QP to broadcast address */
+	memset(&mc_list[10], 0xff, ETH_ALEN);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
+				  priv->port, 0, MLX4_PROT_ETH,
+				  &priv->broadcast_id))
+		mlx4_warn(mdev, "Failed Attaching Broadcast\n");
 
-	err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
-	if (err) {
-		en_err(priv, "Failed to get WoL info, unable to modify\n");
-		goto wol_err;
-	}
-	if (dev->if_capenable & IFCAP_WOL_MAGIC) {
-		config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED |
-		    MLX4_EN_WOL_MAGIC;
-	} else {
-		config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC);
-		config |= MLX4_EN_WOL_DO_MODIFY;
-	}
+	/* Must redo promiscuous mode setup. */
+	priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC);
 
-	err = mlx4_wol_write(priv->mdev->dev, config, priv->port);
-	if (err) {
-		en_err(priv, "Failed to set WoL information\n");
-		goto wol_err;
-	}
+	/* Schedule multicast task to populate multicast list */
+	queue_work(mdev->workqueue, &priv->rx_mode_task);
 
+	mlx4_set_stats_bitmap(mdev->dev, priv->stats_bitmap);
+
 	priv->port_up = true;
 
-	/* Populate multicast list */
-	mlx4_en_set_multicast(dev);
+        /* Enable the queues. */
+        dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+        dev->if_drv_flags |= IFF_DRV_RUNNING;
+#ifdef CONFIG_DEBUG_FS
+	mlx4_en_create_debug_files(priv);
+#endif
+        callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+                    mlx4_en_watchdog_timeout, priv);
 
-	/* Enable the queues. */
-	atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
-	atomic_set_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
 
-	callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
-	    mlx4_en_watchdog_timeout, priv);
-
 	return 0;
 
-wol_err:
-	/* close port*/
-	mlx4_CLOSE_PORT(mdev->dev, priv->port);
-
-mac_err:
-	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
 tx_err:
 	while (tx_index--) {
-		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]);
-		mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]);
+		mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[tx_index]);
+		mlx4_en_deactivate_cq(priv, priv->tx_cq[tx_index]);
 	}
-
+	mlx4_en_destroy_drop_qp(priv);
+rss_err:
 	mlx4_en_release_rss_steer(priv);
+mac_err:
+	mlx4_en_put_qp(priv);
 cq_err:
 	while (rx_index--)
-		mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]);
+		mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]);
 	for (i = 0; i < priv->rx_ring_num; i++)
-		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
 
 	return err; /* need to close devices */
 }
@@ -723,7 +1419,9 @@
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_mc_list *mclist, *tmp;
 	int i;
+	u8 mc_list[16] = {0};
 
 	if (!priv->port_up) {
 		en_dbg(DRV, priv, "stop port called while port already down\n");
@@ -730,38 +1428,94 @@
 		return;
 	}
 
+#ifdef CONFIG_DEBUG_FS
+	mlx4_en_delete_debug_files(priv);
+#endif
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
 	/* Set port as not active */
 	priv->port_up = false;
+	if (priv->counter_index != 0xff) {
+		mlx4_counter_free(mdev->dev, priv->port, priv->counter_index);
+		priv->counter_index = 0xff;
+	}
 
-	/* Unregister Mac address for the port */
-	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
-	mdev->mac_removed[priv->port] = 1;
+	/* Promiscuous mode */
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		priv->flags &= ~(MLX4_EN_FLAG_PROMISC |
+				 MLX4_EN_FLAG_MC_PROMISC);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_ALL_DEFAULT);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_MC_DEFAULT);
+	} else if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
 
+		/* Disable promiscuous mode */
+		mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn,
+					    priv->port);
+
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
+						      priv->port);
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+	}
+
+	/* Detach all multicast addresses */
+	memset(&mc_list[10], 0xff, ETH_ALEN);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
+			      MLX4_PROT_ETH, priv->broadcast_id);
+	list_for_each_entry(mclist, &priv->curr_list, list) {
+		memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+		mc_list[5] = priv->port;
+		mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp,
+				      mc_list, MLX4_PROT_ETH, mclist->reg_id);
+	}
+	mlx4_en_clear_list(dev);
+	list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+		list_del(&mclist->list);
+		kfree(mclist);
+	}
+
+	/* Flush multicast filter */
+	mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG);
+	mlx4_en_destroy_drop_qp(priv);
+
 	/* Free TX Rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
-		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]);
-		mlx4_en_deactivate_cq(priv, &priv->tx_cq[i]);
+		mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[i]);
+		mlx4_en_deactivate_cq(priv, priv->tx_cq[i]);
 	}
 	msleep(10);
 
 	for (i = 0; i < priv->tx_ring_num; i++)
-		mlx4_en_free_tx_buf(dev, &priv->tx_ring[i]);
+		mlx4_en_free_tx_buf(dev, priv->tx_ring[i]);
 
 	/* Free RSS qps */
 	mlx4_en_release_rss_steer(priv);
 
+	/* Unregister Mac address for the port */
+	mlx4_en_put_qp(priv);
+	mdev->mac_removed[priv->port] = 1;
+
 	/* Free RX Rings */
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
-		mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]);
+		struct mlx4_en_cq *cq = priv->rx_cq[i];
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
+		mlx4_en_deactivate_cq(priv, cq);
 	}
 
-	/* close port*/
-	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+	callout_stop(&priv->watchdog_timer);
 
-	callout_stop(&priv->watchdog_timer);
-
-	atomic_clear_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
+	dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 }
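
The stop path walks priv->curr_list twice: a read-only pass to detach
each address from the hardware, then list_for_each_entry_safe() to
unlink and kfree() the nodes. The _safe variant caches the next pointer
before the body runs, so freeing the current node is legal. A minimal
sketch of the same rule with a plain singly linked list (hypothetical
types, not the kernel list API):

    #include <stdlib.h>

    struct mc_node {
            struct mc_node *next;
            unsigned char addr[6];
    };

    static void free_all(struct mc_node **head)
    {
            struct mc_node *cur, *tmp;

            for (cur = *head; cur != NULL; cur = tmp) {
                    tmp = cur->next;        /* saved before free() */
                    free(cur);
            }
            *head = NULL;
    }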
 
 static void mlx4_en_restart(struct work_struct *work)
@@ -773,12 +1527,13 @@
 	struct mlx4_en_tx_ring *ring;
 	int i;
 
 	if (priv->blocked == 0 || priv->port_up == 0)
 		return;
 	for (i = 0; i < priv->tx_ring_num; i++) {
-		ring = &priv->tx_ring[i];
+		ring = priv->tx_ring[i];
 		if (ring->blocked &&
 		    ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks)
 			goto reset;
 	}
 	return;
@@ -790,6 +1545,8 @@
 	mutex_lock(&mdev->state_lock);
 	if (priv->port_up) {
 		mlx4_en_stop_port(dev);
+		//for (i = 0; i < priv->tx_ring_num; i++)
+		//	netdev_tx_reset_queue(priv->tx_ring[i]->tx_queue);
 		if (mlx4_en_start_port(dev))
 			en_err(priv, "Failed restarting port %d\n", priv->port);
 	}
@@ -796,21 +1553,49 @@
 	mutex_unlock(&mdev->state_lock);
 }
 
-
-static void
-mlx4_en_init(void *arg)
+static void mlx4_en_clear_stats(struct net_device *dev)
 {
-	struct mlx4_en_priv *priv;
-	struct mlx4_en_dev *mdev;
-	struct ifnet *dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	int i;
 
-	priv = arg;
-	dev = priv->dev;
-	mdev = priv->mdev;
+	if (!mlx4_is_slave(mdev->dev))
+		if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
+			en_dbg(HW, priv, "Failed dumping statistics\n");
+
+	memset(&priv->pstats, 0, sizeof(priv->pstats));
+	memset(&priv->pkstats, 0, sizeof(priv->pkstats));
+	memset(&priv->port_stats, 0, sizeof(priv->port_stats));
+	memset(&priv->vport_stats, 0, sizeof(priv->vport_stats));
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		priv->tx_ring[i]->bytes = 0;
+		priv->tx_ring[i]->packets = 0;
+		priv->tx_ring[i]->tx_csum = 0;
+		priv->tx_ring[i]->oversized_packets = 0;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_ring[i]->bytes = 0;
+		priv->rx_ring[i]->packets = 0;
+		priv->rx_ring[i]->csum_ok = 0;
+		priv->rx_ring[i]->csum_none = 0;
+	}
+}
+
+static void mlx4_en_open(void *arg)
+{
+	struct mlx4_en_priv *priv;
+	struct mlx4_en_dev *mdev;
+	struct net_device *dev;
+	int err = 0;
+
+	priv = arg;
+	mdev = priv->mdev;
+	dev = priv->dev;
+
 	mutex_lock(&mdev->state_lock);
-	if (dev->if_drv_flags & IFF_DRV_RUNNING)
-		mlx4_en_stop_port(dev);
 
 	if (!mdev->device_up) {
 		en_err(priv, "Cannot open - device down/disabled\n");
@@ -817,27 +1602,16 @@
 		goto out;
 	}
 
-	/* Reset HW statistics and performance counters */
-	if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
-		en_dbg(HW, priv, "Failed dumping statistics\n");
+	/* Reset HW statistics and SW counters */
+	mlx4_en_clear_stats(dev);
 
-	memset(&priv->pstats, 0, sizeof(priv->pstats));
-
-	for (i = 0; i < priv->tx_ring_num; i++) {
-		priv->tx_ring[i].bytes = 0;
-		priv->tx_ring[i].packets = 0;
-	}
-	for (i = 0; i < priv->rx_ring_num; i++) {
-		priv->rx_ring[i].bytes = 0;
-		priv->rx_ring[i].packets = 0;
-	}
-
-	mlx4_en_set_default_moderation(priv);
-	if (mlx4_en_start_port(dev))
+	err = mlx4_en_start_port(dev);
+	if (err)
 		en_err(priv, "Failed starting port:%d\n", priv->port);
 
 out:
 	mutex_unlock(&mdev->state_lock);
+	return;
 }
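
mlx4_en_open() shows the single-exit locking shape used throughout this
file: take mdev->state_lock, bail to the out label on any early failure,
and unlock in exactly one place. A small userland sketch of the idiom
(pthreads standing in for the Linux-compat mutex):

    #include <pthread.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int device_up = 1;

    static int open_port(void)
    {
            int err = 0;

            pthread_mutex_lock(&state_lock);
            if (!device_up) {
                    err = -1;       /* device down/disabled */
                    goto out;
            }
            /* ... clear stats, start port ... */
    out:
            pthread_mutex_unlock(&state_lock);
            return (err);
    }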
 
 void mlx4_en_free_resources(struct mlx4_en_priv *priv)
@@ -844,23 +1618,30 @@
 {
 	int i;
 
+#ifdef CONFIG_RFS_ACCEL
+	if (priv->dev->rx_cpu_rmap) {
+		free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
+		priv->dev->rx_cpu_rmap = NULL;
+	}
+#endif
+
 	for (i = 0; i < priv->tx_ring_num; i++) {
-		if (priv->tx_ring[i].tx_info)
+		if (priv->tx_ring && priv->tx_ring[i])
 			mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
-		if (priv->tx_cq[i].buf)
+		if (priv->tx_cq && priv->tx_cq[i])
 			mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
 	}
 
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		if (priv->rx_ring[i].rx_info)
-			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i]);
-		if (priv->rx_cq[i].buf)
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+				priv->prof->rx_ring_size, priv->stride);
+		if (priv->rx_cq[i])
 			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
 	}
-	/* Free the stats tree when we resize the rings. */
+
 	if (priv->sysctl)
 		sysctl_ctx_free(&priv->stat_ctx);
-
 }
 
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
@@ -867,44 +1648,71 @@
 {
 	struct mlx4_en_port_profile *prof = priv->prof;
 	int i;
+	int node = 0;
 
-	/* Create tx Rings */
-	for (i = 0; i < priv->tx_ring_num; i++) {
-		if (mlx4_en_create_cq(priv, &priv->tx_cq[i],
-				      prof->tx_ring_size, i, TX))
+	/* Create rx Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
+				      prof->rx_ring_size, i, RX, node))
 			goto err;
 
-		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
-					   prof->tx_ring_size, TXBB_SIZE))
+		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
+					   prof->rx_ring_size, node))
 			goto err;
 	}
 
-	/* Create rx Rings */
-	for (i = 0; i < priv->rx_ring_num; i++) {
-		if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
-				      prof->rx_ring_size, i, RX))
+	/* Create tx Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (mlx4_en_create_cq(priv, &priv->tx_cq[i],
+				      prof->tx_ring_size, i, TX, node))
 			goto err;
 
-		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
-					   prof->rx_ring_size))
+		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
+					   prof->tx_ring_size, TXBB_SIZE, node, i))
 			goto err;
 	}
 
-	/* Re-create stat sysctls in case the number of rings changed. */
+#ifdef CONFIG_RFS_ACCEL
+	priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->rx_ring_num);
+	if (!priv->dev->rx_cpu_rmap)
+		goto err;
+#endif
+        /* Re-create stat sysctls in case the number of rings changed. */
 	mlx4_en_sysctl_stat(priv);
-
-	/* Populate Tx priority mappings */
-	mlx4_en_set_prio_map(priv, priv->tx_prio_map,
-			     priv->tx_ring_num - MLX4_EN_NUM_HASH_RINGS);
-
 	return 0;
 
 err:
 	en_err(priv, "Failed to allocate NIC resources\n");
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+						prof->rx_ring_size,
+						priv->stride);
+		if (priv->rx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (priv->tx_ring[i])
+			mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
+		if (priv->tx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
+	}
+	priv->port_up = false;
 	return -ENOMEM;
 }
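
Note the ring arrays move from embedded structs (&priv->tx_ring[i]) to
arrays of pointers (priv->tx_ring[i]) in this revision, which is why
every teardown site now NULL-checks before destroying, and each ring can
be allocated on its own (the new 'node' argument hints at per-NUMA
placement). A sketch of the allocate-individually, unwind-partially
pattern, with hypothetical types:

    #include <stdlib.h>

    struct ring { int size; };

    static struct ring **alloc_rings(int n)
    {
            struct ring **r = calloc(n, sizeof(*r));
            int i;

            if (r == NULL)
                    return (NULL);
            for (i = 0; i < n; i++) {
                    r[i] = calloc(1, sizeof(**r));  /* could be node-local */
                    if (r[i] == NULL) {
                            while (i--)             /* free only what exists */
                                    free(r[i]);
                            free(r);
                            return (NULL);
                    }
            }
            return (r);
    }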
 
+struct en_port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct en_port *, struct en_port_attribute *, char *buf);
+	ssize_t (*store)(struct en_port *, struct en_port_attribute *, char *buf, size_t count);
+};
 
+#define PORT_ATTR_RO(_name) \
+struct en_port_attribute en_port_attr_##_name = __ATTR_RO(_name)
+
+#define EN_PORT_ATTR(_name, _mode, _show, _store) \
+struct en_port_attribute en_port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
 void mlx4_en_destroy_netdev(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -912,24 +1720,31 @@
 
 	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
 
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
 
 	/* Unregister device - this will close the port if it was up */
-	if (priv->registered)
+	if (priv->registered) {
+		mutex_lock(&mdev->state_lock);
 		ether_ifdetach(dev);
+		mutex_unlock(&mdev->state_lock);
+	}
 
 	if (priv->allocated)
 		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
 
-	if (priv->sysctl)
-		sysctl_ctx_free(&priv->conf_ctx);
+	mutex_lock(&mdev->state_lock);
+	mlx4_en_stop_port(dev);
+	mutex_unlock(&mdev->state_lock);
 
 	cancel_delayed_work(&priv->stats_task);
+	cancel_delayed_work(&priv->service_task);
 	/* flush any pending task for this netdev */
 	flush_workqueue(mdev->workqueue);
+	callout_drain(&priv->watchdog_timer);
 
 	/* Detach the netdev so tasks would not attempt to access it */
 	mutex_lock(&mdev->state_lock);
@@ -936,11 +1751,19 @@
 	mdev->pndev[priv->port] = NULL;
 	mutex_unlock(&mdev->state_lock);
 
 	mlx4_en_free_resources(priv);
-	mtx_destroy(&priv->stats_lock.m);
-	mtx_destroy(&priv->vlan_lock.m);
-	kfree(priv);
-	if_free(dev);
+
+	/* freeing the sysctl conf cannot be called from within mlx4_en_free_resources */
+	if (priv->sysctl)
+		sysctl_ctx_free(&priv->conf_ctx);
+
+	kfree(priv->tx_ring);
+	kfree(priv->tx_cq);
+
+	kfree(priv);
+	if_free(dev);
 }
 
 static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
@@ -949,8 +1772,8 @@
 	struct mlx4_en_dev *mdev = priv->mdev;
 	int err = 0;
 
-	en_dbg(DRV, priv, "Change MTU called - current:%ld new:%d\n",
-		 dev->if_mtu, new_mtu);
+	en_dbg(DRV, priv, "Change MTU called - current:%u new:%u\n",
+	       (unsigned)dev->if_mtu, (unsigned)new_mtu);
 
 	if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) {
 		en_err(priv, "Bad MTU size:%d.\n", new_mtu);
@@ -961,15 +1784,14 @@
 	if (dev->if_drv_flags & IFF_DRV_RUNNING) {
 		if (!mdev->device_up) {
 			/* NIC is probably restarting - let watchdog task reset
 			 * the port */
 			en_dbg(DRV, priv, "Change MTU called with card down!?\n");
 		} else {
 			mlx4_en_stop_port(dev);
-			mlx4_en_set_default_moderation(priv);
 			err = mlx4_en_start_port(dev);
 			if (err) {
 				en_err(priv, "Failed restarting port:%d\n",
 					 priv->port);
 				queue_work(mdev->workqueue, &priv->watchdog_task);
 			}
 		}
@@ -986,8 +1808,6 @@
 	active = IFM_ETHER;
 	if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN)
 		return (active);
-	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
-		return (active);
 	active |= IFM_FDX;
 	trans_type = priv->port_state.transciver;
 	/* XXX I don't know all of the transceiver values. */
@@ -1013,7 +1833,6 @@
 	return (active);
 }
 
-
 static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
 {
 	struct mlx4_en_priv *priv;
@@ -1048,9 +1867,10 @@
 	case IFM_10G_SR:
 	case IFM_10G_CX4:
 	case IFM_1000_T:
-		if (IFM_SUBTYPE(ifm->ifm_media) ==
-		    IFM_SUBTYPE(mlx4_en_calc_media(priv)) &&
-		    (ifm->ifm_media & IFM_FDX))
+	case IFM_40G_CR4:
+		if ((IFM_SUBTYPE(ifm->ifm_media)
+			== IFM_SUBTYPE(mlx4_en_calc_media(priv)))
+			&& (ifm->ifm_media & IFM_FDX))
 			break;
 		/* Fallthrough */
 	default:
@@ -1087,6 +1907,7 @@
 	mdev = priv->mdev;
 	ifr = (struct ifreq *) data;
 	switch (command) {
 	case SIOCSIFMTU:
 		error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu);
 		break;
@@ -1096,8 +1917,9 @@
 				mutex_lock(&mdev->state_lock);
 				mlx4_en_start_port(dev);
 				mutex_unlock(&mdev->state_lock);
-			} else
-				mlx4_en_set_multicast(dev);
+			} else {
+				mlx4_en_set_rx_mode(dev);
+			}
 		} else {
 			mutex_lock(&mdev->state_lock);
 			if (dev->if_drv_flags & IFF_DRV_RUNNING) {
@@ -1109,7 +1931,7 @@
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
-		mlx4_en_set_multicast(dev);
+		mlx4_en_set_rx_mode(dev);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
@@ -1116,11 +1938,59 @@
 		error = ifmedia_ioctl(dev, ifr, &priv->media, command);
 		break;
 	case SIOCSIFCAP:
+		mutex_lock(&mdev->state_lock);
 		mask = ifr->ifr_reqcap ^ dev->if_capenable;
-		if (mask & IFCAP_HWCSUM)
-			dev->if_capenable ^= IFCAP_HWCSUM;
-		if (mask & IFCAP_TSO4)
+		if (mask & IFCAP_TXCSUM) {
+			dev->if_capenable ^= IFCAP_TXCSUM;
+			dev->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
+
+			if (IFCAP_TSO4 & dev->if_capenable &&
+			    !(IFCAP_TXCSUM & dev->if_capenable)) {
+				dev->if_capenable &= ~IFCAP_TSO4;
+				dev->if_hwassist &= ~CSUM_TSO;
+				if_printf(dev,
+				    "tso4 disabled due to -txcsum.\n");
+			}
+		}
+		if (mask & IFCAP_TXCSUM_IPV6) {
+			dev->if_capenable ^= IFCAP_TXCSUM_IPV6;
+			dev->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
+
+			if (IFCAP_TSO6 & dev->if_capenable &&
+			    !(IFCAP_TXCSUM_IPV6 & dev->if_capenable)) {
+				dev->if_capenable &= ~IFCAP_TSO6;
+				dev->if_hwassist &= ~CSUM_TSO;
+				if_printf(dev,
+				    "tso6 disabled due to -txcsum6.\n");
+			}
+		}
+		if (mask & IFCAP_RXCSUM)
+			dev->if_capenable ^= IFCAP_RXCSUM;
+		if (mask & IFCAP_RXCSUM_IPV6)
+			dev->if_capenable ^= IFCAP_RXCSUM_IPV6;
+
+		if (mask & IFCAP_TSO4) {
+			if (!(IFCAP_TSO4 & dev->if_capenable) &&
+			    !(IFCAP_TXCSUM & dev->if_capenable)) {
+				if_printf(dev, "enable txcsum first.\n");
+				error = EAGAIN;
+				goto out;
+			}
 			dev->if_capenable ^= IFCAP_TSO4;
+		}
+		if (mask & IFCAP_TSO6) {
+			if (!(IFCAP_TSO6 & dev->if_capenable) &&
+			    !(IFCAP_TXCSUM_IPV6 & dev->if_capenable)) {
+				if_printf(dev, "enable txcsum6 first.\n");
+				error = EAGAIN;
+				goto out;
+			}
+			dev->if_capenable ^= IFCAP_TSO6;
+		}
+		if (dev->if_capenable & (IFCAP_TSO4 | IFCAP_TSO6))
+			dev->if_hwassist |= CSUM_TSO;
+		else
+			dev->if_hwassist &= ~CSUM_TSO;
 		if (mask & IFCAP_LRO)
 			dev->if_capenable ^= IFCAP_LRO;
 		if (mask & IFCAP_VLAN_HWTAGGING)
@@ -1130,9 +2000,36 @@
 		if (mask & IFCAP_WOL_MAGIC)
 			dev->if_capenable ^= IFCAP_WOL_MAGIC;
 		if (dev->if_drv_flags & IFF_DRV_RUNNING)
-			mlx4_en_init(priv);
+			mlx4_en_start_port(dev);
+out:
+		mutex_unlock(&mdev->state_lock);
 		VLAN_CAPABILITIES(dev);
 		break;
+#if __FreeBSD_version >= 1100036
+	case SIOCGI2C: {
+		struct ifi2creq i2c;
+
+		error = copyin(ifr->ifr_data, &i2c, sizeof(i2c));
+		if (error)
+			break;
+		if (i2c.len > sizeof(i2c.data)) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * Note that we ignore i2c.addr here. The driver hardcodes
+		 * the address to 0x50, while the standard expects it to be 0xA0.
+		 */
+		error = mlx4_get_module_info(mdev->dev, priv->port,
+		    i2c.offset, i2c.len, i2c.data);
+		if (error < 0) {
+			error = -error;
+			break;
+		}
+		error = copyout(&i2c, ifr->ifr_data, sizeof(i2c));
+		break;
+	}
+#endif
 	default:
 		error = ether_ioctl(dev, command, data);
 		break;
@@ -1141,231 +2038,605 @@
 	return (error);
 }
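
The SIOCSIFCAP branch is driven by mask = ifr->ifr_reqcap ^
dev->if_capenable: a set bit marks a capability whose state the caller
wants flipped, and XOR-assigning that bit toggles it, with the TSO
checks enforcing that checksum offload stays enabled underneath it. A
compact demonstration of the same bit dance (flag values hypothetical):

    #include <stdio.h>

    #define CAP_TXCSUM 0x1
    #define CAP_TSO    0x2

    int main(void)
    {
            unsigned capenable = CAP_TXCSUM;        /* current */
            unsigned reqcap = CAP_TSO;              /* requested */
            unsigned mask = reqcap ^ capenable;     /* bits to toggle */

            if (mask & CAP_TXCSUM)
                    capenable ^= CAP_TXCSUM;        /* turned off here */
            if (mask & CAP_TSO) {
                    if (!(capenable & CAP_TXCSUM)) {
                            puts("enable txcsum first");
                            return (1);
                    }
                    capenable ^= CAP_TSO;
            }
            printf("capenable now 0x%x\n", capenable);
            return (0);
    }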
 
-static int mlx4_en_set_ring_size(struct net_device *dev,
-    int rx_size, int tx_size)
+
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_dev *mdev = priv->mdev;
-	int port_up = 0;
-	int err = 0;
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	uint8_t dev_addr[ETHER_ADDR_LEN];
+	int err;
+	int i;
 
-	rx_size = roundup_pow_of_two(rx_size);
-	rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
-	rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
-	tx_size = roundup_pow_of_two(tx_size);
-	tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
-	tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+	dev = priv->dev = if_alloc(IFT_ETHER);
+	if (dev == NULL) {
+		en_err(priv, "Net device allocation failed\n");
+		kfree(priv);
+		return -ENOMEM;
+	}
+	dev->if_softc = priv;
+	if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1));
+	dev->if_mtu = ETHERMTU;
+	dev->if_init = mlx4_en_open;
+	dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+	dev->if_ioctl = mlx4_en_ioctl;
+	dev->if_transmit = mlx4_en_transmit;
+	dev->if_qflush = mlx4_en_qflush;
+	dev->if_snd.ifq_maxlen = prof->tx_ring_size;
 
-	if (rx_size == (priv->port_up ?
-	    priv->rx_ring[0].actual_size : priv->rx_ring[0].size) &&
-	    tx_size == priv->tx_ring[0].size)
-		return 0;
+	/*
+	 * Initialize driver private data
+	 */
+	priv->counter_index = 0xff;
+	spin_lock_init(&priv->stats_lock);
+	INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode);
+	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
+	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+	INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task);
+	callout_init(&priv->watchdog_timer, 1);
+#ifdef CONFIG_RFS_ACCEL
+	INIT_LIST_HEAD(&priv->filters);
+	spin_lock_init(&priv->filters_lock);
+#endif
 
-	mutex_lock(&mdev->state_lock);
-	if (priv->port_up) {
-		port_up = 1;
-		mlx4_en_stop_port(dev);
+	priv->msg_enable = MLX4_EN_MSG_LEVEL;
+	priv->dev = dev;
+	priv->mdev = mdev;
+	priv->ddev = &mdev->pdev->dev;
+	priv->prof = prof;
+	priv->port = port;
+	priv->port_up = false;
+	priv->flags = prof->flags;
+
+	priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up;
+	priv->tx_ring_num = prof->tx_ring_num;
+	priv->tx_ring = kcalloc(MAX_TX_RINGS,
+				sizeof(struct mlx4_en_tx_ring *), GFP_KERNEL);
+	if (!priv->tx_ring) {
+		err = -ENOMEM;
+		goto out;
 	}
-	mlx4_en_free_resources(priv);
-	priv->prof->tx_ring_size = tx_size;
-	priv->prof->rx_ring_size = rx_size;
+	priv->tx_cq = kcalloc(MAX_TX_RINGS, sizeof(struct mlx4_en_cq *),
+			GFP_KERNEL);
+	if (!priv->tx_cq) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	priv->rx_ring_num = prof->rx_ring_num;
+	priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0;
+	priv->mac_index = -1;
+	priv->last_ifq_jiffies = 0;
+	priv->if_counters_rx_errors = 0;
+	priv->if_counters_rx_no_buffer = 0;
+#ifdef CONFIG_MLX4_EN_DCB
+	if (!mlx4_is_slave(priv->mdev->dev)) {
+		priv->dcbx_cap = DCB_CAP_DCBX_HOST;
+		priv->flags |= MLX4_EN_FLAG_DCB_ENABLED;
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) {
+			dev->dcbnl_ops = &mlx4_en_dcbnl_ops;
+		} else {
+			en_info(priv, "QoS disabled - no HW support\n");
+			dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops;
+		}
+	}
+#endif
+
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i)
+		INIT_HLIST_HEAD(&priv->mac_hash[i]);
+
+	/* Query for default mac and max mtu */
+	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+	priv->mac = mdev->dev->caps.def_mac[priv->port];
+	if (ILLEGAL_MAC(priv->mac)) {
+#if BITS_PER_LONG == 64
+		en_err(priv, "Port: %d, invalid mac burned: 0x%lx, quitting\n",
+		    priv->port, priv->mac);
+#elif BITS_PER_LONG == 32
+		en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quitting\n",
+		    priv->port, priv->mac);
+#endif
+		err = -EINVAL;
+		goto out;
+	}
+
+	priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					  DS_SIZE);
+
+	mlx4_en_sysctl_conf(priv);
+
 	err = mlx4_en_alloc_resources(priv);
+	if (err)
+		goto out;
+
+	/* Allocate page for receive rings */
+	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
 	if (err) {
-		en_err(priv, "Failed reallocating port resources\n");
+		en_err(priv, "Failed to allocate page for rx qps\n");
 		goto out;
 	}
-	if (port_up) {
-		err = mlx4_en_start_port(dev);
-		if (err)
-			en_err(priv, "Failed starting port\n");
+	priv->allocated = 1;
+
+	/*
+	 * Set driver features
+	 */
+	dev->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6;
+	dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
+	dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
+	dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+	dev->if_capabilities |= IFCAP_LRO;
+
+	if (mdev->LSO_support)
+		dev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTSO;
+
+	/* set TSO limits so that we don't have to drop TX packets */
+	dev->if_hw_tsomax = MLX4_EN_TX_MAX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* hdr */;
+	dev->if_hw_tsomaxsegcount = MLX4_EN_TX_MAX_MBUF_FRAGS - 1 /* hdr */;
+	dev->if_hw_tsomaxsegsize = MLX4_EN_TX_MAX_MBUF_SIZE;
+
+	dev->if_capenable = dev->if_capabilities;
+
+	dev->if_hwassist = 0;
+	if (dev->if_capenable & (IFCAP_TSO4 | IFCAP_TSO6))
+		dev->if_hwassist |= CSUM_TSO;
+	if (dev->if_capenable & IFCAP_TXCSUM)
+		dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
+	if (dev->if_capenable & IFCAP_TXCSUM_IPV6)
+		dev->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
+
+	/* Register for VLAN events */
+	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+            mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
+	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+            mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
+
+	mdev->pndev[priv->port] = dev;
+
+	priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN;
+	mlx4_en_set_default_moderation(priv);
+
+	/* Set default MAC */
+	for (i = 0; i < ETHER_ADDR_LEN; i++)
+		dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i));
+
+	ether_ifattach(dev, dev_addr);
+	if_link_state_change(dev, LINK_STATE_DOWN);
+	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
+	    mlx4_en_media_change, mlx4_en_media_status);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_40G_CR4, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
+	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
+
+	en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
+	en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
+
+	priv->registered = 1;
+
+	priv->rx_mb_size = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN;
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_mb_size,
+				    prof->tx_pause, prof->tx_ppp,
+				    prof->rx_pause, prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations "
+		       "for port %d, with error %d\n", priv->port, err);
+		goto out;
 	}
+
+	/* Init port */
+	en_warn(priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto out;
+	}
+
+	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		queue_delayed_work(mdev->workqueue, &priv->service_task,
+		    SERVICE_TASK_DELAY);
+
+	return 0;
+
 out:
-	mutex_unlock(&mdev->state_lock);
+	mlx4_en_destroy_netdev(dev);
 	return err;
 }
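
The burned-in MAC arrives from firmware as a u64; the loop feeding
ether_ifattach() peels it into bytes most-significant first, so the wire
order comes out right. The conversion in isolation:

    #include <stdio.h>
    #include <stdint.h>

    #define ETHER_ADDR_LEN 6

    int main(void)
    {
            uint64_t mac = 0x001122334455ULL;       /* example value */
            uint8_t dev_addr[ETHER_ADDR_LEN];
            int i;

            for (i = 0; i < ETHER_ADDR_LEN; i++)
                    dev_addr[ETHER_ADDR_LEN - 1 - i] = (uint8_t)(mac >> (8 * i));

            for (i = 0; i < ETHER_ADDR_LEN; i++)
                    printf("%02x%c", dev_addr[i],
                        i == ETHER_ADDR_LEN - 1 ? '\n' : ':');
            return (0);
    }

Prints 00:11:22:33:44:55 for the example value above.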
 
-static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS)
+static int mlx4_en_set_ring_size(struct net_device *dev,
+    int rx_size, int tx_size)
 {
-	struct mlx4_en_priv *priv;
-	int size;
-	int error;
+        struct mlx4_en_priv *priv = netdev_priv(dev);
+        struct mlx4_en_dev *mdev = priv->mdev;
+        int port_up = 0;
+        int err = 0;
 
-	priv = arg1;
-	size = priv->prof->rx_ring_size;
-	error = sysctl_handle_int(oidp, &size, 0, req);
-	if (error || !req->newptr)
-		return (error);
-	error = -mlx4_en_set_ring_size(priv->dev, size,
-	    priv->prof->tx_ring_size);
+        rx_size = roundup_pow_of_two(rx_size);
+        rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+        rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+        tx_size = roundup_pow_of_two(tx_size);
+        tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+        tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
 
-	return (error);
+        if (rx_size == (priv->port_up ?
+            priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size) &&
+            tx_size == priv->tx_ring[0]->size)
+                return 0;
+        mutex_lock(&mdev->state_lock);
+        if (priv->port_up) {
+                port_up = 1;
+                mlx4_en_stop_port(dev);
+        }
+        mlx4_en_free_resources(priv);
+        priv->prof->tx_ring_size = tx_size;
+        priv->prof->rx_ring_size = rx_size;
+        err = mlx4_en_alloc_resources(priv);
+        if (err) {
+                en_err(priv, "Failed reallocating port resources\n");
+                goto out;
+        }
+        if (port_up) {
+                err = mlx4_en_start_port(dev);
+                if (err)
+                        en_err(priv, "Failed starting port\n");
+        }
+out:
+        mutex_unlock(&mdev->state_lock);
+        return err;
 }
+static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+        struct mlx4_en_priv *priv;
+        int size;
+        int error;
 
+        priv = arg1;
+        size = priv->prof->rx_ring_size;
+        error = sysctl_handle_int(oidp, &size, 0, req);
+        if (error || !req->newptr)
+                return (error);
+        error = -mlx4_en_set_ring_size(priv->dev, size,
+            priv->prof->tx_ring_size);
+        return (error);
+}
+
 static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS)
 {
-	struct mlx4_en_priv *priv;
-	int size;
-	int error;
+        struct mlx4_en_priv *priv;
+        int size;
+        int error;
 
-	priv = arg1;
-	size = priv->prof->tx_ring_size;
-	error = sysctl_handle_int(oidp, &size, 0, req);
-	if (error || !req->newptr)
-		return (error);
-	error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size,
-	    size);
+        priv = arg1;
+        size = priv->prof->tx_ring_size;
+        error = sysctl_handle_int(oidp, &size, 0, req);
+        if (error || !req->newptr)
+                return (error);
+        error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size,
+            size);
 
 	return (error);
 }
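
Both handlers follow the canonical SYSCTL_PROC shape: snapshot the
current value, let sysctl_handle_int() do the userland copy in both
directions, and only commit when req->newptr indicates a write. The
shape reduced to its bones (kernel-context sketch, hypothetical
tunable):

    static int my_tunable;

    static int
    sysctl_my_tunable(SYSCTL_HANDLER_ARGS)
    {
            int val, error;

            val = my_tunable;                       /* snapshot */
            error = sysctl_handle_int(oidp, &val, 0, req);
            if (error || req->newptr == NULL)       /* error, or a read */
                    return (error);
            if (val < 0)                            /* validate the write */
                    return (EINVAL);
            my_tunable = val;                       /* commit */
            return (0);
    }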
 
-static int mlx4_en_set_tx_ppp(SYSCTL_HANDLER_ARGS)
+static int mlx4_en_get_module_info(struct net_device *dev,
+				   struct ethtool_modinfo *modinfo)
 {
-	struct mlx4_en_priv *priv;
-	int ppp;
-	int error;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ret;
+	u8 data[4];
 
-	priv = arg1;
-	ppp = priv->prof->tx_ppp;
-	error = sysctl_handle_int(oidp, &ppp, 0, req);
-	if (error || !req->newptr)
-		return (error);
-	if (ppp > 0xff || ppp < 0)
-		return (-EINVAL);
-	priv->prof->tx_ppp = ppp;
-	error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
-				       priv->rx_mb_size + ETHER_CRC_LEN,
-				       priv->prof->tx_pause,
-				       priv->prof->tx_ppp,
-				       priv->prof->rx_pause,
-				       priv->prof->rx_ppp);
+	/* Read first 2 bytes to get Module & REV ID */
+	ret = mlx4_get_module_info(mdev->dev, priv->port,
+				   0/*offset*/, 2/*size*/, data);
 
-	return (error);
+	if (ret < 2) {
+		en_err(priv, "Failed to read eeprom module first two bytes, error: 0x%x\n", -ret);
+		return -EIO;
+	}
+
+	switch (data[0] /* identifier */) {
+	case MLX4_MODULE_ID_QSFP:
+		modinfo->type = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLX4_MODULE_ID_QSFP_PLUS:
+		if (data[1] >= 0x3) { /* revision id */
+			modinfo->type = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLX4_MODULE_ID_QSFP28:
+		modinfo->type = ETH_MODULE_SFF_8636;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		break;
+	case MLX4_MODULE_ID_SFP:
+		modinfo->type = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		en_err(priv, "mlx4_en_get_module_info :  Not recognized cable type\n");
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
-static int mlx4_en_set_rx_ppp(SYSCTL_HANDLER_ARGS)
+static int mlx4_en_get_module_eeprom(struct net_device *dev,
+				     struct ethtool_eeprom *ee,
+				     u8 *data)
 {
-	struct mlx4_en_priv *priv;
-	struct mlx4_en_dev *mdev;
-	int tx_ring_num;
-	int ppp;
-	int error;
-	int port_up;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int offset = ee->offset;
+	int i = 0, ret;
 
-	port_up = 0;
-	priv = arg1;
-	mdev = priv->mdev;
-	ppp = priv->prof->rx_ppp;
-	error = sysctl_handle_int(oidp, &ppp, 0, req);
+	if (ee->len == 0)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		en_dbg(DRV, priv,
+		       "mlx4_get_module_info i(%d) offset(%d) len(%d)\n",
+		       i, offset, ee->len - i);
+
+		ret = mlx4_get_module_info(mdev->dev, priv->port,
+					   offset, ee->len - i, data + i);
+
+		if (!ret) /* Done reading */
+			return 0;
+
+		if (ret < 0) {
+			en_err(priv,
+			       "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n",
+			       i, offset, ee->len - i, ret);
+			return -1;
+		}
+
+		i += ret;
+		offset += ret;
+	}
+	return 0;
+}
+
+static void mlx4_en_print_eeprom(u8 *data, __u32 len)
+{
+	int		i;
+	int		j = 0;
+	int		row = 0;
+	const int	NUM_OF_BYTES = 16;
+
+	printf("\nOffset\t\tValues\n");
+	printf("------\t\t------\n");
+	while (row < len) {
+		printf("0x%04x\t\t", row);
+		for (i = 0; i < NUM_OF_BYTES && row < len; i++) {
+			printf("%02x ", data[j]);
+			row++;
+			j++;
+		}
+		printf("\n");
+	}
+}
+
+/* Read cable EEPROM module information by first inspecting the first
+ * two bytes to get the length and then read the rest of the information.
+ * The information is printed to dmesg. */
+static int mlx4_en_read_eeprom(SYSCTL_HANDLER_ARGS)
+{
+
+	u8*		data;
+	int		error;
+	int		result = 0;
+	struct		mlx4_en_priv *priv;
+	struct		net_device *dev;
+	struct		ethtool_modinfo modinfo;
+	struct		ethtool_eeprom ee;
+
+	error = sysctl_handle_int(oidp, &result, 0, req);
 	if (error || !req->newptr)
 		return (error);
-	if (ppp > 0xff || ppp < 0)
-		return (-EINVAL);
-	/* See if we have to change the number of tx queues. */
-	if (!ppp != !priv->prof->rx_ppp) {
-		tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 +
-		    (!!ppp) * MLX4_EN_NUM_PPP_RINGS;
-		mutex_lock(&mdev->state_lock);
-		if (priv->port_up) {
-			port_up = 1;
-			mlx4_en_stop_port(priv->dev);
+
+	if (result == 1) {
+		priv = arg1;
+		dev = priv->dev;
+		data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (data == NULL)
+			return (ENOMEM);
+
+		error = mlx4_en_get_module_info(dev, &modinfo);
+		if (error) {
+			en_err(priv,
+			       "mlx4_en_get_module_info returned with error - FAILED (0x%x)\n",
+			       -error);
+			goto out;
 		}
-		mlx4_en_free_resources(priv);
-		priv->tx_ring_num = tx_ring_num;
-		priv->prof->rx_ppp = ppp;
-		error = -mlx4_en_alloc_resources(priv);
-		if (error)
-			en_err(priv, "Failed reallocating port resources\n");
-		if (error == 0 && port_up) {
-			error = -mlx4_en_start_port(priv->dev);
-			if (error)
-				en_err(priv, "Failed starting port\n");
+
+		ee.len = modinfo.eeprom_len;
+		ee.offset = 0;
+
+		error = mlx4_en_get_module_eeprom(dev, &ee, data);
+		if (error) {
+			en_err(priv,
+			       "mlx4_en_get_module_eeprom returned with error - FAILED (0x%x)\n",
+			       -error);
+			/* Continue printing partial information in case of an error */
 		}
-		mutex_unlock(&mdev->state_lock);
-		return (error);
 
+		/* EEPROM information will be printed in dmesg */
+		mlx4_en_print_eeprom(data, ee.len);
+out:
+		kfree(data);
 	}
-	priv->prof->rx_ppp = ppp;
-	error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
-				       priv->rx_mb_size + ETHER_CRC_LEN,
-				       priv->prof->tx_pause,
-				       priv->prof->tx_ppp,
-				       priv->prof->rx_pause,
-				       priv->prof->rx_ppp);
+	/* Return zero to prevent sysctl failure. */
+	return (0);
+}
 
-	return (error);
+static int mlx4_en_set_tx_ppp(SYSCTL_HANDLER_ARGS)
+{
+        struct mlx4_en_priv *priv;
+        int ppp;
+        int error;
+
+        priv = arg1;
+        ppp = priv->prof->tx_ppp;
+        error = sysctl_handle_int(oidp, &ppp, 0, req);
+        if (error || !req->newptr)
+                return (error);
+        if (ppp > 0xff || ppp < 0)
+                return (-EINVAL);
+        priv->prof->tx_ppp = ppp;
+        error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
+                                       priv->rx_mb_size + ETHER_CRC_LEN,
+                                       priv->prof->tx_pause,
+                                       priv->prof->tx_ppp,
+                                       priv->prof->rx_pause,
+                                       priv->prof->rx_ppp);
+
+        return (error);
 }
 
+static int mlx4_en_set_rx_ppp(SYSCTL_HANDLER_ARGS)
+{
+        struct mlx4_en_priv *priv;
+        struct mlx4_en_dev *mdev;
+        int ppp;
+        int error;
+        int port_up;
+
+        port_up = 0;
+        priv = arg1;
+        mdev = priv->mdev;
+        ppp = priv->prof->rx_ppp;
+        error = sysctl_handle_int(oidp, &ppp, 0, req);
+        if (error || !req->newptr)
+                return (error);
+        if (ppp > 0xff || ppp < 0)
+                return (-EINVAL);
+        /* See if we have to change the number of tx queues. */
+        if (!ppp != !priv->prof->rx_ppp) {
+                mutex_lock(&mdev->state_lock);
+                if (priv->port_up) {
+                        port_up = 1;
+                        mlx4_en_stop_port(priv->dev);
+                }
+                mlx4_en_free_resources(priv);
+                priv->prof->rx_ppp = ppp;
+                error = -mlx4_en_alloc_resources(priv);
+                if (error)
+                        en_err(priv, "Failed reallocating port resources\n");
+                if (error == 0 && port_up) {
+                        error = -mlx4_en_start_port(priv->dev);
+                        if (error)
+                                en_err(priv, "Failed starting port\n");
+                }
+                mutex_unlock(&mdev->state_lock);
+                return (error);
+
+        }
+        priv->prof->rx_ppp = ppp;
+        error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
+                                       priv->rx_mb_size + ETHER_CRC_LEN,
+                                       priv->prof->tx_pause,
+                                       priv->prof->tx_ppp,
+                                       priv->prof->rx_pause,
+                                       priv->prof->rx_ppp);
+
+        return (error);
+}
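
The !ppp != !priv->prof->rx_ppp test reads oddly but is deliberate: each
! collapses its operand to 0 or 1, so the comparison asks "did the
setting cross between zero and nonzero", which is the only transition
that forces a resource reallocation. In isolation:

    #include <stdio.h>

    int main(void)
    {
            int old = 3, new = 7;

            printf("%d\n", !new != !old);   /* 0: both nonzero, no change */
            old = 0;
            printf("%d\n", !new != !old);   /* 1: crossed off -> on */
            return (0);
    }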
+
 static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv)
 {
-	struct net_device *dev;
-	struct sysctl_ctx_list *ctx;
-	struct sysctl_oid *node;
-	struct sysctl_oid_list *node_list;
-	struct sysctl_oid *coal;
-	struct sysctl_oid_list *coal_list;
+        struct net_device *dev;
+        struct sysctl_ctx_list *ctx;
+        struct sysctl_oid *node;
+        struct sysctl_oid_list *node_list;
+        struct sysctl_oid *coal;
+        struct sysctl_oid_list *coal_list;
+	const char *pnameunit;
 
-	dev = priv->dev;
-	ctx = &priv->conf_ctx;
+        dev = priv->dev;
+        ctx = &priv->conf_ctx;
+	pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev);
 
-	sysctl_ctx_init(ctx);
-	priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
-	    OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet");
-	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
-	    "conf", CTLFLAG_RD, NULL, "Configuration");
-	node_list = SYSCTL_CHILDREN(node);
+        sysctl_ctx_init(ctx);
+        priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
+            OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet");
+        node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
+            "conf", CTLFLAG_RD, NULL, "Configuration");
+        node_list = SYSCTL_CHILDREN(node);
 
-	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable",
-	    CTLFLAG_RW, &priv->msg_enable, 0,
-	    "Driver message enable bitfield");
-	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings",
-	    CTLTYPE_INT | CTLFLAG_RD, &priv->rx_ring_num, 0,
-	    "Number of receive rings");
-	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings",
-	    CTLTYPE_INT | CTLFLAG_RD, &priv->tx_ring_num, 0,
-	    "Number of transmit rings");
-	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size",
+        SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable",
+            CTLFLAG_RW, &priv->msg_enable, 0,
+            "Driver message enable bitfield");
+        SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings",
+            CTLFLAG_RD, &priv->rx_ring_num, 0,
+            "Number of receive rings");
+        SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings",
+            CTLFLAG_RD, &priv->tx_ring_num, 0,
+            "Number of transmit rings");
+        SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size",
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+            mlx4_en_set_rx_ring_size, "I", "Receive ring size");
+        SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size",
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+            mlx4_en_set_tx_ring_size, "I", "Transmit ring size");
+        SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_ppp",
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+            mlx4_en_set_tx_ppp, "I", "TX Per-priority pause");
+        SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_ppp",
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+            mlx4_en_set_rx_ppp, "I", "RX Per-priority pause");
+        SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "port_num",
+            CTLFLAG_RD, &priv->port, 0,
+            "Port Number");
+        SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "device_name",
+	    CTLFLAG_RD, __DECONST(void *, pnameunit), 0,
+	    "PCI device name");
+
+        /* Add coalescer configuration. */
+        coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO,
+            "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration");
+        coal_list = SYSCTL_CHILDREN(coal);
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low",
+            CTLFLAG_RW, &priv->pkt_rate_low, 0,
+            "Packets per-second for minimum delay");
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low",
+            CTLFLAG_RW, &priv->rx_usecs_low, 0,
+            "Minimum RX delay in micro-seconds");
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high",
+            CTLFLAG_RW, &priv->pkt_rate_high, 0,
+            "Packets per-second for maximum delay");
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high",
+            CTLFLAG_RW, &priv->rx_usecs_high, 0,
+            "Maximum RX delay in micro-seconds");
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval",
+            CTLFLAG_RW, &priv->sample_interval, 0,
+            "adaptive frequency in units of HZ ticks");
+        SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal",
+            CTLFLAG_RW, &priv->adaptive_rx_coal, 0,
+            "Enable adaptive rx coalescing");
+	/* EEPROM support */
+	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "eeprom_info",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
-	    mlx4_en_set_rx_ring_size, "I", "Receive ring size");
-	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size",
-	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
-	    mlx4_en_set_tx_ring_size, "I", "Transmit ring size");
-	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "ip_reasm",
-	    CTLFLAG_RW, &priv->ip_reasm, 0,
-	    "Allow reassembly of IP fragments.");
-	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_ppp",
-	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
-	    mlx4_en_set_tx_ppp, "I", "TX Per-priority pause");
-	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_ppp",
-	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
-	    mlx4_en_set_rx_ppp, "I", "RX Per-priority pause");
-
-	/* Add coalescer configuration. */
-	coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO,
-	    "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration");
-	coal_list = SYSCTL_CHILDREN(node);
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low",
-	    CTLFLAG_RW, &priv->pkt_rate_low, 0,
-	    "Packets per-second for minimum delay");
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low",
-	    CTLFLAG_RW, &priv->rx_usecs_low, 0,
-	    "Minimum RX delay in micro-seconds");
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high",
-	    CTLFLAG_RW, &priv->pkt_rate_high, 0,
-	    "Packets per-second for maximum delay");
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high",
-	    CTLFLAG_RW, &priv->rx_usecs_high, 0,
-	    "Maximum RX delay in micro-seconds");
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval",
-	    CTLFLAG_RW, &priv->sample_interval, 0,
-	    "adaptive frequency in units of HZ ticks");
-	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal",
-	    CTLFLAG_RW, &priv->adaptive_rx_coal, 0,
-	    "Enable adaptive rx coalescing");
+	    mlx4_en_read_eeprom, "I", "EEPROM information");
 }
 
 static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv)
 {
-	struct net_device *dev;
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *node;
 	struct sysctl_oid_list *node_list;
@@ -1376,8 +2647,6 @@
 	char namebuf[128];
 	int i;
 
-	dev = priv->dev;
-
 	ctx = &priv->stat_ctx;
 	sysctl_ctx_init(ctx);
 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
@@ -1405,6 +2674,8 @@
 	    &priv->port_stats.wake_queue, "Queue resumed after full");
 	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD,
 	    &priv->port_stats.tx_timeout, "Transmit timeouts");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_oversized_packets", CTLFLAG_RD,
+	    &priv->port_stats.oversized_packets, "TX oversized packets, m_defrag failed");
 	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD,
 	    &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf");
 	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD,
@@ -1416,43 +2687,111 @@
 	    "TX checksum offloads");
 
 	/* Could strdup the names and add in a loop.  This is simpler. */
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "broadcast", CTLFLAG_RD,
-	    &priv->pkstats.broadcast, "Broadcast packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio0", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[0], "TX Priority 0 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio1", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[1], "TX Priority 1 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio2", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[2], "TX Priority 2 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio3", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[3], "TX Priority 3 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio4", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[4], "TX Priority 4 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio5", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[5], "TX Priority 5 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio6", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[6], "TX Priority 6 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio7", CTLFLAG_RD,
-	    &priv->pkstats.tx_prio[7], "TX Priority 7 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio0", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[0], "RX Priority 0 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio1", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[1], "RX Priority 1 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio2", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[2], "RX Priority 2 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio3", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[3], "RX Priority 3 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio4", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[4], "RX Priority 4 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio5", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[5], "RX Priority 5 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio6", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[6], "RX Priority 6 packets");
-	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio7", CTLFLAG_RD,
-	    &priv->pkstats.rx_prio[7], "RX Priority 7 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_bytes", CTLFLAG_RD,
+	    &priv->pkstats.rx_bytes, "RX Bytes");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_packets, "RX packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_multicast_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_multicast_packets, "RX Multicast Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_broadcast_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_broadcast_packets, "RX Broadcast Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_errors", CTLFLAG_RD,
+	    &priv->pkstats.rx_errors, "RX Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_dropped", CTLFLAG_RD,
+	    &priv->pkstats.rx_dropped, "RX Dropped");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_length_errors", CTLFLAG_RD,
+	    &priv->pkstats.rx_length_errors, "RX Length Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_over_errors", CTLFLAG_RD,
+	    &priv->pkstats.rx_over_errors, "RX Over Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_crc_errors", CTLFLAG_RD,
+	    &priv->pkstats.rx_crc_errors, "RX CRC Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_jabbers", CTLFLAG_RD,
+	    &priv->pkstats.rx_jabbers, "RX Jabbers");
 
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_in_range_length_error", CTLFLAG_RD,
+	    &priv->pkstats.rx_in_range_length_error, "RX In-Range Length Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_out_range_length_error",
+		CTLFLAG_RD, &priv->pkstats.rx_out_range_length_error,
+		"RX Out-of-Range Length Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_lt_64_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_lt_64_bytes_packets, "RX Lt 64 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_127_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_127_bytes_packets, "RX 127 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_255_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_255_bytes_packets, "RX 255 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_511_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_511_bytes_packets, "RX 511 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1023_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_1023_bytes_packets, "RX 1023 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1518_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_1518_bytes_packets, "RX 1518 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1522_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_1522_bytes_packets, "RX 1522 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1548_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_1548_bytes_packets, "RX 1548 bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_gt_1548_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.rx_gt_1548_bytes_packets,
+	    "RX Greater Then 1548 bytes Packets");
+
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_packets, "TX packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_bytes", CTLFLAG_RD,
+	    &priv->pkstats.tx_bytes, "TX Bytes");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_multicast_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_multicast_packets, "TX Multicast Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_broadcast_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_broadcast_packets, "TX Broadcast Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_errors", CTLFLAG_RD,
+	    &priv->pkstats.tx_errors, "TX Errors");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_dropped", CTLFLAG_RD,
+	    &priv->pkstats.tx_dropped, "TX Dropped");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_lt_64_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_lt_64_bytes_packets, "TX Less Than 64 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_127_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_127_bytes_packets, "TX 127 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_255_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_255_bytes_packets, "TX 255 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_511_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_511_bytes_packets, "TX 511 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1023_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_1023_bytes_packets, "TX 1023 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1518_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_1518_bytes_packets, "TX 1518 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1522_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_1522_bytes_packets, "TX 1522 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1548_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_1548_bytes_packets, "TX 1548 Bytes Packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_gt_1548_bytes_packets", CTLFLAG_RD,
+	    &priv->pkstats.tx_gt_1548_bytes_packets,
+	    "TX Greater Then 1548 Bytes Packets");
+
 	for (i = 0; i < priv->tx_ring_num; i++) {
-		tx_ring = &priv->tx_ring[i];
+		tx_ring = priv->tx_ring[i];
 		snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i);
 		ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
 		    CTLFLAG_RD, NULL, "TX Ring");
@@ -1461,12 +2800,10 @@
 		    CTLFLAG_RD, &tx_ring->packets, "TX packets");
 		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes",
 		    CTLFLAG_RD, &tx_ring->bytes, "TX bytes");
-		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
-		    CTLFLAG_RD, &tx_ring->errors, "TX soft errors");
+	}
 
-	}
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		rx_ring = &priv->rx_ring[i];
+		rx_ring = priv->rx_ring[i];
 		snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i);
 		ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
 		    CTLFLAG_RD, NULL, "RX Ring");
@@ -1477,149 +2814,5 @@
 		    CTLFLAG_RD, &rx_ring->bytes, "RX bytes");
 		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
 		    CTLFLAG_RD, &rx_ring->errors, "RX soft errors");
-		SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_queued",
-		    CTLFLAG_RD, &rx_ring->lro.lro_queued, 0, "LRO Queued");
-		SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_flushed",
-		    CTLFLAG_RD, &rx_ring->lro.lro_flushed, 0, "LRO Flushed");
 	}
 }
-
-int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
-			struct mlx4_en_port_profile *prof)
-{
-	static volatile int mlx4_en_unit;
-	struct net_device *dev;
-	struct mlx4_en_priv *priv;
-	uint8_t dev_addr[ETHER_ADDR_LEN];
-	int err;
-	int i;
-
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	dev = priv->dev = if_alloc(IFT_ETHER);
-	if (dev == NULL) {
-		mlx4_err(mdev, "Net device allocation failed\n");
-		kfree(priv);
-		return -ENOMEM;
-	}
-	dev->if_softc = priv;
-	if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1));
-	dev->if_mtu = ETHERMTU;
-	dev->if_baudrate = 1000000000;
-	dev->if_init = mlx4_en_init;
-	dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
-	dev->if_ioctl = mlx4_en_ioctl;
-	dev->if_transmit = mlx4_en_transmit;
-	dev->if_qflush = mlx4_en_qflush;
-	dev->if_snd.ifq_maxlen = prof->tx_ring_size;
-
-	/*
-	 * Initialize driver private data
-	 */
-	priv->dev = dev;
-	priv->mdev = mdev;
-	priv->prof = prof;
-	priv->port = port;
-	priv->port_up = false;
-	priv->rx_csum = 1;
-	priv->flags = prof->flags;
-	priv->tx_ring_num = prof->tx_ring_num;
-	priv->rx_ring_num = prof->rx_ring_num;
-	priv->mac_index = -1;
-	priv->msg_enable = MLX4_EN_MSG_LEVEL;
-	priv->ip_reasm = priv->mdev->profile.ip_reasm;
-	mtx_init(&priv->stats_lock.m, "mlx4 stats", NULL, MTX_DEF);
-	mtx_init(&priv->vlan_lock.m, "mlx4 vlan", NULL, MTX_DEF);
-	INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast);
-	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
-	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
-	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
-	callout_init(&priv->watchdog_timer, 1);
-
-	/* Query for default mac and max mtu */
-	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
-	priv->mac = mdev->dev->caps.def_mac[priv->port];
-
-	if (ILLEGAL_MAC(priv->mac)) {
-		en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n",
-			 priv->port, priv->mac);
-		err = -EINVAL;
-		goto out;
-	}
-
-	mlx4_en_sysctl_conf(priv);
-
-	err = mlx4_en_alloc_resources(priv);
-	if (err)
-		goto out;
-
-	/* Allocate page for receive rings */
-	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
-				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
-	if (err) {
-		en_err(priv, "Failed to allocate page for rx qps\n");
-		goto out;
-	}
-	priv->allocated = 1;
-
-	/*
-	 * Set driver features
-	 */
-	dev->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM;
-	dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
-	dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
-	dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
-	if (mdev->LSO_support)
-		dev->if_capabilities |= IFCAP_TSO4 | IFCAP_VLAN_HWTSO;
-	if (mdev->profile.num_lro)
-		dev->if_capabilities |= IFCAP_LRO;
-	dev->if_capenable = dev->if_capabilities;
-	/*
-	 * Setup wake-on-lan.
-	 */
-	if (priv->mdev->dev->caps.wol) {
-		u64 config;
-		if (mlx4_wol_read(priv->mdev->dev, &config, priv->port) == 0) {
-			if (config & MLX4_EN_WOL_MAGIC)
-				dev->if_capabilities |= IFCAP_WOL_MAGIC;
-			if (config & MLX4_EN_WOL_ENABLED)
-				dev->if_capenable |= IFCAP_WOL_MAGIC;
-		}
-	}
-
-        /* Register for VLAN events */
-	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
-            mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
-	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
-            mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
-
-	mdev->pndev[priv->port] = dev;
-
-	priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN;
-	if_link_state_change(dev, LINK_STATE_DOWN);
-
-	/* Set default MAC */
-	for (i = 0; i < ETHER_ADDR_LEN; i++)
-		dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i));
-
-	ether_ifattach(dev, dev_addr);
-	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
-	    mlx4_en_media_change, mlx4_en_media_status);
-	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL);
-	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL);
-	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL);
-	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
-	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
-
-	en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
-	en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
-
-	priv->registered = 1;
-	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
-
-	return 0;
-
-out:
-	mlx4_en_destroy_netdev(dev);
-	return err;
-}
-

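The block removed above is mlx4_en_init_netdev(), which followed the stock
FreeBSD ifnet attach sequence (if_alloc, if_initname, callback wiring,
ether_ifattach) before being dropped from this file in the sync. For
reference, a minimal sketch of that attach pattern under the pre-if_t ifnet
API; the mydrv names and softc layout are hypothetical placeholders, not
mlx4 code:

static int
mydrv_ifattach(struct mydrv_softc *sc)
{
	struct ifnet *ifp;

	ifp = if_alloc(IFT_ETHER);		/* allocate the ifnet */
	if (ifp == NULL)
		return (ENOMEM);
	ifp->if_softc = sc;			/* back-pointer for callbacks */
	if_initname(ifp, "mydrv", sc->unit);	/* yields "mydrv0", ... */
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = mydrv_init;
	ifp->if_ioctl = mydrv_ioctl;
	ifp->if_transmit = mydrv_transmit;
	ifp->if_qflush = mydrv_qflush;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM;
	ifp->if_capenable = ifp->if_capabilities;
	ether_ifattach(ifp, sc->hwaddr);	/* attach with the MAC address */
	return (0);
}
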
Modified: trunk/sys/ofed/drivers/net/mlx4/en_port.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_port.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_port.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,27 +31,25 @@
  *
  */
 
-
-#include "mlx4_en.h"
-
+#include <sys/types.h>
 #include <linux/if_vlan.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/cmd.h>
 
+#include "en_port.h"
+#include "mlx4_en.h"
+#define EN_IFQ_MIN_INTERVAL 3000
 
-int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
-			u64 mac, u64 clear, u8 mode)
-{
-	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
-			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B);
-}
 
-int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans)
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv)
 {
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_set_vlan_fltr_mbox *filter;
-	int i, j;
+	int i;
+	int j;
+	int index = 0;
+	u32 entry;
 	int err = 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -59,78 +57,22 @@
 		return PTR_ERR(mailbox);
 
 	filter = mailbox->buf;
-	memset(filter, 0, sizeof *filter);
-	if (vlans)
-		for (i = 0, j = VLAN_FLTR_SIZE - 1; i < VLAN_FLTR_SIZE;
-		    i++, j--)
-			filter->entry[j] = cpu_to_be32(vlans[i]);
-	err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR,
-		       MLX4_CMD_TIME_CLASS_B);
+	memset(filter, 0, sizeof(*filter));
+	for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) {
+		entry = 0;
+		for (j = 0; j < 32; j++) {
+			if (test_bit(index, priv->active_vlans))
+				entry |= 1 << j;
+			index++;
+		}
+		filter->entry[i] = cpu_to_be32(entry);
+	}
+	err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
 
-
-int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
-			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
-{
-	struct mlx4_cmd_mailbox *mailbox;
-	struct mlx4_set_port_general_context *context;
-	int err;
-	u32 in_mod;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-	context = mailbox->buf;
-	memset(context, 0, sizeof *context);
-
-	context->flags = SET_PORT_GEN_ALL_VALID;
-	context->mtu = cpu_to_be16(mtu);
-	context->pptx = (pptx * (!pfctx)) << 7;
-	context->pfctx = pfctx;
-	context->pprx = (pprx * (!pfcrx)) << 7;
-	context->pfcrx = pfcrx;
-
-	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
-	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-	return err;
-}
-
-int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
-			   u8 promisc)
-{
-	struct mlx4_cmd_mailbox *mailbox;
-	struct mlx4_set_port_rqp_calc_context *context;
-	int err;
-	u32 in_mod;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-	context = mailbox->buf;
-	memset(context, 0, sizeof *context);
-
-	context->base_qpn = cpu_to_be32(base_qpn);
-	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn);
-	context->mcast = cpu_to_be32((dev->caps.mc_promisc_mode <<
-				      SET_PORT_PROMISC_MODE_SHIFT) | base_qpn);
-	context->intra_no_vlan = 0;
-	context->no_vlan = MLX4_NO_VLAN_IDX;
-	context->intra_vlan_miss = 0;
-	context->vlan_miss = MLX4_VLAN_MISS_IDX;
-
-	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
-	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-	return err;
-}
-
 int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
 {
 	struct mlx4_en_query_port_context *qport_context;
@@ -144,7 +86,8 @@
 		return PTR_ERR(mailbox);
 	memset(mailbox->buf, 0, sizeof(*qport_context));
 	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
-			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B);
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
 	if (err)
 		goto out;
 	qport_context = mailbox->buf;
@@ -160,16 +103,21 @@
 	case MLX4_EN_10G_SPEED_XFI:
 		state->link_speed = 10000;
 		break;
+	case MLX4_EN_20G_SPEED:
+		state->link_speed = 20000;
+		break;
 	case MLX4_EN_40G_SPEED:
 		state->link_speed = 40000;
 		break;
+	case MLX4_EN_56G_SPEED:
+		state->link_speed = 56000;
+		break;
 	default:
 		state->link_speed = -1;
 		break;
 	}
 	state->transciver = qport_context->transceiver;
-	if (be32_to_cpu(qport_context->transceiver_code_hi) & 0x400)
-		state->transciver = 0x80;
+	state->autoneg = !!(qport_context->autoneg & MLX4_EN_AUTONEG_MASK);
 
 out:
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
@@ -176,75 +124,56 @@
 	return err;
 }
 
-static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[])
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 {
-	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+	struct mlx4_en_stat_out_flow_control_mbox *flowstats;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct mlx4_en_vport_stats *vport_stats = &priv->vport_stats;
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	struct mlx4_cmd_mailbox *mailbox_flow = NULL;
+	u64 in_mod = reset << 8 | port;
 	int err;
-	int mode;
-	struct mlx4_counters_ext *ext;
-	struct mlx4_counters *reg;
+	int i;
+	int do_if_stat = 1;
+	unsigned long period = (unsigned long) (jiffies - priv->last_ifq_jiffies);
+	struct mlx4_en_vport_stats tmp_vport_stats;
+	struct net_device *dev;
 
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox))
-		return -ENOMEM;
+	if (jiffies_to_msecs(period) < EN_IFQ_MIN_INTERVAL ||
+				priv->counter_index == 0xff)
+		do_if_stat = 0;
 
-	err = mlx4_cmd_box(dev, 0, mailbox->dma, index, 0,
-			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C);
-	if (err)
-		goto out;
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto mailbox_out;
+	}
 
-	mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf;
-	switch (mode) {
-	case 0:
-		reg = mailbox->buf;
-		counters[0] = be64_to_cpu(reg->rx_frames);
-		counters[1] = be64_to_cpu(reg->tx_frames);
-		counters[2] = be64_to_cpu(reg->rx_bytes);
-		counters[3] = be64_to_cpu(reg->tx_bytes);
-		break;
-	case 1:
-		ext = mailbox->buf;
-		counters[0] = be64_to_cpu(ext->rx_uni_frames);
-		counters[1] = be64_to_cpu(ext->tx_uni_frames);
-		counters[2] = be64_to_cpu(ext->rx_uni_bytes);
-		counters[3] = be64_to_cpu(ext->tx_uni_bytes);
-		break;
-	default:
-		err = -EINVAL;
+	mailbox_flow = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox_flow)) {
+		mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+		err = PTR_ERR(mailbox_flow);
+		goto mailbox_out;
 	}
 
-out:
-	mlx4_free_cmd_mailbox(dev, mailbox);
-	return err;
-}
+	/* 0xff bytes indicate invalid values */
+	memset(mailbox_flow->buf, 0xff, sizeof(*flowstats) *
+		MLX4_NUM_PRIORITIES);
 
-int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
-{
-	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
-	struct net_device *dev;
-	struct mlx4_en_priv *priv;
-	struct mlx4_cmd_mailbox *mailbox;
-	u64 in_mod = reset << 8 | port;
-	unsigned long oerror;
-	unsigned long ierror;
-	int err;
-	int i;
-	int counter;
-	u64 counters[4];
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
+		memset(mailbox_flow->buf, 0, sizeof(*flowstats));
+		err = mlx4_cmd_box(mdev->dev, 0, mailbox_flow->dma,
+				   in_mod | 1<<12, 0, MLX4_CMD_DUMP_ETH_STATS,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 
-	dev = mdev->pndev[port];
-	priv = netdev_priv(dev);
-	memset(counters, 0, sizeof counters);
-	counter = mlx4_get_iboe_counter(priv->mdev->dev, port);
-	if (counter >= 0)
-		err = read_iboe_counters(priv->mdev->dev, counter, counters);
+		if (err)
+			goto out;
+	}
 
-	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-	memset(mailbox->buf, 0, sizeof(*mlx4_en_stats));
 	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
-			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B);
+			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_NATIVE);
 	if (err)
 		goto out;
 
@@ -252,74 +181,395 @@
 
 	spin_lock(&priv->stats_lock);
 
-	oerror = ierror = 0;
-	dev->if_ipackets = counters[0];
-	dev->if_ibytes = counters[2];
+	priv->port_stats.rx_chksum_good = 0;
+	priv->port_stats.rx_chksum_none = 0;
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		dev->if_ipackets += priv->rx_ring[i].packets;
-		dev->if_ibytes += priv->rx_ring[i].bytes;
-		ierror += priv->rx_ring[i].errors;
+		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
+		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
 	}
-	dev->if_opackets = counters[1];
-	dev->if_obytes = counters[3];
-	for (i = 0; i <= priv->tx_ring_num; i++) {
-		dev->if_opackets += priv->tx_ring[i].packets;
-		dev->if_obytes += priv->tx_ring[i].bytes;
-		oerror += priv->tx_ring[i].errors;
+
+	priv->port_stats.tx_chksum_offload = 0;
+	priv->port_stats.queue_stopped = 0;
+	priv->port_stats.wake_queue = 0;
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		priv->port_stats.tx_chksum_offload += priv->tx_ring[i]->tx_csum;
+		priv->port_stats.queue_stopped += priv->tx_ring[i]->queue_stopped;
+		priv->port_stats.wake_queue += priv->tx_ring[i]->wake_queue;
+		priv->port_stats.oversized_packets += priv->tx_ring[i]->oversized_packets;
 	}
+	/* RX Statistics */
+	priv->pkstats.rx_packets = be64_to_cpu(mlx4_en_stats->RTOT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->RTOT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->RTOT_novlan);
+	priv->pkstats.rx_bytes = be64_to_cpu(mlx4_en_stats->ROCT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->ROCT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->ROCT_novlan);
+	priv->pkstats.rx_multicast_packets = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
+		be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
+		be64_to_cpu(mlx4_en_stats->MCAST_novlan);
+	priv->pkstats.rx_broadcast_packets = be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
+		be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
+	priv->pkstats.rx_errors = be64_to_cpu(mlx4_en_stats->PCS) +
+		be32_to_cpu(mlx4_en_stats->RJBBR) +
+		be32_to_cpu(mlx4_en_stats->RCRC) +
+		be32_to_cpu(mlx4_en_stats->RRUNT) +
+		be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) +
+		be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) +
+		be32_to_cpu(mlx4_en_stats->RSHORT) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->RGIANT_novlan);
+	priv->pkstats.rx_dropped = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	priv->pkstats.rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
+	priv->pkstats.rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	priv->pkstats.rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
+	priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR);
+	priv->pkstats.rx_in_range_length_error = be64_to_cpu(mlx4_en_stats->RInRangeLengthErr);
+	priv->pkstats.rx_out_range_length_error = be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr);
+	priv->pkstats.rx_lt_64_bytes_packets = be64_to_cpu(mlx4_en_stats->R64_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R64_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R64_novlan);
+	priv->pkstats.rx_127_bytes_packets = be64_to_cpu(mlx4_en_stats->R127_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R127_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R127_novlan);
+	priv->pkstats.rx_255_bytes_packets = be64_to_cpu(mlx4_en_stats->R255_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R255_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R255_novlan);
+	priv->pkstats.rx_511_bytes_packets = be64_to_cpu(mlx4_en_stats->R511_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R511_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R511_novlan);
+	priv->pkstats.rx_1023_bytes_packets = be64_to_cpu(mlx4_en_stats->R1023_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R1023_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R1023_novlan);
+	priv->pkstats.rx_1518_bytes_packets = be64_to_cpu(mlx4_en_stats->R1518_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R1518_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R1518_novlan);
+	priv->pkstats.rx_1522_bytes_packets = be64_to_cpu(mlx4_en_stats->R1522_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R1522_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R1522_novlan);
+	priv->pkstats.rx_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->R1548_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R1548_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R1548_novlan);
+	priv->pkstats.rx_gt_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->R2MTU_prio_0) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_1) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_2) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_3) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_4) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_5) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_6) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_prio_7) +
+		be64_to_cpu(mlx4_en_stats->R2MTU_novlan);
 
-	dev->if_ierrors = be32_to_cpu(mlx4_en_stats->RDROP) + ierror;
-	dev->if_oerrors = be32_to_cpu(mlx4_en_stats->TDROP) + oerror;
-	dev->if_imcasts = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
-			  be64_to_cpu(mlx4_en_stats->MCAST_novlan);
-	dev->if_omcasts = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) +
-			  be64_to_cpu(mlx4_en_stats->TMCAST_novlan);
-	dev->if_collisions = 0;
+	/* Tx Stats */
+	priv->pkstats.tx_packets = be64_to_cpu(mlx4_en_stats->TTOT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->TTOT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->TTOT_novlan);
+	priv->pkstats.tx_bytes = be64_to_cpu(mlx4_en_stats->TOCT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->TOCT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->TOCT_novlan);
+	priv->pkstats.tx_multicast_packets = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) +
+		be64_to_cpu(mlx4_en_stats->TMCAST_novlan);
+	priv->pkstats.tx_broadcast_packets = be64_to_cpu(mlx4_en_stats->TBCAST_prio_0) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_1) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_2) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_3) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_4) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_5) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_6) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_prio_7) +
+		be64_to_cpu(mlx4_en_stats->TBCAST_novlan);
+	priv->pkstats.tx_errors = be64_to_cpu(mlx4_en_stats->TGIANT_prio_0) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_1) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_2) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_3) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_4) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_5) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_6) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_prio_7) +
+		be64_to_cpu(mlx4_en_stats->TGIANT_novlan);
+	priv->pkstats.tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) -
+		priv->pkstats.tx_errors;
+	priv->pkstats.tx_lt_64_bytes_packets = be64_to_cpu(mlx4_en_stats->T64_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T64_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T64_novlan);
+	priv->pkstats.tx_127_bytes_packets = be64_to_cpu(mlx4_en_stats->T127_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T127_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T127_novlan);
+	priv->pkstats.tx_255_bytes_packets = be64_to_cpu(mlx4_en_stats->T255_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T255_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T255_novlan);
+	priv->pkstats.tx_511_bytes_packets = be64_to_cpu(mlx4_en_stats->T511_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T511_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T511_novlan);
+	priv->pkstats.tx_1023_bytes_packets = be64_to_cpu(mlx4_en_stats->T1023_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T1023_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T1023_novlan);
+	priv->pkstats.tx_1518_bytes_packets = be64_to_cpu(mlx4_en_stats->T1518_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T1518_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T1518_novlan);
+	priv->pkstats.tx_1522_bytes_packets = be64_to_cpu(mlx4_en_stats->T1522_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T1522_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T1522_novlan);
+	priv->pkstats.tx_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->T1548_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T1548_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T1548_novlan);
+	priv->pkstats.tx_gt_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->T2MTU_prio_0) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_1) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_2) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_3) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_4) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_5) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_6) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_prio_7) +
+		be64_to_cpu(mlx4_en_stats->T2MTU_novlan);
 
-	priv->pkstats.broadcast =
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
-				be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
-	priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
-	priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
-	priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
-	priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
-	priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
-	priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
-	priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
-	priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
-	priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
-	priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
-	priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
-	priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
-	priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
-	priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
-	priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
-	priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	priv->pkstats.rx_prio[0][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+	priv->pkstats.rx_prio[0][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_0);
+	priv->pkstats.rx_prio[1][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+	priv->pkstats.rx_prio[1][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_1);
+	priv->pkstats.rx_prio[2][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+	priv->pkstats.rx_prio[2][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_2);
+	priv->pkstats.rx_prio[3][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+	priv->pkstats.rx_prio[3][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_3);
+	priv->pkstats.rx_prio[4][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+	priv->pkstats.rx_prio[4][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_4);
+	priv->pkstats.rx_prio[5][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+	priv->pkstats.rx_prio[5][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_5);
+	priv->pkstats.rx_prio[6][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+	priv->pkstats.rx_prio[6][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_6);
+	priv->pkstats.rx_prio[7][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+	priv->pkstats.rx_prio[7][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_7);
+	priv->pkstats.rx_prio[8][0] = be64_to_cpu(mlx4_en_stats->RTOT_novlan);
+	priv->pkstats.rx_prio[8][1] = be64_to_cpu(mlx4_en_stats->ROCT_novlan);
+	priv->pkstats.tx_prio[0][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+	priv->pkstats.tx_prio[0][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_0);
+	priv->pkstats.tx_prio[1][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+	priv->pkstats.tx_prio[1][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_1);
+	priv->pkstats.tx_prio[2][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+	priv->pkstats.tx_prio[2][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_2);
+	priv->pkstats.tx_prio[3][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+	priv->pkstats.tx_prio[3][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_3);
+	priv->pkstats.tx_prio[4][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+	priv->pkstats.tx_prio[4][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_4);
+	priv->pkstats.tx_prio[5][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+	priv->pkstats.tx_prio[5][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_5);
+	priv->pkstats.tx_prio[6][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+	priv->pkstats.tx_prio[6][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_6);
+	priv->pkstats.tx_prio[7][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	priv->pkstats.tx_prio[7][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_7);
+	priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan);
+	priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan);
+
+	flowstats = mailbox_flow->buf;
+
+	for (i = 0; i < MLX4_NUM_PRIORITIES; i++)	{
+		priv->flowstats[i].rx_pause =
+			be64_to_cpu(flowstats[i].rx_pause);
+		priv->flowstats[i].rx_pause_duration =
+			be64_to_cpu(flowstats[i].rx_pause_duration);
+		priv->flowstats[i].rx_pause_transition =
+			be64_to_cpu(flowstats[i].rx_pause_transition);
+		priv->flowstats[i].tx_pause =
+			be64_to_cpu(flowstats[i].tx_pause);
+		priv->flowstats[i].tx_pause_duration =
+			be64_to_cpu(flowstats[i].tx_pause_duration);
+		priv->flowstats[i].tx_pause_transition =
+			be64_to_cpu(flowstats[i].tx_pause_transition);
+	}
+
+	memset(&tmp_vport_stats, 0, sizeof(tmp_vport_stats));
 	spin_unlock(&priv->stats_lock);
+	err = mlx4_get_vport_ethtool_stats(mdev->dev, port,
+					   &tmp_vport_stats, reset);
+	spin_lock(&priv->stats_lock);
+	if (!err) {
+		/* ethtool stats format */
+		vport_stats->rx_unicast_packets = tmp_vport_stats.rx_unicast_packets;
+		vport_stats->rx_unicast_bytes = tmp_vport_stats.rx_unicast_bytes;
+		vport_stats->rx_multicast_packets = tmp_vport_stats.rx_multicast_packets;
+		vport_stats->rx_multicast_bytes = tmp_vport_stats.rx_multicast_bytes;
+		vport_stats->rx_broadcast_packets = tmp_vport_stats.rx_broadcast_packets;
+		vport_stats->rx_broadcast_bytes = tmp_vport_stats.rx_broadcast_bytes;
+		vport_stats->rx_dropped = tmp_vport_stats.rx_dropped;
+		vport_stats->rx_errors = tmp_vport_stats.rx_errors;
+		vport_stats->tx_unicast_packets = tmp_vport_stats.tx_unicast_packets;
+		vport_stats->tx_unicast_bytes = tmp_vport_stats.tx_unicast_bytes;
+		vport_stats->tx_multicast_packets = tmp_vport_stats.tx_multicast_packets;
+		vport_stats->tx_multicast_bytes = tmp_vport_stats.tx_multicast_bytes;
+		vport_stats->tx_broadcast_packets = tmp_vport_stats.tx_broadcast_packets;
+		vport_stats->tx_broadcast_bytes = tmp_vport_stats.tx_broadcast_bytes;
+		vport_stats->tx_errors = tmp_vport_stats.tx_errors;
+	}
 
+	if (!mlx4_is_mfunc(mdev->dev)) {
+		/* netdevice stats format */
+		dev                     = mdev->pndev[port];
+		dev->if_ipackets        = priv->pkstats.rx_packets;
+		dev->if_opackets        = priv->pkstats.tx_packets;
+		dev->if_ibytes          = priv->pkstats.rx_bytes;
+		dev->if_obytes          = priv->pkstats.tx_bytes;
+		dev->if_ierrors         = priv->pkstats.rx_errors;
+		dev->if_iqdrops         = priv->pkstats.rx_dropped;
+		dev->if_imcasts         = priv->pkstats.rx_multicast_packets;
+		dev->if_omcasts         = priv->pkstats.tx_multicast_packets;
+		dev->if_collisions      = 0;
+	}
+
+	spin_unlock(&priv->stats_lock);
+
 out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox_flow);
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+
+mailbox_out:
+	if (do_if_stat)
+		priv->last_ifq_jiffies = jiffies;
+
 	return err;
 }
-

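The reworked mlx4_en_DUMP_ETH_STATS() above rate-limits the per-interface
counter query with a jiffies timestamp: the query is skipped unless at
least EN_IFQ_MIN_INTERVAL (3000 ms) has elapsed, or when no hardware
counter is assigned (counter_index == 0xff). A condensed sketch of the
idiom, using the Linux-compat jiffies helpers the OFED shim provides (not
part of the patch):

/*
 * Skip the expensive per-interface query unless at least
 * EN_IFQ_MIN_INTERVAL milliseconds passed since the last one.
 */
unsigned long period = (unsigned long)(jiffies - priv->last_ifq_jiffies);
int do_if_stat = 1;

if (jiffies_to_msecs(period) < EN_IFQ_MIN_INTERVAL ||
    priv->counter_index == 0xff)	/* no valid counter assigned */
	do_if_stat = 0;
/* ... issue the mailbox commands ... */
if (do_if_stat)
	priv->last_ifq_jiffies = jiffies;	/* open the next window */
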
Modified: trunk/sys/ofed/drivers/net/mlx4/en_port.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_port.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_port.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,43 +36,11 @@
 
 
 #define SET_PORT_GEN_ALL_VALID	0x7
-#define SET_PORT_PROMISC_EN_SHIFT	31
-#define SET_PORT_PROMISC_MODE_SHIFT	30
+#define SET_PORT_PROMISC_SHIFT	31
+#define SET_PORT_MC_PROMISC_SHIFT	30
 
-enum {
-	MLX4_CMD_SET_VLAN_FLTR  = 0x47,
-	MLX4_CMD_SET_MCAST_FLTR = 0x48,
-	MLX4_CMD_DUMP_ETH_STATS = 0x49,
-};
+#define MLX4_EN_NUM_TC		8
 
-struct mlx4_set_port_general_context {
-	u8 reserved[3];
-	u8 flags;
-	u16 reserved2;
-	__be16 mtu;
-	u8 pptx;
-	u8 pfctx;
-	u16 reserved3;
-	u8 pprx;
-	u8 pfcrx;
-	u16 reserved4;
-};
-
-struct mlx4_set_port_rqp_calc_context {
-	__be32 base_qpn;
-	__be32 flags;
-	u8 reserved[3];
-	u8 mac_miss;
-	u8 intra_no_vlan;
-	u8 no_vlan;
-	u8 intra_vlan_miss;
-	u8 vlan_miss;
-	u8 reserved2[3];
-	u8 no_vlan_prio;
-	__be32 promisc;
-	__be32 mcast;
-};
-
 #define VLAN_FLTR_SIZE	128
 struct mlx4_set_vlan_fltr_mbox {
 	__be32 entry[VLAN_FLTR_SIZE];
@@ -86,10 +54,12 @@
 };
 
 enum {
+	MLX4_EN_10G_SPEED_XAUI	= 0x00,
+	MLX4_EN_10G_SPEED_XFI	= 0x01,
 	MLX4_EN_1G_SPEED	= 0x02,
-	MLX4_EN_10G_SPEED_XFI	= 0x01,
-	MLX4_EN_10G_SPEED_XAUI	= 0x00,
+	MLX4_EN_20G_SPEED	= 0x08,
 	MLX4_EN_40G_SPEED	= 0x40,
+	MLX4_EN_56G_SPEED	= 0x20,
 	MLX4_EN_OTHER_SPEED	= 0x0f,
 };
 
@@ -96,19 +66,15 @@
 struct mlx4_en_query_port_context {
 	u8 link_up;
 #define MLX4_EN_LINK_UP_MASK	0x80
-	u8 reserved;
+	u8 autoneg;
+#define MLX4_EN_AUTONEG_MASK	0x80
 	__be16 mtu;
 	u8 reserved2;
 	u8 link_speed;
-#define MLX4_EN_SPEED_MASK	0x43
+#define MLX4_EN_SPEED_MASK	0x6b
 	u16 reserved3[5];
 	__be64 mac;
 	u8 transceiver;
-	u8 reserved4[3];
-	__be32 wavelenth;
-	u32 reserved5;
-	__be32 transceiver_code_hi;
-	__be32 transceiver_code_low;
 };
 
 
@@ -593,6 +559,5 @@
 	__be32 TDROP;
 };
 
-enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev);
 
 #endif


Property changes on: trunk/sys/ofed/drivers/net/mlx4/en_port.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
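
The widened MLX4_EN_SPEED_MASK (0x43 -> 0x6b) is simply the OR of the speed
codes now defined: 0x00 | 0x01 | 0x02 | 0x08 | 0x20 | 0x40 == 0x6b. A
hypothetical helper (not in the patch) mirroring the switch in
mlx4_en_QUERY_PORT() makes the decoding explicit:

static int
mlx4_en_speed_to_mbps(u8 code)
{
	switch (code & MLX4_EN_SPEED_MASK) {
	case MLX4_EN_1G_SPEED:		/* 0x02 */
		return (1000);
	case MLX4_EN_10G_SPEED_XAUI:	/* 0x00 */
	case MLX4_EN_10G_SPEED_XFI:	/* 0x01 */
		return (10000);
	case MLX4_EN_20G_SPEED:		/* 0x08 */
		return (20000);
	case MLX4_EN_40G_SPEED:		/* 0x40 */
		return (40000);
	case MLX4_EN_56G_SPEED:		/* 0x20 */
		return (56000);
	default:
		return (-1);		/* MLX4_EN_OTHER_SPEED etc. */
	}
}
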
Modified: trunk/sys/ofed/drivers/net/mlx4/en_resources.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_resources.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_resources.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,24 +31,26 @@
  *
  */
 
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mlx4/qp.h>
 
 #include "mlx4_en.h"
 
+
 void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 			     int is_tx, int rss, int qpn, int cqn,
-			     struct mlx4_qp_context *context)
+			     int user_prio, struct mlx4_qp_context *context)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
 
 	memset(context, 0, sizeof *context);
-	context->flags = cpu_to_be32(7 << 16 | rss << 13);
+	context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET);
 	context->pd = cpu_to_be32(mdev->priv_pdn);
 	context->mtu_msgmax = 0xff;
-	if (!is_tx && !rss) {
+	if (!is_tx && !rss)
 		context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
-	}
 	if (is_tx)
 		context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
 	else
@@ -57,10 +59,25 @@
 	context->local_qpn = cpu_to_be32(qpn);
 	context->pri_path.ackto = 1 & 0x07;
 	context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
-	context->pri_path.counter_index = 0xff;
+	if (user_prio >= 0) {
+		context->pri_path.sched_queue |= user_prio << 3;
+		context->pri_path.feup = 1 << 6;
+	}
+	context->pri_path.counter_index = (u8)(priv->counter_index);
+	if (!rss &&
+	    (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) &&
+	    context->pri_path.counter_index != 0xFF) {
+		/* disable multicast loopback to qp with same counter */
+		context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB;
+		context->pri_path.vlan_control |=
+			MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+	}
+
 	context->cqn_send = cpu_to_be32(cqn);
 	context->cqn_recv = cpu_to_be32(cqn);
 	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+	if (!(dev->if_capabilities & IFCAP_VLAN_HWCSUM))
+		context->param3 |= cpu_to_be32(1 << 30);
 }
 
 
@@ -69,6 +86,8 @@
 	struct page **pages;
 	int i;
 
+	/* If nbufs == 1 there is no need to vmap. */
+	/* A non-NULL buf->direct.buf means vmap was already done by mlx4_alloc_buf. */
 	if (buf->direct.buf != NULL || buf->nbufs == 1)
 		return 0;
 
@@ -89,11 +108,10 @@
 
 void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
 {
-	if (buf->direct.buf != NULL || buf->nbufs == 1)
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
 		return;
 
 	vunmap(buf->direct.buf);
-	buf->direct.buf = NULL;
 }
 
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)

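mlx4_en_fill_qp_context() now folds the user priority into the QP's
sched_queue byte: base value 0x83, the 1-based port number in bits 6-7, and
user_prio in bits 3-5 (with feup set so the firmware honors it). A small
standalone check of the encoding, with example input values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int port = 2, user_prio = 5;			/* example inputs */
	uint8_t sched_queue = 0x83 | (port - 1) << 6;	/* port in bits 6-7 */

	if (user_prio >= 0)
		sched_queue |= user_prio << 3;		/* prio in bits 3-5 */
	printf("sched_queue = 0x%02x\n", sched_queue);	/* prints 0xeb */
	return (0);
}
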
Modified: trunk/sys/ofed/drivers/net/mlx4/en_rx.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_rx.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_rx.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -30,65 +30,41 @@
  * SOFTWARE.
  *
  */
-
 #include "opt_inet.h"
-#include "mlx4_en.h"
-
 #include <linux/mlx4/cq.h>
+#include <linux/slab.h>
 #include <linux/mlx4/qp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx4/driver.h>
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#include <net/busy_poll.h>
+#endif
 
-#include <net/ethernet.h>
-#include <net/if_vlan_var.h>
-#include <sys/mbuf.h>
+#include "mlx4_en.h"
 
-enum {
-	MIN_RX_ARM = 1024,
-};
 
-static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv,
-			     struct mlx4_en_rx_desc *rx_desc,
-			     struct mbuf **mb_list,
-			     int i)
-{
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	struct mbuf *mb;
-	dma_addr_t dma;
-
-	if (i == 0)
-		mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, frag_info->frag_size);
-	else
-		mb = m_getjcl(M_NOWAIT, MT_DATA, 0, frag_info->frag_size);
-	if (mb == NULL) {
-		priv->port_stats.rx_alloc_failed++;
-		return -ENOMEM;
-	}
-	dma = pci_map_single(mdev->pdev, mb->m_data, frag_info->frag_size,
-			     PCI_DMA_FROMDEVICE);
-	rx_desc->data[i].addr = cpu_to_be64(dma);
-	mb_list[i] = mb;
-	return 0;
-}
-
 static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
-				 struct mlx4_en_rx_ring *ring, int index)
+				 struct mlx4_en_rx_ring *ring,
+				 int index)
 {
-	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+	struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
+	    (ring->buf + (ring->stride * index));
 	int possible_frags;
 	int i;
 
 	/* Set size and memtype fields */
-	for (i = 0; i < priv->num_frags; i++) {
-		rx_desc->data[i].byte_count =
-			cpu_to_be32(priv->frag_info[i].frag_size);
-		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
-	}
+	rx_desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size - MLX4_NET_IP_ALIGN);
+	rx_desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key);
 
-	/* If the number of used fragments does not fill up the ring stride,
-	 * remaining (unused) fragments must be padded with null address/size
-	 * and a special memory key */
+	/*
+	 * If the number of used fragments does not fill up the ring
+	 * stride, remaining (unused) fragments must be padded with
+	 * null address/size and a special memory key:
+	 */
 	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
-	for (i = priv->num_frags; i < possible_frags; i++) {
+	for (i = 1; i < possible_frags; i++) {
 		rx_desc->data[i].byte_count = 0;
 		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
 		rx_desc->data[i].addr = 0;
@@ -95,54 +71,121 @@
 	}
 }
 
-static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
-				   struct mlx4_en_rx_ring *ring, int index)
+static int
+mlx4_en_alloc_buf(struct mlx4_en_rx_ring *ring,
+     __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list)
 {
-	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
-	struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info);
-	int i;
+	bus_dma_segment_t segs[1];
+	bus_dmamap_t map;
+	struct mbuf *mb;
+	int nsegs;
+	int err;
 
-	for (i = 0; i < priv->num_frags; i++)
-		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i))
-			goto err;
+	/* try to allocate a new spare mbuf */
+	if (unlikely(ring->spare.mbuf == NULL)) {
+		mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
+		if (unlikely(mb == NULL))
+			return (-ENOMEM);
+		/* setup correct length */
+		mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
 
-	return 0;
+		/* make sure IP header gets aligned */
+		m_adj(mb, MLX4_NET_IP_ALIGN);
 
-err:
-	while (i--)
-		m_free(mb_list[i]);
-	return -ENOMEM;
+		/* load spare mbuf into BUSDMA */
+		err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, ring->spare.dma_map,
+		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
+		if (unlikely(err != 0)) {
+			m_freem(mb);
+			return (err);
+		}
+
+		/* store spare info */
+		ring->spare.mbuf = mb;
+		ring->spare.paddr_be = cpu_to_be64(segs[0].ds_addr);
+
+		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
+		    BUS_DMASYNC_PREREAD);
+	}
+
+	/* synchronize and unload the current mbuf, if any */
+	if (likely(mb_list->mbuf != NULL)) {
+		bus_dmamap_sync(ring->dma_tag, mb_list->dma_map,
+		    BUS_DMASYNC_POSTREAD);
+		bus_dmamap_unload(ring->dma_tag, mb_list->dma_map);
+	}
+
+	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
+	if (unlikely(mb == NULL))
+		goto use_spare;
+
+	/* setup correct length */
+	mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
+
+	/* make sure IP header gets aligned */
+	m_adj(mb, MLX4_NET_IP_ALIGN);
+
+	err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, mb_list->dma_map,
+	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
+	if (unlikely(err != 0)) {
+		m_freem(mb);
+		goto use_spare;
+	}
+
+	*pdma = cpu_to_be64(segs[0].ds_addr);
+	mb_list->mbuf = mb;
+
+	bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_PREREAD);
+	return (0);
+
+use_spare:
+	/* swap DMA maps */
+	map = mb_list->dma_map;
+	mb_list->dma_map = ring->spare.dma_map;
+	ring->spare.dma_map = map;
+
+	/* swap MBUFs */
+	mb_list->mbuf = ring->spare.mbuf;
+	ring->spare.mbuf = NULL;
+
+	/* store physical address */
+	*pdma = ring->spare.paddr_be;
+	return (0);
 }
 
-static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+static void
+mlx4_en_free_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_mbuf *mb_list)
 {
-	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+	bus_dmamap_t map = mb_list->dma_map;
+	bus_dmamap_sync(ring->dma_tag, map, BUS_DMASYNC_POSTREAD);
+	bus_dmamap_unload(ring->dma_tag, map);
+	m_freem(mb_list->mbuf);
+	mb_list->mbuf = NULL;	/* safety clearing */
 }
 
-static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
-				 struct mlx4_en_rx_ring *ring,
-				 int index)
+static int
+mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+    struct mlx4_en_rx_ring *ring, int index)
 {
-	struct mlx4_en_frag_info *frag_info;
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mbuf **mb_list;
-	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride);
-	dma_addr_t dma;
-	int nr;
+	struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
+	    (ring->buf + (index * ring->stride));
+	struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + index;
 
-	mb_list = ring->rx_info + (index << priv->log_rx_info);
-	for (nr = 0; nr < priv->num_frags; nr++) {
-		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
- 		frag_info = &priv->frag_info[nr];
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
+	mb_list->mbuf = NULL;
 
-		en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
-		pci_unmap_single(mdev->pdev, dma, frag_info->frag_size,
-				 PCI_DMA_FROMDEVICE);
-		m_free(mb_list[nr]);
+	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list)) {
+		priv->port_stats.rx_alloc_failed++;
+		return (-ENOMEM);
 	}
+	return (0);
 }
 
+static inline void
+mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+}
+
 static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
 {
 	struct mlx4_en_rx_ring *ring;
@@ -153,7 +196,7 @@
 
 	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
 		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
-			ring = &priv->rx_ring[ring_ind];
+			ring = priv->rx_ring[ring_ind];
 
 			err = mlx4_en_prepare_rx_desc(priv, ring,
 						      ring->actual_size);
@@ -163,7 +206,8 @@
 						     "enough rx buffers\n");
 					return -ENOMEM;
 				} else {
-					new_size = rounddown_pow_of_two(ring->actual_size);
+					new_size =
+						rounddown_pow_of_two(ring->actual_size);
 					en_warn(priv, "Only %d buffers allocated "
 						      "reducing ring size to %d\n",
 						ring->actual_size, new_size);
@@ -178,11 +222,12 @@
 
 reduce_rings:
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
-		ring = &priv->rx_ring[ring_ind];
+		ring = priv->rx_ring[ring_ind];
 		while (ring->actual_size > new_size) {
 			ring->actual_size--;
 			ring->prod--;
-			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+			mlx4_en_free_buf(ring,
+			    ring->mbuf + ring->actual_size);
 		}
 	}
 
@@ -202,44 +247,107 @@
 	while (ring->cons != ring->prod) {
 		index = ring->cons & ring->size_mask;
 		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
-		mlx4_en_free_rx_desc(priv, ring, index);
+		mlx4_en_free_buf(ring, ring->mbuf + index);
 		++ring->cons;
 	}
 }
 
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN +
+	    MLX4_NET_IP_ALIGN;
 
+	if (eff_mtu > MJUM16BYTES) {
+		en_err(priv, "MTU(%d) is too big\n", (int)dev->if_mtu);
+		eff_mtu = MJUM16BYTES;
+	} else if (eff_mtu > MJUM9BYTES) {
+		eff_mtu = MJUM16BYTES;
+	} else if (eff_mtu > MJUMPAGESIZE) {
+		eff_mtu = MJUM9BYTES;
+	} else if (eff_mtu > MCLBYTES) {
+		eff_mtu = MJUMPAGESIZE;
+	} else {
+		eff_mtu = MCLBYTES;
+	}
+
+	priv->rx_mb_size = eff_mtu;
+
+	en_dbg(DRV, priv, "Effective RX MTU: %d bytes\n", eff_mtu);
+}
+
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
-			   struct mlx4_en_rx_ring *ring, u32 size)
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, int node)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring;
 	int err;
 	int tmp;
+	uint32_t x;
 
+	ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL);
+	if (!ring) {
+		en_err(priv, "Failed to allocate RX ring structure\n");
+		return -ENOMEM;
+	}
 
+	/* Create DMA descriptor TAG */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,				/* any alignment */
+	    0,				/* no boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    MJUM16BYTES,		/* maxsize */
+	    1,				/* nsegments */
+	    MJUM16BYTES,		/* maxsegsize */
+	    0,				/* flags */
+	    NULL, NULL,			/* lockfunc, lockfuncarg */
+	    &ring->dma_tag))) {
+		en_err(priv, "Failed to create DMA tag\n");
+		goto err_ring;
+	}
+
 	ring->prod = 0;
 	ring->cons = 0;
 	ring->size = size;
 	ring->size_mask = size - 1;
-	ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
-					  DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+	ring->stride = roundup_pow_of_two(
+	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
 	ring->log_stride = ffs(ring->stride) - 1;
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
-	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
-					sizeof(struct mbuf *));
+	tmp = size * sizeof(struct mlx4_en_rx_mbuf);
 
-	ring->rx_info = kmalloc(tmp, GFP_KERNEL);
-	if (!ring->rx_info) {
-		en_err(priv, "Failed allocating rx_info ring\n");
-		return -ENOMEM;
+	ring->mbuf = kzalloc(tmp, GFP_KERNEL);
+	if (ring->mbuf == NULL) {
+		err = -ENOMEM;
+		goto err_dma_tag;
+	}
+
+	err = -bus_dmamap_create(ring->dma_tag, 0, &ring->spare.dma_map);
+	if (err != 0)
+		goto err_info;
+
+	for (x = 0; x != size; x++) {
+		err = -bus_dmamap_create(ring->dma_tag, 0,
+		    &ring->mbuf[x].dma_map);
+		if (err != 0) {
+			while (x--)
+				bus_dmamap_destroy(ring->dma_tag,
+				    ring->mbuf[x].dma_map);
+			goto err_info;
+		}
 	}
-	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d stride:%d (%d)\n",
-		 ring->rx_info, tmp, ring->stride, ring->log_stride);
+	en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%d\n",
+		 ring->mbuf, tmp);
 
 	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
 				 ring->buf_size, 2 * PAGE_SIZE);
 	if (err)
-		goto err_ring;
+		goto err_dma_map;
 
 	err = mlx4_en_map_buffer(&ring->wqres.buf);
 	if (err) {
@@ -247,16 +355,24 @@
 		goto err_hwq;
 	}
 	ring->buf = ring->wqres.buf.direct.buf;
-
+	*pring = ring;
 	return 0;
 
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_dma_map:
+	for (x = 0; x != size; x++) {
+		bus_dmamap_destroy(ring->dma_tag,
+		    ring->mbuf[x].dma_map);
+	}
+	bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
+err_info:
+	vfree(ring->mbuf);
+err_dma_tag:
+	bus_dma_tag_destroy(ring->dma_tag);
 err_ring:
-	kfree(ring->rx_info);
-	ring->rx_info = NULL;
-	return err;
+	kfree(ring);
+	return (err);
 }
 
 int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
@@ -265,16 +381,18 @@
 	int i;
 	int ring_ind;
 	int err;
-	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
-					DS_SIZE * priv->num_frags);
+	int stride = roundup_pow_of_two(
+	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
 
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
-		ring = &priv->rx_ring[ring_ind];
+		ring = priv->rx_ring[ring_ind];
 
 		ring->prod = 0;
 		ring->cons = 0;
 		ring->actual_size = 0;
-		ring->cqn = priv->rx_cq[ring_ind].mcq.cqn;
+		ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;
+		ring->rx_mb_size = priv->rx_mb_size;
+
 		ring->stride = stride;
 		if (ring->stride <= TXBB_SIZE)
 			ring->buf += TXBB_SIZE;
@@ -285,9 +403,10 @@
 		memset(ring->buf, 0, ring->buf_size);
 		mlx4_en_update_rx_prod_db(ring);
 
-		/* Initailize all descriptors */
+		/* Initialize all descriptors */
 		for (i = 0; i < ring->size; i++)
 			mlx4_en_init_rx_desc(priv, ring, i);
+
 #ifdef INET
 		/* Configure lro mngr */
 		if (priv->dev->if_capenable & IFCAP_LRO) {
@@ -298,36 +417,65 @@
 		}
 #endif
 	}
+
+
 	err = mlx4_en_fill_rx_buffers(priv);
 	if (err)
 		goto err_buffers;
 
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
-		ring = &priv->rx_ring[ring_ind];
+		ring = priv->rx_ring[ring_ind];
 
 		ring->size_mask = ring->actual_size - 1;
 		mlx4_en_update_rx_prod_db(ring);
 	}
 
-
 	return 0;
 
 err_buffers:
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
-		mlx4_en_free_rx_buf(priv, &priv->rx_ring[ring_ind]);
+		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
 
+	ring_ind = priv->rx_ring_num - 1;
+
+	while (ring_ind >= 0) {
+		ring = priv->rx_ring[ring_ind];
+		if (ring->stride <= TXBB_SIZE)
+			ring->buf -= TXBB_SIZE;
+		ring_ind--;
+	}
+
 	return err;
 }
 
+
 void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
-			     struct mlx4_en_rx_ring *ring)
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring = *pring;
+	uint32_t x;
 
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
-	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size + TXBB_SIZE);
-	kfree(ring->rx_info);
-	ring->rx_info = NULL;
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
+	for (x = 0; x != size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map);
+	/* free spare mbuf, if any */
+	if (ring->spare.mbuf != NULL) {
+		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
+		    BUS_DMASYNC_POSTREAD);
+		bus_dmamap_unload(ring->dma_tag, ring->spare.dma_map);
+		m_freem(ring->spare.mbuf);
+	}
+	bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
+	vfree(ring->mbuf);
+	bus_dma_tag_destroy(ring->dma_tag);
+	kfree(ring);
+	*pring = NULL;
+#ifdef CONFIG_RFS_ACCEL
+	mlx4_en_cleanup_filters(priv, ring);
+#endif
 }
 
 void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
@@ -342,52 +490,20 @@
 }
 
 
-/* Unmap a completed descriptor and free unused pages */
-static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
-				    struct mlx4_en_rx_desc *rx_desc,
-				    struct mbuf **mb_list,
-				    int length)
+static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_frag_info *frag_info;
-	dma_addr_t dma;
-	struct mbuf *mb;
-	int nr;
+	int i;
+	int offset = ETHER_HDR_LEN;
 
-	mb = mb_list[0];
-	mb->m_pkthdr.len = length;
-	/* Collect used fragments while replacing them in the HW descirptors */
-	for (nr = 0; nr < priv->num_frags; nr++) {
-		frag_info = &priv->frag_info[nr];
-		if (length <= frag_info->frag_prefix_size)
-			break;
-		if (nr) 
-			mb->m_next = mb_list[nr];
-		mb = mb_list[nr];
-		mb->m_len = frag_info[nr].frag_size;
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-
-		/* Allocate a replacement page */
-		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr))
-			goto fail;
-
-		/* Unmap buffer */
-		pci_unmap_single(mdev->pdev, dma, frag_info[nr].frag_size,
-				 PCI_DMA_FROMDEVICE);
+	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
+		if (*(mb->m_data + offset) != (unsigned char) (i & 0xff))
+			goto out_loopback;
 	}
-	/* Adjust size of last fragment to match actual length */
-	mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size;
-	mb->m_next = NULL;
-	return 0;
+	/* Loopback found */
+	priv->loopback_ok = 1;
 
-fail:
-	/* Drop all accumulated fragments (which have already been replaced in
-	 * the descriptor) of this packet; remaining fragments are reused... */
-	while (nr > 0) {
-		nr--;
-		m_free(mb_list[nr]);
-	}
-	return -ENOMEM;
+out_loopback:
+	m_freem(mb);
 }
 
 
@@ -397,60 +513,58 @@
 	/* Drop packet on bad receive or bad checksum */
 	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 		     MLX4_CQE_OPCODE_ERROR)) {
-		en_err(priv, "CQE completed in error - vendor "
-			 "syndrom:%d syndrom:%d\n",
-			 ((struct mlx4_err_cqe *) cqe)->vendor_err_syndrome,
-			 ((struct mlx4_err_cqe *) cqe)->syndrome);
+		en_err(priv, "CQE completed in error - vendor syndrome:%d syndrome:%d\n",
+		       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
+		       ((struct mlx4_err_cqe *)cqe)->syndrome);
 		return 1;
 	}
 	if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
 		en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
-		return 1;;
+		return 1;
 	}
 
 	return 0;
 }
 
-static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb)
+static struct mbuf *
+mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+    struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list,
+    int length)
 {
-	int i;
-	int offset = ETHER_HDR_LEN;
+	struct mbuf *mb;
 
-	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
-		if (*(mb->m_data + offset) != (unsigned char) (i & 0xff))
-			goto out_loopback;
-	}
-	/* Loopback found */
-	priv->loopback_ok = 1;
+	/* get mbuf */
+	mb = mb_list->mbuf;
 
-out_loopback:
-	m_freem(mb);
-}
+	/* collect used fragment while atomically replacing it */
+	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list))
+		return (NULL);
 
-static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv,
-				  struct mlx4_en_rx_desc *rx_desc,
-				  struct mbuf **mb_list,
-				  unsigned int length)
-{
-	struct mbuf *mb;
+	/* range check hardware computed value */
+	if (unlikely(length > mb->m_len))
+		length = mb->m_len;
 
-	mb = mb_list[0];
-	/* Move relevant fragments to mb */
-	if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length)))
-		return NULL;
-
-	return mb;
+	/* update total packet length in packet header */
+	mb->m_len = mb->m_pkthdr.len = length;
+	return (mb);
 }
 
-
+/* For CPU architectures with a 64-byte cache line, performance is better when
+ * the CQE size is also 64 bytes. To enlarge the CQE size from 32B to 64B, 32B
+ * of garbage (i.e. 0xccccccc) were added at the beginning of each CQE (the
+ * real data is in the remaining 32B). The following calculation ensures that
+ * when factor==1 we are aligned to 64B and fetch the real CQE data. */
+#define CQE_FACTOR_INDEX(index, factor) ((index << factor) + factor)
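To make the mapping concrete: with factor==0 (32-byte CQEs) the macro is the identity, while with factor==1 (64-byte CQEs) slot i maps to 2*i + 1, i.e. the second, data-carrying 32-byte half of each 64-byte entry. A compilable illustration:

#include <stdio.h>

#define CQE_FACTOR_INDEX(index, factor) ((index << factor) + factor)

int
main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("index %d -> %d (factor 0), %d (factor 1)\n",
		    i, CQE_FACTOR_INDEX(i, 0), CQE_FACTOR_INDEX(i, 1));
	return (0);
}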
 int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_cqe *cqe;
-	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
-	struct mbuf **mb_list;
+	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
+	struct mlx4_en_rx_mbuf *mb_list;
 	struct mlx4_en_rx_desc *rx_desc;
 	struct mbuf *mb;
+	struct mlx4_cq *mcq = &cq->mcq;
+	struct mlx4_cqe *buf = cq->buf;
 #ifdef INET
 	struct lro_entry *queued;
 #endif
@@ -457,37 +571,43 @@
 	int index;
 	unsigned int length;
 	int polled = 0;
+	u32 cons_index = mcq->cons_index;
+	u32 size_mask = ring->size_mask;
+	int size = cq->size;
+	int factor = priv->cqe_factor;
 
 	if (!priv->port_up)
 		return 0;
 
 	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
-	 * descriptor offset can be deduced from the CQE index instead of
+	 * descriptor offset can be deduced from the CQE index instead of
 	 * reading 'cqe->index' */
-	index = cq->mcq.cons_index & ring->size_mask;
-	cqe = &cq->buf[index];
+	index = cons_index & size_mask;
+	cqe = &buf[CQE_FACTOR_INDEX(index, factor)];
 
 	/* Process all completed CQEs */
 	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
-		    cq->mcq.cons_index & cq->size)) {
+		    cons_index & size)) {
+		mb_list = ring->mbuf + index;
+		rx_desc = (struct mlx4_en_rx_desc *)
+		    (ring->buf + (index << ring->log_stride));
 
-		mb_list = ring->rx_info + (index << priv->log_rx_info);
-		rx_desc = ring->buf + (index << ring->log_stride);
-
 		/*
 		 * make sure we read the CQE after we read the ownership bit
 		 */
 		rmb();
 
-		if (invalid_cqe(priv, cqe))
+		if (invalid_cqe(priv, cqe)) {
 			goto next;
-
+		}
 		/*
 		 * Packet is OK - process it.
 		 */
 		length = be32_to_cpu(cqe->byte_cnt);
-		mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length);
-		if (!mb) {
+		length -= ring->fcs_del;
+
+		mb = mlx4_en_rx_mb(priv, ring, rx_desc, mb_list, length);
+		if (unlikely(!mb)) {
 			ring->errors++;
 			goto next;
 		}
@@ -495,12 +615,13 @@
 		ring->bytes += length;
 		ring->packets++;
 
-                if (unlikely(priv->validate_loopback)) {
+		if (unlikely(priv->validate_loopback)) {
 			validate_loopback(priv, mb);
 			goto next;
 		}
 
-		mb->m_pkthdr.flowid = cq->ring;
+		/* forward Toeplitz compatible hash value */
+		mb->m_pkthdr.flowid = be32_to_cpu(cqe->immed_rss_invalid);
 		mb->m_flags |= M_FLOWID;
 		mb->m_pkthdr.rcvif = dev;
 		if (be32_to_cpu(cqe->vlan_my_qpn) &
@@ -508,11 +629,12 @@
 			mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid);
 			mb->m_flags |= M_VLANTAG;
 		}
-		if (likely(priv->rx_csum) &&
+		if (likely(dev->if_capenable &
+		    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) &&
 		    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
 		    (cqe->checksum == cpu_to_be16(0xffff))) {
 			priv->port_stats.rx_chksum_good++;
-			mb->m_pkthdr.csum_flags = 
+			mb->m_pkthdr.csum_flags =
 			    CSUM_IP_CHECKED | CSUM_IP_VALID |
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			mb->m_pkthdr.csum_data = htons(0xffff);
@@ -524,24 +646,18 @@
 			 */
 #ifdef INET
 			if (mlx4_en_can_lro(cqe->status) &&
-			    (dev->if_capenable & IFCAP_LRO)) {
+					(dev->if_capenable & IFCAP_LRO)) {
 				if (ring->lro.lro_cnt != 0 &&
-				    tcp_lro_rx(&ring->lro, mb, 0) == 0)
+						tcp_lro_rx(&ring->lro, mb, 0) == 0)
 					goto next;
 			}
+
 #endif
-
 			/* LRO not possible, complete processing here */
 			INC_PERF_COUNTER(priv->pstats.lro_misses);
 		} else {
 			mb->m_pkthdr.csum_flags = 0;
 			priv->port_stats.rx_chksum_none++;
-#ifdef INET
-			if (priv->ip_reasm &&
-			    cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4) &&
-			    !mlx4_en_rx_frags(priv, ring, mb, cqe))
-				goto next;
-#endif
 		}
 
 		/* Push it up the stack */
@@ -548,9 +664,9 @@
 		dev->if_input(dev, mb);
 
 next:
-		++cq->mcq.cons_index;
-		index = (cq->mcq.cons_index) & ring->size_mask;
-		cqe = &cq->buf[index];
+		++cons_index;
+		index = cons_index & size_mask;
+		cqe = &buf[CQE_FACTOR_INDEX(index, factor)];
 		if (++polled == budget)
 			goto out;
 	}
@@ -557,7 +673,6 @@
 	/* Flush all pending LRO sessions */
 out:
 #ifdef INET
-	mlx4_en_flush_frags(priv, ring);
 	while ((queued = SLIST_FIRST(&ring->lro.lro_active)) != NULL) {
 		SLIST_REMOVE_HEAD(&ring->lro.lro_active, next);
 		tcp_lro_flush(&ring->lro, queued);
@@ -564,99 +679,64 @@
 	}
 #endif
 	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
-	mlx4_cq_set_ci(&cq->mcq);
+	mcq->cons_index = cons_index;
+	mlx4_cq_set_ci(mcq);
 	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
-	ring->cons = cq->mcq.cons_index;
+	ring->cons = mcq->cons_index;
 	ring->prod += polled; /* Polled descriptors were reallocated in place */
 	mlx4_en_update_rx_prod_db(ring);
 	return polled;
+
 }
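The loop's stop condition relies on a parity trick: the hardware toggles the CQE ownership bit on every pass over the CQ, and (cons_index & size) flips with the same period, so the XNOR holds exactly for entries completed in the current pass and fails on the first slot the hardware has not yet written. A sketch of that logic (the XNOR macro here is an assumption mirroring the driver's):

#include <stdio.h>

/* assumed to mirror the driver macro: true when both args agree */
#define XNOR(a, b) (!(a) == !(b))

int
main(void)
{
	unsigned size = 8;	/* hypothetical power-of-two CQ size */
	unsigned hw_done = 10;	/* HW has completed entries 0..9 */
	unsigned cons;

	for (cons = 6; cons <= 10; cons++) {
		/* owner bit toggles per pass; unwritten slots still
		 * carry the previous pass's value */
		unsigned owner = (cons < hw_done) ? ((cons / size) & 1)
		    : (((cons / size) & 1) ^ 1);
		printf("cons=%u owner=%u process=%d\n",
		    cons, owner, XNOR(owner, cons & size));
	}
	return (0);
}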
 
-
 /* Rx CQ polling - called by NAPI */
 static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget)
 {
-	struct net_device *dev = cq->dev;
-	int done;
+	struct net_device *dev = cq->dev;
+	int done;
 
-	done = mlx4_en_process_rx_cq(dev, cq, budget);
-	cq->tot_rx += done;
+	done = mlx4_en_process_rx_cq(dev, cq, budget);
+	cq->tot_rx += done;
 
-	return done;
-}
+	return done;
 
-void mlx4_en_rx_que(void *context, int pending)
-{
-	struct mlx4_en_cq *cq;
-
-        cq = context;
-	while (mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL)
-	    == MLX4_EN_MAX_RX_POLL);
-	mlx4_en_arm_cq(cq->dev->if_softc, cq);
 }
-
 void mlx4_en_rx_irq(struct mlx4_cq *mcq)
 {
 	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
 	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
-	int done;
+	int done;
 
-	done = mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL);
-	if (done == MLX4_EN_MAX_RX_POLL)
+	/* Poll once within the IRQ context, because there is no NAPI
+	 * in FreeBSD. */
+	done = mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET);
+	if (priv->port_up && (done == MLX4_EN_RX_BUDGET)) {
+		cq->curr_poll_rx_cpu_id = curcpu;
 		taskqueue_enqueue(cq->tq, &cq->cq_task);
-	else
+	} else {
 		mlx4_en_arm_cq(priv, cq);
+	}
 }
 
-
-#if MLX4_EN_MAX_RX_FRAGS == 3
-static int frag_sizes[] = {
-	FRAG_SZ0,
-	FRAG_SZ1,
-	FRAG_SZ2,
-};
-#elif MLX4_EN_MAX_RX_FRAGS == 2
-static int frag_sizes[] = {
-	FRAG_SZ0,
-	FRAG_SZ1,
-};
-#else
-#error "Unknown MAX_RX_FRAGS"
-#endif
-
-void mlx4_en_calc_rx_buf(struct net_device *dev)
+void mlx4_en_rx_que(void *context, int pending)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-	int eff_mtu = dev->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETH_LLC_SNAP_SIZE;
-	int buf_size = 0;
-	int i, frag;
+	struct mlx4_en_cq *cq;
+	struct thread *td;
 
-	for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
-		/*
-		 * Allocate small to large but only as much as is needed for
-		 * the tail.
-		 */
-		while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
-			i--;
-		priv->frag_info[frag].frag_size = frag_sizes[i];
-		priv->frag_info[frag].frag_prefix_size = buf_size;
-		buf_size += priv->frag_info[frag].frag_size;
-	}
+	cq = context;
+	td = curthread;
 
-	priv->num_frags = frag;
-	priv->rx_mb_size = eff_mtu;
-	priv->log_rx_info =
-	    ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *));
+	thread_lock(td);
+	sched_bind(td, cq->curr_poll_rx_cpu_id);
+	thread_unlock(td);
 
-	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
-		  "num_frags:%d):\n", eff_mtu, priv->num_frags);
-	for (i = 0; i < priv->num_frags; i++) {
-		en_dbg(DRV, priv, "  frag:%d - size:%d prefix:%d\n", i,
-				priv->frag_info[i].frag_size,
-				priv->frag_info[i].frag_prefix_size)
-	}
+	while (mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET) ==
+	    MLX4_EN_RX_BUDGET)
+		;
+	mlx4_en_arm_cq(cq->dev->if_softc, cq);
 }
 
+
 /* RSS related functions */
 
 static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
@@ -683,9 +763,16 @@
 
 	memset(context, 0, sizeof *context);
 	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
-				qpn, ring->cqn, context);
+				qpn, ring->cqn, -1, context);
 	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
 
+	/* Cancel FCS removal if FW allows */
+	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) {
+		context->param3 |= cpu_to_be32(1 << 29);
+		ring->fcs_del = ETH_FCS_LEN;
+	} else
+		ring->fcs_del = 0;
+
 	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
 	if (err) {
 		mlx4_qp_remove(mdev->dev, qp);
@@ -697,6 +784,36 @@
 	return err;
 }
 
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
+{
+	int err;
+	u32 qpn;
+
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0);
+	if (err) {
+		en_err(priv, "Failed reserving drop qpn\n");
+		return err;
+	}
+	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp);
+	if (err) {
+		en_err(priv, "Failed allocating drop qp\n");
+		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+		return err;
+	}
+
+	return 0;
+}
+
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
+{
+	u32 qpn;
+
+	qpn = priv->drop_qp.qpn;
+	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+}
+
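The two helpers above follow the usual reserve-then-allocate discipline: if mlx4_qp_alloc() fails, the reserved range is released before returning, and mlx4_en_destroy_drop_qp() runs the same steps in reverse. A toy model of that unwind order (names hypothetical):

#include <stdio.h>

static int  reserve(unsigned *qpn) { *qpn = 42; return (0); }
static int  alloc_qp(unsigned qpn) { (void)qpn; return (-1); /* force failure */ }
static void release(unsigned qpn)  { printf("released qpn %u\n", qpn); }

int
main(void)
{
	unsigned qpn;

	if (reserve(&qpn) != 0)
		return (1);
	if (alloc_qp(qpn) != 0) {
		release(qpn);	/* unwind the reservation on failure */
		return (1);
	}
	return (0);
}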
 /* Allocate rx qp's and configure them according to rss map */
 int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 {
@@ -703,21 +820,22 @@
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
 	struct mlx4_qp_context context;
-	struct mlx4_en_rss_context *rss_context;
+	struct mlx4_rss_context *rss_context;
+	int rss_rings;
 	void *ptr;
-	u8 rss_mask;
-	int i, qpn;
+	u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 |
+			MLX4_RSS_TCP_IPV6);
+	int i;
 	int err = 0;
 	int good_qps = 0;
+	static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC,
+				0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD,
+				0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
 
-	if (mdev->profile.udp_rss)
-		rss_mask = 0x3f;
-	else
-		rss_mask = 0x14;
 	en_dbg(DRV, priv, "Configuring rss steering\n");
 	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
-				    roundup_pow_of_two(priv->rx_ring_num),
-				    &rss_map->base_qpn);
+				    priv->rx_ring_num,
+				    &rss_map->base_qpn, 0);
 	if (err) {
 		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
 		return err;
@@ -724,9 +842,9 @@
 	}
 
 	for (i = 0; i < priv->rx_ring_num; i++) {
-		qpn = rss_map->base_qpn + i;
-		err = mlx4_en_config_rss_qp(priv, qpn,
-					    &priv->rx_ring[i],
+		priv->rx_ring[i]->qpn = rss_map->base_qpn + i;
+		err = mlx4_en_config_rss_qp(priv, priv->rx_ring[i]->qpn,
+					    priv->rx_ring[i],
 					    &rss_map->state[i],
 					    &rss_map->qps[i]);
 		if (err)
@@ -736,28 +854,34 @@
 	}
 
 	/* Configure RSS indirection qp */
-	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn);
-	if (err) {
-		en_err(priv, "Failed to reserve range for RSS "
-			     "indirection qp\n");
-		goto rss_err;
-	}
 	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp);
 	if (err) {
 		en_err(priv, "Failed to allocate RSS indirection QP\n");
-		goto reserve_err;
+		goto rss_err;
 	}
 	rss_map->indir_qp.event = mlx4_en_sqp_event;
 	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
-				priv->rx_ring[0].cqn, &context);
+				priv->rx_ring[0]->cqn, -1, &context);
 
-	ptr = ((void *) &context) + 0x3c;
-	rss_context = (struct mlx4_en_rss_context *) ptr;
-	rss_context->base_qpn = cpu_to_be32(ilog2(priv->rx_ring_num) << 24 |
+	if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num)
+		rss_rings = priv->rx_ring_num;
+	else
+		rss_rings = priv->prof->rss_rings;
+
+	ptr = ((u8 *)&context) + offsetof(struct mlx4_qp_context, pri_path) +
+	    MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
+	rss_context = ptr;
+	rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 |
 					    (rss_map->base_qpn));
 	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
+	if (priv->mdev->profile.udp_rss) {
+		rss_mask |=  MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6;
+		rss_context->base_qpn_udp = rss_context->default_qpn;
+	}
 	rss_context->flags = rss_mask;
-	rss_context->base_qpn_udp = rss_context->default_qpn;
+	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+	for (i = 0; i < 10; i++)
+		rss_context->rss_key[i] = cpu_to_be32(rsskey[i]);
 
 	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
 			       &rss_map->indir_qp, &rss_map->indir_state);
@@ -771,8 +895,6 @@
 		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
 	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
 	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
-reserve_err:
-	mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
 rss_err:
 	for (i = 0; i < good_qps; i++) {
 		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
@@ -794,7 +916,6 @@
 		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
 	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
 	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
-	mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
 
 	for (i = 0; i < priv->rx_ring_num; i++) {
 		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
@@ -804,3 +925,4 @@
 	}
 	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
 }
+
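The new rss_context programs MLX4_RSS_HASH_TOP, i.e. the Toeplitz function, keyed with the fixed 10-dword key above; the CQE's immed_rss_invalid field is what the RX path now forwards as the flowid. A minimal userland sketch of a Toeplitz hash over an IPv4/TCP tuple (the byte layout of the key is an assumption based on the cpu_to_be32() stores above):

#include <stdint.h>
#include <stdio.h>

/* Toeplitz: for every set bit of the input, XOR in the 32-bit window of
 * the key starting at that bit position. */
static uint32_t
toeplitz(const uint8_t *key, const uint8_t *data, int len)
{
	uint32_t hash = 0;
	uint32_t window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
	    ((uint32_t)key[2] << 8) | key[3];
	int i, b;

	for (i = 0; i < len; i++) {
		for (b = 7; b >= 0; b--) {
			if (data[i] & (1 << b))
				hash ^= window;
			/* slide the key window one bit to the left */
			window = (window << 1) | ((key[i + 4] >> b) & 1);
		}
	}
	return (hash);
}

int
main(void)
{
	/* the rsskey dwords above, laid out big-endian */
	static const uint8_t key[40] = {
		0xD1, 0x81, 0xC6, 0x2C, 0xF7, 0xF4, 0xDB, 0x5B,
		0x19, 0x83, 0xA2, 0xFC, 0x94, 0x3E, 0x1A, 0xDB,
		0xD9, 0x38, 0x9E, 0x6B, 0xD1, 0x03, 0x9C, 0x2C,
		0xA7, 0x44, 0x99, 0xAD, 0x59, 0x3D, 0x56, 0xD9,
		0xF3, 0x25, 0x3C, 0x06, 0x2A, 0xDC, 0x1F, 0xFC,
	};
	/* IPv4 TCP tuple: 10.0.0.1:1024 -> 10.0.0.2:80 */
	static const uint8_t tuple[12] = {
		10, 0, 0, 1, 10, 0, 0, 2, 0x04, 0x00, 0x00, 0x50,
	};

	printf("rss hash = 0x%08x\n",
	    (unsigned)toeplitz(key, tuple, sizeof(tuple)));
	return (0);
}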

Modified: trunk/sys/ofed/drivers/net/mlx4/en_tx.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/en_tx.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/en_tx.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,16 +31,14 @@
  *
  */
 
-#include "mlx4_en.h"
-
+#include <linux/page.h>
 #include <linux/mlx4/cq.h>
+#include <linux/slab.h>
 #include <linux/mlx4/qp.h>
+#include <linux/if_vlan.h>
 #include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
 
-#include <net/ethernet.h>
-#include <net/if_vlan_var.h>
-#include <sys/mbuf.h>
-
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
@@ -50,64 +48,105 @@
 #include <netinet/tcp_lro.h>
 #include <netinet/udp.h>
 
+#include "mlx4_en.h"
+#include "utils.h"
+
 enum {
 	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
 	MAX_BF = 256,
+	MIN_PKT_LEN = 17,
 };
 
-static int inline_thold = MAX_INLINE;
+static int inline_thold __read_mostly = MAX_INLINE;
 
-module_param_named(inline_thold, inline_thold, int, 0444);
-MODULE_PARM_DESC(inline_thold, "treshold for using inline data");
+module_param_named(inline_thold, inline_thold, uint, 0444);
+MODULE_PARM_DESC(inline_thold, "threshold for using inline data");
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
-			   struct mlx4_en_tx_ring *ring, u32 size,
-			   u16 stride)
+			   struct mlx4_en_tx_ring **pring, u32 size,
+			   u16 stride, int node, int queue_idx)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring;
+	uint32_t x;
 	int tmp;
 	int err;
 
+	ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node);
+	if (!ring) {
+		ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL);
+		if (!ring) {
+			en_err(priv, "Failed allocating TX ring\n");
+			return -ENOMEM;
+		}
+	}
+
+	/* Create DMA descriptor TAG */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,					/* any alignment */
+	    0,					/* no boundary */
+	    BUS_SPACE_MAXADDR,			/* lowaddr */
+	    BUS_SPACE_MAXADDR,			/* highaddr */
+	    NULL, NULL,				/* filter, filterarg */
+	    MLX4_EN_TX_MAX_PAYLOAD_SIZE,	/* maxsize */
+	    MLX4_EN_TX_MAX_MBUF_FRAGS,		/* nsegments */
+	    MLX4_EN_TX_MAX_MBUF_SIZE,		/* maxsegsize */
+	    0,					/* flags */
+	    NULL, NULL,				/* lockfunc, lockfuncarg */
+	    &ring->dma_tag)))
+		goto done;
+
 	ring->size = size;
 	ring->size_mask = size - 1;
 	ring->stride = stride;
-
-	inline_thold = min(inline_thold, MAX_INLINE);
-
+	ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE));
 	mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
 	mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);
 
 	/* Allocate the buf ring */
 	ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF,
-	    M_WAITOK, &ring->tx_lock.m);
+		M_WAITOK, &ring->tx_lock.m);
 	if (ring->br == NULL) {
 		en_err(priv, "Failed allocating tx_info ring\n");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto err_free_dma_tag;
 	}
 
 	tmp = size * sizeof(struct mlx4_en_tx_info);
-	ring->tx_info = kmalloc(tmp, GFP_KERNEL);
+	ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node);
 	if (!ring->tx_info) {
-		en_err(priv, "Failed allocating tx_info ring\n");
-		err = -ENOMEM;
-		goto err_tx;
+		ring->tx_info = kzalloc(tmp, GFP_KERNEL);
+		if (!ring->tx_info) {
+			err = -ENOMEM;
+			goto err_ring;
+		}
 	}
+
+	/* Create DMA descriptor MAPs */
+	for (x = 0; x != size; x++) {
+		err = -bus_dmamap_create(ring->dma_tag, 0,
+		    &ring->tx_info[x].dma_map);
+		if (err != 0) {
+			while (x--) {
+				bus_dmamap_destroy(ring->dma_tag,
+				    ring->tx_info[x].dma_map);
+			}
+			goto err_info;
+		}
+	}
+
 	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
 		 ring->tx_info, tmp);
 
-	ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
-	if (!ring->bounce_buf) {
-		en_err(priv, "Failed allocating bounce buffer\n");
-		err = -ENOMEM;
-		goto err_tx;
-	}
 	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
 
+	/* Allocate HW buffers on provided NUMA node */
 	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
 				 2 * PAGE_SIZE);
 	if (err) {
 		en_err(priv, "Failed allocating hwq resources\n");
-		goto err_bounce;
+		goto err_dma_map;
 	}
 
 	err = mlx4_en_map_buffer(&ring->wqres.buf);
@@ -122,9 +161,10 @@
 	       "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
 	       ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);
 
-	err = mlx4_qp_reserve_range(mdev->dev, 1, 256, &ring->qpn);
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
+				    MLX4_RESERVE_BF_QP);
 	if (err) {
-		en_err(priv, "Failed reserving qp for tx ring.\n");
+		en_err(priv, "failed reserving qp for TX ring\n");
 		goto err_map;
 	}
 
@@ -135,14 +175,19 @@
 	}
 	ring->qp.event = mlx4_en_sqp_event;
 
-	err = mlx4_bf_alloc(mdev->dev, &ring->bf);
+	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
 	if (err) {
+		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
 		ring->bf.uar = &mdev->priv_uar;
 		ring->bf.uar->map = mdev->uar_map;
 		ring->bf_enabled = false;
 	} else
 		ring->bf_enabled = true;
+	ring->queue_index = queue_idx;
+	if (queue_idx < priv->num_tx_rings_p_up)
+		CPU_SET(queue_idx, &ring->affinity_mask);
 
+	*pring = ring;
 	return 0;
 
 err_reserve:
@@ -151,20 +196,26 @@
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
-err_bounce:
-	kfree(ring->bounce_buf);
-	ring->bounce_buf = NULL;
-err_tx:
+err_dma_map:
+	for (x = 0; x != size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
+err_info:
+	vfree(ring->tx_info);
+err_ring:
 	buf_ring_free(ring->br, M_DEVBUF);
-	kfree(ring->tx_info);
-	ring->tx_info = NULL;
+err_free_dma_tag:
+	bus_dma_tag_destroy(ring->dma_tag);
+done:
+	kfree(ring);
 	return err;
 }
 
 void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
-			     struct mlx4_en_tx_ring *ring)
+			     struct mlx4_en_tx_ring **pring)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring = *pring;
+	uint32_t x;
 	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
 
 	buf_ring_free(ring->br, M_DEVBUF);
@@ -172,20 +223,22 @@
 		mlx4_bf_free(mdev->dev, &ring->bf);
 	mlx4_qp_remove(mdev->dev, &ring->qp);
 	mlx4_qp_free(mdev->dev, &ring->qp);
-	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
-	kfree(ring->bounce_buf);
-	ring->bounce_buf = NULL;
-	kfree(ring->tx_info);
-	ring->tx_info = NULL;
+	for (x = 0; x != ring->size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
+	vfree(ring->tx_info);
 	mtx_destroy(&ring->tx_lock.m);
 	mtx_destroy(&ring->comp_lock.m);
+	bus_dma_tag_destroy(ring->dma_tag);
+	kfree(ring);
+	*pring = NULL;
 }
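Note the symmetry with the create path above: one bus_dmamap per descriptor is created up front and every one of them is destroyed here, while a mid-loop creation failure unwinds with the classic while (x--) idiom so only maps that were actually created get destroyed. A toy model of that partial unwind:

#include <stdio.h>

#define N 4

static int  create_one(unsigned i)  { return ((i == 2) ? -1 : 0); /* fail at #2 */ }
static void destroy_one(unsigned i) { printf("destroyed map %u\n", i); }

int
main(void)
{
	unsigned x;

	for (x = 0; x != N; x++) {
		if (create_one(x) != 0) {
			while (x--)	/* unwind only what was created */
				destroy_one(x);
			return (1);
		}
	}
	return (0);
}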
 
 int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_tx_ring *ring,
-			     int cq)
+			     int cq, int user_prio)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	int err;
@@ -196,20 +249,18 @@
 	ring->last_nr_txbb = 1;
 	ring->poll_cnt = 0;
 	ring->blocked = 0;
-	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
 	memset(ring->buf, 0, ring->buf_size);
 
 	ring->qp_state = MLX4_QP_STATE_RST;
-	ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
+	ring->doorbell_qpn = ring->qp.qpn << 8;
 
 	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
-				ring->cqn, &ring->context);
+				ring->cqn, user_prio, &ring->context);
 	if (ring->bf_enabled)
 		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
 
 	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
 			       &ring->qp, &ring->qp_state);
-
 	return err;
 }
 
@@ -222,65 +273,65 @@
 		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
 }
 
+static volatile struct mlx4_wqe_data_seg *
+mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg,
+    struct mbuf *mb, int len, __be32 owner_bit)
+{
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
 
-static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
-				struct mlx4_en_tx_ring *ring,
-				int index, u8 owner)
+	/* copy data into place */
+	m_copydata(mb, 0, len, inl + 4);
+	dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
+	return (dseg);
+}
+
+static void
+mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg,
+    int len, __be32 owner_bit)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
+}
+
+static void
+mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
+    struct mlx4_en_tx_ring *ring, u32 index, u8 owner)
+{
 	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
-	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
-	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
-	struct mbuf *mb = tx_info->mb;
-	void *end = ring->buf + ring->buf_size;
-	int frags = tx_info->nr_segs;
-	int i;
-	__be32 *ptr = (__be32 *)tx_desc;
-	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+	struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *)
+	    (ring->buf + (index * TXBB_SIZE));
+	volatile __be32 *ptr = (__be32 *)tx_desc;
+	const __be32 stamp = cpu_to_be32(STAMP_VAL |
+	    ((u32)owner << STAMP_SHIFT));
+	u32 i;
 
-	/* Optimize the common case when there are no wraparounds */
-	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
-		if (!tx_info->inl) {
-			for (i = 0; i < frags; i++) {
-				pci_unmap_single(mdev->pdev,
-					(dma_addr_t) be64_to_cpu(data[i].addr),
-					data[i].byte_count, PCI_DMA_TODEVICE);
-			}
-		}
-		/* Stamp the freed descriptor */
-		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
-			*ptr = stamp;
-			ptr += STAMP_DWORDS;
-		}
+	/* Stamp the freed descriptor */
+	for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+		*ptr = stamp;
+		ptr += STAMP_DWORDS;
+	}
+}
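mlx4_en_stamp_wqe() overwrites the first dword of every STAMP_STRIDE bytes of a completed descriptor with STAMP_VAL plus the current sweep's ownership bit, so a stale WQE can never be mistaken for a fresh one. A standalone sketch of the stamping walk (STAMP_VAL here is an assumed placeholder, not the driver's constant):

#include <stdint.h>
#include <stdio.h>

#define TXBB_SIZE    64
#define STAMP_STRIDE 64
#define STAMP_DWORDS (STAMP_STRIDE / 4)
#define STAMP_SHIFT  31
#define STAMP_VAL    0x7fffffff	/* assumed value, for illustration only */

int
main(void)
{
	uint32_t buf[2 * STAMP_DWORDS];	/* two TXBBs worth of dwords */
	uint32_t *ptr = buf;
	uint32_t stamp = STAMP_VAL | ((uint32_t)1 << STAMP_SHIFT);
	unsigned nr_txbb = 2, i;

	for (i = 0; i < nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
		*ptr = stamp;		/* stamp first dword of each stride */
		ptr += STAMP_DWORDS;
	}
	printf("stamped %u TXBBs with 0x%08x\n", nr_txbb, (unsigned)buf[0]);
	return (0);
}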
 
-	} else {
-		if (!tx_info->inl) {
-			for (i = 0; i < frags; i++) {
-				/* Check for wraparound before unmapping */
-				if ((void *) data >= end)
-					data = (struct mlx4_wqe_data_seg *) ring->buf;
-				pci_unmap_single(mdev->pdev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					data->byte_count, PCI_DMA_TODEVICE);
-				++data;
-			}
-		}
-		/* Stamp the freed descriptor */
-		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
-			*ptr = stamp;
-			ptr += STAMP_DWORDS;
-			if ((void *) ptr >= end) {
-				ptr = ring->buf;
-				stamp ^= cpu_to_be32(0x80000000);
-			}
-		}
+static u32
+mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+    struct mlx4_en_tx_ring *ring, u32 index)
+{
+	struct mlx4_en_tx_info *tx_info;
+	struct mbuf *mb;
 
-	}
-	m_freem(mb);
-	return tx_info->nr_txbb;
+	tx_info = &ring->tx_info[index];
+	mb = tx_info->mb;
+
+	if (mb == NULL)
+		goto done;
+
+	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
+	    BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_unload(ring->dma_tag, tx_info->dma_map);
+
+	m_freem(mb);
+done:
+	return (tx_info->nr_txbb);
 }
 
-
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -292,14 +343,13 @@
 		 ring->cons, ring->prod);
 
 	if ((u32) (ring->prod - ring->cons) > ring->size) {
-		en_warn(priv, "Tx consumer passed producer!\n");
+		en_warn(priv, "Tx consumer passed producer!\n");
 		return 0;
 	}
 
 	while (ring->cons != ring->prod) {
 		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
-						ring->cons & ring->size_mask,
-						!!(ring->cons & ring->size));
+		    ring->cons & ring->size_mask);
 		ring->cons += ring->last_nr_txbb;
 		cnt++;
 	}
@@ -310,102 +360,96 @@
 	return cnt;
 }
 
-void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num)
+static bool
+mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring)
 {
-	int block = 8 / ring_num;
-	int extra = 8 - (block * ring_num);
-	int num = 0;
-	u16 ring = 1;
-	int prio;
-
-	if (ring_num == 1) {
-		for (prio = 0; prio < 8; prio++)
-			prio_map[prio] = 0;
-		return;
-	}
-
-	for (prio = 0; prio < 8; prio++) {
-		if (extra && (num == block + 1)) {
-			ring++;
-			num = 0;
-			extra--;
-		} else if (!extra && (num == block)) {
-			ring++;
-			num = 0;
-		}
-		prio_map[prio] = ring;
-		en_dbg(DRV, priv, " prio:%d --> ring:%d\n", prio, ring);
-		num++;
-	}
+	int wqs;
+	wqs = ring->size - (ring->prod - ring->cons);
+	return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS)));
 }
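The full test keeps enough headroom for two maximum-size WQEs plus a spare, so the producer can never catch the consumer. Worked numbers (HEADROOM and MLX4_EN_TX_WQE_MAX_WQEBBS are assumed values; the unsigned subtraction makes prod - cons wrap-safe):

#include <stdio.h>

#define HEADROOM		  4	/* assumed value */
#define MLX4_EN_TX_WQE_MAX_WQEBBS 16	/* assumed value */

int
main(void)
{
	unsigned size = 1024, prod = 2000, cons = 990;
	int wqs = size - (prod - cons);	/* free TX basic blocks: 14 */

	printf("free=%d full=%d\n", wqs,
	    wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS)));
	return (0);
}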
 
-static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq)
+static int mlx4_en_process_tx_cq(struct net_device *dev,
+				 struct mlx4_en_cq *cq)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_cq *mcq = &cq->mcq;
-	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
-	struct mlx4_cqe *cqe = cq->buf;
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
+	struct mlx4_cqe *cqe;
 	u16 index;
-	u16 new_index;
+	u16 new_index, ring_index, stamp_index;
 	u32 txbbs_skipped = 0;
-	u32 cq_last_sav;
+	u32 txbbs_stamp = 0;
+	u32 cons_index = mcq->cons_index;
+	int size = cq->size;
+	u32 size_mask = ring->size_mask;
+	struct mlx4_cqe *buf = cq->buf;
+	int factor = priv->cqe_factor;
 
-	/* index always points to the first TXBB of the last polled descriptor */
-	index = ring->cons & ring->size_mask;
-	new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
-	if (index == new_index)
-		return;
-
 	if (!priv->port_up)
-		return;
+		return 0;
 
-	/*
-	 * We use a two-stage loop:
-	 * - the first samples the HW-updated CQE
-	 * - the second frees TXBBs until the last sample
-	 * This lets us amortize CQE cache misses, while still polling the CQ
-	 * until is quiescent.
-	 */
-	cq_last_sav = mcq->cons_index;
-	do {
+	index = cons_index & size_mask;
+	cqe = &buf[(index << factor) + factor];
+	ring_index = ring->cons & size_mask;
+	stamp_index = ring_index;
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+			cons_index & size)) {
+		/*
+		 * make sure we read the CQE after we read the
+		 * ownership bit
+		 */
+		rmb();
+
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+			     MLX4_CQE_OPCODE_ERROR)) {
+			en_err(priv, "CQE completed in error - vendor syndrome: 0x%x syndrome: 0x%x\n",
+			       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
+			       ((struct mlx4_err_cqe *)cqe)->syndrome);
+		}
+
+		/* Skip over last polled CQE */
+		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
+
 		do {
-			/* Skip over last polled CQE */
-			index = (index + ring->last_nr_txbb) & ring->size_mask;
 			txbbs_skipped += ring->last_nr_txbb;
-
-			/* Poll next CQE */
+			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
+			/* free next descriptor */
 			ring->last_nr_txbb = mlx4_en_free_tx_desc(
-						priv, ring, index,
-						!!((ring->cons + txbbs_skipped) &
-						   ring->size));
-			++mcq->cons_index;
+			    priv, ring, ring_index);
+			mlx4_en_stamp_wqe(priv, ring, stamp_index,
+					  !!((ring->cons + txbbs_stamp) &
+						ring->size));
+			stamp_index = ring_index;
+			txbbs_stamp = txbbs_skipped;
+		} while (ring_index != new_index);
 
-		} while (index != new_index);
+		++cons_index;
+		index = cons_index & size_mask;
+		cqe = &buf[(index << factor) + factor];
+	}
 
-		new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
-	} while (index != new_index);
-	AVG_PERF_COUNTER(priv->pstats.tx_coal_avg,
-			 (u32) (mcq->cons_index - cq_last_sav));
 
 	/*
 	 * To prevent CQ overflow we first update CQ consumer and only then
 	 * the ring consumer.
 	 */
+	mcq->cons_index = cons_index;
 	mlx4_cq_set_ci(mcq);
 	wmb();
 	ring->cons += txbbs_skipped;
 
-	/* Wakeup Tx queue if this ring stopped it */
-	if (unlikely(ring->blocked)) {
-		if ((u32) (ring->prod - ring->cons) <=
-		     ring->size - HEADROOM - MAX_DESC_TXBBS) {
-			ring->blocked = 0;
-			if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
-				atomic_clear_int(&dev->if_drv_flags,
-				    IFF_DRV_OACTIVE);
-			priv->port_stats.wake_queue++;
-		}
+	/* Wakeup Tx queue if it was stopped and ring is not full */
+	if (unlikely(ring->blocked) && !mlx4_en_tx_ring_is_full(ring)) {
+		ring->blocked = 0;
+		if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
+			atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE);
+		ring->wake_queue++;
+		priv->port_stats.wake_queue++;
 	}
+	return (0);
 }
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq)
@@ -412,9 +456,9 @@
 {
 	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
 	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
-	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
 
-	if (!spin_trylock(&ring->comp_lock))
+	if (priv->port_up == 0 || !spin_trylock(&ring->comp_lock))
 		return;
 	mlx4_en_process_tx_cq(cq->dev, cq);
 	mod_timer(&cq->timer, jiffies + 1);
@@ -421,16 +465,17 @@
 	spin_unlock(&ring->comp_lock);
 }
 
-
 void mlx4_en_poll_tx_cq(unsigned long data)
 {
 	struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
 	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
-	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
 	u32 inflight;
 
 	INC_PERF_COUNTER(priv->pstats.tx_poll);
 
+	if (priv->port_up == 0)
+		return;
 	if (!spin_trylock(&ring->comp_lock)) {
 		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
 		return;
@@ -447,39 +492,14 @@
 	spin_unlock(&ring->comp_lock);
 }
 
-static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
-						      struct mlx4_en_tx_ring *ring,
-						      u32 index,
-						      unsigned int desc_size)
+static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
 {
-	u32 copy = (ring->size - index) * TXBB_SIZE;
-	int i;
+	struct mlx4_en_cq *cq = priv->tx_cq[tx_ind];
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];
 
-	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
-		if ((i & (TXBB_SIZE - 1)) == 0)
-			wmb();
+	if (priv->port_up == 0)
+		return;
 
-		*((u32 *) (ring->buf + i)) =
-			*((u32 *) (ring->bounce_buf + copy + i));
-	}
-
-	for (i = copy - 4; i >= 4 ; i -= 4) {
-		if ((i & (TXBB_SIZE - 1)) == 0)
-			wmb();
-
-		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
-			*((u32 *) (ring->bounce_buf + i));
-	}
-
-	/* Return real descriptor location */
-	return ring->buf + index * TXBB_SIZE;
-}
-
-static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
-{
-	struct mlx4_en_cq *cq = &priv->tx_cq[tx_ind];
-	struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind];
-
 	/* If we don't have a pending timer, set one up to catch our recent
 	   post in case the interface becomes idle */
 	if (!timer_pending(&cq->timer))
@@ -493,427 +513,478 @@
 		}
 }
 
-static int is_inline(struct mbuf *mb)
+static u16
+mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb)
 {
+	u16 retval;
 
-	if (inline_thold && mb->m_pkthdr.len <= inline_thold &&
-	    (mb->m_pkthdr.csum_flags & CSUM_TSO) == 0)
-		return 1;
+	/* only copy from first fragment, if possible */
+	retval = MIN(ring->inline_thold, mb->m_len);
 
-	return 0;
+	/* check for too little data */
+	if (unlikely(retval < MIN_PKT_LEN))
+		retval = MIN(ring->inline_thold, mb->m_pkthdr.len);
+	return (retval);
 }
 
-static int inline_size(struct mbuf *mb)
+static int
+mlx4_en_get_header_size(struct mbuf *mb)
 {
-	int len;
+	struct ether_vlan_header *eh;
+	struct tcphdr *th;
+	struct ip *ip;
+	int ip_hlen, tcp_hlen;
+	struct ip6_hdr *ip6;
+	uint16_t eth_type;
+	int eth_hdr_len;
 
-	len = mb->m_pkthdr.len;
-	if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
-	    <= MLX4_INLINE_ALIGN)
-		return ALIGN(len + CTRL_SIZE +
-			     sizeof(struct mlx4_wqe_inline_seg), 16);
-	else
-		return ALIGN(len + CTRL_SIZE + 2 *
-			     sizeof(struct mlx4_wqe_inline_seg), 16);
-}
-
-static int get_head_size(struct mbuf *mb)
-{
-	struct tcphdr *th;
-	struct ip *ip;
-	int ip_hlen, tcp_hlen;
-	int len;
-
-	len = ETHER_HDR_LEN;
-	if (mb->m_len < len + sizeof(struct ip))
+	eh = mtod(mb, struct ether_vlan_header *);
+	if (mb->m_len < ETHER_HDR_LEN)
 		return (0);
-	ip = (struct ip *)(mtod(mb, char *) + len);
-	if (ip->ip_p != IPPROTO_TCP)
+	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		eth_type = ntohs(eh->evl_proto);
+		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+	} else {
+		eth_type = ntohs(eh->evl_encap_proto);
+		eth_hdr_len = ETHER_HDR_LEN;
+	}
+	if (mb->m_len < eth_hdr_len)
 		return (0);
-	ip_hlen = ip->ip_hl << 2;
-	len += ip_hlen;
-	if (mb->m_len < len + sizeof(struct tcphdr))
+	switch (eth_type) {
+	case ETHERTYPE_IP:
+		ip = (struct ip *)(mb->m_data + eth_hdr_len);
+		if (mb->m_len < eth_hdr_len + sizeof(*ip))
+			return (0);
+		if (ip->ip_p != IPPROTO_TCP)
+			return (0);
+		ip_hlen = ip->ip_hl << 2;
+		eth_hdr_len += ip_hlen;
+		break;
+	case ETHERTYPE_IPV6:
+		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
+		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
+			return (0);
+		if (ip6->ip6_nxt != IPPROTO_TCP)
+			return (0);
+		eth_hdr_len += sizeof(*ip6);
+		break;
+	default:
 		return (0);
-	th = (struct tcphdr *)(mtod(mb, char *) + len);
+	}
+	if (mb->m_len < eth_hdr_len + sizeof(*th))
+		return (0);
+	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
 	tcp_hlen = th->th_off << 2;
-	len += tcp_hlen;
-	if (mb->m_len < len)
+	eth_hdr_len += tcp_hlen;
+	if (mb->m_len < eth_hdr_len)
 		return (0);
-	return (len);
+	return (eth_hdr_len);
 }
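For a common untagged IPv4/TCP frame with no options, the walk above yields 14 + 20 + 20 = 54 bytes, which is the inline LSO header length fed to the hardware. The arithmetic, spelled out:

#include <stdio.h>

int
main(void)
{
	int eth = 14;		/* ETHER_HDR_LEN, no VLAN tag */
	int ip_hl = 5;		/* IP header length in 32-bit words */
	int th_off = 5;		/* TCP data offset in 32-bit words */

	/* the header-length fields count 32-bit words, hence the << 2 */
	printf("lso header = %d bytes\n", eth + (ip_hl << 2) + (th_off << 2));
	return (0);
}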
 
-static int get_real_size(struct mbuf *mb, struct net_device *dev, int *segsp,
-    int *lso_header_size)
+static volatile struct mlx4_wqe_data_seg *
+mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg,
+    struct mbuf *mb, int len, __be32 owner_bit)
 {
-	struct mbuf *m;
-	int nr_segs;
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
+	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;
 
-	nr_segs = 0;
-	for (m = mb; m != NULL; m = m->m_next)
-		if (m->m_len)
-			nr_segs++;
-
-	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
-		*lso_header_size = get_head_size(mb);
-		if (*lso_header_size) {
-			if (mb->m_len == *lso_header_size)
-				nr_segs--;
-			*segsp = nr_segs;
-			return CTRL_SIZE + nr_segs * DS_SIZE +
-			    ALIGN(*lso_header_size + 4, DS_SIZE);
-		}
-	} else
-		*lso_header_size = 0;
-	*segsp = nr_segs;
-	if (is_inline(mb))
-		return inline_size(mb);
-	return (CTRL_SIZE + nr_segs * DS_SIZE);
-}
-
-static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len)
-{
-	int bytes;
-	int off;
-
-	off = *offp;
-	while (len) {
-		bytes = min(mb->m_len - off, len);
-		if (bytes)
-			memcpy(data, mb->m_data + off, bytes);
-		len -= bytes;
-		data += bytes;
-		off += bytes;
-		if (off == mb->m_len) {
-			off = 0;
-			mb = mb->m_next;
-		}
+	if (unlikely(len < MIN_PKT_LEN)) {
+		m_copydata(mb, 0, len, inl + 4);
+		memset(inl + 4 + len, 0, MIN_PKT_LEN - len);
+		dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT);
+	} else if (len <= spc) {
+		m_copydata(mb, 0, len, inl + 4);
+		dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
+	} else {
+		m_copydata(mb, 0, spc, inl + 4);
+		m_copydata(mb, spc, len - spc, inl + 8 + spc);
+		dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT);
 	}
-	*offp = off;
-	return (mb);
+	return (dseg);
 }
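The split point spc is the room left in the first inline segment after the control segment and the 4-byte byte count; longer payloads are split in two, each half carrying its own 4-byte count, hence the 8 + len in the rounding. Worked numbers, assuming MLX4_INLINE_ALIGN = 64, CTRL_SIZE = 16 and DS_SIZE_ALIGNMENT = 16:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int
main(void)
{
	int align = 64, ctrl = 16, ds = 16;	/* assumed constants */
	int spc = align - ctrl - 4;		/* 44 bytes before the split */
	int len = 104;				/* MAX_INLINE */

	if (len <= spc)
		printf("one segment, %d slots\n", DIV_ROUND_UP(4 + len, ds));
	else
		printf("split at %d, %d slots\n", spc,
		    DIV_ROUND_UP(8 + len, ds));
	return (0);
}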
 
-static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb,
-			     int real_size, u16 *vlan_tag, int tx_ind)
+static void
+mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg,
+    int len, __be32 owner_bit)
 {
-	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
-	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
-	int len;
-	int off;
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
+	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;
 
-	off = 0;
-	len = mb->m_pkthdr.len;
-	if (len <= spc) {
-		inl->byte_count = cpu_to_be32(1 << 31 | len);
-		mb_copy(mb, &off, (void *)(inl + 1), len);
+	if (unlikely(len < MIN_PKT_LEN)) {
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | MIN_PKT_LEN);
+	} else if (len <= spc) {
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | len);
 	} else {
-		inl->byte_count = cpu_to_be32(1 << 31 | spc);
-		mb = mb_copy(mb, &off, (void *)(inl + 1), spc);
-		inl = (void *) (inl + 1) + spc;
-		mb_copy(mb, &off, (void *)(inl + 1), len - spc);
+		*(volatile uint32_t *)(inl + 4 + spc) =
+		    SET_BYTE_COUNT((1 << 31) | (len - spc));
 		wmb();
-		inl->byte_count = cpu_to_be32(1 << 31 | (len - spc));
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | spc);
 	}
-	tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
-	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
-	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
 }
 
+static unsigned long hashrandom;
+static void hashrandom_init(void *arg)
+{
+	hashrandom = random();
+}
+SYSINIT(hashrandom_init, SI_SUB_KLD, SI_ORDER_SECOND, &hashrandom_init, NULL);
+
 u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_tx_hash_entry *entry;
-	struct ether_header *eth;
-	struct tcphdr *th;
-	struct ip *iph;
-	u32 hash_index;
-	int tx_ind = 0;
-	u16 vlan_tag = 0;
-	int len;
+	u32 rings_p_up = priv->num_tx_rings_p_up;
+	u32 up = 0;
+	u32 queue_index;
 
+#if (MLX4_EN_NUM_UP > 1)
 	/* Obtain VLAN information if present */
 	if (mb->m_flags & M_VLANTAG) {
-		vlan_tag = mb->m_pkthdr.ether_vtag;
-		/* Set the Tx ring to use according to vlan priority */
-		tx_ind = priv->tx_prio_map[vlan_tag >> 13];
-		if (tx_ind)
-			return tx_ind;
+		u32 vlan_tag = mb->m_pkthdr.ether_vtag;
+		up = (vlan_tag >> 13) % MLX4_EN_NUM_UP;
 	}
-	if (mb->m_len <
-	    ETHER_HDR_LEN + sizeof(struct ip) + sizeof(struct tcphdr))
-		return MLX4_EN_NUM_HASH_RINGS;
-	eth = mtod(mb, struct ether_header *);
-	/* Hashing is only done for TCP/IP or UDP/IP packets */
-	if (be16_to_cpu(eth->ether_type) != ETHERTYPE_IP)
-		return MLX4_EN_NUM_HASH_RINGS;
-	len = ETHER_HDR_LEN;
-	iph = (struct ip *)(mtod(mb, char *) + len);
-	len += iph->ip_hl << 2;
-	th = (struct tcphdr *)(mtod(mb, char *) + len);
-	hash_index = be32_to_cpu(iph->ip_dst.s_addr) & MLX4_EN_TX_HASH_MASK;
-	switch(iph->ip_p) {
-	case IPPROTO_UDP:
-		break;
-	case IPPROTO_TCP:
-		if (mb->m_len < len + sizeof(struct tcphdr))
-			return MLX4_EN_NUM_HASH_RINGS;
-		hash_index =
-		    (hash_index ^ be16_to_cpu(th->th_dport ^ th->th_sport)) &
-		    MLX4_EN_TX_HASH_MASK;
-		break;
-	default:
-		return MLX4_EN_NUM_HASH_RINGS;
-	}
+#endif
+	/* hash mbuf */
+	queue_index = mlx4_en_hashmbuf(MLX4_F_HASHL3 | MLX4_F_HASHL4, mb, hashrandom);
 
-	entry = &priv->tx_hash[hash_index];
-	if(unlikely(!entry->cnt)) {
-		tx_ind = hash_index & (MLX4_EN_NUM_HASH_RINGS / 2 - 1);
-		if (2 * entry->small_pkts > entry->big_pkts)
-			tx_ind += MLX4_EN_NUM_HASH_RINGS / 2;
-		entry->small_pkts = entry->big_pkts = 0;
-		entry->ring = tx_ind;
-	}
+	return ((queue_index % rings_p_up) + (up * rings_p_up));
+}
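The VLAN priority now only selects a band of rings_p_up rings, and the mbuf hash picks a ring within that band. For example, with 8 rings per UP, priority 2 and hash 12345:

#include <stdio.h>

int
main(void)
{
	unsigned rings_p_up = 8, up = 2, hash = 12345;

	/* 12345 % 8 = 1, plus the band offset 2 * 8 -> queue 17 */
	printf("tx queue = %u\n", (hash % rings_p_up) + (up * rings_p_up));
	return (0);
}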
 
-	entry->cnt++;
-	if (mb->m_pkthdr.len > MLX4_EN_SMALL_PKT_SIZE)
-		entry->big_pkts++;
-	else
-		entry->small_pkts++;
-	return entry->ring;
+static void mlx4_bf_copy(void __iomem *dst, volatile unsigned long *src, unsigned bytecnt)
+{
+	__iowrite64_copy(dst, __DEVOLATILE(void *, src), bytecnt / 8);
 }
 
-static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+static u64 mlx4_en_mac_to_u64(u8 *addr)
 {
-	__iowrite64_copy(dst, src, bytecnt / 8);
+	u64 mac = 0;
+	int i;
+
+	for (i = 0; i < ETHER_ADDR_LEN; i++) {
+		mac <<= 8;
+		mac |= addr[i];
+	}
+	return mac;
 }
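mlx4_en_mac_to_u64() simply folds the six address octets, most significant first, into the low 48 bits of a u64. Standalone:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t addr[6] = { 0x00, 0x25, 0x90, 0xaa, 0xbb, 0xcc };
	uint64_t mac = 0;
	int i;

	for (i = 0; i < 6; i++)
		mac = (mac << 8) | addr[i];
	printf("mac = 0x%012llx\n", (unsigned long long)mac);
	return (0);
}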
 
-static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp)
+static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_tx_ring *ring;
-	struct mlx4_en_cq *cq;
-	struct mlx4_en_tx_desc *tx_desc;
-	struct mlx4_wqe_data_seg *data;
+	enum {
+		DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT,
+		CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+		    MLX4_WQE_CTRL_SOLICITED),
+	};
+	bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS];
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_data_seg *dseg_inline;
+	volatile struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];
+	struct ifnet *ifp = priv->dev;
 	struct mlx4_en_tx_info *tx_info;
+	struct mbuf *mb = *mbp;
 	struct mbuf *m;
-	int nr_txbb;
+	__be32 owner_bit;
 	int nr_segs;
-	int desc_size;
-	int real_size;
-	dma_addr_t dma;
-	u32 index, bf_index;
-	__be32 op_own;
-	u16 vlan_tag = 0;
-	int i;
-	int lso_header_size;
-	bool bounce = false;
-	struct mbuf *mb;
-	int defrag = 1;
+	int pad;
+	int err;
+	u32 bf_size;
+	u32 bf_prod;
+	u32 opcode;
+	u16 index;
+	u16 ds_cnt;
+	u16 ihs;
 
-	ring = &priv->tx_ring[tx_ind];
-	mb = *mbp;
-	if (!priv->port_up)
+	if (unlikely(!priv->port_up)) {
+		err = EINVAL;
 		goto tx_drop;
-
-retry:
-	real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size);
-	if (unlikely(!real_size))
-		goto tx_drop;
-
-	/* Allign descriptor to TXBB size */
-	desc_size = ALIGN(real_size, TXBB_SIZE);
-	nr_txbb = desc_size / TXBB_SIZE;
-	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
-		if (defrag) {
-			mb = m_defrag(*mbp, M_DONTWAIT);
-			if (mb == NULL) {
-				mb = *mbp;
-				goto tx_drop;
-			}
-			*mbp = mb;
-			defrag = 0;
-			goto retry;
-		}
-		goto tx_drop;
 	}
 
-	/* Check available TXBBs And 2K spare for prefetch */
-	if (unlikely(((int)(ring->prod - ring->cons)) >
-		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
-		/* every full Tx ring stops queue */
-		if (ring->blocked == 0)
-			atomic_add_int(&priv->blocked, 1);
-		atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+	/* check if TX ring is full */
+	if (unlikely(mlx4_en_tx_ring_is_full(ring))) {
+		/* every full native Tx ring stops the queue */
+		if (ring->blocked == 0)
+			atomic_add_int(&priv->blocked, 1);
+		/* Set HW-queue-is-full flag */
+		atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 		ring->blocked = 1;
 		priv->port_stats.queue_stopped++;
+		ring->queue_stopped++;
 
 		/* Use interrupts to find out when queue opened */
-		cq = &priv->tx_cq[tx_ind];
-		mlx4_en_arm_cq(priv, cq);
-		return EBUSY;
-	}
+		mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]);
+		return (ENOBUFS);
+	}
 
+	/* sanity check we are not wrapping around */
+	KASSERT(((~ring->prod) & ring->size_mask) >=
+	    (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring"));
+
 	/* Track current inflight packets for performance analysis */
 	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
 			 (u32) (ring->prod - ring->cons - 1));
 
-	/* Packet is good - grab an index and transmit it */
+	/* Track current mbuf packet header length */
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
+
+	/* Grab an index and try to transmit packet */
+	owner_bit = (ring->prod & ring->size) ?
+		cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0;
 	index = ring->prod & ring->size_mask;
-	bf_index = ring->prod;
+	tx_desc = (volatile struct mlx4_en_tx_desc *)
+	    (ring->buf + index * TXBB_SIZE);
+	tx_info = &ring->tx_info[index];
+	dseg = &tx_desc->data;
 
-	/* See if we have enough space for whole descriptor TXBB for setting
-	 * SW ownership on next descriptor; if not, use a bounce buffer. */
-	if (likely(index + nr_txbb <= ring->size))
-		tx_desc = ring->buf + index * TXBB_SIZE;
-	else {
-		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
-		bounce = true;
-	}
+	/* send a copy of the frame to the BPF listener, if any */
+	if (ifp != NULL && ifp->if_bpf != NULL)
+		ETHER_BPF_MTAP(ifp, mb);
 
-	/* Prepare ctrl segement apart opcode+ownership, which depends on
-	 * whether LSO is used */
-	if (mb->m_flags & M_VLANTAG)
-		vlan_tag = mb->m_pkthdr.ether_vtag;
-	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag;
-	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
-	tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
-						MLX4_WQE_CTRL_SOLICITED);
-	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) {
-		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
-							 MLX4_WQE_CTRL_TCP_UDP_CSUM);
+	/* get default flags */
+	tx_desc->ctrl.srcrb_flags = CTRL_FLAGS;
+
+	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+
+	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
+	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+
+	/* do statistics */
+	if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) {
 		priv->port_stats.tx_chksum_offload++;
+		ring->tx_csum++;
 	}
 
-	if (unlikely(priv->validate_loopback)) {
-		/* Copy dst mac address to wqe */
-		struct ether_header *ethh;
-		u64 mac;
-		u32 mac_l, mac_h;
+	/* check for VLAN tag */
+	if (mb->m_flags & M_VLANTAG) {
+		tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag);
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN;
+	} else {
+		tx_desc->ctrl.vlan_tag = 0;
+		tx_desc->ctrl.ins_vlan = 0;
+	}
 
-		ethh = mtod(mb, struct ether_header *);
-		mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
-		if (mac) {
-			mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
-			mac_l = (u32) (mac & 0xffffffff);
-			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
-			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
+	/* clear immediate field */
+	tx_desc->ctrl.imm = 0;
+
+	/* Handle LSO (TSO) packets */
+	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
+		u32 payload_len;
+		u32 mss = mb->m_pkthdr.tso_segsz;
+		u32 num_pkts;
+
+		opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) |
+		    owner_bit;
+		ihs = mlx4_en_get_header_size(mb);
+		if (unlikely(ihs > MAX_INLINE)) {
+			ring->oversized_packets++;
+			err = EINVAL;
+			goto tx_drop;
 		}
+		tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs);
+		payload_len = mb->m_pkthdr.len - ihs;
+		if (unlikely(payload_len == 0))
+			num_pkts = 1;
+		else
+			num_pkts = DIV_ROUND_UP(payload_len, mss);
+		ring->bytes += payload_len + (num_pkts * ihs);
+		ring->packets += num_pkts;
+		priv->port_stats.tso_packets++;
+		/* store pointer to inline header */
+		dseg_inline = dseg;
+		/* copy data inline */
+		dseg = mlx4_en_store_inline_lso_data(dseg,
+		    mb, ihs, owner_bit);
+	} else {
+		opcode = cpu_to_be32(MLX4_OPCODE_SEND) |
+		    owner_bit;
+		ihs = mlx4_en_get_inline_hdr_size(ring, mb);
+		ring->bytes += max_t(unsigned int,
+		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
+		ring->packets++;
+		/* store pointer to inline header */
+		dseg_inline = dseg;
+		/* copy data inline */
+		dseg = mlx4_en_store_inline_data(dseg,
+		    mb, ihs, owner_bit);
 	}
+	m_adj(mb, ihs);
 
-	/* Handle LSO (TSO) packets */
-	if (lso_header_size) {
-		int segsz;
+	/* trim off empty mbufs */
+	while (mb->m_len == 0) {
+		mb = m_free(mb);
+		/* check if all data has been inlined */
+		if (mb == NULL) {
+			nr_segs = 0;
+			goto skip_dma;
+		}
+	}
 
-		/* Mark opcode as LSO */
-		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
-			((ring->prod & ring->size) ?
-				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+	err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
+	    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
+	if (unlikely(err == EFBIG)) {
+		/* Too many mbuf fragments */
+		m = m_defrag(mb, M_NOWAIT);
+		if (m == NULL) {
+			ring->oversized_packets++;
+			goto tx_drop;
+		}
+		mb = m;
+		/* Try again */
+		err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
+		    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
+	}
+	/* catch errors */
+	if (unlikely(err != 0)) {
+		ring->oversized_packets++;
+		goto tx_drop;
+	}
+	/* make sure all mbuf data is written to RAM */
+	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
+	    BUS_DMASYNC_PREWRITE);
 
-		/* Fill in the LSO prefix */
-		tx_desc->lso.mss_hdr_size = cpu_to_be32(
-			mb->m_pkthdr.tso_segsz << 16 | lso_header_size);
+skip_dma:
+	/* compute number of DS needed */
+	ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs;
 
-		/* Copy headers;
-		 * note that we already verified that it is linear */
-		memcpy(tx_desc->lso.header, mb->m_data, lso_header_size);
-		data = ((void *) &tx_desc->lso +
-			ALIGN(lso_header_size + 4, DS_SIZE));
+	/*
+	 * Check if the next request can wrap around and fill the end
+	 * of the current request with zero immediate data:
+	 */
+	pad = DIV_ROUND_UP(ds_cnt, DS_FACT);
+	pad = (~(ring->prod + pad)) & ring->size_mask;
 
-		priv->port_stats.tso_packets++;
-		segsz = mb->m_pkthdr.tso_segsz;
-		i = ((mb->m_pkthdr.len - lso_header_size) / segsz) +
-			!!((mb->m_pkthdr.len - lso_header_size) % segsz);
-		ring->bytes += mb->m_pkthdr.len + (i - 1) * lso_header_size;
-		ring->packets += i;
-		mb->m_data += lso_header_size;
-		mb->m_len -= lso_header_size;
+	if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) {
+		/*
+		 * Compute the least number of DS blocks we need to
+		 * pad in order to achieve a TX ring wraparound:
+		 */
+		pad = (DS_FACT * (pad + 1));
 	} else {
-		/* Normal (Non LSO) packet */
-		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
-			((ring->prod & ring->size) ?
-			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
-		data = &tx_desc->data;
-		ring->bytes += max(mb->m_pkthdr.len,
-		    (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN);
-		ring->packets++;
-
+		/*
+		 * The hardware will automatically jump to the next
+		 * TXBB. No need for padding.
+		 */
+		pad = 0;
 	}
-	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
 
-	/* Save mb in tx_info ring */
-	tx_info = &ring->tx_info[index];
+	/* compute total number of DS blocks */
+	ds_cnt += pad;
+	/*
+	 * When modifying this code, please ensure that the following
+	 * computation is always less than or equal to 0x3F:
+	 *
+	 * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) +
+	 * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT)
+	 *
+	 * Else the "ds_cnt" variable can become too big.
+	 */
+	tx_desc->ctrl.fence_size = (ds_cnt & 0x3f);
+
+	/* store pointer to mbuf */
 	tx_info->mb = mb;
-	tx_info->nr_txbb = nr_txbb;
-	tx_info->nr_segs = nr_segs;
-	/* valid only for non inline segments */
-	tx_info->data_offset = (void *) data - (void *) tx_desc;
+	tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT);
+	bf_size = ds_cnt * DS_SIZE_ALIGNMENT;
+	bf_prod = ring->prod;
 
-	if (!is_inline(mb)) {
-		for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) {
-			if (m->m_len == 0) {
-				i--;
-				continue;
-			}
-			dma = pci_map_single(mdev->dev->pdev, m->m_data,
-					     m->m_len, PCI_DMA_TODEVICE);
-			data->addr = cpu_to_be64(dma);
-			data->lkey = cpu_to_be32(mdev->mr.key);
+	/* compute end of "dseg" array */
+	dseg += nr_segs + pad;
+
+	/* pad using zero immediate dseg */
+	while (pad--) {
+		dseg--;
+		dseg->addr = 0;
+		dseg->lkey = 0;
+		wmb();
+		dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
+	}
+
+	/* fill segment list */
+	while (nr_segs--) {
+		if (unlikely(segs[nr_segs].ds_len == 0)) {
+			dseg--;
+			dseg->addr = 0;
+			dseg->lkey = 0;
 			wmb();
-			data->byte_count = cpu_to_be32(m->m_len);
-			data++;
+			dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
+		} else {
+			dseg--;
+			dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr);
+			dseg->lkey = cpu_to_be32(priv->mdev->mr.key);
+			wmb();
+			dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len);
 		}
-		if (lso_header_size) {
-			mb->m_data -= lso_header_size;
-			mb->m_len += lso_header_size;
-		}
-		tx_info->inl = 0;
-	} else {
-		build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind);
-		tx_info->inl = 1;
 	}
 
-	ring->prod += nr_txbb;
+	wmb();
 
-	/* If we used a bounce buffer then copy descriptor back into place */
-	if (bounce)
-		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+	/* write owner bits in reverse order */
+	if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO))
+		mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit);
+	else
+		mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit);
 
-	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) {
-		*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
-		op_own |= htonl((bf_index & 0xffff) << 8);
-		/* Ensure new descirptor hits memory
-		* before setting ownership of this descriptor to HW */
-		wmb();
-		tx_desc->ctrl.owner_opcode = op_own;
+	if (unlikely(priv->validate_loopback)) {
+		/* Copy dst mac address to wqe */
+		struct ether_header *ethh;
+		u64 mac;
+		u32 mac_l, mac_h;
 
-		wmb();
+		ethh = mtod(mb, struct ether_header *);
+		mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
+		if (mac) {
+			mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
+			mac_l = (u32) (mac & 0xffffffff);
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
+			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
+		}
+	}
 
-		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
-		     desc_size);
+	/* update producer counter */
+	ring->prod += tx_info->nr_txbb;
 
+	if (ring->bf_enabled && bf_size <= MAX_BF &&
+	    (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) {
+
+		/* store doorbell number */
+		*(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
+
+		/* or in producer number for this WQE */
+		opcode |= cpu_to_be32((bf_prod & 0xffff) << 8);
+
+		/*
+		 * Ensure the new descriptor hits memory before
+		 * setting ownership of this descriptor to HW:
+		 */
 		wmb();
-
+		tx_desc->ctrl.owner_opcode = opcode;
+		wmb();
+		mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset,
+		     (volatile unsigned long *) &tx_desc->ctrl, bf_size);
+		wmb();
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
-		/* Ensure new descirptor hits memory
-		* before setting ownership of this descriptor to HW */
+		/*
+		 * Ensure the new descriptor hits memory before
+		 * setting ownership of this descriptor to HW:
+		 */
 		wmb();
-		tx_desc->ctrl.owner_opcode = op_own;
+		tx_desc->ctrl.owner_opcode = opcode;
 		wmb();
-		writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
+		writel(cpu_to_be32(ring->doorbell_qpn),
+		    ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL);
 	}
 
-	return 0;
-
+	return (0);
 tx_drop:
 	*mbp = NULL;
 	m_freem(mb);
-	ring->errors++;
-	return EINVAL;
+	return (err);
 }
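
A minimal sketch of the publish ordering both doorbell paths above rely
on, with hypothetical names (my_desc and my_post are illustrative, not
driver API; u32, wmb() and writel() are the usual kernel primitives):

    struct my_desc {
            u32 payload[15];
            volatile u32 owner_opcode;      /* hardware polls this word */
    };

    static void my_post(struct my_desc *d, u32 opcode, void __iomem *db)
    {
            wmb();                  /* payload visible before ownership */
            d->owner_opcode = opcode;
            wmb();                  /* ownership visible before doorbell */
            writel(1, db);          /* tell the HCA to fetch the WQE */
    }

The BlueFlame branch replaces the final doorbell write with a copy of
the whole descriptor into a write-combining register window, which
saves the hardware a DMA read of the descriptor.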
 
-
 static int
 mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
 {
@@ -922,38 +993,39 @@
 	struct mbuf *next;
 	int enqueued, err = 0;
 
-	ring = &priv->tx_ring[tx_ind];
+	ring = priv->tx_ring[tx_ind];
 	if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING || priv->port_up == 0) {
 		if (m != NULL)
 			err = drbr_enqueue(dev, ring->br, m);
-		return (err);  
+		return (err);
 	}
 
 	enqueued = 0;
-	if (m == NULL) {
-		next = drbr_dequeue(dev, ring->br);
-	} else if (drbr_needs_enqueue(dev, ring->br)) {
-		if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
-			return (err);
-		next = drbr_dequeue(dev, ring->br);
-	} else
-		next = m;
+	if (m != NULL)
+		/*
+		 * If we can't insert the mbuf into the drbr, try to xmit
+		 * anyway; keep the error we got so we can return it after
+		 * the xmit attempt.
+		 */
+		err = drbr_enqueue(dev, ring->br, m);
 
 	/* Process the queue */
-	while (next != NULL) {
-		if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
-			if (next != NULL)
-				err = drbr_enqueue(dev, ring->br, next);
+	while ((next = drbr_peek(dev, ring->br)) != NULL) {
+		if (mlx4_en_xmit(priv, tx_ind, &next) != 0) {
+			if (next == NULL) {
+				drbr_advance(dev, ring->br);
+			} else {
+				drbr_putback(dev, ring->br, next);
+			}
 			break;
 		}
+		drbr_advance(dev, ring->br);
 		enqueued++;
-		drbr_stats_update(dev, next->m_pkthdr.len, next->m_flags);
-		/* Send a copy of the frame to the BPF listener */
-		ETHER_BPF_MTAP(dev, next);
+		dev->if_obytes += next->m_pkthdr.len;
+		if (next->m_flags & M_MCAST)
+			dev->if_omcasts++;
 		if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			break;
-		next = drbr_dequeue(dev, ring->br);
 	}
 
 	if (enqueued > 0)
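
The drain loop above is the standard drbr idiom: peek at the head of
the buf_ring, try to hand the mbuf to the hardware, then either advance
past it or put it back. A condensed sketch, where hw_transmit() stands
in for the driver's enqueue routine (illustrative name only):

    while ((next = drbr_peek(dev, ring->br)) != NULL) {
            if (hw_transmit(&next) != 0) {
                    if (next == NULL)       /* consumed despite the error */
                            drbr_advance(dev, ring->br);
                    else                    /* untouched: keep at the head */
                            drbr_putback(dev, ring->br, next);
                    break;
            }
            drbr_advance(dev, ring->br);    /* success: drop our reference */
    }

Compared with the old dequeue/re-enqueue scheme this never reorders
packets: an mbuf that cannot be transmitted stays at the head of the
ring instead of being appended to the tail.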
@@ -970,13 +1042,14 @@
 	struct net_device *dev;
 	struct mlx4_en_cq *cq;
 	int tx_ind;
-
 	cq = context;
 	dev = cq->dev;
 	priv = dev->if_softc;
 	tx_ind = cq->ring;
-	ring = &priv->tx_ring[tx_ind];
-        if (dev->if_drv_flags & IFF_DRV_RUNNING) {
+	ring = priv->tx_ring[tx_ind];
+
+	if (priv->port_up != 0 &&
+	    (dev->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		mlx4_en_xmit_poll(priv, tx_ind);
 		spin_lock(&ring->tx_lock);
                 if (!drbr_empty(dev, ring->br))
@@ -991,16 +1064,22 @@
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_tx_ring *ring;
 	struct mlx4_en_cq *cq;
-	int i = 0, err = 0;
+	int i, err = 0;
 
-	/* Which queue to use */
-	if ((m->m_flags & (M_FLOWID | M_VLANTAG)) == M_FLOWID)
-		i = m->m_pkthdr.flowid % (MLX4_EN_NUM_HASH_RINGS - 1);
-	else
+	if (priv->port_up == 0) {
+		m_freem(m);
+		return (ENETDOWN);
+	}
+
+	/* Compute which queue to use */
+	if (m->m_flags & M_FLOWID) {
+		i = (m->m_pkthdr.flowid % 128) % priv->tx_ring_num;
+	} else {
 		i = mlx4_en_select_queue(dev, m);
+	}
 
-	ring = &priv->tx_ring[i];
-
+	ring = priv->tx_ring[i];
 	if (spin_trylock(&ring->tx_lock)) {
 		err = mlx4_en_transmit_locked(dev, i, m);
 		spin_unlock(&ring->tx_lock);
@@ -1008,7 +1087,7 @@
 		mlx4_en_xmit_poll(priv, i);
 	} else {
 		err = drbr_enqueue(dev, ring->br, m);
-		cq = &priv->tx_cq[i];
+		cq = priv->tx_cq[i];
 		taskqueue_enqueue(cq->tq, &cq->cq_task);
 	}
 
@@ -1022,10 +1101,14 @@
 mlx4_en_qflush(struct ifnet *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_tx_ring *ring = priv->tx_ring;
+	struct mlx4_en_tx_ring *ring;
 	struct mbuf *m;
 
-	for (int i = 0; i < priv->tx_ring_num; i++, ring++) {
+	if (priv->port_up == 0)
+		return;
+
+	for (int i = 0; i < priv->tx_ring_num; i++) {
+		ring = priv->tx_ring[i];
 		spin_lock(&ring->tx_lock);
 		while ((m = buf_ring_dequeue_sc(ring->br)) != NULL)
 			m_freem(m);

Modified: trunk/sys/ofed/drivers/net/mlx4/eq.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/eq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/eq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -31,8 +31,9 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 
@@ -42,35 +43,15 @@
 #include "fw.h"
 
 enum {
+	MLX4_IRQNAME_SIZE	= 32
+};
+
+enum {
 	MLX4_NUM_ASYNC_EQE	= 0x100,
 	MLX4_NUM_SPARE_EQE	= 0x80,
 	MLX4_EQ_ENTRY_SIZE	= 0x20
 };
 
-/*
- * Must be packed because start is 64 bits but only aligned to 32 bits.
- */
-struct mlx4_eq_context {
-	__be32			flags;
-	u16			reserved1[3];
-	__be16			page_offset;
-	u8			log_eq_size;
-	u8			reserved2[4];
-	u8			eq_period;
-	u8			reserved3;
-	u8			eq_max_count;
-	u8			reserved4[3];
-	u8			intr;
-	u8			log_page_size;
-	u8			reserved5[2];
-	u8			mtt_base_addr_h;
-	__be32			mtt_base_addr_l;
-	u32			reserved6[2];
-	__be32			consumer_index;
-	__be32			producer_index;
-	u32			reserved7[4];
-};
-
 #define MLX4_EQ_STATUS_OK	   ( 0 << 28)
 #define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
 #define MLX4_EQ_OWNER_SW	   ( 0 << 24)
@@ -95,47 +76,23 @@
 			       (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
 			       (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
 			       (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)	    | \
-			       (1ull << MLX4_EVENT_TYPE_CMD))
+			       (1ull << MLX4_EVENT_TYPE_CMD)		    | \
+			       (1ull << MLX4_EVENT_TYPE_OP_REQUIRED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL)       | \
+			       (1ull << MLX4_EVENT_TYPE_FLR_EVENT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_FATAL_WARNING))
 
-struct mlx4_eqe {
-	u8			reserved1;
-	u8			type;
-	u8			reserved2;
-	u8			subtype;
-	union {
-		u32		raw[6];
-		struct {
-			__be32	cqn;
-		} __attribute__((packed)) comp;
-		struct {
-			u16	reserved1;
-			__be16	token;
-			u32	reserved2;
-			u8	reserved3[3];
-			u8	status;
-			__be64	out_param;
-		} __attribute__((packed)) cmd;
-		struct {
-			__be32	qpn;
-		} __attribute__((packed)) qp;
-		struct {
-			__be32	srqn;
-		} __attribute__((packed)) srq;
-		struct {
-			__be32	cqn;
-			u32	reserved1;
-			u8	reserved2[3];
-			u8	syndrome;
-		} __attribute__((packed)) cq_err;
-		struct {
-			u32	reserved1[2];
-			__be32	port;
-		} __attribute__((packed)) port_change;
-	}			event;
-	u8			reserved3[3];
-	u8			owner;
-} __attribute__((packed));
+static u64 get_async_ev_mask(struct mlx4_dev *dev)
+{
+	u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK;
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+		async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT);
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT)
+		async_ev_mask |= (1ull << MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT);
 
+	return async_ev_mask;
+}
+
 static void eq_set_ci(struct mlx4_eq *eq, int req_not)
 {
 	__raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) |
@@ -145,27 +102,359 @@
 	mb();
 }
 
-static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry)
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor)
 {
-	unsigned long off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE;
-	return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+	/* (entry & (eq->nent - 1)) gives us a cyclic array */
+	unsigned long offset = (entry & (eq->nent - 1)) * (MLX4_EQ_ENTRY_SIZE << eqe_factor);
+	/* CX3 is capable of extending the EQE from 32 to 64 bytes.
+	 * When this feature is enabled, the first (in the lower addresses)
+	 * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes
+	 * contain the legacy EQE information.
+	 */
+	return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE;
 }
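
A worked instance of the arithmetic above, assuming PAGE_SIZE is 4096
and nent is at least 256 so no wrap occurs: with eqe_factor = 1 the
stride is 64 bytes and entry 130 starts at byte 8320, so the legacy
32-byte payload is returned from page 2 at byte offset
(8320 + 32) % 4096 = 160; with eqe_factor = 0 the same entry is simply
at 130 * 32 = 4160, i.e. page 1, byte 64.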
 
-static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq)
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor)
 {
-	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index);
+	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor);
 	return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
 }
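
The ownership test above implements a two-pass polarity scheme: with
nent a power of two, (cons_index & nent) flips on every lap around the
ring, and hardware writes the owner bit with alternating polarity. A
minimal sketch of the same predicate, factored out with an illustrative
name:

    /* Non-zero while the entry at cons_index still belongs to HW. */
    static inline int eqe_is_hw_owned(const struct mlx4_eqe *eqe,
                                      u32 cons_index, u32 nent)
    {
            return !!(eqe->owner & 0x80) ^ !!(cons_index & nent);
    }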
 
+static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq)
+{
+	struct mlx4_eqe *eqe =
+		&slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)];
+	return (!!(eqe->owner & 0x80) ^
+		!!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ?
+		eqe : NULL;
+}
+
+void mlx4_gen_slave_eqe(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq;
+	struct mlx4_eqe *eqe;
+	u8 slave;
+	int i;
+
+	for (eqe = next_slave_event_eqe(slave_eq); eqe;
+	      eqe = next_slave_event_eqe(slave_eq)) {
+		slave = eqe->slave_id;
+
+		/* All active slaves need to receive the event */
+		if (slave == ALL_SLAVES) {
+			for (i = 0; i < dev->num_slaves; i++) {
+				if (mlx4_GEN_EQE(dev, i, eqe))
+					mlx4_warn(dev, "Failed to generate "
+						  "event for slave %d\n", i);
+			}
+		} else {
+			if (mlx4_GEN_EQE(dev, slave, eqe))
+				mlx4_warn(dev, "Failed to generate event "
+					       "for slave %d\n", slave);
+		}
+		++slave_eq->cons;
+	}
+}
+
+
+static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq;
+	struct mlx4_eqe *s_eqe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slave_eq->event_lock, flags);
+	s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)];
+	if ((!!(s_eqe->owner & 0x80)) ^
+	    (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) {
+		mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. "
+			  "No free EQE on slave events queue\n", slave);
+		spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+		return;
+	}
+
+	memcpy(s_eqe, eqe, dev->caps.eqe_size - 1);
+	s_eqe->slave_id = slave;
+	/* ensure all information is written before setting the ownership bit */
+	wmb();
+	s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 0x0 : 0x80;
+	++slave_eq->prod;
+
+	queue_work(priv->mfunc.master.comm_wq,
+		   &priv->mfunc.master.slave_event_work);
+	spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+}
+
+static void mlx4_slave_event(struct mlx4_dev *dev, int slave,
+			     struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave < 0 || slave >= dev->num_slaves ||
+	    slave == dev->caps.function)
+		return;
+
+	if (!priv->mfunc.master.slave_state[slave].active)
+		return;
+
+	slave_event(dev, slave, eqe);
+}
+
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave];
+
+	if (!s_slave->active)
+		return 0;
+
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
+	eqe.event.port_mgmt_change.port = port;
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_pkey_eqe);
+
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	/* don't send if we don't have that slave */
+	if (dev->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
+	eqe.event.port_mgmt_change.port = port;
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_guid_change_eqe);
+
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
+				   u8 port_subtype_change)
+{
+	struct mlx4_eqe eqe;
+
+	/* don't send if we don't have that slave */
+	if (dev->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
+	eqe.subtype = port_subtype_change;
+	eqe.event.port_change.port = cpu_to_be32(port << 28);
+
+	mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__,
+		 port_subtype_change, slave, port);
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe);
+
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+	if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return SLAVE_PORT_DOWN;
+	}
+	return s_state[slave].port_state[port];
+}
+EXPORT_SYMBOL(mlx4_get_slave_port_state);
+
+static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port,
+				     enum slave_port_state state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+
+	if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return -1;
+	}
+	s_state[slave].port_state[port] = state;
+
+	return 0;
+}
+
+static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event)
+{
+	int i;
+	enum slave_port_gen_event gen_event;
+
+	for (i = 0; i < dev->num_slaves; i++)
+		set_and_calc_slave_port_state(dev, i, port, event, &gen_event);
+}
+/**************************************************************************
+	The function gets as input the new event for that port, and
+	changes the slave's port state according to the previous state.
+	The events are:
+		MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+		MLX4_PORT_STATE_DEV_EVENT_PORT_UP,
+		MLX4_PORT_STATE_IB_EVENT_GID_VALID,
+		MLX4_PORT_STATE_IB_EVENT_GID_INVALID
+***************************************************************************/
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave,
+				  u8 port, int event,
+				  enum slave_port_gen_event *gen_event)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *ctx = NULL;
+	unsigned long flags;
+	int ret = -1;
+	enum slave_port_state cur_state =
+		mlx4_get_slave_port_state(dev, slave, port);
+
+	*gen_event = SLAVE_PORT_GEN_EVENT_NONE;
+
+	if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return ret;
+	}
+
+	ctx = &priv->mfunc.master.slave_state[slave];
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	switch (cur_state) {
+	case SLAVE_PORT_DOWN:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+		break;
+	case SLAVE_PENDING_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+		else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_UP;
+		}
+		break;
+	case SLAVE_PORT_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		} else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID ==
+				event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		}
+		break;
+	default:
+		pr_err("%s: BUG!!! UNKNOWN state: "
+		       "slave:%d, port:%d\n", __func__, slave, port);
+		goto out;
+	}
+	ret = mlx4_get_slave_port_state(dev, slave, port);
+
+out:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(set_and_calc_slave_port_state);
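
A hedged usage sketch of the state machine above: a slave port only
generates SLAVE_PORT_GEN_EVENT_UP after both the device-level port-up
event and a GID-valid event have been seen (error handling omitted):

    enum slave_port_gen_event ev;

    /* device reports link up: SLAVE_PORT_DOWN -> SLAVE_PENDING_UP */
    set_and_calc_slave_port_state(dev, slave, port,
                                  MLX4_PORT_STATE_DEV_EVENT_PORT_UP, &ev);
    /* ev is still SLAVE_PORT_GEN_EVENT_NONE here */

    /* alias GUID becomes valid: SLAVE_PENDING_UP -> SLAVE_PORT_UP */
    set_and_calc_slave_port_state(dev, slave, port,
                                  MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
                                  &ev);
    /* ev == SLAVE_PORT_GEN_EVENT_UP: safe to notify the slave */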
+
+int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr, u16 sm_lid, u8 sm_sl)
+{
+	struct mlx4_eqe eqe;
+
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO;
+	eqe.event.port_mgmt_change.port = port;
+	eqe.event.port_mgmt_change.params.port_info.changed_attr =
+		cpu_to_be32((u32) attr);
+	if (attr & MSTR_SM_CHANGE_MASK) {
+		eqe.event.port_mgmt_change.params.port_info.mstr_sm_lid =
+			cpu_to_be16(sm_lid);
+		eqe.event.port_mgmt_change.params.port_info.mstr_sm_sl =
+			sm_sl;
+	}
+
+	slave_event(dev, ALL_SLAVES, &eqe);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev);
+
+void mlx4_master_handle_slave_flr(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_flr_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int i;
+	int err;
+	unsigned long flags;
+
+	mlx4_dbg(dev, "mlx4_handle_slave_flr\n");
+
+	for (i = 0; i < dev->num_slaves; i++) {
+		if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) {
+			mlx4_dbg(dev, "mlx4_handle_slave_flr: "
+				 "clean slave: %d\n", i);
+
+			mlx4_delete_all_resources_for_slave(dev, i);
+			/* return the slave to running mode */
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			slave_state[i].last_cmd = MLX4_COMM_CMD_RESET;
+			slave_state[i].is_slave_going_down = 0;
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			/* notify the FW */
+			err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (err)
+				mlx4_warn(dev, "Failed to notify FW on "
+					  "FLR done (slave:%d)\n", i);
+		}
+	}
+}
+
 static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 {
+	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_eqe *eqe;
 	int cqn;
 	int eqes_found = 0;
 	int set_ci = 0;
 	int port;
+	int slave = 0;
+	int ret;
+	u32 flr_slave;
+	u8 update_slave_state;
+	int i;
+	enum slave_port_gen_event gen_event;
+	unsigned long flags;
+	struct mlx4_vport_state *s_info;
 
-	while ((eqe = next_eqe_sw(eq))) {
+	while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) {
 		/*
 		 * Make sure we read EQ entry contents after we've
 		 * checked the ownership bit.
@@ -186,14 +475,67 @@
 		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
 		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
-			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
-				      eqe->type);
+			mlx4_dbg(dev, "event %d arrived\n", eqe->type);
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the QP */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_QP,
+						be32_to_cpu(eqe->event.qp.qpn)
+						& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "QP event %02x(%02x) on "
+						 "EQ %d at index %u: could "
+						 "not get slave id (%d)\n",
+						 eqe->type, eqe->subtype,
+						 eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+
+			}
+			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) &
+				      0xffffff, eqe->type);
 			break;
 
 		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+			mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n",
+				 __func__);
+		/* fall through */
 		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
-			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
-				      eqe->type);
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the SRQ */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_SRQ,
+						be32_to_cpu(eqe->event.srq.srqn)
+						& 0xffffff,
+						&slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_warn(dev, "SRQ event %02x(%02x) "
+						  "on EQ %d at index %u: could"
+						  " not get slave id (%d)\n",
+						  eqe->type, eqe->subtype,
+						  eq->eqn, eq->cons_index, ret);
+					break;
+				}
+				mlx4_dbg(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n",
+					 __func__, slave,
+					 be32_to_cpu(eqe->event.srq.srqn),
+					 eqe->type, eqe->subtype);
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_dbg(dev, "%s: sending event %02x(%02x) to slave:%d\n",
+						 __func__, eqe->type,
+						 eqe->subtype, slave);
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) &
+				       0xffffff, eqe->type);
 			break;
 
 		case MLX4_EVENT_TYPE_CMD:
@@ -209,10 +551,50 @@
 				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
 						    port);
 				mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+				if (!mlx4_is_master(dev))
+					break;
+				for (i = 0; i < dev->num_slaves; i++) {
+					if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) {
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN"
+							 " to slave: %d, port:%d\n",
+							 __func__, i, port);
+						s_info = &priv->mfunc.master.vf_oper[i].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state)
+							mlx4_slave_event(dev, i, eqe);
+					} else {  /* IB port */
+						set_and_calc_slave_port_state(dev, i, port,
+									      MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+									      &gen_event);
+						/* we can be in a pending state; in that case do not send the port_down event */
+						if (SLAVE_PORT_GEN_EVENT_DOWN ==  gen_event) {
+							if (i == mlx4_master_func_num(dev))
+								continue;
+							mlx4_slave_event(dev, i, eqe);
+						}
+					}
+				}
 			} else {
-				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP,
-						    port);
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port);
+
 				mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+
+				if (!mlx4_is_master(dev))
+					break;
+				if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+					for (i = 0; i < dev->num_slaves; i++) {
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						s_info = &priv->mfunc.master.vf_oper[i].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state)
+							mlx4_slave_event(dev, i, eqe);
+					}
+				else /* IB port */
+					/* port-up event will be sent to a slave when the
+					 * slave's alias-guid is set. This is done in alias_GUID.c
+					 */
+					set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP);
 			}
 			break;
 
@@ -221,7 +603,28 @@
 				  eqe->event.cq_err.syndrome == 1 ?
 				  "overrun" : "access violation",
 				  be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
-			mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+			if (mlx4_is_master(dev)) {
+				ret = mlx4_get_slave_from_resource_id(dev,
+					RES_CQ,
+					be32_to_cpu(eqe->event.cq_err.cqn)
+					& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "CQ event %02x(%02x) on "
+						 "EQ %d at index %u: could "
+						  "not get slave id (%d)\n",
+						  eqe->type, eqe->subtype,
+						  eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_cq_event(dev,
+				      be32_to_cpu(eqe->event.cq_err.cqn)
+				      & 0xffffff,
 				      eqe->type);
 			break;
 
@@ -229,11 +632,127 @@
 			mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
 			break;
 
+		case MLX4_EVENT_TYPE_OP_REQUIRED:
+			atomic_inc(&priv->opreq_count);
+			/* FW commands can't be executed from interrupt
+			 * context; run them from a deferred task */
+			queue_work(mlx4_wq, &priv->opreq_task);
+			break;
+
+		case MLX4_EVENT_TYPE_COMM_CHANNEL:
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Received comm channel event "
+					       "for non master device\n");
+				break;
+			}
+
+			memcpy(&priv->mfunc.master.comm_arm_bit_vector,
+			       eqe->event.comm_channel_arm.bit_vec,
+			       sizeof eqe->event.comm_channel_arm.bit_vec);
+
+			if (!queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.comm_work))
+				mlx4_warn(dev, "Failed to queue comm channel work\n");
+
+			if (!queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.arm_comm_work))
+				mlx4_warn(dev, "Failed to queue arm comm channel work\n");
+			break;
+
+		case MLX4_EVENT_TYPE_FLR_EVENT:
+			flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id);
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Non-master function received"
+					       "FLR event\n");
+				break;
+			}
+
+			mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave);
+
+			if (flr_slave >= dev->num_slaves) {
+				mlx4_warn(dev,
+					  "Got FLR for unknown function: %d\n",
+					  flr_slave);
+				update_slave_state = 0;
+			} else
+				update_slave_state = 1;
+
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			if (update_slave_state) {
+				priv->mfunc.master.slave_state[flr_slave].active = false;
+				priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR;
+				priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1;
+			}
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.slave_flr_event_work);
+			break;
+
+		case MLX4_EVENT_TYPE_FATAL_WARNING:
+			if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) {
+				if (mlx4_is_master(dev))
+					for (i = 0; i < dev->num_slaves; i++) {
+						mlx4_dbg(dev, "%s: Sending "
+							"MLX4_FATAL_WARNING_SUBTYPE_WARMING"
+							" to slave: %d\n", __func__, i);
+						if (i == dev->caps.function)
+							continue;
+						mlx4_slave_event(dev, i, eqe);
+					}
+				mlx4_err(dev, "Temperature Threshold was reached! "
+					"Threshold: %d celsius degrees; "
+					"Current Temperature: %d\n",
+					be16_to_cpu(eqe->event.warming.warning_threshold),
+					be16_to_cpu(eqe->event.warming.current_temperature));
+			} else
+				mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), "
+					  "subtype %02x on EQ %d at index %u. owner=%x, "
+					  "nent=0x%x, slave=%x, ownership=%s\n",
+					  eqe->type, eqe->subtype, eq->eqn,
+					  eq->cons_index, eqe->owner, eq->nent,
+					  eqe->slave_id,
+					  !!(eqe->owner & 0x80) ^
+					  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT:
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE,
+					    (unsigned long) eqe);
+			break;
+
+		case MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT:
+			switch (eqe->subtype) {
+			case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE:
+				mlx4_warn(dev, "Bad cable detected on port %u\n",
+					  eqe->event.bad_cable.port);
+				break;
+			case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE:
+				mlx4_warn(dev, "Unsupported cable detected\n");
+				break;
+			default:
+				mlx4_dbg(dev, "Unhandled recoverable error event "
+					 "detected: %02x(%02x) on EQ %d at index %u. "
+					 "owner=%x, nent=0x%x, ownership=%s\n",
+					 eqe->type, eqe->subtype, eq->eqn,
+					 eq->cons_index, eqe->owner, eq->nent,
+					 !!(eqe->owner & 0x80) ^
+					 !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+				break;
+			}
+			break;
+
 		case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
 		case MLX4_EVENT_TYPE_ECC_DETECT:
 		default:
-			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n",
-				  eqe->type, eqe->subtype, eq->eqn, eq->cons_index);
+			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at "
+				  "index %u. owner=%x, nent=0x%x, slave=%x, "
+				  "ownership=%s\n",
+				  eqe->type, eqe->subtype, eq->eqn,
+				  eq->cons_index, eqe->owner, eq->nent,
+				  eqe->slave_id,
+				  !!(eqe->owner & 0x80) ^
+				  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
 			break;
 		};
 
@@ -285,25 +804,55 @@
 	return IRQ_HANDLED;
 }
 
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq =
+		priv->mfunc.master.slave_state[slave].event_eq;
+	u32 in_modifier = vhcr->in_modifier;
+	u32 eqn = in_modifier & 0x3FF;
+	u64 in_param =  vhcr->in_param;
+	int err = 0;
+	int i;
+
+	if (slave == dev->caps.function)
+		err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn,
+			       0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			       MLX4_CMD_NATIVE);
+	if (!err)
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i)
+			if (in_param & (1LL << i))
+				event_eq[i].eqn = in_modifier >> 31 ? -1 : eqn;
+
+	return err;
+}
+
 static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
 			int eq_num)
 {
 	return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num,
-			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B);
+			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			 int eq_num)
 {
-	return mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ,
-			MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd(dev, mailbox->dma, eq_num, 0,
+			MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			 int eq_num)
 {
-	return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ,
-			    MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num,
+			    0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_num_eq_uar(struct mlx4_dev *dev)
@@ -313,8 +862,8 @@
 	 * we need to map, take the difference of highest index and
 	 * the lowest index we'll use and add 1.
 	 */
-	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 -
-		dev->caps.reserved_eqs / 4 + 1;
+	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs +
+		 dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1;
 }
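
For example, with num_comp_vectors = 4, comp_pool = 8 and
reserved_eqs = 8 the expression evaluates to
(4 + 1 + 8 + 8)/4 - 8/4 + 1 = 5 - 2 + 1 = 4 UAR pages, each page
carrying the doorbells of four EQs (see mlx4_get_eq_uar() below).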
 
 static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
@@ -339,6 +888,18 @@
 	return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
 }
 
+static void mlx4_unmap_uar(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		if (priv->eq_table.uar_map[i]) {
+			iounmap(priv->eq_table.uar_map[i]);
+			priv->eq_table.uar_map[i] = NULL;
+		}
+}
+
 static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
 			  u8 intr, struct mlx4_eq *eq)
 {
@@ -354,7 +915,8 @@
 
 	eq->dev   = dev;
 	eq->nent  = roundup_pow_of_two(max(nent, 2));
-	npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE;
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */
+	npages = PAGE_ALIGN(eq->nent * (MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor)) / PAGE_SIZE;
 
 	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
 				GFP_KERNEL);
@@ -431,7 +993,7 @@
 	mlx4_mtt_cleanup(dev, &eq->mtt);
 
 err_out_free_eq:
-	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
 
 err_out_free_pages:
 	for (i = 0; i < npages; ++i)
@@ -456,8 +1018,9 @@
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cmd_mailbox *mailbox;
 	int err;
-	int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE;
 	int i;
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */
+	int npages = PAGE_ALIGN((MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor) * eq->nent) / PAGE_SIZE;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -471,21 +1034,21 @@
 		mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
 		for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
 			if (i % 4 == 0)
-				printk("[%02x] ", i * 4);
-			printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+				pr_cont("[%02x] ", i * 4);
+			pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4));
 			if ((i + 1) % 4 == 0)
-				printk("\n");
+				pr_cont("\n");
 		}
 	}
 
 	mlx4_mtt_cleanup(dev, &eq->mtt);
 	for (i = 0; i < npages; ++i)
-		pci_free_consistent(dev->pdev, PAGE_SIZE,
+		dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
 				    eq->page_list[i].buf,
 				    eq->page_list[i].map);
 
 	kfree(eq->page_list);
-	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
 	mlx4_free_cmd_mailbox(dev, mailbox);
 }
 
@@ -492,10 +1055,12 @@
 static void mlx4_free_irqs(struct mlx4_dev *dev)
 {
 	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
-	int i;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int	i, vec;
 
 	if (eq_table->have_irq)
 		free_irq(dev->pdev->irq, dev);
+
 	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		if (eq_table->eq[i].have_irq) {
 			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
@@ -502,6 +1067,20 @@
 			eq_table->eq[i].have_irq = 0;
 		}
 
+	for (i = 0; i < dev->caps.comp_pool; i++) {
+		/*
+		 * Free the assigned IRQs.  All bits should be 0,
+		 * but we need to validate.
+		 */
+		if (priv->msix_ctl.pool_bm & 1ULL << i) {
+			/* no need for locking */
+			vec = dev->caps.num_comp_vectors + 1 + i;
+			free_irq(priv->eq_table.eq[vec].irq,
+				 &priv->eq_table.eq[vec]);
+		}
+	}
+
 	kfree(eq_table->irq_names);
 }
 
@@ -549,8 +1128,9 @@
 	int err;
 	int i;
 
-	priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map,
-					 mlx4_num_eq_uar(dev), GFP_KERNEL);
+	priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev),
+					 sizeof *priv->eq_table.uar_map,
+					 GFP_KERNEL);
 	if (!priv->eq_table.uar_map) {
 		err = -ENOMEM;
 		goto err_out_free;
@@ -564,23 +1144,30 @@
 	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
 		priv->eq_table.uar_map[i] = NULL;
 
-	err = mlx4_map_clr_int(dev);
-	if (err)
-		goto err_out_bitmap;
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_map_clr_int(dev);
+		if (err)
+			goto err_out_bitmap;
 
-	priv->eq_table.clr_mask =
-		swab32(1 << (priv->eq_table.inta_pin & 31));
-	priv->eq_table.clr_int  = priv->clr_base +
-		(priv->eq_table.inta_pin < 32 ? 4 : 0);
+		priv->eq_table.clr_mask =
+			swab32(1 << (priv->eq_table.inta_pin & 31));
+		priv->eq_table.clr_int  = priv->clr_base +
+			(priv->eq_table.inta_pin < 32 ? 4 : 0);
+	}
 
-	priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL);
+	priv->eq_table.irq_names =
+		kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 +
+					     dev->caps.comp_pool),
+			GFP_KERNEL);
 	if (!priv->eq_table.irq_names) {
 		err = -ENOMEM;
-		goto err_out_bitmap;
+		goto err_out_clr_int;
 	}
 
 	for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
-		err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+		err = mlx4_create_eq(dev, dev->caps.num_cqs -
+					  dev->caps.reserved_cqs +
+					  MLX4_NUM_SPARE_EQE,
 				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
 				     &priv->eq_table.eq[i]);
 		if (err) {
@@ -595,18 +1182,42 @@
 	if (err)
 		goto err_out_comp;
 
+	/* if the additional completion vector pool size is 0, this loop will not run */
+	for (i = dev->caps.num_comp_vectors + 1;
+	      i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) {
+
+		err = mlx4_create_eq(dev, dev->caps.num_cqs -
+					  dev->caps.reserved_cqs +
+					  MLX4_NUM_SPARE_EQE,
+				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+				     &priv->eq_table.eq[i]);
+		if (err) {
+			--i;
+			goto err_out_unmap;
+		}
+	}
+
 	if (dev->flags & MLX4_FLAG_MSI_X) {
-		static const char async_eq_name[] = DRV_NAME "(async)";
 		const char *eq_name;
 
 		for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
 			if (i < dev->caps.num_comp_vectors) {
-				snprintf(priv->eq_table.irq_names + i * 16, 16,
-					 "eth-mlx4-%d", i);
-				eq_name = priv->eq_table.irq_names + i * 16;
-			} else
-				eq_name = async_eq_name;
+				snprintf(priv->eq_table.irq_names +
+					 i * MLX4_IRQNAME_SIZE,
+					 MLX4_IRQNAME_SIZE,
+					 "mlx4-comp-%d at pci:%s", i,
+					 pci_name(dev->pdev));
+			} else {
+				snprintf(priv->eq_table.irq_names +
+					 i * MLX4_IRQNAME_SIZE,
+					 MLX4_IRQNAME_SIZE,
+					 "mlx4-async at pci:%s",
+					 pci_name(dev->pdev));
+			}
 
+			eq_name = priv->eq_table.irq_names +
+				  i * MLX4_IRQNAME_SIZE;
 			err = request_irq(priv->eq_table.eq[i].irq,
 					  mlx4_msi_x_interrupt, 0, eq_name,
 					  priv->eq_table.eq + i);
@@ -616,8 +1227,12 @@
 			priv->eq_table.eq[i].have_irq = 1;
 		}
 	} else {
+		snprintf(priv->eq_table.irq_names,
+			 MLX4_IRQNAME_SIZE,
+			 DRV_NAME "@pci:%s",
+			 pci_name(dev->pdev));
 		err = request_irq(dev->pdev->irq, mlx4_interrupt,
-				  IRQF_SHARED, DRV_NAME, dev);
+				  IRQF_SHARED, priv->eq_table.irq_names, dev);
 		if (err)
 			goto err_out_async;
 
@@ -624,7 +1239,7 @@
 		priv->eq_table.have_irq = 1;
 	}
 
-	err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+	err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
 			  priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
 	if (err)
 		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
@@ -646,10 +1261,14 @@
 		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
 		--i;
 	}
-	mlx4_unmap_clr_int(dev);
 	mlx4_free_irqs(dev);
 
+err_out_clr_int:
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
+
 err_out_bitmap:
+	mlx4_unmap_uar(dev);
 	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
 
 err_out_free:
@@ -663,20 +1282,18 @@
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int i;
 
-	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1,
 		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
 
 	mlx4_free_irqs(dev);
 
-	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i)
 		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
 
-	mlx4_unmap_clr_int(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
 
-	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
-		if (priv->eq_table.uar_map[i])
-			iounmap(priv->eq_table.uar_map[i]);
-
+	mlx4_unmap_uar(dev);
 	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
 
 	kfree(priv->eq_table.uar_map);
@@ -694,7 +1311,7 @@
 
 	err = mlx4_NOP(dev);
 	/* When not in MSI_X, there is only one irq to check */
-	if (!(dev->flags & MLX4_FLAG_MSI_X))
+	if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev))
 		return err;
 
 	/* A loop over all completion vectors, for each vector we will check
@@ -706,7 +1323,7 @@
 		mlx4_cmd_use_polling(dev);
 
 		/* Map the new eq to handle all asynchronous events */
-		err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+		err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
 				  priv->eq_table.eq[i].eqn);
 		if (err) {
 			mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
@@ -720,8 +1337,70 @@
 	}
 
 	/* Return to default */
-	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
 		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
 	return err;
 }
 EXPORT_SYMBOL(mlx4_test_interrupts);
+
+int mlx4_assign_eq(struct mlx4_dev *dev, char *name, int *vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int vec = 0, err = 0, i;
+
+	mutex_lock(&priv->msix_ctl.pool_lock);
+	for (i = 0; !vec && i < dev->caps.comp_pool; i++) {
+		if (~priv->msix_ctl.pool_bm & 1ULL << i) {
+			priv->msix_ctl.pool_bm |= 1ULL << i;
+			vec = dev->caps.num_comp_vectors + 1 + i;
+			snprintf(priv->eq_table.irq_names +
+					vec * MLX4_IRQNAME_SIZE,
+					MLX4_IRQNAME_SIZE, "%s", name);
+			err = request_irq(priv->eq_table.eq[vec].irq,
+					  mlx4_msi_x_interrupt, 0,
+					  &priv->eq_table.irq_names[vec<<5],
+					  priv->eq_table.eq + vec);
+			if (err) {
+				/* zero out the bit by flipping it */
+				priv->msix_ctl.pool_bm ^= 1ULL << i;
+				vec = 0;
+				continue;
+				/* we don't want to break here */
+			}
+			eq_set_ci(&priv->eq_table.eq[vec], 1);
+		}
+	}
+	mutex_unlock(&priv->msix_ctl.pool_lock);
+
+	if (vec) {
+		*vector = vec;
+	} else {
+		*vector = 0;
+		err = (i == dev->caps.comp_pool) ? -ENOSPC : err;
+	}
+	return err;
+}
+EXPORT_SYMBOL(mlx4_assign_eq);
+
+void mlx4_release_eq(struct mlx4_dev *dev, int vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/* bitmap index */
+	int i = vec - dev->caps.num_comp_vectors - 1;
+
+	if (likely(i >= 0)) {
+		/*
+		 * Sanity check, making sure we're not trying to free IRQs
+		 * belonging to a legacy EQ.
+		 */
+		mutex_lock(&priv->msix_ctl.pool_lock);
+		if (priv->msix_ctl.pool_bm & 1ULL << i) {
+			free_irq(priv->eq_table.eq[vec].irq,
+				 &priv->eq_table.eq[vec]);
+			priv->msix_ctl.pool_bm &= ~(1ULL << i);
+		}
+		mutex_unlock(&priv->msix_ctl.pool_lock);
+	}
+}
+EXPORT_SYMBOL(mlx4_release_eq);
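
A hedged usage sketch of the vector pool interface above (the caller
and name string are illustrative):

    int vector;

    /* try to grab a dedicated completion vector from the pool */
    if (mlx4_assign_eq(dev, "my-ulp-comp", &vector) == 0) {
            /* ... attach CQs to 'vector' ... */
            mlx4_release_eq(dev, vector);   /* return it to the pool */
    } else {
            /* pool exhausted (-ENOSPC) or request_irq() failed;
             * fall back to the legacy shared vectors */
    }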
+

Modified: trunk/sys/ofed/drivers/net/mlx4/fw.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/fw.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/fw.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -32,7 +32,10 @@
  * SOFTWARE.
  */
 
+#include <linux/etherdevice.h>
 #include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/cache.h>
 
 #include "fw.h"
 #include "icm.h"
@@ -46,14 +49,10 @@
 extern void __buggy_use_of_MLX4_GET(void);
 extern void __buggy_use_of_MLX4_PUT(void);
 
-static int enable_qos;
+static bool enable_qos;
 module_param(enable_qos, bool, 0444);
 MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)");
 
-static int mlx4_pre_t11_mode = 0;
-module_param_named(enable_pre_t11_mode, mlx4_pre_t11_mode, int, 0644);
-MODULE_PARM_DESC(enable_pre_t11_mode, "For FCoXX, enable pre-t11 mode if non-zero (default: 0)");
-
 #define MLX4_GET(dest, source, offset)				      \
 	do {							      \
 		void *__p = (char *) (source) + (offset);	      \
@@ -93,6 +92,7 @@
 		[ 9] = "Q_Key violation counter",
 		[10] = "VMM",
 		[12] = "DPDP",
+		[15] = "Big LSO headers",
 		[16] = "MW support",
 		[17] = "APM support",
 		[18] = "Atomic ops support",
@@ -102,8 +102,18 @@
 		[24] = "Demand paging support",
 		[25] = "Router support",
 		[30] = "IBoE support",
-		[48] = "Basic counters support",
-		[49] = "Extended counters support",
+		[32] = "Unicast loopback support",
+		[34] = "FCS header control",
+		[38] = "Wake On LAN support",
+		[40] = "UDP RSS support",
+		[41] = "Unicast VEP steering support",
+		[42] = "Multicast VEP steering support",
+		[44] = "Cross-channel (sync_qp) operations support",
+		[48] = "Counters support",
+		[59] = "Port management change event support",
+		[60] = "eSwitch support",
+		[61] = "64 byte EQE support",
+		[62] = "64 byte CQE support",
 	};
 	int i;
 
@@ -113,6 +123,32 @@
 			mlx4_dbg(dev, "    %s\n", fname[i]);
 }
 
+static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
+{
+	static const char * const fname[] = {
+		[0] = "RSS support",
+		[1] = "RSS Toeplitz Hash Function support",
+		[2] = "RSS XOR Hash Function support",
+		[3] = "Device manage flow steering support",
+		[4] = "FSM (MAC unti-spoofing) support",
+		[5] = "VST (control vlan insertion/stripping) support",
+		[6] = "Dynamic QP updates support",
+		[7] = "Loopback source checks support",
+		[8] = "Device managed flow steering IPoIB support",
+		[9] = "ETS configuration support",
+		[10] = "ETH backplane autoneg report",
+		[11] = "Ethernet Flow control statistics support",
+		[12] = "Recoverable error events support",
+		[13] = "Time stamping support",
+		[14] = "Report driver version to FW support"
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
+		if (fname[i] && (flags & (1LL << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
 int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
 {
 	struct mlx4_cmd_mailbox *mailbox;
@@ -135,25 +171,317 @@
 	MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
 
 	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
-			MLX4_CMD_TIME_CLASS_A);
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
 
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u8	field, port;
+	u32	size;
+	int	err = 0;
+
+#define QUERY_FUNC_CAP_FLAGS_OFFSET		0x0
+#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET		0x1
+#define QUERY_FUNC_CAP_PF_BHVR_OFFSET		0x4
+#define QUERY_FUNC_CAP_FMR_OFFSET		0x8
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP	0x10
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP	0x14
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP	0x18
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP	0x20
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP	0x24
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP	0x28
+#define QUERY_FUNC_CAP_MAX_EQ_OFFSET		0x2c
+#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET	0x30
+
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET		0x50
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET		0x54
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET		0x58
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET		0x60
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET		0x64
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET		0x68
+
+#define QUERY_FUNC_CAP_FMR_FLAG			0x80
+#define QUERY_FUNC_CAP_FLAG_RDMA		0x40
+#define QUERY_FUNC_CAP_FLAG_ETH			0x80
+#define QUERY_FUNC_CAP_FLAG_QUOTAS		0x10
+
+/* when opcode modifier = 1 */
+#define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
+#define QUERY_FUNC_CAP_FLAGS0_OFFSET		0x8
+#define QUERY_FUNC_CAP_FLAGS1_OFFSET		0xc
+#define QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET	0xd
+
+#define QUERY_FUNC_CAP_QP0_TUNNEL		0x10
+#define QUERY_FUNC_CAP_QP0_PROXY		0x14
+#define QUERY_FUNC_CAP_QP1_TUNNEL		0x18
+#define QUERY_FUNC_CAP_QP1_PROXY		0x1c
+
+#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC	0x40
+#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN	0x80
+#define QUERY_FUNC_CAP_PROPS_DEF_COUNTER	0x20
+
+#define QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID 0x80
+
+	if (vhcr->op_modifier == 1) {
+		port = vhcr->in_modifier; /* phys-port = logical-port */
+		MLX4_PUT(outbox->buf, port, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+
+		field = 0;
+		/* ensure that phy_wqe_gid bit is not set */
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+
+		/* ensure force vlan and force mac bits are not set
+		 * and that default counter bit is set
+		 */
+		field = QUERY_FUNC_CAP_PROPS_DEF_COUNTER; /* def counter */
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+
+		/* There is always a valid default counter, either a legal one or the sink counter */
+		field = mlx4_get_default_counter_index(dev, slave, vhcr->in_modifier);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET);
+
+		/* size is now the QP number */
+		size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL);
+
+		size += 2;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL);
+
+		size = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY);
+
+		size += 2;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY);
+
+	} else if (vhcr->op_modifier == 0) {
+		/* enable rdma and ethernet interfaces, and new quota locations */
+		field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA |
+			 QUERY_FUNC_CAP_FLAG_QUOTAS);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET);
+
+		field = dev->caps.num_ports;
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+
+		size = dev->caps.function_caps; /* set PF behaviours */
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+
+		field = 0; /* protected FMR support not available as yet */
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+		size = dev->caps.num_qps;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+		size = dev->caps.num_srqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+		size = dev->caps.num_cqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+
+		size = dev->caps.num_eqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+
+		size = dev->caps.reserved_eqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+		size = dev->caps.num_mpts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+		size = dev->caps.num_mtts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+
+		size = dev->caps.num_mgms + dev->caps.num_amgms;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+
+	} else
+		err = -EINVAL;
+
+	return err;
+}
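
The wrapper above and the native query below communicate through fixed
byte offsets in the command mailbox; the QP quota field shows the
pairing:

    /* PF side (wrapper): publish the slave's quota at a fixed offset */
    size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave];
    MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);

    /* VF side (mlx4_QUERY_FUNC_CAP): read the same offset back; only
     * the low 24 bits carry the value */
    MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
    func_cap->qp_quota = size & 0xFFFFFF;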
+
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port,
+			struct mlx4_func_cap *func_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u8			field, op_modifier;
+	u32			size;
+	int			err = 0, quotas = 0;
+
+	op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier,
+			   MLX4_CMD_QUERY_FUNC_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	if (!op_modifier) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET);
+		if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) {
+			mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+		func_cap->flags = field;
+		quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS);
+
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+		func_cap->num_ports = field;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+		func_cap->pf_context_behaviour = size;
+
+		if (quotas) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+
+		} else {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+		}
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+		func_cap->max_eq = size & 0xFFFFFF;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		func_cap->reserved_eq = size & 0xFFFFFF;
+
+		goto out;
+	}
+
+	/* logical port query */
+	if (gen_or_port > dev->caps.num_ports) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+		if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN) {
+			mlx4_err(dev, "VLAN is enforced on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+
+		if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC) {
+			mlx4_err(dev, "Force mac is enabled on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	} else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+		if (field & QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID) {
+			mlx4_err(dev, "phy_wqe_gid is "
+				 "enforced on this ib port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	}
+
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+	func_cap->physical_port = field;
+	if (func_cap->physical_port != gen_or_port) {
+		err = -ENOSYS;
+		goto out;
+	}
+
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+	if (field & QUERY_FUNC_CAP_PROPS_DEF_COUNTER) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET);
+		func_cap->def_counter_index = field;
+	} else {
+		func_cap->def_counter_index = MLX4_SINK_COUNTER_INDEX;
+	}
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL);
+	func_cap->qp0_tunnel_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY);
+	func_cap->qp0_proxy_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL);
+	func_cap->qp1_tunnel_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY);
+	func_cap->qp1_proxy_qpn = size & 0xFFFFFF;
+
+	/* All other resources are allocated by the master, but we still report
+	 * 'num' and 'reserved' capabilities as follows:
+	 * - num remains the maximum resource index
+	 * - 'num - reserved' is the total available objects of a resource, but
+	 *   resource indices may be less than 'reserved'
+	 * TODO: set per-resource quotas */
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	struct mlx4_cmd_mailbox *mailbox;
 	u32 *outbox;
 	u8 field;
-	u32 field32;
+	u32 field32, flags, ext_flags;
 	u16 size;
 	u16 stat_rate;
 	int err;
 	int i;
-	u32 in_modifier;
-	u64 out_param;
-	u32 tmp1, tmp2;
 
 #define QUERY_DEV_CAP_OUT_SIZE		       0x100
 #define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET		0x10
@@ -178,8 +506,8 @@
 #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
 #define QUERY_DEV_CAP_MAX_GSO_OFFSET		0x2d
+#define QUERY_DEV_CAP_RSS_OFFSET		0x2e
 #define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
-#define QUERY_DEV_CAP_STAT_CFG_INL_OFFSET	0x31
 #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
@@ -187,10 +515,10 @@
 #define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET		0x38
 #define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
 #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
+#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET	0x3e
 #define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
 #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET		0x40
-#define QUERY_DEV_CAP_UDP_RSS_OFFSET		0x42
-#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET	0x43
+#define QUERY_DEV_CAP_SYNC_QP_OFFSET		0x42
 #define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
 #define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
@@ -210,6 +538,13 @@
 #define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
 #define QUERY_DEV_CAP_RSVD_XRC_OFFSET		0x66
 #define QUERY_DEV_CAP_MAX_XRC_OFFSET		0x67
+#define QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET	0x68
+#define QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET	0x6c
+#define QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET	0x70
+#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET	0x76
+#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET		0x70
+#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET	0x74
+#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET	0x77
 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
@@ -222,10 +557,10 @@
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
+#define QUERY_DEV_CAP_ETS_CFG_OFFSET		0x9c
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
-#define QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET	0x68
-#define QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET	0x6c
 
+	dev_cap->flags2 = 0;
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
@@ -232,7 +567,7 @@
 	outbox = mailbox->buf;
 
 	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
-			   MLX4_CMD_TIME_CLASS_A);
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 	if (err)
 		goto out;
 
@@ -253,7 +588,7 @@
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
 	dev_cap->max_mpts = 1 << (field & 0x3f);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
-	dev_cap->reserved_eqs = 1 << (field & 0xf);
+	dev_cap->reserved_eqs = field & 0xf;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
 	dev_cap->max_eqs = 1 << (field & 0xf);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
@@ -275,6 +610,17 @@
 	else
 		dev_cap->max_gso_sz = 1 << field;
 
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR;
+	if (field & 0x10)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP;
+	field &= 0xf;
+	if (field) {
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS;
+		dev_cap->max_rss_tbl_sz = 1 << field;
+	} else
+		dev_cap->max_rss_tbl_sz = 0;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
 	dev_cap->max_rdma_global = 1 << (field & 0x3f);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
@@ -283,16 +629,28 @@
 	dev_cap->num_ports = field & 0xf;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
 	dev_cap->max_msg_sz = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET);
+	if (field & 0x10)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB;
+	dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET);
+	dev_cap->fs_max_num_qp_per_entry = field;
 	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
 	dev_cap->stat_rate_support = stat_rate;
-	MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET);
-	dev_cap->udp_rss = field & 0x1;
-	MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
-	dev_cap->loopback_support = field & 0x1;
-	dev_cap->wol = field & 0x40;
-	MLX4_GET(tmp1, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
-	MLX4_GET(tmp2, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
-	dev_cap->flags = tmp2 | (u64)tmp1 << 32;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS;
+	MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	dev_cap->flags = flags | (u64)ext_flags << 32;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_SYNC_QP_OFFSET);
+	dev_cap->sync_qp = field & 0x10;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
 	dev_cap->reserved_uars = field >> 4;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
@@ -305,10 +663,8 @@
 		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
 		dev_cap->bf_reg_size = 1 << (field & 0x1f);
 		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
-		if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) {
-			mlx4_dbg(dev, "log blue flame is invalid (%d), forcing 3\n", field & 0x1f);
+		if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size))
 			field = 3;
-		}
 		dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
 		mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
 			 dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
@@ -332,7 +688,6 @@
 	dev_cap->reserved_pds = field >> 4;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
 	dev_cap->max_pds = 1 << (field & 0x3f);
-
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET);
 	dev_cap->reserved_xrcds = field >> 4;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET);
@@ -363,8 +718,6 @@
 	dev_cap->max_srq_sz = 1 << field;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
 	dev_cap->max_qp_sz = 1 << field;
-	MLX4_GET(field, outbox, QUERY_DEV_CAP_STAT_CFG_INL_OFFSET);
-	dev_cap->inline_cfg = field & 1;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
 	dev_cap->resize_srq = field & 1;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
@@ -376,13 +729,37 @@
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETS_CFG_OFFSET);
+	if (field32 & (1 << 0))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
+	if (field32 & (1 << 7))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT;
+	if (field32 & (1 << 8))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW;
+	if (field32 & (1 << 13))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG;
+
 	MLX4_GET(dev_cap->max_icm_sz, outbox,
 		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
-	MLX4_GET(dev_cap->max_basic_counters, outbox,
-		 QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET);
-	MLX4_GET(dev_cap->max_ext_counters, outbox,
-		 QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET);
+	if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		MLX4_GET(dev_cap->max_basic_counters, outbox,
+			 QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET);
+	/* FW reports 256, but the real value is 255 */
+	dev_cap->max_basic_counters = min_t(u32, dev_cap->max_basic_counters, 255);
+	if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT)
+		MLX4_GET(dev_cap->max_extended_counters, outbox,
+			 QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET);
 
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	if (field32 & (1 << 16))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
+	if (field32 & (1 << 19))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_LB_SRC_CHK;
+	if (field32 & (1 << 20))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM;
+	if (field32 & (1 << 26))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL;
+
 	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
@@ -408,19 +785,16 @@
 #define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
 #define QUERY_PORT_TRANS_CODE_OFFSET		0x20
 
-#define STAT_CFG_PORT_MODE	(1 << 28)
-#define STAT_CFG_PORT_OFFSET	0x8
-#define STAT_CFG_PORT_MASK	(1 << 20)
-#define STAT_CFG_MOD_INLINE	0x3
-
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
 			err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
-					   MLX4_CMD_TIME_CLASS_B);
+					   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 			if (err)
 				goto out;
 
 			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
 			dev_cap->supported_port_types[i] = field & 3;
+			dev_cap->suggested_type[i] = (field >> 3) & 1;
+			dev_cap->default_sense[i] = (field >> 4) & 1;
 			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 			dev_cap->ib_mtu[i]	   = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
@@ -440,20 +814,6 @@
 			dev_cap->vendor_oui[i] = field32 & 0xffffff;
 			MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
 			MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
-
-			/* Query stat cfg for port enablement */
-			if (dev_cap->inline_cfg) {
-				in_modifier = STAT_CFG_PORT_MODE | i << 8 |
-							STAT_CFG_PORT_OFFSET;
-				err = mlx4_cmd_imm(dev, 0, &out_param,
-						   in_modifier,
-						   STAT_CFG_MOD_INLINE,
-						   MLX4_CMD_MOD_STAT_CFG,
-						   MLX4_CMD_TIME_CLASS_B);
-				if (!err)
-					if (!(out_param & STAT_CFG_PORT_MASK))
-						dev_cap->supported_port_types[i] = 0;
-			}
 		}
 	}
 
@@ -494,8 +854,12 @@
 	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
 		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
 	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+	mlx4_dbg(dev, "Max basic counters: %d\n", dev_cap->max_basic_counters);
+	mlx4_dbg(dev, "Max extended counters: %d\n", dev_cap->max_extended_counters);
+	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
+	dump_dev_cap_flags2(dev, dev_cap->flags2);
 
 out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
@@ -502,6 +866,134 @@
 	return err;
 }
 
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	u64	flags;
+	int	err = 0;
+	u8	field;
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	/* add port mng change event capability unconditionally to slaves */
+	MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV;
+	MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+
+	/* For guests, report Blueflame disabled */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET);
+
+	/* turn off device-managed steering capability if not enabled */
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(field, outbox->buf,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+		field &= 0x7f;
+		MLX4_PUT(outbox->buf, field,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	}
+	return 0;
+}
+
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 def_mac;
+	u8 port_type;
+	u16 short_field;
+	int err;
+	int admin_link_state;
+
+#define MLX4_VF_PORT_NO_LINK_SENSE_MASK	0xE0
+#define MLX4_PORT_LINK_UP_MASK		0x80
+#define QUERY_PORT_CUR_MAX_PKEY_OFFSET	0x0c
+#define QUERY_PORT_CUR_MAX_GID_OFFSET	0x0e
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_NATIVE);
+
+	if (!err && dev->caps.function != slave) {
+		/* set slave default_mac address to be zero MAC */
+		def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac;
+		MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET);
+
+		/* get port type - currently only eth is enabled */
+		MLX4_GET(port_type, outbox->buf,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		/* No link sensing allowed */
+		port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK;
+		/* set port type to currently operating port type */
+		port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3);
+
+		admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state;
+		if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state)
+			port_type |= MLX4_PORT_LINK_UP_MASK;
+		else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state)
+			port_type &= ~MLX4_PORT_LINK_UP_MASK;
+
+		MLX4_PUT(outbox->buf, port_type,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH)
+			short_field = mlx4_get_slave_num_gids(dev, slave);
+		else
+			short_field = 1; /* slave max gids */
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_GID_OFFSET);
+
+		short_field = dev->caps.pkey_table_len[vhcr->in_modifier];
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	}
+
+	return err;
+}
+
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u16			field;
+	int			err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err =  mlx4_cmd_box(dev, 0, mailbox->dma, port, 0,
+			    MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			    MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET);
+	*gid_tbl_len = field;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	*pkey_tbl_len = field;
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len);
+
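mlx4_get_slave_pkey_gid_tbl_len() is exported for consumers such as the IB driver, which needs the per-slave (virtualized) table sizes rather than the physical ones. A hypothetical caller, not part of this patch, might size its caches from it; error handling follows the usual negative-errno convention:

/* Hypothetical caller: size per-slave GID and P_Key caches from the
 * wrapped QUERY_PORT results. */
static int example_size_tables(struct mlx4_dev *dev, u8 port)
{
	int gid_tbl_len, pkey_tbl_len;
	int err;

	err = mlx4_get_slave_pkey_gid_tbl_len(dev, port,
					      &gid_tbl_len, &pkey_tbl_len);
	if (err)
		return err;

	/* ... allocate gid_tbl_len GID slots and pkey_tbl_len P_Key slots ... */
	return 0;
}
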
 int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
 {
 	struct mlx4_cmd_mailbox *mailbox;
@@ -551,7 +1043,8 @@
 
 			if (++nent == MLX4_MAILBOX_SIZE / 16) {
 				err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
-						MLX4_CMD_TIME_CLASS_B);
+						MLX4_CMD_TIME_CLASS_B,
+						MLX4_CMD_NATIVE);
 				if (err)
 					goto out;
 				nent = 0;
@@ -560,7 +1053,8 @@
 	}
 
 	if (nent)
-		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B);
+		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+			       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 	if (err)
 		goto out;
 
@@ -589,13 +1083,15 @@
 
 int mlx4_UNMAP_FA(struct mlx4_dev *dev)
 {
-	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B);
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
 
 int mlx4_RUN_FW(struct mlx4_dev *dev)
 {
-	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 }
 
 int mlx4_QUERY_FW(struct mlx4_dev *dev)
@@ -611,7 +1107,7 @@
 
 #define QUERY_FW_OUT_SIZE             0x100
 #define QUERY_FW_VER_OFFSET            0x00
-#define MC_PROMISC_VER		       0x2000702bcull
+#define QUERY_FW_PPF_ID		       0x09
 #define QUERY_FW_CMD_IF_REV_OFFSET     0x0a
 #define QUERY_FW_MAX_CMD_OFFSET        0x0f
 #define QUERY_FW_ERR_START_OFFSET      0x30
@@ -622,6 +1118,12 @@
 #define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
 #define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
 
+#define QUERY_FW_COMM_BASE_OFFSET      0x40
+#define QUERY_FW_COMM_BAR_OFFSET       0x48
+
+#define QUERY_FW_CLOCK_OFFSET	       0x50
+#define QUERY_FW_CLOCK_BAR	       0x58
+
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
@@ -628,7 +1130,7 @@
 	outbox = mailbox->buf;
 
 	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
-			    MLX4_CMD_TIME_CLASS_A);
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 	if (err)
 		goto out;
 
@@ -640,11 +1142,14 @@
 	dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
 		((fw_ver & 0xffff0000ull) >> 16) |
 		((fw_ver & 0x0000ffffull) << 16);
-	if (dev->caps.fw_ver < MC_PROMISC_VER)
-		dev->caps.mc_promisc_mode = 2;
-	else
-		dev->caps.mc_promisc_mode = 1;
 
+	MLX4_GET(lg, outbox, QUERY_FW_PPF_ID);
+	dev->caps.function = lg;
+
+	if (mlx4_is_slave(dev))
+		goto out;
+
 	MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET);
 	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV ||
 	    cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) {
@@ -686,8 +1191,19 @@
 	MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
 	fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
 
+	MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET);
+	MLX4_GET(fw->comm_bar,  outbox, QUERY_FW_COMM_BAR_OFFSET);
+	fw->comm_bar = (fw->comm_bar >> 6) * 2;
+	mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n",
+		 fw->comm_bar, (unsigned long long)fw->comm_base);
 	mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2);
 
+	MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET);
+	MLX4_GET(fw->clock_bar,    outbox, QUERY_FW_CLOCK_BAR);
+	fw->clock_bar = (fw->clock_bar >> 6) * 2;
+	mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n",
+		 fw->clock_bar, (unsigned long long)fw->clock_offset);
+
 	/*
 	 * Round up number of system pages needed in case
 	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
@@ -704,8 +1220,32 @@
 	return err;
 }
 
-static void get_board_id(void *vsd, char *board_id)
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
 {
+	u8 *outbuf;
+	int err;
+
+	outbuf = outbox->buf;
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	/* for slaves, set pci PPF ID to invalid and zero out everything
+	 * else except FW version */
+	outbuf[0] = outbuf[1] = 0;
+	memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8);
+	outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID;
+
+	return 0;
+}
+
+static void get_board_id(void *vsd, char *board_id, char *vsdstr)
+{
 	int i;
 
 #define VSD_OFFSET_SIG1		0x00
@@ -712,9 +1252,16 @@
 #define VSD_OFFSET_SIG2		0xde
 #define VSD_OFFSET_MLX_BOARD_ID	0xd0
 #define VSD_OFFSET_TS_BOARD_ID	0x20
+#define VSD_LEN			0xd0
 
 #define VSD_SIGNATURE_TOPSPIN	0x5ad
 
+	memset(vsdstr, 0, MLX4_VSD_LEN);
+
+	for (i = 0; i < VSD_LEN / 4; i++)
+		((u32 *)vsdstr)[i] =
+			swab32(*(u32 *)(vsd + i * 4));
+
 	memset(board_id, 0, MLX4_BOARD_ID_LEN);
 
 	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
@@ -741,6 +1288,7 @@
 #define QUERY_ADAPTER_OUT_SIZE             0x100
 #define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
 #define QUERY_ADAPTER_VSD_OFFSET           0x20
+#define QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET 0x1e
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -748,14 +1296,17 @@
 	outbox = mailbox->buf;
 
 	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
-			   MLX4_CMD_TIME_CLASS_A);
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 	if (err)
 		goto out;
 
 	MLX4_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
 
+	adapter->vsd_vendor_id = be16_to_cpup((u16 *)outbox +
+				QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET / 2);
+
 	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
-		     adapter->board_id);
+		     adapter->board_id, adapter->vsd);
 
 out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
@@ -766,14 +1317,16 @@
 {
 	struct mlx4_cmd_mailbox *mailbox;
 	__be32 *inbox;
+	u32 mw_enable;
 	int err;
 
 #define INIT_HCA_IN_SIZE		 0x200
+#define INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE 64
 #define INIT_HCA_VERSION_OFFSET		 0x000
 #define	 INIT_HCA_VERSION		 2
 #define INIT_HCA_CACHELINE_SZ_OFFSET	 0x0e
-#define INIT_HCA_X86_64_BYTE_CACHELINE_SZ	 0x40
 #define INIT_HCA_FLAGS_OFFSET		 0x014
+#define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018
 #define INIT_HCA_QPC_OFFSET		 0x020
 #define	 INIT_HCA_QPC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x10)
 #define	 INIT_HCA_LOG_QP_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x17)
@@ -781,6 +1334,7 @@
 #define	 INIT_HCA_LOG_SRQ_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x2f)
 #define	 INIT_HCA_CQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x30)
 #define	 INIT_HCA_LOG_CQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x37)
+#define	 INIT_HCA_EQE_CQE_OFFSETS	 (INIT_HCA_QPC_OFFSET + 0x38)
 #define	 INIT_HCA_ALTC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x40)
 #define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
 #define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
@@ -791,9 +1345,22 @@
 #define	 INIT_HCA_MC_BASE_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x00)
 #define	 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
 #define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_UC_STEERING_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x18)
 #define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define  INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN	0x6
+#define  INIT_HCA_DRIVER_VERSION_OFFSET   0x140
+#define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
+#define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
+#define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
+#define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
+#define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
+#define  INIT_HCA_FS_IB_BITS_OFFSET       (INIT_HCA_FS_PARAM_OFFSET + 0x25)
+#define  INIT_HCA_FS_IB_NUM_ADDRS_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x26)
 #define INIT_HCA_TPT_OFFSET		 0x0f0
 #define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_TPT_MW_OFFSET		 (INIT_HCA_TPT_OFFSET + 0x08)
+#define  INIT_HCA_TPT_MW_ENABLE		 (1 << 31)
 #define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
 #define	 INIT_HCA_MTT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x10)
 #define	 INIT_HCA_CMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x18)
@@ -809,10 +1376,10 @@
 	memset(inbox, 0, INIT_HCA_IN_SIZE);
 
 	*((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
-#if defined(__x86_64__) || defined(__PPC64__)
-	*((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = INIT_HCA_X86_64_BYTE_CACHELINE_SZ;
-#endif
 
+	*((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) =
+		((ilog2(cache_line_size()) - 4) << 5) | (1 << 4);
+
 #if defined(__LITTLE_ENDIAN)
 	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
 #elif defined(__BIG_ENDIAN)
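The replacement cacheline encoding packs log2(cache_line_size()) - 4 into bits 7:5 of the INIT_HCA byte and sets bit 4 as a "value supplied" flag, instead of the old hard-coded x86-64/PPC64 constant. A worked check of the arithmetic, with cache_line_size() stubbed for user space:

#include <stdint.h>
#include <stdio.h>

/* cache_line_size() is stubbed here; the kernel supplies the real one. */
static unsigned int cache_line_size(void) { return 64; }

int main(void)
{
	unsigned int log2_line = 31 - __builtin_clz(cache_line_size());
	uint8_t field = (uint8_t)(((log2_line - 4) << 5) | (1 << 4));

	/* 64-byte line: log2 = 6, so ((6 - 4) << 5) | 0x10 = 0x50 */
	printf("INIT_HCA cacheline byte = 0x%02x\n", field);
	return 0;
}
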
@@ -831,10 +1398,43 @@
 	if (enable_qos)
 		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
 
-	/* counters mode */
-	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
-		cpu_to_be32(dev->caps.counters_mode << 4);
+	/* Enable fast drop performance optimization */
+	if (dev->caps.fast_drop)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 7);
 
+	/* enable counters */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4);
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29);
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30);
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT)
+		*(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31);
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW) {
+		strncpy((u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET,
+			DRV_NAME_FOR_FW,
+			INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE - 1);
+		mlx4_dbg(dev, "Reporting Driver Version to FW: %s\n",
+			 (u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET);
+	}
+
 	/* QPC/EEC/CQC/EQC/RDMARC attributes */
 
 	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
@@ -850,16 +1450,50 @@
 	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
 
-	/* multicast attributes */
+	/* steering attributes */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+			cpu_to_be32(1 <<
+				    INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN);
 
-	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		/* Enable Ethernet flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
+		/* Enable IPoIB flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+			 INIT_HCA_FS_IB_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+	} else {
+		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_hash_sz,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
+			MLX4_PUT(inbox, (u8) (1 << 3),
+				 INIT_HCA_UC_STEERING_OFFSET);
+	}
 
 	/* TPT attributes */
 
 	MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+	mw_enable = param->mw_enable ? INIT_HCA_TPT_MW_ENABLE : 0;
+	MLX4_PUT(inbox, mw_enable,	   INIT_HCA_TPT_MW_OFFSET);
 	MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
 	MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
 	MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
@@ -866,14 +1500,12 @@
 
 	/* UAR attributes */
 
-	MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_PUT(inbox, param->uar_page_sz,	INIT_HCA_UAR_PAGE_SZ_OFFSET);
 	MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
-	if (!mlx4_pre_t11_mode && dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11)
-		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 10);
 
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000,
+		       MLX4_CMD_NATIVE);
 
-	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000);
-
 	if (err)
 		mlx4_err(dev, "INIT_HCA returns %d\n", err);
 
@@ -881,6 +1513,157 @@
 	return err;
 }
 
+int mlx4_QUERY_HCA(struct mlx4_dev *dev,
+		   struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *outbox;
+	u32 dword_field;
+	u32 mw_enable;
+	int err;
+	u8 byte_field;
+
+#define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
+#define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+			   MLX4_CMD_QUERY_HCA,
+			   MLX4_CMD_TIME_CLASS_B,
+			   !mlx4_is_slave(dev));
+	if (err)
+		goto out;
+
+	MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET);
+	MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET);
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_GET(param->qpc_base,      outbox, INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_GET(param->log_num_qps,   outbox, INIT_HCA_LOG_QP_OFFSET);
+	MLX4_GET(param->srqc_base,     outbox, INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_srqs,  outbox, INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_GET(param->cqc_base,      outbox, INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_cqs,   outbox, INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_GET(param->altc_base,     outbox, INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_GET(param->auxc_base,     outbox, INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_GET(param->eqc_base,      outbox, INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_eqs,   outbox, INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_GET(param->rdmarc_base,   outbox, INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET);
+
+	MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET);
+	if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) {
+		param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+	} else {
+		MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET);
+		if (byte_field & 0x8)
+			param->steering_mode = MLX4_STEERING_MODE_B0;
+		else
+			param->steering_mode = MLX4_STEERING_MODE_A0;
+	}
+	/* steering attributes */
+	if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+	} else {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_hash_sz,  outbox,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+	}
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS);
+	if (byte_field & 0x20) /* 64-bytes eqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED;
+	if (byte_field & 0x40) /* 64-bytes cqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
+
+	/* TPT attributes */
+
+	MLX4_GET(param->dmpt_base,  outbox, INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_GET(mw_enable,	    outbox, INIT_HCA_TPT_MW_OFFSET);
+	param->mw_enable = (mw_enable & INIT_HCA_TPT_MW_ENABLE) ==
+			   INIT_HCA_TPT_MW_ENABLE;
+	MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_GET(param->mtt_base,   outbox, INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_GET(param->cmpt_base,  outbox, INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0
+ * and real QP0 are active, so that the paravirtualized QP0 is ready
+ * to operate */
+static int check_qp0_state(struct mlx4_dev *dev, int function, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/* irrelevant if not infiniband */
+	if (priv->mfunc.master.qp0_state[port].proxy_qp0_active &&
+	    priv->mfunc.master.qp0_state[port].qp0_active)
+		return 1;
+	return 0;
+}
+
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = vhcr->in_modifier;
+	int err;
+
+	if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		/* Enable port only if it was previously disabled */
+		if (!priv->mfunc.master.init_port_ref[port]) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	} else {
+		if (slave == mlx4_master_func_num(dev)) {
+			if (check_qp0_state(dev, slave, port) &&
+			    !priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+					       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.qp0_state[port].port_active = 1;
+				priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	}
+	++priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
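mlx4_INIT_PORT_wrapper() combines a per-slave bitmask with a per-port reference count so the physical INIT_PORT command is issued only on the 0 -> 1 transition; mlx4_CLOSE_PORT_wrapper() below mirrors it on the way down. The idiom, reduced to a sketch with illustrative names rather than the driver's:

/* Refcounted enable idiom used by the INIT/CLOSE_PORT wrappers above. */
struct port_ref {
	int refcnt;			/* slaves currently holding the port open */
	unsigned int slave_mask;	/* which slaves hold a reference */
};

static int port_open(struct port_ref *p, int slave, int (*hw_open)(void))
{
	int err;

	if (p->slave_mask & (1U << slave))
		return 0;			/* this slave already counted */
	if (!p->refcnt) {			/* 0 -> 1: touch the hardware */
		err = hw_open();
		if (err)
			return err;
	}
	p->slave_mask |= 1U << slave;
	p->refcnt++;
	return 0;
}
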
 int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
 {
 	struct mlx4_cmd_mailbox *mailbox;
@@ -924,26 +1707,69 @@
 		MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET);
 
 		err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
-			       MLX4_CMD_TIME_CLASS_A);
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 
 		mlx4_free_cmd_mailbox(dev, mailbox);
 	} else
 		err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
-			       MLX4_CMD_TIME_CLASS_A);
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
 
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = vhcr->in_modifier;
+	int err;
+
+	if (!(priv->mfunc.master.slave_state[slave].init_port_mask &
+	    (1 << port)))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		if (priv->mfunc.master.init_port_ref[port] == 1) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+				       1000, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	} else {
+		/* infiniband port */
+		if (slave == mlx4_master_func_num(dev)) {
+			if (!priv->mfunc.master.qp0_state[port].qp0_active &&
+			    priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+					       1000, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+				priv->mfunc.master.qp0_state[port].port_active = 0;
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	}
+	--priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
 int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
 {
-	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000);
+	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000,
+			MLX4_CMD_WRAPPED);
 }
 EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
 
 int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
 {
-	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000);
+	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000,
+			MLX4_CMD_NATIVE);
 }
 
 int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
@@ -950,7 +1776,7 @@
 {
 	int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
 			       MLX4_CMD_SET_ICM_SIZE,
-			       MLX4_CMD_TIME_CLASS_A);
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 	if (ret)
 		return ret;
 
@@ -967,9 +1793,53 @@
 int mlx4_NOP(struct mlx4_dev *dev)
 {
 	/* Input modifier of 0x1f means "finish as soon as possible." */
-	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100);
+	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 }
 
+int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length,
+			     u8 op_modifier, u32 in_offset[],
+			     u32 counter_out[])
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int ret;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier,
+			   MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < array_length; i++) {
+		if (in_offset[i] > MLX4_MAILBOX_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		MLX4_GET(counter_out[i], outbox, in_offset[i]);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_query_diag_counters);
+
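A hypothetical use of mlx4_query_diag_counters(): fetch two counters at caller-chosen mailbox offsets. The offsets below are placeholders for illustration, not documented firmware layout:

static int example_read_diag(struct mlx4_dev *dev)
{
	u32 offsets[2] = { 0x10, 0x14 };	/* illustrative offsets only */
	u32 counters[2];
	int err;

	err = mlx4_query_diag_counters(dev, 2, 0 /* op_modifier */,
				       offsets, counters);
	if (err)
		return err;

	mlx4_dbg(dev, "diag[0]=%u diag[1]=%u\n", counters[0], counters[1]);
	return 0;
}
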
+int mlx4_MOD_STAT_CFG_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	return -EPERM;
+}
+
 #define MLX4_WOL_SETUP_MODE (5 << 28)
 int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port)
 {
@@ -976,7 +1846,8 @@
 	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
 
 	return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3,
-			    MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A);
+			    MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_NATIVE);
 }
 EXPORT_SYMBOL_GPL(mlx4_wol_read);
 
@@ -985,46 +1856,99 @@
 	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
 
 	return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG,
-					MLX4_CMD_TIME_CLASS_A);
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 }
 EXPORT_SYMBOL_GPL(mlx4_wol_write);
 
-int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length,
-			     u8 op_modifier, u32 in_offset[], u32 counter_out[])
+enum {
+	ADD_TO_MCG = 0x26,
+};
+
+
+void mlx4_opreq_action(struct work_struct *work)
 {
+	struct mlx4_priv *priv = container_of(work, struct mlx4_priv, opreq_task);
+	struct mlx4_dev *dev = &priv->dev;
+	int num_tasks = atomic_read(&priv->opreq_count);
 	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
 	u32 *outbox;
-	int ret;
+	u32 modifier;
+	u16 token;
+	u16 type_m;
+	u16 type;
+	int err;
+	u32 num_qps;
+	struct mlx4_qp qp;
 	int i;
+	u8 rem_mcg;
+	u8 prot;
 
+#define GET_OP_REQ_MODIFIER_OFFSET	0x08
+#define GET_OP_REQ_TOKEN_OFFSET		0x14
+#define GET_OP_REQ_TYPE_OFFSET		0x1a
+#define GET_OP_REQ_DATA_OFFSET		0x20
+
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
+	if (IS_ERR(mailbox)) {
+		mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n");
+		return;
+	}
 	outbox = mailbox->buf;
 
-	ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier,
-			   MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A);
-	if (ret)
-		goto out;
+	while (num_tasks) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+				   MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to retrieve required operation: %d\n", err);
+			return;
+		}
+		MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET);
+		MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET);
+		MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET);
+		type_m = type >> 12;
+		type &= 0xfff;
 
-	for (i=0; i < array_length; i++) {
-		if (in_offset[i] > MLX4_MAILBOX_SIZE) {
-			ret = -EINVAL;
+		switch (type) {
+		case ADD_TO_MCG:
+			if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				mlx4_warn(dev, "ADD MCG operation is not supported in "
+					       "DEVICE_MANAGED steering mode\n");
+				err = EPERM;
+				break;
+			}
+			mgm = (struct mlx4_mgm *) ((u8 *) (outbox) + GET_OP_REQ_DATA_OFFSET);
+			num_qps = be32_to_cpu(mgm->members_count) & MGM_QPN_MASK;
+			rem_mcg = ((u8 *) (&mgm->members_count))[0] & 1;
+			prot = ((u8 *) (&mgm->members_count))[0] >> 6;
+
+			for (i = 0; i < num_qps; i++) {
+				qp.qpn = be32_to_cpu(mgm->qp[i]);
+				if (rem_mcg)
+					err = mlx4_multicast_detach(dev, &qp, mgm->gid, prot, 0);
+				else
+					err = mlx4_multicast_attach(dev, &qp, mgm->gid, mgm->gid[5], 0, prot, NULL);
+				if (err)
+					break;
+			}
+			break;
+		default:
+			mlx4_warn(dev, "Bad type for required operation\n");
+			err = EINVAL;
+			break;
+		}
+		err = mlx4_cmd(dev, 0, ((u32) err | cpu_to_be32(token) << 16), 1,
+			       MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+			       MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to acknowledge required request: %d\n", err);
 			goto out;
 		}
-
-		MLX4_GET(counter_out[i], outbox, in_offset[i]);
+		memset(outbox, 0, 0xffc);
+		num_tasks = atomic_dec_return(&priv->opreq_count);
 	}
 
 out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
-	return ret;
 }
-EXPORT_SYMBOL_GPL(mlx4_query_diag_counters);
-
-void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported)
-{
-	*enable_pre_t11 = !!mlx4_pre_t11_mode;
-	*t11_supported = !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FC_T11);
-}
-EXPORT_SYMBOL_GPL(mlx4_get_fc_t11_settings);
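mlx4_opreq_action() above drains a producer/consumer counter: the event path increments opreq_count and queues the work item, and the loop runs until atomic_dec_return() reports no pending requests, so a request that arrives mid-loop is not lost. A sketch of the producer side, assuming the fields shown above; the mlx4_wq workqueue identifier is an assumption:

/* Sketch of the producer that pairs with mlx4_opreq_action(): the EQ
 * handler publishes one request and kicks the work item. */
static void example_opreq_event(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	atomic_inc(&priv->opreq_count);		/* publish one pending request */
	queue_work(mlx4_wq, &priv->opreq_task);	/* consumer will drain it */
}
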

Modified: trunk/sys/ofed/drivers/net/mlx4/fw.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/fw.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/fw.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -78,10 +78,10 @@
 	u16 wavelength[MLX4_MAX_PORTS + 1];
 	u64 trans_code[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
-	int udp_rss;
-	int loopback_support;
-	int wol;
+	int fs_log_max_ucast_qp_range_size;
+	int fs_max_num_qp_per_entry;
 	u64 flags;
+	u64 flags2;
 	int reserved_uars;
 	int uar_size;
 	int min_page_sz;
@@ -108,21 +108,48 @@
 	int dmpt_entry_sz;
 	int cmpt_entry_sz;
 	int mtt_entry_sz;
-	int inline_cfg;
 	int resize_srq;
 	u32 bmme_flags;
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
+	int max_rss_tbl_sz;
 	u8  supported_port_types[MLX4_MAX_PORTS + 1];
+	u8  suggested_type[MLX4_MAX_PORTS + 1];
+	u8  default_sense[MLX4_MAX_PORTS + 1];
 	u8  log_max_macs[MLX4_MAX_PORTS + 1];
 	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 	u32 max_basic_counters;
-	u32 max_ext_counters;
+	u32 sync_qp;
+	u8  timestamp_support;
+	u32 max_extended_counters;
 };
 
+struct mlx4_func_cap {
+	u8	num_ports;
+	u8	flags;
+	u32	pf_context_behaviour;
+	int	qp_quota;
+	int	cq_quota;
+	int	srq_quota;
+	int	mpt_quota;
+	int	mtt_quota;
+	int	max_eq;
+	int	reserved_eq;
+	int	mcg_quota;
+	u32	qp0_tunnel_qpn;
+	u32	qp0_proxy_qpn;
+	u32	qp1_tunnel_qpn;
+	u32	qp1_proxy_qpn;
+	u8	physical_port;
+	u8	port_flags;
+	u8	def_counter_index;
+};
+
 struct mlx4_adapter {
+	u16 vsd_vendor_id;
 	char board_id[MLX4_BOARD_ID_LEN];
+	char vsd[MLX4_VSD_LEN];
 	u8   inta_pin;
 };
 
@@ -138,8 +165,10 @@
 	u64 dmpt_base;
 	u64 cmpt_base;
 	u64 mtt_base;
+	u64 global_caps;
 	u16 log_mc_entry_sz;
 	u16 log_mc_hash_sz;
+	u16 hca_core_clock;
 	u8  log_num_qps;
 	u8  log_num_srqs;
 	u8  log_num_cqs;
@@ -148,6 +177,11 @@
 	u8  log_mc_table_sz;
 	u8  log_mpt_sz;
 	u8  log_uar_sz;
+	u8  uar_page_sz; /* log pg sz in 4k chunks */
+	u8  mw_enable;	/* Enable memory windows */
+	u8  fs_hash_enable_bits;
+	u8  steering_mode; /* for QUERY_HCA */
+	u64 dev_cap_enabled;
 };
 
 struct mlx4_init_ib_param {
@@ -172,6 +206,13 @@
 };
 
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port,
+			struct mlx4_func_cap *func_cap);
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
 int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
 int mlx4_UNMAP_FA(struct mlx4_dev *dev);
 int mlx4_RUN_FW(struct mlx4_dev *dev);
@@ -178,10 +219,12 @@
 int mlx4_QUERY_FW(struct mlx4_dev *dev);
 int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
 int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
 int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
 int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
 int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
 int mlx4_NOP(struct mlx4_dev *dev);
 int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
+void mlx4_opreq_action(struct work_struct *work);
 
 #endif /* MLX4_FW_H */


Property changes on: trunk/sys/ofed/drivers/net/mlx4/fw.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/net/mlx4/icm.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/icm.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/icm.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -31,10 +31,11 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
 
 #include <linux/mlx4/cmd.h>
 
@@ -93,13 +94,17 @@
 	kfree(icm);
 }
 
-static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
+				gfp_t gfp_mask, int node)
 {
 	struct page *page;
 
-	page = alloc_pages(gfp_mask, order);
-	if (!page)
-		return -ENOMEM;
+	page = alloc_pages_node(node, gfp_mask, order);
+	if (!page) {
+		page = alloc_pages(gfp_mask, order);
+		if (!page)
+			return -ENOMEM;
+	}
 
 	sg_set_page(mem, page, PAGE_SIZE << order, 0);
 	return 0;
@@ -130,9 +135,13 @@
 	/* We use sg_set_buf for coherent allocs, which assumes low memory */
 	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
 
-	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
-	if (!icm)
-		return NULL;
+	icm = kmalloc_node(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN),
+			   dev->numa_node);
+	if (!icm) {
+		icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+		if (!icm)
+			return NULL;
+	}
 
 	icm->refcount = 0;
 	INIT_LIST_HEAD(&icm->chunk_list);
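The ICM allocation change tries the device's NUMA node first and falls back to any node rather than failing outright, trading locality for availability. The same pattern, pulled out into a hypothetical helper:

#include <linux/slab.h>

/* Node-first, any-node-fallback pattern used above: prefer memory local
 * to dev->numa_node, degrade locality instead of failing when that node
 * is exhausted (hypothetical helper, not part of the driver). */
static void *alloc_node_or_any(size_t size, gfp_t gfp, int node)
{
	void *p = kmalloc_node(size, gfp, node);

	if (!p)
		p = kmalloc(size, gfp);
	return p;
}
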
@@ -141,10 +150,15 @@
 
 	while (npages > 0) {
 		if (!chunk) {
-			chunk = kmalloc(sizeof *chunk,
-					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
-			if (!chunk)
-				goto fail;
+			chunk = kmalloc_node(sizeof *chunk,
+					     gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN),
+					     dev->numa_node);
+			if (!chunk) {
+				chunk = kmalloc(sizeof *chunk,
+						gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+				if (!chunk)
+					goto fail;
+			}
 
 			sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
 			chunk->npages = 0;
@@ -161,31 +175,33 @@
 						      cur_order, gfp_mask);
 		else
 			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
-						   cur_order, gfp_mask);
+						   cur_order, gfp_mask,
+						   dev->numa_node);
 
-		if (!ret) {
-			++chunk->npages;
+		if (ret) {
+			if (--cur_order < 0)
+				goto fail;
+			else
+				continue;
+		}
 
-			if (coherent)
-				++chunk->nsg;
-			else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
-				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
-							chunk->npages,
-							PCI_DMA_BIDIRECTIONAL);
+		++chunk->npages;
 
-				if (chunk->nsg <= 0)
-					goto fail;
-			}
+		if (coherent)
+			++chunk->nsg;
+		else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
+			chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+						chunk->npages,
+						PCI_DMA_BIDIRECTIONAL);
 
-			if (chunk->npages == MLX4_ICM_CHUNK_LEN)
-				chunk = NULL;
-
-			npages -= 1 << cur_order;
-		} else {
-			--cur_order;
-			if (cur_order < 0)
+			if (chunk->nsg <= 0)
 				goto fail;
 		}
+
+		if (chunk->npages == MLX4_ICM_CHUNK_LEN)
+			chunk = NULL;
+
+		npages -= 1 << cur_order;
 	}
 
 	if (!coherent && chunk) {
@@ -209,38 +225,12 @@
 	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
 }
 
-int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
 {
 	return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
-			MLX4_CMD_TIME_CLASS_B);
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
-int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt)
-{
-	struct mlx4_cmd_mailbox *mailbox;
-	__be64 *inbox;
-	int err;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-	inbox = mailbox->buf;
-
-	inbox[0] = cpu_to_be64(virt);
-	inbox[1] = cpu_to_be64(dma_addr);
-
-	err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM,
-		       MLX4_CMD_TIME_CLASS_B);
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-
-	if (!err)
-		mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
-			  (unsigned long long) dma_addr, (unsigned long long) virt);
-
-	return err;
-}
-
 int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
 {
 	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
@@ -248,12 +238,14 @@
 
 int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
 {
-	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B);
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
-int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj)
 {
-	int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+	u32 i = (obj & (table->num_obj - 1)) /
+			(MLX4_TABLE_CHUNK_SIZE / table->obj_size);
 	int ret = 0;
 
 	mutex_lock(&table->mutex);
@@ -286,9 +278,10 @@
 	return ret;
 }
 
-void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj)
 {
-	int i;
+	u32 i;
+	u64 offset;
 
 	i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
 
@@ -295,18 +288,25 @@
 	mutex_lock(&table->mutex);
 
 	if (--table->icm[i]->refcount == 0) {
-		mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
-			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-		mlx4_free_icm(dev, table->icm[i], table->coherent);
-		table->icm[i] = NULL;
+		offset = (u64) i * MLX4_TABLE_CHUNK_SIZE;
+
+		if (!mlx4_UNMAP_ICM(dev, table->virt + offset,
+				    MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE)) {
+			mlx4_free_icm(dev, table->icm[i], table->coherent);
+			table->icm[i] = NULL;
+		} else {
+			pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n");
+		}
 	}
 
 	mutex_unlock(&table->mutex);
 }
 
-void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle)
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj,
+			dma_addr_t *dma_handle)
 {
-	int idx, offset, dma_offset, i;
+	int offset, dma_offset, i;
+	u64 idx;
 	struct mlx4_icm_chunk *chunk;
 	struct mlx4_icm *icm;
 	struct page *page = NULL;
@@ -316,7 +316,7 @@
 
 	mutex_lock(&table->mutex);
 
-	idx = (obj & (table->num_obj - 1)) * table->obj_size;
+	idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size;
 	icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE];
 	dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE;
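The mlx4_table_find() fix casts to u64 before the multiply; without it, obj * obj_size is computed in 32 bits and silently wraps once a table grows past 4 GB. A user-space worked check of the difference:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint32_t obj = 70000000, obj_size = 64;		/* ~4.5 GB of entries */
	uint64_t bad  = obj * obj_size;			/* 32-bit multiply, wraps */
	uint64_t good = (uint64_t)obj * obj_size;	/* widened first */

	assert(bad == 185032704 && good == 4480000000ULL);
	return 0;
}
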
 
@@ -350,10 +350,11 @@
 }
 
 int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			 int start, int end)
+			 u32 start, u32 end)
 {
 	int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
-	int i, err;
+	int err;
+	u32 i;
 
 	for (i = start; i <= end; i += inc) {
 		err = mlx4_table_get(dev, table, i);
@@ -373,9 +374,9 @@
 }
 
 void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			  int start, int end)
+			  u32 start, u32 end)
 {
-	int i;
+	u32 i;
 
 	for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size)
 		mlx4_table_put(dev, table, i);
@@ -382,7 +383,7 @@
 }
 
 int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			u64 virt, int obj_size,	int nobj, int reserved,
+			u64 virt, int obj_size,	u64 nobj, int reserved,
 			int use_lowmem, int use_coherent)
 {
 	int obj_per_chunk;
@@ -389,9 +390,10 @@
 	int num_icm;
 	unsigned chunk_size;
 	int i;
+	u64 size;
 
 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
-	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+	num_icm = div_u64((nobj + obj_per_chunk - 1), obj_per_chunk);
 
 	table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
 	if (!table->icm)
@@ -404,10 +406,12 @@
 	table->coherent = use_coherent;
 	mutex_init(&table->mutex);
 
+	size = (u64) nobj * obj_size;
 	for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
 		chunk_size = MLX4_TABLE_CHUNK_SIZE;
-		if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > nobj * obj_size)
-			chunk_size = PAGE_ALIGN(nobj * obj_size - i * MLX4_TABLE_CHUNK_SIZE);
+		if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
+			chunk_size = PAGE_ALIGN(size -
+					i * MLX4_TABLE_CHUNK_SIZE);
 
 		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
 					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
@@ -432,10 +436,16 @@
 err:
 	for (i = 0; i < num_icm; ++i)
 		if (table->icm[i]) {
-			mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
-				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-			mlx4_free_icm(dev, table->icm[i], use_coherent);
+			if (!mlx4_UNMAP_ICM(dev,
+					    virt + i * MLX4_TABLE_CHUNK_SIZE,
+					    MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE)) {
+				mlx4_free_icm(dev, table->icm[i], use_coherent);
+			} else {
+				pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n");
+				return -ENOMEM;
+			}
 		}
+	kfree(table->icm);
 
 	return -ENOMEM;
 }
@@ -442,14 +452,22 @@
 
 void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
 {
-	int i;
+	int i, err = 0;
 
 	for (i = 0; i < table->num_icm; ++i)
 		if (table->icm[i]) {
-			mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
-				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-			mlx4_free_icm(dev, table->icm[i], table->coherent);
+			err = mlx4_UNMAP_ICM(dev,
+					     table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+					     MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			if (!err) {
+				mlx4_free_icm(dev, table->icm[i],
+					      table->coherent);
+			} else {
+				pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n");
+				break;
+			}
 		}
 
-	kfree(table->icm);
+	if (!err)
+		kfree(table->icm);
 }

Modified: trunk/sys/ofed/drivers/net/mlx4/icm.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/icm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/icm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -37,6 +37,7 @@
 #include <linux/list.h>
 #include <linux/pci.h>
 #include <linux/mutex.h>
+#include <linux/scatterlist.h>
 
 #define MLX4_ICM_CHUNK_LEN						\
 	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
@@ -71,17 +72,17 @@
 				gfp_t gfp_mask, int coherent);
 void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
 
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 u32 start, u32 end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  u32 start, u32 end);
 int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			u64 virt, int obj_size,	int nobj, int reserved,
+			u64 virt, int obj_size,	u64 nobj, int reserved,
 			int use_lowmem, int use_coherent);
 void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
-int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
-void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
-void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle);
-int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			 int start, int end);
-void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
-			  int start, int end);
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle);
 
 static inline void mlx4_icm_first(struct mlx4_icm *icm,
 				  struct mlx4_icm_iter *iter)
@@ -122,8 +123,6 @@
 	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
 }
 
-int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count);
-int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt);
 int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
 int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
 


Property changes on: trunk/sys/ofed/drivers/net/mlx4/icm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/net/mlx4/intf.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/intf.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/intf.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,6 +31,9 @@
  * SOFTWARE.
  */
 
+#include <linux/slab.h>
+#include <linux/module.h>
+
 #include "mlx4.h"
 
 struct mlx4_device_context {
@@ -112,38 +115,9 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
 
-struct mlx4_dev *mlx4_query_interface(void *int_dev, int *port)
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param)
 {
-	struct mlx4_priv *priv;
-	struct mlx4_device_context *dev_ctx;
-	enum mlx4_query_reply r;
-	unsigned long flags;
-
-	mutex_lock(&intf_mutex);
-
-	list_for_each_entry(priv, &dev_list, dev_list) {
-		spin_lock_irqsave(&priv->ctx_lock, flags);
-		list_for_each_entry(dev_ctx, &priv->ctx_list, list) {
-			if (!dev_ctx->intf->query)
-				continue;
-			r = dev_ctx->intf->query(dev_ctx->context, int_dev);
-			if (r != MLX4_QUERY_NOT_MINE) {
-				*port = r;
-				spin_unlock_irqrestore(&priv->ctx_lock, flags);
-				mutex_unlock(&intf_mutex);
-				return &priv->dev;
-			}
-		}
-		spin_unlock_irqrestore(&priv->ctx_lock, flags);
-	}
-
-	mutex_unlock(&intf_mutex);
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(mlx4_query_interface);
-
-void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port)
-{
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_device_context *dev_ctx;
 	unsigned long flags;
@@ -152,7 +126,7 @@
 
 	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
 		if (dev_ctx->intf->event)
-			dev_ctx->intf->event(dev, dev_ctx->context, type, port);
+			dev_ctx->intf->event(dev, dev_ctx->context, type, param);
 
 	spin_unlock_irqrestore(&priv->ctx_lock, flags);
 }
@@ -169,7 +143,8 @@
 		mlx4_add_device(intf, priv);
 
 	mutex_unlock(&intf_mutex);
-	mlx4_start_catas_poll(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_start_catas_poll(dev);
 
 	return 0;
 }
@@ -179,18 +154,19 @@
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_interface *intf;
 
-	mlx4_stop_catas_poll(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_stop_catas_poll(dev);
 	mutex_lock(&intf_mutex);
 
 	list_for_each_entry(intf, &intf_list, list)
 		mlx4_remove_device(intf, priv);
 
-	list_del(&priv->dev_list);
+	list_del_init(&priv->dev_list);
 
 	mutex_unlock(&intf_mutex);
 }
 
-void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_device_context *dev_ctx;
@@ -200,13 +176,13 @@
 	spin_lock_irqsave(&priv->ctx_lock, flags);
 
 	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
-		if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_prot_dev) {
-			result = dev_ctx->intf->get_prot_dev(dev, dev_ctx->context, port);
+		if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) {
+			result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port);
 			break;
-	}
+		}
 
 	spin_unlock_irqrestore(&priv->ctx_lock, flags);
 
 	return result;
 }
-
+EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev);
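
The event hook now takes an unsigned long instead of an int port, so one callback signature can carry a port number, a pointer-sized token, or nothing, depending on the event type. A hedged sketch of a consumer (all my_* names are invented for illustration):

    static void my_event(struct mlx4_dev *dev, void *context,
                         enum mlx4_dev_event event, unsigned long param)
    {
            if (event == MLX4_DEV_EVENT_PORT_UP)
                    pr_info("port %lu is up\n", param); /* param is the port here */
    }

    static struct mlx4_interface my_intf = {
            .add      = my_add,     /* hypothetical add/remove handlers */
            .remove   = my_remove,
            .event    = my_event,
            .protocol = MLX4_PROT_ETH,
    };

mlx4_register_interface(&my_intf) would then see my_event invoked through the mlx4_dispatch_event() above.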

Modified: trunk/sys/ofed/drivers/net/mlx4/main.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/main.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/main.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -33,12 +33,21 @@
  * SOFTWARE.
  */
 
+#include <linux/kmod.h>
+/*
+ * kmod.h must be included before module.h since it (indirectly) includes
+ * sys/module.h.  To use the FreeBSD MODULE_VERSION macro, sys/module.h must
+ * define it before linux/module.h does.
+ */
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/slab.h>
 #include <linux/io-mapping.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/fs.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
@@ -46,11 +55,9 @@
 #include "mlx4.h"
 #include "fw.h"
 #include "icm.h"
+#include "mlx4_stats.h"
 
-MODULE_AUTHOR("Roland Dreier");
-MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
+/* Mellanox ConnectX HCA low-level driver */
 
 struct workqueue_struct *mlx4_wq;
 
@@ -62,15 +69,11 @@
 
 #endif /* CONFIG_MLX4_DEBUG */
 
-int mlx4_blck_lb=1;
-module_param_named(block_loopback, mlx4_blck_lb, int, 0644);
-MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0");
-
 #ifdef CONFIG_PCI_MSI
 
 static int msi_x = 1;
 module_param(msi_x, int, 0444);
-MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x (non-SRIOV only)");
 
 #else /* CONFIG_PCI_MSI */
 
@@ -78,124 +81,483 @@
 
 #endif /* CONFIG_PCI_MSI */
 
-static char mlx4_version[] __devinitdata =
-	DRV_NAME ": Mellanox ConnectX core driver v"
-	DRV_VERSION " (" DRV_RELDATE ")\n";
+static int enable_sys_tune = 0;
+module_param(enable_sys_tune, int, 0444);
+MODULE_PARM_DESC(enable_sys_tune, "Tune the CPUs for better performance (default 0)");
 
-struct mutex drv_mutex;
+int mlx4_blck_lb = 1;
+module_param_named(block_loopback, mlx4_blck_lb, int, 0644);
+MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0 "
+				 "(default: 1)");
+enum {
+	DEFAULT_DOMAIN	= 0,
+	BDF_STR_SIZE	= 8, /* bb:dd.f- */
+	DBDF_STR_SIZE	= 13 /* mmmm:bb:dd.f- */
+};
 
-static struct mlx4_profile default_profile = {
-	.num_qp		= 1 << 18,
-	.num_srq	= 1 << 16,
-	.rdmarc_per_qp	= 1 << 4,
-	.num_cq		= 1 << 16,
-	.num_mcg	= 1 << 13,
-	.num_mpt	= 1 << 19,
-	.num_mtt	= 1 << 20,
+enum {
+	NUM_VFS,
+	PROBE_VF,
+	PORT_TYPE_ARRAY
 };
 
-static int log_num_mac = 2;
+enum {
+	VALID_DATA,
+	INVALID_DATA,
+	INVALID_STR
+};
+
+struct param_data {
+	int				id;
+	struct mlx4_dbdf2val_lst	dbdf2val;
+};
+
+static struct param_data num_vfs = {
+	.id		= NUM_VFS,
+	.dbdf2val = {
+		.name		= "num_vfs param",
+		.num_vals	= 1,
+		.def_val	= {0},
+		.range		= {0, MLX4_MAX_NUM_VF}
+	}
+};
+module_param_string(num_vfs, num_vfs.dbdf2val.str,
+		    sizeof(num_vfs.dbdf2val.str), 0444);
+MODULE_PARM_DESC(num_vfs,
+		 "Either single value (e.g. '5') to define uniform num_vfs value for all device functions\n"
+		 "\t\tor a string to map device function numbers to their num_vfs values (e.g. '0000:04:00.0-5,002b:1c:0b.a-15').\n"
+		 "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for num_vfs value (e.g. 15).");
+
+static struct param_data probe_vf = {
+	.id		= PROBE_VF,
+	.dbdf2val = {
+		.name		= "probe_vf param",
+		.num_vals	= 1,
+		.def_val	= {0},
+		.range		= {0, MLX4_MAX_NUM_VF}
+	}
+};
+module_param_string(probe_vf, probe_vf.dbdf2val.str,
+		    sizeof(probe_vf.dbdf2val.str), 0444);
+MODULE_PARM_DESC(probe_vf,
+		 "Either single value (e.g. '3') to define uniform number of VFs to probe by the pf driver for all device functions\n"
+		 "\t\tor a string to map device function numbers to their probe_vf values (e.g. '0000:04:00.0-3,002b:1c:0b.a-13').\n"
+		 "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for probe_vf value (e.g. 13).");
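
Both num_vfs and probe_vf use the same dbdf-to-value string syntax, parsed by mlx4_fill_dbdf2val_tbl() further down in this file. Illustrative settings (the device addresses are hypothetical):

    num_vfs=5                                 uniform: 5 VFs on every function
    num_vfs=0000:04:00.0-5                    5 VFs on that function only
    num_vfs=0000:04:00.0-5,002b:1c:0b.a-15    per-function values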
+
+int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+
+module_param_named(log_num_mgm_entry_size,
+			mlx4_log_num_mgm_entry_size, int, 0444);
+MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, which defines the num"
+					 " of qp per mcg, for example:"
+					 " 10 gives 248. Range: 7 <="
+					 " log_num_mgm_entry_size <= 12."
+					 " To activate device managed"
+					 " flow steering when available, set to -1");
+
+static int high_rate_steer;
+module_param(high_rate_steer, int, 0444);
+MODULE_PARM_DESC(high_rate_steer, "Enable steering mode for higher packet rate"
+				  " (default off)");
+
+static int fast_drop;
+module_param_named(fast_drop, fast_drop, int, 0444);
+MODULE_PARM_DESC(fast_drop,
+		 "Enable fast packet drop when no receive WQEs are posted");
+
+int mlx4_enable_64b_cqe_eqe = 1;
+module_param_named(enable_64b_cqe_eqe, mlx4_enable_64b_cqe_eqe, int, 0644);
+MODULE_PARM_DESC(enable_64b_cqe_eqe,
+		 "Enable 64 byte CQEs/EQEs when the FW supports this, if non-zero (default: 1)");
+
+#define HCA_GLOBAL_CAP_MASK            0
+
+#define PF_CONTEXT_BEHAVIOUR_MASK	MLX4_FUNC_CAP_64B_EQE_CQE
+
+static char mlx4_version[] __devinitdata =
+	DRV_NAME ": Mellanox ConnectX VPI driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static int log_num_mac = 7;
 module_param_named(log_num_mac, log_num_mac, int, 0444);
 MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
 
-static int use_prio;
-module_param_named(use_prio, use_prio, bool, 0444);
-MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
-		  "(0/1, default 0)");
+static int log_num_vlan;
+module_param_named(log_num_vlan, log_num_vlan, int, 0444);
+MODULE_PARM_DESC(log_num_vlan,
+	"(Obsolete) Log2 max number of VLANs per ETH port (0-7)");
+/* Log2 max number of VLANs per ETH port (0-7) */
+#define MLX4_LOG_NUM_VLANS 7
 
-static struct mlx4_profile mod_param_profile = { 0 };
+int log_mtts_per_seg = ilog2(1);
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment "
+		 "(0-7) (default: 0)");
 
+static struct param_data port_type_array = {
+	.id		= PORT_TYPE_ARRAY,
+	.dbdf2val = {
+		.name		= "port_type_array param",
+		.num_vals	= 2,
+		.def_val	= {MLX4_PORT_TYPE_ETH, MLX4_PORT_TYPE_ETH},
+		.range		= {MLX4_PORT_TYPE_IB, MLX4_PORT_TYPE_NA}
+	}
+};
+module_param_string(port_type_array, port_type_array.dbdf2val.str,
+		    sizeof(port_type_array.dbdf2val.str), 0444);
+MODULE_PARM_DESC(port_type_array,
+		 "Either pair of values (e.g. '1,2') to define uniform port1/port2 types configuration for all device functions\n"
+		 "\t\tor a string to map device function numbers to their pair of port types values (e.g. '0000:04:00.0-1;2,002b:1c:0b.a-1;1').\n"
+		 "\t\tValid port types: 1-ib, 2-eth, 3-auto, 4-N/A\n"
+		 "\t\tIn case that only one port is available use the N/A port type for port2 (e.g '1,4').");
+
+
+struct mlx4_port_config {
+	struct list_head list;
+	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+	struct pci_dev *pdev;
+};
+
+#define MLX4_LOG_NUM_MTT 20
+/* We limit this to 30 due to a bitmap issue that uses int and not uint;
+ * see mlx4_buddy_init -> bitmap_zero, which takes an int.
+ */
+#define MLX4_MAX_LOG_NUM_MTT 30
+static struct mlx4_profile mod_param_profile = {
+	.num_qp         = 19,
+	.num_srq        = 16,
+	.rdmarc_per_qp  = 4,
+	.num_cq         = 16,
+	.num_mcg        = 13,
+	.num_mpt        = 19,
+	.num_mtt_segs   = 0, /* max(20, 2*MTTs for host memory) */
+};
+
 module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444);
-MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA");
+MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA (default: 19)");
 
 module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444);
-MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA");
+MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA "
+		 "(default: 16)");
 
-module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444);
-MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP");
+module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int,
+		   0444);
+MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP "
+		 "(default: 4)");
 
 module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444);
-MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA");
+MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA (default: 16)");
 
 module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444);
-MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA");
+MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA "
+		 "(default: 13)");
 
 module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444);
 MODULE_PARM_DESC(log_num_mpt,
-		"log maximum number of memory protection table entries per HCA");
+		 "log maximum number of memory protection table entries per "
+		 "HCA (default: 19)");
 
-module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444);
+module_param_named(log_num_mtt, mod_param_profile.num_mtt_segs, int, 0444);
 MODULE_PARM_DESC(log_num_mtt,
-		 "log maximum number of memory translation table segments per HCA");
+		 "log maximum number of memory translation table segments per "
+		 "HCA (default: max(20, 2*MTTs needed to register all of the host memory, limited to 30))");
 
-static int log_mtts_per_seg = 0;
-module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
-MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)");
+enum {
+	MLX4_IF_STATE_BASIC,
+	MLX4_IF_STATE_EXTENDED
+};
 
-static void process_mod_param_profile(void)
+static inline u64 dbdf_to_u64(int domain, int bus, int dev, int fn)
 {
-	default_profile.num_qp = (mod_param_profile.num_qp ?
-				  1 << mod_param_profile.num_qp :
-				  default_profile.num_qp);
-	default_profile.num_srq = (mod_param_profile.num_srq ?
-				  1 << mod_param_profile.num_srq :
-				  default_profile.num_srq);
-	default_profile.rdmarc_per_qp = (mod_param_profile.rdmarc_per_qp ?
-				  1 << mod_param_profile.rdmarc_per_qp :
-				  default_profile.rdmarc_per_qp);
-	default_profile.num_cq = (mod_param_profile.num_cq ?
-				  1 << mod_param_profile.num_cq :
-				  default_profile.num_cq);
-	default_profile.num_mcg = (mod_param_profile.num_mcg ?
-				  1 << mod_param_profile.num_mcg :
-				  default_profile.num_mcg);
-	default_profile.num_mpt = (mod_param_profile.num_mpt ?
-				  1 << mod_param_profile.num_mpt :
-				  default_profile.num_mpt);
-	default_profile.num_mtt = (mod_param_profile.num_mtt ?
-				  1 << mod_param_profile.num_mtt :
-				  default_profile.num_mtt);
+	return (domain << 20) | (bus << 12) | (dev << 4) | fn;
 }
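
dbdf_to_u64() packs domain, bus, device and function into one comparable key: domain from bit 20 up, bus in bits 12-19, device in bits 4-11, function in bits 0-3. For example, 002b:1c:0b.a maps to (0x2b << 20) | (0x1c << 12) | (0x0b << 4) | 0xa = 0x2b1c0ba:

    u64 key = dbdf_to_u64(0x2b, 0x1c, 0x0b, 0xa);  /* 0x2b1c0baULL */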
 
-struct mlx4_port_config
+static inline void pr_bdf_err(const char *dbdf, const char *pname)
 {
-	struct list_head list;
-	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
-	struct pci_dev *pdev;
-};
-static LIST_HEAD(config_list);
+	pr_warn("mlx4_core: '%s' is not valid bdf in '%s'\n", dbdf, pname);
+}
 
-static void mlx4_config_cleanup(void)
+static inline void pr_val_err(const char *dbdf, const char *pname,
+			      const char *val)
 {
-	struct mlx4_port_config *config, *tmp;
+	pr_warn("mlx4_core: value '%s' of bdf '%s' in '%s' is not valid\n"
+		, val, dbdf, pname);
+}
 
-	list_for_each_entry_safe(config, tmp, &config_list, list) {
-		list_del(&config->list);
-		kfree(config);
+static inline void pr_out_of_range_bdf(const char *dbdf, int val,
+				       struct mlx4_dbdf2val_lst *dbdf2val)
+{
+	pr_warn("mlx4_core: value %d in bdf '%s' of '%s' is out of its valid range (%d,%d)\n"
+		, val, dbdf, dbdf2val->name , dbdf2val->range.min,
+		dbdf2val->range.max);
+}
+
+static inline void pr_out_of_range(struct mlx4_dbdf2val_lst *dbdf2val)
+{
+	pr_warn("mlx4_core: value of '%s' is out of its valid range (%d,%d)\n"
+		, dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max);
+}
+
+static inline int is_in_range(int val, struct mlx4_range *r)
+{
+	return (val >= r->min && val <= r->max);
+}
+
+static int update_defaults(struct param_data *pdata)
+{
+	long int val[MLX4_MAX_BDF_VALS];
+	int ret;
+	char *t, *p = pdata->dbdf2val.str;
+	char sval[32];
+	int val_len;
+
+	if (!strlen(p) || strchr(p, ':') || strchr(p, '.') || strchr(p, ';'))
+		return INVALID_STR;
+
+	switch (pdata->id) {
+	case PORT_TYPE_ARRAY:
+		t = strchr(p, ',');
+		if (!t || t == p || (t - p) > sizeof(sval))
+			return INVALID_STR;
+
+		val_len = t - p;
+		strncpy(sval, p, val_len);
+		sval[val_len] = 0;
+
+		ret = kstrtol(sval, 0, &val[0]);
+		if (ret == -EINVAL)
+			return INVALID_STR;
+		if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) {
+			pr_out_of_range(&pdata->dbdf2val);
+			return INVALID_DATA;
+		}
+
+		ret = kstrtol(t + 1, 0, &val[1]);
+		if (ret == -EINVAL)
+			return INVALID_STR;
+		if (ret || !is_in_range(val[1], &pdata->dbdf2val.range)) {
+			pr_out_of_range(&pdata->dbdf2val);
+			return INVALID_DATA;
+		}
+
+		pdata->dbdf2val.tbl[0].val[0] = val[0];
+		pdata->dbdf2val.tbl[0].val[1] = val[1];
+		break;
+
+	case NUM_VFS:
+	case PROBE_VF:
+		ret = kstrtol(p, 0, &val[0]);
+		if (ret == -EINVAL)
+			return INVALID_STR;
+		if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) {
+			pr_out_of_range(&pdata->dbdf2val);
+			return INVALID_DATA;
+		}
+		pdata->dbdf2val.tbl[0].val[0] = val[0];
+		break;
 	}
+	pdata->dbdf2val.tbl[1].dbdf = MLX4_ENDOF_TBL;
+
+	return VALID_DATA;
 }
 
-void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst)
 {
-	return mlx4_find_get_prot_dev(dev, proto, port);
+	int domain, bus, dev, fn;
+	u64 dbdf;
+	char *p, *t, *v;
+	char tmp[32];
+	char sbdf[32];
+	char sep = ',';
+	int j, k, str_size, i = 1;
+	int prfx_size;
+
+	p = dbdf2val_lst->str;
+
+	for (j = 0; j < dbdf2val_lst->num_vals; j++)
+		dbdf2val_lst->tbl[0].val[j] = dbdf2val_lst->def_val[j];
+	dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL;
+
+	str_size = strlen(dbdf2val_lst->str);
+
+	if (str_size == 0)
+		return 0;
+
+	while (strlen(p)) {
+		prfx_size = BDF_STR_SIZE;
+		sbdf[prfx_size] = 0;
+		strncpy(sbdf, p, prfx_size);
+		domain = DEFAULT_DOMAIN;
+		if (sscanf(sbdf, "%02x:%02x.%x-", &bus, &dev, &fn) != 3) {
+			prfx_size = DBDF_STR_SIZE;
+			sbdf[prfx_size] = 0;
+			strncpy(sbdf, p, prfx_size);
+			if (sscanf(sbdf, "%04x:%02x:%02x.%x-", &domain, &bus,
+				   &dev, &fn) != 4) {
+				pr_bdf_err(sbdf, dbdf2val_lst->name);
+				goto err;
+			}
+			sprintf(tmp, "%04x:%02x:%02x.%x-", domain, bus, dev,
+				fn);
+		} else {
+			sprintf(tmp, "%02x:%02x.%x-", bus, dev, fn);
+		}
+
+		if (strnicmp(sbdf, tmp, sizeof(tmp))) {
+			pr_bdf_err(sbdf, dbdf2val_lst->name);
+			goto err;
+		}
+
+		dbdf = dbdf_to_u64(domain, bus, dev, fn);
+
+		for (j = 1; j < i; j++)
+			if (dbdf2val_lst->tbl[j].dbdf == dbdf) {
+				pr_warn("mlx4_core: in '%s', %s appears multiple times\n"
+					, dbdf2val_lst->name, sbdf);
+				goto err;
+			}
+
+		if (i >= MLX4_DEVS_TBL_SIZE) {
+			pr_warn("mlx4_core: Too many devices in '%s'\n"
+				, dbdf2val_lst->name);
+			goto err;
+		}
+
+		p += prfx_size;
+		t = strchr(p, sep);
+		t = t ? t : p + strlen(p);
+		if (p >= t) {
+			pr_val_err(sbdf, dbdf2val_lst->name, "");
+			goto err;
+		}
+
+		for (k = 0; k < dbdf2val_lst->num_vals; k++) {
+			char sval[32];
+			long int val;
+			int ret, val_len;
+			char vsep = ';';
+
+			v = (k == dbdf2val_lst->num_vals - 1) ? t : strchr(p, vsep);
+			if (!v || v > t || v == p || (v - p) > sizeof(sval)) {
+				pr_val_err(sbdf, dbdf2val_lst->name, p);
+				goto err;
+			}
+			val_len = v - p;
+			strncpy(sval, p, val_len);
+			sval[val_len] = 0;
+
+			ret = kstrtol(sval, 0, &val);
+			if (ret) {
+				if (strchr(p, vsep))
+					pr_warn("mlx4_core: too many vals in bdf '%s' of '%s'\n"
+						, sbdf, dbdf2val_lst->name);
+				else
+					pr_val_err(sbdf, dbdf2val_lst->name,
+						   sval);
+				goto err;
+			}
+			if (!is_in_range(val, &dbdf2val_lst->range)) {
+				pr_out_of_range_bdf(sbdf, val, dbdf2val_lst);
+				goto err;
+			}
+
+			dbdf2val_lst->tbl[i].val[k] = val;
+			p = v;
+			if (p[0] == vsep)
+				p++;
+		}
+
+		dbdf2val_lst->tbl[i].dbdf = dbdf;
+		if (strlen(p)) {
+			if (p[0] != sep) {
+				pr_warn("mlx4_core: expect separator '%c' before '%s' in '%s'\n"
+					, sep, p, dbdf2val_lst->name);
+				goto err;
+			}
+			p++;
+		}
+		i++;
+		if (i < MLX4_DEVS_TBL_SIZE)
+			dbdf2val_lst->tbl[i].dbdf = MLX4_ENDOF_TBL;
+	}
+
+	return 0;
+
+err:
+	dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL;
+	pr_warn("mlx4_core: The value of '%s' is incorrect. The value is discarded!\n"
+		, dbdf2val_lst->name);
+
+	return -EINVAL;
 }
-EXPORT_SYMBOL(mlx4_get_prot_dev);
+EXPORT_SYMBOL(mlx4_fill_dbdf2val_tbl);
 
-void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port)
+int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx,
+		 int *val)
 {
-	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 dbdf;
+	int i = 1;
 
-	priv->iboe_counter_index[port - 1] = index;
+	*val = tbl[0].val[idx];
+	if (!pdev)
+		return -EINVAL;
+
+	dbdf = dbdf_to_u64(pci_get_domain(pdev->dev.bsddev),
+			   pci_get_bus(pdev->dev.bsddev),
+			   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+	while ((i < MLX4_DEVS_TBL_SIZE) && (tbl[i].dbdf != MLX4_ENDOF_TBL)) {
+		if (tbl[i].dbdf == dbdf) {
+			*val = tbl[i].val[idx];
+			return 0;
+		}
+		i++;
+	}
+
+	return 0;
 }
-EXPORT_SYMBOL(mlx4_set_iboe_counter);
+EXPORT_SYMBOL(mlx4_get_val);
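
Taken together, mlx4_fill_dbdf2val_tbl() parses a parameter string once at load time and mlx4_get_val() answers per-device lookups, quietly falling back to the defaults in tbl[0] when the device has no entry. A usage sketch, with pdev being the device's struct pci_dev and the warning text invented:

    int nvfs;

    if (mlx4_fill_dbdf2val_tbl(&num_vfs.dbdf2val))
            pr_warn("bad num_vfs string, using defaults\n");

    /* idx picks which of the num_vals values to read for this device */
    mlx4_get_val(num_vfs.dbdf2val.tbl, pdev, 0, &nvfs);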
 
-int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port)
+static void process_mod_param_profile(struct mlx4_profile *profile)
 {
-	struct mlx4_priv *priv = mlx4_priv(dev);
+	vm_size_t hwphyssz = 0;
+
+	TUNABLE_ULONG_FETCH("hw.realmem", (u_long *) &hwphyssz);
 
-	return priv->iboe_counter_index[port - 1];
+	profile->num_qp        = 1 << mod_param_profile.num_qp;
+	profile->num_srq       = 1 << mod_param_profile.num_srq;
+	profile->rdmarc_per_qp = 1 << mod_param_profile.rdmarc_per_qp;
+	profile->num_cq	       = 1 << mod_param_profile.num_cq;
+	profile->num_mcg       = 1 << mod_param_profile.num_mcg;
+	profile->num_mpt       = 1 << mod_param_profile.num_mpt;
+	/*
+	 * We want to scale the number of MTTs with the size of the
+	 * system memory, since it makes sense to register a lot of
+	 * memory on a system with a lot of memory.  As a heuristic,
+	 * make sure we have enough MTTs to register twice the system
+	 * memory (with PAGE_SIZE entries).
+	 *
+	 * This number has to be a power of two and fit into 32 bits
+	 * due to device limitations. We cap this at 2^30 due to a bitmap
+	 * limitation that uses int instead of uint (mlx4_buddy_init -> bitmap_zero).
+	 * That limits us to 4TB of memory registration per HCA with
+	 * 4KB pages, which is probably OK for the next few months.
+	 */
+	if (mod_param_profile.num_mtt_segs)
+		profile->num_mtt_segs = 1 << mod_param_profile.num_mtt_segs;
+	else {
+		profile->num_mtt_segs =
+			roundup_pow_of_two(max_t(unsigned,
+				1 << (MLX4_LOG_NUM_MTT - log_mtts_per_seg),
+				min(1UL << (MLX4_MAX_LOG_NUM_MTT - log_mtts_per_seg),
+				    (hwphyssz << 1) >> log_mtts_per_seg)));
+		/* set the actual value, so it will be reflected to the user
+		   using the sysfs */
+		mod_param_profile.num_mtt_segs = ilog2(profile->num_mtt_segs);
+	}
 }
-EXPORT_SYMBOL(mlx4_get_iboe_counter);
 
 int mlx4_check_port_params(struct mlx4_dev *dev,
 			   enum mlx4_port_type *port_type)
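
Reading the auto-sizing branch above with the default log_mtts_per_seg of 0: num_mtt_segs becomes roundup_pow_of_two(max(2^20, min(2^30, 2 * hw.realmem))). As this code reads, hw.realmem is fetched in bytes, so any machine with at least 512 MB of real memory saturates the min() at the 2^30 cap and only very small systems fall back to the 2^20 floor; the chosen log2 is then written back into mod_param_profile so the value reported via sysfs is the effective one. In sketch form:

    /* effective computation for log_mtts_per_seg == 0, as this code reads */
    u64 auto_segs = roundup_pow_of_two(
            max(1ULL << 20, min(1ULL << 30, 2 * hw_realmem_bytes)));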
@@ -209,9 +571,6 @@
 					 "on this HCA, aborting.\n");
 				return -EINVAL;
 			}
-			if (port_type[i] == MLX4_PORT_TYPE_ETH &&
-			    port_type[i + 1] == MLX4_PORT_TYPE_IB)
-				return -EINVAL;
 		}
 	}
 
@@ -233,19 +592,6 @@
 		dev->caps.port_mask[i] = dev->caps.port_type[i];
 }
 
-static u8 get_counters_mode(u64 flags)
-{
-	switch (flags >> 48 & 3) {
-	case 2:
-	case 3:
-		return MLX4_CUNTERS_EXT;
-	case 1:
-		return MLX4_CUNTERS_BASIC;
-	default:
-		return MLX4_CUNTERS_DISABLED;
-	}
-}
-
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -260,7 +606,7 @@
 	if (dev_cap->min_page_sz > PAGE_SIZE) {
 		mlx4_err(dev, "HCA minimum page size of %d bigger than "
 			 "kernel PAGE_SIZE of %d, aborting.\n",
-			 dev_cap->min_page_sz, PAGE_SIZE);
+			 dev_cap->min_page_sz, (int)PAGE_SIZE);
 		return -ENODEV;
 	}
 	if (dev_cap->num_ports > MLX4_MAX_PORTS) {
@@ -279,9 +625,14 @@
 	}
 
 	dev->caps.num_ports	     = dev_cap->num_ports;
+	dev->phys_caps.num_phys_eqs  = MLX4_MAX_EQ_NUM;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
 		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
 		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
+		dev->phys_caps.gid_phys_table_len[i]  = dev_cap->max_gids[i];
+		dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i];
+		/* set gid and pkey table operating lengths by default
+		 * to non-sriov values */
 		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
 		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
 		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
@@ -288,6 +639,8 @@
 		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
 		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
 		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
+		dev->caps.suggested_type[i] = dev_cap->suggested_type[i];
+		dev->caps.default_sense[i] = dev_cap->default_sense[i];
 		dev->caps.trans_type[i]	    = dev_cap->trans_type[i];
 		dev->caps.vendor_oui[i]     = dev_cap->vendor_oui[i];
 		dev->caps.wavelength[i]     = dev_cap->wavelength[i];
@@ -294,6 +647,7 @@
 		dev->caps.trans_code[i]     = dev_cap->trans_code[i];
 	}
 
+	dev->caps.uar_page_size	     = PAGE_SIZE;
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
 	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
 	dev->caps.bf_reg_size	     = dev_cap->bf_reg_size;
@@ -307,52 +661,112 @@
 	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
 	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
 	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
-	dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
 	/*
 	 * Subtract 1 from the limit because we need to allocate a
-	 * spare CQE so the HCA HW can tell the difference between an
-	 * empty CQ and a full CQ.
+	 * spare CQE to enable resizing the CQ
 	 */
 	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
 	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
 	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
-	dev->caps.mtts_per_seg	     = 1 << log_mtts_per_seg;
-	dev->caps.reserved_mtts	     = DIV_ROUND_UP(dev_cap->reserved_mtts,
-						    dev->caps.mtts_per_seg);
+	dev->caps.reserved_mtts      = dev_cap->reserved_mtts;
 	dev->caps.reserved_mrws	     = dev_cap->reserved_mrws;
-	dev->caps.reserved_uars	     = dev_cap->reserved_uars;
+
+	/* The first 128 UARs are used for EQ doorbells */
+	dev->caps.reserved_uars	     = max_t(int, 128, dev_cap->reserved_uars);
 	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
-	dev->caps.mtt_entry_sz	     = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
+	dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->reserved_xrcds : 0;
+	dev->caps.max_xrcds          = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->max_xrcds : 0;
+	dev->caps.mtt_entry_sz       = dev_cap->mtt_entry_sz;
+
 	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
 	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
 	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.flags2	     = dev_cap->flags2;
 	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
 	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
-	dev->caps.udp_rss	     = dev_cap->udp_rss;
-	dev->caps.loopback_support   = dev_cap->loopback_support;
-	dev->caps.wol		     = dev_cap->wol;
+	dev->caps.cq_timestamp       = dev_cap->timestamp_support;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
-	dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
-		dev_cap->reserved_xrcds : 0;
-	dev->caps.max_xrcds	     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
-		dev_cap->max_xrcds : 0;
+	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
+	/* Sense port always allowed on supported devices for ConnectX-1 and -2 */
+	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
+		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+	/* Don't do sense port on multifunction devices (for now at least) */
+	if (mlx4_is_mfunc(dev))
+		dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+
 	dev->caps.log_num_macs  = log_num_mac;
-	dev->caps.log_num_prios = use_prio ? 3 : 0;
+	dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS;
 
+	dev->caps.fast_drop	= fast_drop ?
+				  !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FAST_DROP) :
+				  0;
+
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
 		dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE;
 		if (dev->caps.supported_type[i]) {
-			if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH)
+			/* if only ETH is supported - assign ETH */
+			if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+			/* if only IB is supported, assign IB */
+			else if (dev->caps.supported_type[i] ==
+				 MLX4_PORT_TYPE_IB)
 				dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
-			else
-				dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+			else {
+				/*
+				 * if IB and ETH are supported, we set the port
+				 * type according to user selection of port type;
+				 * if there is no user selection, take the FW hint
+				 */
+				int pta;
+				mlx4_get_val(port_type_array.dbdf2val.tbl,
+					     pci_physfn(dev->pdev), i - 1,
+					     &pta);
+				if (pta == MLX4_PORT_TYPE_NONE) {
+					dev->caps.port_type[i] = dev->caps.suggested_type[i] ?
+						MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB;
+				} else if (pta == MLX4_PORT_TYPE_NA) {
+					mlx4_err(dev, "Port %d is a valid port. "
+						 "It is not allowed to configure its type to N/A(%d)\n",
+						 i, MLX4_PORT_TYPE_NA);
+					return -EINVAL;
+				} else {
+					dev->caps.port_type[i] = pta;
+				}
+			}
 		}
-		dev->caps.possible_type[i] = dev->caps.port_type[i];
+		/*
+		 * Link sensing is allowed on the port if 3 conditions are true:
+		 * 1. Both protocols are supported on the port.
+		 * 2. Different types are supported on the port
+		 * 3. FW declared that it supports link sensing
+		 */
 		mlx4_priv(dev)->sense.sense_allowed[i] =
-			dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO;
+			((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT));
 
+		/* Disabling auto sense for default Eth ports support */
+		mlx4_priv(dev)->sense.sense_allowed[i] = 0;
+
+		/*
+		 * If "default_sense" bit is set, we move the port to "AUTO" mode
+		 * and perform sense_port FW command to try and set the correct
+		 * port type from beginning
+		 */
+		if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) {
+			enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE;
+			dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO;
+			mlx4_SENSE_PORT(dev, i, &sensed_port);
+			if (sensed_port != MLX4_PORT_TYPE_NONE)
+				dev->caps.port_type[i] = sensed_port;
+		} else {
+			dev->caps.possible_type[i] = dev->caps.port_type[i];
+		}
+
 		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
 			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
 			mlx4_warn(dev, "Requested number of MACs is too much "
@@ -359,12 +773,21 @@
 				  "for port %d, reducing to %d.\n",
 				  i, 1 << dev->caps.log_num_macs);
 		}
-		dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
+			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+			mlx4_warn(dev, "Requested number of VLANs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_vlans);
+		}
 	}
 
-	dev->caps.counters_mode = get_counters_mode(dev_cap->flags);
-	dev->caps.max_basic_counters = 1 << ilog2(dev_cap->max_basic_counters);
-	dev->caps.max_ext_counters = 1 << ilog2(dev_cap->max_ext_counters);
+	dev->caps.max_basic_counters = dev_cap->max_basic_counters;
+	dev->caps.max_extended_counters = dev_cap->max_extended_counters;
+	/* support extended counters if available */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT)
+		dev->caps.max_counters = dev->caps.max_extended_counters;
+	else
+		dev->caps.max_counters = dev->caps.max_basic_counters;
 
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
@@ -371,42 +794,336 @@
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
 		(1 << dev->caps.log_num_macs) *
 		(1 << dev->caps.log_num_vlans) *
-		(1 << dev->caps.log_num_prios) *
 		dev->caps.num_ports;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
-		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR];
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
 
+	dev->caps.sync_qp = dev_cap->sync_qp;
+	if (dev->pdev->device == 0x1003)
+		dev->caps.cq_flags |= MLX4_DEV_CAP_CQ_FLAG_IO;
+
+	dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0;
+
+	if (!mlx4_enable_64b_cqe_eqe && !mlx4_is_slave(dev)) {
+		if (dev_cap->flags &
+		    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) {
+			mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n");
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
+		}
+	}
+
+	if ((dev->caps.flags &
+	    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) &&
+	    mlx4_is_master(dev))
+		dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
+
+	if (!mlx4_is_slave(dev)) {
+		for (i = 0; i < dev->caps.num_ports; ++i)
+			dev->caps.def_counter_index[i] = i << 1;
+	}
+
 	return 0;
 }
-
-static int mlx4_save_config(struct mlx4_dev *dev)
+/* The function checks whether there are live VFs and returns how many. */
+static int mlx4_how_many_lives_vf(struct mlx4_dev *dev)
 {
-	struct mlx4_port_config *config;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
 	int i;
+	int ret = 0;
 
-	list_for_each_entry(config, &config_list, list) {
-		if (config->pdev == dev->pdev) {
-			for (i = 1; i <= dev->caps.num_ports; i++)
-				config->port_type[i] = dev->caps.possible_type[i];
-			return 0;
+	for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) {
+		s_state = &priv->mfunc.master.slave_state[i];
+		if (s_state->active && s_state->last_cmd !=
+		    MLX4_COMM_CMD_RESET) {
+			mlx4_warn(dev, "%s: slave: %d is still active\n",
+				  __func__, i);
+			ret++;
 		}
 	}
+	return ret;
+}
 
-	config = kmalloc(sizeof(struct mlx4_port_config), GFP_KERNEL);
-	if (!config)
-		return -ENOMEM;
+int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey)
+{
+	u32 qk = MLX4_RESERVED_QKEY_BASE;
 
-	config->pdev = dev->pdev;
-	for (i = 1; i <= dev->caps.num_ports; i++)
-		config->port_type[i] = dev->caps.possible_type[i];
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX ||
+	    qpn < dev->phys_caps.base_proxy_sqpn)
+		return -EINVAL;
 
-	list_add_tail(&config->list, &config_list);
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn)
+		/* tunnel qp */
+		qk += qpn - dev->phys_caps.base_tunnel_sqpn;
+	else
+		qk += qpn - dev->phys_caps.base_proxy_sqpn;
+	*qkey = qk;
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_parav_qkey);
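
mlx4_get_parav_qkey() only accepts QPNs inside the proxy/tunnel special-QP window and maps both sub-ranges onto the same qkey window above MLX4_RESERVED_QKEY_BASE. As written, a tunnel QP therefore shares the qkey of the proxy QP at the same offset: with base_proxy_sqpn = P and base_tunnel_sqpn = P + 8*MLX4_MFUNC_MAX, both qpn P+3 and qpn base_tunnel_sqpn+3 yield MLX4_RESERVED_QKEY_BASE + 3 (the values here are illustrative).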
 
+void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->virt2phys_pkey[slave][port - 1][i] = val;
+}
+EXPORT_SYMBOL(mlx4_sync_pkey_table);
+
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->slave_node_guids[slave] = guid;
+}
+EXPORT_SYMBOL(mlx4_put_slave_node_guid);
+
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	return priv->slave_node_guids[slave];
+}
+EXPORT_SYMBOL(mlx4_get_slave_node_guid);
+
+int mlx4_is_slave_active(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave;
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	s_slave = &priv->mfunc.master.slave_state[slave];
+	return !!s_slave->active;
+}
+EXPORT_SYMBOL(mlx4_is_slave_active);
+
+static void slave_adjust_steering_mode(struct mlx4_dev *dev,
+				       struct mlx4_dev_cap *dev_cap,
+				       struct mlx4_init_hca_param *hca_param)
+{
+	dev->caps.steering_mode = hca_param->steering_mode;
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED)
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+	else
+		dev->caps.num_qp_per_mgm =
+			4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2);
+
+	mlx4_dbg(dev, "Steering mode is: %s\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode));
+}
+
+static int mlx4_slave_cap(struct mlx4_dev *dev)
+{
+	int			   err;
+	u32			   page_size;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_func_cap	   func_cap;
+	struct mlx4_init_hca_param hca_param;
+	int			   i;
+
+	memset(&hca_param, 0, sizeof(hca_param));
+	err = mlx4_QUERY_HCA(dev, &hca_param);
+	if (err) {
+		mlx4_err(dev, "QUERY_HCA command failed, aborting.\n");
+		return err;
+	}
+
+	/*fail if the hca has an unknown capability */
+	if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) !=
+	    HCA_GLOBAL_CAP_MASK) {
+		mlx4_err(dev, "Unknown hca global capabilities\n");
+		return -ENOSYS;
+	}
+
+	mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz;
+
+	dev->caps.hca_core_clock = hca_param.hca_core_clock;
+
+	memset(&dev_cap, 0, sizeof(dev_cap));
+	dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp;
+	err = mlx4_dev_cap(dev, &dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_QUERY_FW(dev);
+	if (err)
+		mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n");
+
+	if (!hca_param.mw_enable) {
+		dev->caps.flags      &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW;
+		dev->caps.bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN;
+	}
+
+	page_size = ~dev->caps.page_size_cap + 1;
+	mlx4_warn(dev, "HCA minimum page size:%d\n", page_size);
+	if (page_size > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than "
+			 "kernel PAGE_SIZE of %d, aborting.\n",
+			 page_size, (int)PAGE_SIZE);
+		return -ENODEV;
+	}
+
+	/* slave gets uar page size from QUERY_HCA fw command */
+	dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12);
+
+	/* TODO: relax this assumption */
+	if (dev->caps.uar_page_size != PAGE_SIZE) {
+		mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %d\n",
+			 dev->caps.uar_page_size, (int)PAGE_SIZE);
+		return -ENODEV;
+	}
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d).\n",
+			  err);
+		return err;
+	}
+
+	if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
+	    PF_CONTEXT_BEHAVIOUR_MASK) {
+		mlx4_err(dev, "Unknown pf context behaviour\n");
+		return -ENOSYS;
+	}
+
+	dev->caps.num_ports		= func_cap.num_ports;
+	dev->quotas.qp			= func_cap.qp_quota;
+	dev->quotas.srq			= func_cap.srq_quota;
+	dev->quotas.cq			= func_cap.cq_quota;
+	dev->quotas.mpt			= func_cap.mpt_quota;
+	dev->quotas.mtt			= func_cap.mtt_quota;
+	dev->caps.num_qps		= 1 << hca_param.log_num_qps;
+	dev->caps.num_srqs		= 1 << hca_param.log_num_srqs;
+	dev->caps.num_cqs		= 1 << hca_param.log_num_cqs;
+	dev->caps.num_mpts		= 1 << hca_param.log_mpt_sz;
+	dev->caps.num_eqs		= func_cap.max_eq;
+	dev->caps.reserved_eqs		= func_cap.reserved_eq;
+	dev->caps.num_pds               = MLX4_NUM_PDS;
+	dev->caps.num_mgms              = 0;
+	dev->caps.num_amgms             = 0;
+
+	if (dev->caps.num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, "
+			 "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+
+	if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
+	    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) {
+		err = -ENOMEM;
+		goto err_mem;
+	}
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_FUNC_CAP port command failed for"
+				 " port %d, aborting (%d).\n", i, err);
+			goto err_mem;
+		}
+		dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn;
+		dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn;
+		dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn;
+		dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn;
+		dev->caps.def_counter_index[i - 1] = func_cap.def_counter_index;
+
+		dev->caps.port_mask[i] = dev->caps.port_type[i];
+		err = mlx4_get_slave_pkey_gid_tbl_len(dev, i,
+						      &dev->caps.gid_table_len[i],
+						      &dev->caps.pkey_table_len[i]);
+		if (err)
+			goto err_mem;
+	}
+
+	if (dev->caps.uar_page_size * (dev->caps.num_uars -
+				       dev->caps.reserved_uars) >
+				       pci_resource_len(dev->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than "
+			 "PCI resource 2 size of 0x%llx, aborting.\n",
+			 dev->caps.uar_page_size * dev->caps.num_uars,
+			 (unsigned long long) pci_resource_len(dev->pdev, 2));
+		err = -ENOMEM;
+		goto err_mem;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) {
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) {
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+	mlx4_warn(dev, "Timestamping is not supported in slave mode.\n");
+
+	slave_adjust_steering_mode(dev, &dev_cap, &hca_param);
+
 	return 0;
+
+err_mem:
+	kfree(dev->caps.qp0_tunnel);
+	kfree(dev->caps.qp0_proxy);
+	kfree(dev->caps.qp1_tunnel);
+	kfree(dev->caps.qp1_proxy);
+	dev->caps.qp0_tunnel = dev->caps.qp0_proxy =
+		dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL;
+
+	return err;
 }
 
+static void mlx4_request_modules(struct mlx4_dev *dev)
+{
+	int port;
+	int has_ib_port = false;
+	int has_eth_port = false;
+#define EN_DRV_NAME	"mlx4_en"
+#define IB_DRV_NAME	"mlx4_ib"
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB)
+			has_ib_port = true;
+		else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+			has_eth_port = true;
+	}
+
+	if (has_ib_port)
+		request_module_nowait(IB_DRV_NAME);
+	if (has_eth_port)
+		request_module_nowait(EN_DRV_NAME);
+}
+
 /*
  * Change the port configuration of the device.
  * Every user of this function must hold the port mutex.
@@ -421,16 +1138,15 @@
 	for (port = 0; port <  dev->caps.num_ports; port++) {
 		/* Change the port type only if the new type is different
 		 * from the current, and not set to Auto */
-		if (port_types[port] != dev->caps.port_type[port + 1]) {
+		if (port_types[port] != dev->caps.port_type[port + 1])
 			change = 1;
-			dev->caps.port_type[port + 1] = port_types[port];
-		}
 	}
 	if (change) {
 		mlx4_unregister_device(dev);
 		for (port = 1; port <= dev->caps.num_ports; port++) {
 			mlx4_CLOSE_PORT(dev, port);
-			err = mlx4_SET_PORT(dev, port);
+			dev->caps.port_type[port] = port_types[port - 1];
+			err = mlx4_SET_PORT(dev, port, -1);
 			if (err) {
 				mlx4_err(dev, "Failed to set port %d, "
 					      "aborting\n", port);
@@ -438,8 +1154,12 @@
 			}
 		}
 		mlx4_set_port_mask(dev);
-		mlx4_save_config(dev);
 		err = mlx4_register_device(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to register device\n");
+			goto out;
+		}
+		mlx4_request_modules(dev);
 	}
 
 out:
@@ -490,6 +1210,13 @@
 		return -EINVAL;
 	}
 
+	if ((info->tmp_type & mdev->caps.supported_type[info->port]) !=
+	    info->tmp_type) {
+		mlx4_err(mdev, "Requested port type for port %d is not supported on this HCA\n",
+			 info->port);
+		return -EINVAL;
+	}
+
 	mlx4_stop_sense(mdev);
 	mutex_lock(&priv->port_mutex);
 	/* Possible type is always the one that was delivered */
@@ -502,14 +1229,8 @@
 			types[i] = mdev->caps.port_type[i+1];
 	}
 
-	if (priv->trig) {
-		if (++priv->changed_ports < mdev->caps.num_ports)
-			goto out;
-		else
-			priv->trig = priv->changed_ports = 0;
-	}
-
-	if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+	if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+	    !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) {
 		for (i = 1; i <= mdev->caps.num_ports; i++) {
 			if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
 				mdev->caps.possible_type[i] = mdev->caps.port_type[i];
@@ -544,27 +1265,140 @@
 	return err ? err : count;
 }
 
-static ssize_t trigger_port(struct device *dev, struct device_attribute *attr,
-			    const char *buf, size_t count)
+enum ibta_mtu {
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int int_to_ibta_mtu(int mtu)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct mlx4_dev *mdev = pci_get_drvdata(pdev);
-	struct mlx4_priv *priv = container_of(mdev, struct mlx4_priv, dev);
+	switch (mtu) {
+	case 256:  return IB_MTU_256;
+	case 512:  return IB_MTU_512;
+	case 1024: return IB_MTU_1024;
+	case 2048: return IB_MTU_2048;
+	case 4096: return IB_MTU_4096;
+	default: return -1;
+	}
+}
 
-	if (!priv)
-		return -ENODEV;
+static inline int ibta_mtu_to_int(enum ibta_mtu mtu)
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: return -1;
+	}
+}
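
The two helpers above translate between byte MTUs and the IBTA encoding, where the encoded value is log2(mtu/128), so 256 -> 1 through 4096 -> 5 and anything else fails closed. A quick illustrative round trip:

    int enc   = int_to_ibta_mtu(2048);   /* IB_MTU_2048 (4) */
    int bytes = ibta_mtu_to_int(enc);    /* 2048 */
    int bad   = int_to_ibta_mtu(1500);   /* -1: not an IBTA MTU */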
 
+static ssize_t
+show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info,
+						   board_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
+		       mdev->board_id);
+}
+
+static ssize_t
+show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info,
+						   hca_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	return sprintf(buf, "MT%d\n", mdev->pdev->device);
+}
+
+static ssize_t
+show_firmware_version(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info,
+						   firmware_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	return sprintf(buf, "%d.%d.%d\n", (int)(mdev->caps.fw_ver >> 32),
+		       (int)(mdev->caps.fw_ver >> 16) & 0xffff,
+		       (int)mdev->caps.fw_ver & 0xffff);
+}
+
+static ssize_t show_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	/* When port type is eth, port mtu value isn't used. */
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH)
+		return -EINVAL;
+
+	sprintf(buf, "%d\n",
+			ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port]));
+	return strlen(buf);
+}
+
+static ssize_t set_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	int err, port, mtu, ibta_mtu = -1;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) {
+		mlx4_warn(mdev, "port level mtu is only used for IB ports\n");
+		return -EINVAL;
+	}
+
+	mtu = (int) simple_strtol(buf, NULL, 0);
+	ibta_mtu = int_to_ibta_mtu(mtu);
+
+	if (ibta_mtu < 0) {
+		mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf);
+		return -EINVAL;
+	}
+
+	mdev->caps.port_ib_mtu[info->port] = ibta_mtu;
+
+	mlx4_stop_sense(mdev);
 	mutex_lock(&priv->port_mutex);
-	priv->trig = 1;
+	mlx4_unregister_device(mdev);
+	for (port = 1; port <= mdev->caps.num_ports; port++) {
+		mlx4_CLOSE_PORT(mdev, port);
+		err = mlx4_SET_PORT(mdev, port, -1);
+		if (err) {
+			mlx4_err(mdev, "Failed to set port %d, "
+				      "aborting\n", port);
+			goto err_set_port;
+		}
+	}
+	err = mlx4_register_device(mdev);
+err_set_port:
 	mutex_unlock(&priv->port_mutex);
-	return count;
+	mlx4_start_sense(mdev);
+	return err ? err : count;
 }
-DEVICE_ATTR(port_trigger, S_IWUGO, NULL, trigger_port);
 
 static int mlx4_load_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	int err;
+	int err, unmap_flag = 0;
 
 	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
 					 GFP_HIGHUSER | __GFP_NOWARN, 0);
@@ -588,10 +1422,13 @@
 	return 0;
 
 err_unmap_fa:
-	mlx4_UNMAP_FA(dev);
+	unmap_flag = mlx4_UNMAP_FA(dev);
+	if (unmap_flag)
+		pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n");
 
 err_free:
-	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+	if (!unmap_flag)
+		mlx4_free_icm(dev, priv->fw.fw_icm, 0);
 	return err;
 }
 
@@ -600,6 +1437,7 @@
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
+	int num_eqs;
 
 	err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
 				  cmpt_base +
@@ -629,12 +1467,13 @@
 	if (err)
 		goto err_srq;
 
+	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
+		  dev->caps.num_eqs;
 	err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
 				  cmpt_base +
 				  ((u64) (MLX4_CMPT_TYPE_EQ *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
-				  cmpt_entry_sz,
-				  dev->caps.num_eqs, dev->caps.num_eqs, 0, 0);
+				  cmpt_entry_sz, num_eqs, num_eqs, 0, 0);
 	if (err)
 		goto err_cq;
 
@@ -658,7 +1497,8 @@
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	u64 aux_pages;
-	int err;
+	int num_eqs;
+	int err, unmap_flag = 0;
 
 	err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
 	if (err) {
@@ -689,10 +1529,12 @@
 		goto err_unmap_aux;
 	}
 
+
+	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
+		   dev->caps.num_eqs;
 	err = mlx4_init_icm_table(dev, &priv->eq_table.table,
 				  init_hca->eqc_base, dev_cap->eqc_entry_sz,
-				  dev->caps.num_eqs, dev->caps.num_eqs,
-				  0, 0);
+				  num_eqs, num_eqs, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map EQ context memory, aborting.\n");
 		goto err_unmap_cmpt;
@@ -712,7 +1554,7 @@
 	err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
 				  init_hca->mtt_base,
 				  dev->caps.mtt_entry_sz,
-				  dev->caps.num_mtt_segs,
+				  dev->caps.num_mtts,
 				  dev->caps.reserved_mtts, 1, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
@@ -794,12 +1636,15 @@
 	}
 
 	/*
-	 * It's not strictly required, but for simplicity just map the
-	 * whole multicast group table now.  The table isn't very big
-	 * and it's a lot easier than trying to track ref counts.
+	 * For flow steering device managed mode it is required to use
+	 * mlx4_init_icm_table. For B0 steering mode it's not strictly
+	 * required, but for simplicity just map the whole multicast
+	 * group table now.  The table isn't very big and it's a lot
+	 * easier than trying to track ref counts.
 	 */
 	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
-				  init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
+				  init_hca->mc_base,
+				  mlx4_get_mgm_entry_size(dev),
 				  dev->caps.num_mgms + dev->caps.num_amgms,
 				  dev->caps.num_mgms + dev->caps.num_amgms,
 				  0, 0);
@@ -844,10 +1689,13 @@
 	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
 
 err_unmap_aux:
-	mlx4_UNMAP_ICM_AUX(dev);
+	unmap_flag = mlx4_UNMAP_ICM_AUX(dev);
+	if (unmap_flag)
+		pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n");
 
 err_free_aux:
-	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+	if (!unmap_flag)
+		mlx4_free_icm(dev, priv->fw.aux_icm, 0);
 
 	return err;
 }
@@ -871,10 +1719,22 @@
 	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
 	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
 
-	mlx4_UNMAP_ICM_AUX(dev);
-	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+	if (!mlx4_UNMAP_ICM_AUX(dev))
+		mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+	else
+		pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n");
 }
 
+static void mlx4_slave_exit(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME))
+		mlx4_warn(dev, "Failed to close slave function.\n");
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+}
+
 static int map_bf_area(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -882,8 +1742,13 @@
 	resource_size_t bf_len;
 	int err = 0;
 
-	bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT);
-	bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT);
+	if (!dev->caps.bf_reg_size)
+		return -ENXIO;
+
+	bf_start = pci_resource_start(dev->pdev, 2) +
+			(dev->caps.num_uars << PAGE_SHIFT);
+	bf_len = pci_resource_len(dev->pdev, 2) -
+			(dev->caps.num_uars << PAGE_SHIFT);
 	priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
 	if (!priv->bf_mapping)
 		err = -ENOMEM;
@@ -897,112 +1762,406 @@
 		io_mapping_free(mlx4_priv(dev)->bf_mapping);
 }
 
+int mlx4_read_clock(struct mlx4_dev *dev)
+{
+	u32 clockhi, clocklo, clockhi1;
+	cycle_t cycles;
+	int i;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!priv->clock_mapping)
+		return -ENOTSUPP;
+
+	for (i = 0; i < 10; i++) {
+		clockhi = swab32(readl(priv->clock_mapping));
+		clocklo = swab32(readl(priv->clock_mapping + 4));
+		clockhi1 = swab32(readl(priv->clock_mapping));
+		if (clockhi == clockhi1)
+			break;
+	}
+
+	cycles = (u64) clockhi << 32 | (u64) clocklo;
+
+	return cycles;
+}
+EXPORT_SYMBOL_GPL(mlx4_read_clock);
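
mlx4_read_clock() assembles a 64-bit free-running counter from two 32-bit MMIO words; re-reading the high word and retrying until it matches guards against a low-to-high carry landing between the two reads. (Note the merged function computes 64-bit cycles but returns int, so callers currently only see the low bits.) The hi/lo/hi idiom in a self-contained sketch, with a hypothetical register pointer where the driver itself goes through readl()/swab32():

    static u64 read_clock_stable(const volatile u32 *clk)
    {
            u32 hi, lo, hi2;
            int i;

            for (i = 0; i < 10; i++) {
                    hi  = clk[0];
                    lo  = clk[1];
                    hi2 = clk[0];
                    if (hi == hi2)      /* no carry between the reads */
                            break;
            }
            return ((u64)hi << 32) | lo;
    }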
+
+
+static int map_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clock_mapping = ioremap(pci_resource_start(dev->pdev,
+				priv->fw.clock_bar) +
+				priv->fw.clock_offset, MLX4_CLOCK_SIZE);
+
+	if (!priv->clock_mapping)
+		return -ENOMEM;
+
+	return 0;
+}
+
+
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+				   struct mlx4_clock_params *params)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (mlx4_is_slave(dev))
+		return -ENOTSUPP;
+	if (!params)
+		return -EINVAL;
+
+	params->bar = priv->fw.clock_bar;
+	params->offset = priv->fw.clock_offset;
+	params->size = MLX4_CLOCK_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
+static void unmap_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (priv->clock_mapping)
+		iounmap(priv->clock_mapping);
+}
+
 static void mlx4_close_hca(struct mlx4_dev *dev)
 {
+	unmap_internal_clock(dev);
 	unmap_bf_area(dev);
-	mlx4_CLOSE_HCA(dev, 0);
-	mlx4_free_icms(dev);
-	mlx4_UNMAP_FA(dev);
-	mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
+	if (mlx4_is_slave(dev)) {
+		mlx4_slave_exit(dev);
+	} else {
+		mlx4_CLOSE_HCA(dev, 0);
+		mlx4_free_icms(dev);
+
+		if (!mlx4_UNMAP_FA(dev))
+			 mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
+		else
+			pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n");
+	}
 }
 
+static int mlx4_init_slave(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 dma = (u64) priv->mfunc.vhcr_dma;
+	int num_of_reset_retries = NUM_OF_RESET_RETRIES;
+	int ret_from_reset = 0;
+	u32 slave_read;
+	u32 cmd_channel_ver;
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	priv->cmd.max_cmds = 1;
+	mlx4_warn(dev, "Sending reset\n");
+	ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
+				       MLX4_COMM_TIME);
+	/* if we are in the middle of flr the slave will try
+	 * NUM_OF_RESET_RETRIES times before leaving.*/
+	if (ret_from_reset) {
+		if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) {
+			msleep(SLEEP_TIME_IN_RESET);
+			while (ret_from_reset && num_of_reset_retries) {
+				mlx4_warn(dev, "slave is currently in the "
+					  "middle of FLR. retrying... "
+					  "(try num:%d)\n",
+					  (NUM_OF_RESET_RETRIES -
+					   num_of_reset_retries  + 1));
+				ret_from_reset =
+					mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET,
+						      0, MLX4_COMM_TIME);
+				num_of_reset_retries = num_of_reset_retries - 1;
+			}
+		} else
+			goto err;
+	}
+
+	/* check the driver version - the slave I/F revision
+	 * must match the master's */
+	slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
+	cmd_channel_ver = mlx4_comm_get_version();
+
+	if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) !=
+		MLX4_COMM_GET_IF_REV(slave_read)) {
+		mlx4_err(dev, "slave driver version is not supported"
+			 " by the master\n");
+		goto err;
+	}
+
+	mlx4_warn(dev, "Sending vhcr0\n");
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME))
+		goto err;
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return 0;
+
+err:
+	mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0);
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return -EIO;
+}
+
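
mlx4_init_slave above hands the 64-bit VHCR DMA address to the master
sixteen bits at a time -- VHCR0 carries bits 63:48, VHCR1 bits 47:32, VHCR2
bits 31:16, and VHCR_EN the low word -- because each comm-channel command
carries only a 16-bit parameter. A self-contained sketch of the split and
the master-side reassembly (helper names are ours, not the driver's):

#include <assert.h>
#include <stdint.h>

static void split_vhcr_dma(uint64_t dma, uint16_t chunk[4])
{
	chunk[0] = (uint16_t)(dma >> 48);	/* MLX4_COMM_CMD_VHCR0 */
	chunk[1] = (uint16_t)(dma >> 32);	/* MLX4_COMM_CMD_VHCR1 */
	chunk[2] = (uint16_t)(dma >> 16);	/* MLX4_COMM_CMD_VHCR2 */
	chunk[3] = (uint16_t)dma;		/* MLX4_COMM_CMD_VHCR_EN */
}

static uint64_t join_vhcr_dma(const uint16_t chunk[4])
{
	return ((uint64_t)chunk[0] << 48) | ((uint64_t)chunk[1] << 32) |
	       ((uint64_t)chunk[2] << 16) | chunk[3];
}

int main(void)
{
	uint16_t c[4];

	split_vhcr_dma(0x123456789abcdef0ULL, c);
	assert(join_vhcr_dma(c) == 0x123456789abcdef0ULL);
	return 0;
}

Sending VHCR_EN last also doubles as the "address complete, enable it"
signal, which is why the low chunk rides on that command.
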
+static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev)
+{
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH)
+			dev->caps.gid_table_len[i] =
+				mlx4_get_slave_num_gids(dev, 0);
+		else
+			dev->caps.gid_table_len[i] = 1;
+		dev->caps.pkey_table_len[i] =
+			dev->phys_caps.pkey_phys_table_len[i] - 1;
+	}
+}
+
+static int choose_log_fs_mgm_entry_size(int qp_per_entry)
+{
+	int i;
+
+	for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE;
+	      i++) {
+		if (qp_per_entry <= 4 * ((1 << i) / 16 - 2))
+			break;
+	}
+
+	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
+}
+
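
choose_log_fs_mgm_entry_size above searches for the smallest entry size that
fits the requested QP count: a 2^i-byte MGM entry is built of 16-byte units,
two of which are taken by the header, and each remaining unit holds four
QPNs -- hence the 4 * ((1 << i) / 16 - 2) capacity. A small worked example
(the 7..12 bounds match the MIN/MAX constants in the Linux driver, restated
here as plain numbers):

#include <stdio.h>

/* Capacity of a 2^log_size-byte MGM entry: 16-byte units, two taken
 * by the header, four QPNs per remaining unit. */
static int mgm_qp_capacity(int log_size)
{
	return 4 * ((1 << log_size) / 16 - 2);
}

int main(void)
{
	int i;

	for (i = 7; i <= 12; i++)
		printf("log size %2d -> %4d-byte entry -> %4d QPs\n",
		       i, 1 << i, mgm_qp_capacity(i));
	return 0;
}

For instance, i = 7 gives a 128-byte entry holding 24 QPs, and i = 12 a
4096-byte entry holding 1016.
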
+static void choose_steering_mode(struct mlx4_dev *dev,
+				 struct mlx4_dev_cap *dev_cap)
+{
+	int nvfs;
+
+	mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(dev->pdev), 0, &nvfs);
+	if (high_rate_steer && !mlx4_is_mfunc(dev)) {
+		dev->caps.flags &= ~(MLX4_DEV_CAP_FLAG_VEP_MC_STEER |
+				     MLX4_DEV_CAP_FLAG_VEP_UC_STEER);
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_FS_EN;
+	}
+
+	if (mlx4_log_num_mgm_entry_size == -1 &&
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
+	    (!mlx4_is_mfunc(dev) ||
+	     (dev_cap->fs_max_num_qp_per_entry >= (nvfs + 1))) &&
+	    choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >=
+		MLX4_MIN_MGM_LOG_ENTRY_SIZE) {
+		dev->oper_log_mgm_entry_size =
+			choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry);
+		dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+	} else {
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
+		else {
+			dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
+
+			if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
+			    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+				mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags "
+					  "set to use B0 steering. Falling back to A0 steering mode.\n");
+		}
+		dev->oper_log_mgm_entry_size =
+			mlx4_log_num_mgm_entry_size > 0 ?
+			mlx4_log_num_mgm_entry_size :
+			MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+		dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
+	}
+	mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, "
+		 "log_num_mgm_entry_size = %d\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode),
+		 dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size);
+}
+
 static int mlx4_init_hca(struct mlx4_dev *dev)
 {
 	struct mlx4_priv	  *priv = mlx4_priv(dev);
+	struct mlx4_dev_cap	   *dev_cap = NULL;
 	struct mlx4_adapter	   adapter;
-	struct mlx4_dev_cap	   dev_cap;
 	struct mlx4_mod_stat_cfg   mlx4_cfg;
 	struct mlx4_profile	   profile;
 	struct mlx4_init_hca_param init_hca;
-	struct mlx4_port_config	  *config;
 	u64 icm_size;
 	int err;
-	int i;
 
-	err = mlx4_QUERY_FW(dev);
-	if (err) {
-		if (err == -EACCES)
-			mlx4_info(dev, "non-primary physical function, skipping.\n");
-		else
-			mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
-		return err;
-	}
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_QUERY_FW(dev);
+		if (err) {
+			if (err == -EACCES)
+				mlx4_info(dev, "non-primary physical function, skipping.\n");
+			else
+				mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
+			return err;
+		}
 
-	err = mlx4_load_fw(dev);
-	if (err) {
-		mlx4_err(dev, "Failed to start FW, aborting.\n");
-		return err;
-	}
+		err = mlx4_load_fw(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to start FW, aborting.\n");
+			return err;
+		}
 
-	mlx4_cfg.log_pg_sz_m = 1;
-	mlx4_cfg.log_pg_sz = 0;
-	err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
-	if (err)
-		mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+		mlx4_cfg.log_pg_sz_m = 1;
+		mlx4_cfg.log_pg_sz = 0;
+		err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+		if (err)
+			mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
 
-	err = mlx4_dev_cap(dev, &dev_cap);
-	if (err) {
-		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
-		goto err_stop_fw;
-	}
+		dev_cap = kzalloc(sizeof *dev_cap, GFP_KERNEL);
+		if (!dev_cap) {
+			mlx4_err(dev, "Failed to allocate memory for dev_cap\n");
+			err = -ENOMEM;
+			goto err_stop_fw;
+		}
 
-	process_mod_param_profile();
-	profile = default_profile;
+		err = mlx4_dev_cap(dev, dev_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+			goto err_stop_fw;
+		}
 
-	list_for_each_entry(config, &config_list, list) {
-		if (config->pdev == dev->pdev) {
-			for (i = 1; i <= dev->caps.num_ports; i++) {
-				dev->caps.possible_type[i] = config->port_type[i];
-				if (config->port_type[i] != MLX4_PORT_TYPE_AUTO)
-					dev->caps.port_type[i] = config->port_type[i];
+		choose_steering_mode(dev, dev_cap);
+
+		if (mlx4_is_master(dev))
+			mlx4_parav_master_pf_caps(dev);
+
+		process_mod_param_profile(&profile);
+		if (dev->caps.steering_mode ==
+		    MLX4_STEERING_MODE_DEVICE_MANAGED)
+			profile.num_mcg = MLX4_FS_NUM_MCG;
+
+		icm_size = mlx4_make_profile(dev, &profile, dev_cap,
+					     &init_hca);
+		if ((long long) icm_size < 0) {
+			err = icm_size;
+			goto err_stop_fw;
+		}
+
+		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
+
+		init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+		init_hca.uar_page_sz = PAGE_SHIFT - 12;
+
+		err = mlx4_init_icm(dev, dev_cap, &init_hca, icm_size);
+		if (err)
+			goto err_stop_fw;
+
+		init_hca.mw_enable = 1;
+
+		err = mlx4_INIT_HCA(dev, &init_hca);
+		if (err) {
+			mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
+			goto err_free_icm;
+		}
+
+		/*
+		 * Read HCA frequency by QUERY_HCA command
+		 */
+		if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+			memset(&init_hca, 0, sizeof(init_hca));
+			err = mlx4_QUERY_HCA(dev, &init_hca);
+			if (err) {
+				mlx4_err(dev, "QUERY_HCA command failed, disable timestamp.\n");
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+			} else {
+				dev->caps.hca_core_clock =
+					init_hca.hca_core_clock;
 			}
+
+			/* In case we got HCA frequency 0 - disable timestamping
+			 * to avoid dividing by zero
+			 */
+			if (!dev->caps.hca_core_clock) {
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev, "HCA frequency is 0. Timestamping is not supported.");
+			} else if (map_internal_clock(dev)) {
+				/* Map internal clock,
+				 * in case of failure disable timestamping
+				 */
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported.\n");
+			}
 		}
-	}
+	} else {
+		err = mlx4_init_slave(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to initialize slave\n");
+			return err;
+		}
 
-	mlx4_set_port_mask(dev);
-	icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca);
-	if ((long long) icm_size < 0) {
-		err = icm_size;
-		goto err_stop_fw;
+		err = mlx4_slave_cap(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to obtain slave caps\n");
+			goto err_close;
+		}
 	}
 
 	if (map_bf_area(dev))
-		mlx4_dbg(dev, "Kernel support for blue flame is not available for kernels < 2.6.28\n");
+		mlx4_dbg(dev, "Failed to map blue flame area\n");
 
-	init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+	/* Only the master sets the ports; all the rest get them from it. */
+	if (!mlx4_is_slave(dev))
+		mlx4_set_port_mask(dev);
 
-	err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
-	if (err)
-		goto err_stop_fw;
-
-	err = mlx4_INIT_HCA(dev, &init_hca);
-	if (err) {
-		mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
-		goto err_free_icm;
-	}
-
 	err = mlx4_QUERY_ADAPTER(dev, &adapter);
 	if (err) {
 		mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n");
-		goto err_close;
+		goto unmap_bf;
 	}
 
 	priv->eq_table.inta_pin = adapter.inta_pin;
 	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+	memcpy(dev->vsd, adapter.vsd, sizeof(dev->vsd));
+	dev->vsd_vendor_id = adapter.vsd_vendor_id;
 
+	if (!mlx4_is_slave(dev))
+		kfree(dev_cap);
+
 	return 0;
 
+unmap_bf:
+	if (!mlx4_is_slave(dev))
+		unmap_internal_clock(dev);
+	unmap_bf_area(dev);
+
+	if (mlx4_is_slave(dev)) {
+		kfree(dev->caps.qp0_tunnel);
+		kfree(dev->caps.qp0_proxy);
+		kfree(dev->caps.qp1_tunnel);
+		kfree(dev->caps.qp1_proxy);
+	}
+
 err_close:
-	mlx4_CLOSE_HCA(dev, 0);
+	if (mlx4_is_slave(dev))
+		mlx4_slave_exit(dev);
+	else
+		mlx4_CLOSE_HCA(dev, 0);
 
 err_free_icm:
-	mlx4_free_icms(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_free_icms(dev);
 
 err_stop_fw:
-	unmap_bf_area(dev);
-	mlx4_UNMAP_FA(dev);
-	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
-
+	if (!mlx4_is_slave(dev)) {
+		if (!mlx4_UNMAP_FA(dev))
+			mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+		else
+			pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n");
+		kfree(dev_cap);
+	}
 	return err;
 }
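
The tail of mlx4_init_hca is a textbook goto-ladder: each label undoes
exactly the steps that succeeded before the failure, in reverse order, and
the slave/master checks only decide which undo actions apply. A stripped-
down, compilable skeleton of the idiom (step/undo names are placeholders,
not driver functions):

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }	/* simulate a late failure */
static void undo_step_b(void) { puts("undo b"); }
static void undo_step_a(void) { puts("undo a"); }

static int setup(void)
{
	int err;

	err = step_a();
	if (err)
		return err;

	err = step_b();
	if (err)
		goto undo_a;

	err = step_c();
	if (err)
		goto undo_b;

	return 0;

undo_b:
	undo_step_b();
undo_a:
	undo_step_a();
	return err;
}

int main(void)
{
	return setup() ? 1 : 0;
}
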
 
@@ -1009,68 +2168,539 @@
 static int mlx4_init_counters_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	int err;
-	int nent;
+	int nent_pow2, port_indx, vf_index, num_counters;
+	int res, index = 0;
+	struct counter_index *new_counter_index;
 
-	switch (dev->caps.counters_mode) {
-	case MLX4_CUNTERS_BASIC:
-		nent = dev->caps.max_basic_counters;
-		break;
-	case MLX4_CUNTERS_EXT:
-		nent = dev->caps.max_ext_counters;
-		break;
-	default:
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
 		return -ENOENT;
+
+	if (!mlx4_is_slave(dev) &&
+	    dev->caps.max_counters == dev->caps.max_extended_counters) {
+		res = mlx4_cmd(dev, MLX4_IF_STATE_EXTENDED, 0, 0,
+			       MLX4_CMD_SET_IF_STAT,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+		if (res) {
+			mlx4_err(dev, "Failed to set extended counters (err=%d)\n", res);
+			return res;
+		}
 	}
-	err = mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0);
-	if (err)
-		return err;
 
+	mutex_init(&priv->counters_table.mutex);
+
+	if (mlx4_is_slave(dev)) {
+		for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) {
+			INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]);
+			if (dev->caps.def_counter_index[port_indx] != 0xFF) {
+				new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+				if (!new_counter_index)
+					return -ENOMEM;
+				new_counter_index->index = dev->caps.def_counter_index[port_indx];
+				list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]);
+			}
+		}
+		mlx4_dbg(dev, "%s: slave allocated %d counters for %d ports\n",
+			 __func__, dev->caps.num_ports, dev->caps.num_ports);
+		return 0;
+	}
+
+	nent_pow2 = roundup_pow_of_two(dev->caps.max_counters);
+
+	for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) {
+		INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]);
+		/* Allocating 2 counters per port for PFs.
+		 * For the PF, the ETH default counters are 0,2;
+		 * the RoCE default counters are 1,3. */
+		for (num_counters = 0; num_counters < 2; num_counters++, index++) {
+			new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+			if (!new_counter_index)
+				return -ENOMEM;
+			new_counter_index->index = index;
+			list_add_tail(&new_counter_index->list,
+				      &priv->counters_table.global_port_list[port_indx]);
+		}
+	}
+
+	if (mlx4_is_master(dev)) {
+		for (vf_index = 0; vf_index < dev->num_vfs; vf_index++) {
+			for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) {
+				INIT_LIST_HEAD(&priv->counters_table.vf_list[vf_index][port_indx]);
+				new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+				if (!new_counter_index)
+					return -ENOMEM;
+				if (index < nent_pow2 - 2) {
+					new_counter_index->index = index;
+					index++;
+				} else {
+					new_counter_index->index = MLX4_SINK_COUNTER_INDEX;
+				}
+
+				list_add_tail(&new_counter_index->list,
+					      &priv->counters_table.vf_list[vf_index][port_indx]);
+			}
+		}
+
+		res = mlx4_bitmap_init(&priv->counters_table.bitmap,
+				       nent_pow2, nent_pow2 - 1,
+				       index, 1);
+		mlx4_dbg(dev, "%s: master allocated %d counters for %d VFs\n",
+			 __func__, index, dev->num_vfs);
+	} else {
+		res = mlx4_bitmap_init(&priv->counters_table.bitmap,
+				nent_pow2, nent_pow2 - 1,
+				index, 1);
+		mlx4_dbg(dev, "%s: native allocated %d counters for %d ports\n",
+			 __func__, index, dev->caps.num_ports);
+	}
+
 	return 0;
 }
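
mlx4_init_counters_table hands out the low counter indices statically (two
defaults per PF port, one per VF port) and only then initializes the bitmap
with those indices as the reserved bottom, so dynamic allocation starts past
them. A toy allocator with the same reserved-bottom shape (the API here is
ours, not mlx4's):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAX_IDS 256

struct id_bitmap {
	uint8_t	used[MAX_IDS / 8];
	int	num;
};

/* IDs below reserved_bot were handed out statically (the per-port
 * defaults), so dynamic allocation starts past them. */
static void bitmap_init(struct id_bitmap *bm, int num, int reserved_bot)
{
	int i;

	memset(bm->used, 0, sizeof(bm->used));
	bm->num = num;
	for (i = 0; i < reserved_bot; i++)
		bm->used[i / 8] |= 1 << (i % 8);
}

static int bitmap_alloc(struct id_bitmap *bm)
{
	int i;

	for (i = 0; i < bm->num; i++)
		if (!(bm->used[i / 8] & (1 << (i % 8)))) {
			bm->used[i / 8] |= 1 << (i % 8);
			return i;
		}
	return -1;	/* exhausted; the caller falls back to a default */
}

int main(void)
{
	struct id_bitmap bm;

	bitmap_init(&bm, 8, 4);		/* 4 defaults pre-claimed */
	assert(bitmap_alloc(&bm) == 4);	/* first dynamic ID is past them */
	return 0;
}
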
 
 static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
 {
-	switch (dev->caps.counters_mode) {
-	case MLX4_CUNTERS_BASIC:
-	case MLX4_CUNTERS_EXT:
-		mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
-		break;
-	default:
-		break;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, j;
+	struct counter_index *port, *tmp_port;
+	struct counter_index *vf, *tmp_vf;
+
+	mutex_lock(&priv->counters_table.mutex);
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) {
+		for (i = 0; i < dev->caps.num_ports; i++) {
+			list_for_each_entry_safe(port, tmp_port,
+						 &priv->counters_table.global_port_list[i],
+						 list) {
+				list_del(&port->list);
+				kfree(port);
+			}
+		}
+		if (!mlx4_is_slave(dev)) {
+			for (i = 0; i < dev->num_vfs; i++) {
+				for (j = 0; j < dev->caps.num_ports; j++) {
+					list_for_each_entry_safe(vf, tmp_vf,
+								 &priv->counters_table.vf_list[i][j],
+								 list) {
+						/* clear the counter statistic */
+						if (__mlx4_clear_if_stat(dev, vf->index))
+							mlx4_dbg(dev, "%s: reset counter %d failed\n",
+								 __func__, vf->index);
+						list_del(&vf->list);
+						kfree(vf);
+					}
+				}
+			}
+			mlx4_bitmap_cleanup(&priv->counters_table.bitmap);
+		}
 	}
+	mutex_unlock(&priv->counters_table.mutex);
 }
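
All the teardown loops here use list_for_each_entry_safe rather than the
plain iterator because the loop body frees the current entry; the _safe
variant caches the successor before the body runs. The same idea in a
self-contained singly-linked form:

#include <stdlib.h>

struct node { struct node *next; };

/* Cache the successor before freeing the current entry -- exactly
 * what list_for_each_entry_safe does for the driver's lists. */
static void free_all(struct node **head)
{
	struct node *n = *head, *tmp;

	while (n) {
		tmp = n->next;
		free(n);
		n = tmp;
	}
	*head = NULL;
}

int main(void)
{
	struct node *head = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;
		n->next = head;
		head = n;
	}
	free_all(&head);
	return 0;
}
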
 
-int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+int __mlx4_slave_counters_free(struct mlx4_dev *dev, int slave)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, first;
+	struct counter_index *vf, *tmp_vf;
 
-	switch (dev->caps.counters_mode) {
-	case MLX4_CUNTERS_BASIC:
-	case MLX4_CUNTERS_EXT:
-		*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
-		if (*idx == -1)
-			return -ENOMEM;
+	/* clean the VF's counters for the next usage */
+	if (slave > 0 && slave <= dev->num_vfs) {
+		mlx4_dbg(dev, "%s: free counters of slave(%d)\n"
+			 , __func__, slave);
+
+		mutex_lock(&priv->counters_table.mutex);
+		for (i = 0; i < dev->caps.num_ports; i++) {
+			first = 0;
+			list_for_each_entry_safe(vf, tmp_vf,
+						 &priv->counters_table.vf_list[slave - 1][i],
+						 list) {
+				/* clear the counter statistic */
+				if (__mlx4_clear_if_stat(dev, vf->index))
+					mlx4_dbg(dev, "%s: reset counter %d failed\n",
+						 __func__, vf->index);
+				if (first++ && vf->index != MLX4_SINK_COUNTER_INDEX) {
+					mlx4_dbg(dev, "%s: delete counter index %d for slave %d and port %d\n"
+						 , __func__, vf->index, slave, i + 1);
+					mlx4_bitmap_free(&priv->counters_table.bitmap, vf->index, MLX4_USE_RR);
+					list_del(&vf->list);
+					kfree(vf);
+				} else {
+					mlx4_dbg(dev, "%s: can't delete default counter index %d for slave %d and port %d\n"
+						 , __func__, vf->index, slave, i + 1);
+				}
+			}
+		}
+		mutex_unlock(&priv->counters_table.mutex);
+	}
+
+	return 0;
+}
+
+int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *new_counter_index;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) ||
+	    (port < 0) || (port > MLX4_MAX_PORTS)) {
+		mlx4_dbg(dev, "%s: invalid slave(%d) or port(%d) index\n",
+			 __func__, slave, port);
+		return -EINVAL;
+	}
+
+	/* handle old guests whose requests do not carry a port index */
+	if (port == 0) {
+		*idx = MLX4_SINK_COUNTER_INDEX;
+		mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n"
+			 , __func__, *idx, slave, port);
 		return 0;
-	default:
-		return -ENOMEM;
 	}
+
+	mutex_lock(&priv->counters_table.mutex);
+
+	*idx = mlx4_bitmap_alloc(&priv->counters_table.bitmap);
+	/* if no resources, return the default counter of the slave and port */
+	if (*idx == -1) {
+		if (slave == 0) { /* native or master */
+			new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next,
+						       struct counter_index,
+						       list);
+		} else {
+			new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next,
+						       struct counter_index,
+						       list);
+		}
+
+		*idx = new_counter_index->index;
+		mlx4_dbg(dev, "%s: allocated defualt counter index %d for slave %d port %d\n"
+			 , __func__, *idx, slave, port);
+		goto out;
+	}
+
+	if (slave == 0) { /* native or master */
+		new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+		if (!new_counter_index)
+			goto no_mem;
+		new_counter_index->index = *idx;
+		list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]);
+	} else {
+		new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+		if (!new_counter_index)
+			goto no_mem;
+		new_counter_index->index = *idx;
+		list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[slave - 1][port - 1]);
+	}
+
+	mlx4_dbg(dev, "%s: allocated counter index %d for slave %d port %d\n"
+		 , __func__, *idx, slave, port);
+out:
+	mutex_unlock(&priv->counters_table.mutex);
+	return 0;
+
+no_mem:
+	mlx4_bitmap_free(&priv->counters_table.bitmap, *idx, MLX4_USE_RR);
+	mutex_unlock(&priv->counters_table.mutex);
+	*idx = MLX4_SINK_COUNTER_INDEX;
+	mlx4_dbg(dev, "%s: failed err (%d)\n"
+		 , __func__, -ENOMEM);
+	return -ENOMEM;
 }
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx)
+{
+	u64 out_param;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *new_counter_index, *c_index;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param,
+				   ((u32) port) << 8 | (u32) RES_COUNTER,
+				   RES_OP_RESERVE, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err) {
+			*idx = get_param_l(&out_param);
+			if (*idx == MLX4_SINK_COUNTER_INDEX)
+				return -ENOSPC;
+
+			mutex_lock(&priv->counters_table.mutex);
+			c_index = list_entry(priv->counters_table.global_port_list[port - 1].next,
+					     struct counter_index,
+					     list);
+			mutex_unlock(&priv->counters_table.mutex);
+			if (c_index->index == *idx)
+				return -EEXIST;
+
+			if (mlx4_is_slave(dev)) {
+				new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL);
+				if (!new_counter_index) {
+					mlx4_counter_free(dev, port, *idx);
+					return -ENOMEM;
+				}
+				new_counter_index->index = *idx;
+				mutex_lock(&priv->counters_table.mutex);
+				list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]);
+				mutex_unlock(&priv->counters_table.mutex);
+				mlx4_dbg(dev, "%s: allocated counter index %d for port %d\n"
+					 , __func__, *idx, port);
+			}
+		}
+		return err;
+	}
+	return __mlx4_counter_alloc(dev, 0, port, idx);
+}
 EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
 
-void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx)
 {
-	switch (dev->caps.counters_mode) {
-	case MLX4_CUNTERS_BASIC:
-	case MLX4_CUNTERS_EXT:
-		mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx);
+	/* check if native or slave and delete accordingly */
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *pf, *tmp_pf;
+	struct counter_index *vf, *tmp_vf;
+	int first;
+
+	if (idx == MLX4_SINK_COUNTER_INDEX) {
+		mlx4_dbg(dev, "%s: try to delete default counter index %d for port %d\n"
+			 , __func__, idx, port);
+		return;
+	}
+
+	if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) ||
+	    (port < 0) || (port > MLX4_MAX_PORTS)) {
+		mlx4_warn(dev, "%s: deletion failed due to invalid slave(%d) or port(%d) index\n"
+			 , __func__, slave, idx);
+			return;
+	}
+
+	mutex_lock(&priv->counters_table.mutex);
+	if (slave == 0) {
+		first = 0;
+		list_for_each_entry_safe(pf, tmp_pf,
+					 &priv->counters_table.global_port_list[port - 1],
+					 list) {
+			/* the first 2 counters are reserved */
+			if (pf->index == idx) {
+				/* clear the counter statistic */
+				if (__mlx4_clear_if_stat(dev, pf->index))
+					mlx4_dbg(dev, "%s: reset counter %d failed\n",
+						 __func__, pf->index);
+				if (1 < first && idx != MLX4_SINK_COUNTER_INDEX) {
+					list_del(&pf->list);
+					kfree(pf);
+					mlx4_dbg(dev, "%s: delete counter index %d for native device (%d) port %d\n"
+						 , __func__, idx, slave, port);
+					mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR);
+					goto out;
+				} else {
+					mlx4_dbg(dev, "%s: can't delete default counter index %d for native device (%d) port %d\n"
+						 , __func__, idx, slave, port);
+					goto out;
+				}
+			}
+			first++;
+		}
+		mlx4_dbg(dev, "%s: can't delete counter index %d for native device (%d) port %d\n"
+			 , __func__, idx, slave, port);
+	} else {
+		first = 0;
+		list_for_each_entry_safe(vf, tmp_vf,
+					 &priv->counters_table.vf_list[slave - 1][port - 1],
+					 list) {
+			/* the first element is reserved */
+			if (vf->index == idx) {
+				/* clear the counter statistic */
+				if (__mlx4_clear_if_stat(dev, vf->index))
+					mlx4_dbg(dev, "%s: reset counter %d failed\n",
+						 __func__, vf->index);
+				if (first) {
+					list_del(&vf->list);
+					kfree(vf);
+					mlx4_dbg(dev, "%s: delete counter index %d for slave %d port %d\n",
+						 __func__, idx, slave, port);
+					mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR);
+					goto out;
+				} else {
+					mlx4_dbg(dev, "%s: can't delete default slave (%d) counter index %d for port %d\n"
+						 , __func__, slave, idx, port);
+					goto out;
+				}
+			}
+			first++;
+		}
+		mlx4_dbg(dev, "%s: can't delete slave (%d) counter index %d for port %d\n"
+			 , __func__, slave, idx, port);
+	}
+
+out:
+	mutex_unlock(&priv->counters_table.mutex);
+}
+
+void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx)
+{
+	u64 in_param = 0;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *counter, *tmp_counter;
+	int first = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, idx);
+		mlx4_cmd(dev, in_param,
+			 ((u32) port) << 8 | (u32) RES_COUNTER,
+			 RES_OP_RESERVE,
+			 MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_WRAPPED);
+
+		if (mlx4_is_slave(dev) && idx != MLX4_SINK_COUNTER_INDEX) {
+			mutex_lock(&priv->counters_table.mutex);
+			list_for_each_entry_safe(counter, tmp_counter,
+						 &priv->counters_table.global_port_list[port - 1],
+						 list) {
+				if (counter->index == idx && first++) {
+					list_del(&counter->list);
+					kfree(counter);
+					mlx4_dbg(dev, "%s: delete counter index %d for port %d\n"
+						 , __func__, idx, port);
+					mutex_unlock(&priv->counters_table.mutex);
+					return;
+				}
+			}
+			mutex_unlock(&priv->counters_table.mutex);
+		}
+
 		return;
-	default:
-		return;
 	}
+	__mlx4_counter_free(dev, 0, port, idx);
 }
 EXPORT_SYMBOL_GPL(mlx4_counter_free);
 
+int __mlx4_clear_if_stat(struct mlx4_dev *dev,
+			 u8 counter_index)
+{
+	struct mlx4_cmd_mailbox *if_stat_mailbox = NULL;
+	int err = 0;
+	u32 if_stat_in_mod = (counter_index & 0xff) | (1 << 31);
+
+	if (counter_index == MLX4_SINK_COUNTER_INDEX)
+		return -EINVAL;
+
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(if_stat_mailbox)) {
+		err = PTR_ERR(if_stat_mailbox);
+		return err;
+	}
+
+	err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0,
+			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, if_stat_mailbox);
+	return err;
+}
+
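
__mlx4_clear_if_stat above and mlx4_get_vport_ethtool_stats below build the
QUERY_IF_STAT input modifier the same way: counter index in the low byte,
bit 31 set to request clear-on-read -- which is the whole point of the
"clear" helper, since it issues the query purely for that side effect. The
packing, stated on its own:

#include <assert.h>
#include <stdint.h>

/* QUERY_IF_STAT input modifier: counter index in the low byte,
 * bit 31 requesting clear-on-read. */
static uint32_t if_stat_in_mod(uint8_t counter_index, int clear)
{
	return (uint32_t)counter_index | ((uint32_t)(clear & 1) << 31);
}

int main(void)
{
	assert(if_stat_in_mod(5, 1) == ((1U << 31) | 5));
	assert(if_stat_in_mod(5, 0) == 5);
	return 0;
}
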
+u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *new_counter_index;
+
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) {
+		mlx4_dbg(dev, "%s: return counter index %d for slave %d port (MLX4_PORT_TYPE_IB) %d\n",
+			 __func__, MLX4_SINK_COUNTER_INDEX, slave, port);
+		return (u8)MLX4_SINK_COUNTER_INDEX;
+	}
+
+	mutex_lock(&priv->counters_table.mutex);
+	if (slave == 0) {
+		new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next,
+					       struct counter_index,
+					       list);
+	} else {
+		new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next,
+					       struct counter_index,
+					       list);
+	}
+	mutex_unlock(&priv->counters_table.mutex);
+
+	mlx4_dbg(dev, "%s: return counter index %d for slave %d port %d\n",
+		 __func__, new_counter_index->index, slave, port);
+
+
+	return (u8)new_counter_index->index;
+}
+
+int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port,
+			 struct mlx4_en_vport_stats *vport_stats,
+			 int reset)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *if_stat_mailbox = NULL;
+	union mlx4_counter *counter;
+	int err = 0;
+	u32 if_stat_in_mod;
+	struct counter_index *vport, *tmp_vport;
+
+	if (!vport_stats)
+		return -EINVAL;
+
+	if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(if_stat_mailbox)) {
+		err = PTR_ERR(if_stat_mailbox);
+		return err;
+	}
+
+	mutex_lock(&priv->counters_table.mutex);
+	list_for_each_entry_safe(vport, tmp_vport,
+				 &priv->counters_table.global_port_list[port - 1],
+				 list) {
+		if (vport->index == MLX4_SINK_COUNTER_INDEX)
+			continue;
+
+		memset(if_stat_mailbox->buf, 0, sizeof(union  mlx4_counter));
+		if_stat_in_mod = (vport->index & 0xff) | ((reset & 1) << 31);
+		err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma,
+				   if_stat_in_mod, 0,
+				   MLX4_CMD_QUERY_IF_STAT,
+				   MLX4_CMD_TIME_CLASS_C,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n",
+				 __func__, vport->index);
+			goto if_stat_out;
+		}
+		counter = (union mlx4_counter *)if_stat_mailbox->buf;
+		if ((counter->control.cnt_mode & 0xf) == 1) {
+			vport_stats->rx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastFrames);
+			vport_stats->rx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxUnicastFrames);
+			vport_stats->rx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxMulticastFrames);
+			vport_stats->tx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastFrames);
+			vport_stats->tx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxUnicastFrames);
+			vport_stats->tx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxMulticastFrames);
+			vport_stats->rx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastOctets);
+			vport_stats->rx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxUnicastOctets);
+			vport_stats->rx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxMulticastOctets);
+			vport_stats->tx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastOctets);
+			vport_stats->tx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxUnicastOctets);
+			vport_stats->tx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxMulticastOctets);
+			vport_stats->rx_errors += be64_to_cpu(counter->ext.counters[0].IfRxErrorFrames);
+			vport_stats->rx_dropped += be64_to_cpu(counter->ext.counters[0].IfRxNoBufferFrames);
+			vport_stats->tx_errors += be64_to_cpu(counter->ext.counters[0].IfTxDroppedFrames);
+		}
+	}
+
+if_stat_out:
+	mutex_unlock(&priv->counters_table.mutex);
+	mlx4_free_cmd_mailbox(dev, if_stat_mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vport_ethtool_stats);
+
 static int mlx4_setup_hca(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1081,18 +2711,19 @@
 	err = mlx4_init_uar_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "user access region table, aborting.\n");
+			 "user access region table (err=%d), aborting.\n",
+			 err);
 		return err;
 	}
 
 	err = mlx4_uar_alloc(dev, &priv->driver_uar);
 	if (err) {
-		mlx4_err(dev, "Failed to allocate driver access region, "
-			 "aborting.\n");
+		mlx4_err(dev, "Failed to allocate driver access region "
+			 "(err=%d), aborting.\n", err);
 		goto err_uar_table_free;
 	}
 
-	priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
 	if (!priv->kar) {
 		mlx4_err(dev, "Couldn't map kernel access region, "
 			 "aborting.\n");
@@ -1103,14 +2734,15 @@
 	err = mlx4_init_pd_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "protection domain table, aborting.\n");
+			 "protection domain table (err=%d), aborting.\n", err);
 		goto err_kar_unmap;
 	}
 
 	err = mlx4_init_xrcd_table(dev);
 	if (err) {
-		mlx4_err(dev, "Failed to initialize extended "
-			 "reliably connected domain table, aborting.\n");
+		mlx4_err(dev, "Failed to initialize "
+			 "reliable connection domain table (err=%d), "
+			 "aborting.\n", err);
 		goto err_pd_table_free;
 	}
 
@@ -1117,21 +2749,31 @@
 	err = mlx4_init_mr_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "memory region table, aborting.\n");
+			 "memory region table (err=%d), aborting.\n", err);
 		goto err_xrcd_table_free;
 	}
 
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_mcg_table(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to initialize "
+				 "multicast group table (err=%d), aborting.\n",
+				 err);
+			goto err_mr_table_free;
+		}
+	}
+
 	err = mlx4_init_eq_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "event queue table, aborting.\n");
-		goto err_mr_table_free;
+			 "event queue table (err=%d), aborting.\n", err);
+		goto err_mcg_table_free;
 	}
 
 	err = mlx4_cmd_use_events(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to switch to event-driven "
-			 "firmware commands, aborting.\n");
+			 "firmware commands (err=%d), aborting.\n", err);
 		goto err_eq_table_free;
 	}
 
@@ -1157,7 +2799,7 @@
 	err = mlx4_init_cq_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "completion queue table, aborting.\n");
+			 "completion queue table (err=%d), aborting.\n", err);
 		goto err_cmd_poll;
 	}
 
@@ -1164,7 +2806,8 @@
 	err = mlx4_init_srq_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "shared receive queue table, aborting.\n");
+			 "shared receive queue table (err=%d), aborting.\n",
+			 err);
 		goto err_cq_table_free;
 	}
 
@@ -1171,36 +2814,48 @@
 	err = mlx4_init_qp_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
-			 "queue pair table, aborting.\n");
+			 "queue pair table (err=%d), aborting.\n", err);
 		goto err_srq_table_free;
 	}
 
-	err = mlx4_init_mcg_table(dev);
-	if (err) {
-		mlx4_err(dev, "Failed to initialize "
-			 "multicast group table, aborting.\n");
-		goto err_qp_table_free;
-	}
-
 	err = mlx4_init_counters_table(dev);
 	if (err && err != -ENOENT) {
-		mlx4_err(dev, "Failed to initialize counters table, aborting.\n");
-		goto err_mcg_table_free;
+		mlx4_err(dev, "Failed to initialize counters table (err=%d), "
+			 "aborting.\n", err);
+		goto err_qp_table_free;
 	}
 
-	for (port = 1; port <= dev->caps.num_ports; port++) {
-		ib_port_default_caps = 0;
-		err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps);
-		if (err)
-			mlx4_warn(dev, "failed to get port %d default "
-				  "ib capabilities (%d). Continuing with "
-				  "caps = 0\n", port, err);
-		dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
-		err = mlx4_SET_PORT(dev, port);
-		if (err) {
-			mlx4_err(dev, "Failed to set port %d, aborting\n",
-				port);
-			goto err_counters_table_free;
+	if (!mlx4_is_slave(dev)) {
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			ib_port_default_caps = 0;
+			err = mlx4_get_port_ib_caps(dev, port,
+						    &ib_port_default_caps);
+			if (err)
+				mlx4_warn(dev, "failed to get port %d default "
+					  "ib capabilities (%d). Continuing "
+					  "with caps = 0\n", port, err);
+			dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
+
+			/* initialize per-slave default ib port capabilities */
+			if (mlx4_is_master(dev)) {
+				int i;
+				for (i = 0; i < dev->num_slaves; i++) {
+					if (i == mlx4_master_func_num(dev))
+						continue;
+					priv->mfunc.master.slave_state[i].ib_cap_mask[port] =
+							ib_port_default_caps;
+				}
+			}
+
+			dev->caps.port_ib_mtu[port] = IB_MTU_4096;
+
+			err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ?
+					    dev->caps.pkey_table_len[port] : -1);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d (err=%d), "
+					 "aborting\n", port, err);
+				goto err_counters_table_free;
+			}
 		}
 	}
 
@@ -1209,9 +2864,6 @@
 err_counters_table_free:
 	mlx4_cleanup_counters_table(dev);
 
-err_mcg_table_free:
-	mlx4_cleanup_mcg_table(dev);
-
 err_qp_table_free:
 	mlx4_cleanup_qp_table(dev);
 
@@ -1227,6 +2879,10 @@
 err_eq_table_free:
 	mlx4_cleanup_eq_table(dev);
 
+err_mcg_table_free:
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_mcg_table(dev);
+
 err_mr_table_free:
 	mlx4_cleanup_mr_table(dev);
 
@@ -1251,13 +2907,19 @@
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct msix_entry *entries;
-	int nreq;
+	int nreq = min_t(int, dev->caps.num_ports *
+			 min_t(int, num_possible_cpus() + 1, MAX_MSIX_P_PORT)
+				+ MSIX_LEGACY_SZ, MAX_MSIX);
 	int err;
 	int i;
 
 	if (msi_x) {
 		nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
-			     num_possible_cpus() + 1);
+			     nreq);
+
+		if (msi_x > 1 && !mlx4_is_mfunc(dev))
+			nreq = min_t(int, nreq, msi_x);
+
 		entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
 		if (!entries)
 			goto no_msi;
@@ -1277,10 +2939,24 @@
 				goto retry;
 			}
 			kfree(entries);
+			/* if error, or can't alloc even 1 IRQ */
+			if (err < 0) {
+				mlx4_err(dev, "No IRQs left, device can't "
+				    "be started.\n");
+				goto no_irq;
+			}
 			goto no_msi;
 		}
 
-		dev->caps.num_comp_vectors = nreq - 1;
+		if (nreq <
+		    MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) {
+			/* Working in legacy mode, all EQs shared */
+			dev->caps.comp_pool           = 0;
+			dev->caps.num_comp_vectors = nreq - 1;
+		} else {
+			dev->caps.comp_pool           = nreq - MSIX_LEGACY_SZ;
+			dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1;
+		}
 		for (i = 0; i < nreq; ++i)
 			priv->eq_table.eq[i].irq = entries[i].vector;
 
@@ -1292,11 +2968,40 @@
 
 no_msi:
 	dev->caps.num_comp_vectors = 1;
+	dev->caps.comp_pool	   = 0;
 
 	for (i = 0; i < 2; ++i)
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
+	return;
+no_irq:
+	dev->caps.num_comp_vectors = 0;
+	dev->caps.comp_pool        = 0;
+	return;
 }
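
mlx4_enable_msi_x budgets vectors as ports * min(ncpus + 1, MAX_MSIX_P_PORT)
+ MSIX_LEGACY_SZ, capped at MAX_MSIX, then splits whatever was actually
granted: too few vectors means fully shared EQs (legacy mode), otherwise the
first MSIX_LEGACY_SZ vectors serve the legacy set and the remainder form the
per-port completion pool. The split in isolation (the constant values mirror
the driver's defaults but are restated here for the sketch):

#include <stdio.h>

#define MSIX_LEGACY_SZ	4
#define MIN_MSIX_P_PORT	2

static void split_vectors(int granted, int ports,
			  int *num_comp_vectors, int *comp_pool)
{
	if (granted < MSIX_LEGACY_SZ + ports * MIN_MSIX_P_PORT) {
		/* legacy mode: all EQs shared, one vector kept for async */
		*comp_pool = 0;
		*num_comp_vectors = granted - 1;
	} else {
		*comp_pool = granted - MSIX_LEGACY_SZ;
		*num_comp_vectors = MSIX_LEGACY_SZ - 1;
	}
}

int main(void)
{
	int ncv, pool;

	split_vectors(16, 2, &ncv, &pool);	/* ample vectors */
	printf("comp vectors %d, pool %d\n", ncv, pool);
	split_vectors(6, 2, &ncv, &pool);	/* starved: legacy mode */
	printf("comp vectors %d, pool %d\n", ncv, pool);
	return 0;
}
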
 
+static void
+mlx4_init_hca_info(struct mlx4_dev *dev)
+{
+	struct mlx4_hca_info *info = &mlx4_priv(dev)->hca_info;
+
+	info->dev = dev;
+
+	info->firmware_attr = (struct device_attribute)__ATTR(fw_ver, S_IRUGO,
+							show_firmware_version, NULL);
+	if (device_create_file(&dev->pdev->dev, &info->firmware_attr))
+		mlx4_err(dev, "Failed to add file firmware version");
+
+	info->hca_attr = (struct device_attribute)__ATTR(hca, S_IRUGO, show_hca,
+										NULL);
+	if (device_create_file(&dev->pdev->dev, &info->hca_attr))
+		mlx4_err(dev, "Failed to add file hca type");
+
+	info->board_attr = (struct device_attribute)__ATTR(board_id, S_IRUGO,
+							    show_board, NULL);
+	if (device_create_file(&dev->pdev->dev, &info->board_attr))
+		mlx4_err(dev, "Failed to add file board id type");
+}
+
 static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 {
 	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
@@ -1304,14 +3009,22 @@
 
 	info->dev = dev;
 	info->port = port;
-	mlx4_init_mac_table(dev, &info->mac_table);
-	mlx4_init_vlan_table(dev, &info->vlan_table);
+	if (!mlx4_is_slave(dev)) {
+		mlx4_init_mac_table(dev, &info->mac_table);
+		mlx4_init_vlan_table(dev, &info->vlan_table);
+		info->base_qpn = mlx4_get_base_qpn(dev, port);
+	}
 
 	sprintf(info->dev_name, "mlx4_port%d", port);
 	info->port_attr.attr.name = info->dev_name;
-	info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+	if (mlx4_is_mfunc(dev))
+		info->port_attr.attr.mode = S_IRUGO;
+	else {
+		info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+		info->port_attr.store     = set_port_type;
+	}
 	info->port_attr.show      = show_port_type;
-	info->port_attr.store     = set_port_type;
+	sysfs_attr_init(&info->port_attr.attr);
 
 	err = device_create_file(&dev->pdev->dev, &info->port_attr);
 	if (err) {
@@ -1319,9 +3032,35 @@
 		info->port = -1;
 	}
 
+	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
+	info->port_mtu_attr.attr.name = info->dev_mtu_name;
+	if (mlx4_is_mfunc(dev))
+		info->port_mtu_attr.attr.mode = S_IRUGO;
+	else {
+		info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR;
+		info->port_mtu_attr.store     = set_port_ib_mtu;
+	}
+	info->port_mtu_attr.show      = show_port_ib_mtu;
+	sysfs_attr_init(&info->port_mtu_attr.attr);
+
+	err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create mtu file for port %d\n", port);
+		device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+		info->port = -1;
+	}
+
 	return err;
 }
 
+static void
+mlx4_cleanup_hca_info(struct mlx4_hca_info *info)
+{
+	device_remove_file(&info->dev->pdev->dev, &info->firmware_attr);
+	device_remove_file(&info->dev->pdev->dev, &info->board_attr);
+	device_remove_file(&info->dev->pdev->dev, &info->hca_attr);
+}
+
 static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
 {
 	if (info->port < 0)
@@ -1328,25 +3067,115 @@
 		return;
 
 	device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+	device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr);
 }
 
-static int mlx4_init_trigger(struct mlx4_priv *priv)
+static int mlx4_init_steering(struct mlx4_dev *dev)
 {
-	memcpy(&priv->trigger_attr, &dev_attr_port_trigger,
-	       sizeof(struct device_attribute));
-        return device_create_file(&priv->dev.pdev->dev, &priv->trigger_attr);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL);
+	if (!priv->steer)
+		return -ENOMEM;
+
+	for (i = 0; i < num_entries; i++)
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]);
+			INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]);
+		}
+	return 0;
 }
 
-static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+static void mlx4_clear_steering(struct mlx4_dev *dev)
 {
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp, *tmp_pqp;
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	for (i = 0; i < num_entries; i++) {
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			list_for_each_entry_safe(pqp, tmp_pqp,
+						 &priv->steer[i].promisc_qps[j],
+						 list) {
+				list_del(&pqp->list);
+				kfree(pqp);
+			}
+			list_for_each_entry_safe(entry, tmp_entry,
+						 &priv->steer[i].steer_entries[j],
+						 list) {
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			}
+		}
+	}
+	kfree(priv->steer);
+}
+
+static int extended_func_num(struct pci_dev *pdev)
+{
+	return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn);
+}
+
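
extended_func_num flattens a slot/function pair back into a linear index;
since devfn already packs the slot in bits 7:3 and the function in bits 2:0,
the result is numerically the devfn itself. This matters because SR-IOV VFs
of a PF occupy consecutive extended function numbers, which is what the
probe_vf comparison in __mlx4_init_one relies on. A quick self-check (the
two macros restate the standard PCI definitions):

#include <assert.h>

#define PCI_SLOT(devfn)	(((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)	((devfn) & 0x07)

static int extended_func_num(unsigned int devfn)
{
	return PCI_SLOT(devfn) * 8 + PCI_FUNC(devfn);
}

int main(void)
{
	assert(extended_func_num(0x00) == 0);	/* slot 0, func 0 */
	assert(extended_func_num(0x0a) == 10);	/* slot 1, func 2 */
	return 0;
}
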
+#define MLX4_OWNER_BASE	0x8069c
+#define MLX4_OWNER_SIZE	4
+
+static int mlx4_get_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+	u32 ret;
+
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return -ENOMEM;
+	}
+
+	ret = readl(owner);
+	iounmap(owner);
+	return (int) !!ret;
+}
+
+static void mlx4_free_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+
+	if (pci_channel_offline(dev->pdev))
+		return;
+
+	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return;
+	}
+	writel(0, owner);
+	msleep(1000);
+	iounmap(owner);
+}
+
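
The ownership word at MLX4_OWNER_BASE is the multi-PF mutual-exclusion
primitive: the driver treats a zero read as having acquired ownership and a
nonzero read as "another function got there first"; releasing is a write of
0 followed by a one-second settle. The pattern with stand-in MMIO accessors
(a sketch of the semantics the code above assumes, not a usable driver):

#include <stdint.h>

/* Stand-ins for the ioremap'd register access above. */
extern uint32_t mmio_read32(void);
extern void mmio_write32(uint32_t val);

static int hca_try_claim(void)
{
	return mmio_read32() == 0;	/* nonzero return = we own it */
}

static void hca_release(void)
{
	mmio_write32(0);
	/* the driver then sleeps ~1s to let the release settle */
}
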
+static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data)
+{
 	struct mlx4_priv *priv;
 	struct mlx4_dev *dev;
 	int err;
 	int port;
-	int i;
+	int nvfs, prb_vf;
 
-	printk(KERN_INFO PFX "Initializing %s\n",
-	       pci_name(pdev));
+	pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev));
 
 	err = pci_enable_device(pdev);
 	if (err) {
@@ -1355,12 +3184,26 @@
 		return err;
 	}
 
+	mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(pdev), 0, &nvfs);
+	mlx4_get_val(probe_vf.dbdf2val.tbl, pci_physfn(pdev), 0, &prb_vf);
+	if (nvfs > MLX4_MAX_NUM_VF) {
+		dev_err(&pdev->dev, "There are more VF's (%d) than allowed(%d)\n",
+			nvfs, MLX4_MAX_NUM_VF);
+		return -EINVAL;
+	}
+
+	if (nvfs < 0) {
+		dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n");
+		return -EINVAL;
+	}
 	/*
-	 * Check for BARs.  We expect 0: 1MB
+	 * Check for BARs.
 	 */
-	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
-	    pci_resource_len(pdev, 0) != 1 << 20) {
-		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+	if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) &&
+	    !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing DCS, aborting."
+			"(driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%x)\n",
+			pci_dev_data, pci_resource_flags(pdev, 0));
 		err = -ENODEV;
 		goto err_disable_pdev;
 	}
@@ -1370,18 +3213,12 @@
 		goto err_disable_pdev;
 	}
 
-	err = pci_request_region(pdev, 0, DRV_NAME);
+	err = pci_request_regions(pdev, DRV_NAME);
 	if (err) {
-		dev_err(&pdev->dev, "Cannot request control region, aborting.\n");
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
 		goto err_disable_pdev;
 	}
 
-	err = pci_request_region(pdev, 2, DRV_NAME);
-	if (err) {
-		dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n");
-		goto err_release_bar0;
-	}
-
 	pci_set_master(pdev);
 
 	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
@@ -1390,7 +3227,7 @@
 		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (err) {
 			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
-			goto err_release_bar2;
+			goto err_release_regions;
 		}
 	}
 	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
@@ -1401,20 +3238,24 @@
 		if (err) {
 			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
 				"aborting.\n");
-			goto err_release_bar2;
+			goto err_release_regions;
 		}
 	}
 
+	/* Allow large DMA segments, up to the firmware limit of 1 GB */
+	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
+
 	priv = kzalloc(sizeof *priv, GFP_KERNEL);
 	if (!priv) {
 		dev_err(&pdev->dev, "Device struct alloc failed, "
 			"aborting.\n");
 		err = -ENOMEM;
-		goto err_release_bar2;
+		goto err_release_regions;
 	}
 
 	dev       = &priv->dev;
 	dev->pdev = pdev;
+	INIT_LIST_HEAD(&priv->dev_list);
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
 
@@ -1422,48 +3263,163 @@
 
 	INIT_LIST_HEAD(&priv->pgdir_list);
 	mutex_init(&priv->pgdir_mutex);
-	for (i = 0; i < MLX4_MAX_PORTS; ++i)
-		priv->iboe_counter_index[i] = -1;
 
 	INIT_LIST_HEAD(&priv->bf_list);
 	mutex_init(&priv->bf_mutex);
 
-	/*
-	 * Now reset the HCA before we touch the PCI capabilities or
-	 * attempt a firmware command, since a boot ROM may have left
-	 * the HCA in an undefined state.
-	 */
-	err = mlx4_reset(dev);
-	if (err) {
-		mlx4_err(dev, "Failed to reset HCA, aborting.\n");
-		goto err_free_dev;
+	dev->rev_id = pdev->revision;
+	dev->numa_node = dev_to_node(&pdev->dev);
+	/* Detect if this device is a virtual function */
+	if (pci_dev_data & MLX4_PCI_DEV_IS_VF) {
+		/* When acting as PF, we normally skip VFs unless explicitly
+		 * requested to probe them. */
+		if (nvfs && extended_func_num(pdev) > prb_vf) {
+			mlx4_warn(dev, "Skipping virtual function: %d\n",
+				  extended_func_num(pdev));
+			err = -ENODEV;
+			goto err_free_dev;
+		}
+		mlx4_warn(dev, "Detected virtual function - running in slave mode\n");
+		dev->flags |= MLX4_FLAG_SLAVE;
+	} else {
+		/* We reset the device and enable SRIOV only for physical
+		 * devices.  Try to claim ownership on the device;
+		 * if already taken, skip -- do not allow multiple PFs */
+		err = mlx4_get_ownership(dev);
+		if (err) {
+			if (err < 0)
+				goto err_free_dev;
+			else {
+				mlx4_warn(dev, "Multiple PFs not yet supported."
+					  " Skipping PF.\n");
+				err = -EINVAL;
+				goto err_free_dev;
+			}
+		}
+
+		if (nvfs) {
+			mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", nvfs);
+			err = pci_enable_sriov(pdev, nvfs);
+			if (err) {
+				mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n",
+					 err);
+				err = 0;
+			} else {
+				mlx4_warn(dev, "Running in master mode\n");
+				dev->flags |= MLX4_FLAG_SRIOV |
+					      MLX4_FLAG_MASTER;
+				dev->num_vfs = nvfs;
+			}
+		}
+
+		atomic_set(&priv->opreq_count, 0);
+		INIT_WORK(&priv->opreq_task, mlx4_opreq_action);
+
+		/*
+		 * Now reset the HCA before we touch the PCI capabilities or
+		 * attempt a firmware command, since a boot ROM may have left
+		 * the HCA in an undefined state.
+		 */
+		err = mlx4_reset(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+			goto err_sriov;
+		}
 	}
 
-	if (mlx4_cmd_init(dev)) {
+slave_start:
+	err = mlx4_cmd_init(dev);
+	if (err) {
 		mlx4_err(dev, "Failed to init command interface, aborting.\n");
-		goto err_free_dev;
+		goto err_sriov;
 	}
 
+	/* In slave functions, the communication channel must be initialized
+	 * before posting commands. Also, init num_slaves before calling
+	 * mlx4_init_hca */
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_is_master(dev))
+			dev->num_slaves = MLX4_MAX_NUM_SLAVES;
+		else {
+			dev->num_slaves = 0;
+			err = mlx4_multi_func_init(dev);
+			if (err) {
+				mlx4_err(dev, "Failed to init slave mfunc"
+					 " interface, aborting.\n");
+				goto err_cmd;
+			}
+		}
+	}
+
 	err = mlx4_init_hca(dev);
-	if (err)
-		goto err_cmd;
+	if (err) {
+		if (err == -EACCES) {
+			/* Not the primary physical function;
+			 * running in slave mode */
+			mlx4_cmd_cleanup(dev);
+			dev->flags |= MLX4_FLAG_SLAVE;
+			dev->flags &= ~MLX4_FLAG_MASTER;
+			goto slave_start;
+		} else
+			goto err_mfunc;
+	}
 
+	/* In master functions, the communication channel must be initialized
+	 * after obtaining its address from fw */
+	if (mlx4_is_master(dev)) {
+		err = mlx4_multi_func_init(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to init master mfunc"
+				 "interface, aborting.\n");
+			goto err_close;
+		}
+	}
+
 	err = mlx4_alloc_eq_table(dev);
 	if (err)
-		goto err_close;
+		goto err_master_mfunc;
 
+	priv->msix_ctl.pool_bm = 0;
+	mutex_init(&priv->msix_ctl.pool_lock);
+
 	mlx4_enable_msi_x(dev);
 
+	/* no MSIX and no shared IRQ */
+	if (!dev->caps.num_comp_vectors && !dev->caps.comp_pool) {
+		err = -ENOSPC;
+		goto err_free_eq;
+	}
+
+	if ((mlx4_is_mfunc(dev)) &&
+	    !(dev->flags & MLX4_FLAG_MSI_X)) {
+		err = -ENOSYS;
+		mlx4_err(dev, "INTx is not supported in multi-function mode."
+			 " aborting.\n");
+		goto err_free_eq;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_steering(dev);
+		if (err)
+			goto err_free_eq;
+	}
+
 	err = mlx4_setup_hca(dev);
-	if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) {
+	if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) &&
+	    !mlx4_is_mfunc(dev)) {
 		dev->flags &= ~MLX4_FLAG_MSI_X;
+		dev->caps.num_comp_vectors = 1;
+		dev->caps.comp_pool	   = 0;
 		pci_disable_msix(pdev);
 		err = mlx4_setup_hca(dev);
 	}
 
 	if (err)
-		goto err_free_eq;
+		goto err_steer;
 
+	mlx4_init_quotas(dev);
+	mlx4_init_hca_info(dev);
+
 	for (port = 1; port <= dev->caps.num_ports; port++) {
 		err = mlx4_init_port_info(dev, port);
 		if (err)
@@ -1474,43 +3430,52 @@
 	if (err)
 		goto err_port;
 
-	err = mlx4_init_trigger(priv);
-	if (err)
-		goto err_register;
+	mlx4_request_modules(dev);
 
-	err = mlx4_sense_init(dev);
-	if (err)
-		goto err_trigger;
-
+	mlx4_sense_init(dev);
 	mlx4_start_sense(dev);
 
+	priv->pci_dev_data = pci_dev_data;
 	pci_set_drvdata(pdev, dev);
 
 	return 0;
 
-err_trigger:
-	device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
-err_register:
-	mlx4_unregister_device(dev);
 err_port:
 	for (--port; port >= 1; --port)
 		mlx4_cleanup_port_info(&priv->port[port]);
 
 	mlx4_cleanup_counters_table(dev);
-	mlx4_cleanup_mcg_table(dev);
 	mlx4_cleanup_qp_table(dev);
 	mlx4_cleanup_srq_table(dev);
 	mlx4_cleanup_cq_table(dev);
 	mlx4_cmd_use_polling(dev);
 	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mcg_table(dev);
 	mlx4_cleanup_mr_table(dev);
 	mlx4_cleanup_xrcd_table(dev);
 	mlx4_cleanup_pd_table(dev);
 	mlx4_cleanup_uar_table(dev);
 
+err_steer:
+	if (!mlx4_is_slave(dev))
+		mlx4_clear_steering(dev);
+
 err_free_eq:
 	mlx4_free_eq_table(dev);
 
+err_master_mfunc:
+	if (mlx4_is_master(dev)) {
+		mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY);
+		mlx4_multi_func_cleanup(dev);
+	}
+
+	if (mlx4_is_slave(dev)) {
+		kfree(dev->caps.qp0_tunnel);
+		kfree(dev->caps.qp0_proxy);
+		kfree(dev->caps.qp1_tunnel);
+		kfree(dev->caps.qp1_proxy);
+	}
+
 err_close:
 	if (dev->flags & MLX4_FLAG_MSI_X)
 		pci_disable_msix(pdev);
@@ -1517,18 +3482,26 @@
 
 	mlx4_close_hca(dev);
 
+err_mfunc:
+	if (mlx4_is_slave(dev))
+		mlx4_multi_func_cleanup(dev);
+
 err_cmd:
 	mlx4_cmd_cleanup(dev);
 
+err_sriov:
+	if (dev->flags & MLX4_FLAG_SRIOV)
+		pci_disable_sriov(pdev);
+
+	if (!mlx4_is_slave(dev))
+		mlx4_free_ownership(dev);
+
 err_free_dev:
 	kfree(priv);
 
-err_release_bar2:
-	pci_release_region(pdev, 2);
+err_release_regions:
+	pci_release_regions(pdev);
 
-err_release_bar0:
-	pci_release_region(pdev, 0);
-
 err_disable_pdev:
 	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
@@ -1538,14 +3511,8 @@
 static int __devinit mlx4_init_one(struct pci_dev *pdev,
 				   const struct pci_device_id *id)
 {
-	static int mlx4_version_printed;
-
-	if (!mlx4_version_printed) {
-		printk(KERN_INFO "%s", mlx4_version);
-		++mlx4_version_printed;
-	}
-
-	return __mlx4_init_one(pdev, id);
+	device_set_desc(pdev->dev.bsddev, mlx4_version);
+	return __mlx4_init_one(pdev, id->driver_data);
 }
 
 static void mlx4_remove_one(struct pci_dev *pdev)
@@ -1555,106 +3522,307 @@
 	int p;
 
 	if (dev) {
-		mlx4_sense_cleanup(dev);
+		/* in SR-IOV mode it is not allowed to unload the PF's
+		 * driver while there are live VFs */
+		if (mlx4_is_master(dev)) {
+			if (mlx4_how_many_lives_vf(dev))
+				mlx4_err(dev, "Removing PF when there are assigned VF's !!!\n");
+		}
+		mlx4_stop_sense(dev);
 		mlx4_unregister_device(dev);
-		device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
 
+		mlx4_cleanup_hca_info(&priv->hca_info);
 		for (p = 1; p <= dev->caps.num_ports; p++) {
 			mlx4_cleanup_port_info(&priv->port[p]);
 			mlx4_CLOSE_PORT(dev, p);
 		}
 
-                mlx4_cleanup_counters_table(dev);
-		mlx4_cleanup_mcg_table(dev);
+		if (mlx4_is_master(dev))
+			mlx4_free_resource_tracker(dev,
+						   RES_TR_FREE_SLAVES_ONLY);
+
+		mlx4_cleanup_counters_table(dev);
 		mlx4_cleanup_qp_table(dev);
 		mlx4_cleanup_srq_table(dev);
 		mlx4_cleanup_cq_table(dev);
 		mlx4_cmd_use_polling(dev);
 		mlx4_cleanup_eq_table(dev);
+		mlx4_cleanup_mcg_table(dev);
 		mlx4_cleanup_mr_table(dev);
 		mlx4_cleanup_xrcd_table(dev);
 		mlx4_cleanup_pd_table(dev);
 
+		if (mlx4_is_master(dev))
+			mlx4_free_resource_tracker(dev,
+						   RES_TR_FREE_STRUCTS_ONLY);
+
 		iounmap(priv->kar);
 		mlx4_uar_free(dev, &priv->driver_uar);
 		mlx4_cleanup_uar_table(dev);
+		if (!mlx4_is_slave(dev))
+			mlx4_clear_steering(dev);
 		mlx4_free_eq_table(dev);
+		if (mlx4_is_master(dev))
+			mlx4_multi_func_cleanup(dev);
 		mlx4_close_hca(dev);
+		if (mlx4_is_slave(dev))
+			mlx4_multi_func_cleanup(dev);
 		mlx4_cmd_cleanup(dev);
 
 		if (dev->flags & MLX4_FLAG_MSI_X)
 			pci_disable_msix(pdev);
+		if (dev->flags & MLX4_FLAG_SRIOV) {
+			mlx4_warn(dev, "Disabling SR-IOV\n");
+			pci_disable_sriov(pdev);
+		}
 
+		if (!mlx4_is_slave(dev))
+			mlx4_free_ownership(dev);
+
+		kfree(dev->caps.qp0_tunnel);
+		kfree(dev->caps.qp0_proxy);
+		kfree(dev->caps.qp1_tunnel);
+		kfree(dev->caps.qp1_proxy);
+
 		kfree(priv);
-		pci_release_region(pdev, 2);
-		pci_release_region(pdev, 0);
+		pci_release_regions(pdev);
 		pci_disable_device(pdev);
 		pci_set_drvdata(pdev, NULL);
 	}
 }
 
+static int restore_current_port_types(struct mlx4_dev *dev,
+				      enum mlx4_port_type *types,
+				      enum mlx4_port_type *poss_types)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err, i;
+
+	mlx4_stop_sense(dev);
+	mutex_lock(&priv->port_mutex);
+	for (i = 0; i < dev->caps.num_ports; i++)
+		dev->caps.possible_type[i + 1] = poss_types[i];
+	err = mlx4_change_port_types(dev, types);
+	mlx4_start_sense(dev);
+	mutex_unlock(&priv->port_mutex);
+	return err;
+}
+
 int mlx4_restart_one(struct pci_dev *pdev)
 {
+	struct mlx4_dev	 *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	enum mlx4_port_type curr_type[MLX4_MAX_PORTS];
+	enum mlx4_port_type poss_type[MLX4_MAX_PORTS];
+	int pci_dev_data, err, i;
+
+	pci_dev_data = priv->pci_dev_data;
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		curr_type[i] = dev->caps.port_type[i + 1];
+		poss_type[i] = dev->caps.possible_type[i + 1];
+	}
+
 	mlx4_remove_one(pdev);
-	return __mlx4_init_one(pdev, NULL);
+	err = __mlx4_init_one(pdev, pci_dev_data);
+	if (err)
+		return err;
+
+	dev = pci_get_drvdata(pdev);
+	err = restore_current_port_types(dev, curr_type, poss_type);
+	if (err)
+		mlx4_err(dev, "mlx4_restart_one: could not restore original port types (%d)\n",
+			 err);
+	return 0;
 }
 
-static struct pci_device_id mlx4_pci_table[] = {
-	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
-	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
-	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
-	{ PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
-	{ PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
-	{ PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
-	{ PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
-	{ PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
-	{ PCI_VDEVICE(MELLANOX, 0x1000) },
-	{ PCI_VDEVICE(MELLANOX, 0x1001) },
-	{ PCI_VDEVICE(MELLANOX, 0x1002) },
-	{ PCI_VDEVICE(MELLANOX, 0x1003) },
-	{ PCI_VDEVICE(MELLANOX, 0x1004) },
-	{ PCI_VDEVICE(MELLANOX, 0x1005) },
-	{ PCI_VDEVICE(MELLANOX, 0x1006) },
-	{ PCI_VDEVICE(MELLANOX, 0x1007) },
-	{ PCI_VDEVICE(MELLANOX, 0x1008) },
-	{ PCI_VDEVICE(MELLANOX, 0x1009) },
-	{ PCI_VDEVICE(MELLANOX, 0x100a) },
-	{ PCI_VDEVICE(MELLANOX, 0x100b) },
-	{ PCI_VDEVICE(MELLANOX, 0x100c) },
-	{ PCI_VDEVICE(MELLANOX, 0x100d) },
-	{ PCI_VDEVICE(MELLANOX, 0x100e) },
-	{ PCI_VDEVICE(MELLANOX, 0x100f) },
+static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = {
+	/* MT25408 "Hermon" SDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" DDR */
+	{ PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" QDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" DDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" QDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" EN 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25458 ConnectX EN 10GBASE-T 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x675a), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26468 ConnectX EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+	{ PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26478 ConnectX2 40GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25400 Family [ConnectX-2 Virtual Function] */
+	{ PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF },
+	/* MT27500 Family [ConnectX-3] */
+	{ PCI_VDEVICE(MELLANOX, 0x1003), 0 },
+	/* MT27500 Family [ConnectX-3 Virtual Function] */
+	{ PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF },
+	{ PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */
 	{ 0, }
 };
 
 MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
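
Each entry's second initializer lands in pci_device_id.driver_data and is
what mlx4_restart_one() above reads back as priv->pci_dev_data. As a
minimal sketch (assuming the conventional PCI probe signature; the actual
mlx4_init_one body is outside this hunk), the probe entry point would
simply forward it:

    static int mlx4_init_one(struct pci_dev *pdev,
                             const struct pci_device_id *id)
    {
            /* id->driver_data carries MLX4_PCI_DEV_FORCE_SENSE_PORT /
             * MLX4_PCI_DEV_IS_VF from mlx4_pci_table above */
            return __mlx4_init_one(pdev, id->driver_data);
    }
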
 
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	mlx4_remove_one(pdev);
+
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+	int ret = __mlx4_init_one(pdev, 0);
+
+	return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+static const struct pci_error_handlers mlx4_err_handler = {
+	.error_detected = mlx4_pci_err_detected,
+	.slot_reset     = mlx4_pci_slot_reset,
+};
+
+static int suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	mlx4_remove_one(pdev);
+
+	return 0;
+}
+
+static int resume(struct pci_dev *pdev)
+{
+	return __mlx4_init_one(pdev, 0);
+}
+
 static struct pci_driver mlx4_driver = {
 	.name		= DRV_NAME,
 	.id_table	= mlx4_pci_table,
 	.probe		= mlx4_init_one,
-	.remove		= __devexit_p(mlx4_remove_one)
+	.remove		= __devexit_p(mlx4_remove_one),
+	.suspend	= suspend,
+	.resume		= resume,
+	.err_handler    = &mlx4_err_handler,
 };
 
 static int __init mlx4_verify_params(void)
 {
+	int status;
+
+	status = update_defaults(&port_type_array);
+	if (status == INVALID_STR) {
+		if (mlx4_fill_dbdf2val_tbl(&port_type_array.dbdf2val))
+			return -1;
+	} else if (status == INVALID_DATA) {
+		return -1;
+	}
+
+	status = update_defaults(&num_vfs);
+	if (status == INVALID_STR) {
+		if (mlx4_fill_dbdf2val_tbl(&num_vfs.dbdf2val))
+			return -1;
+	} else if (status == INVALID_DATA) {
+		return -1;
+	}
+
+	status = update_defaults(&probe_vf);
+	if (status == INVALID_STR) {
+		if (mlx4_fill_dbdf2val_tbl(&probe_vf.dbdf2val))
+			return -1;
+	} else if (status == INVALID_DATA) {
+		return -1;
+	}
+
+	if (msi_x < 0) {
+		pr_warn("mlx4_core: bad msi_x: %d\n", msi_x);
+		return -1;
+	}
+
 	if ((log_num_mac < 0) || (log_num_mac > 7)) {
-		printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac);
+		pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac);
 		return -1;
 	}
 
-	if (log_mtts_per_seg == 0)
-		log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
-	if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) {
-		printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg);
+	if (log_num_vlan != 0)
+		pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n",
+			   MLX4_LOG_NUM_VLANS);
+
+	if (mlx4_set_4k_mtu != -1)
+		pr_warning("mlx4_core: set_4k_mtu - obsolete module param\n");
+
+	if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) {
+		pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg);
 		return -1;
 	}
 
+	if (mlx4_log_num_mgm_entry_size != -1 &&
+	    (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	     mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) {
+		pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not "
+			   "in legal range (-1 or %d..%d)\n",
+			   mlx4_log_num_mgm_entry_size,
+			   MLX4_MIN_MGM_LOG_ENTRY_SIZE,
+			   MLX4_MAX_MGM_LOG_ENTRY_SIZE);
+		return -1;
+	}
+
+	if (mod_param_profile.num_qp < 18 || mod_param_profile.num_qp > 23) {
+		pr_warning("mlx4_core: bad log_num_qp: %d\n",
+			   mod_param_profile.num_qp);
+		return -1;
+	}
+
+	if (mod_param_profile.num_srq < 10) {
+		pr_warning("mlx4_core: too low log_num_srq: %d\n",
+			   mod_param_profile.num_srq);
+		return -1;
+	}
+
+	if (mod_param_profile.num_cq < 10) {
+		pr_warning("mlx4_core: too low log_num_cq: %d\n",
+			   mod_param_profile.num_cq);
+		return -1;
+	}
+
+	if (mod_param_profile.num_mpt < 10) {
+		pr_warning("mlx4_core: too low log_num_mpt: %d\n",
+			   mod_param_profile.num_mpt);
+		return -1;
+	}
+
+	if (mod_param_profile.num_mtt_segs &&
+	    mod_param_profile.num_mtt_segs < 15) {
+		pr_warning("mlx4_core: too low log_num_mtt: %d\n",
+			   mod_param_profile.num_mtt_segs);
+		return -1;
+	}
+
+	if (mod_param_profile.num_mtt_segs > MLX4_MAX_LOG_NUM_MTT) {
+		pr_warning("mlx4_core: too high log_num_mtt: %d\n",
+			   mod_param_profile.num_mtt_segs);
+		return -1;
+	}
 	return 0;
 }
 
@@ -1662,8 +3830,6 @@
 {
 	int ret;
 
-	mutex_init(&drv_mutex);
-
 	if (mlx4_verify_params())
 		return -EINVAL;
 
@@ -1673,16 +3839,30 @@
 	if (!mlx4_wq)
 		return -ENOMEM;
 
+	if (enable_sys_tune)
+		sys_tune_init();
+
 	ret = pci_register_driver(&mlx4_driver);
-	return ret < 0 ? ret : 0;
+	if (ret < 0)
+		goto err;
+
+	return 0;
+
+err:
+	if (enable_sys_tune)
+		sys_tune_fini();
+
+	destroy_workqueue(mlx4_wq);
+
+	return ret;
 }
 
 static void __exit mlx4_cleanup(void)
 {
-	mutex_lock(&drv_mutex);
-	mlx4_config_cleanup();
+	if (enable_sys_tune)
+		sys_tune_fini();
+
 	pci_unregister_driver(&mlx4_driver);
-	mutex_unlock(&drv_mutex);
 	destroy_workqueue(mlx4_wq);
 }
 
@@ -1689,17 +3869,16 @@
 module_init_order(mlx4_init, SI_ORDER_MIDDLE);
 module_exit(mlx4_cleanup);
 
-#undef MODULE_VERSION
 #include <sys/module.h>
 static int
 mlx4_evhand(module_t mod, int event, void *arg)
 {
 	return (0);
 }
 
 static moduledata_t mlx4_mod = {
-	.name = "mlx4",
-	.evhand = mlx4_evhand,
+        .name = "mlx4",
+        .evhand = mlx4_evhand,
 };
 MODULE_VERSION(mlx4, 1);
-DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_SMP, SI_ORDER_ANY);
+DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY);

Modified: trunk/sys/ofed/drivers/net/mlx4/mcg.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/mcg.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/mcg.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,50 +31,88 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/etherdevice.h>
 
 #include <linux/mlx4/cmd.h>
-#include <linux/mlx4/driver.h>
+#include <linux/module.h>
+#include <linux/printk.h>
 
 #include "mlx4.h"
 
-#define MGM_QPN_MASK       0x00FFFFFF
-#define MGM_BLCK_LB_BIT    30
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev)
+{
+	return 1 << dev->oper_log_mgm_entry_size;
+}
 
-struct mlx4_mgm {
-	__be32			next_gid_index;
-	__be32			members_count;
-	u32			reserved[2];
-	u8			gid[16];
-	__be32			qp[MLX4_QP_PER_MGM];
-};
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev)
+{
+	return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2);
+}
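
To make the arithmetic concrete: an MGM entry is divided into 16-byte
lines, the first two of which hold the header and GID (see the struct
mlx4_mgm layout removed below), and each remaining line carries four
4-byte QPNs. With the default oper_log_mgm_entry_size of 10, the entry
size is 1 << 10 = 1024 bytes, so qp_per_mgm = 4 * (1024 / 16 - 2) =
4 * 62 = 248.
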
 
-static const u8 zero_gid[16];	/* automatically initialized to 0 */
+static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev,
+					struct mlx4_cmd_mailbox *mailbox,
+					u32 size,
+					u64 *reg_id)
+{
+	u64 imm;
+	int err = 0;
 
-static int mlx4_READ_MCG(struct mlx4_dev *dev, int index,
-			 struct mlx4_cmd_mailbox *mailbox)
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+	*reg_id = imm;
+
+	return err;
+}
+
+static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid)
 {
+	int err = 0;
+
+	err = mlx4_cmd(dev, regid, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	return err;
+}
+
+static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index,
+			   struct mlx4_cmd_mailbox *mailbox)
+{
 	return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG,
-			    MLX4_CMD_TIME_CLASS_A);
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 }
 
-static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index,
-			  struct mlx4_cmd_mailbox *mailbox)
+static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index,
+			    struct mlx4_cmd_mailbox *mailbox)
 {
 	return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG,
-			MLX4_CMD_TIME_CLASS_A);
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 }
 
-static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
-			  u16 *hash)
+static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer,
+			      struct mlx4_cmd_mailbox *mailbox)
 {
+	u32 in_mod;
+
+	in_mod = (u32) port << 16 | steer << 1;
+	return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1,
+			MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_NATIVE);
+}
+
+static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 u16 *hash, u8 op_mod)
+{
 	u64 imm;
 	int err;
 
-	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, 0, MLX4_CMD_MGID_HASH,
-			   MLX4_CMD_TIME_CLASS_A);
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod,
+			   MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
 
 	if (!err)
 		*hash = imm;
@@ -82,7 +120,538 @@
 	return err;
 }
 
+static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port,
+					      enum mlx4_steer_type steer,
+					      u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_promisc_qp *pqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		if (pqp->qpn == qpn)
+			return pqp;
+	}
+	/* not found */
+	return NULL;
+}
+
 /*
+ * Add new entry to steering data structure.
+ * All promisc QPs should be added as well
+ */
+static int new_steering_entry(struct mlx4_dev *dev, u8 port,
+			      enum mlx4_steer_type steer,
+			      unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	struct mlx4_steer_index *new_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp = NULL;
+	u32 prot;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL);
+	if (!new_entry)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&new_entry->duplicates);
+	new_entry->index = index;
+	list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]);
+
+	/* If the given qpn is also a promisc qp,
+	 * it should be inserted into the duplicates list
+	 */
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (pqp) {
+		dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+		if (!dqp) {
+			err = -ENOMEM;
+			goto out_alloc;
+		}
+		dqp->qpn = qpn;
+		list_add_tail(&dqp->list, &new_entry->duplicates);
+	}
+
+	/* if no promisc qps for this vep, we are done */
+	if (list_empty(&s_steer->promisc_qps[steer]))
+		return 0;
+
+	/* now need to add all the promisc qps to the new
+	 * steering entry, as they should also receive the packets
+	 * destined to this address */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	err = mlx4_READ_ENTRY(dev, index, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	prot = be32_to_cpu(mgm->members_count) >> 30;
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		/* don't add already existing qpn */
+		if (pqp->qpn == qpn)
+			continue;
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* out of space */
+			err = -ENOMEM;
+			goto out_mailbox;
+		}
+
+		/* add the qpn */
+		mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK);
+	}
+	/* update the qps count and update the entry with all the promisc qps */
+	mgm->members_count = cpu_to_be32(members_count | (prot << 30));
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (!err)
+		return 0;
+out_alloc:
+	if (dqp) {
+		list_del(&dqp->list);
+		kfree(dqp);
+	}
+	list_del(&new_entry->list);
+	kfree(new_entry);
+	return err;
+}
+
+/* update the data structures with existing steering entry */
+static int existing_steering_entry(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_steer_type steer,
+				   unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (!pqp)
+		return 0; /* nothing to do */
+
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry at index %x is not registered\n", index);
+		return -EINVAL;
+	}
+
+	/* the given qpn is listed as a promisc qpn;
+	 * we need to add it as a duplicate to this entry
+	 * for future reference */
+	list_for_each_entry(dqp, &entry->duplicates, list) {
+		if (qpn == dqp->qpn)
+			return 0; /* qp is already duplicated */
+	}
+
+	/* add the qp as a duplicate on this index */
+	dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+	if (!dqp)
+		return -ENOMEM;
+	dqp->qpn = qpn;
+	list_add_tail(&dqp->list, &entry->duplicates);
+
+	return 0;
+}
+
+/* Check whether a qpn is a duplicate on a steering entry.
+ * If so, it should not be removed from the mgm */
+static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port,
+				  enum mlx4_steer_type steer,
+				  unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *dqp, *tmp_dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return false;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	/* if qp is not promisc, it cannot be duplicated */
+	if (!get_promisc_qp(dev, port, steer, qpn))
+		return false;
+
+	/* The qp is a promisc qp, so it is a duplicate on this index.
+	 * Find the index entry, and remove the duplicate */
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry for index %x is not registered\n", index);
+		return false;
+	}
+	list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) {
+		if (dqp->qpn == qpn) {
+			list_del(&dqp->list);
+			kfree(dqp);
+		}
+	}
+	return true;
+}
+
+/*
+ * Returns true if all the QPs != tqpn contained in this entry
+ * are promisc QPs; returns false otherwise.
+ */
+static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port,
+				      enum mlx4_steer_type steer,
+				      unsigned int index, u32 tqpn, u32 *members_count)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 m_count;
+	bool ret = false;
+	int i;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return false;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return false;
+	mgm = mailbox->buf;
+
+	if (mlx4_READ_ENTRY(dev, index, mailbox))
+		goto out;
+	m_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	if (members_count)
+		*members_count = m_count;
+
+	for (i = 0;  i < m_count; i++) {
+		u32 qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK;
+		if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) {
+			/* the qp is not promisc, the entry can't be removed */
+			goto out;
+		}
+	}
+	ret = true;
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+/* If a steering entry contains only promisc QPs, it can be removed. */
+static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port,
+				      enum mlx4_steer_type steer,
+				      unsigned int index, u32 tqpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *entry = NULL, *tmp_entry;
+	u32 members_count;
+	bool ret = false;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return false;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	if (!promisc_steering_entry(dev, port, steer, index, tqpn, &members_count))
+		goto out;
+
+	/* All the qps currently registered for this entry are promiscuous;
+	 * check for duplicates */
+	ret = true;
+	list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (entry->index == index) {
+			if (list_empty(&entry->duplicates) || members_count == 1) {
+				struct mlx4_promisc_qp *pqp, *tmp_pqp;
+				/*
+				 * If there is only one entry in duplicates, then
+				 * this is the QP we want to delete; go over
+				 * the list and delete each entry.
+				 */
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			} else {
+				/* This entry contains duplicates so it shouldn't be removed */
+				ret = false;
+				goto out;
+			}
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int add_promisc_qp(struct mlx4_dev *dev, u8 port,
+			  enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	u32 prot;
+	int i;
+	bool found;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	if (get_promisc_qp(dev, port, steer, qpn)) {
+		err = 0;  /* Nothing to do, already exists */
+		goto out_mutex;
+	}
+
+	pqp = kmalloc(sizeof *pqp, GFP_KERNEL);
+	if (!pqp) {
+		err = -ENOMEM;
+		goto out_mutex;
+	}
+	pqp->qpn = qpn;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* the promisc qp needs to be added to each one of the steering
+		 * entries; if it already exists there, it needs to be added as
+		 * a duplicate for that entry */
+		list_for_each_entry(entry, &s_steer->steer_entries[steer], list) {
+			err = mlx4_READ_ENTRY(dev, entry->index, mailbox);
+			if (err)
+				goto out_mailbox;
+
+			members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+			prot = be32_to_cpu(mgm->members_count) >> 30;
+			found = false;
+			for (i = 0; i < members_count; i++) {
+				if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) {
+					/* Entry already exists, add to duplicates */
+					dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+					if (!dqp) {
+						err = -ENOMEM;
+						goto out_mailbox;
+					}
+					dqp->qpn = qpn;
+					list_add_tail(&dqp->list, &entry->duplicates);
+					found = true;
+				}
+			}
+			if (!found) {
+				/* Need to add the qpn to mgm */
+				if (members_count == dev->caps.num_qp_per_mgm) {
+					/* entry is full */
+					err = -ENOMEM;
+					goto out_mailbox;
+				}
+				mgm->qp[members_count++] = cpu_to_be32(qpn & MGM_QPN_MASK);
+				mgm->members_count = cpu_to_be32(members_count | (prot << 30));
+				err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox);
+				if (err)
+					goto out_mailbox;
+			}
+		}
+	}
+
+	/* add the new qpn to list of promisc qps */
+	list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	/* now need to add all the promisc qps to default entry */
+	memset(mgm, 0, sizeof *mgm);
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) {
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* entry is full */
+			err = -ENOMEM;
+			goto out_list;
+		}
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	}
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_list;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	mutex_unlock(&priv->mcg_table.mutex);
+	return 0;
+
+out_list:
+	list_del(&pqp->list);
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_alloc:
+	kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+static int remove_promisc_qp(struct mlx4_dev *dev, u8 port,
+			     enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	bool found;
+	bool back_to_list = false;
+	int i, loc = -1;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	mutex_lock(&priv->mcg_table.mutex);
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (unlikely(!pqp)) {
+		mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn);
+		/* nothing to do */
+		err = 0;
+		goto out_mutex;
+	}
+
+	/* remove from list of promisc qps */
+	list_del(&pqp->list);
+
+	/* set the default entry not to include the removed one */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		back_to_list = true;
+		goto out_list;
+	}
+	mgm = mailbox->buf;
+	memset(mgm, 0, sizeof *mgm);
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list)
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* remove the qp from all the steering entries*/
+		list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) {
+			found = false;
+			list_for_each_entry(dqp, &entry->duplicates, list) {
+				if (dqp->qpn == qpn) {
+					found = true;
+					break;
+				}
+			}
+			if (found) {
+				/* a duplicate, no need to change the mgm,
+				 * only update the duplicates list */
+				list_del(&dqp->list);
+				kfree(dqp);
+			} else {
+				err = mlx4_READ_ENTRY(dev, entry->index, mailbox);
+				if (err)
+					goto out_mailbox;
+				members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+				if (!members_count) {
+					mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0."
+					               " deleting entry...\n", qpn, entry->index);
+					list_del(&entry->list);
+					kfree(entry);
+					continue;
+				}
+
+				for (i = 0; i < members_count; ++i)
+					if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) {
+						loc = i;
+						break;
+					}
+
+				if (loc < 0) {
+					mlx4_err(dev, "QP %06x wasn't found in entry %d\n",
+						 qpn, entry->index);
+					err = -EINVAL;
+					goto out_mailbox;
+				}
+
+				/* copy the last QP in this MGM over removed QP */
+				mgm->qp[loc] = mgm->qp[members_count - 1];
+				mgm->qp[members_count - 1] = 0;
+				mgm->members_count = cpu_to_be32(--members_count |
+								 (MLX4_PROT_ETH << 30));
+
+				err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox);
+				if (err)
+					goto out_mailbox;
+			}
+		}
+	}
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_list:
+	if (back_to_list)
+		list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	else
+		kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+/*
  * Caller must hold MCG table semaphore.  gid and mgm parameters must
  * be properly aligned for command interface.
  *
@@ -97,15 +666,18 @@
  * If no AMGM exists for given gid, *index = -1, *prev = index of last
  * entry in hash chain and *mgm holds end of hash chain.
  */
-static int find_mgm(struct mlx4_dev *dev,
-		    u8 *gid, enum mlx4_mcast_prot prot,
-		    struct mlx4_cmd_mailbox *mgm_mailbox,
-		    u16 *hash, int *prev, int *index)
+static int find_entry(struct mlx4_dev *dev, u8 port,
+		      u8 *gid, enum mlx4_protocol prot,
+		      struct mlx4_cmd_mailbox *mgm_mailbox,
+		      int *prev, int *index)
 {
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_mgm *mgm = mgm_mailbox->buf;
 	u8 *mgid;
 	int err;
+	u16 hash;
+	u8 op_mod = (prot == MLX4_PROT_ETH) ?
+		!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -114,24 +686,26 @@
 
 	memcpy(mgid, gid, 16);
 
-	err = mlx4_MGID_HASH(dev, mailbox, hash);
+	err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod);
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	if (err)
 		return err;
 
-	if (0)
-		mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+	if (0) {
+		mlx4_dbg(dev, "Hash for "GID_PRINT_FMT" is %04x\n",
+		    GID_PRINT_ARGS(gid), hash);
+	}
 
-	*index = *hash;
+	*index = hash;
 	*prev  = -1;
 
 	do {
-		err = mlx4_READ_MCG(dev, *index, mgm_mailbox);
+		err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox);
 		if (err)
 			return err;
 
-		if (!memcmp(mgm->gid, zero_gid, 16)) {
-			if (*index != *hash) {
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			if (*index != hash) {
 				mlx4_err(dev, "Found zero MGID in AMGM.\n");
 				err = -EINVAL;
 			}
@@ -139,7 +713,7 @@
 		}
 
 		if (!memcmp(mgm->gid, gid, 16) &&
-				(prot == be32_to_cpu(mgm->members_count) >> 30))
+		    be32_to_cpu(mgm->members_count) >> 30 == prot)
 			return err;
 
 		*prev = *index;
@@ -150,18 +724,300 @@
 	return err;
 }
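
For orientation, the attach path below consumes find_entry's two outputs
roughly like this (a condensed sketch of mlx4_qp_attach_common; setup and
error handling omitted):

    err = find_entry(dev, port, gid, prot, mailbox, &prev, &index);
    if (err)
            goto out;
    if (index != -1) {
            /* hit: the mailbox holds the MGM entry for this gid (or a
             * free slot at the hash head when members_count == 0) */
    } else {
            /* miss: prev is the tail of the hash chain; allocate an
             * AMGM entry and link it in after prev */
    }
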
 
-int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  int block_mcast_loopback, enum mlx4_mcast_prot prot)
+static const u8 __promisc_mode[] = {
+	[MLX4_FS_REGULAR]   = 0x0,
+	[MLX4_FS_ALL_DEFAULT] = 0x1,
+	[MLX4_FS_MC_DEFAULT] = 0x3,
+	[MLX4_FS_UC_SNIFFER] = 0x4,
+	[MLX4_FS_MC_SNIFFER] = 0x5,
+};
+
+int map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+			       enum mlx4_net_trans_promisc_mode flow_type)
 {
+	if (flow_type >= MLX4_FS_MODE_NUM || flow_type < 0) {
+		mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type);
+		return -EINVAL;
+	}
+	return __promisc_mode[flow_type];
+}
+EXPORT_SYMBOL_GPL(map_sw_to_hw_steering_mode);
+
+static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
+				  struct mlx4_net_trans_rule_hw_ctrl *hw)
+{
+	u8 flags = 0;
+
+	flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
+	flags |= ctrl->exclusive ? (1 << 2) : 0;
+	flags |= ctrl->allow_loopback ? (1 << 3) : 0;
+
+	hw->flags = flags;
+	hw->type = __promisc_mode[ctrl->promisc_mode];
+	hw->prio = cpu_to_be16(ctrl->priority);
+	hw->port = ctrl->port;
+	hw->qpn = cpu_to_be32(ctrl->qpn);
+}
+
+const u16 __sw_id_hw[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH]     = 0xE001,
+	[MLX4_NET_TRANS_RULE_ID_IB]      = 0xE005,
+	[MLX4_NET_TRANS_RULE_ID_IPV6]    = 0xE003,
+	[MLX4_NET_TRANS_RULE_ID_IPV4]    = 0xE002,
+	[MLX4_NET_TRANS_RULE_ID_TCP]     = 0xE004,
+	[MLX4_NET_TRANS_RULE_ID_UDP]     = 0xE006
+};
+
+int map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+			     enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+	return __sw_id_hw[id];
+}
+EXPORT_SYMBOL_GPL(map_sw_to_hw_steering_id);
+
+static const int __rule_hw_sz[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH] =
+		sizeof(struct mlx4_net_trans_rule_hw_eth),
+	[MLX4_NET_TRANS_RULE_ID_IB] =
+		sizeof(struct mlx4_net_trans_rule_hw_ib),
+	[MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
+	[MLX4_NET_TRANS_RULE_ID_IPV4] =
+		sizeof(struct mlx4_net_trans_rule_hw_ipv4),
+	[MLX4_NET_TRANS_RULE_ID_TCP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+	[MLX4_NET_TRANS_RULE_ID_UDP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp)
+};
+
+int hw_rule_sz(struct mlx4_dev *dev,
+	       enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[id];
+}
+EXPORT_SYMBOL_GPL(hw_rule_sz);
+
+static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
+			    struct _rule_hw *rule_hw)
+{
+	if (hw_rule_sz(dev, spec->id) < 0)
+		return -EINVAL;
+	memset(rule_hw, 0, hw_rule_sz(dev, spec->id));
+	rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]);
+	rule_hw->size = hw_rule_sz(dev, spec->id) >> 2;
+
+	switch (spec->id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk,
+		       ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk,
+		       ETH_ALEN);
+		if (spec->eth.ether_type_enable) {
+			rule_hw->eth.ether_type_enable = 1;
+			rule_hw->eth.ether_type = spec->eth.ether_type;
+		}
+		rule_hw->eth.vlan_tag = spec->eth.vlan_id;
+		rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		rule_hw->ib.l3_qpn = spec->ib.l3_qpn;
+		rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
+		memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
+		memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV6:
+		return -EOPNOTSUPP;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		rule_hw->ipv4.src_ip = spec->ipv4.src_ip;
+		rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk;
+		rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip;
+		rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port;
+		rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk;
+		rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port;
+		rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[spec->id];
+}
+
+static void mlx4_err_rule(struct mlx4_dev *dev, char *str,
+			  struct mlx4_net_trans_rule *rule)
+{
+#define BUF_SIZE 256
+	struct mlx4_spec_list *cur;
+	char buf[BUF_SIZE];
+	int len = 0;
+
+	mlx4_err(dev, "%s", str);
+	len += snprintf(buf + len, BUF_SIZE - len,
+			"port = %d prio = 0x%x qp = 0x%x ",
+			rule->port, rule->priority, rule->qpn);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		switch (cur->id) {
+		case MLX4_NET_TRANS_RULE_ID_ETH:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dmac = %pM ", &cur->eth.dst_mac);
+			if (cur->eth.ether_type)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"ethertype = 0x%x ",
+						be16_to_cpu(cur->eth.ether_type));
+			if (cur->eth.vlan_id)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"vlan-id = %d ",
+						be16_to_cpu(cur->eth.vlan_id));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV4:
+			if (cur->ipv4.src_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-ip = %pI4 ",
+						&cur->ipv4.src_ip);
+			if (cur->ipv4.dst_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-ip = %pI4 ",
+						&cur->ipv4.dst_ip);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_TCP:
+		case MLX4_NET_TRANS_RULE_ID_UDP:
+			if (cur->tcp_udp.src_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-port = %d ",
+						be16_to_cpu(cur->tcp_udp.src_port));
+			if (cur->tcp_udp.dst_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-port = %d ",
+						be16_to_cpu(cur->tcp_udp.dst_port));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IB:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid = "GID_PRINT_FMT"\n",
+					GID_PRINT_ARGS(cur->ib.dst_gid));
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid-mask = "GID_PRINT_FMT"\n",
+					GID_PRINT_ARGS(cur->ib.dst_gid_msk));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV6:
+			break;
+
+		default:
+			break;
+		}
+	}
+	len += snprintf(buf + len, BUF_SIZE - len, "\n");
+	mlx4_err(dev, "%s", buf);
+
+	if (len >= BUF_SIZE)
+		mlx4_err(dev, "Network rule error message was truncated, print buffer is too small.\n");
+}
+
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_spec_list *cur;
+	u32 size = 0;
+	int ret;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, sizeof(struct mlx4_net_trans_rule_hw_ctrl));
+	trans_rule_ctrl_to_hw(rule, mailbox->buf);
+
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		ret = parse_trans_rule(dev, cur, mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(dev, mailbox);
+			return -EINVAL;
+		}
+		size += ret;
+	}
+
+	ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id);
+	if (ret == -ENOMEM)
+		mlx4_err_rule(dev,
+			      "mcg table is full. Fail to register network rule.\n",
+			      rule);
+	else if (ret)
+		mlx4_err_rule(dev, "Fail to register network rule.\n", rule);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_attach);
+
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+
+	err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id);
+	if (err)
+		mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n",
+			 (unsigned long long)reg_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_detach);
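
A caller-side sketch of the attach/detach pair: fill in the rule header,
chain one or more mlx4_spec_list entries onto rule.list, attach, and keep
reg_id for the eventual detach. The dev/port/qpn/mac variables here are
placeholders; mlx4_trans_to_dmfs_attach() further down is the in-tree
instance of the same pattern:

    struct mlx4_spec_list spec = { .id = MLX4_NET_TRANS_RULE_ID_ETH };
    struct mlx4_net_trans_rule rule = {
            .queue_mode   = MLX4_NET_TRANS_Q_FIFO,
            .promisc_mode = MLX4_FS_REGULAR,
            .priority     = MLX4_DOMAIN_NIC,
            .allow_loopback = 1,
            .port = port,
            .qpn  = qpn,
    };
    u64 reg_id;
    int err;

    INIT_LIST_HEAD(&rule.list);
    memcpy(spec.eth.dst_mac, mac, ETH_ALEN);
    memset(spec.eth.dst_mac_msk, 0xff, ETH_ALEN);
    list_add_tail(&spec.list, &rule.list);

    err = mlx4_flow_attach(dev, &rule, &reg_id);
    if (err)
            return err;
    /* ... later ... */
    mlx4_flow_detach(dev, reg_id);
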
+
+int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn)
+{
+	int err;
+	u64 in_param;
+
+	in_param = ((u64) min_range_qpn) << 32;
+	in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF;
+
+	err = mlx4_cmd(dev, in_param, 0, 0,
+			MLX4_FLOW_STEERING_IB_UC_QP_RANGE,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE);
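
The packing puts the minimum QPN in the high dword and the maximum in the
low one; for example, min_range_qpn = 0x40 and max_range_qpn = 0x7f yield
in_param = 0x000000400000007f.
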
+
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer)
+{
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_mgm *mgm;
 	u32 members_count;
-	u16 hash;
 	int index, prev;
 	int link = 0;
 	int i;
 	int err;
+	u8 port = gid[5];
+	u8 new_entry = 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -169,14 +1025,16 @@
 	mgm = mailbox->buf;
 
 	mutex_lock(&priv->mcg_table.mutex);
-
-	err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
 	if (err)
 		goto out;
 
 	if (index != -1) {
-		if (!memcmp(mgm->gid, zero_gid, 16))
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			new_entry = 1;
 			memcpy(mgm->gid, gid, 16);
+		}
 	} else {
 		link = 1;
 
@@ -188,12 +1046,13 @@
 		}
 		index += dev->caps.num_mgms;
 
+		new_entry = 1;
 		memset(mgm, 0, sizeof *mgm);
 		memcpy(mgm->gid, gid, 16);
 	}
 
 	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
-	if (members_count == MLX4_QP_PER_MGM) {
+	if (members_count == dev->caps.num_qp_per_mgm) {
 		mlx4_err(dev, "MGM at index %x is full.\n", index);
 		err = -ENOMEM;
 		goto out;
@@ -209,25 +1068,36 @@
 	mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
 					       (!!mlx4_blck_lb << MGM_BLCK_LB_BIT));
 
-	mgm->members_count = cpu_to_be32(members_count | ((u32) prot << 30));
+	mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30);
 
-	err = mlx4_WRITE_MCG(dev, index, mailbox);
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
 	if (err)
 		goto out;
 
+	/* if !link, the entry is already in the hash chain; skip the relink */
 	if (!link)
-		goto out;
+		goto skip_link;
 
-	err = mlx4_READ_MCG(dev, prev, mailbox);
+	err = mlx4_READ_ENTRY(dev, prev, mailbox);
 	if (err)
 		goto out;
 
 	mgm->next_gid_index = cpu_to_be32(index << 6);
 
-	err = mlx4_WRITE_MCG(dev, prev, mailbox);
+	err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
 	if (err)
 		goto out;
 
+skip_link:
+	if (prot == MLX4_PROT_ETH) {
+		/* manage the steering entry for promisc mode */
+		if (new_entry)
+			new_steering_entry(dev, port, steer, index, qp->qpn);
+		else
+			existing_steering_entry(dev, port, steer,
+						index, qp->qpn);
+	}
+
 out:
 	if (err && link && index != -1) {
 		if (index < dev->caps.num_mgms)
@@ -235,7 +1105,7 @@
 				  index, dev->caps.num_mgms);
 		else
 			mlx4_bitmap_free(&priv->mcg_table.bitmap,
-					 index - dev->caps.num_mgms);
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
 	}
 	mutex_unlock(&priv->mcg_table.mutex);
 
@@ -242,19 +1112,19 @@
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
-EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
 
-int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-						enum mlx4_mcast_prot prot)
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_mgm *mgm;
 	u32 members_count;
-	u16 hash;
 	int prev, index;
-	int i, loc;
+	int i, loc = -1;
 	int err;
+	u8 port = gid[5];
+	bool removed_entry = false;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -263,20 +1133,33 @@
 
 	mutex_lock(&priv->mcg_table.mutex);
 
-	err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
 	if (err)
 		goto out;
 
 	if (index == -1) {
-		mlx4_err(dev, "MGID %pI6 not found\n", gid);
+		mlx4_err(dev, "MGID "GID_PRINT_FMT" not found\n",
+		    GID_PRINT_ARGS(gid));
 		err = -EINVAL;
 		goto out;
 	}
 
+	/*
+	 * If this QP is also a promisc QP, don't remove it from the MGM
+	 * entry as long as at least one non-promisc QP is still attached
+	 * to this MCG.
+	 */
+	if (prot == MLX4_PROT_ETH &&
+	    check_duplicate_entry(dev, port, steer, index, qp->qpn) &&
+	    !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL))
+		goto out;
+
 	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
-	for (loc = -1, i = 0; i < members_count; ++i)
-		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn)
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
 			loc = i;
+			break;
+		}
 
 	if (loc == -1) {
 		mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
@@ -284,27 +1167,33 @@
 		goto out;
 	}
 
+	/* copy the last QP in this MGM over removed QP */
+	mgm->qp[loc] = mgm->qp[members_count - 1];
+	mgm->qp[members_count - 1] = 0;
+	mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30);
 
-	mgm->members_count = cpu_to_be32(--members_count | ((u32) prot << 30));
-	mgm->qp[loc]       = mgm->qp[i - 1];
-	mgm->qp[i - 1]     = 0;
-
-	if (i != 1) {
-		err = mlx4_WRITE_MCG(dev, index, mailbox);
+	if (prot == MLX4_PROT_ETH)
+		removed_entry = can_remove_steering_entry(dev, port, steer,
+								index, qp->qpn);
+	if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) {
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
 		goto out;
 	}
 
+	/* We are going to delete the entry, members count should be 0 */
+	mgm->members_count = cpu_to_be32((u32) prot << 30);
+
 	if (prev == -1) {
 		/* Remove entry from MGM */
 		int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
 		if (amgm_index) {
-			err = mlx4_READ_MCG(dev, amgm_index, mailbox);
+			err = mlx4_READ_ENTRY(dev, amgm_index, mailbox);
 			if (err)
 				goto out;
 		} else
 			memset(mgm->gid, 0, 16);
 
-		err = mlx4_WRITE_MCG(dev, index, mailbox);
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
 		if (err)
 			goto out;
 
@@ -314,18 +1203,18 @@
 					  index, amgm_index, dev->caps.num_mgms);
 			else
 				mlx4_bitmap_free(&priv->mcg_table.bitmap,
-						 amgm_index - dev->caps.num_mgms);
+						 amgm_index - dev->caps.num_mgms, MLX4_USE_RR);
 		}
 	} else {
 		/* Remove entry from AMGM */
 		int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
-		err = mlx4_READ_MCG(dev, prev, mailbox);
+		err = mlx4_READ_ENTRY(dev, prev, mailbox);
 		if (err)
 			goto out;
 
 		mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
 
-		err = mlx4_WRITE_MCG(dev, prev, mailbox);
+		err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
 		if (err)
 			goto out;
 
@@ -334,7 +1223,7 @@
 				  prev, index, dev->caps.num_mgms);
 		else
 			mlx4_bitmap_free(&priv->mcg_table.bitmap,
-					 index - dev->caps.num_mgms);
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
 	}
 
 out:
@@ -343,13 +1232,299 @@
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
+
+static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			  u8 gid[16], u8 attach, u8 block_loopback,
+			  enum mlx4_protocol prot)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = 0;
+	int qpn;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EBADF;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, gid, 16);
+	qpn = qp->qpn;
+	qpn |= (prot << 28);
+	if (attach && block_loopback)
+		qpn |= (1 << 31);
+
+	err = mlx4_cmd(dev, mailbox->dma, qpn, attach,
+		       MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id)
+{
+	struct mlx4_spec_list spec = { {NULL} };
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.promisc_mode = MLX4_FS_REGULAR,
+		.priority = MLX4_DOMAIN_NIC,
+	};
+
+	rule.allow_loopback = !block_mcast_loopback;
+	rule.port = port;
+	rule.qpn = qp->qpn;
+	INIT_LIST_HEAD(&rule.list);
+
+	switch (prot) {
+	case MLX4_PROT_ETH:
+		spec.id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN);
+		memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+		break;
+
+	case MLX4_PROT_IB_IPV6:
+		spec.id = MLX4_NET_TRANS_RULE_ID_IB;
+		memcpy(spec.ib.dst_gid, gid, 16);
+		memset(&spec.ib.dst_gid_msk, 0xff, 16);
+		break;
+	default:
+		return -EINVAL;
+	}
+	list_add_tail(&spec.list, &rule.list);
+
+	return mlx4_flow_attach(dev, &rule, reg_id);
+}
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol prot, u64 *reg_id)
+{
+	enum mlx4_steer_type steer;
+	steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+
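+		/* fall through */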
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (steer << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					      block_mcast_loopback, prot);
+		return mlx4_qp_attach_common(dev, qp, gid,
+					     block_mcast_loopback, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, port,
+						 block_mcast_loopback,
+						 prot, reg_id);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
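
For Ethernet, the gid argument does double duty: the common code reads
the port back out of gid[5], ORs the steering type into gid[7], and
treats gid[10..15] as the MAC address (hence the is_valid_ether_addr()
check above and the &gid[10] copies in mlx4_trans_to_dmfs_attach()). A
sketch of how a caller might pack it, with mac/port/qp/reg_id as
placeholders:

    u8 gid[16] = { 0 };

    gid[5] = port;                   /* read back as port = gid[5] */
    memcpy(&gid[10], mac, ETH_ALEN); /* MAC selects UC vs MC steering */
    err = mlx4_multicast_attach(dev, &qp, gid, port, 0,
                                MLX4_PROT_ETH, &reg_id);
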
+
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, u64 reg_id)
+{
+	enum mlx4_steer_type steer;
+	steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+
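+		/* fall through */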
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (steer << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+		return mlx4_qp_detach_common(dev, qp, gid, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+
+	default:
+		return -EINVAL;
+	}
+}
 EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
 
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
+				u32 qpn, enum mlx4_net_trans_promisc_mode mode)
+{
+	struct mlx4_net_trans_rule rule;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p != 0)
+		return -1;
+
+	rule.promisc_mode = mode;
+	rule.port = port;
+	rule.qpn = qpn;
+	INIT_LIST_HEAD(&rule.list);
+	mlx4_err(dev, "going promisc on %x\n", port);
+
+	return  mlx4_flow_attach(dev, &rule, regid_p);
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add);
+
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode)
+{
+	int ret;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p == 0)
+		return -1;
+
+	ret =  mlx4_flow_detach(dev, *regid_p);
+	if (ret == 0)
+		*regid_p = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove);
+
+int mlx4_unicast_attach(struct mlx4_dev *dev,
+			struct mlx4_qp *qp, u8 gid[16],
+			int block_mcast_loopback, enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					block_mcast_loopback, prot);
+
+	return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback,
+					prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_attach);
+
+int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			       u8 gid[16], enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+	return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_detach);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	u32 qpn = (u32) vhcr->in_param & 0xffffffff;
+	u8 port = vhcr->in_param >> 62;
+	enum mlx4_steer_type steer = vhcr->in_modifier;
+
+	/* Promiscuous unicast is not allowed in mfunc for VFs */
+	if ((slave != dev->caps.function) && (steer == MLX4_UC_STEER))
+		return 0;
+
+	if (vhcr->op_modifier)
+		return add_promisc_qp(dev, port, steer, qpn);
+	else
+		return remove_promisc_qp(dev, port, steer, qpn);
+}
+
+static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn,
+			enum mlx4_steer_type steer, u8 add, u8 port)
+{
+	return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add,
+			MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add);
+
+int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove);
+
+int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add);
+
+int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove);
+
 int mlx4_init_mcg_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
 
+	/* No need for the mcg_table bitmap when FW manages the MCG table */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return 0;
 	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
 			       dev->caps.num_amgms - 1, 0, 0);
 	if (err)
@@ -362,5 +1537,7 @@
 
 void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
 {
-	mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
 }

Modified: trunk/sys/ofed/drivers/net/mlx4/mlx4.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/mlx4.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/mlx4.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,7 +2,7 @@
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -39,28 +39,60 @@
 
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/timer.h>
+#include <linux/semaphore.h>
 #include <linux/workqueue.h>
-
+#include <linux/device.h>
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
 
 #define DRV_NAME	"mlx4_core"
 #define PFX		DRV_NAME ": "
-#define DRV_VERSION	"1.0-ofed1.5.2"
-#define DRV_RELDATE	"August 4, 2010"
+#define DRV_VERSION	"2.1.6"
+#define DRV_RELDATE	__DATE__
 
+#define DRV_STACK_NAME		"Linux-MLNX_OFED"
+#define DRV_STACK_VERSION	"2.1"
+#define DRV_NAME_FOR_FW		DRV_STACK_NAME","DRV_STACK_VERSION
+
+#define MLX4_FS_UDP_UC_EN		(1 << 1)
+#define MLX4_FS_TCP_UC_EN		(1 << 2)
+#define MLX4_FS_NUM_OF_L2_ADDR		8
+#define MLX4_FS_MGM_LOG_ENTRY_SIZE	7
+#define MLX4_FS_NUM_MCG			(1 << 17)
+
+struct mlx4_set_port_prio2tc_context {
+	u8 prio2tc[4];
+};
+
+struct mlx4_port_scheduler_tc_cfg_be {
+	__be16 pg;
+	__be16 bw_precentage;
+	__be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */
+	__be16 max_bw_value;
+};
+
+struct mlx4_set_port_scheduler_context {
+	struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC];
+};
+
 enum {
 	MLX4_HCR_BASE		= 0x80680,
 	MLX4_HCR_SIZE		= 0x0001c,
-	MLX4_CLR_INT_SIZE	= 0x00008
+	MLX4_CLR_INT_SIZE	= 0x00008,
+	MLX4_SLAVE_COMM_BASE	= 0x0,
+	MLX4_COMM_PAGESIZE	= 0x1000,
+	MLX4_CLOCK_SIZE		= 0x00008
 };
 
 enum {
-	MLX4_MGM_ENTRY_SIZE	=  0x100,
-	MLX4_QP_PER_MGM		= 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2),
-	MLX4_MTT_ENTRY_PER_SEG	= 8
+	MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10,
+	MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7,
+	MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12,
+	MLX4_MAX_QP_PER_MGM	= 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE)/16 - 2),
 };
 
 enum {
@@ -80,6 +112,107 @@
 	MLX4_NUM_CMPTS		= MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
 };
 
+enum mlx4_mpt_state {
+	MLX4_MPT_DISABLED = 0,
+	MLX4_MPT_EN_HW,
+	MLX4_MPT_EN_SW
+};
+
+#define MLX4_COMM_TIME		10000
+enum {
+	MLX4_COMM_CMD_RESET,
+	MLX4_COMM_CMD_VHCR0,
+	MLX4_COMM_CMD_VHCR1,
+	MLX4_COMM_CMD_VHCR2,
+	MLX4_COMM_CMD_VHCR_EN,
+	MLX4_COMM_CMD_VHCR_POST,
+	MLX4_COMM_CMD_FLR = 254
+};
+
+/* The flag indicates that the slave should delay the RESET cmd */
+#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb
+/* indicates how many retries will be done if we are in the middle of an FLR */
+#define NUM_OF_RESET_RETRIES	10
+#define SLEEP_TIME_IN_RESET	(2 * 1000)
+enum mlx4_resource {
+	RES_QP,
+	RES_CQ,
+	RES_SRQ,
+	RES_XRCD,
+	RES_MPT,
+	RES_MTT,
+	RES_MAC,
+	RES_VLAN,
+	RES_NPORT_ID,
+	RES_COUNTER,
+	RES_FS_RULE,
+	RES_EQ,
+	MLX4_NUM_OF_RESOURCE_TYPE
+};
+
+enum mlx4_alloc_mode {
+	RES_OP_RESERVE,
+	RES_OP_RESERVE_AND_MAP,
+	RES_OP_MAP_ICM,
+};
+
+enum mlx4_res_tracker_free_type {
+	RES_TR_FREE_ALL,
+	RES_TR_FREE_SLAVES_ONLY,
+	RES_TR_FREE_STRUCTS_ONLY,
+};
+
+/*
+ * Virtual HCR structures.
+ * mlx4_vhcr is the sw representation, in machine endianness.
+ *
+ * mlx4_vhcr_cmd is the formalized structure, the one that is passed
+ * to FW to go through the communication channel.
+ * It is big endian, and has the same structure as the physical HCR
+ * used by the command interface.
+ */
+struct mlx4_vhcr {
+	u64	in_param;
+	u64	out_param;
+	u32	in_modifier;
+	u32	errno;
+	u16	op;
+	u16	token;
+	u8	op_modifier;
+	u8	e_bit;
+};
+
+struct mlx4_vhcr_cmd {
+	__be64 in_param;
+	__be32 in_modifier;
+	u32 reserved1;
+	__be64 out_param;
+	__be16 token;
+	u16 reserved;
+	u8 status;
+	u8 flags;
+	__be16 opcode;
+} __packed;
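
A sketch of the machine-to-wire direction implied by the comment above;
vhcr_to_wire is a hypothetical helper (the real conversion lives in the
command-channel code, outside this hunk), and the exact packing of
op_modifier into the opcode word is omitted here:

    static void vhcr_to_wire(const struct mlx4_vhcr *vhcr,
                             struct mlx4_vhcr_cmd *cmd)
    {
            cmd->in_param    = cpu_to_be64(vhcr->in_param);
            cmd->out_param   = cpu_to_be64(vhcr->out_param);
            cmd->in_modifier = cpu_to_be32(vhcr->in_modifier);
            cmd->token       = cpu_to_be16(vhcr->token);
            cmd->opcode      = cpu_to_be16(vhcr->op);
            /* single-byte fields (status, flags) need no swapping */
    }
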
+
+struct mlx4_cmd_info {
+	u16 opcode;
+	bool has_inbox;
+	bool has_outbox;
+	bool out_is_imm;
+	bool encode_slave_id;
+	bool skip_err_print;
+	int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		      struct mlx4_cmd_mailbox *inbox);
+	int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		       struct mlx4_cmd_mailbox *inbox,
+		       struct mlx4_cmd_mailbox *outbox,
+		       struct mlx4_cmd_info *cmd);
+};
+
+enum {
+	MLX4_DEBUG_MASK_CMD_TIME = 0x100,
+};
+
 #ifdef CONFIG_MLX4_DEBUG
 extern int mlx4_debug_level;
 #else /* CONFIG_MLX4_DEBUG */
@@ -87,20 +220,26 @@
 #endif /* CONFIG_MLX4_DEBUG */
 
 #define mlx4_dbg(mdev, format, arg...)					\
-	do {								\
-		if (mlx4_debug_level)					\
-			dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
-	} while (0)
+do {									\
+	if (mlx4_debug_level)						\
+		dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ##arg); \
+} while (0)
 
 #define mlx4_err(mdev, format, arg...) \
-	dev_err(&mdev->pdev->dev, format, ## arg)
+	dev_err(&mdev->pdev->dev, format, ##arg)
 #define mlx4_info(mdev, format, arg...) \
-	dev_info(&mdev->pdev->dev, format, ## arg)
+	dev_info(&mdev->pdev->dev, format, ##arg)
 #define mlx4_warn(mdev, format, arg...) \
-	dev_warn(&mdev->pdev->dev, format, ## arg)
+	dev_warn(&mdev->pdev->dev, format, ##arg)
 
+extern int mlx4_log_num_mgm_entry_size;
+extern int log_mtts_per_seg;
 extern int mlx4_blck_lb;
+extern int mlx4_set_4k_mtu;
 
+#define MLX4_MAX_NUM_SLAVES	(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF)
+#define ALL_SLAVES 0xff
+
 struct mlx4_bitmap {
 	u32			last;
 	u32			top;
@@ -115,7 +254,7 @@
 struct mlx4_buddy {
 	unsigned long	      **bits;
 	unsigned int	       *num_free;
-	int			max_order;
+	u32			max_order;
 	spinlock_t		lock;
 };
 
@@ -124,7 +263,7 @@
 struct mlx4_icm_table {
 	u64			virt;
 	int			num_icm;
-	int			num_obj;
+	u32			num_obj;
 	int			obj_size;
 	int			lowmem;
 	int			coherent;
@@ -132,6 +271,107 @@
 	struct mlx4_icm	      **icm;
 };
 
+#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
+#define MLX4_MPT_FLAG_MIO	    (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
+#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
+
+#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
+#define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
+#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
+
+#define MLX4_MPT_QP_FLAG_BOUND_QP   (1 << 7)
+
+#define MLX4_MPT_STATUS_SW		0xF0
+#define MLX4_MPT_STATUS_HW		0x00
+
+/*
+ * Must be packed because mtt_addr is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+	__be32 flags;
+	__be32 qpn;
+	__be32 key;
+	__be32 pd_flags;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 win_cnt;
+	u8	reserved1[3];
+	u8	mtt_rep;
+	__be64 mtt_addr;
+	__be32 mtt_sz;
+	__be32 entity_size;
+	__be32 first_byte_offset;
+} __packed;
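
(For illustration, a sketch of how the MPT flag masks above combine
into the big-endian flags word of struct mlx4_mpt_entry; the particular
flag combination is an example, not taken from this changeset.)

	static void mpt_flags_example(struct mlx4_mpt_entry *mpt)
	{
		/* a memory region, MIO set, physically addressed */
		mpt->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO |
					 MLX4_MPT_FLAG_REGION |
					 MLX4_MPT_FLAG_PHYSICAL);
	}
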
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_eq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	u8			log_eq_size;
+	u8			reserved2[4];
+	u8			eq_period;
+	u8			reserved3;
+	u8			eq_max_count;
+	u8			reserved4[3];
+	u8			intr;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved6[2];
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved7[4];
+};
+
+struct mlx4_cq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	__be32			logsize_usrpage;
+	__be16			cq_period;
+	__be16			cq_max_count;
+	u8			reserved2[3];
+	u8			comp_eqn;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved4[2];
+	__be64			db_rec_addr;
+};
+
+struct mlx4_srq_context {
+	__be32			state_logsize_srqn;
+	u8			logstride;
+	u8			reserved1;
+	__be16			xrcd;
+	__be32			pg_offset_cqn;
+	u32			reserved2;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			pd;
+	__be16			limit_watermark;
+	__be16			wqe_cnt;
+	u16			reserved4;
+	__be16			wqe_counter;
+	u32			reserved5;
+	__be64			db_rec_addr;
+};
+
 struct mlx4_eq {
 	struct mlx4_dev	       *dev;
 	void __iomem	       *doorbell;
@@ -140,11 +380,21 @@
 	u16			irq;
 	u16			have_irq;
 	int			nent;
-	int			load;
 	struct mlx4_buf_list   *page_list;
 	struct mlx4_mtt		mtt;
 };
 
+struct mlx4_slave_eqe {
+	u8 type;
+	u8 port;
+	u32 param;
+};
+
+struct mlx4_slave_event_eq_info {
+	int eqn;
+	u16 token;
+};
+
 struct mlx4_profile {
 	int			num_qp;
 	int			rdmarc_per_qp;
@@ -152,12 +402,14 @@
 	int			num_cq;
 	int			num_mcg;
 	int			num_mpt;
-	int			num_mtt;
+	unsigned		num_mtt_segs;
 };
 
 struct mlx4_fw {
 	u64			clr_int_base;
 	u64			catas_offset;
+	u64			comm_base;
+	u64			clock_offset;
 	struct mlx4_icm	       *fw_icm;
 	struct mlx4_icm	       *aux_icm;
 	u32			catas_size;
@@ -164,12 +416,184 @@
 	u16			fw_pages;
 	u8			clr_int_bar;
 	u8			catas_bar;
+	u8			comm_bar;
+	u8			clock_bar;
 };
 
+struct mlx4_comm {
+	u32			slave_write;
+	u32			slave_read;
+};
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+#define VLAN_FLTR_SIZE	128
+
+struct mlx4_vlan_fltr {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+struct mlx4_mcast_entry {
+	struct list_head list;
+	u64 addr;
+};
+
+struct mlx4_promisc_qp {
+	struct list_head list;
+	u32 qpn;
+};
+
+struct mlx4_steer_index {
+	struct list_head list;
+	unsigned int index;
+	struct list_head duplicates;
+};
+
+#define MLX4_EVENT_TYPES_NUM 64
+
+struct mlx4_slave_state {
+	u8 comm_toggle;
+	u8 last_cmd;
+	u8 init_port_mask;
+	bool active;
+	bool old_vlan_api;
+	u8 function;
+	dma_addr_t vhcr_dma;
+	u16 mtu[MLX4_MAX_PORTS + 1];
+	__be32 ib_cap_mask[MLX4_MAX_PORTS + 1];
+	struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES];
+	struct list_head mcast_filters[MLX4_MAX_PORTS + 1];
+	struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1];
+	/* event type to eq number lookup */
+	struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM];
+	u16 eq_pi;
+	u16 eq_ci;
+	spinlock_t lock;
+	/* initialized via kzalloc */
+	u8 is_slave_going_down;
+	u32 cookie;
+	enum slave_port_state port_state[MLX4_MAX_PORTS + 1];
+};
+
+#define MLX4_VGT 4095
+#define NO_INDX  (-1)
+
+
+struct mlx4_vport_state {
+	u64 mac;
+	u16 default_vlan;
+	u8  default_qos;
+	u32 tx_rate;
+	bool spoofchk;
+	u32 link_state;
+};
+
+struct mlx4_vf_admin_state {
+	struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_vport_oper_state {
+	struct mlx4_vport_state state;
+	int mac_idx;
+	int vlan_idx;
+};
+struct mlx4_vf_oper_state {
+	struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1];
+};
+
+struct slave_list {
+	struct mutex mutex;
+	struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+struct resource_allocator {
+	spinlock_t alloc_lock;
+	union {
+		int res_reserved;
+		int res_port_rsvd[MLX4_MAX_PORTS];
+	};
+	union {
+		int res_free;
+		int res_port_free[MLX4_MAX_PORTS];
+	};
+	int *quota;
+	int *allocated;
+	int *guaranteed;
+};
+
+struct mlx4_resource_tracker {
+	spinlock_t lock;
+	/* one tree per resource type */
+	struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
+	/* num_slaves lists, one per slave */
+	struct slave_list *slave_list;
+	struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+#define SLAVE_EVENT_EQ_SIZE	128
+struct mlx4_slave_event_eq {
+	u32 eqn;
+	u32 cons;
+	u32 prod;
+	spinlock_t event_lock;
+	struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE];
+};
+
+struct mlx4_master_qp0_state {
+	int proxy_qp0_active;
+	int qp0_active;
+	int port_active;
+};
+
+struct mlx4_mfunc_master_ctx {
+	struct mlx4_slave_state *slave_state;
+	struct mlx4_vf_admin_state *vf_admin;
+	struct mlx4_vf_oper_state *vf_oper;
+	struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1];
+	int			init_port_ref[MLX4_MAX_PORTS + 1];
+	u16			max_mtu[MLX4_MAX_PORTS + 1];
+	int			disable_mcast_ref[MLX4_MAX_PORTS + 1];
+	struct mlx4_resource_tracker res_tracker;
+	struct workqueue_struct *comm_wq;
+	struct work_struct	comm_work;
+	struct work_struct	arm_comm_work;
+	struct work_struct	slave_event_work;
+	struct work_struct	slave_flr_event_work;
+	spinlock_t		slave_state_lock;
+	__be32			comm_arm_bit_vector[4];
+	struct mlx4_eqe		cmd_eqe;
+	struct mlx4_slave_event_eq slave_eq;
+	struct mutex		gen_eqe_mutex[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_mfunc {
+	struct mlx4_comm __iomem       *comm;
+	struct mlx4_vhcr_cmd	       *vhcr;
+	dma_addr_t			vhcr_dma;
+
+	struct mlx4_mfunc_master_ctx	master;
+};
+
+#define MGM_QPN_MASK       0x00FFFFFF
+#define MGM_BLCK_LB_BIT    30
+
+struct mlx4_mgm {
+	__be32			next_gid_index;
+	__be32			members_count;
+	u32			reserved[2];
+	u8			gid[16];
+	__be32			qp[MLX4_MAX_QP_PER_MGM];
+};
+
 struct mlx4_cmd {
 	struct pci_pool	       *pool;
 	void __iomem	       *hcr;
 	struct mutex		hcr_mutex;
+	struct mutex		slave_cmd_mutex;
 	struct semaphore	poll_sem;
 	struct semaphore	event_sem;
 	int			max_cmds;
@@ -179,8 +603,27 @@
 	u16			token_mask;
 	u8			use_events;
 	u8			toggle;
+	u8			comm_toggle;
 };
 
+enum {
+	MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0,
+	MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1,
+};
+struct mlx4_vf_immed_vlan_work {
+	struct work_struct	work;
+	struct mlx4_priv	*priv;
+	int			flags;
+	int			slave;
+	int			vlan_ix;
+	int			orig_vlan_ix;
+	u8			port;
+	u8			qos;
+	u16			vlan_id;
+	u16			orig_vlan_id;
+};
+
+
 struct mlx4_uar_table {
 	struct mlx4_bitmap	bitmap;
 };
@@ -197,6 +640,7 @@
 struct mlx4_cq_table {
 	struct mlx4_bitmap	bitmap;
 	spinlock_t		lock;
+	rwlock_t		cq_table_lock;
 	struct radix_tree_root	tree;
 	struct mlx4_icm_table	table;
 	struct mlx4_icm_table	cmpt_table;
@@ -218,6 +662,7 @@
 struct mlx4_srq_table {
 	struct mlx4_bitmap	bitmap;
 	spinlock_t		lock;
+	struct radix_tree_root	tree;
 	struct mlx4_icm_table	table;
 	struct mlx4_icm_table	cmpt_table;
 };
@@ -268,6 +713,55 @@
 	int			max;
 };
 
+#define SET_PORT_GEN_ALL_VALID		0x7
+#define SET_PORT_PROMISC_SHIFT		31
+#define SET_PORT_MC_PROMISC_SHIFT	30
+
+enum {
+	MCAST_DIRECT_ONLY	= 0,
+	MCAST_DIRECT		= 1,
+	MCAST_DEFAULT		= 2
+};
+
+
+struct mlx4_set_port_general_context {
+	u8 reserved[3];
+	u8 flags;
+	u16 reserved2;
+	__be16 mtu;
+	u8 pptx;
+	u8 pfctx;
+	u16 reserved3;
+	u8 pprx;
+	u8 pfcrx;
+	u16 reserved4;
+};
+
+struct mlx4_set_port_rqp_calc_context {
+	__be32 base_qpn;
+	u8 rererved;
+	u8 n_mac;
+	u8 n_vlan;
+	u8 n_prio;
+	u8 reserved2[3];
+	u8 mac_miss;
+	u8 intra_no_vlan;
+	u8 no_vlan;
+	u8 intra_vlan_miss;
+	u8 vlan_miss;
+	u8 reserved3[3];
+	u8 no_vlan_prio;
+	__be32 promisc;
+	__be32 mcast;
+};
+
+struct mlx4_hca_info {
+	struct mlx4_dev	       *dev;
+	struct device_attribute firmware_attr;
+	struct device_attribute hca_attr;
+	struct device_attribute board_attr;
+};
+
 struct mlx4_port_info {
 	struct mlx4_dev	       *dev;
 	int			port;
@@ -274,8 +768,11 @@
 	char			dev_name[16];
 	struct device_attribute port_attr;
 	enum mlx4_port_type	tmp_type;
+	char			dev_mtu_name[16];
+	struct device_attribute port_mtu_attr;
 	struct mlx4_mac_table	mac_table;
 	struct mlx4_vlan_table	vlan_table;
+	int			base_qpn;
 };
 
 struct mlx4_sense {
@@ -283,12 +780,44 @@
 	u8			do_sense_port[MLX4_MAX_PORTS + 1];
 	u8			sense_allowed[MLX4_MAX_PORTS + 1];
 	struct delayed_work	sense_poll;
-	struct workqueue_struct	*sense_wq;
-	u32			resched;
 };
 
-extern struct mutex drv_mutex;
+struct mlx4_msix_ctl {
+	u64		pool_bm;
+	struct mutex	pool_lock;
+};
 
+struct mlx4_steer {
+	struct list_head promisc_qps[MLX4_NUM_STEERS];
+	struct list_head steer_entries[MLX4_NUM_STEERS];
+};
+
+enum {
+	MLX4_PCI_DEV_IS_VF		= 1 << 0,
+	MLX4_PCI_DEV_FORCE_SENSE_PORT	= 1 << 1,
+};
+
+struct mlx4_roce_gid_entry {
+	u8 raw[16];
+};
+
+struct counter_index {
+	struct  list_head	list;
+	u32			index;
+};
+
+struct mlx4_counters {
+	struct mlx4_bitmap	bitmap;
+	struct list_head	global_port_list[MLX4_MAX_PORTS];
+	struct list_head	vf_list[MLX4_MAX_NUM_VF][MLX4_MAX_PORTS];
+	struct mutex		mutex;
+};
+
+enum {
+	MLX4_NO_RR	= 0,
+	MLX4_USE_RR	= 1,
+};
+
 struct mlx4_priv {
 	struct mlx4_dev		dev;
 
@@ -296,11 +825,14 @@
 	struct list_head	ctx_list;
 	spinlock_t		ctx_lock;
 
+	int			pci_dev_data;
+
 	struct list_head        pgdir_list;
 	struct mutex            pgdir_mutex;
 
 	struct mlx4_fw		fw;
 	struct mlx4_cmd		cmd;
+	struct mlx4_mfunc	mfunc;
 
 	struct mlx4_bitmap	pd_bitmap;
 	struct mlx4_bitmap	xrcd_bitmap;
@@ -311,9 +843,7 @@
 	struct mlx4_srq_table	srq_table;
 	struct mlx4_qp_table	qp_table;
 	struct mlx4_mcg_table	mcg_table;
-	struct mlx4_bitmap	counters_bitmap;
-	struct list_head	bf_list;
-	struct mutex		bf_mutex;
+	struct mlx4_counters	counters_table;
 
 	struct mlx4_catas_err	catas_err;
 
@@ -322,13 +852,22 @@
 	struct mlx4_uar		driver_uar;
 	void __iomem	       *kar;
 	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
-	struct device_attribute trigger_attr;
-	int                     trig;
-	int                     changed_ports;
+	struct mlx4_hca_info	hca_info;
 	struct mlx4_sense       sense;
 	struct mutex		port_mutex;
-	int			iboe_counter_index[MLX4_MAX_PORTS];
-	struct io_mapping      *bf_mapping;
+	struct mlx4_msix_ctl	msix_ctl;
+	struct mlx4_steer	*steer;
+	struct list_head	bf_list;
+	struct mutex		bf_mutex;
+	struct io_mapping	*bf_mapping;
+	void __iomem            *clock_mapping;
+	int			reserved_mtts;
+	int			fs_hash_mode;
+	u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	__be64			slave_node_guids[MLX4_MFUNC_MAX];
+	struct mlx4_roce_gid_entry roce_gids[MLX4_MAX_PORTS][128];
+	atomic_t		opreq_count;
+	struct work_struct	opreq_task;
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
@@ -341,9 +880,11 @@
 extern struct workqueue_struct *mlx4_wq;
 
 u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
-void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
-void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr);
 u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
 int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
 		     u32 reserved_bot, u32 reserved_top);
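
(A usage sketch of the updated bitmap API above: num must be a power of
two with mask = num - 1, as existing callers use, and the new use_rr
argument takes the MLX4_NO_RR/MLX4_USE_RR values defined earlier in
this header. mlx4_bitmap_cleanup is assumed from elsewhere in the file;
the values below are illustrative.)

	static int bitmap_range_demo(void)
	{
		struct mlx4_bitmap bm;
		u32 obj;

		/* 1024 objects, bottom 16 reserved, none reserved on top */
		if (mlx4_bitmap_init(&bm, 1024, 1023, 16, 0))
			return -ENOMEM;

		/* 8 contiguous objects, 8-aligned, no skip mask */
		obj = mlx4_bitmap_alloc_range(&bm, 8, 8, 0);
		if (obj != (u32)-1)
			mlx4_bitmap_free_range(&bm, obj, 8, MLX4_NO_RR);

		mlx4_bitmap_cleanup(&bm);
		return 0;
	}
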
@@ -365,6 +906,7 @@
 int mlx4_init_mcg_table(struct mlx4_dev *dev);
 
 void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
 void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
 void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
 void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
@@ -372,8 +914,72 @@
 void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
 void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
 void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
-void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn);
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn);
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn);
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn);
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn);
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn);
+int __mlx4_mpt_reserve(struct mlx4_dev *dev);
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index);
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index);
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index);
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order);
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order);
 
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd);
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			    int *base, u8 flags);
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list);
+int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx);
+void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx);
+
+int __mlx4_slave_counters_free(struct mlx4_dev *dev, int slave);
+int __mlx4_clear_if_stat(struct mlx4_dev *dev,
+			 u8 counter_index);
+u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port);
+
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
 void mlx4_start_catas_poll(struct mlx4_dev *dev);
 void mlx4_stop_catas_poll(struct mlx4_dev *dev);
 void mlx4_catas_init(void);
@@ -380,8 +986,8 @@
 int mlx4_restart_one(struct pci_dev *pdev);
 int mlx4_register_device(struct mlx4_dev *dev);
 void mlx4_unregister_device(struct mlx4_dev *dev);
-void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port);
-void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param);
 
 struct mlx4_dev_cap;
 struct mlx4_init_hca_param;
@@ -390,13 +996,159 @@
 		      struct mlx4_profile *request,
 		      struct mlx4_dev_cap *dev_cap,
 		      struct mlx4_init_hca_param *init_hca);
+void mlx4_master_comm_channel(struct work_struct *work);
+void mlx4_master_arm_comm_channel(struct work_struct *work);
+void mlx4_gen_slave_eqe(struct work_struct *work);
+void mlx4_master_handle_slave_flr(struct work_struct *work);
 
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe);
+
 int mlx4_cmd_init(struct mlx4_dev *dev);
 void mlx4_cmd_cleanup(struct mlx4_dev *dev);
+int mlx4_multi_func_init(struct mlx4_dev *dev);
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev);
 void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
 int mlx4_cmd_use_events(struct mlx4_dev *dev);
 void mlx4_cmd_use_polling(struct mlx4_dev *dev);
 
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  unsigned long timeout);
+
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
 void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
 
@@ -406,13 +1158,14 @@
 
 void mlx4_handle_catas_err(struct mlx4_dev *dev);
 
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type);
 void mlx4_do_sense_ports(struct mlx4_dev *dev,
 			 enum mlx4_port_type *stype,
 			 enum mlx4_port_type *defaults);
 void mlx4_start_sense(struct mlx4_dev *dev);
 void mlx4_stop_sense(struct mlx4_dev *dev);
-int mlx4_sense_init(struct mlx4_dev *dev);
-void mlx4_sense_cleanup(struct mlx4_dev *dev);
+void mlx4_sense_init(struct mlx4_dev *dev);
 int mlx4_check_port_params(struct mlx4_dev *dev,
 			   enum mlx4_port_type *port_type);
 int mlx4_change_port_types(struct mlx4_dev *dev,
@@ -420,8 +1173,156 @@
 
 void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
 void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan);
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
 
-int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz);
+/* resource tracker functions */
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource resource_type,
+				    u64 resource_id, int *slave);
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id);
+int mlx4_init_resource_tracker(struct mlx4_dev *dev);
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type);
+
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
 int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps);
 
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len);
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer);
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer);
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id);
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function,
+			      int port, void *buf);
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_MOD_STAT_CFG_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev);
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev);
+
+static inline void set_param_l(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff00000000ULL) | (u64) val;
+}
+
+static inline void set_param_h(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff) | ((u64) val << 32);
+}
+
+static inline u32 get_param_l(u64 *arg)
+{
+	return (u32) (*arg & 0xffffffff);
+}
+
+static inline u32 get_param_h(u64 *arg)
+{
+	return (u32)(*arg >> 32);
+}
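
(A usage sketch for the helpers above: a VHCR parameter is a single u64
carrying two independent 32-bit halves.)

	static void vhcr_param_demo(void)
	{
		u64 param = 0;

		set_param_l(&param, 0x11223344);	/* low 32 bits */
		set_param_h(&param, 0xaabbccdd);	/* high 32 bits */

		/* here param == 0xaabbccdd11223344ULL, with
		 * get_param_l(&param) == 0x11223344 and
		 * get_param_h(&param) == 0xaabbccdd */
	}
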
+
+static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev)
+{
+	return &mlx4_priv(dev)->mfunc.master.res_tracker.lock;
+}
+
+#define NOT_MASKED_PD_BITS 17
+
+void sys_tune_init(void);
+void sys_tune_fini(void);
+
+void mlx4_init_quotas(struct mlx4_dev *dev);
+
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave);
+int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave);
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work);
+
 #endif /* MLX4_H */


Property changes on: trunk/sys/ofed/drivers/net/mlx4/mlx4.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,13 +34,17 @@
 #ifndef _MLX4_EN_H_
 #define _MLX4_EN_H_
 
-#include <sys/cdefs.h>
-
-#include <linux/types.h>
+#include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/kobject.h>
 #include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/if_ether.h>
+#ifdef CONFIG_MLX4_EN_DCB
+#include <linux/dcbnl.h>
+#endif
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/qp.h>
@@ -49,56 +53,15 @@
 #include <linux/mlx4/doorbell.h>
 #include <linux/mlx4/cmd.h>
 
-#include <net/if_media.h>
 #include <netinet/tcp_lro.h>
 
 #include "en_port.h"
+#include "mlx4_stats.h"
 
 #define DRV_NAME	"mlx4_en"
-#define DRV_VERSION	"1.5.2"
-#define DRV_RELDATE	"July 2010"
 
-/* XXX */
-#define	NETIF_MSG_LINK		0x1
-#define	NETIF_MSG_IFDOWN	0x2
-#define	NETIF_MSG_HW		0x4
-#define	NETIF_MSG_DRV		0x8
-#define	NETIF_MSG_INTR		0x10
-#define	NETIF_MSG_RX_ERR	0x20
-
 #define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
 
-#define en_print(level, priv, format, arg...)			\
-	{							\
-	if ((priv)->registered)					\
-		printk(level "%s: %s: " format, DRV_NAME,	\
-			(priv->dev)->if_xname, ## arg);	\
-	else							\
-		printk(level "%s: %s: Port %d: " format,	\
-			DRV_NAME, dev_name(&priv->mdev->pdev->dev), \
-			(priv)->port, ## arg);			\
-	}
-
-#define en_dbg(mlevel, priv, format, arg...)	\
-	if (NETIF_MSG_##mlevel & priv->msg_enable) \
-		en_print(KERN_DEBUG, priv, format, ## arg)
-#define en_warn(priv, format, arg...) \
-	en_print(KERN_WARNING, priv, format, ## arg)
-#define en_err(priv, format, arg...) \
-	en_print(KERN_ERR, priv, format, ## arg)
-#define en_info(priv, format, arg...) \
-	en_print(KERN_INFO, priv, format, ## arg)
-
-#define mlx4_err(mdev, format, arg...) \
-	printk(KERN_ERR "%s %s: " format , DRV_NAME ,\
-		dev_name(&mdev->pdev->dev) , ## arg)
-#define mlx4_info(mdev, format, arg...) \
-	printk(KERN_INFO "%s %s: " format , DRV_NAME ,\
-		dev_name(&mdev->pdev->dev) , ## arg)
-#define mlx4_warn(mdev, format, arg...) \
-	printk(KERN_WARNING "%s %s: " format , DRV_NAME ,\
-		dev_name(&mdev->pdev->dev) , ## arg)
-
 /*
  * Device constants
  */
@@ -106,8 +69,10 @@
 
 #define MLX4_EN_PAGE_SHIFT	12
 #define MLX4_EN_PAGE_SIZE	(1 << MLX4_EN_PAGE_SHIFT)
-#define MAX_TX_RINGS		(MLX4_EN_NUM_HASH_RINGS + 1 + MLX4_EN_NUM_PPP_RINGS)
-#define MAX_RX_RINGS		16
+#define	MLX4_NET_IP_ALIGN	2	/* bytes */
+#define DEF_RX_RINGS		16
+#define MAX_RX_RINGS		128
+#define MIN_RX_RINGS		4
 #define TXBB_SIZE		64
 #define HEADROOM		(2048 / TXBB_SIZE + 1)
 #define STAMP_STRIDE		64
@@ -115,11 +80,20 @@
 #define STAMP_SHIFT		31
 #define STAMP_VAL		0x7fffffff
 #define STATS_DELAY		(HZ / 4)
+#define SERVICE_TASK_DELAY	(HZ / 4)
+#define MAX_NUM_OF_FS_RULES	256
 
-/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
-#define MAX_DESC_SIZE		512
-#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
+#define MLX4_EN_FILTER_HASH_SHIFT 4
+#define MLX4_EN_FILTER_EXPIRY_QUOTA 60
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#define LL_EXTENDED_STATS
+#endif
+
+/* valid VLAN ID range */
+#define VLAN_MIN_VALUE		1
+#define VLAN_MAX_VALUE		4094
+
 /*
  * OS related constants and tunables
  */
@@ -126,52 +100,46 @@
 
 #define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
 
-#define MLX4_EN_MAX_LRO_DESCRIPTORS	32
-#define MLX4_EN_NUM_IPFRAG_SESSIONS	16
+#define MLX4_EN_ALLOC_SIZE     PAGE_ALIGN(PAGE_SIZE)
+#define MLX4_EN_ALLOC_ORDER    get_order(MLX4_EN_ALLOC_SIZE)
 
-/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
- * and 4K allocations) */
-#if MJUMPAGESIZE == 4096
-enum {
-	FRAG_SZ0 = MCLBYTES,
-	FRAG_SZ1 = MJUMPAGESIZE,
-	FRAG_SZ2 = MJUMPAGESIZE,
+enum mlx4_en_alloc_type {
+	MLX4_EN_ALLOC_NEW = 0,
+	MLX4_EN_ALLOC_REPLACEMENT = 1,
 };
-#define MLX4_EN_MAX_RX_FRAGS	3
-#elif MJUMPAGESIZE == 8192
-enum {
-	FRAG_SZ0 = MCLBYTES,
-	FRAG_SZ1 = MJUMPAGESIZE,
-};
-#define MLX4_EN_MAX_RX_FRAGS	2
-#elif MJUMPAGESIZE == 8192
-#else
-#error	"Unknown PAGE_SIZE"
-#endif
 
 /* Maximum ring sizes */
+#define MLX4_EN_DEF_TX_QUEUE_SIZE       4096
+
+/* Minimum number of packets before arming the CQ */
+#define MLX4_EN_MIN_RX_ARM	2048
+#define MLX4_EN_MIN_TX_ARM	2048
+
+/* Maximum ring sizes */
 #define MLX4_EN_MAX_TX_SIZE	8192
 #define MLX4_EN_MAX_RX_SIZE	8192
 
-#define MLX4_EN_MIN_RX_SIZE	(128)
+/* Minimum ring sizes */
+#define MLX4_EN_MIN_RX_SIZE	(4096 / TXBB_SIZE)
 #define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
 
 #define MLX4_EN_SMALL_PKT_SIZE		64
-#define MLX4_EN_TX_HASH_SIZE		256
-#define MLX4_EN_TX_HASH_MASK		(MLX4_EN_TX_HASH_SIZE - 1)
-#define MLX4_EN_NUM_HASH_RINGS		4
-#define MLX4_EN_NUM_PPP_RINGS		8
-#define MLX4_EN_DEF_TX_RING_SIZE	512
-#define MLX4_EN_DEF_TX_QUEUE_SIZE	4096
+
+#define MLX4_EN_MAX_TX_RING_P_UP	32
+#define MLX4_EN_NUM_UP			1
+
+#define MAX_TX_RINGS			(MLX4_EN_MAX_TX_RING_P_UP * \
+					MLX4_EN_NUM_UP)
+
+#define MLX4_EN_DEF_TX_RING_SIZE	1024
 #define MLX4_EN_DEF_RX_RING_SIZE  	1024
-#define	MLX4_EN_MAX_RX_POLL		1024
 
 /* Target number of bytes to coalesce with interrupt moderation */
 #define MLX4_EN_RX_COAL_TARGET	0x20000
 #define MLX4_EN_RX_COAL_TIME	0x10
 
-#define MLX4_EN_TX_COAL_PKTS	5
-#define MLX4_EN_TX_COAL_TIME	0x80
+#define MLX4_EN_TX_COAL_PKTS	64
+#define MLX4_EN_TX_COAL_TIME	64
 
 #define MLX4_EN_RX_RATE_LOW		400000
 #define MLX4_EN_RX_COAL_TIME_LOW	0
@@ -187,14 +155,13 @@
 #define MLX4_EN_DEF_RX_PAUSE	1
 #define MLX4_EN_DEF_TX_PAUSE	1
 
-/* Interval between sucessive polls in the Tx routine when polling is used
+/* Interval between successive polls in the Tx routine when polling is used
    instead of interrupts (in per-core Tx rings) - should be power of 2 */
 #define MLX4_EN_TX_POLL_MODER	16
 #define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
 
-#define ETH_LLC_SNAP_SIZE	8
-
-#define SMALL_PACKET_SIZE      (MHLEN)
+#define MLX4_EN_64_ALIGN	(64 - NET_SKB_PAD)
+#define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
 #define HEADER_COPY_SIZE       (128)
 #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETHER_HDR_LEN)
 
@@ -208,7 +175,6 @@
 /* Number of samples to 'average' */
 #define AVG_SIZE			128
 #define AVG_FACTOR			1024
-#define NUM_PERF_STATS			NUM_PERF_COUNTERS
 
 #define INC_PERF_COUNTER(cnt)		(++(cnt))
 #define ADD_PERF_COUNTER(cnt, add)	((cnt) += (add))
@@ -219,7 +185,6 @@
 
 #else
 
-#define NUM_PERF_STATS			0
 #define INC_PERF_COUNTER(cnt)		do {} while (0)
 #define ADD_PERF_COUNTER(cnt, add)	do {} while (0)
 #define AVG_PERF_COUNTER(cnt, sample)	do {} while (0)
@@ -244,13 +209,11 @@
 #define XNOR(x, y)		(!(x) == !(y))
 #define ILLEGAL_MAC(addr)	(addr == 0xffffffffffffULL || addr == 0x0)
 
-
 struct mlx4_en_tx_info {
-	struct mbuf *mb;
-	u32 nr_txbb;
-	u8 nr_segs;
-	u8 data_offset;
-	u8 inl;
+	bus_dmamap_t dma_map;
+	struct mbuf *mb;
+	u32 nr_txbb;
+	u32 nr_bytes;
 };
 
 
@@ -271,8 +234,22 @@
 
 #define MLX4_EN_USE_SRQ		0x01000000
 
+#define MLX4_EN_RX_BUDGET 64
+
+#define	MLX4_EN_TX_MAX_DESC_SIZE 512	/* bytes */
+#define	MLX4_EN_TX_MAX_MBUF_SIZE 65536	/* bytes */
+#define	MLX4_EN_TX_MAX_PAYLOAD_SIZE 65536	/* bytes */
+#define	MLX4_EN_TX_MAX_MBUF_FRAGS \
+    ((MLX4_EN_TX_MAX_DESC_SIZE - 128) / DS_SIZE_ALIGNMENT) /* units */
+#define	MLX4_EN_TX_WQE_MAX_WQEBBS			\
+    (MLX4_EN_TX_MAX_DESC_SIZE / TXBB_SIZE) /* units */
+
+#define MLX4_EN_CX3_LOW_ID	0x1000
+#define MLX4_EN_CX3_HIGH_ID	0x1005
+
 struct mlx4_en_tx_ring {
-	spinlock_t tx_lock;
+        spinlock_t tx_lock;
+	bus_dma_tag_t dma_tag;
 	struct mlx4_hwq_resources wqres;
 	u32 size ; /* number of TXBBs */
 	u32 size_mask;
@@ -282,12 +259,13 @@
 	u32 cons;
 	u32 buf_size;
 	u32 doorbell_qpn;
-	void *buf;
+	u8 *buf;
 	u16 poll_cnt;
 	int blocked;
+	struct mlx4_en_tx_info *tx_info;
+	u8 queue_index;
+	cpuset_t affinity_mask;
 	struct buf_ring *br;
-	struct mlx4_en_tx_info *tx_info;
-	u8 *bounce_buf;
 	u32 last_nr_txbb;
 	struct mlx4_qp qp;
 	struct mlx4_qp_context context;
@@ -296,31 +274,38 @@
 	struct mlx4_srq dummy;
 	unsigned long bytes;
 	unsigned long packets;
-	unsigned long errors;
-	spinlock_t comp_lock;
+	unsigned long tx_csum;
+	unsigned long queue_stopped;
+	unsigned long oversized_packets;
+	unsigned long wake_queue;
 	struct mlx4_bf bf;
 	bool bf_enabled;
+	int hwtstamp_tx_type;
+	spinlock_t comp_lock;
+	int inline_thold;
 	u64 watchdog_time;
 };
 
-struct mlx4_en_ipfrag {
-	struct mbuf *fragments;
-	struct mbuf *last;
-	__be32		saddr;
-	__be32		daddr;
-	__be16		id;
-	u8		protocol;
-	int		total_len;
-	u16		offset;
-};
-
 struct mlx4_en_rx_desc {
 	/* actual number of entries depends on rx ring stride */
 	struct mlx4_wqe_data_seg data[0];
 };
 
+struct mlx4_en_rx_mbuf {
+	bus_dmamap_t dma_map;
+	struct mbuf *mbuf;
+};
+
+struct mlx4_en_rx_spare {
+	bus_dmamap_t dma_map;
+	struct mbuf *mbuf;
+	u64 paddr_be;
+};
+
 struct mlx4_en_rx_ring {
 	struct mlx4_hwq_resources wqres;
+	bus_dma_tag_t dma_tag;
+	struct mlx4_en_rx_spare spare;
 	u32 size ;	/* number of Rx descs*/
 	u32 actual_size;
 	u32 size_mask;
@@ -330,28 +315,48 @@
 	u32 prod;
 	u32 cons;
 	u32 buf_size;
-	void *buf;
-	void *rx_info;
+	u8  fcs_del;
+	u32 rx_mb_size;
+	int qpn;
+	u8 *buf;
+	struct mlx4_en_rx_mbuf *mbuf;
+	unsigned long errors;
 	unsigned long bytes;
 	unsigned long packets;
-	unsigned long errors;
+#ifdef LL_EXTENDED_STATS
+	unsigned long yields;
+	unsigned long misses;
+	unsigned long cleaned;
+#endif
+	unsigned long csum_ok;
+	unsigned long csum_none;
+	int hwtstamp_rx_filter;
+	int numa_node;
 	struct lro_ctrl lro;
-	struct mlx4_en_ipfrag ipfrag[MLX4_EN_NUM_IPFRAG_SESSIONS];
 };
 
-
 static inline int mlx4_en_can_lro(__be16 status)
 {
-	return (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4	|
-				     MLX4_CQE_STATUS_IPV4F	|
-				     MLX4_CQE_STATUS_IPV6	|
-				     MLX4_CQE_STATUS_IPV4OPT	|
-				     MLX4_CQE_STATUS_TCP	|
-				     MLX4_CQE_STATUS_UDP	|
-				     MLX4_CQE_STATUS_IPOK)) ==
-		cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
-			    MLX4_CQE_STATUS_IPOK |
-			    MLX4_CQE_STATUS_TCP);
+	const __be16 status_all = cpu_to_be16(
+			MLX4_CQE_STATUS_IPV4    |
+			MLX4_CQE_STATUS_IPV4F   |
+			MLX4_CQE_STATUS_IPV6    |
+			MLX4_CQE_STATUS_IPV4OPT |
+			MLX4_CQE_STATUS_TCP     |
+			MLX4_CQE_STATUS_UDP     |
+			MLX4_CQE_STATUS_IPOK);
+	const __be16 status_ipv4_ipok_tcp = cpu_to_be16(
+			MLX4_CQE_STATUS_IPV4    |
+			MLX4_CQE_STATUS_IPOK    |
+			MLX4_CQE_STATUS_TCP);
+	const __be16 status_ipv6_ipok_tcp = cpu_to_be16(
+			MLX4_CQE_STATUS_IPV6    |
+			MLX4_CQE_STATUS_IPOK    |
+			MLX4_CQE_STATUS_TCP);
+
+	status &= status_all;
+	return (status == status_ipv4_ipok_tcp ||
+			status == status_ipv6_ipok_tcp);
 }
 
 struct mlx4_en_cq {
@@ -360,8 +365,8 @@
 	int                     ring;
 	spinlock_t              lock;
 	struct net_device      *dev;
-	/* Per-core Tx cq processing support */
-	struct timer_list timer;
+        /* Per-core Tx cq processing support */
+        struct timer_list timer;
 	int size;
 	int buf_size;
 	unsigned vector;
@@ -373,6 +378,21 @@
 	struct taskqueue *tq;
 #define MLX4_EN_OPCODE_ERROR	0x1e
 	u32 tot_rx;
+	u32 tot_tx;
+	u32 curr_poll_rx_cpu_id;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int state;
+#define MLX4_EN_CQ_STATEIDLE        0
+#define MLX4_EN_CQ_STATENAPI     1    /* NAPI owns this CQ */
+#define MLX4_EN_CQ_STATEPOLL     2    /* poll owns this CQ */
+#define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATENAPI | MLX4_EN_CQ_STATEPOLL)
+#define MLX4_EN_CQ_STATENAPI_YIELD  4    /* NAPI yielded this CQ */
+#define MLX4_EN_CQ_STATEPOLL_YIELD  8    /* poll yielded this CQ */
+#define CQ_YIELD (MLX4_EN_CQ_STATENAPI_YIELD | MLX4_EN_CQ_STATEPOLL_YIELD)
+#define CQ_USER_PEND (MLX4_EN_CQ_STATEPOLL | MLX4_EN_CQ_STATEPOLL_YIELD)
+	spinlock_t poll_lock; /* protects from LLS/napi conflicts */
+#endif  /* CONFIG_NET_RX_BUSY_POLL */
 };
 
 struct mlx4_en_port_profile {
@@ -382,41 +402,42 @@
 	u32 tx_ring_size;
 	u32 rx_ring_size;
 	u8 rx_pause;
+	u8 rx_ppp;
 	u8 tx_pause;
-	u32 rx_ppp;
-	u32 tx_ppp;
+	u8 tx_ppp;
+	int rss_rings;
 };
 
 struct mlx4_en_profile {
 	int rss_xor;
-	int num_lro;
-	int ip_reasm;
-	int tcp_rss;
 	int udp_rss;
 	u8 rss_mask;
 	u32 active_ports;
 	u32 small_pkt_int;
 	u8 no_reset;
+	u8 num_tx_rings_p_up;
 	struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_en_dev {
-	struct mlx4_dev         *dev;
+	struct mlx4_dev		*dev;
 	struct pci_dev		*pdev;
 	struct mutex		state_lock;
-	struct net_device       *pndev[MLX4_MAX_PORTS + 1];
-	u32                     port_cnt;
+	struct net_device	*pndev[MLX4_MAX_PORTS + 1];
+	u32			port_cnt;
 	bool			device_up;
-	struct mlx4_en_profile  profile;
+	struct mlx4_en_profile	profile;
 	u32			LSO_support;
 	struct workqueue_struct *workqueue;
-	struct device           *dma_device;
-	void __iomem            *uar_map;
-	struct mlx4_uar         priv_uar;
+	struct device		*dma_device;
+	void __iomem		*uar_map;
+	struct mlx4_uar		priv_uar;
 	struct mlx4_mr		mr;
-	u32                     priv_pdn;
-	spinlock_t              uar_lock;
+	u32			priv_pdn;
+	spinlock_t		uar_lock;
 	u8			mac_removed[MLX4_MAX_PORTS + 1];
+	unsigned long		last_overflow_check;
+	unsigned long		overflow_period;
 };
 
 
@@ -428,58 +449,59 @@
 	enum mlx4_qp_state indir_state;
 };
 
-struct mlx4_en_rss_context {
-	__be32 base_qpn;
-	__be32 default_qpn;
-	u16 reserved;
-	u8 hash_fn;
-	u8 flags;
-	__be32 rss_key[10];
-	__be32 base_qpn_udp;
-};
-
 struct mlx4_en_port_state {
 	int link_state;
 	int link_speed;
 	int transciver;
+	int autoneg;
 };
 
-struct mlx4_en_pkt_stats {
-	unsigned long broadcast;
-	unsigned long rx_prio[8];
-	unsigned long tx_prio[8];
-#define NUM_PKT_STATS		17
+enum mlx4_en_mclist_act {
+	MCLIST_NONE,
+	MCLIST_REM,
+	MCLIST_ADD,
 };
 
-struct mlx4_en_port_stats {
-	unsigned long tso_packets;
-	unsigned long queue_stopped;
-	unsigned long wake_queue;
-	unsigned long tx_timeout;
-	unsigned long rx_alloc_failed;
-	unsigned long rx_chksum_good;
-	unsigned long rx_chksum_none;
-	unsigned long tx_chksum_offload;
+struct mlx4_en_mc_list {
+	struct list_head	list;
+	enum mlx4_en_mclist_act	action;
+	u8			addr[ETH_ALEN];
+	u64			reg_id;
 };
 
-struct mlx4_en_perf_stats {
-	u32 tx_poll;
-	u64 tx_pktsz_avg;
-	u32 inflight_avg;
-	u32 tx_coal_avg;
-	u32 rx_coal_avg;
-};
+#ifdef CONFIG_MLX4_EN_DCB
+/* Minimal TC BW - setting to 0 will block traffic */
+#define MLX4_EN_BW_MIN 1
+#define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */
 
-struct mlx4_en_frag_info {
-	u16 frag_size;
-	u16 frag_prefix_size;
+#define MLX4_EN_TC_ETS 7
+
+#endif
+
+
+enum {
+	MLX4_EN_FLAG_PROMISC		= (1 << 0),
+	MLX4_EN_FLAG_MC_PROMISC		= (1 << 1),
+	/* whether we need to enable hardware loopback by putting dmac
+	 * in Tx WQE
+	 */
+	MLX4_EN_FLAG_ENABLE_HW_LOOPBACK	= (1 << 2),
+	/* whether we need to drop packets that hardware looped back */
+	MLX4_EN_FLAG_RX_FILTER_NEEDED	= (1 << 3),
+	MLX4_EN_FLAG_FORCE_PROMISC	= (1 << 4),
+#ifdef CONFIG_MLX4_EN_DCB
+	MLX4_EN_FLAG_DCB_ENABLED	= (1 << 5)
+#endif
 };
 
-struct mlx4_en_tx_hash_entry {
-	u8 cnt;
-	unsigned int small_pkts;
-	unsigned int big_pkts;
-	unsigned int ring;
+#define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE)
+#define MLX4_EN_MAC_HASH_IDX 5
+
+struct en_port {
+	struct kobject		kobj;
+	struct mlx4_dev		*dev;
+	u8			port_num;
+	u8			vport_num;
 };
 
 struct mlx4_en_priv {
@@ -486,29 +508,27 @@
 	struct mlx4_en_dev *mdev;
 	struct mlx4_en_port_profile *prof;
 	struct net_device *dev;
-	bool vlgrp_modified;
-	u32 vlan_register[VLAN_FLTR_SIZE];
-	u32 vlan_unregister[VLAN_FLTR_SIZE];
-	u32 vlans[VLAN_FLTR_SIZE];
-	spinlock_t vlan_lock;
+	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	struct mlx4_en_port_state port_state;
 	spinlock_t stats_lock;
+	/* To allow rules removal while port is going down */
+	struct list_head ethtool_list;
 
-	unsigned long last_moder_packets;
+	unsigned long last_moder_packets[MAX_RX_RINGS];
 	unsigned long last_moder_tx_packets;
-	unsigned long last_moder_bytes;
+	unsigned long last_moder_bytes[MAX_RX_RINGS];
 	unsigned long last_moder_jiffies;
-	int last_moder_time;
+	int last_moder_time[MAX_RX_RINGS];
 	u16 rx_usecs;
 	u16 rx_frames;
 	u16 tx_usecs;
 	u16 tx_frames;
 	u32 pkt_rate_low;
-	u16 rx_usecs_low;
+	u32 rx_usecs_low;
 	u32 pkt_rate_high;
-	u16 rx_usecs_high;
-	u16 sample_interval;
-	u16 adaptive_rx_coal;
+	u32 rx_usecs_high;
+	u32 sample_interval;
+	u32 adaptive_rx_coal;
 	u32 msg_enable;
 	u32 loopback_ok;
 	u32 validate_loopback;
@@ -520,61 +540,199 @@
 	int port;
 	int registered;
 	int allocated;
-	int rx_csum;
-	u64 mac;
+	int stride;
+	unsigned char current_mac[ETH_ALEN + 2];
+        u64 mac;
 	int mac_index;
 	unsigned max_mtu;
 	int base_qpn;
+	int cqe_factor;
 
 	struct mlx4_en_rss_map rss_map;
-	u16 tx_prio_map[8];
 	u32 flags;
-#define MLX4_EN_FLAG_PROMISC	0x1
+	u8 num_tx_rings_p_up;
 	u32 tx_ring_num;
 	u32 rx_ring_num;
 	u32 rx_mb_size;
-	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
-	u16 num_frags;
-	u16 log_rx_info;
-	int ip_reasm;
-	bool wol;
 
-	struct mlx4_en_tx_ring tx_ring[MAX_TX_RINGS];
-	struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS];
-	struct mlx4_en_cq tx_cq[MAX_TX_RINGS];
-	struct mlx4_en_cq rx_cq[MAX_RX_RINGS];
-	struct mlx4_en_tx_hash_entry tx_hash[MLX4_EN_TX_HASH_SIZE];
-	struct work_struct mcast_task;
+	struct mlx4_en_tx_ring **tx_ring;
+	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
+	struct mlx4_en_cq **tx_cq;
+	struct mlx4_en_cq *rx_cq[MAX_RX_RINGS];
+	struct mlx4_qp drop_qp;
+	struct work_struct rx_mode_task;
 	struct work_struct watchdog_task;
 	struct work_struct linkstate_task;
 	struct delayed_work stats_task;
+	struct delayed_work service_task;
 	struct mlx4_en_perf_stats pstats;
 	struct mlx4_en_pkt_stats pkstats;
+	struct mlx4_en_flow_stats flowstats[MLX4_NUM_PRIORITIES];
 	struct mlx4_en_port_stats port_stats;
+	struct mlx4_en_vport_stats vport_stats;
+	struct mlx4_en_vf_stats vf_stats;
+	DECLARE_BITMAP(stats_bitmap, NUM_ALL_STATS);
+	struct list_head mc_list;
+	struct list_head curr_list;
+	u64 broadcast_id;
 	struct mlx4_en_stat_out_mbox hw_stats;
-	struct ifmedia media;
+	int vids[128];
+	bool wol;
+	struct device *ddev;
+	struct dentry *dev_root;
+	u32 counter_index;
 	eventhandler_tag vlan_attach;
 	eventhandler_tag vlan_detach;
 	struct callout watchdog_timer;
+        struct ifmedia media;
 	volatile int blocked;
 	struct sysctl_oid *sysctl;
 	struct sysctl_ctx_list conf_ctx;
 	struct sysctl_ctx_list stat_ctx;
+#define MLX4_EN_MAC_HASH_IDX 5
+	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
+
+#ifdef CONFIG_MLX4_EN_DCB
+	struct ieee_ets ets;
+	u16 maxrate[IEEE_8021QAZ_MAX_TCS];
+	u8 dcbx_cap;
+#endif
+#ifdef CONFIG_RFS_ACCEL
+	spinlock_t filters_lock;
+	int last_filter_id;
+	struct list_head filters;
+	struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT];
+#endif
+	struct en_port *vf_ports[MLX4_MAX_NUM_VF];
+	unsigned long last_ifq_jiffies;
+	u64 if_counters_rx_errors;
+	u64 if_counters_rx_no_buffer;
 };
 
 enum mlx4_en_wol {
 	MLX4_EN_WOL_MAGIC = (1ULL << 61),
 	MLX4_EN_WOL_ENABLED = (1ULL << 62),
-	MLX4_EN_WOL_DO_MODIFY = (1ULL << 63),
 };
 
-int mlx4_en_transmit(struct net_device *dev, struct mbuf *mb);
-void mlx4_en_qflush(struct net_device *dev);
+struct mlx4_mac_entry {
+	struct hlist_node hlist;
+	unsigned char mac[ETH_ALEN + 2];
+	u64 reg_id;
+};
 
-int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
-		     struct mbuf *mb, struct mlx4_cqe *cqe);
-void mlx4_en_flush_frags(struct mlx4_en_priv *priv,
-			 struct mlx4_en_rx_ring *ring);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq)
+{
+	spin_lock_init(&cq->poll_lock);
+	cq->state = MLX4_EN_CQ_STATEIDLE;
+}
+
+/* called from the device poll routine to get ownership of a CQ */
+static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq)
+{
+	int rc = true;
+	spin_lock(&cq->poll_lock);
+	if (cq->state & MLX4_CQ_LOCKED) {
+		WARN_ON(cq->state & MLX4_EN_CQ_STATENAPI);
+		cq->state |= MLX4_EN_CQ_STATENAPI_YIELD;
+		rc = false;
+	} else
+		/* we don't care if someone yielded */
+		cq->state = MLX4_EN_CQ_STATENAPI;
+	spin_unlock(&cq->poll_lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the CQ while NAPI had it */
+static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq)
+{
+	int rc = false;
+	spin_lock(&cq->poll_lock);
+	WARN_ON(cq->state & (MLX4_EN_CQ_STATEPOLL |
+			     MLX4_EN_CQ_STATENAPI_YIELD));
+
+	if (cq->state & MLX4_EN_CQ_STATEPOLL_YIELD)
+		rc = true;
+	cq->state = MLX4_EN_CQ_STATEIDLE;
+	spin_unlock(&cq->poll_lock);
+	return rc;
+}
+
+/* called from mlx4_en_low_latency_poll() */
+static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq)
+{
+	int rc = true;
+	spin_lock_bh(&cq->poll_lock);
+	if ((cq->state & MLX4_CQ_LOCKED)) {
+		struct net_device *dev = cq->dev;
+		struct mlx4_en_priv *priv = netdev_priv(dev);
+		struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring];
+
+		cq->state |= MLX4_EN_CQ_STATEPOLL_YIELD;
+		rc = false;
+#ifdef LL_EXTENDED_STATS
+		rx_ring->yields++;
+#endif
+	} else
+		/* preserve yield marks */
+		cq->state |= MLX4_EN_CQ_STATEPOLL;
+	spin_unlock_bh(&cq->poll_lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the cq while it was locked */
+static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq)
+{
+	int rc = false;
+	spin_lock_bh(&cq->poll_lock);
+	WARN_ON(cq->state & (MLX4_EN_CQ_STATENAPI));
+
+	if (cq->state & MLX4_EN_CQ_STATEPOLL_YIELD)
+		rc = true;
+	cq->state = MLX4_EN_CQ_STATEIDLE;
+	spin_unlock_bh(&cq->poll_lock);
+	return rc;
+}
+
+/* true if a socket is polling, even if it did not get the lock */
+static inline bool mlx4_en_cq_ll_polling(struct mlx4_en_cq *cq)
+{
+	WARN_ON(!(cq->state & MLX4_CQ_LOCKED));
+	return cq->state & CQ_USER_PEND;
+}
+#else
+static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq)
+{
+}
+
+static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq)
+{
+	return true;
+}
+
+static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_ll_polling(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
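
(A sketch of how a NAPI poll routine is expected to use the CQ
ownership helpers above, modeled loosely on the upstream RX poll path.
The function name is hypothetical; mlx4_en_process_rx_cq is the
prototype declared later in this header.)

	static int rx_poll_sketch(struct mlx4_en_cq *cq, int budget)
	{
		int done;

		/* a low-latency poller currently owns the CQ */
		if (!mlx4_en_cq_lock_napi(cq))
			return budget;

		done = mlx4_en_process_rx_cq(cq->dev, cq, budget);

		/* true if a poller yielded while NAPI held the CQ */
		if (mlx4_en_cq_unlock_napi(cq))
			done = budget;	/* ask to be polled again */

		return done;
	}
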
+
+#define MLX4_EN_WOL_DO_MODIFY (1ULL << 63)
+
 void mlx4_en_destroy_netdev(struct net_device *dev);
 int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 			struct mlx4_en_port_profile *prof);
@@ -585,31 +743,40 @@
 void mlx4_en_free_resources(struct mlx4_en_priv *priv);
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv);
 
-int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
-		      int entries, int ring, enum cq_type mode);
-void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
-int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_pre_config(struct mlx4_en_priv *priv);
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode, int node);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx);
 void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 
-void mlx4_en_poll_tx_cq(unsigned long data);
 void mlx4_en_tx_irq(struct mlx4_cq *mcq);
 u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb);
 
-int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring,
-			   u32 size, u16 stride);
-void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring);
+int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m);
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring **pring,
+			   u32 size, u16 stride, int node, int queue_idx);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring **pring);
 int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_tx_ring *ring,
-			     int cq);
+			     int cq, int user_prio);
 void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
 				struct mlx4_en_tx_ring *ring);
+void mlx4_en_qflush(struct ifnet *dev);
 
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
-			   struct mlx4_en_rx_ring *ring, u32 size);
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, int node);
 void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
-			     struct mlx4_en_rx_ring *ring);
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride);
+void mlx4_en_tx_que(void *context, int pending);
+void mlx4_en_rx_que(void *context, int pending);
 int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
 void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 				struct mlx4_en_rx_ring *ring);
@@ -616,41 +783,129 @@
 int mlx4_en_process_rx_cq(struct net_device *dev,
 			  struct mlx4_en_cq *cq,
 			  int budget);
-int mlx4_en_process_rx_cq_mb(struct net_device *dev,
-			      struct mlx4_en_cq *cq,
-			      int budget);
-void mlx4_en_tx_que(void *context, int pending);
-void mlx4_en_rx_que(void *context, int pending);
+void mlx4_en_poll_tx_cq(unsigned long data);
 void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
-			     int is_tx, int rss, int qpn, int cqn,
-			     struct mlx4_qp_context *context);
+		int is_tx, int rss, int qpn, int cqn, int user_prio,
+		struct mlx4_qp_context *context);
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
 int mlx4_en_map_buffer(struct mlx4_buf *buf);
 void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
+void mlx4_en_calc_rx_buf(struct net_device *dev);
 
-void mlx4_en_calc_rx_buf(struct net_device *dev);
-void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num);
 int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
 void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
 void mlx4_en_rx_irq(struct mlx4_cq *mcq);
 
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
-int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans);
-int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
-			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
-int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
-			   u8 promisc);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
 
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
 int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+int mlx4_en_get_vport_stats(struct mlx4_en_dev *mdev, u8 port);
+void mlx4_en_create_debug_files(struct mlx4_en_priv *priv);
+void mlx4_en_delete_debug_files(struct mlx4_en_priv *priv);
+int mlx4_en_register_debugfs(void);
+void mlx4_en_unregister_debugfs(void);
 
+#ifdef CONFIG_MLX4_EN_DCB
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops;
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops;
+#endif
+
+int mlx4_en_setup_tc(struct net_device *dev, u8 up);
+
+#ifdef CONFIG_RFS_ACCEL
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *rx_ring);
+#endif
+
 #define MLX4_EN_NUM_SELF_TEST	5
 void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
-u64 mlx4_en_mac_to_u64(u8 *addr);
+void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev);
 
 /*
+ * Functions for time stamping
+ */
+#define SKBTX_HW_TSTAMP (1 << 0)
+#define SKBTX_IN_PROGRESS (1 << 2)
+
+u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe);
+
+/* Functions for caching and restoring statistics */
+int mlx4_en_get_sset_count(struct net_device *dev, int sset);
+void mlx4_en_restore_ethtool_stats(struct mlx4_en_priv *priv,
+				    u64 *data);
+
+/*
  * Globals
  */
 extern const struct ethtool_ops mlx4_en_ethtool_ops;
+
+/*
+ * Defines for link speed - needed by selftest
+ */
+#define MLX4_EN_LINK_SPEED_1G	1000
+#define MLX4_EN_LINK_SPEED_10G	10000
+#define MLX4_EN_LINK_SPEED_40G	40000
+
+enum {
+        NETIF_MSG_DRV           = 0x0001,
+        NETIF_MSG_PROBE         = 0x0002,
+        NETIF_MSG_LINK          = 0x0004,
+        NETIF_MSG_TIMER         = 0x0008,
+        NETIF_MSG_IFDOWN        = 0x0010,
+        NETIF_MSG_IFUP          = 0x0020,
+        NETIF_MSG_RX_ERR        = 0x0040,
+        NETIF_MSG_TX_ERR        = 0x0080,
+        NETIF_MSG_TX_QUEUED     = 0x0100,
+        NETIF_MSG_INTR          = 0x0200,
+        NETIF_MSG_TX_DONE       = 0x0400,
+        NETIF_MSG_RX_STATUS     = 0x0800,
+        NETIF_MSG_PKTDATA       = 0x1000,
+        NETIF_MSG_HW            = 0x2000,
+        NETIF_MSG_WOL           = 0x4000,
+};
+
+
+/*
+ * printk / logging functions
+ */
+
+#define en_print(level, priv, format, arg...)                   \
+        {                                                       \
+        if ((priv)->registered)                                 \
+                printk(level "%s: %s: " format, DRV_NAME,       \
+                        (priv->dev)->if_xname, ## arg); \
+        else                                                    \
+                printk(level "%s: %s: Port %d: " format,        \
+                        DRV_NAME, dev_name(&priv->mdev->pdev->dev), \
+                        (priv)->port, ## arg);                  \
+        }
+
+
+#define en_dbg(mlevel, priv, format, arg...)			\
+do {								\
+	if (NETIF_MSG_##mlevel & priv->msg_enable)		\
+		en_print(KERN_DEBUG, priv, format, ##arg);	\
+} while (0)
+#define en_warn(priv, format, arg...)			\
+	en_print(KERN_WARNING, priv, format, ##arg)
+#define en_err(priv, format, arg...)			\
+	en_print(KERN_ERR, priv, format, ##arg)
+#define en_info(priv, format, arg...)			\
+	en_print(KERN_INFO, priv, format, ## arg)
+
+#define mlx4_err(mdev, format, arg...)			\
+	pr_err("%s %s: " format, DRV_NAME,		\
+	       dev_name(&mdev->pdev->dev), ##arg)
+#define mlx4_info(mdev, format, arg...)			\
+	pr_info("%s %s: " format, DRV_NAME,		\
+		dev_name(&mdev->pdev->dev), ##arg)
+#define mlx4_warn(mdev, format, arg...)			\
+	pr_warning("%s %s: " format, DRV_NAME,		\
+		   dev_name(&mdev->pdev->dev), ##arg)
+
 #endif


Property changes on: trunk/sys/ofed/drivers/net/mlx4/mlx4_en.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
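
As context for the logging macros added to mlx4_en.h above: en_dbg() emits nothing unless the NETIF_MSG_<category> bit is set in priv->msg_enable, so per-category debug output can be toggled at runtime. A minimal standalone sketch of the same gating (plain userspace C; everything except the NETIF_MSG_* values is made up):

    #include <stdio.h>

    #define NETIF_MSG_LINK 0x0004
    #define NETIF_MSG_INTR 0x0200

    /* Stands in for priv->msg_enable; only categories whose bit is set
     * produce output, mirroring the en_dbg() macro above. */
    static unsigned int msg_enable = NETIF_MSG_LINK;

    /* Uses the GNU ##__VA_ARGS__ extension, as the kernel macros do. */
    #define en_dbg_demo(mlevel, fmt, ...)                         \
            do {                                                  \
                    if (NETIF_MSG_##mlevel & msg_enable)          \
                            printf("debug: " fmt, ##__VA_ARGS__); \
            } while (0)

    int main(void)
    {
            en_dbg_demo(LINK, "link is up\n");      /* printed */
            en_dbg_demo(INTR, "irq %d fired\n", 3); /* suppressed */
            return 0;
    }
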
Added: trunk/sys/ofed/drivers/net/mlx4/mlx4_stats.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/mlx4_stats.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/net/mlx4/mlx4_stats.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MLX4_STATS_
+#define _MLX4_STATS_
+
+
+#ifdef MLX4_EN_PERF_STAT
+#define NUM_PERF_STATS			NUM_PERF_COUNTERS
+#else
+#define NUM_PERF_STATS			0
+#endif
+
+#define NUM_PRIORITIES	9
+#define NUM_PRIORITY_STATS 2
+
+struct mlx4_en_pkt_stats {
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long rx_multicast_packets;
+	unsigned long rx_broadcast_packets;
+	unsigned long rx_errors;
+	unsigned long rx_dropped;
+	unsigned long rx_length_errors;
+	unsigned long rx_over_errors;
+	unsigned long rx_crc_errors;
+	unsigned long rx_jabbers;
+	unsigned long rx_in_range_length_error;
+	unsigned long rx_out_range_length_error;
+	unsigned long rx_lt_64_bytes_packets;
+	unsigned long rx_127_bytes_packets;
+	unsigned long rx_255_bytes_packets;
+	unsigned long rx_511_bytes_packets;
+	unsigned long rx_1023_bytes_packets;
+	unsigned long rx_1518_bytes_packets;
+	unsigned long rx_1522_bytes_packets;
+	unsigned long rx_1548_bytes_packets;
+	unsigned long rx_gt_1548_bytes_packets;
+	unsigned long tx_packets;
+	unsigned long tx_bytes;
+	unsigned long tx_multicast_packets;
+	unsigned long tx_broadcast_packets;
+	unsigned long tx_errors;
+	unsigned long tx_dropped;
+	unsigned long tx_lt_64_bytes_packets;
+	unsigned long tx_127_bytes_packets;
+	unsigned long tx_255_bytes_packets;
+	unsigned long tx_511_bytes_packets;
+	unsigned long tx_1023_bytes_packets;
+	unsigned long tx_1518_bytes_packets;
+	unsigned long tx_1522_bytes_packets;
+	unsigned long tx_1548_bytes_packets;
+	unsigned long tx_gt_1548_bytes_packets;
+	unsigned long rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+	unsigned long tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+#define NUM_PKT_STATS		72
+};
+
+struct mlx4_en_vf_stats {
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long rx_multicast_packets;
+	unsigned long rx_broadcast_packets;
+	unsigned long rx_errors;
+	unsigned long rx_dropped;
+	unsigned long tx_packets;
+	unsigned long tx_bytes;
+	unsigned long tx_multicast_packets;
+	unsigned long tx_broadcast_packets;
+	unsigned long tx_errors;
+#define NUM_VF_STATS		11
+};
+
+struct mlx4_en_vport_stats {
+	unsigned long rx_unicast_packets;
+	unsigned long rx_unicast_bytes;
+	unsigned long rx_multicast_packets;
+	unsigned long rx_multicast_bytes;
+	unsigned long rx_broadcast_packets;
+	unsigned long rx_broadcast_bytes;
+	unsigned long rx_dropped;
+	unsigned long rx_errors;
+	unsigned long tx_unicast_packets;
+	unsigned long tx_unicast_bytes;
+	unsigned long tx_multicast_packets;
+	unsigned long tx_multicast_bytes;
+	unsigned long tx_broadcast_packets;
+	unsigned long tx_broadcast_bytes;
+	unsigned long tx_errors;
+#define NUM_VPORT_STATS		15
+};
+
+struct mlx4_en_port_stats {
+	unsigned long tso_packets;
+	unsigned long queue_stopped;
+	unsigned long wake_queue;
+	unsigned long tx_timeout;
+	unsigned long oversized_packets;
+	unsigned long rx_alloc_failed;
+	unsigned long rx_chksum_good;
+	unsigned long rx_chksum_none;
+	unsigned long tx_chksum_offload;
+#define NUM_PORT_STATS		8
+};
+
+struct mlx4_en_perf_stats {
+	u32 tx_poll;
+	u64 tx_pktsz_avg;
+	u32 inflight_avg;
+	u16 tx_coal_avg;
+	u16 rx_coal_avg;
+	u32 napi_quota;
+#define NUM_PERF_COUNTERS		6
+};
+
+struct mlx4_en_flow_stats {
+	u64 rx_pause;
+	u64 rx_pause_duration;
+	u64 rx_pause_transition;
+	u64 tx_pause;
+	u64 tx_pause_duration;
+	u64 tx_pause_transition;
+};
+#define MLX4_NUM_PRIORITIES	8
+#define NUM_FLOW_PRIORITY_STATS	6
+#define NUM_FLOW_STATS		(NUM_FLOW_PRIORITY_STATS*MLX4_NUM_PRIORITIES)
+
+
+struct mlx4_en_stat_out_flow_control_mbox {
+	/* Total number of PAUSE frames received from the far-end port */
+	__be64 rx_pause;
+	/* Total number of microseconds that far-end port requested to pause
+	 * transmission of packets
+	 */
+	__be64 rx_pause_duration;
+	/* Number of receiver transitions from XOFF state to XON state */
+	__be64 rx_pause_transition;
+	/* Total number of PAUSE frames sent to the far-end port */
+	__be64 tx_pause;
+	/* Total time in microseconds that transmission of packets has been
+	 * paused
+	 */
+	__be64 tx_pause_duration;
+	/* Number of transmitter transitions from XOFF state to XON state */
+	__be64 tx_pause_transition;
+	/* Reserved */
+	__be64 reserved[2];
+};
+
+int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port,
+                         struct mlx4_en_vport_stats *vport_stats,
+                         int reset);
+
+#define NUM_ALL_STATS	(NUM_PKT_STATS + NUM_FLOW_STATS + NUM_VPORT_STATS + \
+			 NUM_VF_STATS + NUM_PORT_STATS + NUM_PERF_STATS)
+#endif


Property changes on: trunk/sys/ofed/drivers/net/mlx4/mlx4_stats.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
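
The mlx4_en_stat_out_flow_control_mbox fields in the new header arrive big-endian from firmware, so the driver must byte-swap each counter before folding it into struct mlx4_en_flow_stats. A hedged sketch of that step (fold_flow_stats() is hypothetical; be64_to_cpu() is the kernel's usual byte-order helper):

    #include <asm/byteorder.h>	/* be64_to_cpu() */

    /* Hypothetical helper: convert the big-endian mailbox counters
     * defined above into the host-order flow statistics struct. */
    static void fold_flow_stats(const struct mlx4_en_stat_out_flow_control_mbox *mb,
                                struct mlx4_en_flow_stats *fs)
    {
            fs->rx_pause            = be64_to_cpu(mb->rx_pause);
            fs->rx_pause_duration   = be64_to_cpu(mb->rx_pause_duration);
            fs->rx_pause_transition = be64_to_cpu(mb->rx_pause_transition);
            fs->tx_pause            = be64_to_cpu(mb->tx_pause);
            fs->tx_pause_duration   = be64_to_cpu(mb->tx_pause_duration);
            fs->tx_pause_transition = be64_to_cpu(mb->tx_pause_transition);
    }
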
Modified: trunk/sys/ofed/drivers/net/mlx4/mr.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/mr.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/mr.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -32,52 +32,20 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
+#include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
 
 #include <linux/mlx4/cmd.h>
 
+#include <linux/math64.h>
+
 #include "mlx4.h"
 #include "icm.h"
 
-/*
- * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
- */
-struct mlx4_mpt_entry {
-	__be32 flags;
-	__be32 qpn;
-	__be32 key;
-	__be32 pd_flags;
-	__be64 start;
-	__be64 length;
-	__be32 lkey;
-	__be32 win_cnt;
-	u8	reserved1;
-	u8	flags2;
-	u8	reserved2;
-	u8	mtt_rep;
-	__be64 mtt_seg;
-	__be32 mtt_sz;
-	__be32 entity_size;
-	__be32 first_byte_offset;
-} __attribute__((packed));
-
-#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
-#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
-#define MLX4_MPT_FLAG_MIO	    (1 << 17)
-#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
-#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
-#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
-
-#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
-#define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
-#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
-
-#define MLX4_MPT_FLAG2_FBO_EN	     (1 <<  7)
-
-#define MLX4_MPT_STATUS_SW		0xF0
-#define MLX4_MPT_STATUS_HW		0x00
-
 static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
 {
 	int o;
@@ -141,9 +109,9 @@
 	buddy->max_order = max_order;
 	spin_lock_init(&buddy->lock);
 
-	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+	buddy->bits = kcalloc(buddy->max_order + 1, sizeof (long *),
 			      GFP_KERNEL);
-	buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *),
+	buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
 				  GFP_KERNEL);
 	if (!buddy->bits || !buddy->num_free)
 		goto err_out;
@@ -150,10 +118,9 @@
 
 	for (i = 0; i <= buddy->max_order; ++i) {
 		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
-		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
-		if (!buddy->bits[i])
+		buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN);
+		if (!buddy->bits[i])
 			goto err_out_free;
-		bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
 	}
 
 	set_bit(0, buddy->bits[buddy->max_order]);
@@ -183,24 +150,50 @@
 	kfree(buddy->num_free);
 }
 
-static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
 {
 	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
 	u32 seg;
+	int seg_order;
+	u32 offset;
 
-	seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, order);
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+
+	seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order);
 	if (seg == -1)
 		return -1;
 
-	if (mlx4_table_get_range(dev, &mr_table->mtt_table, seg,
-				 seg + (1 << order) - 1)) {
-		mlx4_buddy_free(&mr_table->mtt_buddy, seg, order);
+	offset = seg * (1 << log_mtts_per_seg);
+
+	if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset,
+				 offset + (1 << order) - 1)) {
+		mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order);
 		return -1;
 	}
 
-	return seg;
+	return offset;
 }
 
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, order);
+		err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT,
+						       RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_ALLOC_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return __mlx4_alloc_mtt_range(dev, order);
+}
+
 int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
 		  struct mlx4_mtt *mtt)
 {
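
A note on the hunk above: the reworked __mlx4_alloc_mtt_range() allocates from the buddy in units of segments (2^log_mtts_per_seg MTT entries each) but returns an offset counted in individual MTT entries. A standalone sketch of the arithmetic with made-up numbers:

    /* Illustration of the segment math in __mlx4_alloc_mtt_range();
     * the values below are invented for the example. */
    #include <stdio.h>

    int main(void)
    {
            int log_mtts_per_seg = 3;          /* 8 MTT entries per segment */
            int order = 5;                     /* caller wants 2^5 = 32 MTTs */
            int seg_order = order - log_mtts_per_seg;
            if (seg_order < 0)
                    seg_order = 0;             /* the max_t(int, ..., 0) clamp */
            unsigned seg = 6;                  /* pretend buddy allocator result */
            unsigned offset = seg * (1u << log_mtts_per_seg);

            printf("seg_order=%d offset=%u\n", seg_order, offset); /* 2, 48 */
            return 0;
    }
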
@@ -213,33 +206,66 @@
 	} else
 		mtt->page_shift = page_shift;
 
-	for (mtt->order = 0, i = dev->caps.mtts_per_seg; i < npages; i <<= 1)
+	for (mtt->order = 0, i = 1; i < npages; i <<= 1)
 		++mtt->order;
 
-	mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order);
-	if (mtt->first_seg == -1)
+	mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order);
+	if (mtt->offset == -1) {
+		mlx4_err(dev, "Failed to allocate mtts for %d pages(order %d)\n",
+			 npages, mtt->order);
 		return -ENOMEM;
+	}
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_mtt_init);
 
-void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
 {
+	u32 first_seg;
+	int seg_order;
 	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
 
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+	first_seg = offset / (1 << log_mtts_per_seg);
+
+	mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order);
+	mlx4_table_put_range(dev, &mr_table->mtt_table, offset,
+			     offset + (1 << order) - 1);
+}
+
+static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, offset);
+		set_param_h(&in_param, order);
+		err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_FREE_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to free mtt range at:"
+				  "%d order:%d\n", offset, order);
+		return;
+	}
+	 __mlx4_free_mtt_range(dev, offset, order);
+}
+
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
 	if (mtt->order < 0)
 		return;
 
-	mlx4_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, mtt->order);
-	mlx4_table_put_range(dev, &mr_table->mtt_table, mtt->first_seg,
-			     mtt->first_seg + (1 << mtt->order) - 1);
+	mlx4_free_mtt_range(dev, mtt->offset, mtt->order);
 }
 EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
 
 u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
 {
-	return (u64) mtt->first_seg * dev->caps.mtt_entry_sz;
+	return (u64) mtt->offset * dev->caps.mtt_entry_sz;
 }
 EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
 
@@ -256,8 +282,9 @@
 static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			  int mpt_index)
 {
-	return mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT,
-			MLX4_CMD_TIME_CLASS_B);
+	return mlx4_cmd(dev, mailbox->dma, mpt_index,
+			0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
@@ -264,54 +291,127 @@
 			  int mpt_index)
 {
 	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
-			    !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B);
+			    !mailbox, MLX4_CMD_HW2SW_MPT,
+			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 }
 
-int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx)
+static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			   u64 iova, u64 size, u32 access, int npages,
+			   int page_shift, struct mlx4_mr *mr)
 {
+	mr->iova       = iova;
+	mr->size       = size;
+	mr->pd	       = pd;
+	mr->access     = access;
+	mr->enabled    = MLX4_MPT_DISABLED;
+	mr->key	       = hw_index_to_key(mridx);
+
+	return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+}
+
+static int mlx4_WRITE_MTT(struct mlx4_dev *dev,
+			  struct mlx4_cmd_mailbox *mailbox,
+			  int num_entries)
+{
+	return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT,
+			MLX4_CMD_TIME_CLASS_A,  MLX4_CMD_WRAPPED);
+}
+
+int __mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	u32 mridx;
 
-	mridx = mlx4_bitmap_alloc_range(&priv->mr_table.mpt_bitmap, cnt, align);
-	if (mridx == -1)
-		return -ENOMEM;
+	return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+}
 
-	*base_mridx = mridx;
-	return 0;
+static int mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
+	u64 out_param;
 
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return  __mlx4_mpt_reserve(dev);
 }
-EXPORT_SYMBOL_GPL(mlx4_mr_reserve_range);
 
-void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt)
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	mlx4_bitmap_free_range(&priv->mr_table.mpt_bitmap, base_mridx, cnt);
+
+	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index, MLX4_NO_RR);
 }
-EXPORT_SYMBOL_GPL(mlx4_mr_release_range);
 
-int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
-			   u64 iova, u64 size, u32 access, int npages,
-			   int page_shift, struct mlx4_mr *mr)
+static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
 {
-	mr->iova       = iova;
-	mr->size       = size;
-	mr->pd	       = pd;
-	mr->access     = access;
-	mr->enabled    = 0;
-	mr->key	       = hw_index_to_key(mridx);
+	u64 in_param = 0;
 
-	return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to release mr index:%d\n",
+				  index);
+		return;
+	}
+	__mlx4_mpt_release(dev, index);
 }
-EXPORT_SYMBOL_GPL(mlx4_mr_alloc_reserved);
 
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	return mlx4_table_get(dev, &mr_table->dmpt_table, index);
+}
+
+static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index)
+{
+	u64 param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, index);
+		return mlx4_cmd_imm(dev, param, &param, RES_MPT, RES_OP_MAP_ICM,
+							MLX4_CMD_ALLOC_RES,
+							MLX4_CMD_TIME_CLASS_A,
+							MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_mpt_alloc_icm(dev, index);
+}
+
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_table_put(dev, &mr_table->dmpt_table, index);
+}
+
+static void mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of mr index:%d\n",
+				  index);
+		return;
+	}
+	return __mlx4_mpt_free_icm(dev, index);
+}
+
 int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
 		  int npages, int page_shift, struct mlx4_mr *mr)
 {
-	struct mlx4_priv *priv = mlx4_priv(dev);
 	u32 index;
 	int err;
 
-	index = mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+	index = mlx4_mpt_reserve(dev);
 	if (index == -1)
 		return -ENOMEM;
 
@@ -318,44 +418,55 @@
 	err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size,
 				     access, npages, page_shift, mr);
 	if (err)
-		mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index);
+		mlx4_mpt_release(dev, index);
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
 
-void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
+static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
 {
 	int err;
 
-	if (mr->enabled) {
+	if (mr->enabled == MLX4_MPT_EN_HW) {
 		err = mlx4_HW2SW_MPT(dev, NULL,
 				     key_to_hw_index(mr->key) &
 				     (dev->caps.num_mpts - 1));
-		if (err)
-			mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+		if (err) {
+			mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
+			mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
+			return err;
+		}
+
+		mr->enabled = MLX4_MPT_EN_SW;
 	}
+	mlx4_mtt_cleanup(dev, &mr->mtt);
 
-	mlx4_mtt_cleanup(dev, &mr->mtt);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(mlx4_mr_free_reserved);
 
-void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
 {
-	struct mlx4_priv *priv = mlx4_priv(dev);
-	mlx4_mr_free_reserved(dev, mr);
-	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, key_to_hw_index(mr->key));
+	int ret;
+
+	ret = mlx4_mr_free_reserved(dev, mr);
+	if (ret)
+		return ret;
+	if (mr->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mr->key));
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_free);
 
 int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
 {
-	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_mpt_entry *mpt_entry;
 	int err;
 
-	err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key));
 	if (err)
 		return err;
 
@@ -380,9 +491,10 @@
 
 	if (mr->mtt.order < 0) {
 		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
-		mpt_entry->mtt_seg = 0;
+		mpt_entry->mtt_addr = 0;
 	} else {
-		mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt));
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
 	}
 
 	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
@@ -390,8 +502,7 @@
 		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
 		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
 						   MLX4_MPT_PD_FLAG_RAE);
-		mpt_entry->mtt_sz    = cpu_to_be32((1 << mr->mtt.order) *
-						   dev->caps.mtts_per_seg);
+		mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
 	} else {
 		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
 	}
@@ -402,9 +513,8 @@
 		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
 		goto err_cmd;
 	}
+	mr->enabled = MLX4_MPT_EN_HW;
 
-	mr->enabled = 1;
-
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
 	return 0;
@@ -413,7 +523,7 @@
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
 err_table:
-	mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_enable);
@@ -425,50 +535,94 @@
 	__be64 *mtts;
 	dma_addr_t dma_handle;
 	int i;
-	int s = start_index * sizeof (u64);
 
-	/* All MTTs must fit in the same page */
-	if (start_index / (PAGE_SIZE / sizeof (u64)) !=
-	    (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64)))
-		return -EINVAL;
+	mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset +
+			       start_index, &dma_handle);
 
-	if (start_index & (dev->caps.mtts_per_seg - 1))
-		return -EINVAL;
-
-	mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg +
-				s / dev->caps.mtt_entry_sz, &dma_handle);
 	if (!mtts)
 		return -ENOMEM;
 
+	dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle,
+				npages * sizeof (u64), DMA_TO_DEVICE);
+
 	for (i = 0; i < npages; ++i)
 		mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
 
-	dma_sync_single(&dev->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE);
+	dma_sync_single_for_device(&dev->pdev->dev, dma_handle,
+				   npages * sizeof (u64), DMA_TO_DEVICE);
 
 	return 0;
 }
 
-int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
-		   int start_index, int npages, u64 *page_list)
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list)
 {
+	int err = 0;
 	int chunk;
-	int err;
+	int mtts_per_page;
+	int max_mtts_first_page;
 
-	if (mtt->order < 0)
-		return -EINVAL;
+	/* compute how many MTTs fit in the first page */
+	mtts_per_page = PAGE_SIZE / sizeof(u64);
+	max_mtts_first_page = mtts_per_page - (mtt->offset + start_index)
+			      % mtts_per_page;
 
+	chunk = min_t(int, max_mtts_first_page, npages);
+
 	while (npages > 0) {
-		chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages);
 		err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list);
 		if (err)
 			return err;
-
 		npages      -= chunk;
 		start_index += chunk;
 		page_list   += chunk;
+
+		chunk = min_t(int, mtts_per_page, npages);
 	}
+	return err;
+}
 
-	return 0;
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	__be64 *inbox = NULL;
+	int chunk;
+	int err = 0;
+	int i;
+
+	if (mtt->order < 0)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		while (npages > 0) {
+			chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2,
+				      npages);
+			inbox[0] = cpu_to_be64(mtt->offset + start_index);
+			inbox[1] = 0;
+			for (i = 0; i < chunk; ++i)
+				inbox[i + 2] = cpu_to_be64(page_list[i] |
+					       MLX4_MTT_FLAG_PRESENT);
+			err = mlx4_WRITE_MTT(dev, mailbox, chunk);
+			if (err) {
+				mlx4_free_cmd_mailbox(dev, mailbox);
+				return err;
+			}
+
+			npages      -= chunk;
+			start_index += chunk;
+			page_list   += chunk;
+		}
+		mlx4_free_cmd_mailbox(dev, mailbox);
+		return err;
+	}
+
+	return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list);
 }
 EXPORT_SYMBOL_GPL(mlx4_write_mtt);
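
Worth spelling out for the hunk above: __mlx4_write_mtt() clamps only the first chunk so that (mtt->offset + start_index) never crosses a page of the MTT table mid-write; every later chunk then starts page-aligned and may use a full page of entries. A standalone sketch of the loop with made-up sizes:

    /* Illustration of the first-chunk clamp in __mlx4_write_mtt(). */
    #include <stdio.h>

    int main(void)
    {
            int mtts_per_page = 4096 / 8;   /* PAGE_SIZE / sizeof(u64) = 512 */
            int offset = 448, start = 52;   /* made-up mtt->offset / start_index */
            int npages = 1000;

            /* first chunk: only up to the next page boundary (12 here) */
            int chunk = mtts_per_page - (offset + start) % mtts_per_page;
            if (chunk > npages)
                    chunk = npages;
            while (npages > 0) {
                    printf("write %d entries at %d\n", chunk, offset + start);
                    npages -= chunk;
                    start  += chunk;
                    /* subsequent chunks are page aligned: full pages */
                    chunk = npages < mtts_per_page ? npages : mtts_per_page;
            }
            return 0;
    }
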
 
@@ -484,7 +638,7 @@
 		return -ENOMEM;
 
 	for (i = 0; i < buf->npages; ++i)
-		if (buf->direct.map)
+		if (buf->nbufs == 1)
 			page_list[i] = buf->direct.map + (i << buf->page_shift);
 		else
 			page_list[i] = buf->page_list[i].map;
@@ -496,11 +650,106 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
 
+int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type,
+		  struct mlx4_mw *mw)
+{
+	u32 index;
+
+	index = mlx4_mpt_reserve(dev);
+	if (index == -1)
+		return -ENOMEM;
+
+	mw->key	    = hw_index_to_key(index);
+	mw->pd      = pd;
+	mw->type    = type;
+	mw->enabled = MLX4_MPT_DISABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_alloc);
+
+int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key));
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	memset(mpt_entry, 0, sizeof(*mpt_entry));
+
+	/* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned
+	 * off, thus creating a memory window and not a memory region.
+	 */
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mw->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mw->pd);
+	if (mw->type == MLX4_MW_TYPE_2) {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->qpn       = cpu_to_be32(MLX4_MPT_QP_FLAG_BOUND_QP);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_EN_INV);
+	}
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mw->key) &
+			     (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+	mw->enabled = MLX4_MPT_EN_HW;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_enable);
+
+void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	int err;
+
+	if (mw->enabled == MLX4_MPT_EN_HW) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mw->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err)
+			mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err);
+
+		mw->enabled = MLX4_MPT_EN_SW;
+	}
+	if (mw->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mw->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_free);
+
 int mlx4_init_mr_table(struct mlx4_dev *dev)
 {
-	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
 	int err;
 
+	/* Nothing to do for slaves - all MR handling is forwarded
+	 * to the master */
+	if (mlx4_is_slave(dev))
+		return 0;
+
 	if (!is_power_of_2(dev->caps.num_mpts))
 		return -EINVAL;
 
@@ -510,13 +759,17 @@
 		return err;
 
 	err = mlx4_buddy_init(&mr_table->mtt_buddy,
-			      ilog2(dev->caps.num_mtt_segs));
+			      ilog2(div_u64(dev->caps.num_mtts,
+			      (1 << log_mtts_per_seg))));
 	if (err)
 		goto err_buddy;
 
 	if (dev->caps.reserved_mtts) {
-		if (mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) {
-			mlx4_warn(dev, "MTT table of order %d is too small.\n",
+		priv->reserved_mtts =
+			mlx4_alloc_mtt_range(dev,
+					     fls(dev->caps.reserved_mtts - 1));
+		if (priv->reserved_mtts < 0) {
+			mlx4_warn(dev, "MTT table of order %u is too small.\n",
 				  mr_table->mtt_buddy.max_order);
 			err = -ENOMEM;
 			goto err_reserve_mtts;
@@ -536,8 +789,14 @@
 
 void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
 {
-	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
 
+	if (mlx4_is_slave(dev))
+		return;
+	if (priv->reserved_mtts >= 0)
+		mlx4_free_mtt_range(dev, priv->reserved_mtts,
+				    fls(dev->caps.reserved_mtts - 1));
 	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
 	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
 }
@@ -569,9 +828,8 @@
 	return 0;
 }
 
-int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
-			  u64 *page_list, int npages, u64 iova, u32 fbo,
-			  u32 len, u32 *lkey, u32 *rkey, int same_key)
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey)
 {
 	u32 key;
 	int i, err;
@@ -583,8 +841,7 @@
 	++fmr->maps;
 
 	key = key_to_hw_index(fmr->mr.key);
-	if (!same_key)
-		key += dev->caps.num_mpts;
+	key += dev->caps.num_mpts;
 	*lkey = *rkey = fmr->mr.key = hw_index_to_key(key);
 
 	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
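
The key bump in the hunk above (key += dev->caps.num_mpts before re-encoding) changes the tag bits of the lkey/rkey on every remap, so a stale key from a previous mapping is rejected by the HCA while the underlying MPT slot stays the same. A standalone sketch, assuming the byte-rotate hw_index_to_key()/key_to_hw_index() helpers from mlx4.h:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed to match the mlx4.h inline helpers: rotate index/key bytes. */
    static uint32_t hw_index_to_key(uint32_t ind) { return (ind >> 24) | (ind << 8); }
    static uint32_t key_to_hw_index(uint32_t key) { return (key << 24) | (key >> 8); }

    int main(void)
    {
            uint32_t num_mpts = 1 << 17;          /* made-up dev->caps.num_mpts */
            uint32_t key = hw_index_to_key(42);   /* initial key for MPT slot 42 */

            uint32_t idx = key_to_hw_index(key);  /* back to 42 */
            idx += num_mpts;                      /* bump the tag bits */
            key = hw_index_to_key(idx);
            printf("new key 0x%08x, same slot %u\n",
                   key, key_to_hw_index(key) & (num_mpts - 1)); /* slot 42 */
            return 0;
    }
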
@@ -592,18 +849,19 @@
 	/* Make sure MPT status is visible before writing MTT entries */
 	wmb();
 
+	dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle,
+				npages * sizeof(u64), DMA_TO_DEVICE);
+
 	for (i = 0; i < npages; ++i)
 		fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
 
-	dma_sync_single(&dev->pdev->dev, fmr->dma_handle,
-			npages * sizeof(u64), DMA_TO_DEVICE);
+	dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle,
+				   npages * sizeof(u64), DMA_TO_DEVICE);
 
 	fmr->mpt->key    = cpu_to_be32(key);
 	fmr->mpt->lkey   = cpu_to_be32(key);
-	fmr->mpt->length = cpu_to_be64(len);
+	fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift));
 	fmr->mpt->start  = cpu_to_be64(iova);
-	fmr->mpt->first_byte_offset = cpu_to_be32(fbo & 0x001fffff);
-	fmr->mpt->flags2 = (fbo ? MLX4_MPT_FLAG2_FBO_EN : 0);
 
 	/* Make sure MTT entries are visible before setting MPT status */
 	wmb();
@@ -615,16 +873,6 @@
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr_fbo);
-
-int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
-		      int npages, u64 iova, u32 *lkey, u32 *rkey)
-{
-	u32 len = npages * (1ull << fmr->page_shift);
-
-	return mlx4_map_phys_fmr_fbo(dev, fmr, page_list, npages, iova, 0,
-				     len, lkey, rkey, 0);
-}
 EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr);
 
 int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
@@ -631,9 +879,11 @@
 		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	u64 mtt_seg;
-	int err = -ENOMEM;
+	int err = -ENOMEM, ret;
 
+	if (max_maps > dev->caps.max_fmr_maps)
+		return -EINVAL;
+
 	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
 		return -EINVAL;
 
@@ -651,11 +901,10 @@
 	if (err)
 		return err;
 
-	mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz;
-
 	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
-				    fmr->mr.mtt.first_seg,
+				    fmr->mr.mtt.offset,
 				    &fmr->dma_handle);
+
 	if (!fmr->mtts) {
 		err = -ENOMEM;
 		goto err_free;
@@ -664,54 +913,13 @@
 	return 0;
 
 err_free:
-	mlx4_mr_free(dev, &fmr->mr);
+	ret = mlx4_mr_free(dev, &fmr->mr);
+	if (ret)
+		mlx4_err(dev, "Error deregistering MR. The system may have become unstable.");
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_fmr_alloc);
 
-int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx,
-			    u32 pd, u32 access, int max_pages,
-			    int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
-{
-	struct mlx4_priv *priv = mlx4_priv(dev);
-	u64 mtt_seg;
-	int err = -ENOMEM;
-
-	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
-		return -EINVAL;
-
-	/* All MTTs must fit in the same page */
-	if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
-		return -EINVAL;
-
-	fmr->page_shift = page_shift;
-	fmr->max_pages  = max_pages;
-	fmr->max_maps   = max_maps;
-	fmr->maps = 0;
-
-	err = mlx4_mr_alloc_reserved(dev, mridx, pd, 0, 0, access, max_pages,
-				     page_shift, &fmr->mr);
-	if (err)
-		return err;
-
-	mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz;
-
-	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
-				    fmr->mr.mtt.first_seg,
-				    &fmr->dma_handle);
-	if (!fmr->mtts) {
-		err = -ENOMEM;
-		goto err_free;
-	}
-
-	return 0;
-
-err_free:
-	mlx4_mr_free_reserved(dev, &fmr->mr);
-	return err;
-}
-EXPORT_SYMBOL_GPL(mlx4_fmr_alloc_reserved);
-
 int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -733,41 +941,56 @@
 void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 		    u32 *lkey, u32 *rkey)
 {
+	u32 key;
+
 	if (!fmr->maps)
 		return;
 
+	key = key_to_hw_index(fmr->mr.key) & (dev->caps.num_mpts - 1);
+
+	*(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW;
+
+	/* Make sure MPT status is visible before changing MPT fields */
+	wmb();
+
+	fmr->mr.key = hw_index_to_key(key);
+
+	fmr->mpt->key    = cpu_to_be32(key);
+	fmr->mpt->lkey   = cpu_to_be32(key);
+	fmr->mpt->length = 0;
+	fmr->mpt->start  = 0;
+
+	/* Make sure MPT data is visible before changing MPT status */
+	wmb();
+
+	*(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW;
+
+	/* Make sure MPT status is visible */
+	wmb();
+
 	fmr->maps = 0;
-
-	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
 }
 EXPORT_SYMBOL_GPL(mlx4_fmr_unmap);
 
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
 {
+	int ret;
+
 	if (fmr->maps)
 		return -EBUSY;
 
-	fmr->mr.enabled = 0;
-	mlx4_mr_free(dev, &fmr->mr);
+	ret = mlx4_mr_free(dev, &fmr->mr);
+	if (ret)
+		return ret;
+	fmr->mr.enabled = MLX4_MPT_DISABLED;
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_fmr_free);
 
-int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
-{
-	if (fmr->maps)
-		return -EBUSY;
-
-	fmr->mr.enabled = 0;
-	mlx4_mr_free_reserved(dev, &fmr->mr);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(mlx4_fmr_free_reserved);
-
 int mlx4_SYNC_TPT(struct mlx4_dev *dev)
 {
-	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000);
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000,
+			MLX4_CMD_NATIVE);
 }
 EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT);

Modified: trunk/sys/ofed/drivers/net/mlx4/pd.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/pd.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/pd.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,11 +31,11 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/errno.h>
+#include <linux/module.h>
 #include <linux/io-mapping.h>
 
-#include <asm/page.h>
+#include <linux/page.h>
 
 #include "mlx4.h"
 #include "icm.h"
@@ -58,16 +58,70 @@
 
 void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
 {
-	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn);
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn, MLX4_USE_RR);
 }
 EXPORT_SYMBOL_GPL(mlx4_pd_free);
 
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap);
+	if (*xrcdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param,
+				   RES_XRCD, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*xrcdn = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_xrcd_alloc(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn, MLX4_USE_RR);
+}
+
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, xrcdn);
+		err = mlx4_cmd(dev, in_param, RES_XRCD,
+			       RES_OP_RESERVE, MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn);
+	} else
+		__mlx4_xrcd_free(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
 int mlx4_init_pd_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
-				(1 << 24) - 1, dev->caps.reserved_pds, 0);
+				(1 << NOT_MASKED_PD_BITS) - 1,
+				 dev->caps.reserved_pds, 0);
 }
 
 void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
@@ -75,16 +129,34 @@
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
 }
 
+int mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
 
+	return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16),
+				(1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
 int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
 {
+	int offset;
+
 	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
 	if (uar->index == -1)
 		return -ENOMEM;
 
-	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+	if (mlx4_is_slave(dev))
+		offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) /
+				       dev->caps.uar_page_size);
+	else
+		offset = uar->index;
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset;
 	uar->map = NULL;
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
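
In the slave branch added to mlx4_uar_alloc() above, a VF's BAR 2 maps only its own UAR window, so the globally allocated index is reduced modulo the number of UAR pages that fit in the BAR before the pfn is computed. Worked example with made-up numbers:

    /* Arithmetic behind the slave branch of mlx4_uar_alloc(). */
    #include <stdio.h>

    int main(void)
    {
            long bar_start_pfn = 0xf0000; /* pci_resource_start() >> PAGE_SHIFT */
            long bar_len = 1 << 20;       /* pci_resource_len(dev->pdev, 2) */
            int uar_page_size = 4096;
            int index = 300;              /* from the UAR bitmap */

            int pages_in_bar = bar_len / uar_page_size;  /* 256 */
            int offset = index % pages_in_bar;           /* 44 for a slave */
            printf("uar->pfn = 0x%lx\n", bar_start_pfn + offset);
            return 0;
    }
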
@@ -91,11 +163,12 @@
 
 void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
 {
-	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index);
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index, MLX4_USE_RR);
 }
 EXPORT_SYMBOL_GPL(mlx4_uar_free);
 
-int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf)
+#ifndef CONFIG_PPC
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_uar *uar;
@@ -113,10 +186,13 @@
 			err = -ENOMEM;
 			goto out;
 		}
-		uar = kmalloc(sizeof *uar, GFP_KERNEL);
+		uar = kmalloc_node(sizeof *uar, GFP_KERNEL, node);
 		if (!uar) {
-			err = -ENOMEM;
-			goto out;
+			uar = kmalloc(sizeof *uar, GFP_KERNEL);
+			if (!uar) {
+				err = -ENOMEM;
+				goto out;
+			}
 		}
 		err = mlx4_uar_alloc(dev, uar);
 		if (err)
@@ -191,6 +267,21 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_bf_free);
 
+#else
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
+{
+	memset(bf, 0, sizeof *bf);
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	return;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+#endif
+
 int mlx4_init_uar_table(struct mlx4_dev *dev)
 {
 	if (dev->caps.num_uars <= 128) {
@@ -202,7 +293,7 @@
 
 	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
 				dev->caps.num_uars, dev->caps.num_uars - 1,
-				max(128, dev->caps.reserved_uars), 0);
+				dev->caps.reserved_uars, 0);
 }
 
 void mlx4_cleanup_uar_table(struct mlx4_dev *dev)

Modified: trunk/sys/ofed/drivers/net/mlx4/port.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/port.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/port.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -32,17 +32,22 @@
 
 #include <linux/errno.h>
 #include <linux/if_ether.h>
+#include <linux/module.h>
+#include <linux/err.h>
 
 #include <linux/mlx4/cmd.h>
-
+#include <linux/moduleparam.h>
 #include "mlx4.h"
+#include "mlx4_stats.h"
 
-int mlx4_ib_set_4k_mtu = 0;
-module_param_named(set_4k_mtu, mlx4_ib_set_4k_mtu, int, 0444);
-MODULE_PARM_DESC(set_4k_mtu, "attempt to set 4K MTU to all ConnectX ports");
 
+int mlx4_set_4k_mtu = -1;
+module_param_named(set_4k_mtu, mlx4_set_4k_mtu, int, 0444);
+MODULE_PARM_DESC(set_4k_mtu,
+	"(Obsolete) attempt to set 4K MTU to all ConnectX ports");
+
+
 #define MLX4_MAC_VALID		(1ull << 63)
-#define MLX4_MAC_MASK		0xffffffffffffULL
 
 #define MLX4_VLAN_VALID		(1u << 31)
 #define MLX4_VLAN_MASK		0xfff
@@ -69,10 +74,36 @@
 		table->entries[i] = 0;
 		table->refs[i]	 = 0;
 	}
-	table->max   = 1 << dev->caps.log_num_vlans;
+	table->max   = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR;
 	table->total = 0;
 }
 
+static int validate_index(struct mlx4_dev *dev,
+			  struct mlx4_mac_table *table, int index)
+{
+	int err = 0;
+
+	if (index < 0 || index >= table->max || !table->refs[index]) {
+		mlx4_warn(dev, "No valid Mac entry for the given index\n");
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static int find_index(struct mlx4_dev *dev,
+		      struct mlx4_mac_table *table, u64 mac)
+{
+	int i;
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if ((mac & MLX4_MAC_MASK) ==
+		    (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))
+			return i;
+	}
+	/* MAC not found */
+	return -EINVAL;
+}
+
 static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
 				   __be64 *entries)
 {
@@ -87,40 +118,40 @@
 	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
 
 	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+
 	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
 
-int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index)
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
 {
-	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
 	int i, err = 0;
 	int free = -1;
 
-	mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac);
+	mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n",
+		 (unsigned long long) mac, port);
+
 	mutex_lock(&table->mutex);
-	for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) {
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
 		if (free < 0 && !table->refs[i]) {
 			free = i;
 			continue;
 		}
 
-		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
-			/* MAC already registered, increase refernce count */
-			*index = i;
+		if ((mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) &&
+		    table->refs[i]) {
+			/* MAC already registered; must not have duplicates */
+			err = i;
 			++table->refs[i];
 			goto out;
 		}
 	}
 
-	if (free < 0) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	mlx4_dbg(dev, "Free MAC index is %d\n", free);
 
 	if (table->total == table->max) {
@@ -130,39 +161,87 @@
 	}
 
 	/* Register new MAC */
-	table->refs[free] = 1;
 	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
 
 	err = mlx4_set_port_mac_table(dev, port, table->entries);
 	if (unlikely(err)) {
-		mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac);
-		table->refs[free] = 0;
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) mac);
 		table->entries[free] = 0;
 		goto out;
 	}
+	table->refs[free] = 1;
 
-	*index = free;
+	err = free;
 	++table->total;
 out:
 	mutex_unlock(&table->mutex);
 	return err;
 }
+EXPORT_SYMBOL_GPL(__mlx4_register_mac);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+	int err = -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			err = mlx4_cmd_imm(dev, mac, &out_param,
+					   ((u32) port) << 8 | (u32) RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		if (err && err == -EINVAL && mlx4_is_slave(dev)) {
+			/* retry using old REG_MAC format */
+			set_param_l(&out_param, port);
+			err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (!err)
+				dev->flags |= MLX4_FLAG_OLD_REG_MAC;
+		}
+		if (err)
+			return err;
+
+		return get_param_l(&out_param);
+	}
+	return __mlx4_register_mac(dev, port, mac);
+}
 EXPORT_SYMBOL_GPL(mlx4_register_mac);
 
-void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index)
+int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port)
 {
-	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+	return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] +
+			(port - 1) * (1 << dev->caps.log_num_macs);
+}
+EXPORT_SYMBOL_GPL(mlx4_get_base_qpn);
 
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	struct mlx4_port_info *info;
+	struct mlx4_mac_table *table;
+	int index;
+
+	if (port < 1 || port > dev->caps.num_ports) {
+		mlx4_warn(dev, "invalid port number (%d), aborting...\n", port);
+		return;
+	}
+	info = &mlx4_priv(dev)->port[port];
+	table = &info->mac_table;
 	mutex_lock(&table->mutex);
-	if (!table->refs[index]) {
-		mlx4_warn(dev, "No MAC entry for index %d\n", index);
+
+	index = find_index(dev, table, mac);
+
+	if (validate_index(dev, table, index))
 		goto out;
-	}
+
 	if (--table->refs[index]) {
-		mlx4_warn(dev, "Have more references for index %d,"
-			  "no need to modify MAC table\n", index);
+		mlx4_dbg(dev, "Have more references for index %d,"
+			 "no need to modify mac table\n", index);
 		goto out;
 	}
+
 	table->entries[index] = 0;
 	mlx4_set_port_mac_table(dev, port, table->entries);
 	--table->total;
@@ -169,8 +248,60 @@
 out:
 	mutex_unlock(&table->mutex);
 }
+EXPORT_SYMBOL_GPL(__mlx4_unregister_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			(void) mlx4_cmd_imm(dev, mac, &out_param,
+					    ((u32) port) << 8 | (u32) RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		} else {
+			/* use old unregister mac format */
+			set_param_l(&out_param, port);
+			(void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		return;
+	}
+	__mlx4_unregister_mac(dev, port, mac);
+	return;
+}
 EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
 
+int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int index = qpn - info->base_qpn;
+	int err = 0;
+
+	/* CX1 doesn't support multi-functions */
+	mutex_lock(&table->mutex);
+
+	err = validate_index(dev, table, index);
+	if (err)
+		goto out;
+
+	table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) new_mac);
+		table->entries[index] = 0;
+	}
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__mlx4_replace_mac);
+
 static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
 				    __be32 *entries)
 {
@@ -185,7 +316,7 @@
 	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
 	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
 	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
@@ -201,7 +332,7 @@
 		if (table->refs[i] &&
 		    (vid == (MLX4_VLAN_MASK &
 			      be32_to_cpu(table->entries[i])))) {
-			/* Vlan already registered, increase refernce count */
+			/* VLAN already registered, increase reference count */
 			*idx = i;
 			return 0;
 		}
@@ -211,7 +342,8 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan);
 
-int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
+				int *index)
 {
 	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
 	int i, err = 0;
@@ -218,6 +350,13 @@
 	int free = -1;
 
 	mutex_lock(&table->mutex);
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
 	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
 		if (free < 0 && (table->refs[i] == 0)) {
 			free = i;
@@ -227,7 +366,7 @@
 		if (table->refs[i] &&
 		    (vlan == (MLX4_VLAN_MASK &
 			      be32_to_cpu(table->entries[i])))) {
-			/* Vlan already registered, increase refernce count */
+			/* VLAN already registered, increase reference count */
 			*index = i;
 			++table->refs[i];
 			goto out;
@@ -239,13 +378,7 @@
 		goto out;
 	}
 
-	if (table->total == table->max) {
-		/* No free vlan entries */
-		err = -ENOSPC;
-		goto out;
-	}
-
-	/* Register new MAC */
+	/* Register new VLAN */
 	table->refs[free] = 1;
 	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
 
@@ -263,25 +396,49 @@
 	mutex_unlock(&table->mutex);
 	return err;
 }
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	u64 out_param = 0;
+	int err;
+
+	if (vlan > 4095)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, vlan, &out_param,
+				   ((u32) port) << 8 | (u32) RES_VLAN,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*index = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_register_vlan(dev, port, vlan, index);
+}
 EXPORT_SYMBOL_GPL(mlx4_register_vlan);
 
-void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index)
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
 {
 	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int index;
 
+	mutex_lock(&table->mutex);
+	if (mlx4_find_cached_vlan(dev, port, vlan, &index)) {
+		mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan);
+		goto out;
+	}
+
 	if (index < MLX4_VLAN_REGULAR) {
 		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
-		return;
+		goto out;
 	}
 
-	mutex_lock(&table->mutex);
-	if (!table->refs[index]) {
-		mlx4_warn(dev, "No vlan entry for index %d\n", index);
-		goto out;
-	}
 	if (--table->refs[index]) {
-		mlx4_dbg(dev, "Have more references for index %d,"
-			 "no need to modify vlan table\n", index);
+		mlx4_dbg(dev, "Have %d more references for index %d, "
+			 "no need to modify vlan table\n", table->refs[index],
+			 index);
 		goto out;
 	}
 	table->entries[index] = 0;
@@ -290,6 +447,21 @@
 out:
 	mutex_unlock(&table->mutex);
 }
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		(void) mlx4_cmd_imm(dev, vlan, &out_param,
+				    ((u32) port) << 8 | (u32) RES_VLAN,
+				    RES_OP_RESERVE_AND_MAP,
+				    MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+		return;
+	}
+	__mlx4_unregister_vlan(dev, port, vlan);
+}
 EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
 
 int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps)
@@ -320,7 +492,8 @@
 	*(__be32 *) (&inbuf[20]) = cpu_to_be32(port);
 
 	err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3,
-			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
 	if (!err)
 		*caps = *(__be32 *) (outbuf + 84);
 	mlx4_free_cmd_mailbox(dev, inmailbox);
@@ -327,13 +500,263 @@
 	mlx4_free_cmd_mailbox(dev, outmailbox);
 	return err;
 }
+static struct mlx4_roce_gid_entry zgid_entry;
 
-int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave)
 {
-	struct mlx4_cmd_mailbox *mailbox;
+	if (slave == 0)
+		return MLX4_ROCE_PF_GIDS;
+	if (slave <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % dev->num_vfs))
+		return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs) + 1;
+	return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs;
+}
+
+int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave)
+{
+	int gids;
+	int vfs;
+
+	gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	vfs = dev->num_vfs;
+
+	if (slave == 0)
+		return 0;
+	if (slave <= gids % vfs)
+		return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1);
+
+	return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1));
+}
+
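[The two helpers above split the RoCE GID table between the PF (slave 0) and the VFs: the PF keeps MLX4_ROCE_PF_GIDS entries and the remainder is divided as evenly as possible, with the first (gids % num_vfs) VFs receiving one extra entry. A standalone sketch of the arithmetic, with assumed values for the two constants -- the real ones are defined in the driver headers:

#include <stdio.h>

/* Assumed values for illustration only; the driver defines the real ones. */
#define MLX4_ROCE_MAX_GIDS 128
#define MLX4_ROCE_PF_GIDS  16

int main(void)
{
	int num_vfs = 5;
	int gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;	/* 112 */
	int slave;

	for (slave = 0; slave <= num_vfs; slave++) {
		int num, base;

		if (slave == 0) {
			num = MLX4_ROCE_PF_GIDS;
			base = 0;
		} else if (slave <= gids % num_vfs) {
			num = gids / num_vfs + 1;
			base = MLX4_ROCE_PF_GIDS +
			       (gids / num_vfs + 1) * (slave - 1);
		} else {
			num = gids / num_vfs;
			base = MLX4_ROCE_PF_GIDS + gids % num_vfs +
			       (gids / num_vfs) * (slave - 1);
		}
		/* slave 0: 16 GIDs at 0; slaves 1-2: 23 GIDs; slaves 3-5: 22 */
		printf("slave %d: %d gids starting at index %d\n",
		       slave, num, base);
	}
	return 0;
}

With 5 VFs this yields contiguous, non-overlapping ranges covering all 128 indices.]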
+static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
+				u8 op_mod, struct mlx4_cmd_mailbox *inbox)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_port_info *port_info;
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+	struct mlx4_slave_state *slave_st = &master->slave_state[slave];
+	struct mlx4_set_port_rqp_calc_context *qpn_context;
+	struct mlx4_set_port_general_context *gen_context;
+	struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1;
+	int reset_qkey_viols;
+	int port;
+	int is_eth;
+	int num_gids;
+	int base;
+	u32 in_modifier;
+	u32 promisc;
+	u16 mtu, prev_mtu;
 	int err;
+	int i, j;
+	int offset;
+	__be32 agg_cap_mask;
+	__be32 slave_cap_mask;
+	__be32 new_cap_mask;
 
-	if (dev->caps.port_type[port] != MLX4_PORT_TYPE_IB)
+	port = in_mod & 0xff;
+	in_modifier = (in_mod >> 8) & 0xff;
+	is_eth = op_mod;
+	port_info = &priv->port[port];
+
+	if (op_mod > 1)
+		return -EINVAL;
+
+	/* Slaves cannot perform SET_PORT operations except changing MTU */
+	if (is_eth) {
+		if (slave != dev->caps.function &&
+		    in_modifier != MLX4_SET_PORT_GENERAL &&
+		    in_modifier != MLX4_SET_PORT_GID_TABLE) {
+			mlx4_warn(dev, "denying SET_PORT for slave:%d,"
+				  "port %d, config_select 0x%x\n",
+				  slave, port, in_modifier);
+			return -EINVAL;
+		}
+		switch (in_modifier) {
+		case MLX4_SET_PORT_RQP_CALC:
+			qpn_context = inbox->buf;
+			qpn_context->base_qpn =
+				cpu_to_be32(port_info->base_qpn);
+			qpn_context->n_mac = 0x7;
+			promisc = be32_to_cpu(qpn_context->promisc) >>
+				SET_PORT_PROMISC_SHIFT;
+			qpn_context->promisc = cpu_to_be32(
+				promisc << SET_PORT_PROMISC_SHIFT |
+				port_info->base_qpn);
+			promisc = be32_to_cpu(qpn_context->mcast) >>
+				SET_PORT_MC_PROMISC_SHIFT;
+			qpn_context->mcast = cpu_to_be32(
+				promisc << SET_PORT_MC_PROMISC_SHIFT |
+				port_info->base_qpn);
+			break;
+		case MLX4_SET_PORT_GENERAL:
+			gen_context = inbox->buf;
+			/* MTU is configured as the max MTU among all
+			 * the functions on the port. */
+			mtu = be16_to_cpu(gen_context->mtu);
+			mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] +
+				    ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN);
+			prev_mtu = slave_st->mtu[port];
+			slave_st->mtu[port] = mtu;
+			if (mtu > master->max_mtu[port])
+				master->max_mtu[port] = mtu;
+			if (mtu < prev_mtu && prev_mtu ==
+						master->max_mtu[port]) {
+				slave_st->mtu[port] = mtu;
+				master->max_mtu[port] = mtu;
+				for (i = 0; i < dev->num_slaves; i++) {
+					master->max_mtu[port] =
+					max(master->max_mtu[port],
+					    master->slave_state[i].mtu[port]);
+				}
+			}
+
+			gen_context->mtu = cpu_to_be16(master->max_mtu[port]);
+			break;
+		case MLX4_SET_PORT_GID_TABLE:
+			/* The mailbox holds MULTIPLE entries, one per guest
+			 * GID, so loop over all of the guest's GIDs.
+			 * 1. Check that the GIDs passed by the slave contain
+			 *    no duplicates.
+			 */
+			num_gids = mlx4_get_slave_num_gids(dev, slave);
+			base = mlx4_get_base_gid_ix(dev, slave);
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf);
+			for (i = 0; i < num_gids; gid_entry_mbox++, i++) {
+				if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+					    sizeof(zgid_entry)))
+					continue;
+				gid_entry_mb1 = gid_entry_mbox + 1;
+				for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) {
+					if (!memcmp(gid_entry_mb1->raw,
+						    zgid_entry.raw, sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw,
+						    sizeof(gid_entry_mbox->raw))) {
+						/* found duplicate */
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* 2. Check that the GIDs do not duplicate any OTHER
+			 *    entries in the port GID table.
+			 */
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+				if (i >= base && i < base + num_gids)
+					continue; /* don't compare to slave's current gids */
+				gid_entry_tbl = &priv->roce_gids[port - 1][i];
+				if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry)))
+					continue;
+				gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf);
+				for (j = 0; j < num_gids; gid_entry_mbox++, j++) {
+					if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+						    sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw,
+						    sizeof(gid_entry_tbl->raw))) {
+						/* found duplicate */
+						mlx4_warn(dev, "requested gid entry for slave:%d "
+							  "is a duplicate of gid at index %d\n",
+							  slave, i);
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* insert slave GIDs with memcpy, starting at slave's base index */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf);
+			for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++)
+				memcpy(priv->roce_gids[port - 1][offset].raw, gid_entry_mbox->raw, 16);
+
+			/* Now, copy roce port gids table to current mailbox for passing to FW */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf);
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++)
+				memcpy(gid_entry_mbox->raw, priv->roce_gids[port - 1][i].raw, 16);
+
+			break;
+		}
+		return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod,
+				MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_NATIVE);
+	}
+
+	/* For IB, we only consider:
+	 * - The capability mask, which is set to the aggregate of all
+	 *   slave function capabilities
+	 * - The QKey violation counter - reset according to each request.
+	 */
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40;
+		new_cap_mask = ((__be32 *) inbox->buf)[2];
+	} else {
+		reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1;
+		new_cap_mask = ((__be32 *) inbox->buf)[1];
+	}
+
+	/* slave may not set the IS_SM capability for the port */
+	if (slave != mlx4_master_func_num(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM))
+		return -EINVAL;
+
+	/* No DEV_MGMT in multifunc mode */
+	if (mlx4_is_mfunc(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP))
+		return -EINVAL;
+
+	agg_cap_mask = 0;
+	slave_cap_mask =
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+	priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask;
+	for (i = 0; i < dev->num_slaves; i++)
+		agg_cap_mask |=
+			priv->mfunc.master.slave_state[i].ib_cap_mask[port];
+
+	/* only clear mailbox for guests.  Master may be setting
+	 * MTU or PKEY table size.
+	 */
+	if (slave != dev->caps.function)
+		memset(inbox->buf, 0, 256);
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		*(u8 *) inbox->buf	   |= !!reset_qkey_viols << 6;
+		((__be32 *) inbox->buf)[2] = agg_cap_mask;
+	} else {
+		((u8 *) inbox->buf)[3]     |= !!reset_qkey_viols;
+		((__be32 *) inbox->buf)[1] = agg_cap_mask;
+	}
+
+	err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err)
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port] =
+			slave_cap_mask;
+	return err;
+}
+
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	return mlx4_common_set_port(dev, slave, vhcr->in_modifier,
+				    vhcr->op_modifier, inbox);
+}
+
+/* bit locations for set port command with zero op modifier */
+enum {
+	MLX4_SET_PORT_VL_CAP	 = 4, /* bits 7:4 */
+	MLX4_SET_PORT_MTU_CAP	 = 12, /* bits 15:12 */
+	MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20,
+	MLX4_CHANGE_PORT_VL_CAP	 = 21,
+	MLX4_CHANGE_PORT_MTU_CAP = 22,
+};
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = -EINVAL, vl_cap, pkey_tbl_flag = 0;
+	u32 in_mod;
+
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_NONE)
 		return 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -342,13 +765,458 @@
 
 	memset(mailbox->buf, 0, 256);
 
-	if (mlx4_ib_set_4k_mtu)
-		((__be32 *) mailbox->buf)[0] |= cpu_to_be32((1 << 22) | (1 << 21) | (5 << 12) | (2 << 4));
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) {
+		in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+		err = mlx4_cmd(dev, mailbox->dma, in_mod, 1,
+			       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+			       MLX4_CMD_WRAPPED);
+	} else {
+		((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
 
-	((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
-	err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
-		       MLX4_CMD_TIME_CLASS_B);
+		if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) {
+			pkey_tbl_flag = 1;
+			((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz);
+		}
 
+		/* IB VL CAP enum isn't used by the firmware, just numerical values */
+		for (vl_cap = dev->caps.vl_cap[port];
+				vl_cap >= 1; vl_cap >>= 1) {
+			((__be32 *) mailbox->buf)[0] = cpu_to_be32(
+				(1 << MLX4_CHANGE_PORT_MTU_CAP) |
+				(1 << MLX4_CHANGE_PORT_VL_CAP)  |
+				(pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) |
+				(dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) |
+				(vl_cap << MLX4_SET_PORT_VL_CAP));
+			err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+					MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+			if (err != -ENOMEM)
+				break;
+		}
+	}
+
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
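[For the IB branch of mlx4_SET_PORT above, the first mailbox word packs the change flags together with the MTU and VL capability values, and the loop halves vl_cap until the firmware stops returning -ENOMEM. A minimal sketch of the word that loop builds, reusing the bit positions from the enum above:

#include <stdio.h>
#include <stdint.h>

/* Bit locations copied from the enum in the hunk above */
#define MLX4_SET_PORT_VL_CAP	 4
#define MLX4_SET_PORT_MTU_CAP	 12
#define MLX4_CHANGE_PORT_VL_CAP	 21
#define MLX4_CHANGE_PORT_MTU_CAP 22

int main(void)
{
	uint32_t mtu = 5;	/* IB MTU code for 4096 bytes */
	uint32_t vl_cap;

	/* The driver retries with vl_cap 8, 4, 2, 1 until the FW accepts */
	for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) {
		uint32_t word = (1u << MLX4_CHANGE_PORT_MTU_CAP) |
				(1u << MLX4_CHANGE_PORT_VL_CAP) |
				(mtu << MLX4_SET_PORT_MTU_CAP) |
				(vl_cap << MLX4_SET_PORT_VL_CAP);
		printf("vl_cap %u -> mailbox word 0x%08x\n", vl_cap, word);
	}
	return 0;
}

For vl_cap 8 and 4K MTU this produces 0x00605080, i.e. both change bits set plus the two capability fields.]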
+
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	context->flags = SET_PORT_GEN_ALL_VALID;
+	context->mtu = cpu_to_be16(mtu);
+	context->pptx = (pptx * (!pfctx)) << 7;
+	context->pfctx = pfctx;
+	context->pprx = (pprx * (!pfcrx)) << 7;
+	context->pfcrx = pfcrx;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B,  MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_general);
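[Note the pptx/pprx packing above: global pause on a direction is forced off whenever per-priority flow control (PFC) is enabled for the same direction, since the two are mutually exclusive. A small sketch of that truth table:

#include <stdio.h>

/* Mirrors the packing in mlx4_SET_PORT_general: global pause is
 * forced off whenever PFC is enabled on the same direction. */
static unsigned char pack_pause(unsigned char pause, unsigned char pfc)
{
	return (pause * (!pfc)) << 7;	/* bit 7: pause enable */
}

int main(void)
{
	printf("pause=1 pfc=0 -> 0x%02x (pause on)\n",  pack_pause(1, 0));
	printf("pause=1 pfc=1 -> 0x%02x (pause off)\n", pack_pause(1, 1));
	printf("pause=0 pfc=1 -> 0x%02x\n",             pack_pause(0, 1));
	return 0;
}]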
+
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_rqp_calc_context *context;
+	int err;
+	u32 in_mod;
+	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
+		MCAST_DIRECT : MCAST_DEFAULT;
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	context->base_qpn = cpu_to_be32(base_qpn);
+	context->n_mac = dev->caps.log_num_macs;
+	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT |
+				       base_qpn);
+	context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT |
+				     base_qpn);
+	context->intra_no_vlan = 0;
+	context->no_vlan = MLX4_NO_VLAN_IDX;
+	context->intra_vlan_miss = 0;
+	context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B,  MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc);
+
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_prio2tc_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	for (i = 0; i < MLX4_NUM_UP; i += 2)
+		context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1];
+
+	in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC);
+
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+		u8 *pg, u16 *ratelimit)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_scheduler_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	for (i = 0; i < MLX4_NUM_TC; i++) {
+		struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i];
+		u16 r;
+		if (ratelimit && ratelimit[i]) {
+			if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) {
+				r = ratelimit[i];
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_100M_UNITS);
+			} else {
+				r = ratelimit[i]/10;
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_1G_UNITS);
+			}
+			tc->max_bw_value = htons(r);
+		} else {
+			tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT);
+			tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS);
+		}
+
+		tc->pg = htons(pg[i]);
+		tc->bw_precentage = htons(tc_tx_bw[i]);
+	}
+
+	in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
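[The rate-limit encoding above switches to the coarser 1G unit once the requested value no longer fits the 100M-unit field. A sketch with assumed unit constants (the driver's MLX4_RATELIMIT_* values may differ):

#include <stdio.h>

/* Assumed for illustration; the driver defines the real encodings. */
#define MAX_100M_UNITS_VAL 255
#define UNITS_100M 3
#define UNITS_1G   4

int main(void)
{
	unsigned short req[] = { 200, 3000, 0 };	/* in 100 Mb/s units */
	int i;

	for (i = 0; i < 3; i++) {
		unsigned short val, units;

		if (req[i] && req[i] <= MAX_100M_UNITS_VAL) {
			val = req[i];			/* fits as-is */
			units = UNITS_100M;
		} else if (req[i]) {
			val = req[i] / 10;		/* 1 Gb/s units */
			units = UNITS_1G;
		} else {
			val = 0;	/* driver falls back to its default */
			units = UNITS_1G;
		}
		printf("request %u -> value %u, units %u\n",
		       req[i], val, units);
	}
	return 0;
}

So 200 (20 Gb/s) is sent verbatim in 100M units, while 3000 becomes 300 in 1G units.]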
+
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+			u64 mac, u64 clear, u8 mode)
+{
+	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
+			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR);
+
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	return 0;
+}
+
+void mlx4_set_stats_bitmap(struct mlx4_dev *dev, unsigned long *stats_bitmap)
+{
+	int last_i = 0;
+
+	bitmap_zero(stats_bitmap, NUM_ALL_STATS);
+
+	if (mlx4_is_slave(dev)) {
+		last_i = dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN ?
+			NUM_PKT_STATS + NUM_FLOW_STATS : NUM_PKT_STATS;
+	} else {
+		bitmap_set(stats_bitmap, last_i, NUM_PKT_STATS);
+		last_i = NUM_PKT_STATS;
+
+		if (dev->caps.flags2 &
+		    MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
+			bitmap_set(stats_bitmap, last_i, NUM_FLOW_STATS);
+			last_i += NUM_FLOW_STATS;
+		}
+	}
+
+	if (mlx4_is_slave(dev))
+		bitmap_set(stats_bitmap, last_i, NUM_VF_STATS);
+	last_i += NUM_VF_STATS;
+
+	if (mlx4_is_master(dev))
+		bitmap_set(stats_bitmap, last_i, NUM_VPORT_STATS);
+	last_i += NUM_VPORT_STATS;
+
+	bitmap_set(stats_bitmap, last_i, NUM_PORT_STATS);
+}
+EXPORT_SYMBOL(mlx4_set_stats_bitmap);
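[The bitmap above is laid out as consecutive regions - packet, flow, VF, vport, then port counters - and the offset advances even across regions that are not set for the current function. A sketch of the offset bookkeeping with made-up region sizes (the driver's NUM_* constants differ):

#include <stdio.h>

/* Assumed region sizes for illustration only. */
#define NUM_PKT_STATS   10
#define NUM_FLOW_STATS   4
#define NUM_VF_STATS     3
#define NUM_VPORT_STATS  3

int main(void)
{
	int flowstats_en = 1, is_master = 1;
	int last = 0;

	printf("pkt stats:   bits %d..%d\n", last, last + NUM_PKT_STATS - 1);
	last += NUM_PKT_STATS;
	if (flowstats_en) {
		printf("flow stats:  bits %d..%d\n",
		       last, last + NUM_FLOW_STATS - 1);
		last += NUM_FLOW_STATS;
	}
	last += NUM_VF_STATS;	/* set only on slaves; offset still advances */
	if (is_master)
		printf("vport stats: bits %d..%d\n",
		       last, last + NUM_VPORT_STATS - 1);
	last += NUM_VPORT_STATS;
	printf("port stats start at bit %d\n", last);
	return 0;
}]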
+
+int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, found_ix = -1;
+	int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EINVAL;
+
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+		if (!memcmp(priv->roce_gids[port - 1][i].raw, gid, 16)) {
+			found_ix = i;
+			break;
+		}
+	}
+
+	if (found_ix >= 0) {
+		if (found_ix < MLX4_ROCE_PF_GIDS)
+			*slave_id = 0;
+		else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % dev->num_vfs) *
+			 (vf_gids / dev->num_vfs + 1))
+			*slave_id = ((found_ix - MLX4_ROCE_PF_GIDS) /
+				     (vf_gids / dev->num_vfs + 1)) + 1;
+		else
+			*slave_id =
+			((found_ix - MLX4_ROCE_PF_GIDS -
+			  ((vf_gids % dev->num_vfs) * ((vf_gids / dev->num_vfs + 1)))) /
+			 (vf_gids / dev->num_vfs)) + vf_gids % dev->num_vfs + 1;
+	}
+
+	return (found_ix >= 0) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid);
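[mlx4_get_slave_from_roce_gid inverts the partitioning shown earlier: indices below MLX4_ROCE_PF_GIDS belong to the PF, the next (vf_gids % num_vfs) blocks of (per_vf + 1) indices belong to the VFs that received an extra GID, and the rest divide evenly. A round-trip sketch using the same assumed constants as the earlier example:

#include <stdio.h>

#define MAX_GIDS 128	/* assumed, as in the earlier sketch */
#define PF_GIDS  16

static int slave_of_gid(int ix, int num_vfs)
{
	int vf_gids = MAX_GIDS - PF_GIDS;
	int big = vf_gids / num_vfs + 1;	/* VFs with one extra GID */

	if (ix < PF_GIDS)
		return 0;
	if (ix < PF_GIDS + (vf_gids % num_vfs) * big)
		return (ix - PF_GIDS) / big + 1;
	return (ix - PF_GIDS - (vf_gids % num_vfs) * big) /
	       (vf_gids / num_vfs) + vf_gids % num_vfs + 1;
}

int main(void)
{
	/* With 5 VFs: index 15 -> PF, 16 -> VF 1, 39 -> VF 2, 62 -> VF 3 */
	printf("%d %d %d %d\n", slave_of_gid(15, 5), slave_of_gid(16, 5),
	       slave_of_gid(39, 5), slave_of_gid(62, 5));
	return 0;
}]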
+
+int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!mlx4_is_master(dev))
+		return -EINVAL;
+
+	memcpy(gid, priv->roce_gids[port - 1][slave_id].raw, 16);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave);
+
+/* Cable Module Info */
+#define MODULE_INFO_MAX_READ 48
+
+#define I2C_ADDR_LOW  0x50
+#define I2C_ADDR_HIGH 0x51
+#define I2C_PAGE_SIZE 256
+
+/* Module Info Data */
+struct mlx4_cable_info {
+	u8	i2c_addr;
+	u8	page_num;
+	__be16	dev_mem_address;
+	__be16	reserved1;
+	__be16	size;
+	__be32	reserved2[2];
+	u8	data[MODULE_INFO_MAX_READ];
+};
+
+enum cable_info_err {
+	CABLE_INF_INV_PORT	= 0x1,
+	CABLE_INF_OP_NOSUP	= 0x2,
+	CABLE_INF_NOT_CONN	= 0x3,
+	CABLE_INF_NO_EEPRM	= 0x4,
+	CABLE_INF_PAGE_ERR	= 0x5,
+	CABLE_INF_INV_ADDR	= 0x6,
+	CABLE_INF_I2C_ADDR	= 0x7,
+	CABLE_INF_QSFP_VIO	= 0x8,
+	CABLE_INF_I2C_BUSY	= 0x9,
+};
+
+#define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF)
+
+#ifdef DEBUG
+static inline const char *cable_info_mad_err_str(u16 mad_status)
+{
+	u8 err = MAD_STATUS_2_CABLE_ERR(mad_status);
+
+	switch (err) {
+	case CABLE_INF_INV_PORT:
+		return "invalid port selected";
+	case CABLE_INF_OP_NOSUP:
+		return "operation not supported for this port (the port is of type CX4 or internal)";
+	case CABLE_INF_NOT_CONN:
+		return "cable is not connected";
+	case CABLE_INF_NO_EEPRM:
+		return "the connected cable has no EPROM (passive copper cable)";
+	case CABLE_INF_PAGE_ERR:
+		return "page number is greater than 15";
+	case CABLE_INF_INV_ADDR:
+		return "invalid device_address or size (that is, size equals 0 or address+size is greater than 256)";
+	case CABLE_INF_I2C_ADDR:
+		return "invalid I2C slave address";
+	case CABLE_INF_QSFP_VIO:
+		return "at least one cable violates the QSFP specification and ignores the modsel signal";
+	case CABLE_INF_I2C_BUSY:
+		return "I2C bus is constantly busy";
+	}
+	return "Unknown Error";
+}
+#endif /* DEBUG */
+
+/**
+ * mlx4_get_module_info - Read cable module eeprom data
+ * @dev: mlx4_dev.
+ * @port: port number.
+ * @offset: byte offset in eeprom to start reading data from.
+ * @size: num of bytes to read.
+ * @data: output buffer to put the requested data into.
+ *
+ * Reads cable module EEPROM data and stores it in the buffer
+ * pointed to by the data parameter.
+ * Returns the number of bytes read on success, or a negative
+ * error code.
+ */
+int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset,
+    u16 size, u8 *data)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_mad_ifc *inmad, *outmad;
+	struct mlx4_cable_info *cable_info;
+	u16 i2c_addr;
+	int ret;
+
+	if (size > MODULE_INFO_MAX_READ)
+		size = MODULE_INFO_MAX_READ;
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox)) {
+		mlx4_err(dev,
+			 "mlx4_alloc_cmd_mailbox returned with error(%lx)", PTR_ERR(inbox));
+		return PTR_ERR(inbox);
+	}
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		mlx4_err(dev,
+			 "mlx4_alloc_cmd_mailbox returned with error(%lx)", PTR_ERR(outbox));
+		return PTR_ERR(outbox);
+	}
+
+	inmad = (struct mlx4_mad_ifc *)(inbox->buf);
+	outmad = (struct mlx4_mad_ifc *)(outbox->buf);
+
+	inmad->method = 0x1; /* Get */
+	inmad->class_version = 0x1;
+	inmad->mgmt_class = 0x1;
+	inmad->base_version = 0x1;
+	inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */
+
+	if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE)
+		/* Cross-page reads are not allowed;
+		 * read only up to offset 256 in the low page.
+		 */
+		size -= offset + size - I2C_PAGE_SIZE;
+
+	i2c_addr = I2C_ADDR_LOW;
+	if (offset >= I2C_PAGE_SIZE) {
+		/* Reset offset to high page */
+		i2c_addr = I2C_ADDR_HIGH;
+		offset -= I2C_PAGE_SIZE;
+	}
+
+	cable_info = (struct mlx4_cable_info *)inmad->data;
+	cable_info->dev_mem_address = cpu_to_be16(offset);
+	cable_info->page_num = 0;
+	cable_info->i2c_addr = i2c_addr;
+	cable_info->size = cpu_to_be16(size);
+
+	ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+	    MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	if (be16_to_cpu(outmad->status)) {
+		/* Mad returned with bad status */
+		ret = be16_to_cpu(outmad->status);
+#ifdef DEBUG
+		mlx4_warn(dev, "MLX4_CMD_MAD_IFC Get Module info attr(%x) "
+		    "port(%d) i2c_addr(%x) offset(%d) size(%d): Response "
+		    "Mad Status(%x) - %s\n", 0xFF60, port, i2c_addr, offset,
+		    size, ret, cable_info_mad_err_str(ret));
+#endif
+		if (i2c_addr == I2C_ADDR_HIGH &&
+		    MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR)
+			/* Some SFP cables do not support i2c slave
+			 * address 0x51 (high page), abort silently.
+			 */
+			ret = 0;
+		else
+			ret = -ret;
+		goto out;
+	}
+	cable_info = (struct mlx4_cable_info *)outmad->data;
+	memcpy(data, cable_info->data, size);
+	ret = size;
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+EXPORT_SYMBOL(mlx4_get_module_info);
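[Since mlx4_get_module_info caps each read at MODULE_INFO_MAX_READ (48) bytes and clamps reads that would cross the 256-byte i2c page boundary, a caller has to loop on the returned byte count. A hypothetical caller sketch (read_eeprom is not part of this commit; 'dev' is assumed to be a valid mlx4_dev from elsewhere in the driver):

/* Read 'len' bytes of module EEPROM, chunk by chunk. */
static int read_eeprom(struct mlx4_dev *dev, u8 port, u8 *buf, u16 len)
{
	u16 off = 0;

	while (off < len) {
		int n = mlx4_get_module_info(dev, port, off, len - off,
					     buf + off);
		if (n < 0)
			return n;	/* error code from the MAD exchange */
		if (n == 0)
			break;		/* silently unsupported high page */
		off += n;
	}
	return off;
}]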

Modified: trunk/sys/ofed/drivers/net/mlx4/profile.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/profile.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/profile.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -32,7 +32,7 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
+#include <linux/slab.h>
 
 #include "mlx4.h"
 #include "fw.h"
@@ -76,7 +76,7 @@
 		u64 size;
 		u64 start;
 		int type;
-		int num;
+		u64 num;
 		int log_num;
 	};
 
@@ -85,7 +85,7 @@
 	struct mlx4_resource tmp;
 	int i, j;
 
-	profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL);
+	profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL);
 	if (!profile)
 		return -ENOMEM;
 
@@ -98,8 +98,8 @@
 	profile[MLX4_RES_EQ].size     = dev_cap->eqc_entry_sz;
 	profile[MLX4_RES_DMPT].size   = dev_cap->dmpt_entry_sz;
 	profile[MLX4_RES_CMPT].size   = dev_cap->cmpt_entry_sz;
-	profile[MLX4_RES_MTT].size    = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
-	profile[MLX4_RES_MCG].size    = MLX4_MGM_ENTRY_SIZE;
+	profile[MLX4_RES_MTT].size    = dev_cap->mtt_entry_sz;
+	profile[MLX4_RES_MCG].size    = mlx4_get_mgm_entry_size(dev);
 
 	profile[MLX4_RES_QP].num      = request->num_qp;
 	profile[MLX4_RES_RDMARC].num  = request->num_qp * request->rdmarc_per_qp;
@@ -107,12 +107,13 @@
 	profile[MLX4_RES_AUXC].num    = request->num_qp;
 	profile[MLX4_RES_SRQ].num     = request->num_srq;
 	profile[MLX4_RES_CQ].num      = request->num_cq;
-	profile[MLX4_RES_EQ].num      = min_t(unsigned, dev_cap->max_eqs,
-					      dev_cap->reserved_eqs +
-					      num_possible_cpus() + 1);
+	profile[MLX4_RES_EQ].num      = mlx4_is_mfunc(dev) ?
+					dev->phys_caps.num_phys_eqs :
+					min_t(unsigned, dev_cap->max_eqs, MAX_MSIX);
 	profile[MLX4_RES_DMPT].num    = request->num_mpt;
 	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
-	profile[MLX4_RES_MTT].num     = request->num_mtt;
+	profile[MLX4_RES_MTT].num     = ((u64)request->num_mtt_segs) *
+					(1 << log_mtts_per_seg);
 	profile[MLX4_RES_MCG].num     = request->num_mcg;
 
 	for (i = 0; i < MLX4_RES_NUM; ++i) {
@@ -198,9 +199,10 @@
 			init_hca->log_num_cqs = profile[i].log_num;
 			break;
 		case MLX4_RES_EQ:
-			dev->caps.num_eqs     = profile[i].num;
+			dev->caps.num_eqs     = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs,
+									 MAX_MSIX));
 			init_hca->eqc_base    = profile[i].start;
-			init_hca->log_num_eqs = profile[i].log_num;
+			init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
 			break;
 		case MLX4_RES_DMPT:
 			dev->caps.num_mpts	= profile[i].num;
@@ -212,17 +214,24 @@
 			init_hca->cmpt_base	 = profile[i].start;
 			break;
 		case MLX4_RES_MTT:
-			dev->caps.num_mtt_segs	 = profile[i].num;
+			dev->caps.num_mtts	 = profile[i].num;
 			priv->mr_table.mtt_base	 = profile[i].start;
 			init_hca->mtt_base	 = profile[i].start;
 			break;
 		case MLX4_RES_MCG:
-			dev->caps.num_mgms	  = profile[i].num >> 1;
-			dev->caps.num_amgms	  = profile[i].num >> 1;
 			init_hca->mc_base	  = profile[i].start;
-			init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE);
+			init_hca->log_mc_entry_sz =
+					ilog2(mlx4_get_mgm_entry_size(dev));
 			init_hca->log_mc_table_sz = profile[i].log_num;
-			init_hca->log_mc_hash_sz  = profile[i].log_num - 1;
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				dev->caps.num_mgms = profile[i].num;
+			} else {
+				init_hca->log_mc_hash_sz =
+						profile[i].log_num - 1;
+				dev->caps.num_mgms = profile[i].num >> 1;
+				dev->caps.num_amgms = profile[i].num >> 1;
+			}
 			break;
 		default:
 			break;
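[The profile stores every resource count as a power-of-two log; the EQ entry above now derives it with roundup_pow_of_two()/ilog2() instead of trusting profile[i].log_num. A userspace sketch of that rounding (MAX_MSIX value assumed):

#include <stdio.h>

#define MAX_MSIX 128	/* assumed for illustration */

static unsigned roundup_pow2(unsigned v)
{
	unsigned r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

static unsigned ilog2_u(unsigned v)
{
	unsigned l = 0;

	while (v >>= 1)
		l++;
	return l;
}

int main(void)
{
	unsigned max_eqs = 100;
	unsigned num = roundup_pow2(max_eqs < MAX_MSIX ? max_eqs : MAX_MSIX);

	/* prints: num_eqs 128, log_num_eqs 7 */
	printf("num_eqs %u, log_num_eqs %u\n", num, ilog2_u(num));
	return 0;
}]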

Modified: trunk/sys/ofed/drivers/net/mlx4/qp.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/qp.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/qp.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -33,7 +33,9 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
 
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/qp.h>
@@ -41,6 +43,12 @@
 #include "mlx4.h"
 #include "icm.h"
 
+/*
+ * QP to support BF should have bits 6,7 cleared
+ */
+#define MLX4_BF_QP_SKIP_MASK	0xc0
+#define MLX4_MAX_BF_QP_RANGE	0x40
+
 void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -55,7 +63,7 @@
 	spin_unlock(&qp_table->lock);
 
 	if (!qp) {
-		mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn);
+		mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn);
 		return;
 	}
 
@@ -65,11 +73,26 @@
 		complete(&qp->free);
 }
 
-int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
-		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
-		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
-		   int sqd_event, struct mlx4_qp *qp)
+/* used for INIT/CLOSE port logic */
+static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0)
 {
+	/* this procedure is called after we already know we are on the master */
+	/* qp0 is either the proxy qp0, or the real qp0 */
+	u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev);
+	*proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1;
+
+	*real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn &&
+		qp->qpn <= dev->phys_caps.base_sqpn + 1;
+
+	return *real_qp0 || *proxy_qp0;
+}
+
+static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		     struct mlx4_qp_context *context,
+		     enum mlx4_qp_optpar optpar,
+		     int sqd_event, struct mlx4_qp *qp, int native)
+{
 	static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
 		[MLX4_QP_STATE_RST] = {
 			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
@@ -110,16 +133,31 @@
 		}
 	};
 
+	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cmd_mailbox *mailbox;
 	int ret = 0;
+	int real_qp0 = 0;
+	int proxy_qp0 = 0;
+	u8 port;
 
 	if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE ||
 	    !op[cur_state][new_state])
 		return -EINVAL;
 
-	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP)
-		return mlx4_cmd(dev, 0, qp->qpn, 2,
-				MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A);
+	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) {
+		ret = mlx4_cmd(dev, 0, qp->qpn, 2,
+			MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native);
+		if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+			port = (qp->qpn & 1) + 1;
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		}
+		return ret;
+	}
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -138,116 +176,236 @@
 	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
 		cpu_to_be32(qp->qpn);
 
-	ret = mlx4_cmd(dev, mailbox->dma, qp->qpn | (!!sqd_event << 31),
+	ret = mlx4_cmd(dev, mailbox->dma,
+		       qp->qpn | (!!sqd_event << 31),
 		       new_state == MLX4_QP_STATE_RST ? 2 : 0,
-		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C);
+		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native);
 
+	if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+		port = (qp->qpn & 1) + 1;
+		if (cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    new_state == MLX4_QP_STATE_ERR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		} else if (new_state == MLX4_QP_STATE_RTR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 1;
+		}
+	}
+
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return ret;
 }
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context,
+		   enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp)
+{
+	return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context,
+				optpar, sqd_event, qp, 0);
+}
 EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			    int *base, u8 flags)
 {
+	int bf_qp = !!(flags & (u8) MLX4_RESERVE_BF_QP);
+
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
-	int qpn;
 
-	qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
-	if (qpn == -1)
+	/* Only IPoIB uses a large cnt. In this case, just allocate
+	 * as usual, ignoring bf skipping, since IPoIB does not run over RoCE
+	 */
+	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
+		bf_qp = 0;
+
+	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0);
+	if (*base == -1)
 		return -ENOMEM;
 
-	*base = qpn;
 	return 0;
 }
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags)
+{
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, (((u32) flags) << 24) | (u32) cnt);
+		set_param_h(&in_param, align);
+		err = mlx4_cmd_imm(dev, in_param, &out_param,
+				   RES_QP, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*base = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_qp_reserve_range(dev, cnt, align, base, flags);
+}
 EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
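[MLX4_BF_QP_SKIP_MASK (0xc0) makes the bitmap allocator skip any QP number with bit 6 or 7 set, so BlueFlame-capable ranges can only come from the first 64 numbers of each 256-QP block, which is also why BF ranges are capped at MLX4_MAX_BF_QP_RANGE (0x40). A sketch enumerating the qualifying range starts:

#include <stdio.h>

#define BF_QP_SKIP_MASK 0xc0	/* bits 6,7 must be clear for BF QPs */

int main(void)
{
	int qpn;

	/* Out of each 256-QP block only the first 64 numbers qualify,
	 * so BF-capable ranges start at 0x0, 0x100, 0x200, ... */
	for (qpn = 0; qpn < 512; qpn++)
		if (!(qpn & BF_QP_SKIP_MASK) && (qpn & 0x3f) == 0)
			printf("BF-capable range starts at qpn 0x%x\n", qpn);
	return 0;
}]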
 
-void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
-	if (base_qpn < dev->caps.sqp_start + 8)
+
+	if (mlx4_is_qp_reserved(dev, (u32) base_qpn))
 		return;
+	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR);
+}
 
-	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, base_qpn);
+		set_param_h(&in_param, cnt);
+		err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err) {
+			mlx4_warn(dev, "Failed to release qp range"
+				  " base:%d cnt:%d\n", base_qpn, cnt);
+		}
+	} else
+		 __mlx4_qp_release_range(dev, base_qpn, cnt);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
 
-int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
 	int err;
 
-	if (!qpn)
-		return -EINVAL;
-
-	qp->qpn = qpn;
-
-	err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn);
+	err = mlx4_table_get(dev, &qp_table->qp_table, qpn);
 	if (err)
 		goto err_out;
 
-	err = mlx4_table_get(dev, &qp_table->auxc_table, qp->qpn);
+	err = mlx4_table_get(dev, &qp_table->auxc_table, qpn);
 	if (err)
 		goto err_put_qp;
 
-	err = mlx4_table_get(dev, &qp_table->altc_table, qp->qpn);
+	err = mlx4_table_get(dev, &qp_table->altc_table, qpn);
 	if (err)
 		goto err_put_auxc;
 
-	err = mlx4_table_get(dev, &qp_table->rdmarc_table, qp->qpn);
+	err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn);
 	if (err)
 		goto err_put_altc;
 
-	err = mlx4_table_get(dev, &qp_table->cmpt_table, qp->qpn);
+	err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn);
 	if (err)
 		goto err_put_rdmarc;
 
-	spin_lock_irq(&qp_table->lock);
-	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp);
-	spin_unlock_irq(&qp_table->lock);
-	if (err)
-		goto err_put_cmpt;
-
-	atomic_set(&qp->refcount, 1);
-	init_completion(&qp->free);
-
 	return 0;
 
-err_put_cmpt:
-	mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
-
 err_put_rdmarc:
-	mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
 
 err_put_altc:
-	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
 
 err_put_auxc:
-	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
 
 err_put_qp:
-	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
 
 err_out:
 	return err;
 }
-EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
 
-struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn)
+static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn)
 {
-	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
-	unsigned long flags;
-	struct mlx4_qp *qp;
+	u64 param = 0;
 
-	spin_lock_irqsave(&qp_table->lock, flags);
-	qp = radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
-	spin_unlock_irqrestore(&qp_table->lock, flags);
-	return qp;
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, qpn);
+		return mlx4_cmd_imm(dev, param, &param, RES_QP, RES_OP_MAP_ICM,
+				    MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_qp_alloc_icm(dev, qpn);
 }
-EXPORT_SYMBOL_GPL(mlx4_qp_lookup_lock);
 
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	mlx4_table_put(dev, &qp_table->cmpt_table, qpn);
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
+}
+
+static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, qpn);
+		if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn);
+	} else
+		__mlx4_qp_free_icm(dev, qpn);
+}
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	if (!qpn)
+		return -EINVAL;
+
+	qp->qpn = qpn;
+
+	err = mlx4_qp_alloc_icm(dev, qpn);
+	if (err)
+		return err;
+
+	spin_lock_irq(&qp_table->lock);
+	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn &
+				(dev->caps.num_qps - 1), qp);
+	spin_unlock_irq(&qp_table->lock);
+	if (err)
+		goto err_icm;
+
+	atomic_set(&qp->refcount, 1);
+	init_completion(&qp->free);
+
+	return 0;
+
+err_icm:
+	mlx4_qp_free_icm(dev, qpn);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
 void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -261,25 +419,18 @@
 
 void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
 {
-	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
-
 	if (atomic_dec_and_test(&qp->refcount))
 		complete(&qp->free);
 	wait_for_completion(&qp->free);
 
-	mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
-	mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
-	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
-	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
-	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+	mlx4_qp_free_icm(dev, qp->qpn);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_free);
 
 static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
 {
-	return mlx4_cmd(dev, 0, base_qpn,
-			(dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) ? 4 : 0,
-			MLX4_CMD_CONF_SPECIAL_QP, MLX4_CMD_TIME_CLASS_B);
+	return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
 int mlx4_init_qp_table(struct mlx4_dev *dev)
@@ -287,18 +438,23 @@
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
 	int err;
 	int reserved_from_top = 0;
+	int reserved_from_bot;
+	int k;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
 
 	/*
 	 * We reserve 2 extra QPs per port for the special QPs.  The
 	 * block of special QPs must be aligned to a multiple of 8, so
 	 * round up.
+	 *
 	 * We also reserve the MSB of the 24-bit QP number to indicate
-	 * an XRC qp.
+	 * that a QP is an XRC QP.
 	 */
-	dev->caps.sqp_start =
+	dev->phys_caps.base_sqpn =
 		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
 
 	{
@@ -329,34 +485,82 @@
 
 	}
 
+	/* Reserve 8 real SQPs in both native and SRIOV modes.
+	 * In addition, in SRIOV mode, reserve 8 proxy SQPs per function
+	 * (for all PFs and VFs), and 8 corresponding tunnel QPs.
+	 * Each proxy SQP works opposite its own tunnel QP.
+	 *
+	 * The QPs are arranged as follows:
+	 * a. 8 real SQPs
+	 * b. All the proxy SQPs (8 per function)
+	 * c. All the tunnel QPs (8 per function)
+	 */
+	reserved_from_bot = mlx4_num_reserved_sqps(dev);
+	if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) {
+		mlx4_err(dev, "Number of reserved QPs is higher than number "
+			 "of QPs, increase the value of log_num_qp\n");
+		return -EINVAL;
+	}
+
 	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 23) - 1, dev->caps.sqp_start + 8,
+			       (1 << 23) - 1, reserved_from_bot,
 			       reserved_from_top);
 	if (err)
 		return err;
 
-	return mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start);
+	if (mlx4_is_mfunc(dev)) {
+		/* for PPF use */
+		dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8;
+		dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX;
+
+		/* In mfunc, calculate proxy and tunnel qp offsets for the PF here,
+		 * since the PF does not call mlx4_slave_caps */
+		dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+
+		if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
+		    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) {
+			err = -ENOMEM;
+			goto err_mem;
+		}
+
+		for (k = 0; k < dev->caps.num_ports; k++) {
+			dev->caps.qp0_proxy[k] = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + k;
+			dev->caps.qp0_tunnel[k] = dev->caps.qp0_proxy[k] + 8 * MLX4_MFUNC_MAX;
+			dev->caps.qp1_proxy[k] = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k;
+			dev->caps.qp1_tunnel[k] = dev->caps.qp1_proxy[k] + 8 * MLX4_MFUNC_MAX;
+		}
+	}
+
+	err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn);
+	if (err)
+		goto err_mem;
+	return 0;
+
+err_mem:
+	kfree(dev->caps.qp0_tunnel);
+	kfree(dev->caps.qp0_proxy);
+	kfree(dev->caps.qp1_tunnel);
+	kfree(dev->caps.qp1_proxy);
+	dev->caps.qp0_tunnel = dev->caps.qp0_proxy =
+		dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL;
+	return err;
 }
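[The special-QP layout built in mlx4_init_qp_table is 8 real SQPs, then 8 proxy SQPs per function, then 8 tunnel QPs per function. A sketch of the offsets it computes, with assumed values for base_sqpn and MLX4_MFUNC_MAX:

#include <stdio.h>

#define MFUNC_MAX 80	/* assumed; see the driver headers */

int main(void)
{
	int base_sqpn = 64;				/* assumed */
	int base_proxy = base_sqpn + 8;			/* 8 real SQPs first */
	int func = 0, port;				/* master function */

	for (port = 0; port < 2; port++) {
		int qp0_proxy = base_proxy + 8 * func + port;
		int qp1_proxy = base_proxy + 8 * func + 2 /* MAX_PORTS */ + port;

		/* each tunnel QP sits 8*MFUNC_MAX above its proxy */
		printf("port %d: qp0 proxy %d tunnel %d, qp1 proxy %d tunnel %d\n",
		       port + 1, qp0_proxy, qp0_proxy + 8 * MFUNC_MAX,
		       qp1_proxy, qp1_proxy + 8 * MFUNC_MAX);
	}
	return 0;
}]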
 
 void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
 {
+	if (mlx4_is_slave(dev))
+		return;
+
 	mlx4_CONF_SPECIAL_QP(dev, 0);
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
 }
 
-int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
-			int *base_qpn, int *cnt)
-{
-	if ((region < 0) || (region >= MLX4_NUM_QP_REGION))
-		return -EINVAL;
-
-	*base_qpn = dev->caps.reserved_qps_base[region];
-	*cnt = dev->caps.reserved_qps_cnt[region];
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(mlx4_qp_get_region);
-
 int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
 		  struct mlx4_qp_context *context)
 {
@@ -368,7 +572,8 @@
 		return PTR_ERR(mailbox);
 
 	err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0,
-			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A);
+			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_WRAPPED);
 	if (!err)
 		memcpy(context, mailbox->buf + 8, sizeof *context);
 

Modified: trunk/sys/ofed/drivers/net/mlx4/reset.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/reset.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/reset.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,7 +31,6 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -78,7 +77,7 @@
 		goto out;
 	}
 
-	pcie_cap = pci_find_capability(dev->pdev, PCI_CAP_ID_EXP);
+	pcie_cap = pci_pcie_cap(dev->pdev);
 
 	for (i = 0; i < 64; ++i) {
 		if (i == 22 || i == 23)
@@ -120,8 +119,8 @@
 	writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
 	iounmap(reset);
 
-	/* Docs say to wait one second before accessing device */
-	msleep(1000);
+	/* wait half a second before accessing device */
+	msleep(500);
 
 	end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
 	do {
@@ -142,8 +141,8 @@
 	/* Now restore the PCI headers */
 	if (pcie_cap) {
 		devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
-		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_DEVCTL,
-					   devctl)) {
+		if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL,
+					       devctl)) {
 			err = -ENODEV;
 			mlx4_err(dev, "Couldn't restore HCA PCI Express "
 				 "Device Control register, aborting.\n");
@@ -150,8 +149,8 @@
 			goto out;
 		}
 		linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
-		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_LNKCTL,
-					   linkctl)) {
+		if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL,
+					       linkctl)) {
 			err = -ENODEV;
 			mlx4_err(dev, "Couldn't restore HCA PCI Express "
 				 "Link control register, aborting.\n");

Added: trunk/sys/ofed/drivers/net/mlx4/resource_tracker.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/resource_tracker.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/net/mlx4/resource_tracker.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,4686 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+
+struct mac_res {
+	struct list_head list;
+	u64 mac;
+	int ref_count;
+	u8 smac_index;
+	u8 port;
+};
+
+struct vlan_res {
+	struct list_head list;
+	u16 vlan;
+	int ref_count;
+	int vlan_index;
+	u8 port;
+};
+
+struct res_common {
+	struct list_head	list;
+	struct rb_node		node;
+	u64		        res_id;
+	int			owner;
+	int			state;
+	int			from_state;
+	int			to_state;
+	int			removing;
+};
+
+enum {
+	RES_ANY_BUSY = 1
+};
+
+struct res_gid {
+	struct list_head	list;
+	u8			gid[16];
+	enum mlx4_protocol	prot;
+	enum mlx4_steer_type	steer;
+	u64			reg_id;
+};
+
+enum res_qp_states {
+	RES_QP_BUSY = RES_ANY_BUSY,
+
+	/* QP number was allocated */
+	RES_QP_RESERVED,
+
+	/* ICM memory for QP context was mapped */
+	RES_QP_MAPPED,
+
+	/* QP is in hw ownership */
+	RES_QP_HW
+};
+
+struct res_qp {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *rcq;
+	struct res_cq	       *scq;
+	struct res_srq	       *srq;
+	struct list_head	mcg_list;
+	spinlock_t		mcg_spl;
+	int			local_qpn;
+	atomic_t		ref_count;
+	u32			qpc_flags;
+	/* saved qp params before VST enforcement in order to restore on VGT */
+	u8			sched_queue;
+	__be32			param3;
+	u8			vlan_control;
+	u8			fvl_rx;
+	u8			pri_path_fl;
+	u8			vlan_index;
+	u8			feup;
+};
+
+enum res_mtt_states {
+	RES_MTT_BUSY = RES_ANY_BUSY,
+	RES_MTT_ALLOCATED,
+};
+
+static inline const char *mtt_states_str(enum res_mtt_states state)
+{
+	switch (state) {
+	case RES_MTT_BUSY: return "RES_MTT_BUSY";
+	case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED";
+	default: return "Unknown";
+	}
+}
+
+struct res_mtt {
+	struct res_common	com;
+	int			order;
+	atomic_t		ref_count;
+};
+
+enum res_mpt_states {
+	RES_MPT_BUSY = RES_ANY_BUSY,
+	RES_MPT_RESERVED,
+	RES_MPT_MAPPED,
+	RES_MPT_HW,
+};
+
+struct res_mpt {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	int			key;
+};
+
+enum res_eq_states {
+	RES_EQ_BUSY = RES_ANY_BUSY,
+	RES_EQ_RESERVED,
+	RES_EQ_HW,
+};
+
+struct res_eq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+};
+
+enum res_cq_states {
+	RES_CQ_BUSY = RES_ANY_BUSY,
+	RES_CQ_ALLOCATED,
+	RES_CQ_HW,
+};
+
+struct res_cq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	atomic_t		ref_count;
+};
+
+enum res_srq_states {
+	RES_SRQ_BUSY = RES_ANY_BUSY,
+	RES_SRQ_ALLOCATED,
+	RES_SRQ_HW,
+};
+
+struct res_srq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *cq;
+	atomic_t		ref_count;
+};
+
+enum res_counter_states {
+	RES_COUNTER_BUSY = RES_ANY_BUSY,
+	RES_COUNTER_ALLOCATED,
+};
+
+struct res_counter {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_xrcdn_states {
+	RES_XRCD_BUSY = RES_ANY_BUSY,
+	RES_XRCD_ALLOCATED,
+};
+
+struct res_xrcdn {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_fs_rule_states {
+	RES_FS_RULE_BUSY = RES_ANY_BUSY,
+	RES_FS_RULE_ALLOCATED,
+};
+
+struct res_fs_rule {
+	struct res_common	com;
+	int			qpn;
+};
+
+static int mlx4_is_eth(struct mlx4_dev *dev, int port)
+{
+	return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1;
+}
+
+static void *res_tracker_lookup(struct rb_root *root, u64 res_id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct res_common *res = container_of(node, struct res_common,
+						      node);
+
+		if (res_id < res->res_id)
+			node = node->rb_left;
+		else if (res_id > res->res_id)
+			node = node->rb_right;
+		else
+			return res;
+	}
+	return NULL;
+}
+
+static int res_tracker_insert(struct rb_root *root, struct res_common *res)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct res_common *this = container_of(*new, struct res_common,
+						       node);
+
+		parent = *new;
+		if (res->res_id < this->res_id)
+			new = &((*new)->rb_left);
+		else if (res->res_id > this->res_id)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&res->node, parent, new);
+	rb_insert_color(&res->node, root);
+
+	return 0;
+}
+
+enum qp_transition {
+	QP_TRANS_INIT2RTR,
+	QP_TRANS_RTR2RTS,
+	QP_TRANS_RTS2RTS,
+	QP_TRANS_SQERR2RTS,
+	QP_TRANS_SQD2SQD,
+	QP_TRANS_SQD2RTS
+};
+
+/* For debug use */
+static const char *ResourceType(enum mlx4_resource rt)
+{
+	switch (rt) {
+	case RES_QP: return "RES_QP";
+	case RES_CQ: return "RES_CQ";
+	case RES_SRQ: return "RES_SRQ";
+	case RES_MPT: return "RES_MPT";
+	case RES_MTT: return "RES_MTT";
+	case RES_MAC: return  "RES_MAC";
+	case RES_VLAN: return  "RES_VLAN";
+	case RES_EQ: return "RES_EQ";
+	case RES_COUNTER: return "RES_COUNTER";
+	case RES_FS_RULE: return "RES_FS_RULE";
+	case RES_XRCD: return "RES_XRCD";
+	default: return "Unknown resource type !!!";
+	};
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave);
+static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave,
+				      enum mlx4_resource res_type, int count,
+				      int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+	int err = -EINVAL;
+	int allocated, free, reserved, guaranteed, from_free;
+
+	spin_lock(&res_alloc->alloc_lock);
+	allocated = (port > 0) ?
+		res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] :
+		res_alloc->allocated[slave];
+	free = (port > 0) ? res_alloc->res_port_free[port - 1] :
+		res_alloc->res_free;
+	reserved = (port > 0) ? res_alloc->res_port_rsvd[port - 1] :
+		res_alloc->res_reserved;
+	guaranteed = res_alloc->guaranteed[slave];
+
+	if (allocated + count > res_alloc->quota[slave])
+		goto out;
+
+	if (allocated + count <= guaranteed) {
+		err = 0;
+	} else {
+		/* portion may need to be obtained from free area */
+		if (guaranteed - allocated > 0)
+			from_free = count - (guaranteed - allocated);
+		else
+			from_free = count;
+
+		if (free - from_free > reserved)
+			err = 0;
+	}
+
+	if (!err) {
+		/* grant the request */
+		if (port > 0) {
+			res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count;
+			res_alloc->res_port_free[port - 1] -= count;
+		} else {
+			res_alloc->allocated[slave] += count;
+			res_alloc->res_free -= count;
+		}
+	}
+
+out:
+	spin_unlock(&res_alloc->alloc_lock);
+	return err;
+}
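[The admission test in mlx4_grant_resource serves a request out of the slave's guaranteed share first; anything beyond that must come from the free pool without dipping into the space reserved for other functions' guarantees. A worked example with made-up quota numbers:

#include <stdio.h>

int main(void)
{
	int quota = 100, guaranteed = 25, allocated = 20;
	int free = 200, reserved = 150;
	int count = 40;
	int from_free, ok = 0;

	if (allocated + count <= quota) {		/* quota check */
		if (allocated + count <= guaranteed) {
			ok = 1;				/* fully guaranteed */
		} else {
			/* only the part beyond the guarantee hits the pool */
			from_free = count - (guaranteed > allocated ?
					     guaranteed - allocated : 0);
			ok = (free - from_free > reserved);
		}
	}
	/* 40 requested: 5 from guarantee, 35 from free; 200-35=165 > 150 */
	printf("grant: %s\n", ok ? "yes" : "no");
	return 0;
}]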
+
+static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave,
+				    enum mlx4_resource res_type, int count,
+				    int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+
+	spin_lock(&res_alloc->alloc_lock);
+	if (port > 0) {
+		res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count;
+		res_alloc->res_port_free[port - 1] += count;
+	} else {
+		res_alloc->allocated[slave] -= count;
+		res_alloc->res_free += count;
+	}
+
+	spin_unlock(&res_alloc->alloc_lock);
+}
+
+static inline void initialize_res_quotas(struct mlx4_dev *dev,
+					 struct resource_allocator *res_alloc,
+					 enum mlx4_resource res_type,
+					 int vf, int num_instances)
+{
+	res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1));
+	res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf];
+	if (vf == mlx4_master_func_num(dev)) {
+		res_alloc->res_free = num_instances;
+		if (res_type == RES_MTT) {
+			/* reserved mtts will be taken out of the PF allocation */
+			res_alloc->res_free += dev->caps.reserved_mtts;
+			res_alloc->guaranteed[vf] += dev->caps.reserved_mtts;
+			res_alloc->quota[vf] += dev->caps.reserved_mtts;
+		}
+	}
+}
+
+void mlx4_init_quotas(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int pf;
+
+	/* quotas for VFs are initialized in mlx4_slave_cap */
+	if (mlx4_is_slave(dev))
+		return;
+
+	if (!mlx4_is_mfunc(dev)) {
+		dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps -
+			mlx4_num_reserved_sqps(dev);
+		dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs;
+		dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs;
+		dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts;
+		dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws;
+		return;
+	}
+
+	pf = mlx4_master_func_num(dev);
+	dev->quotas.qp =
+		priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf];
+	dev->quotas.cq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf];
+	dev->quotas.srq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf];
+	dev->quotas.mtt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf];
+	dev->quotas.mpt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf];
+}
+int mlx4_init_resource_tracker(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, j;
+	int t;
+
+	priv->mfunc.master.res_tracker.slave_list =
+		kzalloc(dev->num_slaves * sizeof(struct slave_list),
+			GFP_KERNEL);
+	if (!priv->mfunc.master.res_tracker.slave_list)
+		return -ENOMEM;
+
+	for (i = 0 ; i < dev->num_slaves; i++) {
+		for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t)
+			INIT_LIST_HEAD(&priv->mfunc.master.res_tracker.
+				       slave_list[i].res_list[t]);
+		mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+	}
+
+	mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n",
+		 dev->num_slaves);
+	for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++)
+		priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT;
+
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		struct resource_allocator *res_alloc =
+			&priv->mfunc.master.res_tracker.res_alloc[i];
+		res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+		res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+		if (i == RES_MAC || i == RES_VLAN)
+			res_alloc->allocated = kzalloc(MLX4_MAX_PORTS *
+						       (dev->num_vfs + 1) * sizeof(int),
+							GFP_KERNEL);
+		else
+			res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+
+		if (!res_alloc->quota || !res_alloc->guaranteed ||
+		    !res_alloc->allocated)
+			goto no_mem_err;
+
+		spin_lock_init(&res_alloc->alloc_lock);
+		for (t = 0; t < dev->num_vfs + 1; t++) {
+			switch (i) {
+			case RES_QP:
+				initialize_res_quotas(dev, res_alloc, RES_QP,
+						      t, dev->caps.num_qps -
+						      dev->caps.reserved_qps -
+						      mlx4_num_reserved_sqps(dev));
+				break;
+			case RES_CQ:
+				initialize_res_quotas(dev, res_alloc, RES_CQ,
+						      t, dev->caps.num_cqs -
+						      dev->caps.reserved_cqs);
+				break;
+			case RES_SRQ:
+				initialize_res_quotas(dev, res_alloc, RES_SRQ,
+						      t, dev->caps.num_srqs -
+						      dev->caps.reserved_srqs);
+				break;
+			case RES_MPT:
+				initialize_res_quotas(dev, res_alloc, RES_MPT,
+						      t, dev->caps.num_mpts -
+						      dev->caps.reserved_mrws);
+				break;
+			case RES_MTT:
+				initialize_res_quotas(dev, res_alloc, RES_MTT,
+						      t, dev->caps.num_mtts -
+						      dev->caps.reserved_mtts);
+				break;
+			case RES_MAC:
+				if (t == mlx4_master_func_num(dev)) {
+					res_alloc->quota[t] =
+						MLX4_MAX_MAC_NUM - 2 * dev->num_vfs;
+					res_alloc->guaranteed[t] = res_alloc->quota[t];
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] = MLX4_MAX_MAC_NUM;
+				} else {
+					res_alloc->quota[t] = 2;
+					res_alloc->guaranteed[t] = 2;
+				}
+				break;
+			case RES_VLAN:
+				if (t == mlx4_master_func_num(dev)) {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM;
+					res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2;
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] =
+							res_alloc->quota[t];
+				} else {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2;
+					res_alloc->guaranteed[t] = 0;
+				}
+				break;
+			case RES_COUNTER:
+				res_alloc->quota[t] = dev->caps.max_counters;
+				res_alloc->guaranteed[t] = 0;
+				if (t == mlx4_master_func_num(dev))
+					res_alloc->res_free = res_alloc->quota[t];
+				break;
+			default:
+				break;
+			}
+			if (i == RES_MAC || i == RES_VLAN) {
+				for (j = 0; j < MLX4_MAX_PORTS; j++)
+					res_alloc->res_port_rsvd[j] +=
+						res_alloc->guaranteed[t];
+			} else {
+				res_alloc->res_reserved += res_alloc->guaranteed[t];
+			}
+		}
+	}
+	spin_lock_init(&priv->mfunc.master.res_tracker.lock);
+	return 0;
+
+no_mem_err:
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+		priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+		priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+		priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+	}
+	return -ENOMEM;
+}
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	if (priv->mfunc.master.res_tracker.slave_list) {
+		if (type != RES_TR_FREE_STRUCTS_ONLY) {
+			for (i = 0; i < dev->num_slaves; i++) {
+				if (type == RES_TR_FREE_ALL ||
+				    dev->caps.function != i)
+					mlx4_delete_all_resources_for_slave(dev, i);
+			}
+			/* free master's vlans */
+			i = dev->caps.function;
+			mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+			rem_slave_vlans(dev, i);
+			mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+		}
+
+		if (type != RES_TR_FREE_SLAVES_ONLY) {
+			for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+				priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+				priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+				priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+			}
+			kfree(priv->mfunc.master.res_tracker.slave_list);
+			priv->mfunc.master.res_tracker.slave_list = NULL;
+		}
+	}
+}
+
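+/*
+ * Replace the slave's virtual pkey index with the physical one via the
+ * paravirtual pkey table; byte 64 of the mailbox is pri_path.sched_queue
+ * (which encodes the port) and byte 35 is the pkey index being rewritten.
+ */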
+static void update_pkey_index(struct mlx4_dev *dev, int slave,
+			      struct mlx4_cmd_mailbox *inbox)
+{
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 orig_index = *(u8 *)(inbox->buf + 35);
+	u8 new_index;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port;
+
+	port = (sched >> 6 & 1) + 1;
+
+	new_index = priv->virt2phys_pkey[slave][port - 1][orig_index];
+	*(u8 *)(inbox->buf + 35) = new_index;
+}
+
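+/*
+ * Paravirtualize the GID index: each slave owns a private range of GIDs,
+ * so rewrite mgid_index in the primary and alternate paths of the QP
+ * context to land inside that slave's range.
+ */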
+static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox,
+		       u8 slave)
+{
+	struct mlx4_qp_context	*qp_ctx = inbox->buf + 8;
+	enum mlx4_qp_optpar	optpar = be32_to_cpu(*(__be32 *) inbox->buf);
+	u32			ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	int port;
+
+	if (MLX4_QP_ST_UD == ts) {
+		port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+		if (mlx4_is_eth(dev, port))
+			qp_ctx->pri_path.mgid_index = mlx4_get_base_gid_ix(dev, slave) | 0x80;
+		else
+			qp_ctx->pri_path.mgid_index = 0x80 | slave;
+
+	} else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_UC == ts) {
+		if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+			port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->pri_path.mgid_index += mlx4_get_base_gid_ix(dev, slave);
+				qp_ctx->pri_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->pri_path.mgid_index = slave & 0x7F;
+			}
+		}
+		if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+			port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->alt_path.mgid_index += mlx4_get_base_gid_ix(dev, slave);
+				qp_ctx->alt_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->alt_path.mgid_index = slave & 0x7F;
+			}
+		}
+	}
+}
+
+static int check_counter_index_validity(struct mlx4_dev *dev, int slave, int port, int idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct counter_index *counter, *tmp_counter;
+
+	if (slave == 0) {
+		list_for_each_entry_safe(counter, tmp_counter,
+					 &priv->counters_table.global_port_list[port - 1],
+					 list) {
+			if (counter->index == idx)
+				return 0;
+		}
+		return -EINVAL;
+	} else {
+		list_for_each_entry_safe(counter, tmp_counter,
+					 &priv->counters_table.vf_list[slave - 1][port - 1],
+					 list) {
+			if (counter->index == idx)
+				return 0;
+		}
+		return -EINVAL;
+	}
+}
+
+static int update_vport_qp_param(struct mlx4_dev *dev,
+				 struct mlx4_cmd_mailbox *inbox,
+				 u8 slave, u32 qpn)
+{
+	struct mlx4_qp_context	*qpc = inbox->buf + 8;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+	u32 qp_type;
+	int port;
+
+	port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1;
+	priv = mlx4_priv(dev);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	qp_type	= (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH &&
+	    qpc->pri_path.counter_index != MLX4_SINK_COUNTER_INDEX) {
+		if (check_counter_index_validity(dev, slave, port,
+						 qpc->pri_path.counter_index))
+			return -EINVAL;
+	}
+
+	mlx4_dbg(dev, "%s: QP counter_index %d for slave %d port %d\n",
+		 __func__, qpc->pri_path.counter_index, slave, port);
+
+	if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) &&
+	    dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH &&
+	    !mlx4_is_qp_reserved(dev, qpn) &&
+	    qp_type == MLX4_QP_ST_MLX &&
+	    qpc->pri_path.counter_index != 0xFF) {
+		/* disable multicast loopback to qp with same counter */
+		qpc->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB;
+		qpc->pri_path.vlan_control |=
+			MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+	}
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		/* the reserved QPs (special, proxy, tunnel)
+		 * do not operate over vlans
+		 */
+		if (mlx4_is_qp_reserved(dev, qpn))
+			return 0;
+
+		/* force strip vlan by clear vsd */
+		qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN);
+		/* preserve IF_COUNTER flag */
+		qpc->pri_path.vlan_control &=
+			MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+		if (MLX4_QP_ST_RC != qp_type) {
+			if (0 != vp_oper->state.default_vlan) {
+				qpc->pri_path.vlan_control |=
+					MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+			} else { /* priority tagged */
+				qpc->pri_path.vlan_control |=
+					MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+			}
+		}
+		qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN;
+		qpc->pri_path.vlan_index = vp_oper->vlan_idx;
+		qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN;
+		qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+		qpc->pri_path.sched_queue &= 0xC7;
+		qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3;
+	}
+	if (vp_oper->state.spoofchk) {
+		qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC;
+		qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx;
+	}
+	return 0;
+}
+
+static int mpt_mask(struct mlx4_dev *dev)
+{
+	return dev->caps.num_mpts - 1;
+}
+
+static void *find_res(struct mlx4_dev *dev, u64 res_id,
+		      enum mlx4_resource type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
+				  res_id);
+}
+
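+/*
+ * Take exclusive hold of a tracked resource: it is parked in RES_ANY_BUSY
+ * until put_res() restores the saved state, so no other command can move
+ * or delete it in the meantime.
+ */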
+static int get_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		   enum mlx4_resource type,
+		   void *res)
+{
+	struct res_common *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (!r) {
+		err = -ENOENT;
+		goto exit;
+	}
+
+	if (r->state == RES_ANY_BUSY) {
+		err = -EBUSY;
+		goto exit;
+	}
+
+	if (r->owner != slave) {
+		err = -EPERM;
+		goto exit;
+	}
+
+	r->from_state = r->state;
+	r->state = RES_ANY_BUSY;
+
+	if (res)
+		*((struct res_common **)res) = r;
+
+exit:
+	spin_unlock_irq(mlx4_tlock(dev));
+	return err;
+}
+
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource type,
+				    u64 res_id, int *slave)
+{
+	struct res_common *r;
+	int err = -ENOENT;
+	int id = res_id;
+
+	if (type == RES_QP)
+		id &= 0x7fffff;
+	spin_lock(mlx4_tlock(dev));
+
+	r = find_res(dev, id, type);
+	if (r) {
+		*slave = r->owner;
+		err = 0;
+	}
+	spin_unlock(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void put_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		    enum mlx4_resource type)
+{
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (r)
+		r->state = r->from_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static struct res_common *alloc_qp_tr(int id)
+{
+	struct res_qp *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_QP_RESERVED;
+	ret->local_qpn = id;
+	INIT_LIST_HEAD(&ret->mcg_list);
+	spin_lock_init(&ret->mcg_spl);
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mtt_tr(int id, int order)
+{
+	struct res_mtt *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->order = order;
+	ret->com.state = RES_MTT_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mpt_tr(int id, int key)
+{
+	struct res_mpt *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_MPT_RESERVED;
+	ret->key = key;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_eq_tr(int id)
+{
+	struct res_eq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_EQ_RESERVED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_cq_tr(int id)
+{
+	struct res_cq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_CQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_srq_tr(int id)
+{
+	struct res_srq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_SRQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_counter_tr(int id)
+{
+	struct res_counter *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_COUNTER_ALLOCATED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_xrcdn_tr(int id)
+{
+	struct res_xrcdn *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_XRCD_ALLOCATED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_fs_rule_tr(u64 id, int qpn)
+{
+	struct res_fs_rule *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_FS_RULE_ALLOCATED;
+	ret->qpn = qpn;
+	return &ret->com;
+}
+
+static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
+				   int extra)
+{
+	struct res_common *ret;
+
+	switch (type) {
+	case RES_QP:
+		ret = alloc_qp_tr(id);
+		break;
+	case RES_MPT:
+		ret = alloc_mpt_tr(id, extra);
+		break;
+	case RES_MTT:
+		ret = alloc_mtt_tr(id, extra);
+		break;
+	case RES_EQ:
+		ret = alloc_eq_tr(id);
+		break;
+	case RES_CQ:
+		ret = alloc_cq_tr(id);
+		break;
+	case RES_SRQ:
+		ret = alloc_srq_tr(id);
+		break;
+	case RES_MAC:
+		printk(KERN_ERR "implementation missing\n");
+		return NULL;
+	case RES_COUNTER:
+		ret = alloc_counter_tr(id);
+		break;
+	case RES_XRCD:
+		ret = alloc_xrcdn_tr(id);
+		break;
+	case RES_FS_RULE:
+		ret = alloc_fs_rule_tr(id, extra);
+		break;
+	default:
+		return NULL;
+	}
+	if (ret)
+		ret->owner = slave;
+
+	return ret;
+}
+
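+/*
+ * Register the id range [base, base + count) to @slave in the tracker;
+ * on a duplicate id or insertion failure every entry added so far is
+ * rolled back.
+ */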
+static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	int i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct res_common **res_arr;
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct rb_root *root = &tracker->res_tree[type];
+
+	res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL);
+	if (!res_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < count; ++i) {
+		res_arr[i] = alloc_tr(base + i, type, slave, extra);
+		if (!res_arr[i]) {
+			for (--i; i >= 0; --i)
+				kfree(res_arr[i]);
+
+			kfree(res_arr);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = 0; i < count; ++i) {
+		if (find_res(dev, base + i, type)) {
+			err = -EEXIST;
+			goto undo;
+		}
+		err = res_tracker_insert(root, res_arr[i]);
+		if (err)
+			goto undo;
+		list_add_tail(&res_arr[i]->list,
+			      &tracker->slave_list[slave].res_list[type]);
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	kfree(res_arr);
+
+	return 0;
+
+undo:
+	for (--i; i >= 0; --i) {
+		rb_erase(&res_arr[i]->node, root);
+		list_del_init(&res_arr[i]->list);
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	for (i = 0; i < count; ++i)
+		kfree(res_arr[i]);
+
+	kfree(res_arr);
+
+	return err;
+}
+
+static int remove_qp_ok(struct res_qp *res)
+{
+	if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) ||
+	    !list_empty(&res->mcg_list)) {
+		pr_err("resource tracker: failed to remove qp, state %d, ref_count %d\n",
+		       res->com.state, atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_QP_RESERVED) {
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static int remove_mtt_ok(struct res_mtt *res, int order)
+{
+	if (res->com.state == RES_MTT_BUSY ||
+	    atomic_read(&res->ref_count)) {
+		printk(KERN_DEBUG "%s-%d: state %s, ref_count %d\n",
+		       __func__, __LINE__,
+		       mtt_states_str(res->com.state),
+		       atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_MTT_ALLOCATED)
+		return -EPERM;
+	else if (res->order != order)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int remove_mpt_ok(struct res_mpt *res)
+{
+	if (res->com.state == RES_MPT_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_MPT_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_eq_ok(struct res_eq *res)
+{
+	if (res->com.state == RES_EQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_EQ_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_counter_ok(struct res_counter *res)
+{
+	if (res->com.state == RES_COUNTER_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_COUNTER_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_xrcdn_ok(struct res_xrcdn *res)
+{
+	if (res->com.state == RES_XRCD_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_XRCD_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_fs_rule_ok(struct res_fs_rule *res)
+{
+	if (res->com.state == RES_FS_RULE_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_FS_RULE_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_cq_ok(struct res_cq *res)
+{
+	if (res->com.state == RES_CQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_CQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_srq_ok(struct res_srq *res)
+{
+	if (res->com.state == RES_SRQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_SRQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
+{
+	switch (type) {
+	case RES_QP:
+		return remove_qp_ok((struct res_qp *)res);
+	case RES_CQ:
+		return remove_cq_ok((struct res_cq *)res);
+	case RES_SRQ:
+		return remove_srq_ok((struct res_srq *)res);
+	case RES_MPT:
+		return remove_mpt_ok((struct res_mpt *)res);
+	case RES_MTT:
+		return remove_mtt_ok((struct res_mtt *)res, extra);
+	case RES_MAC:
+		return -ENOSYS;
+	case RES_EQ:
+		return remove_eq_ok((struct res_eq *)res);
+	case RES_COUNTER:
+		return remove_counter_ok((struct res_counter *)res);
+	case RES_XRCD:
+		return remove_xrcdn_ok((struct res_xrcdn *)res);
+	case RES_FS_RULE:
+		return remove_fs_rule_ok((struct res_fs_rule *)res);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	u64 i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		if (!r) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (r->owner != slave) {
+			err = -EPERM;
+			goto out;
+		}
+		err = remove_ok(r, type, extra);
+		if (err)
+			goto out;
+	}
+
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		rb_erase(&r->node, &tracker->res_tree[type]);
+		list_del(&r->list);
+		kfree(r);
+	}
+	err = 0;
+
+out:
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
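+/*
+ * State-machine helpers: each *_res_start_move_to() validates the requested
+ * transition and parks the resource in its BUSY state; the caller commits
+ * the move with res_end_move() or reverts it with res_abort_move().
+ */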
+static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
+				enum res_qp_states state, struct res_qp **qp,
+				int alloc)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_qp *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_QP_BUSY:
+			mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n",
+				 __func__, (unsigned long long)r->com.res_id);
+			err = -EBUSY;
+			break;
+
+		case RES_QP_RESERVED:
+			if (r->com.state == RES_QP_MAPPED && !alloc)
+				break;
+
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", (unsigned long long)r->com.res_id);
+			err = -EINVAL;
+			break;
+
+		case RES_QP_MAPPED:
+			if ((r->com.state == RES_QP_RESERVED && alloc) ||
+			    r->com.state == RES_QP_HW)
+				break;
+
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n",
+				 (unsigned long long)r->com.res_id);
+			err = -EINVAL;
+			break;
+
+		case RES_QP_HW:
+			if (r->com.state != RES_QP_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_QP_BUSY;
+			if (qp)
+				*qp = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_mpt_states state, struct res_mpt **mpt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mpt *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_MPT_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_MPT_RESERVED:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_MAPPED:
+			if (r->com.state != RES_MPT_RESERVED &&
+			    r->com.state != RES_MPT_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_HW:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_MPT_BUSY;
+			if (mpt)
+				*mpt = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_eq_states state, struct res_eq **eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_eq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_EQ_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_EQ_RESERVED:
+			if (r->com.state != RES_EQ_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_EQ_HW:
+			if (r->com.state != RES_EQ_RESERVED)
+				err = -EINVAL;
+			break;
+
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_EQ_BUSY;
+			if (eq)
+				*eq = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
+				enum res_cq_states state, struct res_cq **cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_cq *r;
+	int err;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_CQ_BUSY:
+			err = -EBUSY;
+			break;
+
+		case RES_CQ_ALLOCATED:
+			if (r->com.state != RES_CQ_HW)
+				err = -EINVAL;
+			else if (atomic_read(&r->ref_count))
+				err = -EBUSY;
+			else
+				err = 0;
+			break;
+
+		case RES_CQ_HW:
+			if (r->com.state != RES_CQ_ALLOCATED)
+				err = -EINVAL;
+			else
+				err = 0;
+			break;
+
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_CQ_BUSY;
+			if (cq)
+				*cq = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				 enum res_srq_states state, struct res_srq **srq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_srq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_SRQ_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_SRQ_ALLOCATED:
+			if (r->com.state != RES_SRQ_HW)
+				err = -EINVAL;
+			else if (atomic_read(&r->ref_count))
+				err = -EBUSY;
+			break;
+
+		case RES_SRQ_HW:
+			if (r->com.state != RES_SRQ_ALLOCATED)
+				err = -EINVAL;
+			break;
+
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_SRQ_BUSY;
+			if (srq)
+				*srq = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void res_abort_move(struct mlx4_dev *dev, int slave,
+			   enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->from_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void res_end_move(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->to_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn)
+{
+	return mlx4_is_qp_reserved(dev, qpn) &&
+		(mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn));
+}
+
+static int fw_reserved(struct mlx4_dev *dev, int qpn)
+{
+	return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+}
+
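+/*
+ * QP allocation is two-phased: RES_OP_RESERVE charges the slave's quota and
+ * reserves a QPN range; RES_OP_MAP_ICM backs one reserved QPN with ICM and
+ * moves it to RES_QP_MAPPED.
+ */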
+static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err;
+	int count;
+	int align;
+	int base;
+	int qpn;
+	u8 flags;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		count = get_param_l(&in_param) & 0xffffff;
+		flags = get_param_l(&in_param) >> 24;
+		align = get_param_h(&in_param);
+		err = mlx4_grant_resource(dev, slave, RES_QP, count, 0);
+		if (err)
+			return err;
+
+		err = __mlx4_qp_reserve_range(dev, count, align, &base, flags);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			return err;
+		}
+
+		err = add_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			__mlx4_qp_release_range(dev, base, count);
+			return err;
+		}
+		set_param_l(out_param, base);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		if (valid_reserved(dev, slave, qpn)) {
+			err = add_res_range(dev, slave, qpn, 1, RES_QP, 0);
+			if (err)
+				return err;
+		}
+
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED,
+					   NULL, 1);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn)) {
+			err = __mlx4_qp_alloc_icm(dev, qpn);
+			if (err) {
+				res_abort_move(dev, slave, RES_QP, qpn);
+				return err;
+			}
+		}
+
+		res_end_move(dev, slave, RES_QP, qpn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	order = get_param_l(&in_param);
+
+	err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0);
+	if (err)
+		return err;
+
+	base = __mlx4_alloc_mtt_range(dev, order);
+	if (base == -1) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		return -ENOMEM;
+	}
+
+	err = add_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	} else
+		set_param_l(out_param, base);
+
+	return err;
+}
+
+static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0);
+		if (err)
+			break;
+
+		index = __mlx4_mpt_reserve(dev);
+		if (index == -1) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			break;
+		}
+		id = index & mpt_mask(dev);
+
+		err = add_res_range(dev, slave, id, 1, RES_MPT, index);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			__mlx4_mpt_release(dev, index);
+			break;
+		}
+		set_param_l(out_param, index);
+		break;
+	case RES_OP_MAP_ICM:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = mr_res_start_move_to(dev, slave, id,
+					   RES_MPT_MAPPED, &mpt);
+		if (err)
+			return err;
+
+		err = __mlx4_mpt_alloc_icm(dev, mpt->key);
+		if (err) {
+			res_abort_move(dev, slave, RES_MPT, id);
+			return err;
+		}
+
+		res_end_move(dev, slave, RES_MPT, id);
+		break;
+	}
+	return err;
+}
+
+static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_cq_alloc_icm(dev, &cqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			__mlx4_cq_free_icm(dev, cqn);
+			break;
+		}
+
+		set_param_l(out_param, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_srq_alloc_icm(dev, &srqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			__mlx4_srq_free_icm(dev, srqn);
+			break;
+		}
+
+		set_param_l(out_param, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port,
+				     u8 smac_index, u64 *mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->smac_index == smac_index && res->port == (u8) port) {
+			*mac = res->mac;
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			/* mac found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof *res, GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+		return -ENOMEM;
+	}
+	res->mac = mac;
+	res->port = (u8) port;
+	res->smac_index = smac_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_MAC]);
+	return 0;
+}
+
+static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac,
+			       int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_macs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		list_del(&res->list);
+		/* drop the mac once for each reference the slave took on it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_mac(dev, res->port, res->mac);
+		mlx4_release_resource(dev, slave, RES_MAC, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param, int in_port)
+{
+	int err = -EINVAL;
+	int port;
+	u64 mac;
+	u8 smac_index = 0;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+	mac = in_param;
+
+	err = __mlx4_register_mac(dev, port, mac);
+	if (err >= 0) {
+		smac_index = err;
+		set_param_l(out_param, err);
+		err = 0;
+	}
+
+	if (!err) {
+		err = mac_add_to_slave(dev, slave, mac, port, smac_index);
+		if (err)
+			__mlx4_unregister_mac(dev, port, mac);
+	}
+	return err;
+}
+
+static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+			     int port, int vlan_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			/* vlan found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, port);
+		return -ENOMEM;
+	}
+	res->vlan = vlan;
+	res->port = (u8) port;
+	res->vlan_index = vlan_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_VLAN]);
+	return 0;
+}
+
+static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+				int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_VLAN,
+						      1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		list_del(&res->list);
+		/* drop the vlan once for each reference the slave took on it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_vlan(dev, res->port, res->vlan);
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param, int in_port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err = -EINVAL;
+	u16 vlan;
+	int vlan_index;
+	int port;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+
+	if (!port)
+		return err;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	/* upstream kernels treated vlan reg/unreg as a NOP; keep that behavior */
+	if (!in_port && port > 0 && port <= dev->caps.num_ports) {
+		slave_state[slave].old_vlan_api = true;
+		return 0;
+	}
+
+	vlan = (u16) in_param;
+
+	err = __mlx4_register_vlan(dev, port, vlan, &vlan_index);
+	if (!err) {
+		set_param_l(out_param, (u32) vlan_index);
+		err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index);
+		if (err)
+			__mlx4_unregister_vlan(dev, port, vlan);
+	}
+	return err;
+}
+
+static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			     u64 in_param, u64 *out_param, int port)
+{
+	u32 index;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = __mlx4_counter_alloc(dev, slave, port, &index);
+	if (!err)
+		set_param_l(out_param, index);
+
+	return err;
+}
+
+static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			   u64 in_param, u64 *out_param)
+{
+	u32 xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = __mlx4_xrcd_alloc(dev, &xrcdn);
+	if (err)
+		return err;
+
+	err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		__mlx4_xrcd_free(dev, xrcdn);
+	else
+		set_param_l(out_param, xrcdn);
+
+	return err;
+}
+
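+/*
+ * Dispatcher for a slave's ALLOC_RES command: the low byte of in_modifier
+ * selects the resource type, bits 8-15 carry the port for per-port
+ * resources (MAC, VLAN, counters).
+ */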
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_CQ:
+		err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param,
+				     (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop,
+					vhcr->in_param, &vhcr->out_param,
+					(vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				      vhcr->in_param, &vhcr->out_param);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param)
+{
+	int err;
+	int count;
+	int base;
+	int qpn;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		base = get_param_l(&in_param) & 0x7fffff;
+		count = get_param_h(&in_param);
+		err = rem_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_QP, count, 0);
+		__mlx4_qp_release_range(dev, base, count);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED,
+					   NULL, 0);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn))
+			__mlx4_qp_free_icm(dev, qpn);
+
+		res_end_move(dev, slave, RES_QP, qpn);
+
+		if (valid_reserved(dev, slave, qpn))
+			err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	base = get_param_l(&in_param);
+	order = get_param_h(&in_param);
+	err = rem_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (!err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	}
+	return err;
+}
+
+static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = get_res(dev, slave, id, RES_MPT, &mpt);
+		if (err)
+			break;
+		index = mpt->key;
+		put_res(dev, slave, id, RES_MPT);
+
+		err = rem_res_range(dev, slave, id, 1, RES_MPT, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+		__mlx4_mpt_release(dev, index);
+		break;
+	case RES_OP_MAP_ICM:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = mr_res_start_move_to(dev, slave, id,
+					   RES_MPT_RESERVED, &mpt);
+		if (err)
+			return err;
+
+		__mlx4_mpt_free_icm(dev, mpt->key);
+		res_end_move(dev, slave, RES_MPT, id);
+		return err;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		cqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+		__mlx4_cq_free_icm(dev, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		srqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+		__mlx4_srq_free_icm(dev, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int in_port)
+{
+	int port;
+	int err = 0;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		port = !in_port ? get_param_l(out_param) : in_port;
+		mac_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_mac(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err = 0;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		if (slave_state[slave].old_vlan_api)
+			return 0;
+		if (!port)
+			return -EINVAL;
+		vlan_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_vlan(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int port)
+{
+	int index;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	index = get_param_l(&in_param);
+
+	__mlx4_counter_free(dev, slave, port, index);
+
+	return 0;
+}
+
+static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param)
+{
+	int xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	xrcdn = get_param_l(&in_param);
+	err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		return err;
+
+	__mlx4_xrcd_free(dev, xrcdn);
+
+	return err;
+}
+
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err = -EINVAL;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param);
+		break;
+
+	case RES_CQ:
+		err = cq_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param,
+				   (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_free_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_free_res(dev, slave, vhcr->op_modifier, alop,
+				       vhcr->in_param, &vhcr->out_param,
+				       (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param);
+		break;
+
+	default:
+		break;
+	}
+	return err;
+}
+
+/* ugly but other choices are uglier */
+static int mr_phys_mpt(struct mlx4_mpt_entry *mpt)
+{
+	return (be32_to_cpu(mpt->flags) >> 9) & 1;
+}
+
+static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt)
+{
+	return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8;
+}
+
+static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->mtt_sz);
+}
+
+static u32 mr_get_pd(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & 0x00ffffff;
+}
+
+static int mr_is_fmr(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & MLX4_MPT_PD_FLAG_FAST_REG;
+}
+
+static int mr_is_bind_enabled(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_BIND_ENABLE;
+}
+
+static int mr_is_region(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_REGION;
+}
+
+static int qp_get_mtt_addr(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int srq_get_mtt_addr(struct mlx4_srq_context *srqc)
+{
+	return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
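+/*
+ * Number of MTT pages backing a QP's work queues: SQ bytes plus RQ bytes
+ * (an SRQ, RSS or XRC QP has no RQ), offset by the page offset from
+ * params2, rounded up to a power-of-two page count.
+ */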
+static int qp_get_mtt_size(struct mlx4_qp_context *qpc)
+{
+	int page_shift = (qpc->log_page_size & 0x3f) + 12;
+	int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf;
+	int log_sq_stride = qpc->sq_size_stride & 7;
+	int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf;
+	int log_rq_stride = qpc->rq_size_stride & 7;
+	int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1;
+	int rss = (be32_to_cpu(qpc->flags) >> 13) & 1;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0;
+	int sq_size;
+	int rq_size;
+	int total_pages;
+	int total_mem;
+	int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f;
+
+	sq_size = 1 << (log_sq_size + log_sq_stride + 4);
+	rq_size = (srq | rss | xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4));
+	total_mem = sq_size + rq_size;
+	total_pages =
+		roundup_pow_of_two((total_mem + (page_offset << 6)) >>
+				   page_shift);
+
+	return total_pages;
+}
+
+static int check_mtt_range(struct mlx4_dev *dev, int slave, int start,
+			   int size, struct res_mtt *mtt)
+{
+	int res_start = mtt->com.res_id;
+	int res_size = (1 << mtt->order);
+
+	if (start < res_start || start + size > res_start + res_size)
+		return -EPERM;
+	return 0;
+}
+
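+/*
+ * SW2HW_MPT on behalf of a slave: verify that any function number embedded
+ * in the PD is the slave's own, reject memory windows and bind-enabled
+ * FMRs, pin the MTT range the region points at, then move the MPT to the
+ * hardware-owned state.
+ */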
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_mpt *mpt;
+	int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz;
+	int phys;
+	int id;
+	u32 pd;
+	int pd_slave;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt);
+	if (err)
+		return err;
+
+	/* Currently disable memory windows since this feature isn't tested yet
+	 * under virtualization.
+	 */
+	if (!mr_is_region(inbox->buf)) {
+		err = -ENOSYS;
+		goto ex_abort;
+	}
+
+	/* Make sure that the PD bits related to the slave id are zeros. */
+	pd = mr_get_pd(inbox->buf);
+	pd_slave = (pd >> 17) & 0x7f;
+	if (pd_slave != 0 && pd_slave != slave) {
+		err = -EPERM;
+		goto ex_abort;
+	}
+
+	if (mr_is_fmr(inbox->buf)) {
+		/* FMR and Bind Enable are forbidden in slave devices. */
+		if (mr_is_bind_enabled(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+		/* FMR and Memory Windows are also forbidden. */
+		if (!mr_is_region(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+	}
+
+	phys = mr_phys_mpt(inbox->buf);
+	if (!phys) {
+		err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+		if (err)
+			goto ex_abort;
+
+		err = check_mtt_range(dev, slave, mtt_base,
+				      mr_get_mtt_size(inbox->buf), mtt);
+		if (err)
+			goto ex_put;
+
+		mpt->mtt = mtt;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	if (!phys) {
+		atomic_inc(&mtt->ref_count);
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	}
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_put:
+	if (!phys)
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt);
+	if (err)
+		return err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	if (mpt->mtt)
+		atomic_dec(&mpt->mtt->ref_count);
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = get_res(dev, slave, id, RES_MPT, &mpt);
+	if (err)
+		return err;
+
+	if (mpt->com.from_state != RES_MPT_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+out:
+	put_res(dev, slave, id, RES_MPT);
+	return err;
+}
+
+static int qp_get_rcqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_recv) & 0xffffff;
+}
+
+static int qp_get_scqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_send) & 0xffffff;
+}
+
+static u32 qp_get_srqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->srqn) & 0x1ffffff;
+}
+
+static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr,
+				  struct mlx4_qp_context *context)
+{
+	u32 qpn = vhcr->in_modifier & 0xffffff;
+	u32 qkey = 0;
+
+	if (mlx4_get_parav_qkey(dev, qpn, &qkey))
+		return;
+
+	/* adjust qkey in qp context */
+	context->qkey = cpu_to_be32(qkey);
+}
+
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_mtt *mtt;
+	struct res_qp *qp;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz;
+	int mtt_size = qp_get_mtt_size(qpc);
+	struct res_cq *rcq;
+	struct res_cq *scq;
+	int rcqn = qp_get_rcqn(qpc);
+	int scqn = qp_get_scqn(qpc);
+	u32 srqn = qp_get_srqn(qpc) & 0xffffff;
+	int use_srq = (qp_get_srqn(qpc) >> 24) & 1;
+	struct res_srq *srq;
+	int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0);
+	if (err)
+		return err;
+	qp->local_qpn = local_qpn;
+	qp->sched_queue = 0;
+	qp->param3 = 0;
+	qp->vlan_control = 0;
+	qp->fvl_rx = 0;
+	qp->pri_path_fl = 0;
+	qp->vlan_index = 0;
+	qp->feup = 0;
+	qp->qpc_flags = be32_to_cpu(qpc->flags);
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = get_res(dev, slave, rcqn, RES_CQ, &rcq);
+	if (err)
+		goto ex_put_mtt;
+
+	if (scqn != rcqn) {
+		err = get_res(dev, slave, scqn, RES_CQ, &scq);
+		if (err)
+			goto ex_put_rcq;
+	} else
+		scq = rcq;
+
+	if (use_srq) {
+		err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+		if (err)
+			goto ex_put_scq;
+	}
+
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	update_pkey_index(dev, slave, inbox);
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_srq;
+	atomic_inc(&mtt->ref_count);
+	qp->mtt = mtt;
+	atomic_inc(&rcq->ref_count);
+	qp->rcq = rcq;
+	atomic_inc(&scq->ref_count);
+	qp->scq = scq;
+
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+
+	if (use_srq) {
+		atomic_inc(&srq->ref_count);
+		put_res(dev, slave, srqn, RES_SRQ);
+		qp->srq = srq;
+	}
+	put_res(dev, slave, rcqn, RES_CQ);
+	put_res(dev, slave, mtt_base, RES_MTT);
+	res_end_move(dev, slave, RES_QP, qpn);
+
+	return 0;
+
+ex_put_srq:
+	if (use_srq)
+		put_res(dev, slave, srqn, RES_SRQ);
+ex_put_scq:
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+ex_put_rcq:
+	put_res(dev, slave, rcqn, RES_CQ);
+ex_put_mtt:
+	put_res(dev, slave, mtt_base, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static int eq_get_mtt_addr(struct mlx4_eq_context *eqc)
+{
+	return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
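+/* EQ entries are 32 bytes each, so the queue occupies
+ * 1 << (log_eq_size + 5) bytes; convert that to pages at the context's
+ * page size, with a minimum of one page.
+ */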
+static int eq_get_mtt_size(struct mlx4_eq_context *eqc)
+{
+	int log_eq_size = eqc->log_eq_size & 0x1f;
+	int page_shift = (eqc->log_page_size & 0x3f) + 12;
+
+	if (log_eq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_eq_size + 5 - page_shift);
+}
+
+static int cq_get_mtt_addr(struct mlx4_cq_context *cqc)
+{
+	return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int cq_get_mtt_size(struct mlx4_cq_context *cqc)
+{
+	int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f;
+	int page_shift = (cqc->log_page_size & 0x3f) + 12;
+
+	if (log_cq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_cq_size + 5 - page_shift);
+}
+
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int eqn = vhcr->in_modifier;
+	int res_id = (slave << 8) | eqn;
+	struct mlx4_eq_context *eqc = inbox->buf;
+	int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz;
+	int mtt_size = eq_get_mtt_size(eqc);
+	struct res_eq *eq;
+	struct res_mtt *mtt;
+
+	err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	if (err)
+		return err;
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq);
+	if (err)
+		goto out_add;
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto out_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+
+	atomic_inc(&mtt->ref_count);
+	eq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+out_add:
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	return err;
+}
+
+static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start,
+			      int len, struct res_mtt **res)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mtt *mtt;
+	int err = -EINVAL;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT],
+			    com.list) {
+		if (!check_mtt_range(dev, slave, start, len, mtt)) {
+			*res = mtt;
+			mtt->com.from_state = mtt->com.state;
+			mtt->com.state = RES_MTT_BUSY;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
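+/*
+ * For RC/UC path-loading transitions, verify that the GID index a
+ * slave requests in the primary or alternate path falls within the
+ * range of GIDs assigned to that slave.
+ */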
+static int verify_qp_parameters(struct mlx4_dev *dev,
+				struct mlx4_cmd_mailbox *inbox,
+				enum qp_transition transition, u8 slave)
+{
+	u32			qp_type;
+	struct mlx4_qp_context	*qp_ctx;
+	enum mlx4_qp_optpar	optpar;
+	int port;
+	int num_gids;
+
+	qp_ctx  = inbox->buf + 8;
+	qp_type	= (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	optpar	= be32_to_cpu(*(__be32 *) inbox->buf);
+
+	switch (qp_type) {
+	case MLX4_QP_ST_RC:
+	case MLX4_QP_ST_UC:
+		switch (transition) {
+		case QP_TRANS_INIT2RTR:
+		case QP_TRANS_RTR2RTS:
+		case QP_TRANS_RTS2RTS:
+		case QP_TRANS_SQD2SQD:
+		case QP_TRANS_SQD2RTS:
+			if (slave != mlx4_master_func_num(dev)) {
+				if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+					port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave);
+					else
+						num_gids = 1;
+					if (qp_ctx->pri_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+				if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+					port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave);
+					else
+						num_gids = 1;
+					if (qp_ctx->alt_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
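+/*
+ * Inbox layout for WRITE_MTT: page_list[0] holds the starting MTT
+ * offset and the entries themselves begin at index 2.  Bit 0 of
+ * each entry is the "present" flag and is masked off before the
+ * addresses are handed to the SW implementation.
+ */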
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_mtt mtt;
+	__be64 *page_list = inbox->buf;
+	u64 *pg_list = (u64 *)page_list;
+	int i;
+	struct res_mtt *rmtt = NULL;
+	int start = be64_to_cpu(page_list[0]);
+	int npages = vhcr->in_modifier;
+	int err;
+
+	err = get_containing_mtt(dev, slave, start, npages, &rmtt);
+	if (err)
+		return err;
+
+	/* Call the SW implementation of write_mtt:
+	 * - Prepare a dummy mtt struct
+	 * - Translate inbox contents to simple addresses in host endianness */
+	mtt.offset = 0;  /* TBD: this is broken, but it is left unhandled
+			    since we don't really use it */
+	mtt.order = 0;
+	mtt.page_shift = 0;
+	for (i = 0; i < npages; ++i)
+		pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL);
+
+	err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages,
+			       ((u64 *)page_list + 2));
+
+	if (rmtt)
+		put_res(dev, slave, rmtt->com.res_id, RES_MTT);
+
+	return err;
+}
+
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 8);
+	struct res_eq *eq;
+	int err;
+
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq);
+	if (err)
+		return err;
+
+	err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL);
+	if (err)
+		goto ex_abort;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	atomic_dec(&eq->mtt->ref_count);
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+
+	return 0;
+
+ex_put:
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+
+	return err;
+}
+
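+/*
+ * Inject an asynchronous event into a slave's event queue.  Only the
+ * first 28 bytes of the 32-byte EQE are copied; the trailing word,
+ * which carries the ownership bit, is left to the EQ logic.
+ */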
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_modifier = 0;
+	int err;
+	int res_id;
+	struct res_eq *req;
+
+	if (!priv->mfunc.master.slave_state)
+		return -EINVAL;
+
+	/* check that the slave id is valid, not the PF, and active */
+	if (slave < 0 || slave >= dev->num_slaves ||
+	    slave == dev->caps.function ||
+	    !priv->mfunc.master.slave_state[slave].active)
+		return 0;
+
+	event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type];
+
+	/* Create the event only if the slave is registered */
+	if (event_eq->eqn < 0)
+		return 0;
+
+	mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	res_id = (slave << 8) | event_eq->eqn;
+	err = get_res(dev, slave, res_id, RES_EQ, &req);
+	if (err)
+		goto unlock;
+
+	if (req->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto put;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto put;
+	}
+
+	if (eqe->type == MLX4_EVENT_TYPE_CMD) {
+		++event_eq->token;
+		eqe->event.cmd.token = cpu_to_be16(event_eq->token);
+	}
+
+	memcpy(mailbox->buf, (u8 *) eqe, 28);
+
+	in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16);
+
+	err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0,
+		       MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	put_res(dev, slave, res_id, RES_EQ);
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+
+put:
+	put_res(dev, slave, res_id, RES_EQ);
+
+unlock:
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	return err;
+}
+
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 8);
+	struct res_eq *eq;
+	int err;
+
+	err = get_res(dev, slave, res_id, RES_EQ, &eq);
+	if (err)
+		return err;
+
+	if (eq->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+ex_put:
+	put_res(dev, slave, res_id, RES_EQ);
+	return err;
+}
+
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+	struct res_cq *cq;
+	struct res_mtt *mtt;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto out_put;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_move;
+	atomic_dec(&cq->mtt->ref_count);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
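+/*
+ * A CQ resize swaps the MTT backing the CQ: validate that the new
+ * range belongs to this slave, run the firmware command, then drop
+ * the reference on the old MTT and take one on the new.
+ */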
+static int handle_resize(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd,
+			 struct res_cq *cq)
+{
+	int err;
+	struct res_mtt *orig_mtt;
+	struct res_mtt *mtt;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+
+	err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt);
+	if (err)
+		return err;
+
+	if (orig_mtt != cq->mtt) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_put;
+
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto ex_put1;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put1;
+	atomic_dec(&orig_mtt->ref_count);
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	return 0;
+
+ex_put1:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_put:
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+
+	return err;
+}
+
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	if (vhcr->op_modifier == 0) {
+		err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq);
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
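+/*
+ * SRQ WQEs have a stride of 2^(log_rq_stride + 4) bytes (16-byte
+ * base stride), so the queue needs
+ * 2^(log_srq_size + log_rq_stride + 4 - page_shift) pages, with a
+ * minimum of one MTT entry.
+ */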
+static int srq_get_mtt_size(struct mlx4_srq_context *srqc)
+{
+	int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf;
+	int log_rq_stride = srqc->logstride & 7;
+	int page_shift = (srqc->log_page_size & 0x3f) + 12;
+
+	if (log_srq_size + log_rq_stride + 4 < page_shift)
+		return 1;
+
+	return 1 << (log_srq_size + log_rq_stride + 4 - page_shift);
+}
+
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_srq *srq;
+	struct mlx4_srq_context *srqc = inbox->buf;
+	int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz;
+
+	if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff))
+		return -EINVAL;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+	err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc),
+			      mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_mtt;
+
+	atomic_inc(&mtt->ref_count);
+	srq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+	return 0;
+
+ex_put_mtt:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+	atomic_dec(&srq->mtt->ref_count);
+	if (srq->cq)
+		atomic_dec(&srq->cq->ref_count);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+static int roce_verify_mac(struct mlx4_dev *dev, int slave,
+				struct mlx4_qp_context *qpc,
+				struct mlx4_cmd_mailbox *inbox)
+{
+	u64 mac;
+	int port;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 smac_ix;
+
+	port = (sched >> 6 & 1) + 1;
+	if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) {
+		smac_ix = qpc->pri_path.grh_mylmc & 0x7f;
+		if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac))
+			return -ENOENT;
+	}
+	return 0;
+}
+
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+	u8 orig_sched_queue;
+	__be32	orig_param3 = qpc->param3;
+	u8 orig_vlan_control = qpc->pri_path.vlan_control;
+	u8 orig_fvl_rx = qpc->pri_path.fvl_rx;
+	u8 orig_pri_path_fl = qpc->pri_path.fl;
+	u8 orig_vlan_index = qpc->pri_path.vlan_index;
+	u8 orig_feup = qpc->pri_path.feup;
+
+	err = verify_qp_parameters(dev, inbox, QP_TRANS_INIT2RTR, slave);
+	if (err)
+		return err;
+
+	if (roce_verify_mac(dev, slave, qpc, inbox))
+		return -EINVAL;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	orig_sched_queue = qpc->pri_path.sched_queue;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* do not modify vport QP params for RSS QPs */
+	if (!(qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET))) {
+		err = update_vport_qp_param(dev, inbox, slave, qpn);
+		if (err)
+			goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	/* if no error, save sched queue value passed in by VF. This is
+	 * essentially the QOS value provided by the VF. This will be useful
+	 * if we allow dynamic changes from VST back to VGT
+	 */
+	if (!err) {
+		qp->sched_queue = orig_sched_queue;
+		qp->param3	= orig_param3;
+		qp->vlan_control = orig_vlan_control;
+		qp->fvl_rx	=  orig_fvl_rx;
+		qp->pri_path_fl = orig_pri_path_fl;
+		qp->vlan_index  = orig_vlan_index;
+		qp->feup	= orig_feup;
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = verify_qp_parameters(dev, inbox, QP_TRANS_RTR2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = verify_qp_parameters(dev, inbox, QP_TRANS_RTS2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2SQD, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2RTS, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	atomic_dec(&qp->mtt->ref_count);
+	atomic_dec(&qp->rcq->ref_count);
+	atomic_dec(&qp->scq->ref_count);
+	if (qp->srq)
+		atomic_dec(&qp->srq->ref_count);
+	res_end_move(dev, slave, RES_QP, qpn);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static struct res_gid *find_gid(struct mlx4_dev *dev, int slave,
+				struct res_qp *rqp, u8 *gid)
+{
+	struct res_gid *res;
+
+	list_for_each_entry(res, &rqp->mcg_list, list) {
+		if (!memcmp(res->gid, gid, 16))
+			return res;
+	}
+	return NULL;
+}
+
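+/*
+ * Every multicast attachment a slave makes is recorded on the QP's
+ * mcg_list so that detach_qp() can undo it if the slave goes away
+ * without detaching first.
+ */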
+static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	res = kzalloc(sizeof *res, GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	if (find_gid(dev, slave, rqp, gid)) {
+		kfree(res);
+		err = -EEXIST;
+	} else {
+		memcpy(res->gid, gid, 16);
+		res->prot = prot;
+		res->steer = steer;
+		res->reg_id = reg_id;
+		list_add_tail(&res->list, &rqp->mcg_list);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 *reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	res = find_gid(dev, slave, rqp, gid);
+	if (!res || res->prot != prot || res->steer != steer)
+		err = -EINVAL;
+	else {
+		*reg_id = res->reg_id;
+		list_del(&res->list);
+		kfree(res);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int qp_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+		     int block_loopback, enum mlx4_protocol prot,
+		     enum mlx4_steer_type type, u64 *reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, gid[5],
+						block_loopback, prot,
+						reg_id);
+	case MLX4_STEERING_MODE_B0:
+		return mlx4_qp_attach_common(dev, qp, gid,
+					    block_loopback, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+		     enum mlx4_protocol prot, enum mlx4_steer_type type,
+		     u64 reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+	case MLX4_STEERING_MODE_B0:
+		return mlx4_qp_detach_common(dev, qp, gid, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+	u8 *gid = inbox->buf;
+	enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7;
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	u64 reg_id = 0;
+	int attach = vhcr->op_modifier;
+	int block_loopback = vhcr->in_modifier >> 31;
+	u8 steer_type_mask = 2;
+	enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1;
+
+	qpn = vhcr->in_modifier & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	qp.qpn = qpn;
+	if (attach) {
+		err = qp_attach(dev, &qp, gid, block_loopback, prot,
+				type, &reg_id);
+		if (err) {
+			pr_err("Fail to attach rule to qp 0x%x\n", qpn);
+			goto ex_put;
+		}
+		err = add_mcg_res(dev, slave, rqp, gid, prot, type, reg_id);
+		if (err)
+			goto ex_detach;
+	} else {
+		err = rem_mcg_res(dev, slave, rqp, gid, prot, type, &reg_id);
+		if (err)
+			goto ex_put;
+
+		err = qp_detach(dev, &qp, gid, prot, type, reg_id);
+		if (err)
+			pr_err("Fail to detach rule from qp 0x%x reg_id = 0x%llx\n",
+			       qpn, (unsigned long long)reg_id);
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+
+ex_detach:
+	qp_detach(dev, &qp, gid, prot, type, reg_id);
+ex_put:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+/*
+ * MAC validation for Flow Steering rules.
+ * A VF may attach rules only with a MAC address that is assigned to it.
+ */
+static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header,
+				   struct list_head *rlist)
+{
+	struct mac_res *res, *tmp;
+	__be64 be_mac;
+
+	/* make sure it isn't a multicast or broadcast MAC */
+	if (!is_multicast_ether_addr(eth_header->eth.dst_mac) &&
+	    !is_broadcast_ether_addr(eth_header->eth.dst_mac)) {
+		list_for_each_entry_safe(res, tmp, rlist, list) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			if (!memcmp(&be_mac, eth_header->eth.dst_mac, ETH_ALEN))
+				return 0;
+		}
+		pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n",
+		       eth_header->eth.dst_mac, slave);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * If the eth header is missing, prepend one carrying a MAC address
+ * assigned to the VF.
+ */
+static int add_eth_header(struct mlx4_dev *dev, int slave,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct list_head *rlist, int header_id)
+{
+	struct mac_res *res, *tmp;
+	u8 port;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct mlx4_net_trans_rule_hw_eth *eth_header;
+	struct mlx4_net_trans_rule_hw_ipv4 *ip_header;
+	struct mlx4_net_trans_rule_hw_tcp_udp *l4_header;
+	__be64 be_mac = 0;
+	__be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	port = ctrl->port;
+	eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1);
+
+	/* Make room in the inbox for the eth header */
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		ip_header =
+			(struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1);
+		memmove(ip_header, eth_header,
+			sizeof(*ip_header) + sizeof(*l4_header));
+		break;
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *)
+			    (eth_header + 1);
+		memmove(l4_header, eth_header, sizeof(*l4_header));
+		break;
+	default:
+		return -EINVAL;
+	}
+	list_for_each_entry_safe(res, tmp, rlist, list) {
+		if (port == res->port) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			break;
+		}
+	}
+	if (!be_mac) {
+		pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d .\n",
+		       port);
+		return -EINVAL;
+	}
+
+	memset(eth_header, 0, sizeof(*eth_header));
+	eth_header->size = sizeof(*eth_header) >> 2;
+	eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]);
+	memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN);
+	memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN);
+
+	return 0;
+}
+
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC];
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct _rule_hw  *rule_header;
+	int header_id;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	qpn = be32_to_cpu(ctrl->qpn) & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err) {
+		pr_err("Steering rule with qpn 0x%x rejected.\n", qpn);
+		return err;
+	}
+	rule_header = (struct _rule_hw *)(ctrl + 1);
+	header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id));
+
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		if (validate_eth_header_mac(slave, rule_header, rlist)) {
+			err = -EINVAL;
+			goto err_put;
+		}
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		pr_warn("Can't attach FS rule without L2 headers, adding L2 header.\n");
+		if (add_eth_header(dev, slave, inbox, rlist, header_id)) {
+			err = -EINVAL;
+			goto err_put;
+		}
+		vhcr->in_modifier +=
+			sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2;
+		break;
+	default:
+		pr_err("Corrupted mailbox.\n");
+		err = -EINVAL;
+		goto err_put;
+	}
+
+	err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
+			   vhcr->in_modifier, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto err_put;
+
+	err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn);
+	if (err) {
+		mlx4_err(dev, "Fail to add flow steering resources.\n ");
+		/* detach rule*/
+		mlx4_cmd(dev, vhcr->out_param, 0, 0,
+			 MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_NATIVE);
+		goto err_put;
+	}
+	atomic_inc(&rqp->ref_count);
+err_put:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct res_qp *rqp;
+	struct res_fs_rule *rrule;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule);
+	if (err)
+		return err;
+	/* Release the rule from busy state before removal */
+	put_res(dev, slave, vhcr->in_param, RES_FS_RULE);
+	err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	err = mlx4_cmd(dev, vhcr->in_param, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (!err) {
+		err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE,
+				    0);
+		atomic_dec(&rqp->ref_count);
+
+		if (err) {
+			mlx4_err(dev, "Fail to remove flow steering resources.\n ");
+			goto out;
+		}
+	}
+
+out:
+	put_res(dev, slave, rrule->qpn, RES_QP);
+	return err;
+}
+
+enum {
+	BUSY_MAX_RETRIES = 10
+};
+
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+	return err;
+}
+
+static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp)
+{
+	struct res_gid *rgid;
+	struct res_gid *tmp;
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+
+	list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) {
+		switch (dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			mlx4_flow_detach(dev, rgid->reg_id);
+			break;
+		case MLX4_STEERING_MODE_B0:
+			qp.qpn = rqp->local_qpn;
+			(void) mlx4_qp_detach_common(dev, &qp, rgid->gid,
+						     rgid->prot, rgid->steer);
+			break;
+		}
+		list_del(&rgid->list);
+		kfree(rgid);
+	}
+}
+
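+/*
+ * Mark every resource of the given type owned by this slave as busy
+ * and "removing".  The return value is the number of resources that
+ * were already busy elsewhere; move_all_busy() below retries those
+ * for up to five seconds before giving up.
+ */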
+static int _move_all_busy(struct mlx4_dev *dev, int slave,
+			  enum mlx4_resource type, int print)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[type];
+	struct res_common *r;
+	struct res_common *tmp;
+	int busy;
+
+	busy = 0;
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(r, tmp, rlist, list) {
+		if (r->owner == slave) {
+			if (!r->removing) {
+				if (r->state == RES_ANY_BUSY) {
+					if (print)
+						mlx4_dbg(dev,
+							 "%s id 0x%llx is busy\n",
+							  ResourceType(type),
+							  (unsigned long long)r->res_id);
+					++busy;
+				} else {
+					r->from_state = r->state;
+					r->state = RES_ANY_BUSY;
+					r->removing = 1;
+				}
+			}
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return busy;
+}
+
+static int move_all_busy(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type)
+{
+	unsigned long begin;
+	int busy;
+
+	begin = jiffies;
+	do {
+		busy = _move_all_busy(dev, slave, type, 0);
+		if (time_after(jiffies, begin + 5 * HZ))
+			break;
+		if (busy)
+			cond_resched();
+	} while (busy);
+
+	if (busy)
+		busy = _move_all_busy(dev, slave, type, 1);
+
+	return busy;
+}
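+
+/*
+ * Per-type teardown for a departing slave: each rem_slave_* helper
+ * walks the tracker's per-slave list and unwinds every resource
+ * state by state (a QP still in RES_QP_HW is first moved to reset,
+ * then its ICM is freed, then the reservation itself is released).
+ */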
+static void rem_slave_qps(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	int state;
+	u64 in_param;
+	int qpn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_QP);
+	if (err)
+		mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy"
+			  "for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == slave) {
+			qpn = qp->com.res_id;
+			detach_qp(dev, slave, qp);
+			state = qp->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_QP_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&qp->com.node,
+						 &tracker->res_tree[RES_QP]);
+					list_del(&qp->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					if (!valid_reserved(dev, slave, qpn)) {
+						__mlx4_qp_release_range(dev, qpn, 1);
+						mlx4_release_resource(dev, slave,
+								      RES_QP, 1, 0);
+					}
+					kfree(qp);
+					state = 0;
+					break;
+				case RES_QP_MAPPED:
+					if (!valid_reserved(dev, slave, qpn))
+						__mlx4_qp_free_icm(dev, qpn);
+					state = RES_QP_RESERVED;
+					break;
+				case RES_QP_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param,
+						       qp->local_qpn, 2,
+						       MLX4_CMD_2RST_QP,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_qps: failed"
+							 " to move slave %d qpn %d to"
+							 " reset\n", slave,
+							 qp->local_qpn);
+					atomic_dec(&qp->rcq->ref_count);
+					atomic_dec(&qp->scq->ref_count);
+					atomic_dec(&qp->mtt->ref_count);
+					if (qp->srq)
+						atomic_dec(&qp->srq->ref_count);
+					state = RES_QP_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_srqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *srq_list =
+		&tracker->slave_list[slave].res_list[RES_SRQ];
+	struct res_srq *srq;
+	struct res_srq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int srqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_SRQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(srq, tmp, srq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (srq->com.owner == slave) {
+			srqn = srq->com.res_id;
+			state = srq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_SRQ_ALLOCATED:
+					__mlx4_srq_free_icm(dev, srqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&srq->com.node,
+						 &tracker->res_tree[RES_SRQ]);
+					list_del(&srq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_SRQ, 1, 0);
+					kfree(srq);
+					state = 0;
+					break;
+
+				case RES_SRQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, srqn, 1,
+						       MLX4_CMD_HW2SW_SRQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_srqs: failed"
+							 " to move slave %d srq %d to"
+							 " SW ownership\n",
+							 slave, srqn);
+
+					atomic_dec(&srq->mtt->ref_count);
+					if (srq->cq)
+						atomic_dec(&srq->cq->ref_count);
+					state = RES_SRQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_cqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *cq_list =
+		&tracker->slave_list[slave].res_list[RES_CQ];
+	struct res_cq *cq;
+	struct res_cq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int cqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_CQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(cq, tmp, cq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) {
+			cqn = cq->com.res_id;
+			state = cq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_CQ_ALLOCATED:
+					__mlx4_cq_free_icm(dev, cqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&cq->com.node,
+						 &tracker->res_tree[RES_CQ]);
+					list_del(&cq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_CQ, 1, 0);
+					kfree(cq);
+					state = 0;
+					break;
+
+				case RES_CQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, cqn, 1,
+						       MLX4_CMD_HW2SW_CQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_cqs: failed"
+							 " to move slave %d cq %d to"
+							 " SW ownership\n",
+							 slave, cqn);
+					atomic_dec(&cq->mtt->ref_count);
+					state = RES_CQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mrs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mpt_list =
+		&tracker->slave_list[slave].res_list[RES_MPT];
+	struct res_mpt *mpt;
+	struct res_mpt *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int mptn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MPT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mpt->com.owner == slave) {
+			mptn = mpt->com.res_id;
+			state = mpt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MPT_RESERVED:
+					__mlx4_mpt_release(dev, mpt->key);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mpt->com.node,
+						 &tracker->res_tree[RES_MPT]);
+					list_del(&mpt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_MPT, 1, 0);
+					kfree(mpt);
+					state = 0;
+					break;
+
+				case RES_MPT_MAPPED:
+					__mlx4_mpt_free_icm(dev, mpt->key);
+					state = RES_MPT_RESERVED;
+					break;
+
+				case RES_MPT_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, mptn, 0,
+						     MLX4_CMD_HW2SW_MPT,
+						     MLX4_CMD_TIME_CLASS_A,
+						     MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_mrs: failed"
+							 " to move slave %d mpt %d to"
+							 " SW ownership\n",
+							 slave, mptn);
+					if (mpt->mtt)
+						atomic_dec(&mpt->mtt->ref_count);
+					state = RES_MPT_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *mtt_list =
+		&tracker->slave_list[slave].res_list[RES_MTT];
+	struct res_mtt *mtt;
+	struct res_mtt *tmp;
+	int state;
+	LIST_HEAD(tlist);
+	int base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MTT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mtt->com.owner == slave) {
+			base = mtt->com.res_id;
+			state = mtt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MTT_ALLOCATED:
+					__mlx4_free_mtt_range(dev, base,
+							      mtt->order);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mtt->com.node,
+						 &tracker->res_tree[RES_MTT]);
+					list_del(&mtt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave, RES_MTT,
+							      1 << mtt->order, 0);
+					kfree(mtt);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *fs_rule_list =
+		&tracker->slave_list[slave].res_list[RES_FS_RULE];
+	struct res_fs_rule *fs_rule;
+	struct res_fs_rule *tmp;
+	int state;
+	u64 base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_FS_RULE);
+	if (err)
+		mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (fs_rule->com.owner == slave) {
+			base = fs_rule->com.res_id;
+			state = fs_rule->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_FS_RULE_ALLOCATED:
+					/* detach rule */
+					err = mlx4_cmd(dev, base, 0, 0,
+						       MLX4_QP_FLOW_STEERING_DETACH,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&fs_rule->com.node,
+						 &tracker->res_tree[RES_FS_RULE]);
+					list_del(&fs_rule->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(fs_rule);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *eq_list =
+		&tracker->slave_list[slave].res_list[RES_EQ];
+	struct res_eq *eq;
+	struct res_eq *tmp;
+	int err;
+	int state;
+	LIST_HEAD(tlist);
+	int eqn;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	err = move_all_busy(dev, slave, RES_EQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(eq, tmp, eq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (eq->com.owner == slave) {
+			eqn = eq->com.res_id;
+			state = eq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_EQ_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&eq->com.node,
+						 &tracker->res_tree[RES_EQ]);
+					list_del(&eq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(eq);
+					state = 0;
+					break;
+
+				case RES_EQ_HW:
+					mailbox = mlx4_alloc_cmd_mailbox(dev);
+					if (IS_ERR(mailbox)) {
+						cond_resched();
+						continue;
+					}
+					err = mlx4_cmd_box(dev, slave, 0,
+							   eqn & 0xff, 0,
+							   MLX4_CMD_HW2SW_EQ,
+							   MLX4_CMD_TIME_CLASS_A,
+							   MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_eqs: failed"
+							 " to move slave %d eqs %d to"
+							 " SW ownership\n", slave, eqn);
+					mlx4_free_cmd_mailbox(dev, mailbox);
+					atomic_dec(&eq->mtt->ref_count);
+					state = RES_EQ_RESERVED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_counters(struct mlx4_dev *dev, int slave)
+{
+	__mlx4_slave_counters_free(dev, slave);
+}
+
+static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *xrcdn_list =
+		&tracker->slave_list[slave].res_list[RES_XRCD];
+	struct res_xrcdn *xrcd;
+	struct res_xrcdn *tmp;
+	int err;
+	int xrcdn;
+
+	err = move_all_busy(dev, slave, RES_XRCD);
+	if (err)
+		mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns to "
+			  "busy for slave %d\n", slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) {
+		if (xrcd->com.owner == slave) {
+			xrcdn = xrcd->com.res_id;
+			rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]);
+			list_del(&xrcd->com.list);
+			kfree(xrcd);
+			__mlx4_xrcd_free(dev, xrcdn);
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+	rem_slave_macs(dev, slave);
+	rem_slave_vlans(dev, slave);
+	rem_slave_fs_rule(dev, slave);
+	rem_slave_qps(dev, slave);
+	rem_slave_srqs(dev, slave);
+	rem_slave_cqs(dev, slave);
+	rem_slave_mrs(dev, slave);
+	rem_slave_eqs(dev, slave);
+	rem_slave_mtts(dev, slave);
+	rem_slave_counters(dev, slave);
+	rem_slave_xrcdns(dev, slave);
+	mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+}
+
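+/*
+ * Deferred work for immediate VLAN/QoS changes on a VF: walk all of
+ * the slave's QPs on the given port and re-program their VLAN and
+ * scheduling parameters with UPDATE_QP, restoring the values the VF
+ * originally supplied when moving back to VGT.
+ */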
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work)
+{
+	struct mlx4_vf_immed_vlan_work *work =
+		container_of(_work, struct mlx4_vf_immed_vlan_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_update_qp_context *upd_context;
+	struct mlx4_dev *dev = &work->priv->dev;
+	struct mlx4_resource_tracker *tracker =
+		&work->priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[work->slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	u64 qp_path_mask_vlan_ctrl =
+		       ((1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED));
+
+	u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_CV) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE));
+
+	int err;
+	int port, errors = 0;
+	u8 vlan_control;
+
+	if (mlx4_is_slave(dev)) {
+		mlx4_warn(dev, "Trying to update-qp in slave %d\n",
+			  work->slave);
+		goto out;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto out;
+
+	if (!work->vlan_id)
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+	else
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+
+	upd_context = mailbox->buf;
+	upd_context->qp_mask = cpu_to_be64(MLX4_UPD_QP_MASK_VSD);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == work->slave) {
+			if (qp->com.from_state != RES_QP_HW ||
+			    !qp->sched_queue ||  /* no INIT2RTR trans yet */
+			    mlx4_is_qp_reserved(dev, qp->local_qpn) ||
+			    qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			port = (qp->sched_queue >> 6 & 1) + 1;
+			if (port != work->port) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			if (MLX4_QP_ST_RC == ((qp->qpc_flags >> 16) & 0xff))
+				upd_context->primary_addr_path_mask = cpu_to_be64(qp_path_mask);
+			else
+				upd_context->primary_addr_path_mask =
+					cpu_to_be64(qp_path_mask | qp_path_mask_vlan_ctrl);
+			if (work->vlan_id == MLX4_VGT) {
+				upd_context->qp_context.param3 = qp->param3;
+				upd_context->qp_context.pri_path.vlan_control = qp->vlan_control;
+				upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx;
+				upd_context->qp_context.pri_path.vlan_index = qp->vlan_index;
+				upd_context->qp_context.pri_path.fl = qp->pri_path_fl;
+				upd_context->qp_context.pri_path.feup = qp->feup;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue;
+			} else {
+				upd_context->qp_context.param3 = qp->param3 & ~cpu_to_be32(MLX4_STRIP_VLAN);
+				upd_context->qp_context.pri_path.vlan_control = vlan_control;
+				upd_context->qp_context.pri_path.vlan_index = work->vlan_ix;
+				upd_context->qp_context.pri_path.fvl_rx =
+					qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.fl =
+					qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN;
+				upd_context->qp_context.pri_path.feup =
+					qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue & 0xC7;
+				upd_context->qp_context.pri_path.sched_queue |=
+					((work->qos & 0x7) << 3);
+			}
+
+			err = mlx4_cmd(dev, mailbox->dma,
+				       qp->local_qpn & 0xffffff,
+				       0, MLX4_CMD_UPDATE_QP,
+				       MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+			if (err) {
+				mlx4_info(dev, "UPDATE_QP failed for slave %d, "
+					  "port %d, qpn %d (%d)\n",
+					  work->slave, port, qp->local_qpn,
+					  err);
+				errors++;
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	if (errors)
+		mlx4_err(dev, "%d UPDATE_QP failures for slave %d, port %d\n",
+			 errors, work->slave, work->port);
+
+	/* unregister previous vlan_id if needed and we had no errors
+	 * while updating the QPs
+	 */
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN && !errors &&
+	    NO_INDX != work->orig_vlan_ix)
+		__mlx4_unregister_vlan(&work->priv->dev, work->port,
+				       work->orig_vlan_id);
+out:
+	kfree(work);
+	return;
+}
+


Property changes on: trunk/sys/ofed/drivers/net/mlx4/resource_tracker.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/drivers/net/mlx4/sense.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/sense.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/sense.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,14 +38,15 @@
 
 #include "mlx4.h"
 
-static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
-			   enum mlx4_port_type *type)
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type)
 {
 	u64 out_param;
 	int err = 0;
 
 	err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
-			   MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B);
+			   MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
 	if (err) {
 		mlx4_err(dev, "Sense command failed for port: %d\n", port);
 		return err;
@@ -52,8 +53,8 @@
 	}
 
 	if (out_param > 2) {
-		mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param);
-		return EINVAL;
+		mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", (unsigned long long)out_param);
+		return -EINVAL;
 	}
 
 	*type = out_param;
@@ -80,20 +81,6 @@
 	}
 
 	/*
-	 * Adjust port configuration:
-	 * If port 1 sensed nothing and port 2 is IB, set both as IB
-	 * If port 2 sensed nothing and port 1 is Eth, set both as Eth
-	 */
-	if (stype[0] == MLX4_PORT_TYPE_ETH) {
-		for (i = 1; i < dev->caps.num_ports; i++)
-			stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_ETH;
-	}
-	if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) {
-		for (i = 0; i < dev->caps.num_ports - 1; i++)
-			stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_IB;
-	}
-
-	/*
 	 * If sensed nothing, remain in current configuration.
 	 */
 	for (i = 0; i < dev->caps.num_ports; i++)
@@ -121,9 +108,8 @@
 
 sense_again:
 	mutex_unlock(&priv->port_mutex);
-	if (sense->resched)
-		queue_delayed_work(sense->sense_wq , &sense->sense_poll,
-				   round_jiffies(MLX4_SENSE_RANGE));
+	queue_delayed_work(mlx4_wq, &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
 }
 
 void mlx4_start_sense(struct mlx4_dev *dev)
@@ -134,18 +120,16 @@
 	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP))
 		return;
 
-	sense->resched = 1;
-	queue_delayed_work(sense->sense_wq , &sense->sense_poll,
-			   round_jiffies(MLX4_SENSE_RANGE));
+	queue_delayed_work(mlx4_wq, &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
 }
 
-
 void mlx4_stop_sense(struct mlx4_dev *dev)
 {
-	mlx4_priv(dev)->sense.resched = 0;
+	cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll);
 }
 
-int mlx4_sense_init(struct mlx4_dev *dev)
+void mlx4_sense_init(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_sense *sense = &priv->sense;
@@ -152,21 +136,8 @@
 	int port;
 
 	sense->dev = dev;
-	sense->sense_wq = create_singlethread_workqueue("mlx4_sense");
-	if (!sense->sense_wq)
-		return -ENOMEM;
-
 	for (port = 1; port <= dev->caps.num_ports; port++)
 		sense->do_sense_port[port] = 1;
 
-	INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port);
-	return 0;
+	INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port);
 }
-
-void mlx4_sense_cleanup(struct mlx4_dev *dev)
-{
-	mlx4_stop_sense(dev);
-	cancel_delayed_work(&mlx4_priv(dev)->sense.sense_poll);
-	destroy_workqueue(mlx4_priv(dev)->sense.sense_wq);
-}
-

Modified: trunk/sys/ofed/drivers/net/mlx4/srq.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/srq.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/drivers/net/mlx4/srq.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,34 +31,14 @@
  * SOFTWARE.
  */
 
-#include <linux/init.h>
-
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/srq.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
 
 #include "mlx4.h"
 #include "icm.h"
 
-struct mlx4_srq_context {
-	__be32			state_logsize_srqn;
-	u8			logstride;
-	u8			reserved1;
-	__be16			xrc_domain;
-	__be32			pg_offset_cqn;
-	u32			reserved2;
-	u8			log_page_size;
-	u8			reserved3[2];
-	u8			mtt_base_addr_h;
-	__be32			mtt_base_addr_l;
-	__be32			pd;
-	__be16			limit_watermark;
-	__be16			wqe_cnt;
-	u16			reserved4;
-	__be16			wqe_counter;
-	u32			reserved5;
-	__be64			db_rec_addr;
-};
-
 void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
 {
 	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
@@ -66,8 +46,7 @@
 
 	spin_lock(&srq_table->lock);
 
-	srq = radix_tree_lookup(&dev->srq_table_tree,
-				srqn & (dev->caps.num_srqs - 1));
+	srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1));
 	if (srq)
 		atomic_inc(&srq->refcount);
 
@@ -87,8 +66,9 @@
 static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			  int srq_num)
 {
-	return mlx4_cmd(dev, mailbox->dma, srq_num, 0, MLX4_CMD_SW2HW_SRQ,
-			MLX4_CMD_TIME_CLASS_A);
+	return mlx4_cmd(dev, mailbox->dma, srq_num, 0,
+			MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
@@ -96,13 +76,13 @@
 {
 	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
 			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ,
-			    MLX4_CMD_TIME_CLASS_A);
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
 {
 	return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ,
-			MLX4_CMD_TIME_CLASS_B);
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 }
 
 static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
@@ -109,35 +89,96 @@
 			  int srq_num)
 {
 	return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ,
-			    MLX4_CMD_TIME_CLASS_A);
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 }
 
-int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
-		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
 {
 	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
-	struct mlx4_cmd_mailbox *mailbox;
-	struct mlx4_srq_context *srq_context;
-	u64 mtt_addr;
 	int err;
 
-	srq->srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
-	if (srq->srqn == -1)
+
+	*srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
+	if (*srqn == -1)
 		return -ENOMEM;
 
-	err = mlx4_table_get(dev, &srq_table->table, srq->srqn);
+	err = mlx4_table_get(dev, &srq_table->table, *srqn);
 	if (err)
 		goto err_out;
 
-	err = mlx4_table_get(dev, &srq_table->cmpt_table, srq->srqn);
+	err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn);
 	if (err)
 		goto err_put;
+	return 0;
 
+err_put:
+	mlx4_table_put(dev, &srq_table->table, *srqn);
+
+err_out:
+	mlx4_bitmap_free(&srq_table->bitmap, *srqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ,
+				   RES_OP_RESERVE_AND_MAP,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*srqn = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_srq_alloc_icm(dev, srqn);
+}
+
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	mlx4_table_put(dev, &srq_table->cmpt_table, srqn);
+	mlx4_table_put(dev, &srq_table->table, srqn);
+	mlx4_bitmap_free(&srq_table->bitmap, srqn, MLX4_NO_RR);
+}
+
+static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, srqn);
+		if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP,
+			     MLX4_CMD_FREE_RES,
+			     MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed freeing srq:%d\n", srqn);
+		return;
+	}
+	__mlx4_srq_free_icm(dev, srqn);
+}
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	u64 mtt_addr;
+	int err;
+
+	err = mlx4_srq_alloc_icm(dev, &srq->srqn);
+	if (err)
+		return err;
+
 	spin_lock_irq(&srq_table->lock);
-	err = radix_tree_insert(&dev->srq_table_tree, srq->srqn, srq);
+	err = radix_tree_insert(&srq_table->tree, srq->srqn, srq);
 	spin_unlock_irq(&srq_table->lock);
 	if (err)
-		goto err_cmpt_put;
+		goto err_icm;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox)) {
@@ -151,7 +192,7 @@
 	srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
 						      srq->srqn);
 	srq_context->logstride          = srq->wqe_shift - 4;
-	srq_context->xrc_domain		= cpu_to_be16(xrcd);
+	srq_context->xrcd		= cpu_to_be16(xrcd);
 	srq_context->pg_offset_cqn	= cpu_to_be32(cqn & 0xffffff);
 	srq_context->log_page_size      = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 
@@ -173,52 +214,33 @@
 
 err_radix:
 	spin_lock_irq(&srq_table->lock);
-	radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
 	spin_unlock_irq(&srq_table->lock);
 
-err_cmpt_put:
-	mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn);
-
-err_put:
-	mlx4_table_put(dev, &srq_table->table, srq->srqn);
-
-err_out:
-	mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
-
+err_icm:
+	mlx4_srq_free_icm(dev, srq->srqn);
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
 
-void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq)
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
 {
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
 	int err;
 
 	err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
 	if (err)
 		mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn);
-}
-EXPORT_SYMBOL_GPL(mlx4_srq_invalidate);
 
-void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq)
-{
-	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
-
 	spin_lock_irq(&srq_table->lock);
-	radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
 	spin_unlock_irq(&srq_table->lock);
-}
-EXPORT_SYMBOL_GPL(mlx4_srq_remove);
 
-void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
-{
-	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
-
 	if (atomic_dec_and_test(&srq->refcount))
 		complete(&srq->free);
 	wait_for_completion(&srq->free);
 
-	mlx4_table_put(dev, &srq_table->table, srq->srqn);
-	mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+	mlx4_srq_free_icm(dev, srq->srqn);
 }
 EXPORT_SYMBOL_GPL(mlx4_srq_free);
 
@@ -257,7 +279,9 @@
 	int err;
 
 	spin_lock_init(&srq_table->lock);
-	INIT_RADIX_TREE(&dev->srq_table_tree, GFP_ATOMIC);
+	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
 
 	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
 			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
@@ -269,5 +293,22 @@
 
 void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
 {
+	if (mlx4_is_slave(dev))
+		return;
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
 }
+
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&srq_table->lock, flags);
+	srq = radix_tree_lookup(&srq_table->tree,
+				srqn & (dev->caps.num_srqs - 1));
+	spin_unlock_irqrestore(&srq_table->lock, flags);
+
+	return srq;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_lookup);

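The reworked srq.c above splits allocation into a dispatch layer: mlx4_srq_alloc_icm()/mlx4_srq_free_icm() forward to a firmware command (MLX4_CMD_ALLOC_RES/MLX4_CMD_FREE_RES) when the device is multi-function, and fall through to the direct __mlx4_srq_*_icm() ICM-table path otherwise. A minimal user-space sketch of that pattern; all names are illustrative stand-ins:

	#include <stdbool.h>
	#include <stdio.h>

	static bool multifunction;	/* stand-in for mlx4_is_mfunc(dev) */
	static int next_srqn = 64;	/* pretend bitmap allocator state */

	/* Direct path: the master/native driver owns the tables. */
	static int direct_alloc_icm(int *srqn)
	{
		*srqn = next_srqn++;
		printf("direct: srqn %d\n", *srqn);
		return (0);
	}

	/* Proxied path: a slave asks the master via a firmware command. */
	static int fw_alloc_icm(int *srqn)
	{
		*srqn = next_srqn++;	/* a real slave gets this from the master */
		printf("via fw cmd: srqn %d\n", *srqn);
		return (0);
	}

	static int alloc_icm(int *srqn)	/* mirrors mlx4_srq_alloc_icm() */
	{
		if (multifunction)
			return (fw_alloc_icm(srqn));
		return (direct_alloc_icm(srqn));
	}

	int main(void)
	{
		int srqn;

		alloc_icm(&srqn);	/* direct path */
		multifunction = true;
		alloc_icm(&srqn);	/* proxied path */
		return (0);
	}
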
Added: trunk/sys/ofed/drivers/net/mlx4/sys_tune.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/sys_tune.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/net/mlx4/sys_tune.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2010, 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include "mlx4.h"
+
+#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
+
+/* Each CPU is put into a group.  In most cases, the group number is
+ * equal to the CPU number of one of the CPUs in the group.  The
+ * exception is group NR_CPUS which is the default group.  This is
+ * protected by sys_tune_startup_mutex. */
+DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;
+
+/* For each group, a count of the number of CPUs in the group which
+ * are known to be busy.  A busy CPU might be running the busy loop
+ * below or general kernel code.  The count is decremented on entry to
+ * the old pm_idle handler and incremented on exit.  The aim is to
+ * avoid the count going to zero or negative.  This situation can
+ * occur temporarily during module unload or CPU hot-plug but
+ * normality will be restored when the affected CPUs next exit the
+ * idle loop. */
+static atomic_t busy_cpu_count[NR_CPUS+1];
+
+/* A workqueue item to be executed to cause the CPU to exit from the
+ * idle loop. */
+DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);
+
+#define sys_tune_set_state(CPU,STATE) \
+	do { } while(0)
+
+
+/* A mutex to protect most of the module datastructures. */
+static DEFINE_MUTEX(sys_tune_startup_mutex);
+
+/* The old pm_idle handler. */
+static void (*old_pm_idle)(void) = NULL;
+
+static void sys_tune_pm_idle(void)
+{
+	atomic_t *busy_cpus_ptr;
+	int busy_cpus;
+	int cpu = smp_processor_id();
+
+	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);
+
+	sys_tune_set_state(cpu, 2);
+
+	local_irq_enable();
+	while (!need_resched()) {
+		busy_cpus = atomic_read(busy_cpus_ptr);
+
+		/* If other CPUs in this group are busy then let this
+		 * CPU go idle.  We mustn't let the number of busy
+		 * CPUs drop below 1. */
+		if ( busy_cpus > 1 &&
+		     old_pm_idle != NULL &&
+		     ( atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
+				      busy_cpus-1) == busy_cpus ) ) {
+			local_irq_disable();
+			sys_tune_set_state(cpu, 3);
+			/* This check might not be necessary, but it
+			 * seems safest to include it because there
+			 * might be a kernel version which requires
+			 * it. */
+			if (need_resched())
+				local_irq_enable();
+			else
+				old_pm_idle();
+			/* This CPU is busy again. */
+			sys_tune_set_state(cpu, 1);
+			atomic_add(1, busy_cpus_ptr);
+			return;
+		}
+
+		cpu_relax();
+	}
+	sys_tune_set_state(cpu, 0);
+}
+
+
+void sys_tune_work_func(struct work_struct *work)
+{
+	/* Do nothing.  Since this function is running in process
+	 * context, the idle thread isn't running on this CPU. */
+}
+
+
+#ifdef CONFIG_SMP
+static void sys_tune_smp_call(void *info)
+{
+	schedule_work(&get_cpu_var(sys_tune_cpu_work));
+	put_cpu_var(sys_tune_cpu_work);
+}
+#endif
+
+
+#ifdef CONFIG_SMP
+static void sys_tune_refresh(void)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+        on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
+#else
+        on_each_cpu(&sys_tune_smp_call, NULL, 1);
+#endif
+}
+#else
+static void sys_tune_refresh(void)
+{
+	/* The current thread is executing on the one and only CPU so
+	 * the idle thread isn't running. */
+}
+#endif
+
+
+
+static int sys_tune_cpu_group(int cpu)
+{
+#ifdef CONFIG_SMP
+	const cpumask_t *mask;
+	int other_cpu;
+	int group;
+
+#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
+	/* Keep one hyperthread busy per core. */
+	mask = topology_thread_cpumask(cpu);
+#else
+	return cpu;
+#endif
+	for_each_cpu_mask(other_cpu, *(mask)) {
+		group = per_cpu(idle_cpu_group, other_cpu);
+		if (group != NR_CPUS)
+			return group;
+	}
+#endif
+
+	return cpu;
+}
+
+
+static void sys_tune_add_cpu(int cpu)
+{
+	int group;
+
+	/* Do nothing if this CPU has already been added. */
+	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
+		return;
+
+	group = sys_tune_cpu_group(cpu);
+	per_cpu(idle_cpu_group, cpu) = group;
+	atomic_inc(&(busy_cpu_count[group]));
+
+}
+
+static void sys_tune_del_cpu(int cpu)
+{
+
+	int group;
+
+	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
+		return;
+
+	group = per_cpu(idle_cpu_group, cpu);
+	/* If the CPU was busy, this can cause the count to drop to
+	 * zero.  To rectify this, we need to cause one of the other
+	 * CPUs in the group to exit the idle loop.  If the CPU was
+	 * not busy then this causes the contribution for this CPU to
+	 * go to -1 which can cause the overall count to drop to zero
+	 * or go negative.  To rectify this situation we need to cause
+	 * this CPU to exit the idle loop. */
+	atomic_dec(&(busy_cpu_count[group]));
+	per_cpu(idle_cpu_group, cpu) = NR_CPUS;
+
+}
+
+
+static int sys_tune_cpu_notify(struct notifier_block *self,
+			       unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	
+	switch(action) {
+#ifdef CPU_ONLINE_FROZEN
+	case CPU_ONLINE_FROZEN:
+#endif
+	case CPU_ONLINE:
+		mutex_lock(&sys_tune_startup_mutex);
+		sys_tune_add_cpu(cpu);
+		mutex_unlock(&sys_tune_startup_mutex);
+		/* The CPU might have already entered the idle loop in
+		 * the wrong group.  Make sure it exits the idle loop
+		 * so that it picks up the correct group. */
+		sys_tune_refresh();
+		break;
+
+#ifdef CPU_DEAD_FROZEN
+	case CPU_DEAD_FROZEN:
+#endif
+	case CPU_DEAD:
+		mutex_lock(&sys_tune_startup_mutex);
+		sys_tune_del_cpu(cpu);
+		mutex_unlock(&sys_tune_startup_mutex);
+		/* The deleted CPU may have been the only busy CPU in
+		 * the group.  Make sure one of the other CPUs in the
+		 * group exits the idle loop. */
+		sys_tune_refresh();
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+
+static struct notifier_block sys_tune_cpu_nb = {
+	.notifier_call = sys_tune_cpu_notify,
+};
+
+
+static void sys_tune_ensure_init(void)
+{
+	BUG_ON (old_pm_idle != NULL);
+
+	/* Atomically update pm_idle to &sys_tune_pm_idle.  The old value
+	 * is stored in old_pm_idle before installing the new
+	 * handler. */
+	do {
+		old_pm_idle = pm_idle;
+	} while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
+		 old_pm_idle);
+}
+#endif
+
+void sys_tune_fini(void)
+{
+#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
+	void (*old)(void);
+	int cpu;
+
+	unregister_cpu_notifier(&sys_tune_cpu_nb);
+
+	mutex_lock(&sys_tune_startup_mutex);
+
+
+	old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);
+
+	for_each_online_cpu(cpu)
+		sys_tune_del_cpu(cpu);
+
+	mutex_unlock(&sys_tune_startup_mutex);
+	
+	/* Our handler may still be executing on other CPUs.
+	 * Schedule this thread on all CPUs to make sure all
+	 * idle threads get interrupted. */
+	sys_tune_refresh();
+
+	/* Make sure the work item has finished executing on all CPUs.
+	 * This in turn ensures that all idle threads have been
+	 * interrupted. */
+	flush_scheduled_work();
+#endif /* CONFIG_X86 */
+}
+
+void sys_tune_init(void)
+{
+#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
+			  sys_tune_work_func);
+	}
+
+	/* Start by registering the handler to ensure we don't miss
+	 * any updates. */
+	register_cpu_notifier(&sys_tune_cpu_nb);
+
+	mutex_lock(&sys_tune_startup_mutex);
+
+	for_each_online_cpu(cpu)
+		sys_tune_add_cpu(cpu);
+
+	sys_tune_ensure_init();
+
+
+	mutex_unlock(&sys_tune_startup_mutex);
+
+	/* Ensure our idle handler starts to run. */
+	sys_tune_refresh();
+#endif
+}
+


Property changes on: trunk/sys/ofed/drivers/net/mlx4/sys_tune.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
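
sys_tune_ensure_init() above installs the busy-poll handler by looping on cmpxchg() so the previous pm_idle pointer is captured atomically; the handler can then chain to old_pm_idle and sys_tune_fini() can restore it. A user-space C11 rendering of that install/restore idiom (the names here are illustrative, not the kernel symbols):

	#include <stdatomic.h>
	#include <stdio.h>

	typedef void (*idle_fn)(void);

	static void default_idle(void) { puts("default idle"); }

	static _Atomic idle_fn pm_idle = default_idle;
	static idle_fn old_pm_idle;

	static void tuned_idle(void)
	{
		/* The real handler busy-polls first, then defers to the old one. */
		old_pm_idle();
	}

	static void install(void)
	{
		idle_fn expected;

		do {
			expected = atomic_load(&pm_idle);
			old_pm_idle = expected;
		} while (!atomic_compare_exchange_weak(&pm_idle, &expected,
		    tuned_idle));
	}

	static void restore(void)
	{
		idle_fn cur = tuned_idle;

		/* Restore only if nobody replaced the handler behind our back. */
		atomic_compare_exchange_strong(&pm_idle, &cur, old_pm_idle);
	}

	int main(void)
	{
		install();
		atomic_load(&pm_idle)();	/* tuned_idle -> default_idle */
		restore();
		atomic_load(&pm_idle)();	/* default_idle again */
		return (0);
	}
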
Added: trunk/sys/ofed/drivers/net/mlx4/utils.c
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/utils.c	                        (rev 0)
+++ trunk/sys/ofed/drivers/net/mlx4/utils.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,189 @@
+/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
+
+/*
+ * Copyright (c) 2005, 2006 Reyk Floeter <reyk at openbsd.org>
+ * Copyright (c) 2007 Andrew Thompson <thompsa at FreeBSD.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/hash.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/taskqueue.h>
+#include <sys/eventhandler.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/if_arp.h>
+#include <net/if_dl.h>
+#include <net/if_llc.h>
+#include <net/if_media.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/bpf.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#endif
+#ifdef INET
+#include <netinet/in_systm.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#endif
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/in6_ifattach.h>
+#endif
+
+#include <net/if_vlan_var.h>
+
+#include "utils.h"
+
+/* XXX this code should be factored out */
+/* XXX copied from if_lagg.c */
+
+static const void *
+mlx4_en_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
+{
+	if (m->m_pkthdr.len < (off + len)) {
+		return (NULL);
+	} else if (m->m_len < (off + len)) {
+		m_copydata(m, off, len, buf);
+		return (buf);
+	}
+	return (mtod(m, char *) + off);
+}
+
+uint32_t
+mlx4_en_hashmbuf(uint32_t flags, struct mbuf *m, uint32_t key)
+{
+	uint16_t etype;
+	uint32_t p = key;
+	int off;
+	struct ether_header *eh;
+	const struct ether_vlan_header *vlan;
+#ifdef INET
+	const struct ip *ip;
+	const uint32_t *ports;
+	int iphlen;
+#endif
+#ifdef INET6
+	const struct ip6_hdr *ip6;
+	uint32_t flow;
+#endif
+	union {
+#ifdef INET
+		struct ip ip;
+#endif
+#ifdef INET6
+		struct ip6_hdr ip6;
+#endif
+		struct ether_vlan_header vlan;
+		uint32_t port;
+	} buf;
+
+
+	off = sizeof(*eh);
+	if (m->m_len < off)
+		goto out;
+	eh = mtod(m, struct ether_header *);
+	etype = ntohs(eh->ether_type);
+	if (flags & MLX4_F_HASHL2) {
+		p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p);
+		p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p);
+	}
+
+	/* Special handling for encapsulating VLAN frames */
+	if ((m->m_flags & M_VLANTAG) && (flags & MLX4_F_HASHL2)) {
+		p = hash32_buf(&m->m_pkthdr.ether_vtag,
+		    sizeof(m->m_pkthdr.ether_vtag), p);
+	} else if (etype == ETHERTYPE_VLAN) {
+		vlan = mlx4_en_gethdr(m, off,  sizeof(*vlan), &buf);
+		if (vlan == NULL)
+			goto out;
+
+		if (flags & MLX4_F_HASHL2)
+			p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p);
+		etype = ntohs(vlan->evl_proto);
+		off += sizeof(*vlan) - sizeof(*eh);
+	}
+
+	switch (etype) {
+#ifdef INET
+	case ETHERTYPE_IP:
+		ip = mlx4_en_gethdr(m, off, sizeof(*ip), &buf);
+		if (ip == NULL)
+			goto out;
+
+		if (flags & MLX4_F_HASHL3) {
+			p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p);
+			p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p);
+		}
+		if (!(flags & MLX4_F_HASHL4))
+			break;
+		switch (ip->ip_p) {
+			case IPPROTO_TCP:
+			case IPPROTO_UDP:
+			case IPPROTO_SCTP:
+				iphlen = ip->ip_hl << 2;
+				if (iphlen < sizeof(*ip))
+					break;
+				off += iphlen;
+				ports = mlx4_en_gethdr(m, off, sizeof(*ports), &buf);
+				if (ports == NULL)
+					break;
+				p = hash32_buf(ports, sizeof(*ports), p);
+				break;
+		}
+		break;
+#endif
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+		if (!(flags & MLX4_F_HASHL3))
+			break;
+		ip6 = mlx4_en_gethdr(m, off, sizeof(*ip6), &buf);
+		if (ip6 == NULL)
+			goto out;
+
+		p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p);
+		p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p);
+		flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
+		p = hash32_buf(&flow, sizeof(flow), p);	/* IPv6 flow label */
+		break;
+#endif
+	}
+out:
+	return (p);
+}


Property changes on: trunk/sys/ofed/drivers/net/mlx4/utils.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
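
mlx4_en_hashmbuf() above (borrowed from if_lagg.c) folds whichever layers the MLX4_F_HASHL2/L3/L4 flags select into one running 32-bit hash by chaining FreeBSD's hash32_buf() over the header fields. A self-contained user-space sketch over a made-up IPv4 flow; hash32_buf() is re-implemented here with the same Bernstein-style step as <sys/hash.h>:

	#include <stdint.h>
	#include <stdio.h>

	#define	F_HASHL3	0x00000002	/* hash layer 3 */
	#define	F_HASHL4	0x00000004	/* hash layer 4 */

	static uint32_t
	hash32_buf(const void *buf, size_t len, uint32_t hash)
	{
		const unsigned char *p = buf;

		while (len--)
			hash = (hash << 5) + hash + *p++;	/* hash * 33 + byte */
		return (hash);
	}

	int main(void)
	{
		uint32_t src = 0x0a000001, dst = 0x0a000002;	/* 10.0.0.1 -> 10.0.0.2 */
		uint16_t ports[2] = { 49152, 80 };		/* sport, dport */
		uint32_t flags = F_HASHL3 | F_HASHL4;
		uint32_t p = 5381;				/* seed / RSS key */

		if (flags & F_HASHL3) {
			p = hash32_buf(&src, sizeof(src), p);
			p = hash32_buf(&dst, sizeof(dst), p);
		}
		if (flags & F_HASHL4)
			p = hash32_buf(ports, sizeof(ports), p);

		printf("flow hash 0x%08x -> rx ring %u\n", (unsigned)p,
		    (unsigned)(p % 8));
		return (0);
	}
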
Added: trunk/sys/ofed/drivers/net/mlx4/utils.h
===================================================================
--- trunk/sys/ofed/drivers/net/mlx4/utils.h	                        (rev 0)
+++ trunk/sys/ofed/drivers/net/mlx4/utils.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MLX4_UTILS_H_
+#define	_MLX4_UTILS_H_
+
+/* Lagg flags */
+#define	MLX4_F_HASHL2		0x00000001	/* hash layer 2 */
+#define	MLX4_F_HASHL3		0x00000002	/* hash layer 3 */
+#define	MLX4_F_HASHL4		0x00000004	/* hash layer 4 */
+#define	MLX4_F_HASHMASK		0x00000007
+
+uint32_t mlx4_en_hashmbuf(uint32_t flags, struct mbuf *m, uint32_t key);
+
+#endif		/* _MLX4_UTILS_H_ */


Property changes on: trunk/sys/ofed/drivers/net/mlx4/utils.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/atomic-long.h
===================================================================
--- trunk/sys/ofed/include/asm/atomic-long.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/atomic-long.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,6 +26,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_ATOMIC_LONG_H_
 #define	_ATOMIC_LONG_H_
 
@@ -33,7 +35,7 @@
 #include <machine/atomic.h>
 
 typedef struct {
-	volatile u_long counter;
+	volatile long counter;
 } atomic_long_t;
 
 #define	atomic_long_add(i, v)		atomic_long_add_return((i), (v))


Property changes on: trunk/sys/ofed/include/asm/atomic-long.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/atomic.h
===================================================================
--- trunk/sys/ofed/include/asm/atomic.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/atomic.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013-2016 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,12 +33,15 @@
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include <machine/atomic.h>
-#include <asm/atomic-long.h>
 
 typedef struct {
-	volatile u_int counter;
+	volatile int counter;
 } atomic_t;
 
+/*------------------------------------------------------------------------*
+ *	32-bit atomic operations
+ *------------------------------------------------------------------------*/
+
 #define	atomic_add(i, v)		atomic_add_return((i), (v))
 #define	atomic_sub(i, v)		atomic_sub_return((i), (v))
 #define	atomic_inc_return(v)		atomic_add_return(1, (v))
@@ -45,6 +49,8 @@
 #define	atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
 #define	atomic_dec_and_test(v)		(atomic_sub_return(1, (v)) == 0)
 #define	atomic_inc_and_test(v)		(atomic_add_return(1, (v)) == 0)
+#define	atomic_dec_return(v)		atomic_sub_return(1, (v))
+#define	atomic_inc_not_zero(v)		atomic_add_unless((v), 1, 0)
 
 static inline int
 atomic_add_return(int i, atomic_t *v)
@@ -82,4 +88,19 @@
 	return atomic_fetchadd_int(&v->counter, -1) - 1;
 }
 
-#endif	/* _ASM_ATOMIC_H_ */
+static inline int
+atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c;
+
+	for (;;) {
+		c = atomic_read(v);
+		if (unlikely(c == u))
+			break;
+		if (likely(atomic_cmpset_int(&v->counter, c, c + a)))
+			break;
+	}
+	return (c != u);
+}
+
+#endif					/* _ASM_ATOMIC_H_ */


Property changes on: trunk/sys/ofed/include/asm/atomic.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
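
The new atomic_add_unless() above retries a compare-and-set until it either adds 'a' or observes the excluded value 'u', returning non-zero only in the first case; atomic_inc_not_zero() is the u == 0 special case that makes grabbing a reference on a possibly-dying object safe. A user-space C11 rendering of the same loop:

	#include <stdatomic.h>
	#include <stdio.h>

	static int
	add_unless(_Atomic int *v, int a, int u)
	{
		int c = atomic_load(v);

		while (c != u) {
			/* On failure the current value is reloaded into c. */
			if (atomic_compare_exchange_weak(v, &c, c + a))
				break;
		}
		return (c != u);
	}

	int main(void)
	{
		_Atomic int refs = 1;

		printf("grab: %d, refs now %d\n", add_unless(&refs, 1, 0),
		    atomic_load(&refs));	/* grab: 1, refs now 2 */
		atomic_store(&refs, 0);
		printf("grab: %d, refs now %d\n", add_unless(&refs, 1, 0),
		    atomic_load(&refs));	/* grab: 0, refs now 0 */
		return (0);
	}
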
Modified: trunk/sys/ofed/include/asm/byteorder.h
===================================================================
--- trunk/sys/ofed/include/asm/byteorder.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/byteorder.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,11 +26,13 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_ASM_BYTEORDER_H_
 #define	_ASM_BYTEORDER_H_
 
 #include <sys/types.h>
 #include <sys/endian.h>
+#include <asm/types.h>
 
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define	__LITTLE_ENDIAN


Property changes on: trunk/sys/ofed/include/asm/byteorder.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/asm/current.h
===================================================================
--- trunk/sys/ofed/include/asm/current.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/current.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/asm/current.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/fcntl.h
===================================================================
--- trunk/sys/ofed/include/asm/fcntl.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/fcntl.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without


Property changes on: trunk/sys/ofed/include/asm/fcntl.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/io.h
===================================================================
--- trunk/sys/ofed/include/asm/io.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/io.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -1,7 +1,8 @@
-/*-
+/*
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,4 +27,9 @@
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#ifndef _ASM_IO_H_
+#define _ASM_IO_H_
+
 #include <linux/io.h>
+
+#endif	/* _ASM_IO_H_ */


Property changes on: trunk/sys/ofed/include/asm/io.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/asm/page.h
===================================================================
--- trunk/sys/ofed/include/asm/page.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/page.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/asm/page.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/pgtable.h
===================================================================
--- trunk/sys/ofed/include/asm/pgtable.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/pgtable.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without


Property changes on: trunk/sys/ofed/include/asm/pgtable.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/asm/semaphore.h
===================================================================
--- trunk/sys/ofed/include/asm/semaphore.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/semaphore.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/asm/semaphore.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/asm/system.h
===================================================================
--- trunk/sys/ofed/include/asm/system.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/system.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/asm/system.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/types.h
===================================================================
--- trunk/sys/ofed/include/asm/types.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/types.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,43 +26,36 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_ASM_TYPES_H_
 #define	_ASM_TYPES_H_
 
-typedef unsigned short umode_t;
-
-typedef __signed__ char __s8;
-typedef unsigned char __u8;
-
-typedef __signed__ short __s16;
-typedef unsigned short __u16;
-
-typedef __signed__ int __s32;
-typedef unsigned int __u32;
-
-#if defined(__GNUC__) // && !defined(__STRICT_ANSI__)
-typedef __signed__ long long __s64;
-typedef unsigned long long __u64;
-#endif
-
 #ifdef _KERNEL
 
-typedef signed char s8;
-typedef unsigned char u8;
+typedef uint8_t u8;
+typedef uint8_t __u8;
+typedef uint16_t u16;
+typedef uint16_t __u16;
+typedef uint32_t u32;
+typedef uint32_t __u32;
+typedef uint64_t u64;
+typedef uint64_t __u64;
 
-typedef signed short s16;
-typedef unsigned short u16;
+typedef int8_t s8;
+typedef int8_t __s8;
+typedef int16_t s16;
+typedef int16_t __s16;
+typedef int32_t s32;
+typedef int32_t __s32;
+typedef int64_t s64;
+typedef int64_t __s64;
 
-typedef signed int s32;
-typedef unsigned int u32;
-
-typedef signed long long s64;
-typedef unsigned long long u64;
-
 /* DMA addresses come in generic and 64-bit flavours.  */
 typedef vm_paddr_t dma_addr_t;
 typedef vm_paddr_t dma64_addr_t;
 
+typedef unsigned short umode_t;
+
 #endif	/* _KERNEL */
 
 #endif	/* _ASM_TYPES_H_ */


Property changes on: trunk/sys/ofed/include/asm/types.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/asm/uaccess.h
===================================================================
--- trunk/sys/ofed/include/asm/uaccess.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/asm/uaccess.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,6 +26,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef _ASM_UACCESS_H_
 #define _ASM_UACCESS_H_
 


Property changes on: trunk/sys/ofed/include/asm/uaccess.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/bitops.h
===================================================================
--- trunk/sys/ofed/include/linux/bitops.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/bitops.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,7 +36,10 @@
 #endif
 #define	BIT_MASK(n)		(~0UL >> (BITS_PER_LONG - (n)))
 #define	BITS_TO_LONGS(n)	howmany((n), BITS_PER_LONG)
+#define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
 
+#define BITS_PER_BYTE           8
+
 static inline int
 __ffs(int mask)
 {
@@ -63,6 +67,16 @@
 
 #define	ffz(mask)	__ffs(~(mask))
 
+static inline int get_count_order(unsigned int count)
+{
+        int order;
+
+        order = fls(count) - 1;
+        if (count & (count - 1))
+                order++;
+        return order;
+}
+
 static inline unsigned long
 find_first_bit(unsigned long *addr, unsigned long size)
 {
@@ -124,11 +138,11 @@
 		if (mask)
 			return (bit + __flsl(mask));
 	}
-	while (--pos) {
+	while (pos--) {
 		addr--;
 		bit -= BITS_PER_LONG;
 		if (*addr)
-			return (bit + __flsl(mask));
+			return (bit + __flsl(*addr));
 	}
 	return (size);
 }
@@ -272,16 +286,23 @@
 	return (1);
 }
 
-#define	NBINT	(NBBY * sizeof(int))
+#define	NBLONG	(NBBY * sizeof(long))
 
+#define	__set_bit(i, a)							\
+    atomic_set_long(&((volatile long *)(a))[(i)/NBLONG], 1UL << ((i) % NBLONG))
+
 #define	set_bit(i, a)							\
-    atomic_set_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+    atomic_set_long(&((volatile long *)(a))[(i)/NBLONG], 1UL << ((i) % NBLONG))
 
+#define	__clear_bit(i, a)						\
+    atomic_clear_long(&((volatile long *)(a))[(i)/NBLONG], 1UL << ((i) % NBLONG))
+
 #define	clear_bit(i, a)							\
-    atomic_clear_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+    atomic_clear_long(&((volatile long *)(a))[(i)/NBLONG], 1UL << ((i) % NBLONG))
 
 #define	test_bit(i, a)							\
-    !!(atomic_load_acq_int(&((volatile int *)(a))[(i)/NBINT]) & 1 << ((i) % NBINT))
+    !!(atomic_load_acq_long(&((volatile long *)(a))[(i)/NBLONG]) &	\
+    (1UL << ((i) % NBLONG)))
 
 static inline long
 test_and_clear_bit(long bit, long *var)
@@ -288,7 +309,9 @@
 {
 	long val;
 
-	bit = 1 << bit;
+	var += bit / (sizeof(long) * NBBY);
+	bit %= sizeof(long) * NBBY;
+	bit = (1UL << bit);
 	do {
 		val = *(volatile long *)var;
 	} while (atomic_cmpset_long(var, val, val & ~bit) == 0);
@@ -301,7 +324,9 @@
 {
 	long val;
 
-	bit = 1 << bit;
+	var += bit / (sizeof(long) * NBBY);
+	bit %= sizeof(long) * NBBY;
+	bit = (1UL << bit);
 	do {
 		val = *(volatile long *)var;
 	} while (atomic_cmpset_long(var, val, val | bit) == 0);
@@ -309,4 +334,185 @@
 	return !!(val & bit);
 }
 
+
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
+#define BITMAP_LAST_WORD_MASK(nbits)                                    \
+(                                                                       \
+        ((nbits) % BITS_PER_LONG) ?                                     \
+                (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL               \
+)
+
+
+static inline void
+bitmap_set(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_set >= 0) {
+		*p |= mask_to_set;
+		nr -= bits_to_set;
+		bits_to_set = BITS_PER_LONG;
+		mask_to_set = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+		*p |= mask_to_set;
+	}
+}
+
+static inline void
+bitmap_clear(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_clear >= 0) {
+		*p &= ~mask_to_clear;
+		nr -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		*p &= ~mask_to_clear;
+	}
+}
+
+enum {
+        REG_OP_ISFREE,          /* true if region is all zero bits */
+        REG_OP_ALLOC,           /* set all bits in region */
+        REG_OP_RELEASE,         /* clear all bits in region */
+};
+
+static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op)
+{
+        int nbits_reg;          /* number of bits in region */
+        int index;              /* index first long of region in bitmap */
+        int offset;             /* bit offset region in bitmap[index] */
+        int nlongs_reg;         /* num longs spanned by region in bitmap */
+        int nbitsinlong;        /* num bits of region in each spanned long */
+        unsigned long mask;     /* bitmask for one long of region */
+        int i;                  /* scans bitmap by longs */
+        int ret = 0;            /* return value */
+
+        /*
+         * Either nlongs_reg == 1 (for small orders that fit in one long)
+         * or (offset == 0 && mask == ~0UL) (for larger multiword orders.)
+         */
+        nbits_reg = 1 << order;
+        index = pos / BITS_PER_LONG;
+        offset = pos - (index * BITS_PER_LONG);
+        nlongs_reg = BITS_TO_LONGS(nbits_reg);
+        nbitsinlong = min(nbits_reg,  BITS_PER_LONG);
+
+        /*
+         * Can't do "mask = (1UL << nbitsinlong) - 1", as that
+         * overflows if nbitsinlong == BITS_PER_LONG.
+         */
+        mask = (1UL << (nbitsinlong - 1));
+        mask += mask - 1;
+        mask <<= offset;
+
+        switch (reg_op) {
+        case REG_OP_ISFREE:
+                for (i = 0; i < nlongs_reg; i++) {
+                        if (bitmap[index + i] & mask)
+                                goto done;
+                }
+                ret = 1;        /* all bits in region free (zero) */
+                break;
+
+        case REG_OP_ALLOC:
+                for (i = 0; i < nlongs_reg; i++)
+                        bitmap[index + i] |= mask;
+                break;
+
+        case REG_OP_RELEASE:
+                for (i = 0; i < nlongs_reg; i++)
+                        bitmap[index + i] &= ~mask;
+                break;
+        }
+done:
+        return ret;
+}
+
+/**
+ * bitmap_find_free_region - find a contiguous aligned mem region
+ *      @bitmap: array of unsigned longs corresponding to the bitmap
+ *      @bits: number of bits in the bitmap
+ *      @order: region size (log base 2 of number of bits) to find
+ *
+ * Find a region of free (zero) bits in a @bitmap of @bits bits and
+ * allocate them (set them to one).  Only consider regions of length
+ * a power (@order) of two, aligned to that power of two, which
+ * makes the search algorithm much faster.
+ *
+ * Return the bit offset in bitmap of the allocated region,
+ * or -errno on failure.
+ */
+static inline int 
+bitmap_find_free_region(unsigned long *bitmap, int bits, int order)
+{
+        int pos, end;           /* scans bitmap by regions of size order */
+
+        for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) {
+                if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE))
+                        continue;
+                __reg_op(bitmap, pos, order, REG_OP_ALLOC);
+                return pos;
+        }
+        return -ENOMEM;
+}
+
+/**
+ * bitmap_allocate_region - allocate bitmap region
+ *      @bitmap: array of unsigned longs corresponding to the bitmap
+ *      @pos: beginning of bit region to allocate
+ *      @order: region size (log base 2 of number of bits) to allocate
+ *
+ * Allocate (set bits in) a specified region of a bitmap.
+ *
+ * Return 0 on success, or %-EBUSY if specified region wasn't
+ * free (not all bits were zero).
+ */
+
+static inline int
+bitmap_allocate_region(unsigned long *bitmap, int pos, int order)
+{
+        if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE))
+                return -EBUSY;
+        __reg_op(bitmap, pos, order, REG_OP_ALLOC);
+        return 0;
+}
+
+/**
+ * bitmap_release_region - release allocated bitmap region
+ *      @bitmap: array of unsigned longs corresponding to the bitmap
+ *      @pos: beginning of bit region to release
+ *      @order: region size (log base 2 of number of bits) to release
+ *
+ * This is the complement to __bitmap_find_free_region() and releases
+ * the found region (by clearing it in the bitmap).
+ *
+ * No return value.
+ */
+static inline void 
+bitmap_release_region(unsigned long *bitmap, int pos, int order)
+{
+        __reg_op(bitmap, pos, order, REG_OP_RELEASE);
+}
+
+
+#define for_each_set_bit(bit, addr, size) \
+	for ((bit) = find_first_bit((addr), (size));		\
+	     (bit) < (size);					\
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
 #endif	/* _LINUX_BITOPS_H_ */

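bitmap_find_free_region() above only considers runs of 1 << order bits aligned to their own size, which lets the scan advance a whole region per step and lets __reg_op() test a region with a single mask per long. A compact user-space rendering of the search, simplified to regions no larger than one long:

	#include <stdio.h>

	#define	BITS_PER_LONG	((int)(8 * sizeof(unsigned long)))

	/*
	 * Find and claim a free, naturally aligned run of 1 << order bits.
	 * Simplification: order must not exceed log2(BITS_PER_LONG).
	 */
	static int
	find_free_region(unsigned long *bitmap, int bits, int order)
	{
		int nbits = 1 << order;
		unsigned long mask = (nbits == BITS_PER_LONG) ?
		    ~0UL : (1UL << nbits) - 1;
		int pos;

		for (pos = 0; pos + nbits <= bits; pos += nbits) {
			unsigned long *p = bitmap + pos / BITS_PER_LONG;
			unsigned long m = mask << (pos % BITS_PER_LONG);

			if ((*p & m) == 0) {	/* region free: allocate it */
				*p |= m;
				return (pos);
			}
		}
		return (-1);
	}

	int main(void)
	{
		unsigned long map[1] = { 0x0fUL };	/* bits 0-3 already in use */

		/* First free aligned 4-bit region starts at bit 4. */
		printf("order-2 region at bit %d\n", find_free_region(map, 64, 2));
		return (0);
	}
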
Added: trunk/sys/ofed/include/linux/cache.h
===================================================================
--- trunk/sys/ofed/include/linux/cache.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/cache.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_CACHE_H_
+#define _LINUX_CACHE_H_
+
+#define	cache_line_size()	CACHE_LINE_SIZE
+#define	L1_CACHE_BYTES		CACHE_LINE_SIZE
+
+#endif	/* _LINUX_CACHE_H_ */


Property changes on: trunk/sys/ofed/include/linux/cache.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/cdev.h
===================================================================
--- trunk/sys/ofed/include/linux/cdev.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/cdev.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Added: trunk/sys/ofed/include/linux/clocksource.h
===================================================================
--- trunk/sys/ofed/include/linux/clocksource.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/clocksource.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_CLOCKSOURCE_H
+#define _LINUX_CLOCKSOURCE_H
+
+/* clocksource cycle base type */
+typedef u64 cycle_t;
+
+
+#endif /* _LINUX_CLOCKSOURCE_H */


Property changes on: trunk/sys/ofed/include/linux/clocksource.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/compat.h
===================================================================
--- trunk/sys/ofed/include/linux/compat.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/compat.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,5 +30,4 @@
 #ifndef	_LINUX_COMPAT_H_
 #define	_LINUX_COMPAT_H_
 
-
 #endif	/* _LINUX_COMPAT_H_ */

Modified: trunk/sys/ofed/include/linux/compiler.h
===================================================================
--- trunk/sys/ofed/include/linux/compiler.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/compiler.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/completion.h
===================================================================
--- trunk/sys/ofed/include/linux/completion.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/completion.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,131 +26,41 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_LINUX_COMPLETION_H_
 #define	_LINUX_COMPLETION_H_
 
 #include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
 
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sleepqueue.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-
 struct completion {
 	unsigned int done;
 };
 
-#define	INIT_COMPLETION(c)	((c).done = 0)
-#define	init_completion(c)	((c)->done = 0)
+#define	INIT_COMPLETION(c) \
+	((c).done = 0)
+#define	init_completion(c) \
+	((c)->done = 0)
+#define	complete(c)				\
+	linux_complete_common((c), 0)
+#define	complete_all(c)				\
+	linux_complete_common((c), 1)
+#define	wait_for_completion(c)			\
+	linux_wait_for_common((c), 0)
+#define	wait_for_completion_interuptible(c)	\
+	linux_wait_for_common((c), 1)
+#define	wait_for_completion_timeout(c, timeout)	\
+	linux_wait_for_timeout_common((c), (timeout), 0)
+#define	wait_for_completion_interruptible_timeout(c, timeout)	\
+	linux_wait_for_timeout_common((c), (timeout), 1)
+#define	try_wait_for_completion(c) \
+	linux_try_wait_for_completion(c)
+#define	completion_done(c) \
+	linux_completion_done(c)
 
-static inline void
-_complete_common(struct completion *c, int all)
-{
-	int wakeup_swapper;
+extern void linux_complete_common(struct completion *, int);
+extern long linux_wait_for_common(struct completion *, int);
+extern long linux_wait_for_timeout_common(struct completion *, long, int);
+extern int linux_try_wait_for_completion(struct completion *);
+extern int linux_completion_done(struct completion *);
 
-	sleepq_lock(c);
-	c->done++;
-	if (all)
-		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
-	else
-		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
-	sleepq_release(c);
-	if (wakeup_swapper)
-		kick_proc0();
-}
-
-#define	complete(c)	_complete_common(c, 0)
-#define	complete_all(c)	_complete_common(c, 1)
-
-/*
- * Indefinite wait for done != 0 with or without signals.
- */
-static inline long
-_wait_for_common(struct completion *c, int flags)
-{
-
-	flags |= SLEEPQ_SLEEP;
-	for (;;) {
-		sleepq_lock(c);
-		if (c->done)
-			break;
-		sleepq_add(c, NULL, "completion", flags, 0);
-		if (flags & SLEEPQ_INTERRUPTIBLE) {
-			if (sleepq_wait_sig(c, 0) != 0)
-				return (-ERESTARTSYS);
-		} else
-			sleepq_wait(c, 0);
-	}
-	c->done--;
-	sleepq_release(c);
-
-	return (0);
-}
-
-#define	wait_for_completion(c)	_wait_for_common(c, 0)
-#define	wait_for_completion_interuptible(c)				\
-	_wait_for_common(c, SLEEPQ_INTERRUPTIBLE)
-
-static inline long
-_wait_for_timeout_common(struct completion *c, long timeout, int flags)
-{
-	long end;
-
-	end = ticks + timeout;
-	flags |= SLEEPQ_SLEEP;
-	for (;;) {
-		sleepq_lock(c);
-		if (c->done)
-			break;
-		sleepq_add(c, NULL, "completion", flags, 0);
-		sleepq_set_timeout(c, end - ticks);
-		if (flags & SLEEPQ_INTERRUPTIBLE) {
-			if (sleepq_timedwait_sig(c, 0) != 0)
-				return (-ERESTARTSYS);
-		} else
-			sleepq_timedwait(c, 0);
-	}
-	c->done--;
-	sleepq_release(c);
-	timeout = end - ticks;
-
-	return (timeout > 0 ? timeout : 1);
-}
-
-#define	wait_for_completion_timeout(c, timeout)				\
-	_wait_for_timeout_common(c, timeout, 0)
-#define	wait_for_completion_interruptible_timeout(c, timeout)		\
-	_wait_for_timeout_common(c, timeout, SLEEPQ_INTERRUPTIBLE)
-
-static inline int
-try_wait_for_completion(struct completion *c)
-{
-	int isdone;
-
-	isdone = 1;
-	sleepq_lock(c);
-	if (c->done)
-		c->done--;
-	else
-		isdone = 0;
-	sleepq_release(c);
-	return (isdone);
-}
-
-static inline int
-completion_done(struct completion *c)
-{
-	int isdone;
-
-	isdone = 1;
-	sleepq_lock(c);
-	if (c->done == 0)
-		isdone = 0;
-	sleepq_release(c);
-	return (isdone);
-}
-
-#endif	/* _LINUX_COMPLETION_H_ */
+#endif					/* _LINUX_COMPLETION_H_ */

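The rewrite above hides the sleepqueue machinery behind out-of-line linux_*() helpers, but the semantics are unchanged: a completion is a counted event, and waiters sleep until done > 0, then consume one count. A pthread-based user-space analogue of complete()/wait_for_completion() (a sketch of the semantics, not the kernel implementation):

	#include <pthread.h>
	#include <stdio.h>

	struct completion {
		unsigned int done;
		pthread_mutex_t lock;
		pthread_cond_t cv;
	};

	static void init_completion(struct completion *c)
	{
		c->done = 0;
		pthread_mutex_init(&c->lock, NULL);
		pthread_cond_init(&c->cv, NULL);
	}

	static void complete(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		c->done++;
		pthread_cond_signal(&c->cv);
		pthread_mutex_unlock(&c->lock);
	}

	static void wait_for_completion(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		while (c->done == 0)
			pthread_cond_wait(&c->cv, &c->lock);
		c->done--;		/* consume one count */
		pthread_mutex_unlock(&c->lock);
	}

	static struct completion ready;

	static void *worker(void *arg)
	{
		puts("worker: done");
		complete(&ready);
		return (NULL);
	}

	int main(void)
	{
		pthread_t t;

		init_completion(&ready);
		pthread_create(&t, NULL, worker, NULL);
		wait_for_completion(&ready);	/* blocks until complete() */
		pthread_join(t, NULL);
		return (0);
	}
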
Modified: trunk/sys/ofed/include/linux/delay.h
===================================================================
--- trunk/sys/ofed/include/linux/delay.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/delay.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/device.h
===================================================================
--- trunk/sys/ofed/include/linux/device.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/device.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,6 +52,7 @@
 	devclass_t	bsdclass;
 	void		(*class_release)(struct class *class);
 	void		(*dev_release)(struct device *dev);
+	char *		(*devnode)(struct device *dev, umode_t *mode);
 };
 
 struct device {
@@ -72,10 +74,12 @@
 extern struct kobject class_root;
 
 struct class_attribute {
-	struct attribute	attr;
-        ssize_t			(*show)(struct class *, char *);
-        ssize_t			(*store)(struct class *, const char *, size_t);
+        struct attribute attr;
+        ssize_t (*show)(struct class *, struct class_attribute *, char *);
+        ssize_t (*store)(struct class *, struct class_attribute *, const char *, size_t);
+        const void *(*namespace)(struct class *, const struct class_attribute *);
 };
+
 #define	CLASS_ATTR(_name, _mode, _show, _store)				\
 	struct class_attribute class_attr_##_name =			\
 	    { { #_name, NULL, _mode }, _show, _store }
@@ -83,10 +87,10 @@
 struct device_attribute {
 	struct attribute	attr;
 	ssize_t			(*show)(struct device *,
-				    struct device_attribute *, char *);
+					struct device_attribute *, char *);
 	ssize_t			(*store)(struct device *,
-				    struct device_attribute *, const char *,
-				    size_t);
+					struct device_attribute *, const char *,
+					size_t);
 };
 
 #define	DEVICE_ATTR(_name, _mode, _show, _store)			\
@@ -93,6 +97,28 @@
 	struct device_attribute dev_attr_##_name =			\
 	    { { #_name, NULL, _mode }, _show, _store }
 
+/* Simple class attribute that is just a static string */
+struct class_attribute_string {
+	struct class_attribute attr;
+	char *str;
+};
+
+static inline ssize_t
+show_class_attr_string(struct class *class,
+				struct class_attribute *attr, char *buf)
+{
+	struct class_attribute_string *cs;
+	cs = container_of(attr, struct class_attribute_string, attr);
+	return snprintf(buf, PAGE_SIZE, "%s\n", cs->str);
+}
+
+/* Currently read-only only */
+#define _CLASS_ATTR_STRING(_name, _mode, _str) \
+	{ __ATTR(_name, _mode, show_class_attr_string, NULL), _str }
+#define CLASS_ATTR_STRING(_name, _mode, _str) \
+	struct class_attribute_string class_attr_##_name = \
+		_CLASS_ATTR_STRING(_name, _mode, _str)
+
 #define	dev_err(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
 #define	dev_warn(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
 #define	dev_info(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
@@ -151,7 +177,7 @@
 	error = -EIO;
 	if (dattr->show)
 		error = dattr->show(container_of(kobj, struct class, kobj),
-		    buf);
+		    dattr, buf);
 	return (error);
 }
 
@@ -166,7 +192,7 @@
 	error = -EIO;
 	if (dattr->store)
 		error = dattr->store(container_of(kobj, struct class, kobj),
-		    buf, count);
+		    dattr, buf, count);
 	return (error);
 }
 
@@ -385,4 +411,12 @@
 		sysfs_remove_file(&class->kobj, &attr->attr);
 }
 
+static inline int dev_to_node(struct device *dev)
+{
+                return -1;
+}
+
+char *kvasprintf(gfp_t, const char *, va_list);
+char *kasprintf(gfp_t, const char *, ...);
+
 #endif	/* _LINUX_DEVICE_H_ */
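
The class_attribute show/store callbacks now receive the attribute pointer (matching newer Linux kernels), which is what the sysfs glue above passes through, and CLASS_ATTR_STRING() covers the common read-only fixed-string case. A sketch of both forms, assuming __ATTR() is available via the sysfs header as on Linux; the attribute names here are made up:

    static ssize_t
    version_show(struct class *c, struct class_attribute *attr, char *buf)
    {
        return snprintf(buf, PAGE_SIZE, "%u\n", 1);
    }
    static CLASS_ATTR(version, 0444, version_show, NULL);

    /* attribute whose value is simply a constant string */
    static CLASS_ATTR_STRING(build_tag, 0444, "ofed-compat");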

Modified: trunk/sys/ofed/include/linux/dma-attrs.h
===================================================================
--- trunk/sys/ofed/include/linux/dma-attrs.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/dma-attrs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/dma-mapping.h
===================================================================
--- trunk/sys/ofed/include/linux/dma-mapping.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/dma-mapping.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -138,6 +139,14 @@
 		*dma_handle = 0;
 	return (mem);
 }
+
+static inline void *
+dma_zalloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+    gfp_t flag)
+{
+
+	return (dma_alloc_coherent(dev, size, dma_handle, flag | __GFP_ZERO));
+}
                        
 static inline void
 dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
@@ -245,6 +254,13 @@
 	return (0);
 }
 
+static inline unsigned int dma_set_max_seg_size(struct device *dev,
+                                                 unsigned int size)
+{
+        return (0);
+}
+
+
 #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
 #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
 #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
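
dma_zalloc_coherent() is a thin wrapper that ORs in __GFP_ZERO, and dma_set_max_seg_size() is stubbed to succeed. A hedged usage sketch (the device pointer and sizes are illustrative):

    dma_addr_t ring_dma;
    void *ring;

    /* zeroed, coherent DMA buffer for a hypothetical descriptor ring */
    ring = dma_zalloc_coherent(dev, 4096, &ring_dma, GFP_KERNEL);
    if (ring == NULL)
        return (-ENOMEM);
    /* ... program ring_dma into the device ... */
    dma_free_coherent(dev, 4096, ring, ring_dma);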

Modified: trunk/sys/ofed/include/linux/dmapool.h
===================================================================
--- trunk/sys/ofed/include/linux/dmapool.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/dmapool.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/err.h
===================================================================
--- trunk/sys/ofed/include/linux/err.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/err.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -57,4 +58,15 @@
 	return (void *)ptr;
 }
 
+static inline int
+PTR_ERR_OR_ZERO(const void *ptr)
+{
+        if (IS_ERR(ptr))
+                return PTR_ERR(ptr);
+        else
+                return 0;
+}
+
+#define PTR_RET(p) PTR_ERR_OR_ZERO(p)
+
 #endif	/* _LINUX_ERR_H_ */
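
PTR_ERR_OR_ZERO() (with PTR_RET() as an alias) collapses the usual "IS_ERR ? PTR_ERR : 0" pattern. A sketch, where create_cq() stands in for any hypothetical constructor that returns either a valid pointer or an ERR_PTR() value:

    void *cq;

    cq = create_cq(dev);               /* hypothetical: pointer on success, ERR_PTR(-errno) on failure */
    return PTR_ERR_OR_ZERO(cq);        /* 0 on success, the negative errno otherwise */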

Modified: trunk/sys/ofed/include/linux/errno.h
===================================================================
--- trunk/sys/ofed/include/linux/errno.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/errno.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,9 +32,11 @@
 
 #include <sys/errno.h>
 
-#define	ECOMM		ESTALE
-#define	ENODATA		ECONNREFUSED
-#define	ENOIOCTLCMD	ENOIOCTL		/* XXX this is negative */
-#define ERESTARTSYS     ERESTART		/* XXX this is negative */
+#define	ECOMM           ESTALE
+#define	ENODATA         ECONNREFUSED
+#define	ENOIOCTLCMD     ENOIOCTL
+#define	ERESTARTSYS     ERESTART
+#define	ENOTSUPP        EOPNOTSUPP
+#define	ENONET          EHOSTDOWN
 
-#endif	/* _LINUX_ERRNO_H_ */
+#endif					/* _LINUX_ERRNO_H_ */

Added: trunk/sys/ofed/include/linux/etherdevice.h
===================================================================
--- trunk/sys/ofed/include/linux/etherdevice.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/etherdevice.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef _LINUX_ETHERDEVICE
+#define _LINUX_ETHERDEVICE
+
+#include <linux/types.h>
+
+#define	ETH_MODULE_SFF_8079		1
+#define	ETH_MODULE_SFF_8079_LEN		256
+#define	ETH_MODULE_SFF_8472		2
+#define	ETH_MODULE_SFF_8472_LEN		512
+#define	ETH_MODULE_SFF_8636		3
+#define	ETH_MODULE_SFF_8636_LEN		256
+#define	ETH_MODULE_SFF_8436		4
+#define	ETH_MODULE_SFF_8436_LEN		256
+
+struct ethtool_eeprom {
+	u32	offset;
+	u32	len;
+};
+
+struct ethtool_modinfo {
+	u32	type;
+	u32	eeprom_len;
+};
+
+/**
+ * is_zero_ether_addr - Determine if give Ethernet address is all zeros.
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Return true if the address is all zeroes.
+ */
+static inline bool is_zero_ether_addr(const u8 *addr)
+{
+        return !(addr[0] | addr[1] | addr[2] | addr[3] | addr[4] | addr[5]);
+}
+
+
+
+/**
+ * is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Return true if the address is a multicast address.
+ * By definition the broadcast address is also a multicast address.
+ */
+static inline bool is_multicast_ether_addr(const u8 *addr)
+{
+        return (0x01 & addr[0]);
+}
+
+/**
+ * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Return true if the address is the broadcast address.
+ */
+static inline bool is_broadcast_ether_addr(const u8 *addr)
+{
+        return (addr[0] & addr[1] & addr[2] & addr[3] & addr[4] & addr[5]) == 0xff;
+}
+
+/**
+ * is_valid_ether_addr - Determine if the given Ethernet address is valid
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
+ * a multicast address, and is not FF:FF:FF:FF:FF:FF.
+ *
+ * Return true if the address is valid.
+ **/
+static inline bool is_valid_ether_addr(const u8 *addr)
+{
+        /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to
+        ** explicitly check for it here. */
+        return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr);
+}
+
+static inline void ether_addr_copy(u8 *dst, const u8 *src)
+{
+	memcpy(dst, src, 6);
+}
+
+#endif /* _LINUX_ETHERDEVICE */


Property changes on: trunk/sys/ofed/include/linux/etherdevice.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
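
The new etherdevice.h provides the small set of MAC-address predicates the mlx4/ipoib code expects; is_valid_ether_addr() only needs the zero and multicast checks because the broadcast address is itself a multicast address. A usage sketch (the source array is hypothetical):

    u8 mac[6];

    ether_addr_copy(mac, hw_addr);     /* hw_addr: hypothetical u8[6] read from the device */
    if (!is_valid_ether_addr(mac))
        printf("device reported an invalid MAC address\n");
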
Modified: trunk/sys/ofed/include/linux/file.h
===================================================================
--- trunk/sys/ofed/include/linux/file.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/file.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -72,7 +73,15 @@
 	file = fget_unlocked(curthread->td_proc->p_fd, fd);
 	if (file == NULL)
 		return;
+	/*
+	 * NOTE: We should only get here when the "fd" has not been
+	 * installed, so no need to free the associated Linux file
+	 * structure.
+	 */
 	fdclose(curthread->td_proc->p_fd, file, fd, curthread);
+
+	/* drop extra reference */
+	fdrop(file, curthread);
 }
 
 static inline void
@@ -81,8 +90,15 @@
 	struct file *file;
 
 	file = fget_unlocked(curthread->td_proc->p_fd, fd);
-	filp->_file = file;
-        finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
+	if (file == NULL) {
+		filp->_file = NULL;
+	} else {
+		filp->_file = file;
+		finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
+	}
+
+	/* drop the extra reference */
+	fput(filp);
 }
 
 static inline int
@@ -95,16 +111,18 @@
 	error = falloc(curthread, &file, &fd, 0);
 	if (error)
 		return -error;
+	/* drop the extra reference */
+	fdrop(file, curthread);
 	return fd;
 }
 
 static inline struct linux_file *
-_alloc_file(int mode, const struct file_operations *fops)
+alloc_file(int mode, const struct file_operations *fops)
 {
 	struct linux_file *filp;
 
 	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
-	if (filp == NULL) 
+	if (filp == NULL)
 		return (NULL);
 	filp->f_op = fops;
 	filp->f_mode = mode;
@@ -112,8 +130,21 @@
 	return filp;
 }
 
-#define	alloc_file(mnt, root, mode, fops)	_alloc_file((mode), (fops))
+struct fd {
+	struct linux_file *linux_file;
+};
 
+static inline void fdput(struct fd fd)
+{
+	fput(fd.linux_file);
+}
+
+static inline struct fd fdget(unsigned int fd)
+{
+	struct linux_file *f = linux_fget(fd);
+	return (struct fd){f};
+}
+
 #define	file	linux_file
 #define	fget	linux_fget
 

Modified: trunk/sys/ofed/include/linux/fs.h
===================================================================
--- trunk/sys/ofed/include/linux/fs.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/fs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -28,6 +29,8 @@
 #ifndef	_LINUX_FS_H_
 #define	_LINUX_FS_H_
 
+#include <sys/cdefs.h>
+#include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/vnode.h>
@@ -73,6 +76,7 @@
 	struct dentry	f_dentry_store;
 	struct selinfo	f_selinfo;
 	struct sigio	*f_sigio;
+	struct vnode	*f_vnode;
 };
 
 #define	file		linux_file
@@ -105,6 +109,12 @@
 	int (*open)(struct inode *, struct file *);
 	int (*release)(struct inode *, struct file *);
 	int (*fasync)(int, struct file *, int);
+
+/* Although not supported in FreeBSD, to align with Linux code
+ * we are adding llseek() only when it is mapped to no_llseek which returns 
+ * an illegal seek error
+ */
+	loff_t (*llseek)(struct file *, loff_t, int);
 #if 0
 	/* We do not support these methods.  Don't permit them to compile. */
 	loff_t (*llseek)(struct file *, loff_t, int);
@@ -153,6 +163,21 @@
 	return;
 }
 
+static inline int
+alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
+			const char *name)
+{
+
+	return 0;
+}
+
+/* No current support for seek op in FreeBSD */
+static inline int
+nonseekable_open(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
 static inline dev_t
 iminor(struct inode *inode)
 {
@@ -179,4 +204,10 @@
 	vrele(inode);
 }
 
-#endif	/* _LINUX_FS_H_ */
+static inline loff_t 
+no_llseek(struct file *file, loff_t offset, int whence)
+{
+        return -ESPIPE;
+}
+
+#endif /* _LINUX_FS_H_ */
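
llseek is added to struct file_operations only so drivers can point it at no_llseek(), which rejects seeking with -ESPIPE; nonseekable_open() and alloc_chrdev_region() are no-op stubs. A sketch of how a driver would wire this up (the open/release handlers are hypothetical):

    static const struct file_operations mydev_fops = {
        .open    = mydev_open,
        .release = mydev_release,
        .llseek  = no_llseek,          /* any lseek() on the device fails with ESPIPE */
    };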

Modified: trunk/sys/ofed/include/linux/gfp.h
===================================================================
--- trunk/sys/ofed/include/linux/gfp.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/gfp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,11 +30,14 @@
 #ifndef	_LINUX_GFP_H_
 #define	_LINUX_GFP_H_
 
+#include <sys/cdefs.h>
+#include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 
 #include <linux/page.h>
 
+#include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -101,6 +105,13 @@
 	kmem_free(kmem_map, (vm_offset_t)p, size);
 }
 
+static inline void free_pages(uintptr_t addr, unsigned int order)
+{
+	if (addr == 0)
+		return;
+	__free_pages(virt_to_page((void *)addr), order);
+}
+
 /*
  * Alloc pages allocates directly from the buddy allocator on linux so
  * order specifies a power of two bucket of pages and the results
@@ -120,4 +131,18 @@
         return (virt_to_page(page));
 }
 
+static inline uintptr_t __get_free_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages(gfp_mask, order);
+	if (page == NULL)
+		return (0);
+	return ((uintptr_t)page_address(page));
+}
+
+#define alloc_pages_node(node, mask, order)     alloc_pages(mask, order)
+
+#define kmalloc_node(chunk, mask, node)         kmalloc(chunk, mask)
+
 #endif	/* _LINUX_GFP_H_ */
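
__get_free_pages()/free_pages() round out the page-allocator shims: the former returns the kernel virtual address of an order-sized allocation (0 on failure), and the latter tolerates a 0 address. A short sketch:

    uintptr_t buf;

    buf = __get_free_pages(GFP_KERNEL, 2);    /* order 2 = four contiguous pages */
    if (buf == 0)
        return (-ENOMEM);
    memset((void *)buf, 0, 4 * PAGE_SIZE);
    free_pages(buf, 2);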

Modified: trunk/sys/ofed/include/linux/hardirq.h
===================================================================
--- trunk/sys/ofed/include/linux/hardirq.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/hardirq.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/idr.h
===================================================================
--- trunk/sys/ofed/include/linux/idr.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/idr.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,6 +41,10 @@
 #define	MAX_ID_MASK	(MAX_ID_BIT - 1)
 #define	MAX_LEVEL	(MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
 
+#define MAX_IDR_SHIFT (sizeof(int)*8 - 1)
+#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
+#define MAX_IDR_MASK (MAX_IDR_BIT - 1)
+
 struct idr_layer {
 	unsigned long		bitmap;
 	struct idr_layer	*ary[IDR_SIZE];

Modified: trunk/sys/ofed/include/linux/if_arp.h
===================================================================
--- trunk/sys/ofed/include/linux/if_arp.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/if_arp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/if_ether.h
===================================================================
--- trunk/sys/ofed/include/linux/if_ether.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/if_ether.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -34,4 +35,16 @@
 
 #define	ETH_P_8021Q	ETHERTYPE_VLAN
 
+#define ETH_HLEN        ETHER_HDR_LEN   /* Total octets in header.                              */
+#ifndef ETH_ALEN
+#define ETH_ALEN        ETHER_ADDR_LEN
+#endif
+#define ETH_FCS_LEN     4               /* Octets in the FCS                                    */
+#define VLAN_HLEN       4               /* The additional bytes (on top of the Ethernet header)
+                                         * that VLAN requires.                                  */
+/*
+ * defined Ethernet Protocol ID's.
+ */
+#define ETH_P_IP        0x0800          /* Internet Protocol packet                             */
+
 #endif	/* _LINUX_IF_ETHER_H_ */

Modified: trunk/sys/ofed/include/linux/if_vlan.h
===================================================================
--- trunk/sys/ofed/include/linux/if_vlan.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/if_vlan.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,7 +30,11 @@
 #ifndef	_LINUX_IF_VLAN_H_
 #define	_LINUX_IF_VLAN_H_
 
+#include <sys/socket.h>
+#include <net/if.h>
 #include <net/ethernet.h>
 #include <net/if_vlan_var.h>
 
+#define VLAN_N_VID              4096
+
 #endif	/* _LINUX_IF_VLAN_H_ */

Modified: trunk/sys/ofed/include/linux/in.h
===================================================================
--- trunk/sys/ofed/include/linux/in.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/in.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,6 +31,9 @@
 
 #include "opt_inet.h"
 
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
 #include <netinet/in.h>
 #include <asm/byteorder.h>
 

Modified: trunk/sys/ofed/include/linux/in6.h
===================================================================
--- trunk/sys/ofed/include/linux/in6.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/in6.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +30,8 @@
 #ifndef	_LINUX_IN6_H_
 #define	_LINUX_IN6_H_
 
+#ifndef KLD_MODULE
 #include "opt_inet6.h"
+#endif
 
 #endif	/* _LINUX_IN6_H_ */

Modified: trunk/sys/ofed/include/linux/inetdevice.h
===================================================================
--- trunk/sys/ofed/include/linux/inetdevice.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/inetdevice.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/interrupt.h
===================================================================
--- trunk/sys/ofed/include/linux/interrupt.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/interrupt.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013-2015 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -116,6 +117,23 @@
 	return 0;
 }
 
+static inline int
+bind_irq_to_cpu(unsigned int irq, int cpu_id)
+{
+	struct irq_ent *irqe;
+	struct device *dev;
+
+	dev = _pci_find_irq_dev(irq);
+	if (dev == NULL)
+		return (-ENOENT);
+
+	irqe = _irq_ent(dev, irq);
+	if (irqe == NULL)
+		return (-ENOENT);
+
+	return (-bus_bind_intr(dev->bsddev, irqe->res, cpu_id));
+}
+
 static inline void
 free_irq(unsigned int irq, void *device)
 {
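
bind_irq_to_cpu() resolves the Linux irq number back to the owning device and pins the interrupt with bus_bind_intr(), returning a negative errno on failure. A hedged sketch; the irq value would be one previously set up through request_irq():

    int error;

    error = bind_irq_to_cpu(irq, 0);   /* irq: hypothetical vector from request_irq() */
    if (error != 0)
        printf("failed to bind irq %u to CPU 0: %d\n", irq, error);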

Modified: trunk/sys/ofed/include/linux/io-mapping.h
===================================================================
--- trunk/sys/ofed/include/linux/io-mapping.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/io-mapping.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/io.h
===================================================================
--- trunk/sys/ofed/include/linux/io.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/io.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,6 +31,7 @@
 #define	_LINUX_IO_H_
 
 #include <machine/vm.h>
+#include <sys/endian.h>
 
 static inline uint32_t
 __raw_readl(const volatile void *addr)
@@ -88,6 +90,20 @@
         *(volatile uint16_t *)addr = b;
 }
 
+#undef ioread32be
+static inline uint32_t
+ioread32be(const volatile void *addr)
+{
+	return be32toh(*(const volatile uint32_t *)addr);
+}
+
+#undef iowrite32be
+static inline void
+iowrite32be(uint32_t v, volatile void *addr)
+{
+	*(volatile uint32_t *)addr = htobe32(v);
+}
+
 void *_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr);
 #define	ioremap_nocache(addr, size)					\
     _ioremap_attr((addr), (size), VM_MEMATTR_UNCACHEABLE)
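
ioread32be()/iowrite32be() give byte-swapped register access on top of sys/endian.h, as used for big-endian device registers in the mlx4 code. A sketch (the mapped register base is hypothetical):

    uint32_t status;

    status = ioread32be((const char *)regs + 0x10);   /* regs: hypothetical pointer from ioremap_nocache() */
    iowrite32be(1, (char *)regs + 0x14);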

Modified: trunk/sys/ofed/include/linux/ioctl.h
===================================================================
--- trunk/sys/ofed/include/linux/ioctl.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/ioctl.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/jiffies.h
===================================================================
--- trunk/sys/ofed/include/linux/jiffies.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/jiffies.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -44,11 +45,12 @@
 	return (tvtohz(&tv));
 }
 
-#define	jiffies	ticks
+#define jiffies                 ticks
+#define jiffies_to_msecs(x)     (((int64_t)(x)) * 1000 / hz)
 
-#define	time_after(a, b)	((long)(b) - (long)(a) < 0)
+#define	time_after(a, b)	((int)((b) - (a)) < 0)
 #define	time_before(a, b)	time_after(b,a)
-#define	time_after_eq(a, b)	((long)(a) - (long)(b) >= 0)
+#define	time_after_eq(a, b)	((int)((a) - (b)) >= 0)
 #define	time_before_eq(a, b)	time_after_eq(b, a)
 
 #define	HZ	hz
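
time_after()/time_after_eq() now compute the difference first and cast it to int, presumably because jiffies maps to the kernel's 32-bit ticks counter, so the old long-width comparison was not wrap-safe on 64-bit platforms; jiffies_to_msecs() is also added. A typical polling-loop sketch (device_ready() is a stand-in):

    unsigned long deadline = jiffies + 2 * HZ;        /* two seconds from now */

    while (!device_ready()) {                         /* device_ready(): hypothetical poll */
        if (time_after(jiffies, deadline))
            return (-ETIMEDOUT);
        DELAY(10);
    }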

Modified: trunk/sys/ofed/include/linux/kdev_t.h
===================================================================
--- trunk/sys/ofed/include/linux/kdev_t.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/kdev_t.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/kernel.h
===================================================================
--- trunk/sys/ofed/include/linux/kernel.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/kernel.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -28,25 +29,27 @@
 #ifndef	_LINUX_KERNEL_H_
 #define	_LINUX_KERNEL_H_
 
+#include <sys/cdefs.h>
+#include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/param.h>
 #include <sys/libkern.h>
 #include <sys/stat.h>
 #include <sys/smp.h>
+#include <sys/stddef.h>
+#include <sys/syslog.h>
 
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
-#include <linux/stddef.h>
 #include <linux/kthread.h>
 #include <linux/types.h>
 #include <linux/jiffies.h>
 #include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/notifier.h>
-#include <linux/log2.h>
+#include <linux/log2.h> 
 #include <asm/byteorder.h>
 
+#define KERN_CONT       ""
 #define	KERN_EMERG	"<0>"
 #define	KERN_ALERT	"<1>"
 #define	KERN_CRIT	"<2>"
@@ -56,6 +59,8 @@
 #define	KERN_INFO	"<6>"
 #define	KERN_DEBUG	"<7>"
 
+#define	BUILD_BUG_ON(x)		CTASSERT(!(x))
+
 #define BUG()			panic("BUG")
 #define BUG_ON(condition)	do { if (condition) BUG(); } while(0)
 #define	WARN_ON			BUG_ON
@@ -62,12 +67,89 @@
 
 #undef	ALIGN
 #define	ALIGN(x, y)		roundup2((x), (y))
+#undef PTR_ALIGN
+#define	PTR_ALIGN(p, a)		((__typeof(p))ALIGN((uintptr_t)(p), (a)))
 #define	DIV_ROUND_UP		howmany
+#define	FIELD_SIZEOF(t, f)	sizeof(((t *)0)->f)
 
 #define	printk(X...)		printf(X)
-#define	pr_debug(fmt, ...)	printk(KERN_DEBUG # fmt, ##__VA_ARGS__)
+
+/*
+ * The "pr_debug()" and "pr_devel()" macros should produce zero code
+ * unless DEBUG is defined:
+ */
+#ifdef DEBUG
+#define pr_debug(fmt, ...) \
+        log(LOG_DEBUG, fmt, ##__VA_ARGS__)
+#define pr_devel(fmt, ...) \
+	log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define pr_debug(fmt, ...) \
+        ({ if (0) log(LOG_DEBUG, fmt, ##__VA_ARGS__); 0; })
+#define pr_devel(fmt, ...) \
+	({ if (0) log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__); 0; })
+#endif
+
 #define udelay(t)       	DELAY(t)
+#define usleep_range(min,max)	DELAY(min)
 
+#ifndef pr_fmt
+#define pr_fmt(fmt) fmt
+#endif
+
+/*
+ * Print a one-time message (analogous to WARN_ONCE() et al):
+ */
+#define printk_once(...) do {			\
+	static bool __print_once;		\
+						\
+	if (!__print_once) {			\
+		__print_once = true;		\
+		printk(__VA_ARGS__);		\
+	}					\
+} while (0)
+
+/*
+ * Log a one-time message (analogous to WARN_ONCE() et al):
+ */
+#define log_once(level,...) do {		\
+	static bool __log_once;			\
+						\
+	if (!__log_once) {			\
+		__log_once = true;		\
+		log(level, __VA_ARGS__);	\
+	}					\
+} while (0)
+
+#define pr_emerg(fmt, ...) \
+	log(LOG_EMERG, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...) \
+	log(LOG_ALERT, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...) \
+	log(LOG_CRIT, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...) \
+	log(LOG_ERR, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warning(fmt, ...) \
+	log(LOG_WARNING, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn pr_warning
+#define pr_notice(fmt, ...) \
+	log(LOG_NOTICE, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	log(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info_once(fmt, ...) \
+	log_once(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_cont(fmt, ...) \
+	printk(KERN_CONT fmt, ##__VA_ARGS__)
+
+#ifndef WARN
+#define WARN(condition, format...) ({                                   \
+        int __ret_warn_on = !!(condition);                              \
+        if (unlikely(__ret_warn_on))                                    \
+                pr_warning(format);                                     \
+        unlikely(__ret_warn_on);                                        \
+})
+#endif
+
 #define container_of(ptr, type, member)				\
 ({								\
 	__typeof(((type *)0)->member) *_p = (ptr);		\
@@ -77,12 +159,29 @@
 #define	ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
 
 #define	simple_strtoul	strtoul
+#define	simple_strtol	strtol
+#define kstrtol(a,b,c) ({*(c) = strtol(a,0,b);})
 
-#define min(x, y)	(x < y ? x : y)
-#define max(x, y)	(x > y ? x : y)
-#define min_t(type, _x, _y)	(type)(_x) < (type)(_y) ? (type)(_x) : (_y)
-#define max_t(type, _x, _y)	(type)(_x) > (type)(_y) ? (type)(_x) : (_y)
+#define min(x, y)	((x) < (y) ? (x) : (y))
+#define max(x, y)	((x) > (y) ? (x) : (y))
+#define min_t(type, _x, _y)	((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y))
+#define max_t(type, _x, _y)	((type)(_x) > (type)(_y) ? (type)(_x) : (type)(_y))
 
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
 #define	num_possible_cpus()	mp_ncpus
+#define	num_online_cpus()	mp_ncpus
 
+typedef struct pm_message {
+        int event;
+} pm_message_t;
+
 #endif	/* _LINUX_KERNEL_H_ */
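
The bulk of this hunk is logging: pr_err() and friends now go through log(9), and pr_debug()/pr_devel() compile away unless DEBUG is set. It also adds BUILD_BUG_ON(), properly parenthesized min()/max()/min_t()/max_t(), and round_up()/round_down(). A small sketch of the new helpers (the length value is arbitrary):

    size_t len = 1000;

    BUILD_BUG_ON(sizeof(u32) != 4);                   /* compile-time check via CTASSERT */
    if (round_up(len, 64) != 1024 || round_down(len, 64) != 960)
        pr_err("unexpected rounding for len=%zu\n", len);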

Added: trunk/sys/ofed/include/linux/kmod.h
===================================================================
--- trunk/sys/ofed/include/linux/kmod.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/kmod.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_KMOD_H_
+#define	_LINUX_KMOD_H_
+
+#include <sys/types.h>
+#include <sys/syscallsubr.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <machine/stdarg.h>
+#include <sys/proc.h>
+
+#define	request_module(...) \
+({\
+	char modname[128]; \
+        int fileid; \
+	snprintf(modname, sizeof(modname), __VA_ARGS__); \
+	kern_kldload(curthread, modname, &fileid); \
+})
+
+#define request_module_nowait request_module
+
+
+#endif /* _LINUX_KMOD_H_ */


Property changes on: trunk/sys/ofed/include/linux/kmod.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
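
The new kmod.h maps request_module() onto kern_kldload(), formatting the module name with snprintf() first; request_module_nowait() is the same macro. A sketch (the module name is illustrative, and the result is only as meaningful as kern_kldload()'s return value):

    /* try to pull in a module by name before relying on it */
    request_module("%s", "mlx4en");
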
Modified: trunk/sys/ofed/include/linux/kobject.h
===================================================================
--- trunk/sys/ofed/include/linux/kobject.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/kobject.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -54,6 +55,8 @@
 	struct sysctl_oid	*oidp;
 };
 
+extern struct kobject *mm_kobj;
+
 static inline void
 kobject_init(struct kobject *kobj, struct kobj_type *ktype)
 {
@@ -150,4 +153,17 @@
 int	kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
 	    struct kobject *parent, const char *fmt, ...);
 
+/* sysfs.h calles for 'kobject' which is defined here, 
+ * so we need to add the include only after the 'kobject' def.
+ */
+#include <linux/sysfs.h>
+
+struct kobj_attribute {
+        struct attribute attr;
+        ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
+                        char *buf);
+        ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
+                         const char *buf, size_t count);
+};
+
 #endif /* _LINUX_KOBJECT_H_ */

Modified: trunk/sys/ofed/include/linux/kref.h
===================================================================
--- trunk/sys/ofed/include/linux/kref.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/kref.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -28,6 +29,7 @@
 #ifndef _LINUX_KREF_H_
 #define _LINUX_KREF_H_
 
+#include <sys/types.h>
 #include <sys/refcount.h>
 
 struct kref {
@@ -59,4 +61,4 @@
 	return 0;
 }
 
-#endif /* _KREF_H_ */
+#endif /* _LINUX_KREF_H_ */

Modified: trunk/sys/ofed/include/linux/kthread.h
===================================================================
--- trunk/sys/ofed/include/linux/kthread.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/kthread.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Added: trunk/sys/ofed/include/linux/ktime.h
===================================================================
--- trunk/sys/ofed/include/linux/ktime.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/ktime.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,300 @@
+/*-
+ * Copyright (c) 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_KTIME_H
+#define _LINUX_KTIME_H
+
+#include <sys/time.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+
+
+/* Get the monotonic time in timespec format: */ 
+#define ktime_get_ts getnanouptime
+
+#define NSEC_PER_USEC   1000L
+#define NSEC_PER_SEC    1000000000L
+
+/*
+ * ktime_t:
+ *
+ * On 64-bit CPUs a single 64-bit variable is used to store the hrtimers
+ * internal representation of time values in scalar nanoseconds. The
+ * design plays out best on 64-bit CPUs, where most conversions are
+ * NOPs and most arithmetic ktime_t operations are plain arithmetic
+ * operations.
+ *
+ * On 32-bit CPUs an optimized representation of the timespec structure
+ * is used to avoid expensive conversions from and to timespecs. The
+ * endian-aware order of the tv struct members is chosen to allow
+ * mathematical operations on the tv64 member of the union too, which
+ * for certain operations produces better code.
+ *
+ * For architectures with efficient support for 64/32-bit conversions the
+ * plain scalar nanosecond based representation can be selected by the
+ * config switch CONFIG_KTIME_SCALAR.
+ */
+union ktime {
+	s64	tv64;
+#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
+	struct {
+# ifdef __BIG_ENDIAN
+	s32	sec, nsec;
+# else
+	s32	nsec, sec;
+# endif
+	} tv;
+#endif
+};
+
+typedef union ktime ktime_t;		/* Kill this */
+
+#define KTIME_MAX                       ((s64)~((u64)1 << 63))
+#define KTIME_SEC_MAX                   (KTIME_MAX / NSEC_PER_SEC)
+
+/*
+ * ktime_t definitions when using the 64-bit scalar representation:
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+
+/**
+ * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
+ * @secs:	seconds to set
+ * @nsecs:	nanoseconds to set
+ *
+ * Return the ktime_t representation of the value
+ */
+static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
+{
+#if (BITS_PER_LONG == 64)
+	if (unlikely(secs >= KTIME_SEC_MAX))
+		return (ktime_t){ .tv64 = KTIME_MAX };
+#endif
+	return (ktime_t) { .tv64 = (s64)secs * NSEC_PER_SEC + (s64)nsecs };
+}
+
+/* Subtract two ktime_t variables. rem = lhs -rhs: */
+#define ktime_sub(lhs, rhs) \
+		({ (ktime_t){ .tv64 = (lhs).tv64 - (rhs).tv64 }; })
+
+/* Add two ktime_t variables. res = lhs + rhs: */
+#define ktime_add(lhs, rhs) \
+		({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; })
+
+/*
+ * Add a ktime_t variable and a scalar nanosecond value.
+ * res = kt + nsval:
+ */
+#define ktime_add_ns(kt, nsval) \
+		({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; })
+
+/*
+ * Subtract a scalar nanosecod from a ktime_t variable
+ * res = kt - nsval:
+ */
+#define ktime_sub_ns(kt, nsval) \
+		({ (ktime_t){ .tv64 = (kt).tv64 - (nsval) }; })
+
+/* convert a timespec to ktime_t format: */
+static inline ktime_t timespec_to_ktime(struct timespec ts)
+{
+	return ktime_set(ts.tv_sec, ts.tv_nsec);
+}
+
+/* convert a timeval to ktime_t format: */
+static inline ktime_t timeval_to_ktime(struct timeval tv)
+{
+	return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
+}
+
+/* Map the ktime_t to timespec conversion to ns_to_timespec function */
+#define ktime_to_timespec(kt)		ns_to_timespec((kt).tv64)
+
+/* Map the ktime_t to timeval conversion to ns_to_timeval function */
+#define ktime_to_timeval(kt)		ns_to_timeval((kt).tv64)
+
+/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
+#define ktime_to_ns(kt)			((kt).tv64)
+
+#else	/* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */
+
+/*
+ * Helper macros/inlines to get the ktime_t math right in the timespec
+ * representation. The macros are sometimes ugly - their actual use is
+ * pretty okay-ish, given the circumstances. We do all this for
+ * performance reasons. The pure scalar nsec_t based code was nice and
+ * simple, but created too many 64-bit / 32-bit conversions and divisions.
+ *
+ * Be especially aware that negative values are represented in a way
+ * that the tv.sec field is negative and the tv.nsec field is greater
+ * or equal to zero but less than nanoseconds per second. This is the
+ * same representation which is used by timespecs.
+ *
+ *   tv.sec < 0 and 0 >= tv.nsec < NSEC_PER_SEC
+ */
+
+/* Set a ktime_t variable to a value in sec/nsec representation: */
+static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
+{
+	return (ktime_t) { .tv = { .sec = secs, .nsec = nsecs } };
+}
+
+/**
+ * ktime_sub - subtract two ktime_t variables
+ * @lhs:	minuend
+ * @rhs:	subtrahend
+ *
+ * Returns the remainder of the subtraction
+ */
+static inline ktime_t ktime_sub(const ktime_t lhs, const ktime_t rhs)
+{
+	ktime_t res;
+
+	res.tv64 = lhs.tv64 - rhs.tv64;
+	if (res.tv.nsec < 0)
+		res.tv.nsec += NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add - add two ktime_t variables
+ * @add1:	addend1
+ * @add2:	addend2
+ *
+ * Returns the sum of @add1 and @add2.
+ */
+static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2)
+{
+	ktime_t res;
+
+	res.tv64 = add1.tv64 + add2.tv64;
+	/*
+	 * performance trick: the (u32) -NSEC gives 0x00000000Fxxxxxxx
+	 * so we subtract NSEC_PER_SEC and add 1 to the upper 32 bit.
+	 *
+	 * it's equivalent to:
+	 *   tv.nsec -= NSEC_PER_SEC
+	 *   tv.sec ++;
+	 */
+	if (res.tv.nsec >= NSEC_PER_SEC)
+		res.tv64 += (u32)-NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ * @kt:		addend
+ * @nsec:	the scalar nsec value to add
+ *
+ * Returns the sum of @kt and @nsec in ktime_t format
+ */
+extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec);
+
+/**
+ * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
+ * @kt:		minuend
+ * @nsec:	the scalar nsec value to subtract
+ *
+ * Returns the subtraction of @nsec from @kt in ktime_t format
+ */
+extern ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec);
+
+/**
+ * timespec_to_ktime - convert a timespec to ktime_t format
+ * @ts:		the timespec variable to convert
+ *
+ * Returns a ktime_t variable with the converted timespec value
+ */
+static inline ktime_t timespec_to_ktime(const struct timespec ts)
+{
+	return (ktime_t) { .tv = { .sec = (s32)ts.tv_sec,
+			   	   .nsec = (s32)ts.tv_nsec } };
+}
+
+/**
+ * timeval_to_ktime - convert a timeval to ktime_t format
+ * @tv:		the timeval variable to convert
+ *
+ * Returns a ktime_t variable with the converted timeval value
+ */
+static inline ktime_t timeval_to_ktime(const struct timeval tv)
+{
+	return (ktime_t) { .tv = { .sec = (s32)tv.tv_sec,
+				   .nsec = (s32)(tv.tv_usec *
+						 NSEC_PER_USEC) } };
+}
+
+/**
+ * ktime_to_timespec - convert a ktime_t variable to timespec format
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the timespec representation of the ktime value
+ */
+static inline struct timespec ktime_to_timespec(const ktime_t kt)
+{
+	return (struct timespec) { .tv_sec = (time_t) kt.tv.sec,
+				   .tv_nsec = (long) kt.tv.nsec };
+}
+
+/**
+ * ktime_to_timeval - convert a ktime_t variable to timeval format
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the timeval representation of the ktime value
+ */
+static inline struct timeval ktime_to_timeval(const ktime_t kt)
+{
+	return (struct timeval) {
+		.tv_sec = (time_t) kt.tv.sec,
+		.tv_usec = (suseconds_t) (kt.tv.nsec / NSEC_PER_USEC) };
+}
+
+/**
+ * ktime_to_ns - convert a ktime_t variable to scalar nanoseconds
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the scalar nanoseconds representation of @kt
+ */
+static inline s64 ktime_to_ns(const ktime_t kt)
+{
+	return (s64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec;
+}
+
+#endif	/* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */
+
+static inline s64 ktime_get_ns(void)
+{
+	struct timespec ts;
+	ktime_t kt;
+	ktime_get_ts(&ts);
+	kt = timespec_to_ktime(ts);
+	return (ktime_to_ns(kt));
+}
+
+#endif	/* _LINUX_KTIME_H */


Property changes on: trunk/sys/ofed/include/linux/ktime.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
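
The new ktime.h keeps Linux's union ktime layout (a 64-bit scalar nanosecond count on LP64) and maps ktime_get_ts() to getnanouptime(), so ktime values are monotonic uptime rather than wall-clock time. A small sketch using the helpers defined here:

    s64 t0, t1;

    t0 = ktime_get_ns();
    /* ... the work being measured ... */
    t1 = ktime_get_ns();
    printf("elapsed: %lld ns\n", (long long)(t1 - t0));
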
Modified: trunk/sys/ofed/include/linux/linux_compat.c
===================================================================
--- trunk/sys/ofed/include/linux/linux_compat.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/linux_compat.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,6 +32,9 @@
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/sglist.h>
+#include <sys/sleepqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>
@@ -54,6 +58,8 @@
 #include <linux/mm.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
+#include <linux/timer.h>
+#include <linux/netdevice.h>
 
 #include <vm/vm_pager.h>
 
@@ -65,12 +71,6 @@
 #undef file
 #undef cdev
 #define	RB_ROOT(head)	(head)->rbh_root
-#undef LIST_HEAD
-/* From sys/queue.h */
-#define LIST_HEAD(name, type)						\
-struct name {								\
-	struct type *lh_first;	/* first element */			\
-}
 
 struct kobject class_root;
 struct device linux_rootdev;
@@ -77,8 +77,11 @@
 struct class miscclass;
 struct list_head pci_drivers;
 struct list_head pci_devices;
+struct net init_net;
 spinlock_t pci_lock;
 
+unsigned long linux_timer_hz_mask;
+
 int
 panic_cmp(struct rb_node *one, struct rb_node *two)
 {
@@ -159,12 +162,26 @@
 static void
 kobject_kfree(struct kobject *kobj)
 {
-
 	kfree(kobj);
 }
 
+static void
+kobject_kfree_name(struct kobject *kobj)
+{
+	if (kobj) {
+		kfree(kobj->name);
+	}
+}
+
 struct kobj_type kfree_type = { .release = kobject_kfree };
 
+static void
+dev_release(struct device *dev)
+{
+	pr_debug("dev_release: %s\n", dev_name(dev));
+	kfree(dev);
+}
+
 struct device *
 device_create(struct class *class, struct device *parent, dev_t devt,
     void *drvdata, const char *fmt, ...)
@@ -177,6 +194,7 @@
 	dev->class = class;
 	dev->devt = devt;
 	dev->driver_data = drvdata;
+	dev->release = dev_release;
 	va_start(args, fmt);
 	kobject_set_name_vargs(&dev->kobj, fmt, args);
 	va_end(args);
@@ -211,7 +229,8 @@
 	struct linux_file *filp;
 
 	filp = cdp;
-	filp->f_op->release(curthread->td_fpop->f_vnode, filp);
+	filp->f_op->release(filp->f_vnode, filp);
+	vdrop(filp->f_vnode);
 	kfree(filp);
 }
 
@@ -231,6 +250,8 @@
 	filp->f_dentry = &filp->f_dentry_store;
 	filp->f_op = ldev->ops;
 	filp->f_flags = file->f_flag;
+	vhold(file->f_vnode);
+	filp->f_vnode = file->f_vnode;
 	if (filp->f_op->open) {
 		error = -filp->f_op->open(file->f_vnode, filp);
 		if (error) {
@@ -263,7 +284,8 @@
 	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
 		return (error);
 	filp->f_flags = file->f_flag;
-	devfs_clear_cdevpriv();
+        devfs_clear_cdevpriv();
+        
 
 	return (0);
 }
@@ -392,16 +414,6 @@
 }
 
 static int
-linux_dev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
-    int nprot, vm_memattr_t *memattr)
-{
-
-	/* XXX memattr not honored. */
-	*paddr = offset;
-	return (0);
-}
-
-static int
 linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
     vm_size_t size, struct vm_object **object, int nprot)
 {
@@ -409,8 +421,6 @@
 	struct linux_file *filp;
 	struct file *file;
 	struct vm_area_struct vma;
-	vm_paddr_t paddr;
-	vm_page_t m;
 	int error;
 
 	file = curthread->td_fpop;
@@ -417,28 +427,35 @@
 	ldev = dev->si_drv1;
 	if (ldev == NULL)
 		return (ENODEV);
-	if (size != PAGE_SIZE)
-		return (EINVAL);
 	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
 		return (error);
 	filp->f_flags = file->f_flag;
 	vma.vm_start = 0;
-	vma.vm_end = PAGE_SIZE;
+	vma.vm_end = size;
 	vma.vm_pgoff = *offset / PAGE_SIZE;
 	vma.vm_pfn = 0;
-	vma.vm_page_prot = 0;
+	vma.vm_page_prot = VM_MEMATTR_DEFAULT;
 	if (filp->f_op->mmap) {
 		error = -filp->f_op->mmap(filp, &vma);
 		if (error == 0) {
-			paddr = (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT;
-			*offset = paddr;
-			m = PHYS_TO_VM_PAGE(paddr);
-			*object = vm_pager_allocate(OBJT_DEVICE, dev,
-			    PAGE_SIZE, nprot, *offset, curthread->td_ucred);
-		        if (*object == NULL)
-               			 return (EINVAL);
-			if (vma.vm_page_prot != VM_MEMATTR_DEFAULT)
-				pmap_page_set_memattr(m, vma.vm_page_prot);
+			struct sglist *sg;
+
+			sg = sglist_alloc(1, M_WAITOK);
+			sglist_append_phys(sg,
+			    (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT, vma.vm_len);
+			*object = vm_pager_allocate(OBJT_SG, sg, vma.vm_len,
+			    nprot, 0, curthread->td_ucred);
+		        if (*object == NULL) {
+				sglist_free(sg);
+				return (EINVAL);
+			}
+			*offset = 0;
+			if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) {
+				VM_OBJECT_LOCK(*object);
+				vm_object_set_memattr(*object,
+				    vma.vm_page_prot);
+				VM_OBJECT_UNLOCK(*object);
+			}
 		}
 	} else
 		error = ENODEV;
@@ -455,7 +472,6 @@
 	.d_write = linux_dev_write,
 	.d_ioctl = linux_dev_ioctl,
 	.d_mmap_single = linux_dev_mmap_single,
-	.d_mmap = linux_dev_mmap,
 	.d_poll = linux_dev_poll,
 };
 
@@ -575,7 +591,9 @@
 	unsigned long		vm_size;
 };
 
-LIST_HEAD(vmmaphd, vmmap);
+struct vmmaphd {
+	struct vmmap *lh_first;
+};
 #define	VMMAP_HASH_SIZE	64
 #define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
 #define	VM_HASH(addr)	((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
@@ -666,9 +684,205 @@
 	kfree(vmmap);
 }
 
+char *
+kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
+{
+	unsigned int len;
+	char *p;
+	va_list aq;
+
+	va_copy(aq, ap);
+	len = vsnprintf(NULL, 0, fmt, aq);
+	va_end(aq);
+
+	p = kmalloc(len + 1, gfp);
+	if (p != NULL)
+		vsnprintf(p, len + 1, fmt, ap);
+
+	return (p);
+}
+
+char *
+kasprintf(gfp_t gfp, const char *fmt, ...)
+{
+	va_list ap;
+	char *p;
+
+	va_start(ap, fmt);
+	p = kvasprintf(gfp, fmt, ap);
+	va_end(ap);
+
+	return (p);
+}
+
+static int
+linux_timer_jiffies_until(unsigned long expires)
+{
+	int delta = expires - jiffies;
+	/* guard against already expired values */
+	if (delta < 1)
+		delta = 1;
+	return (delta);
+}
+
 static void
-linux_compat_init(void)
+linux_timer_callback_wrapper(void *context)
 {
+	struct timer_list *timer;
+
+	timer = context;
+	timer->function(timer->data);
+}
+
+void
+mod_timer(struct timer_list *timer, unsigned long expires)
+{
+
+	timer->expires = expires;
+	callout_reset(&timer->timer_callout,		      
+	    linux_timer_jiffies_until(expires),
+	    &linux_timer_callback_wrapper, timer);
+}
+
+void
+add_timer(struct timer_list *timer)
+{
+
+	callout_reset(&timer->timer_callout,
+	    linux_timer_jiffies_until(timer->expires),
+	    &linux_timer_callback_wrapper, timer);
+}
+
+static void
+linux_timer_init(void *arg)
+{
+
+	/*
+	 * Compute an internal HZ value which can divide 2**32 to
+	 * avoid timer rounding problems when the tick value wraps
+	 * around 2**32:
+	 */
+	linux_timer_hz_mask = 1;
+	while (linux_timer_hz_mask < (unsigned long)hz)
+		linux_timer_hz_mask *= 2;
+	linux_timer_hz_mask--;
+}
+SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
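
The timer shim maps a Linux timer_list onto callout(9);
linux_timer_jiffies_until() clamps already-expired deadlines to one tick so
callout_reset() never sees a non-positive value. A hedged usage sketch,
assuming the usual setup_timer() and msecs_to_jiffies() helpers from the
compat headers:

static struct timer_list demo_timer;

static void
demo_expire(unsigned long data)
{
	/* runs from softclock (callout) context */
}

static void
demo_arm(void)
{
	setup_timer(&demo_timer, demo_expire, 0);
	mod_timer(&demo_timer, jiffies + msecs_to_jiffies(100));
}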
+
+void
+linux_complete_common(struct completion *c, int all)
+{
+	int wakeup_swapper;
+
+	sleepq_lock(c);
+	c->done++;
+	if (all)
+		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
+	else
+		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
+	sleepq_release(c);
+	if (wakeup_swapper)
+		kick_proc0();
+}
+
+/*
+ * Indefinite wait for done != 0 with or without signals.
+ */
+long
+linux_wait_for_common(struct completion *c, int flags)
+{
+
+	if (flags != 0)
+		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
+	else
+		flags = SLEEPQ_SLEEP;
+	for (;;) {
+		sleepq_lock(c);
+		if (c->done)
+			break;
+		sleepq_add(c, NULL, "completion", flags, 0);
+		if (flags & SLEEPQ_INTERRUPTIBLE) {
+			if (sleepq_wait_sig(c, 0) != 0)
+				return (-ERESTARTSYS);
+		} else
+			sleepq_wait(c, 0);
+	}
+	c->done--;
+	sleepq_release(c);
+
+	return (0);
+}
+
+/*
+ * Time limited wait for done != 0 with or without signals.
+ */
+long
+linux_wait_for_timeout_common(struct completion *c, long timeout, int flags)
+{
+	long end = jiffies + timeout;
+
+	if (flags != 0)
+		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
+	else
+		flags = SLEEPQ_SLEEP;
+	for (;;) {
+		int ret;
+
+		sleepq_lock(c);
+		if (c->done)
+			break;
+		sleepq_add(c, NULL, "completion", flags, 0);
+		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
+		if (flags & SLEEPQ_INTERRUPTIBLE)
+			ret = sleepq_timedwait_sig(c, 0);
+		else
+			ret = sleepq_timedwait(c, 0);
+		if (ret != 0) {
+			/* check for timeout or signal */
+			if (ret == EWOULDBLOCK)
+				return (0);
+			else
+				return (-ERESTARTSYS);
+		}
+	}
+	c->done--;
+	sleepq_release(c);
+
+	/* return how many jiffies are left */
+	return (linux_timer_jiffies_until(end));
+}
+
+int
+linux_try_wait_for_completion(struct completion *c)
+{
+	int isdone;
+
+	isdone = 1;
+	sleepq_lock(c);
+	if (c->done)
+		c->done--;
+	else
+		isdone = 0;
+	sleepq_release(c);
+	return (isdone);
+}
+
+int
+linux_completion_done(struct completion *c)
+{
+	int isdone;
+
+	isdone = 1;
+	sleepq_lock(c);
+	if (c->done == 0)
+		isdone = 0;
+	sleepq_release(c);
+	return (isdone);
+}
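
The completion primitives are built directly on sleepqueue(9): done is the
wakeup count, the interruptible variants report a signal as -ERESTARTSYS,
and the timed variant returns the jiffies remaining. A hedged
producer/consumer sketch, assuming the usual complete() and
wait_for_completion_interruptible() wrapper macros:

static struct completion demo_done;

static void
demo_producer(void *arg)
{
	/* ... finish the work ... */
	complete(&demo_done);	/* wakes exactly one waiter */
}

static int
demo_consumer(void)
{
	init_completion(&demo_done);
	/* kick off demo_producer(), then block until it signals */
	if (wait_for_completion_interruptible(&demo_done) != 0)
		return (EINTR);	/* a signal arrived first */
	return (0);
}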
+
+static void
+linux_compat_init(void *arg)
+{
 	struct sysctl_oid *rootoid;
 	int i;
 
@@ -695,3 +909,12 @@
 }
 
 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
+
+static void
+linux_compat_uninit(void *arg)
+{
+	kobject_kfree_name(&class_root);
+	kobject_kfree_name(&linux_rootdev.kobj);
+	kobject_kfree_name(&miscclass.kobj);
+}
+SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);


Property changes on: trunk/sys/ofed/include/linux/linux_compat.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/linux_idr.c
===================================================================
--- trunk/sys/ofed/include/linux/linux_idr.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/linux_idr.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -49,7 +50,7 @@
  * however it should be fairly fast.  It is basically a radix tree with
  * a builtin bitmap for allocation.
  */
-MALLOC_DEFINE(M_IDR, "idr", "Linux IDR compat");
+static MALLOC_DEFINE(M_IDR, "idr", "Linux IDR compat");
 
 static inline int
 idr_max(struct idr *idr)
@@ -76,6 +77,7 @@
 {
 	struct idr_layer *il, *iln;
 
+	idr_remove_all(idr);
 	mtx_lock(&idr->lock);
 	for (il = idr->free; il != NULL; il = iln) {
 		iln = il->ary[0];


Property changes on: trunk/sys/ofed/include/linux/linux_idr.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/linux_radix.c
===================================================================
--- trunk/sys/ofed/include/linux/linux_radix.c	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/linux_radix.c	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,7 +38,7 @@
 #include <linux/radix-tree.h>
 #include <linux/err.h>
 
-MALLOC_DEFINE(M_RADIX, "radix", "Linux radix compat");
+static MALLOC_DEFINE(M_RADIX, "radix", "Linux radix compat");
 
 static inline int
 radix_max(struct radix_tree_root *root)
@@ -123,40 +124,84 @@
 radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
 {
 	struct radix_tree_node *node;
+	struct radix_tree_node *temp[RADIX_TREE_MAX_HEIGHT - 1];
 	int height;
 	int idx;
 
-	/*
- 	 * Expand the tree to fit indexes as big as requested.
-	 */
-	while (root->rnode == NULL || radix_max(root) < index) {
+	/* bail out upon insertion of a NULL item */
+	if (item == NULL)
+		return (-EINVAL);
+
+	/* get root node, if any */
+	node = root->rnode;
+
+	/* allocate root node, if any */
+	if (node == NULL) {
 		node = malloc(sizeof(*node), M_RADIX, root->gfp_mask | M_ZERO);
 		if (node == NULL)
 			return (-ENOMEM);
-		node->slots[0] = root->rnode;
-		if (root->rnode)
-			node->count++;
 		root->rnode = node;
 		root->height++;
 	}
-	node = root->rnode;
-	height = root->height - 1;
-	/*
-	 * Walk down the tree finding the correct node and allocating any
-	 * missing nodes along the way.
-	 */
-	while (height) {
-		idx = radix_pos(index, height);
-		if (node->slots[idx] == NULL) {
-			node->slots[idx] = malloc(sizeof(*node), M_RADIX,
-			    root->gfp_mask | M_ZERO);
-			if (node->slots[idx] == NULL)
+
+	/* expand radix tree as needed */
+	while (radix_max(root) < index) {
+
+		/* check if the radix tree is getting too big */
+		if (root->height == RADIX_TREE_MAX_HEIGHT)
+			return (-E2BIG);
+
+		/*
+		 * If the root radix level is not empty, we need to
+		 * allocate a new radix level:
+		 */
+		if (node->count != 0) {
+			node = malloc(sizeof(*node), M_RADIX, root->gfp_mask | M_ZERO);
+			if (node == NULL)
 				return (-ENOMEM);
+			node->slots[0] = root->rnode;
 			node->count++;
+			root->rnode = node;
 		}
+		root->height++;
+	}
+
+	/* get radix tree height index */
+	height = root->height - 1;
+
+	/* walk down the tree until the first missing node, if any */
+	for ( ; height != 0; height--) {
+		idx = radix_pos(index, height);
+		if (node->slots[idx] == NULL)
+			break;
 		node = node->slots[idx];
-		height--;
 	}
+
+	/* allocate the missing radix levels, if any */
+	for (idx = 0; idx != height; idx++) {
+		temp[idx] = malloc(sizeof(*node), M_RADIX,
+		    root->gfp_mask | M_ZERO);
+		if (temp[idx] == NULL) {
+			while (idx--)
+				free(temp[idx], M_RADIX);
+			/* check if we should free the root node as well */
+			if (root->rnode->count == 0) {
+				free(root->rnode, M_RADIX);
+				root->rnode = NULL;
+				root->height = 0;
+			}
+			return (-ENOMEM);
+		}
+	}
+
+	/* setup new radix levels, if any */
+	for ( ; height != 0; height--) {
+		idx = radix_pos(index, height);
+		node->slots[idx] = temp[height - 1];
+		node->count++;
+		node = node->slots[idx];
+	}
+
 	/*
 	 * Insert and adjust count if the item does not already exist.
 	 */
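
radix_tree_insert() now rejects NULL items, bounds growth at
RADIX_TREE_MAX_HEIGHT, and preallocates every missing level into temp[] so
a failed malloc() unwinds without leaving half-built branches behind. A
hedged usage sketch of the resulting API:

static int
radix_demo(struct radix_tree_root *root, unsigned long index, void *item)
{
	int err;

	err = radix_tree_insert(root, index, item);	/* -EINVAL on NULL item */
	if (err != 0)
		return (err);
	if (radix_tree_lookup(root, index) != item)
		return (-EINVAL);	/* should not happen */
	radix_tree_delete(root, index);
	return (0);
}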


Property changes on: trunk/sys/ofed/include/linux/linux_radix.c
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/list.h
===================================================================
--- trunk/sys/ofed/include/linux/list.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/list.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,6 +40,7 @@
 #include <sys/kernel.h>
 #include <sys/queue.h>
 #include <sys/cpuset.h>
+#include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
@@ -51,9 +53,11 @@
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
+#include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
@@ -111,6 +115,9 @@
 
 #define	list_entry(ptr, type, field)	container_of(ptr, type, field)
 
+#define list_first_entry(ptr, type, member) \
+        list_entry((ptr)->next, type, member)
+
 #define	list_for_each(p, head)						\
 	for (p = (head)->next; p != (head); p = p->next)
 
@@ -304,6 +311,66 @@
 		new->first->pprev = &new->first;
 	old->first = NULL;
 }
+
+/**
+ * list_is_singular - tests whether a list has just one entry.
+ * @head: the list to test.
+ */
+static inline int list_is_singular(const struct list_head *head)
+{
+	return !list_empty(head) && (head->next == head->prev);
+}
+
+static inline void __list_cut_position(struct list_head *list,
+		struct list_head *head, struct list_head *entry)
+{
+	struct list_head *new_first = entry->next;
+	list->next = head->next;
+	list->next->prev = list;
+	list->prev = entry;
+	entry->next = list;
+	head->next = new_first;
+	new_first->prev = head;
+}
+
+/**
+ * list_cut_position - cut a list into two
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ *	and if so we won't cut the list
+ *
+ * This helper moves the initial part of @head, up to and
+ * including @entry, from @head to @list. You should
+ * pass on @entry an element you know is on @head. @list
+ * should be an empty list or a list you do not care about
+ * losing its data.
+ *
+ */
+static inline void list_cut_position(struct list_head *list,
+		struct list_head *head, struct list_head *entry)
+{
+	if (list_empty(head))
+		return;
+	if (list_is_singular(head) &&
+		(head->next != entry && head != entry))
+		return;
+	if (entry == head)
+		INIT_LIST_HEAD(list);
+	else
+		__list_cut_position(list, head, entry);
+}
+
+/**
+ *  list_is_last - tests whether @list is the last entry in list @head
+ *   @list: the entry to test
+ *    @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+                                const struct list_head *head)
+{
+        return list->next == head;
+}
  
 #define	hlist_entry(ptr, type, field)	container_of(ptr, type, field)
 
@@ -324,9 +391,10 @@
 #define	hlist_for_each_entry_from(tp, p, field)				\
 	for (; p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
 
-#define	hlist_for_each_entry_safe(tp, p, n, head, field)		\
-	for (p = (head)->first;	p ?					\
-	    (n = p->next) | (tp = hlist_entry(p, typeof(*tp), field)) :	\
-	    NULL; p = n)
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
+	for (pos = (head)->first;					 \
+	     (pos) != 0 && ({ n = (pos)->next; \
+		 tpos = hlist_entry((pos), typeof(*(tpos)), member); 1;}); \
+	     pos = (n))
 
 #endif /* _LINUX_LIST_H_ */
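
list_cut_position() makes it cheap to steal a prefix of a queue for batch
processing under a single lock acquisition. A hedged sketch that moves the
first entry onto an empty batch list:

static void
take_first(struct list_head *queue, struct list_head *batch)
{
	INIT_LIST_HEAD(batch);
	if (!list_empty(queue))
		list_cut_position(batch, queue, queue->next);
}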

Modified: trunk/sys/ofed/include/linux/lockdep.h
===================================================================
--- trunk/sys/ofed/include/linux/lockdep.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/lockdep.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -34,4 +35,6 @@
 
 #define lockdep_set_class(lock, key)
 
-#endif	/* _LINUX_LOCKDEP_H_ */
+#define lockdep_set_class_and_name(lock, key, name)
+
+#endif  /* _LINUX_LOCKDEP_H_ */

Modified: trunk/sys/ofed/include/linux/log2.h
===================================================================
--- trunk/sys/ofed/include/linux/log2.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/log2.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,10 +52,121 @@
         return (1UL << (flsl(x) - 1));
 }
 
-static inline unsigned long
-ilog2(unsigned long x)
+
+/*
+ * deal with unrepresentable constant logarithms
+ */
+extern __attribute__((const, noreturn))
+int ____ilog2_NaN(void);
+
+/*
+ * non-constant log of base 2 calculators
+ * - the arch may override these in asm/bitops.h if they can be implemented
+ *   more efficiently than using fls() and fls64()
+ * - the arch is not required to handle n==0 if implementing the fallback
+ */
+#ifndef CONFIG_ARCH_HAS_ILOG2_U32
+static inline __attribute__((const))
+int __ilog2_u32(u32 n)
 {
-	return (flsl(x) - 1);
+	return flsl(n) - 1;
 }
+#endif
 
+#ifndef CONFIG_ARCH_HAS_ILOG2_U64
+static inline __attribute__((const))
+int __ilog2_u64(u64 n)
+{
+	return flsl(n) - 1;
+}
+#endif
+
+
+/**
+ * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value
+ * @n - parameter
+ *
+ * constant-capable log of base 2 calculation
+ * - this can be used to initialise global variables from constant data, hence
+ *   the massive ternary operator construction
+ *
+ * selects the appropriately-sized optimised version depending on sizeof(n)
+ */
+#define ilog2(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		(n) < 1 ? ____ilog2_NaN() :	\
+		(n) & (1ULL << 63) ? 63 :	\
+		(n) & (1ULL << 62) ? 62 :	\
+		(n) & (1ULL << 61) ? 61 :	\
+		(n) & (1ULL << 60) ? 60 :	\
+		(n) & (1ULL << 59) ? 59 :	\
+		(n) & (1ULL << 58) ? 58 :	\
+		(n) & (1ULL << 57) ? 57 :	\
+		(n) & (1ULL << 56) ? 56 :	\
+		(n) & (1ULL << 55) ? 55 :	\
+		(n) & (1ULL << 54) ? 54 :	\
+		(n) & (1ULL << 53) ? 53 :	\
+		(n) & (1ULL << 52) ? 52 :	\
+		(n) & (1ULL << 51) ? 51 :	\
+		(n) & (1ULL << 50) ? 50 :	\
+		(n) & (1ULL << 49) ? 49 :	\
+		(n) & (1ULL << 48) ? 48 :	\
+		(n) & (1ULL << 47) ? 47 :	\
+		(n) & (1ULL << 46) ? 46 :	\
+		(n) & (1ULL << 45) ? 45 :	\
+		(n) & (1ULL << 44) ? 44 :	\
+		(n) & (1ULL << 43) ? 43 :	\
+		(n) & (1ULL << 42) ? 42 :	\
+		(n) & (1ULL << 41) ? 41 :	\
+		(n) & (1ULL << 40) ? 40 :	\
+		(n) & (1ULL << 39) ? 39 :	\
+		(n) & (1ULL << 38) ? 38 :	\
+		(n) & (1ULL << 37) ? 37 :	\
+		(n) & (1ULL << 36) ? 36 :	\
+		(n) & (1ULL << 35) ? 35 :	\
+		(n) & (1ULL << 34) ? 34 :	\
+		(n) & (1ULL << 33) ? 33 :	\
+		(n) & (1ULL << 32) ? 32 :	\
+		(n) & (1ULL << 31) ? 31 :	\
+		(n) & (1ULL << 30) ? 30 :	\
+		(n) & (1ULL << 29) ? 29 :	\
+		(n) & (1ULL << 28) ? 28 :	\
+		(n) & (1ULL << 27) ? 27 :	\
+		(n) & (1ULL << 26) ? 26 :	\
+		(n) & (1ULL << 25) ? 25 :	\
+		(n) & (1ULL << 24) ? 24 :	\
+		(n) & (1ULL << 23) ? 23 :	\
+		(n) & (1ULL << 22) ? 22 :	\
+		(n) & (1ULL << 21) ? 21 :	\
+		(n) & (1ULL << 20) ? 20 :	\
+		(n) & (1ULL << 19) ? 19 :	\
+		(n) & (1ULL << 18) ? 18 :	\
+		(n) & (1ULL << 17) ? 17 :	\
+		(n) & (1ULL << 16) ? 16 :	\
+		(n) & (1ULL << 15) ? 15 :	\
+		(n) & (1ULL << 14) ? 14 :	\
+		(n) & (1ULL << 13) ? 13 :	\
+		(n) & (1ULL << 12) ? 12 :	\
+		(n) & (1ULL << 11) ? 11 :	\
+		(n) & (1ULL << 10) ? 10 :	\
+		(n) & (1ULL <<  9) ?  9 :	\
+		(n) & (1ULL <<  8) ?  8 :	\
+		(n) & (1ULL <<  7) ?  7 :	\
+		(n) & (1ULL <<  6) ?  6 :	\
+		(n) & (1ULL <<  5) ?  5 :	\
+		(n) & (1ULL <<  4) ?  4 :	\
+		(n) & (1ULL <<  3) ?  3 :	\
+		(n) & (1ULL <<  2) ?  2 :	\
+		(n) & (1ULL <<  1) ?  1 :	\
+		(n) & (1ULL <<  0) ?  0 :	\
+		____ilog2_NaN()			\
+				   ) :		\
+	(sizeof(n) <= 4) ?			\
+	__ilog2_u32(n) :			\
+	__ilog2_u64(n)				\
+ )
+
+#define	order_base_2(x) ilog2(roundup_pow_of_two(x))
+
 #endif	/* _LINUX_LOG2_H_ */
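
Because the ladder above is a single constant expression, ilog2() of a
compile-time constant folds away entirely, which makes it legal in array
sizes and static initializers; runtime values fall through to the
flsl()-based helpers. A hedged illustration:

static char log_table[ilog2(4096)];	/* folds to 12 at compile time */

static int
ring_order(unsigned int entries)
{
	/* round a runtime entry count up to a power-of-two order */
	return (order_base_2(entries));
}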

Added: trunk/sys/ofed/include/linux/math64.h
===================================================================
--- trunk/sys/ofed/include/linux/math64.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/math64.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,133 @@
+/*-
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Mellanox Technologies, Ltd. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_MATH64_H
+#define _LINUX_MATH64_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+
+#if BITS_PER_LONG == 64
+ 
+# define do_div(n, base) ({                                    \
+	uint32_t __base = (base);                               \
+	uint32_t __rem;                                         \
+	__rem = ((uint64_t)(n)) % __base;                       \
+	(n) = ((uint64_t)(n)) / __base;                         \
+	__rem;                                                  \
+})
+
+/**
+ * div_u64_rem - unsigned 64bit divide with 32bit divisor, returning the
+ * remainder
+ *
+ * 32bit archs commonly provide an optimized version of this 64bit divide.
+ */
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+        *remainder = dividend % divisor;
+        return dividend / divisor;
+}
+
+
+#elif BITS_PER_LONG == 32
+
+static uint32_t __div64_32(uint64_t *n, uint32_t base)
+{
+	uint64_t rem = *n;
+	uint64_t b = base;
+	uint64_t res, d = 1;
+	uint32_t high = rem >> 32;
+
+	/* Reduce the thing a bit first */
+	res = 0;
+	if (high >= base) {
+		high /= base;
+		res = (uint64_t) high << 32;
+		rem -= (uint64_t) (high*base) << 32;
+	}
+
+	while ((int64_t)b > 0 && b < rem) {
+		b = b+b;
+		d = d+d;
+	}
+
+	do {
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+
+	*n = res;
+	return rem;
+}
+ 
+# define do_div(n, base) ({                            \
+	uint32_t __base = (base);                       \
+	uint32_t __rem;                                 \
+	(void)(((typeof((n)) *)0) == ((uint64_t *)0));  \
+	if (likely(((n) >> 32) == 0)) {                 \
+		__rem = (uint32_t)(n) % __base;         \
+		(n) = (uint32_t)(n) / __base;           \
+	} else                                          \
+		__rem = __div64_32(&(n), __base);       \
+	__rem;                                          \
+})
+
+#ifndef div_u64_rem
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+        *remainder = do_div(dividend, divisor);
+        return dividend;
+}
+#endif
+ 
+
+#endif /* BITS_PER_LONG */
+
+/**
+ * div_u64 - unsigned 64bit divide with 32bit divisor
+ *
+ * This is the most common 64bit divide and should be used if possible,
+ * as many 32bit archs can optimize this variant better than a full 64bit
+ * divide.
+ */
+#ifndef div_u64
+
+static inline u64 div_u64(u64 dividend, u32 divisor)
+{
+        u32 remainder;
+        return div_u64_rem(dividend, divisor, &remainder);
+}
+#endif
+
+#endif	/* _LINUX_MATH64_H */
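
do_div() follows the Linux contract: it divides its first argument in place
and evaluates to the remainder, while div_u64()/div_u64_rem() are the
portable wrappers that 32-bit builds implement via __div64_32(). A hedged
sketch:

static u64
per_tick_average(u64 total, u32 ticks)
{
	u32 rem;

	rem = do_div(total, ticks);	/* total /= ticks; rem = old total % ticks */
	(void)rem;			/* remainder unused here */
	return (total);
}

When the remainder is not needed at all, div_u64(total, ticks) is the
simpler spelling of the same operation.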


Property changes on: trunk/sys/ofed/include/linux/math64.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/miscdevice.h
===================================================================
--- trunk/sys/ofed/include/linux/miscdevice.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/miscdevice.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,6 +41,8 @@
 	const struct file_operations *fops;
 	struct cdev	*cdev;
 	int		minor;
+	const char *nodename;
+	umode_t mode;
 };
 
 extern struct class	miscclass;

Modified: trunk/sys/ofed/include/linux/mlx4/cmd.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/cmd.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/cmd.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -34,6 +34,7 @@
 #define MLX4_CMD_H
 
 #include <linux/dma-mapping.h>
+#include <linux/types.h>
 
 enum {
 	/* initialization and general commands */
@@ -59,6 +60,7 @@
 	MLX4_CMD_HW_HEALTH_CHECK = 0x50,
 	MLX4_CMD_SET_PORT	 = 0xc,
 	MLX4_CMD_SET_NODE	 = 0x5a,
+	MLX4_CMD_QUERY_FUNC	 = 0x56,
 	MLX4_CMD_ACCESS_DDR	 = 0x2e,
 	MLX4_CMD_MAP_ICM	 = 0xffa,
 	MLX4_CMD_UNMAP_ICM	 = 0xff9,
@@ -65,6 +67,9 @@
 	MLX4_CMD_MAP_ICM_AUX	 = 0xffc,
 	MLX4_CMD_UNMAP_ICM_AUX	 = 0xffb,
 	MLX4_CMD_SET_ICM_SIZE	 = 0xffd,
+	/* master notifies fw when a slave's flr is done */
+	MLX4_CMD_INFORM_FLR_DONE = 0x5b,
+	MLX4_CMD_GET_OP_REQ      = 0x59,
 
 	/* TPT commands */
 	MLX4_CMD_SW2HW_MPT	 = 0xd,
@@ -107,6 +112,7 @@
 	MLX4_CMD_INIT2INIT_QP	 = 0x2d,
 	MLX4_CMD_SUSPEND_QP	 = 0x32,
 	MLX4_CMD_UNSUSPEND_QP	 = 0x33,
+	MLX4_CMD_UPDATE_QP	 = 0x61,
 	/* special QP and management commands */
 	MLX4_CMD_CONF_SPECIAL_QP = 0x23,
 	MLX4_CMD_MAD_IFC	 = 0x24,
@@ -119,7 +125,27 @@
 	/* miscellaneous commands */
 	MLX4_CMD_DIAG_RPRT	 = 0x30,
 	MLX4_CMD_NOP		 = 0x31,
+	MLX4_CMD_ACCESS_MEM	 = 0x2e,
+	MLX4_CMD_SET_VEP	 = 0x52,
 
+	/* Ethernet specific commands */
+	MLX4_CMD_SET_VLAN_FLTR	 = 0x47,
+	MLX4_CMD_SET_MCAST_FLTR	 = 0x48,
+	MLX4_CMD_DUMP_ETH_STATS	 = 0x49,
+
+	/* Communication channel commands */
+	MLX4_CMD_ARM_COMM_CHANNEL = 0x57,
+	MLX4_CMD_GEN_EQE	 = 0x58,
+
+	/* virtual commands */
+	MLX4_CMD_ALLOC_RES	 = 0xf00,
+	MLX4_CMD_FREE_RES	 = 0xf01,
+	MLX4_CMD_MCAST_ATTACH	 = 0xf05,
+	MLX4_CMD_UCAST_ATTACH	 = 0xf06,
+	MLX4_CMD_PROMISC         = 0xf08,
+	MLX4_CMD_QUERY_FUNC_CAP  = 0xf0a,
+	MLX4_CMD_QP_ATTACH	 = 0xf0b,
+
 	/* debug commands */
 	MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
 	MLX4_CMD_SET_DEBUG_MSG	 = 0x2b,
@@ -127,28 +153,41 @@
 	/* statistics commands */
 	MLX4_CMD_QUERY_IF_STAT	 = 0X54,
 	MLX4_CMD_SET_IF_STAT	 = 0X55,
+
+	/* register/delete flow steering network rules */
+	MLX4_QP_FLOW_STEERING_ATTACH = 0x65,
+	MLX4_QP_FLOW_STEERING_DETACH = 0x66,
+	MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64,
 };
 
 enum {
-	MLX4_CMD_TIME_CLASS_A	= 10000,
-	MLX4_CMD_TIME_CLASS_B	= 10000,
-	MLX4_CMD_TIME_CLASS_C	= 10000,
+	MLX4_CMD_TIME_CLASS_A	= 60000,
+	MLX4_CMD_TIME_CLASS_B	= 60000,
+	MLX4_CMD_TIME_CLASS_C	= 60000,
 };
 
 enum {
-	MLX4_MAILBOX_SIZE	=  4096
+	MLX4_MAILBOX_SIZE	= 4096,
+	MLX4_ACCESS_MEM_ALIGN	= 256,
 };
 
 enum {
 	/* set port opcode modifiers */
-	MLX4_SET_PORT_GENERAL   = 0x0,
-	MLX4_SET_PORT_RQP_CALC  = 0x1,
-	MLX4_SET_PORT_MAC_TABLE = 0x2,
-	MLX4_SET_PORT_VLAN_TABLE = 0x3,
-	MLX4_SET_PORT_PRIO_MAP  = 0x4,
-	MLX4_SET_PORT_GID_TABLE = 0x5,
+	MLX4_SET_PORT_GENERAL		= 0x0,
+	MLX4_SET_PORT_RQP_CALC		= 0x1,
+	MLX4_SET_PORT_MAC_TABLE		= 0x2,
+	MLX4_SET_PORT_VLAN_TABLE	= 0x3,
+	MLX4_SET_PORT_PRIO_MAP		= 0x4,
+	MLX4_SET_PORT_GID_TABLE		= 0x5,
+	MLX4_SET_PORT_PRIO2TC		= 0x8,
+	MLX4_SET_PORT_SCHEDULER		= 0x9
 };
 
+enum {
+	MLX4_CMD_WRAPPED,
+	MLX4_CMD_NATIVE
+};
+
 struct mlx4_dev;
 
 struct mlx4_cmd_mailbox {
@@ -158,23 +197,24 @@
 
 int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 	       int out_is_imm, u32 in_modifier, u8 op_modifier,
-	       u16 op, unsigned long timeout);
+	       u16 op, unsigned long timeout, int native);
 
 /* Invoke a command with no output parameter */
 static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier,
-			   u8 op_modifier, u16 op, unsigned long timeout)
+			   u8 op_modifier, u16 op, unsigned long timeout,
+			   int native)
 {
 	return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier,
-			  op_modifier, op, timeout);
+			  op_modifier, op, timeout, native);
 }
 
 /* Invoke a command with an output mailbox */
 static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param,
 			       u32 in_modifier, u8 op_modifier, u16 op,
-			       unsigned long timeout)
+			       unsigned long timeout, int native)
 {
 	return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier,
-			  op_modifier, op, timeout);
+			  op_modifier, op, timeout, native);
 }
 
 /*
@@ -184,13 +224,35 @@
  */
 static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 			       u32 in_modifier, u8 op_modifier, u16 op,
-			       unsigned long timeout)
+			       unsigned long timeout, int native)
 {
 	return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier,
-			  op_modifier, op, timeout);
+			  op_modifier, op, timeout, native);
 }
 
 struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
 void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
 
+u32 mlx4_comm_get_version(void);
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac);
+int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
+int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting);
+int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state);
+int mlx4_get_vf_link_state(struct mlx4_dev *dev, int port, int vf);
+/*
+ * mlx4_get_slave_default_vlan -
+ * return true if VST (default vlan);
+ * if VST, fill in vlan & qos (if not NULL)
+ */
+bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos);
+
+enum {
+	IFLA_VF_LINK_STATE_AUTO,	/* link state of the uplink */
+	IFLA_VF_LINK_STATE_ENABLE,	/* link always up */
+	IFLA_VF_LINK_STATE_DISABLE,	/* link always down */
+	__IFLA_VF_LINK_STATE_MAX,
+};
+
+#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8)
+
 #endif /* MLX4_CMD_H */
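
Every command path now takes a wrapped/native selector: MLX4_CMD_WRAPPED
routes the command through the PF's resource-tracking wrappers on behalf of
SR-IOV guests, while MLX4_CMD_NATIVE goes straight to firmware. A hedged
sketch issuing a NOP natively (in_modifier 0x1f matches the NOP convention
used by the driver):

static int
ping_fw(struct mlx4_dev *dev)
{
	return (mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP,
	    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE));
}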


Property changes on: trunk/sys/ofed/include/linux/mlx4/cmd.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/mlx4/cq.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/cq.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/cq.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -42,17 +42,31 @@
 	__be32			vlan_my_qpn;
 	__be32			immed_rss_invalid;
 	__be32			g_mlpath_rqpn;
-	__be16			sl_vid;
-	__be16			rlid;
-	__be16			status;
-	u8			ipv6_ext_mask;
-	u8			badfcs_enc;
+	union {
+		struct {
+			union {
+				struct {
+					__be16			sl_vid;
+					__be16	rlid;
+				};
+				__be32			timestamp_16_47;
+			};
+			__be16  status;
+			u8      ipv6_ext_mask;
+			u8      badfcs_enc;
+		};
+		struct {
+			__be16 reserved1;
+			u8  smac[6];
+		};
+	};
 	__be32			byte_cnt;
 	__be16			wqe_index;
 	__be16			checksum;
-	u8			reserved[3];
+	u8			reserved2[1];
+	__be16			timestamp_0_15;
 	u8			owner_sr_opcode;
-};
+} __packed;
 
 struct mlx4_err_cqe {
 	__be32			my_qpn;
@@ -64,9 +78,26 @@
 	u8			owner_sr_opcode;
 };
 
+struct mlx4_ts_cqe {
+	__be32			vlan_my_qpn;
+	__be32			immed_rss_invalid;
+	__be32			g_mlpath_rqpn;
+	__be32			timestamp_hi;
+	__be16			status;
+	u8			ipv6_ext_mask;
+	u8			badfcs_enc;
+	__be32			byte_cnt;
+	__be16			wqe_index;
+	__be16			checksum;
+	u8			reserved;
+	__be16			timestamp_lo;
+	u8			owner_sr_opcode;
+} __packed;
+
 enum {
 	MLX4_CQE_VLAN_PRESENT_MASK	= 1 << 29,
 	MLX4_CQE_QPN_MASK		= 0xffffff,
+	MLX4_CQE_VID_MASK		= 0xfff,
 };
 
 enum {
@@ -108,7 +139,7 @@
 };
 
 static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd,
-			       void __iomem *uar_page,
+			       u8 __iomem *uar_page,
 			       spinlock_t *doorbell_lock)
 {
 	__be32 doorbell[2];
@@ -146,5 +177,5 @@
 		   u16 count, u16 period);
 int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
 		   int entries, struct mlx4_mtt *mtt);
-
+int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq);
 #endif /* MLX4_CQ_H */
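
The CQE layout now optionally carries a 48-bit hardware timestamp split
across timestamp_16_47 and timestamp_0_15, which is why the structs are
__packed. A hedged sketch of reassembling it, assuming the 48-bit
wraparound masking used elsewhere in the driver:

static inline u64
cqe_read_timestamp(const struct mlx4_cqe *cqe)
{
	u64 hi = (u64)be32_to_cpu(cqe->timestamp_16_47) << 16;
	u64 lo = (u64)be16_to_cpu(cqe->timestamp_0_15);

	return ((hi | lo) & 0xffffffffffffULL);	/* 48-bit clock domain */
}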


Property changes on: trunk/sys/ofed/include/linux/mlx4/cq.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/linux/mlx4/device.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/device.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/device.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -36,47 +36,205 @@
 #include <linux/pci.h>
 #include <linux/completion.h>
 #include <linux/radix-tree.h>
-
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/workqueue.h>
 #include <asm/atomic.h>
 
-#include <linux/mlx4/driver.h>
+#include <linux/clocksource.h>
 
+#define MAX_MSIX_P_PORT		17
+#define MAX_MSIX		64
+#define MSIX_LEGACY_SZ		4
+#define MIN_MSIX_P_PORT		5
+
+#define MLX4_ROCE_MAX_GIDS	128
+#define MLX4_ROCE_PF_GIDS	16
+
+#define MLX4_NUM_UP			8
+#define MLX4_NUM_TC			8
+#define MLX4_MAX_100M_UNITS_VAL		255	/*
+						 * workaround: can't set values
+						 * greater than this value when
+						 * using 100 Mbps units.
+						 */
+#define MLX4_RATELIMIT_100M_UNITS	3	/* 100 Mbps */
+#define MLX4_RATELIMIT_1G_UNITS		4	/* 1 Gbps */
+#define MLX4_RATELIMIT_DEFAULT		0x00ff
+
+#define CORE_CLOCK_MASK 0xffffffffffffULL
+
 enum {
 	MLX4_FLAG_MSI_X		= 1 << 0,
 	MLX4_FLAG_OLD_PORT_CMDS	= 1 << 1,
+	MLX4_FLAG_MASTER	= 1 << 2,
+	MLX4_FLAG_SLAVE		= 1 << 3,
+	MLX4_FLAG_SRIOV		= 1 << 4,
+	MLX4_FLAG_DEV_NUM_STR	= 1 << 5,
+	MLX4_FLAG_OLD_REG_MAC   = 1 << 6,
 };
 
 enum {
-	MLX4_MAX_PORTS		= 2
+	MLX4_PORT_CAP_IS_SM	= 1 << 1,
+	MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19,
 };
 
 enum {
-	MLX4_BOARD_ID_LEN = 64
+	MLX4_MAX_PORTS		= 2,
+	MLX4_MAX_PORT_PKEYS	= 128
 };
 
+/* base qkey for use in sriov tunnel-qp/proxy-qp communication.
+ * These qkeys must not be allowed for general use. This is a 64k range,
+ * and to test for violation, we use the mask (protect against future chg).
+ */
+#define MLX4_RESERVED_QKEY_BASE  (0xFFFF0000)
+#define MLX4_RESERVED_QKEY_MASK  (0xFFFF0000)
+
 enum {
-	MLX4_DEV_CAP_FLAG_RC		= 1 <<  0,
-	MLX4_DEV_CAP_FLAG_UC		= 1 <<  1,
-	MLX4_DEV_CAP_FLAG_UD		= 1 <<  2,
-	MLX4_DEV_CAP_FLAG_XRC		= 1 <<  3,
-	MLX4_DEV_CAP_FLAG_SRQ		= 1 <<  6,
-	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
-	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
-	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
-	MLX4_DEV_CAP_FLAG_DPDP		= 1 << 12,
-	MLX4_DEV_CAP_FLAG_RAW_ETY	= 1 << 13,
-	MLX4_DEV_CAP_FLAG_BLH		= 1 << 15,
-	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
-	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
-	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
-	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1 << 19,
-	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1 << 20,
-	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21,
-	MLX4_DEV_CAP_FLAG_IBOE		= 1 << 30,
-	MLX4_DEV_CAP_FLAG_FC_T11	= 1 << 31
+	MLX4_BOARD_ID_LEN = 64,
+	MLX4_VSD_LEN = 208
 };
 
 enum {
+	MLX4_MAX_NUM_PF		= 16,
+	MLX4_MAX_NUM_VF		= 64,
+	MLX4_MFUNC_MAX		= 80,
+	MLX4_MAX_EQ_NUM		= 1024,
+	MLX4_MFUNC_EQ_NUM	= 4,
+	MLX4_MFUNC_MAX_EQES     = 8,
+	MLX4_MFUNC_EQE_MASK     = (MLX4_MFUNC_MAX_EQES - 1)
+};
+
+/* The driver supports three different methods of managing traffic steering:
+ *	- Device managed - High level API for ib and eth flow steering;
+ *			   FW manages the flow steering tables.
+ *	- B0 steering mode - Common low level API for ib and (if supported) eth.
+ *	- A0 steering mode - Limited low level API for eth. In case of IB,
+ *			     B0 mode is in use.
+ */
+enum {
+	MLX4_STEERING_MODE_A0,
+	MLX4_STEERING_MODE_B0,
+	MLX4_STEERING_MODE_DEVICE_MANAGED
+};
+
+static inline const char *mlx4_steering_mode_str(int steering_mode)
+{
+	switch (steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		return "A0 steering";
+
+	case MLX4_STEERING_MODE_B0:
+		return "B0 steering";
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return "Device managed flow steering";
+
+	default:
+		return "Unrecognize steering mode";
+	}
+}
+
+enum {
+	MLX4_DEV_CAP_FLAG_RC		= 1LL <<  0,
+	MLX4_DEV_CAP_FLAG_UC		= 1LL <<  1,
+	MLX4_DEV_CAP_FLAG_UD		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG_XRC		= 1LL <<  3,
+	MLX4_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
+	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1LL <<  7,
+	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
+	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1LL << 12,
+	MLX4_DEV_CAP_FLAG_BLH		= 1LL << 15,
+	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1LL << 16,
+	MLX4_DEV_CAP_FLAG_APM		= 1LL << 17,
+	MLX4_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
+	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1LL << 19,
+	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1LL << 20,
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1LL << 21,
+	MLX4_DEV_CAP_FLAG_IBOE		= 1LL << 30,
+	MLX4_DEV_CAP_FLAG_UC_LOOPBACK	= 1LL << 32,
+	MLX4_DEV_CAP_FLAG_FCS_KEEP	= 1LL << 34,
+	MLX4_DEV_CAP_FLAG_WOL_PORT1	= 1LL << 37,
+	MLX4_DEV_CAP_FLAG_WOL_PORT2	= 1LL << 38,
+	MLX4_DEV_CAP_FLAG_UDP_RSS	= 1LL << 40,
+	MLX4_DEV_CAP_FLAG_VEP_UC_STEER	= 1LL << 41,
+	MLX4_DEV_CAP_FLAG_VEP_MC_STEER	= 1LL << 42,
+	MLX4_DEV_CAP_FLAG_CROSS_CHANNEL	= 1LL << 44,
+	MLX4_DEV_CAP_FLAG_COUNTERS	= 1LL << 48,
+	MLX4_DEV_CAP_FLAG_COUNTERS_EXT	= 1LL << 49,
+	MLX4_DEV_CAP_FLAG_SET_PORT_ETH_SCHED = 1LL << 53,
+	MLX4_DEV_CAP_FLAG_SENSE_SUPPORT	= 1LL << 55,
+	MLX4_DEV_CAP_FLAG_FAST_DROP	= 1LL << 57,
+	MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59,
+	MLX4_DEV_CAP_FLAG_64B_EQE	= 1LL << 61,
+	MLX4_DEV_CAP_FLAG_64B_CQE	= 1LL << 62
+};
+
+enum {
+	MLX4_DEV_CAP_FLAG2_RSS			= 1LL <<  0,
+	MLX4_DEV_CAP_FLAG2_RSS_TOP		= 1LL <<  1,
+	MLX4_DEV_CAP_FLAG2_RSS_XOR		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG2_FS_EN		= 1LL <<  3,
+	MLX4_DEV_CAP_FLAG2_FSM			= 1LL <<  4,
+	MLX4_DEV_CAP_FLAG2_VLAN_CONTROL		= 1LL <<  5,
+	MLX4_DEV_CAP_FLAG2_UPDATE_QP		= 1LL <<  6,
+	MLX4_DEV_CAP_FLAG2_LB_SRC_CHK		= 1LL <<  7,
+	MLX4_DEV_CAP_FLAG2_DMFS_IPOIB		= 1LL <<  8,
+	MLX4_DEV_CAP_FLAG2_ETS_CFG		= 1LL <<  9,
+	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  10,
+	MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN		= 1LL <<  11,
+	MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 12,
+	MLX4_DEV_CAP_FLAG2_TS			= 1LL <<  13,
+	MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW	   = 1LL <<  14,
+	MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN	= 1LL <<  15,
+	MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS	= 1LL <<  16,
+	MLX4_DEV_CAP_FLAG2_FS_EN_NCSI		= 1LL <<  17,
+	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18,
+	MLX4_DEV_CAP_FLAG2_DMFS_TAG_MODE	= 1LL <<  19,
+	MLX4_DEV_CAP_FLAG2_ROCEV2		= 1LL <<  20,
+	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL	= 1LL <<  21,
+	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  22,
+	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  23,
+	MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1LL << 24,
+	MLX4_DEV_CAP_FLAG2_RX_CSUM_MODE		= 1LL <<  25,
+};
+
+/* bit enums for an 8-bit flags field indicating special use
+ * QPs which require special handling in qp_reserve_range.
+ * Currently, this only includes QPs used by the ETH interface,
+ * where we expect to use blueflame.  These QPs must not have
+ * bits 6 and 7 set in their qp number.
+ *
+ * This enum may use only bits 0..7.
+ */
+enum {
+	MLX4_RESERVE_BF_QP	= 1 << 7,
+};
+
+enum {
+	MLX4_DEV_CAP_CQ_FLAG_IO			= 1 <<  0
+};
+
+enum {
+	MLX4_DEV_CAP_64B_EQE_ENABLED	= 1LL << 0,
+	MLX4_DEV_CAP_64B_CQE_ENABLED	= 1LL << 1
+};
+
+enum {
+	MLX4_USER_DEV_CAP_64B_CQE	= 1L << 0
+};
+
+enum {
+	MLX4_FUNC_CAP_64B_EQE_CQE	= 1L << 0
+};
+
+
+#define MLX4_ATTR_EXTENDED_PORT_INFO	cpu_to_be16(0xff90)
+
+enum {
+	MLX4_BMME_FLAG_WIN_TYPE_2B	= 1 << 1,
 	MLX4_BMME_FLAG_LOCAL_INV	= 1 <<  6,
 	MLX4_BMME_FLAG_REMOTE_INV	= 1 <<  7,
 	MLX4_BMME_FLAG_TYPE_2_WIN	= 1 <<  9,
@@ -102,7 +260,15 @@
 	MLX4_EVENT_TYPE_PORT_CHANGE	   = 0x09,
 	MLX4_EVENT_TYPE_EQ_OVERFLOW	   = 0x0f,
 	MLX4_EVENT_TYPE_ECC_DETECT	   = 0x0e,
-	MLX4_EVENT_TYPE_CMD		   = 0x0a
+	MLX4_EVENT_TYPE_CMD		   = 0x0a,
+	MLX4_EVENT_TYPE_VEP_UPDATE	   = 0x19,
+	MLX4_EVENT_TYPE_COMM_CHANNEL	   = 0x18,
+	MLX4_EVENT_TYPE_OP_REQUIRED	   = 0x1a,
+	MLX4_EVENT_TYPE_FATAL_WARNING	   = 0x1b,
+	MLX4_EVENT_TYPE_FLR_EVENT	   = 0x1c,
+	MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d,
+	MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT  = 0x3e,
+	MLX4_EVENT_TYPE_NONE		   = 0xff,
 };
 
 enum {
@@ -111,11 +277,40 @@
 };
 
 enum {
+	MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE		= 1,
+	MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE	= 2,
+};
+
+enum {
+	MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0,
+};
+
+enum slave_port_state {
+	SLAVE_PORT_DOWN = 0,
+	SLAVE_PENDING_UP,
+	SLAVE_PORT_UP,
+};
+
+enum slave_port_gen_event {
+	SLAVE_PORT_GEN_EVENT_DOWN = 0,
+	SLAVE_PORT_GEN_EVENT_UP,
+	SLAVE_PORT_GEN_EVENT_NONE,
+};
+
+enum slave_port_state_event {
+	MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+	MLX4_PORT_STATE_DEV_EVENT_PORT_UP,
+	MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+	MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+};
+
+enum {
 	MLX4_PERM_LOCAL_READ	= 1 << 10,
 	MLX4_PERM_LOCAL_WRITE	= 1 << 11,
 	MLX4_PERM_REMOTE_READ	= 1 << 12,
 	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
-	MLX4_PERM_ATOMIC	= 1 << 14
+	MLX4_PERM_ATOMIC	= 1 << 14,
+	MLX4_PERM_BIND_MW	= 1 << 15,
 };
 
 enum {
@@ -126,7 +321,6 @@
 	MLX4_OPCODE_SEND		= 0x0a,
 	MLX4_OPCODE_SEND_IMM		= 0x0b,
 	MLX4_OPCODE_LSO			= 0x0e,
-	MLX4_OPCODE_BIG_LSO		= 0x2e,
 	MLX4_OPCODE_RDMA_READ		= 0x10,
 	MLX4_OPCODE_ATOMIC_CS		= 0x11,
 	MLX4_OPCODE_ATOMIC_FA		= 0x12,
@@ -150,14 +344,26 @@
 	MLX4_STAT_RATE_OFFSET	= 5
 };
 
+enum mlx4_protocol {
+	MLX4_PROT_IB_IPV6 = 0,
+	MLX4_PROT_ETH,
+	MLX4_PROT_IB_IPV4,
+	MLX4_PROT_FCOE
+};
+
 enum {
 	MLX4_MTT_FLAG_PRESENT		= 1
 };
 
+enum {
+	MLX4_MAX_MTT_SHIFT		= 31
+};
+
 enum mlx4_qp_region {
 	MLX4_QP_REGION_FW = 0,
 	MLX4_QP_REGION_ETH_ADDR,
 	MLX4_QP_REGION_FC_ADDR,
+	MLX4_QP_REGION_FC_EXCH,
 	MLX4_NUM_QP_REGION
 };
 
@@ -165,7 +371,8 @@
 	MLX4_PORT_TYPE_NONE	= 0,
 	MLX4_PORT_TYPE_IB	= 1,
 	MLX4_PORT_TYPE_ETH	= 2,
-	MLX4_PORT_TYPE_AUTO	= 3
+	MLX4_PORT_TYPE_AUTO	= 3,
+	MLX4_PORT_TYPE_NA	= 4
 };
 
 enum mlx4_special_vlan_idx {
@@ -173,25 +380,63 @@
 	MLX4_VLAN_MISS_IDX,
 	MLX4_VLAN_REGULAR
 };
-#define MLX4_LEAST_ATTACHED_VECTOR	0xffffffff
 
+enum mlx4_steer_type {
+	MLX4_MC_STEER = 0,
+	MLX4_UC_STEER,
+	MLX4_NUM_STEERS
+};
+
 enum {
-	MLX4_CUNTERS_DISABLED,
-	MLX4_CUNTERS_BASIC,
-	MLX4_CUNTERS_EXT
+	MLX4_NUM_FEXCH          = 64 * 1024,
 };
 
 enum {
-	MAX_FAST_REG_PAGES = 511,
+	MLX4_MAX_FAST_REG_PAGES = 511,
 };
 
+enum {
+	MLX4_DEV_PMC_SUBTYPE_GUID_INFO	 = 0x14,
+	MLX4_DEV_PMC_SUBTYPE_PORT_INFO	 = 0x15,
+	MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE	 = 0x16,
+};
+
+/* Port mgmt change event handling */
+enum {
+	MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK	= 1 << 0,
+	MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK		= 1 << 1,
+	MLX4_EQ_PORT_INFO_LID_CHANGE_MASK		= 1 << 2,
+	MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK		= 1 << 3,
+	MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK	= 1 << 4,
+};
+
+#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
+			     MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
+
+enum mlx4_module_id {
+	MLX4_MODULE_ID_SFP		= 0x3,
+	MLX4_MODULE_ID_QSFP		= 0xC,
+	MLX4_MODULE_ID_QSFP_PLUS	= 0xD,
+	MLX4_MODULE_ID_QSFP28		= 0x11,
+};
+
 static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
 {
 	return (major << 32) | (minor << 16) | subminor;
 }
 
+struct mlx4_phys_caps {
+	u32			gid_phys_table_len[MLX4_MAX_PORTS + 1];
+	u32			pkey_phys_table_len[MLX4_MAX_PORTS + 1];
+	u32			num_phys_eqs;
+	u32			base_sqpn;
+	u32			base_proxy_sqpn;
+	u32			base_tunnel_sqpn;
+};
+
 struct mlx4_caps {
 	u64			fw_ver;
+	u32			function;
 	int			num_ports;
 	int			vl_cap[MLX4_MAX_PORTS + 1];
 	int			ib_mtu_cap[MLX4_MAX_PORTS + 1];
@@ -206,6 +451,7 @@
 	u64			trans_code[MLX4_MAX_PORTS + 1];
 	int			local_ca_ack_delay;
 	int			num_uars;
+	u32			uar_page_size;
 	int			bf_reg_size;
 	int			bf_regs_per_page;
 	int			max_sq_sg;
@@ -216,7 +462,10 @@
 	int			max_rq_desc_sz;
 	int			max_qp_init_rdma;
 	int			max_qp_dest_rdma;
-	int			sqp_start;
+	u32			*qp0_proxy;
+	u32			*qp1_proxy;
+	u32			*qp0_tunnel;
+	u32			*qp1_tunnel;
 	int			num_srqs;
 	int			max_srq_wqes;
 	int			max_srq_sge;
@@ -227,9 +476,10 @@
 	int			num_eqs;
 	int			reserved_eqs;
 	int			num_comp_vectors;
+	int			comp_pool;
 	int			num_mpts;
-	int			num_mtt_segs;
-	int			mtts_per_seg;
+	int			max_fmr_maps;
+	u64			num_mtts;
 	int			fmr_reserved_mtts;
 	int			reserved_mtts;
 	int			reserved_mrws;
@@ -238,36 +488,49 @@
 	int			num_amgms;
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
+	int			steering_mode;
 	int			num_pds;
 	int			reserved_pds;
+	int			max_xrcds;
+	int			reserved_xrcds;
 	int			mtt_entry_sz;
-	int			reserved_xrcds;
-	int			max_xrcds;
 	u32			max_msg_sz;
 	u32			page_size_cap;
 	u64			flags;
+	u64			flags2;
 	u32			bmme_flags;
 	u32			reserved_lkey;
 	u16			stat_rate_support;
-	int			udp_rss;
-	int			loopback_support;
-	int			wol;
+	u8			cq_timestamp;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
 	int			max_gso_sz;
+	int			max_rss_tbl_sz;
 	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
 	int			reserved_qps;
 	int                     reserved_qps_base[MLX4_NUM_QP_REGION];
 	int                     log_num_macs;
 	int                     log_num_vlans;
-	int                     log_num_prios;
 	enum mlx4_port_type	port_type[MLX4_MAX_PORTS + 1];
 	u8			supported_type[MLX4_MAX_PORTS + 1];
-	enum mlx4_port_type	port_mask[MLX4_MAX_PORTS + 1];
+	u8                      suggested_type[MLX4_MAX_PORTS + 1];
+	u8                      default_sense[MLX4_MAX_PORTS + 1];
+	u32			port_mask[MLX4_MAX_PORTS + 1];
 	enum mlx4_port_type	possible_type[MLX4_MAX_PORTS + 1];
-	u8			counters_mode;
+	u32			max_counters;
+	u8			port_ib_mtu[MLX4_MAX_PORTS + 1];
+	u16			sqp_demux;
+	u32			sync_qp;
+	u32			cq_flags;
+	u32			eqe_size;
+	u32			cqe_size;
+	u8			eqe_factor;
+	u32			userspace_caps; /* userspace must be aware of */
+	u32			function_caps;  /* functions must be aware of */
+	u8			fast_drop;
+	u16			hca_core_clock;
 	u32			max_basic_counters;
-	u32			max_ext_counters;
-	u32			mc_promisc_mode;
+	u32			max_extended_counters;
+	u8			def_counter_index[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_buf_list {
@@ -284,7 +547,7 @@
 };
 
 struct mlx4_mtt {
-	u32			first_seg;
+	u32			offset;
 	int			order;
 	int			page_shift;
 };
@@ -331,6 +594,18 @@
 	int			enabled;
 };
 
+enum mlx4_mw_type {
+	MLX4_MW_TYPE_1 = 1,
+	MLX4_MW_TYPE_2 = 2,
+};
+
+struct mlx4_mw {
+	u32			key;
+	u32			pd;
+	enum mlx4_mw_type	type;
+	int			enabled;
+};
+
 struct mlx4_fmr {
 	struct mlx4_mr		mr;
 	struct mlx4_mpt_entry  *mpt;
@@ -375,6 +650,8 @@
 
 	atomic_t		refcount;
 	struct completion	free;
+	int			eqn;
+	u16			irq;
 };
 
 struct mlx4_qp {
@@ -422,7 +699,8 @@
 	u8		hop_limit;
 	__be32		sl_tclass_flowlabel;
 	u8		dgid[16];
-	u32		reserved4[2];
+	u8		s_mac[6];
+	u8	reserved4[2];
 	__be16		vlan;
 	u8		mac[6];
 };
@@ -432,52 +710,181 @@
 	struct mlx4_eth_av	eth;
 };
 
-struct mlx4_counters {
-	__be32	counter_mode;
-	__be32	num_ifc;
-	u32	reserved[2];
-	__be64	rx_frames;
-	__be64	rx_bytes;
-	__be64	tx_frames;
-	__be64	tx_bytes;
+struct mlx4_if_stat_control {
+	u8 reserved1[3];
+	/* Extended counters enabled */
+	u8 cnt_mode;
+	/* Number of interfaces */
+	__be32 num_of_if;
+	__be32 reserved[2];
 };
 
-struct mlx4_counters_ext {
-	__be32	counter_mode;
-	__be32	num_ifc;
-	u32	reserved[2];
-	__be64	rx_uni_frames;
-	__be64	rx_uni_bytes;
-	__be64	rx_mcast_frames;
-	__be64	rx_mcast_bytes;
-	__be64	rx_bcast_frames;
-	__be64	rx_bcast_bytes;
-	__be64	rx_nobuf_frames;
-	__be64	rx_nobuf_bytes;
-	__be64	rx_err_frames;
-	__be64	rx_err_bytes;
-	__be64	tx_uni_frames;
-	__be64	tx_uni_bytes;
-	__be64	tx_mcast_frames;
-	__be64	tx_mcast_bytes;
-	__be64	tx_bcast_frames;
-	__be64	tx_bcast_bytes;
-	__be64	tx_nobuf_frames;
-	__be64	tx_nobuf_bytes;
-	__be64	tx_err_frames;
-	__be64	tx_err_bytes;
+struct mlx4_if_stat_basic {
+	struct mlx4_if_stat_control control;
+	struct {
+		__be64 IfRxFrames;
+		__be64 IfRxOctets;
+		__be64 IfTxFrames;
+		__be64 IfTxOctets;
+	} counters[];
 };
+#define MLX4_IF_STAT_BSC_SZ(ports)(sizeof(struct mlx4_if_stat_extended) +\
+				   sizeof(((struct mlx4_if_stat_extended *)0)->\
+				   counters[0]) * ports)
 
+struct mlx4_if_stat_extended {
+	struct mlx4_if_stat_control control;
+	struct {
+		__be64 IfRxUnicastFrames;
+		__be64 IfRxUnicastOctets;
+		__be64 IfRxMulticastFrames;
+		__be64 IfRxMulticastOctets;
+		__be64 IfRxBroadcastFrames;
+		__be64 IfRxBroadcastOctets;
+		__be64 IfRxNoBufferFrames;
+		__be64 IfRxNoBufferOctets;
+		__be64 IfRxErrorFrames;
+		__be64 IfRxErrorOctets;
+		__be32 reserved[39];
+		__be64 IfTxUnicastFrames;
+		__be64 IfTxUnicastOctets;
+		__be64 IfTxMulticastFrames;
+		__be64 IfTxMulticastOctets;
+		__be64 IfTxBroadcastFrames;
+		__be64 IfTxBroadcastOctets;
+		__be64 IfTxDroppedFrames;
+		__be64 IfTxDroppedOctets;
+		__be64 IfTxRequestedFramesSent;
+		__be64 IfTxGeneratedFramesSent;
+		__be64 IfTxTsoOctets;
+	} __packed counters[];
+};
+#define MLX4_IF_STAT_EXT_SZ(ports)   (sizeof(struct mlx4_if_stat_extended) +\
+				      sizeof(((struct mlx4_if_stat_extended *)\
+				      0)->counters[0]) * ports)
+
+union mlx4_counter {
+	struct mlx4_if_stat_control	control;
+	struct mlx4_if_stat_basic	basic;
+	struct mlx4_if_stat_extended	ext;
+};
+#define MLX4_IF_STAT_SZ(ports)		MLX4_IF_STAT_EXT_SZ(ports)
+
+struct mlx4_quotas {
+	int qp;
+	int cq;
+	int srq;
+	int mpt;
+	int mtt;
+	int counter;
+	int xrcd;
+};
+
 struct mlx4_dev {
 	struct pci_dev	       *pdev;
 	unsigned long		flags;
+	unsigned long		num_slaves;
 	struct mlx4_caps	caps;
+	struct mlx4_phys_caps	phys_caps;
+	struct mlx4_quotas	quotas;
 	struct radix_tree_root	qp_table_tree;
-	struct radix_tree_root	srq_table_tree;
-	u32			rev_id;
+	u8			rev_id;
 	char			board_id[MLX4_BOARD_ID_LEN];
+	u16			vsd_vendor_id;
+	char			vsd[MLX4_VSD_LEN];
+	int			num_vfs;
+	int			numa_node;
+	int			oper_log_mgm_entry_size;
+	u64			regid_promisc_array[MLX4_MAX_PORTS + 1];
+	u64			regid_allmulti_array[MLX4_MAX_PORTS + 1];
 };
 
+struct mlx4_clock_params {
+	u64 offset;
+	u8 bar;
+	u8 size;
+};
+
+struct mlx4_eqe {
+	u8			reserved1;
+	u8			type;
+	u8			reserved2;
+	u8			subtype;
+	union {
+		u32		raw[6];
+		struct {
+			__be32	cqn;
+		} __packed comp;
+		struct {
+			u16	reserved1;
+			__be16	token;
+			u32	reserved2;
+			u8	reserved3[3];
+			u8	status;
+			__be64	out_param;
+		} __packed cmd;
+		struct {
+			__be32	qpn;
+		} __packed qp;
+		struct {
+			__be32	srqn;
+		} __packed srq;
+		struct {
+			__be32	cqn;
+			u32	reserved1;
+			u8	reserved2[3];
+			u8	syndrome;
+		} __packed cq_err;
+		struct {
+			u32	reserved1[2];
+			__be32	port;
+		} __packed port_change;
+		struct {
+			#define COMM_CHANNEL_BIT_ARRAY_SIZE	4
+			u32 reserved;
+			u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE];
+		} __packed comm_channel_arm;
+		struct {
+			u8	port;
+			u8	reserved[3];
+			__be64	mac;
+		} __packed mac_update;
+		struct {
+			__be32	slave_id;
+		} __packed flr_event;
+		struct {
+			__be16  current_temperature;
+			__be16  warning_threshold;
+		} __packed warming;
+		struct {
+			u8 reserved[3];
+			u8 port;
+			union {
+				struct {
+					__be16 mstr_sm_lid;
+					__be16 port_lid;
+					__be32 changed_attr;
+					u8 reserved[3];
+					u8 mstr_sm_sl;
+					__be64 gid_prefix;
+				} __packed port_info;
+				struct {
+					__be32 block_ptr;
+					__be32 tbl_entries_mask;
+				} __packed tbl_change_info;
+			} params;
+		} __packed port_mgmt_change;
+		struct {
+			u8 reserved[3];
+			u8 port;
+			u32 reserved1[5];
+		} __packed bad_cable;
+	}			event;
+	u8			slave_id;
+	u8			reserved3[2];
+	u8			owner;
+} __packed;
+
 struct mlx4_init_port_param {
 	int			set_guid0;
 	int			set_node_guid;
@@ -492,44 +899,107 @@
 	u64			si_guid;
 };
 
-static inline void mlx4_query_steer_cap(struct mlx4_dev *dev, int *log_mac,
-					int *log_vlan, int *log_prio)
-{
-	*log_mac = dev->caps.log_num_macs;
-	*log_vlan = dev->caps.log_num_vlans;
-	*log_prio = dev->caps.log_num_prios;
-}
+#define MAD_IFC_DATA_SZ 192
+/* MAD IFC Mailbox */
+struct mlx4_mad_ifc {
+	u8      base_version;
+	u8      mgmt_class;
+	u8      class_version;
+	u8      method;
+	__be16  status;
+	__be16  class_specific;
+	__be64  tid;
+	__be16  attr_id;
+	__be16  resv;
+	__be32  attr_mod;
+	__be64  mkey;
+	__be16  dr_slid;
+	__be16  dr_dlid;
+	u8      reserved[28];
+	u8      data[MAD_IFC_DATA_SZ];
+} __packed;
 
 #define mlx4_foreach_port(port, dev, type)				\
 	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
 		if ((type) == (dev)->caps.port_mask[(port)])
 
+#define mlx4_foreach_non_ib_transport_port(port, dev)                     \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	  \
+		if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
+
 #define mlx4_foreach_ib_transport_port(port, dev)                         \
-	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	  \
 		if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
 			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
 
+#define MLX4_INVALID_SLAVE_ID	0xFF
+
+#define MLX4_SINK_COUNTER_INDEX 0xff
+
+void handle_port_mgmt_change_event(struct work_struct *work);
+
+static inline int mlx4_master_func_num(struct mlx4_dev *dev)
+{
+	return dev->caps.function;
+}
+
+static inline int mlx4_is_master(struct mlx4_dev *dev)
+{
+	return dev->flags & MLX4_FLAG_MASTER;
+}
+
+static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev)
+{
+	return dev->phys_caps.base_sqpn + 8 +
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev);
+}
+
+static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn)
+{
+	return (qpn < dev->phys_caps.base_sqpn + 8 +
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev));
+}
+
+static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn)
+{
+	int guest_proxy_base = dev->phys_caps.base_proxy_sqpn + slave * 8;
+
+	if (qpn >= guest_proxy_base && qpn < guest_proxy_base + 8)
+		return 1;
+
+	return 0;
+}
+
+static inline int mlx4_is_mfunc(struct mlx4_dev *dev)
+{
+	return dev->flags & (MLX4_FLAG_SLAVE | MLX4_FLAG_MASTER);
+}
+
+static inline int mlx4_is_slave(struct mlx4_dev *dev)
+{
+	return dev->flags & MLX4_FLAG_SLAVE;
+}
+
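
The new flag helpers classify the function: mlx4_is_mfunc() is true for any
SR-IOV participant, and mlx4_is_master()/mlx4_is_slave() distinguish the PF
from a VF. A hedged sketch of gating a PF-only path:

static int
demo_sriov_gate(struct mlx4_dev *dev)
{
	if (!mlx4_is_mfunc(dev))
		return (0);		/* single-function HCA: nothing to do */
	if (mlx4_is_master(dev))
		return (mlx4_master_func_num(dev));	/* PF-only work here */
	return (0);			/* VF (slave) path */
}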
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		   struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
 static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
 {
-	if (buf->direct.buf != NULL)
-		return buf->direct.buf + offset;
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return (u8 *)buf->direct.buf + offset;
 	else
-		return buf->page_list[offset >> PAGE_SHIFT].buf +
+		return (u8 *)buf->page_list[offset >> PAGE_SHIFT].buf +
 			(offset & (PAGE_SIZE - 1));
 }
 
 int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
 void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);
-
 int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
 void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
 
 int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar);
 void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar);
-int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf);
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node);
 void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf);
 
 int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
@@ -536,20 +1006,15 @@
 		  struct mlx4_mtt *mtt);
 void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
 u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
-int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port);
-int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
 
-
-int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx);
-void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt);
-int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
-			   u64 iova, u64 size, u32 access, int npages,
-			   int page_shift, struct mlx4_mr *mr);
 int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
 		  int npages, int page_shift, struct mlx4_mr *mr);
-void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr);
-void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
 int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type,
+		  struct mlx4_mw *mw);
+void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw);
+int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw);
 int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 		   int start_index, int npages, u64 *page_list);
 int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
@@ -565,16 +1030,17 @@
 
 int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
-		  unsigned vector, int collapsed);
+		  unsigned vector, int collapsed, int timestamp_en);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
 
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags);
 void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
 
 int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp);
 void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
 
-int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn,
 		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq);
 void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
 int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
@@ -583,41 +1049,279 @@
 int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
 int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
 
+int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			int block_mcast_loopback, enum mlx4_protocol prot);
+int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			enum mlx4_protocol prot);
 int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  int block_mcast_loopback, enum mlx4_mcast_prot prot);
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol protocol, u64 *reg_id);
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-				enum mlx4_mcast_prot prot);
+			  enum mlx4_protocol protocol, u64 reg_id);
 
-int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index);
-void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index);
+enum {
+	MLX4_DOMAIN_UVERBS	= 0x1000,
+	MLX4_DOMAIN_ETHTOOL     = 0x2000,
+	MLX4_DOMAIN_RFS         = 0x3000,
+	MLX4_DOMAIN_NIC    = 0x5000,
+};
 
+enum mlx4_net_trans_rule_id {
+	MLX4_NET_TRANS_RULE_ID_ETH = 0,
+	MLX4_NET_TRANS_RULE_ID_IB,
+	MLX4_NET_TRANS_RULE_ID_IPV6,
+	MLX4_NET_TRANS_RULE_ID_IPV4,
+	MLX4_NET_TRANS_RULE_ID_TCP,
+	MLX4_NET_TRANS_RULE_ID_UDP,
+	MLX4_NET_TRANS_RULE_NUM, /* should be last */
+	MLX4_NET_TRANS_RULE_DUMMY = -1,	/* force enum to be signed */
+};
+
+extern const u16 __sw_id_hw[];
+
+static inline int map_hw_to_sw_id(u16 header_id)
+{
+
+	int i;
+	for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) {
+		if (header_id == __sw_id_hw[i])
+			return i;
+	}
+	return -EINVAL;
+}
+
+enum mlx4_net_trans_promisc_mode {
+	MLX4_FS_REGULAR		= 1,
+	MLX4_FS_ALL_DEFAULT,
+	MLX4_FS_MC_DEFAULT,
+	MLX4_FS_UC_SNIFFER,
+	MLX4_FS_MC_SNIFFER,
+	MLX4_FS_MODE_NUM, /* should be last */
+	MLX4_FS_MODE_DUMMY = -1,	/* force enum to be signed */
+};
+
+struct mlx4_spec_eth {
+	u8	dst_mac[6];
+	u8	dst_mac_msk[6];
+	u8	src_mac[6];
+	u8	src_mac_msk[6];
+	u8	ether_type_enable;
+	__be16	ether_type;
+	__be16	vlan_id_msk;
+	__be16	vlan_id;
+};
+
+struct mlx4_spec_tcp_udp {
+	__be16 dst_port;
+	__be16 dst_port_msk;
+	__be16 src_port;
+	__be16 src_port_msk;
+};
+
+struct mlx4_spec_ipv4 {
+	__be32 dst_ip;
+	__be32 dst_ip_msk;
+	__be32 src_ip;
+	__be32 src_ip_msk;
+};
+
+struct mlx4_spec_ib {
+	__be32 l3_qpn;
+	__be32 qpn_msk;
+	u8 dst_gid[16];
+	u8 dst_gid_msk[16];
+};
+
+struct mlx4_spec_list {
+	struct	list_head list;
+	enum	mlx4_net_trans_rule_id id;
+	union {
+		struct mlx4_spec_eth eth;
+		struct mlx4_spec_ib ib;
+		struct mlx4_spec_ipv4 ipv4;
+		struct mlx4_spec_tcp_udp tcp_udp;
+	};
+};
+
+enum mlx4_net_trans_hw_rule_queue {
+	MLX4_NET_TRANS_Q_FIFO,
+	MLX4_NET_TRANS_Q_LIFO,
+};
+
+struct mlx4_net_trans_rule {
+	struct	list_head list;
+	enum	mlx4_net_trans_hw_rule_queue queue_mode;
+	bool	exclusive;
+	bool	allow_loopback;
+	enum	mlx4_net_trans_promisc_mode promisc_mode;
+	u8	port;
+	u16	priority;
+	u32	qpn;
+};
+
+struct mlx4_net_trans_rule_hw_ctrl {
+	__be16 prio;
+	u8 type;
+	u8 flags;
+	u8 rsvd1;
+	u8 funcid;
+	u8 vep;
+	u8 port;
+	__be32 qpn;
+	__be32 rsvd2;
+};
+
+struct mlx4_net_trans_rule_hw_ib {
+	u8 size;
+	u8 rsvd1;
+	__be16 id;
+	u32 rsvd2;
+	__be32 l3_qpn;
+	__be32 qpn_mask;
+	u8 dst_gid[16];
+	u8 dst_gid_msk[16];
+} __packed;
+
+struct mlx4_net_trans_rule_hw_eth {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	u8	rsvd1[6];
+	u8	dst_mac[6];
+	u16	rsvd2;
+	u8	dst_mac_msk[6];
+	u16	rsvd3;
+	u8	src_mac[6];
+	u16	rsvd4;
+	u8	src_mac_msk[6];
+	u8      rsvd5;
+	u8      ether_type_enable;
+	__be16  ether_type;
+	__be16  vlan_tag_msk;
+	__be16  vlan_tag;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_tcp_udp {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be16	rsvd1[3];
+	__be16	dst_port;
+	__be16	rsvd2;
+	__be16	dst_port_msk;
+	__be16	rsvd3;
+	__be16	src_port;
+	__be16	rsvd4;
+	__be16	src_port_msk;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_ipv4 {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be32	rsvd1;
+	__be32	dst_ip;
+	__be32	dst_ip_msk;
+	__be32	src_ip;
+	__be32	src_ip_msk;
+} __packed;
+
+struct _rule_hw {
+	union {
+		struct {
+			u8 size;
+			u8 rsvd;
+			__be16 id;
+		};
+		struct mlx4_net_trans_rule_hw_eth eth;
+		struct mlx4_net_trans_rule_hw_ib ib;
+		struct mlx4_net_trans_rule_hw_ipv4 ipv4;
+		struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp;
+	};
+};
+
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
+				enum mlx4_net_trans_promisc_mode mode);
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode);
+int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port);
+int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac);
+void mlx4_set_stats_bitmap(struct mlx4_dev *dev, unsigned long *stats_bitmap);
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc);
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc);
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+		u8 *pg, u16 *ratelimit);
 int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
 int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
-void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan);
 
-int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
-			  u64 *page_list, int npages, u64 iova, u32 fbo,
-			  u32 len, u32 *lkey, u32 *rkey, int same_key);
 int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
 		      int npages, u64 iova, u32 *lkey, u32 *rkey);
-int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
-			    u32 access, int max_pages, int max_maps,
-			    u8 page_shift, struct mlx4_fmr *fmr);
 int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
 		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr);
 int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 		    u32 *lkey, u32 *rkey);
-int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_SYNC_TPT(struct mlx4_dev *dev);
 int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length,
-			     u8 op_modifier, u32 in_offset[], u32 counter_out[]);
+			     u8 op_modifier, u32 in_offset[],
+			     u32 counter_out[]);
+
 int mlx4_test_interrupts(struct mlx4_dev *dev);
+int mlx4_assign_eq(struct mlx4_dev *dev, char *name, int *vector);
+void mlx4_release_eq(struct mlx4_dev *dev, int vec);
 
-void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported);
+int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port);
+int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
 
-int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
-void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx);
+void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx);
 
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id);
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id);
+int map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+			       enum mlx4_net_trans_promisc_mode flow_type);
+int map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+			     enum mlx4_net_trans_rule_id id);
+int hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
+
+void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
+			  int i, int val);
+
+int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey);
+
+int mlx4_is_slave_active(struct mlx4_dev *dev, int slave);
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port);
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port);
+int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr, u16 lid, u8 sl);
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change);
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port);
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event);
+
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid);
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave);
+int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id);
+int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid);
+
+int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn);
+
+int mlx4_read_clock(struct mlx4_dev *dev);
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+				   struct mlx4_clock_params *params);
+
+int mlx4_get_module_info(struct mlx4_dev *dev, u8 port,
+			u16 offset, u16 size, u8 *data);
+
 #endif /* MLX4_DEVICE_H */
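
Note: the flow-steering interface added above replaces the old index-based attach calls. A caller describes a rule as a struct mlx4_net_trans_rule, chains one struct mlx4_spec_list per protocol layer onto its list head, and registers it with mlx4_flow_attach(), detaching later via the returned reg_id. A minimal sketch of attaching an Ethernet destination-MAC rule follows; mdev, my_qpn and dst_mac are hypothetical placeholders for the caller's device handle, QP number and target address, and the port/priority choices are illustrative only:

	static int
	attach_dmac_rule(struct mlx4_dev *mdev, u32 my_qpn,
	    const u8 *dst_mac, u64 *reg_id)
	{
		struct mlx4_spec_list spec = {
			.id = MLX4_NET_TRANS_RULE_ID_ETH,
		};
		struct mlx4_net_trans_rule rule = {
			.queue_mode	= MLX4_NET_TRANS_Q_FIFO,
			.exclusive	= false,
			.allow_loopback	= true,
			.promisc_mode	= MLX4_FS_REGULAR,
			.port		= 1,	/* assumed single-port device */
			.priority	= MLX4_DOMAIN_NIC,
			.qpn		= my_qpn,
		};

		INIT_LIST_HEAD(&rule.list);
		memcpy(spec.eth.dst_mac, dst_mac, 6);
		memset(spec.eth.dst_mac_msk, 0xff, 6);	/* match all 48 bits */
		list_add_tail(&spec.list, &rule.list);

		/* On success, *reg_id names the rule for mlx4_flow_detach(). */
		return (mlx4_flow_attach(mdev, &rule, reg_id));
	}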


Modified: trunk/sys/ofed/include/linux/mlx4/doorbell.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/doorbell.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/doorbell.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -77,7 +77,7 @@
 
 	spin_lock_irqsave(doorbell_lock, flags);
 	__raw_writel((__force u32) val[0], dest);
-	__raw_writel((__force u32) val[1], dest + 4);
+	__raw_writel((__force u32) val[1], (u8 *)dest + 4);
 	spin_unlock_irqrestore(doorbell_lock, flags);
 }
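
Note: the one-line doorbell fix above is a portability repair. dest is presumably a void pointer (as in the upstream Linux header), and "dest + 4" only compiles through the GNU extension that treats sizeof(void) as 1; casting through u8 * makes the intended 4-byte offset well-defined standard C. A tiny illustration with hypothetical values:

	u32 v = 0;
	void *dest = NULL;	/* hypothetical doorbell mapping */

	/* GNU C only: arithmetic on void * assumes sizeof(void) == 1.   */
	/* Portable: take the byte offset through a sized pointer type:  */
	__raw_writel(v, (u8 *)dest + 4);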
 


Modified: trunk/sys/ofed/include/linux/mlx4/driver.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/driver.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/driver.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -33,50 +33,104 @@
 #ifndef MLX4_DRIVER_H
 #define MLX4_DRIVER_H
 
-#include <linux/device.h>
+#include <linux/mlx4/device.h>
 
 struct mlx4_dev;
 
+#define MLX4_MAC_MASK	   0xffffffffffffULL
+#define MLX4_BE_SHORT_MASK cpu_to_be16(0xffff)
+#define MLX4_BE_WORD_MASK  cpu_to_be32(0xffffffff)
+
 enum mlx4_dev_event {
 	MLX4_DEV_EVENT_CATASTROPHIC_ERROR,
 	MLX4_DEV_EVENT_PORT_UP,
 	MLX4_DEV_EVENT_PORT_DOWN,
 	MLX4_DEV_EVENT_PORT_REINIT,
+	MLX4_DEV_EVENT_PORT_MGMT_CHANGE,
+	MLX4_DEV_EVENT_SLAVE_INIT,
+	MLX4_DEV_EVENT_SLAVE_SHUTDOWN,
 };
 
-enum mlx4_query_reply {
-	MLX4_QUERY_NOT_MINE	= -1,
-	MLX4_QUERY_MINE_NOPORT 	= 0
+struct mlx4_interface {
+	void *			(*add)	 (struct mlx4_dev *dev);
+	void			(*remove)(struct mlx4_dev *dev, void *context);
+	void			(*event) (struct mlx4_dev *dev, void *context,
+					  enum mlx4_dev_event event, unsigned long param);
+	void *			(*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+	struct list_head	list;
+	enum mlx4_protocol	protocol;
 };
 
-enum mlx4_prot {
-	MLX4_PROT_IB,
-	MLX4_PROT_EN,
+enum {
+	MLX4_MAX_DEVICES	= 32,
+	MLX4_DEVS_TBL_SIZE	= MLX4_MAX_DEVICES + 1,
+	MLX4_DBDF2VAL_STR_SIZE	= 512,
+	MLX4_STR_NAME_SIZE	= 64,
+	MLX4_MAX_BDF_VALS	= 2,
+	MLX4_ENDOF_TBL		= -1LL
 };
 
-enum mlx4_mcast_prot {
-	MLX4_MCAST_PROT_IB = 0,
-	MLX4_MCAST_PROT_EN = 1,
+struct mlx4_dbdf2val {
+	u64 dbdf;
+	int val[MLX4_MAX_BDF_VALS];
 };
 
-struct mlx4_interface {
-	void *			(*add)	 (struct mlx4_dev *dev);
-	void			(*remove)(struct mlx4_dev *dev, void *context);
-	void			(*event) (struct mlx4_dev *dev, void *context,
-					  enum mlx4_dev_event event, int port);
-	void *  (*get_prot_dev) (struct mlx4_dev *dev, void *context, u8 port);
-	enum mlx4_prot          protocol;
+struct mlx4_range {
+	int min;
+	int max;
+};
 
-	enum mlx4_query_reply	(*query) (void *context, void *);
-	struct list_head	list;
+/*
+ * The mlx4_dbdf2val_lst struct holds all the data needed to convert a
+ * dbdf-to-value-list string into a dbdf-to-value table.
+ * A dbdf-to-value-list string is a comma-separated list of dbdf-to-value
+ * strings.
+ * The format of a dbdf-to-value string is: "[mmmm:]bb:dd.f-v1[;v2]"
+ * mmmm - Domain number (optional)
+ * bb - Bus number
+ * dd - Device number
+ * f  - Function number
+ * v1 - First value related to the domain-bus-device-function.
+ * v2 - Second value related to the domain-bus-device-function (optional).
+ * bb, dd - Two hexadecimal digits without a preceding 0x.
+ * mmmm - Four hexadecimal digits without a preceding 0x.
+ * f  - One hexadecimal digit without a preceding 0x.
+ * v1,v2 - Numbers in the usual C notation (e.g. 100, 0xd3).
+ * dbdf-to-value-list string format:
+ *     "[mmmm:]bb:dd.f-v1[;v2],[mmmm:]bb:dd.f-v1[;v2],..."
+ *
+ */
+struct mlx4_dbdf2val_lst {
+	char		name[MLX4_STR_NAME_SIZE];    /* String name */
+	char		str[MLX4_DBDF2VAL_STR_SIZE]; /* dbdf2val list str */
+	struct mlx4_dbdf2val tbl[MLX4_DEVS_TBL_SIZE];/* dbdf to value table */
+	int		num_vals;		     /* # of vals per dbdf */
+	int		def_val[MLX4_MAX_BDF_VALS];  /* Default values */
+	struct mlx4_range range;		     /* Valid values range */
 };
 
+int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst);
+int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx,
+		 int *val);
+
 int mlx4_register_interface(struct mlx4_interface *intf);
 void mlx4_unregister_interface(struct mlx4_interface *intf);
-void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
 
-struct mlx4_dev *mlx4_query_interface(void *, int *port);
-void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port);
-int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port);
+void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto,
+			    int port);
 
+#ifndef ETH_ALEN
+#define ETH_ALEN	6
+#endif
+static inline u64 mlx4_mac_to_u64(const u8 *addr)
+{
+	u64 mac = 0;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac <<= 8;
+		mac |= addr[i];
+	}
+	return mac;
+}
+
 #endif /* MLX4_DRIVER_H */
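
Note: two quick illustrations of the additions above, with made-up values. mlx4_mac_to_u64() folds a six-byte Ethernet address into the low 48 bits of a u64, most significant byte first, and the dbdf-to-value string format documented on mlx4_dbdf2val_lst encodes per-device parameter values:

	static const u8 addr[ETH_ALEN] = { 0x00, 0x02, 0xc9, 0x01, 0x02, 0x03 };
	u64 mac = mlx4_mac_to_u64(addr);	/* yields 0x0002c9010203 */

	/* Hypothetical dbdf-to-value list: function 0 of device 04:00 gets */
	/* the values 1 and 2, and function 1 gets the single value 3:      */
	/* "0000:04:00.0-1;2,0000:04:00.1-3"                                */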


Modified: trunk/sys/ofed/include/linux/mlx4/qp.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/qp.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/qp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -39,6 +39,21 @@
 
 #define MLX4_INVALID_LKEY	0x100
 
+#define	DS_SIZE_ALIGNMENT	16
+
+#define	SET_BYTE_COUNT(byte_count) cpu_to_be32(byte_count)
+#define	SET_LSO_MSS(mss_hdr_size) cpu_to_be32(mss_hdr_size)
+#define	DS_BYTE_COUNT_MASK	cpu_to_be32(0x7fffffff)
+
+enum ib_m_qp_attr_mask {
+	IB_M_EXT_CLASS_1 = 1 << 28,
+	IB_M_EXT_CLASS_2 = 1 << 29,
+	IB_M_EXT_CLASS_3 = 1 << 30,
+
+	IB_M_QP_MOD_VEND_MASK = (IB_M_EXT_CLASS_1 | IB_M_EXT_CLASS_2 |
+				 IB_M_EXT_CLASS_3)
+};
+
 enum mlx4_qp_optpar {
 	MLX4_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
 	MLX4_QP_OPTPAR_RRE			= 1 << 1,
@@ -95,11 +110,42 @@
 	MLX4_QP_BIT_RWE				= 1 << 14,
 	MLX4_QP_BIT_RAE				= 1 << 13,
 	MLX4_QP_BIT_RIC				= 1 <<	4,
+	MLX4_QP_BIT_COLL_SYNC_RQ                = 1 << 2,
+	MLX4_QP_BIT_COLL_SYNC_SQ                = 1 << 1,
+	MLX4_QP_BIT_COLL_MASTER                 = 1 << 0
 };
 
+enum {
+	MLX4_RSS_HASH_XOR			= 0,
+	MLX4_RSS_HASH_TOP			= 1,
+
+	MLX4_RSS_UDP_IPV6			= 1 << 0,
+	MLX4_RSS_UDP_IPV4			= 1 << 1,
+	MLX4_RSS_TCP_IPV6			= 1 << 2,
+	MLX4_RSS_IPV6				= 1 << 3,
+	MLX4_RSS_TCP_IPV4			= 1 << 4,
+	MLX4_RSS_IPV4				= 1 << 5,
+
+	/* offset of mlx4_rss_context within mlx4_qp_context.pri_path */
+	MLX4_RSS_OFFSET_IN_QPC_PRI_PATH		= 0x24,
+	/* offset of being RSS indirection QP within mlx4_qp_context.flags */
+	MLX4_RSS_QPC_FLAG_OFFSET		= 13,
+};
+
+struct mlx4_rss_context {
+	__be32			base_qpn;
+	__be32			default_qpn;
+	u16			reserved;
+	u8			hash_fn;
+	u8			flags;
+	__be32			rss_key[10];
+	__be32			base_qpn_udp;
+};
+
 struct mlx4_qp_path {
 	u8			fl;
-	u8			reserved1[2];
+	u8			vlan_control;
+	u8			disable_pkey_check;
 	u8			pkey_index;
 	u8			counter_index;
 	u8			grh_mylmc;
@@ -112,11 +158,36 @@
 	u8			rgid[16];
 	u8			sched_queue;
 	u8			vlan_index;
-	u8			reserved3[2];
+	u8			feup;
+	u8			fvl_rx;
 	u8			reserved4[2];
 	u8			dmac[6];
 };
 
+enum { /* fl */
+	MLX4_FL_CV	= 1 << 6,
+	MLX4_FL_ETH_HIDE_CQE_VLAN	= 1 << 2,
+	MLX4_FL_ETH_SRC_CHECK_MC_LB	= 1 << 1,
+	MLX4_FL_ETH_SRC_CHECK_UC_LB	= 1 << 0,
+};
+enum { /* vlan_control */
+	MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER	= 1 << 7,
+	MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED	= 1 << 6,
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED	= 1 << 2,
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED	= 1 << 1, /* 802.1p priority tag */
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED	= 1 << 0
+};
+
+enum { /* feup */
+	MLX4_FEUP_FORCE_ETH_UP		= 1 << 6, /* force Eth UP */
+	MLX4_FSM_FORCE_ETH_SRC_MAC	= 1 << 5, /* force Source MAC */
+	MLX4_FVL_FORCE_ETH_VLAN		= 1 << 3  /* force Eth vlan */
+};
+
+enum { /* fvl_rx */
+	MLX4_FVL_RX_FORCE_ETH_VLAN	= 1 << 0 /* enforce Eth rx vlan */
+};
+
 struct mlx4_qp_context {
 	__be32			flags;
 	__be32			pd;
@@ -153,23 +224,57 @@
 	u8			reserved4[2];
 	u8			mtt_base_addr_h;
 	__be32			mtt_base_addr_l;
-	u8			VE;
-	u8			reserved5;
-	__be16			VFT_id_prio;
-	u8			reserved6;
-	u8			exch_size;
-	__be16			exch_base;
-	u8			VFT_hop_cnt;
-	u8			my_fc_id_idx;
-	__be16			reserved7;
-	u32			reserved8[7];
+	u32			reserved5[10];
 };
 
+struct mlx4_update_qp_context {
+	__be64			qp_mask;
+	__be64			primary_addr_path_mask;
+	__be64			secondary_addr_path_mask;
+	u64			reserved1;
+	struct mlx4_qp_context	qp_context;
+	u64			reserved2[58];
+};
+
+enum {
+	MLX4_UPD_QP_MASK_PM_STATE	= 32,
+	MLX4_UPD_QP_MASK_VSD		= 33,
+};
+
+enum {
+	MLX4_UPD_QP_PATH_MASK_PKEY_INDEX		= 0 + 32,
+	MLX4_UPD_QP_PATH_MASK_FSM			= 1 + 32,
+	MLX4_UPD_QP_PATH_MASK_MAC_INDEX			= 2 + 32,
+	MLX4_UPD_QP_PATH_MASK_FVL			= 3 + 32,
+	MLX4_UPD_QP_PATH_MASK_CV			= 4 + 32,
+	MLX4_UPD_QP_PATH_MASK_VLAN_INDEX		= 5 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN		= 6 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED	= 7 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P		= 8 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED	= 9 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED	= 10 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P		= 11 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED	= 12 + 32,
+	MLX4_UPD_QP_PATH_MASK_FEUP			= 13 + 32,
+	MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE		= 14 + 32,
+	MLX4_UPD_QP_PATH_MASK_IF_COUNTER_INDEX		= 15 + 32,
+	MLX4_UPD_QP_PATH_MASK_FVL_RX			= 16 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB	= 18 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB	= 19 + 32,
+};
+
+enum { /* param3 */
+	MLX4_STRIP_VLAN	= 1 << 30
+};
+
+
 /* Which firmware version adds support for NEC (NoErrorCompletion) bit */
 #define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
 
 enum {
+	MLX4_WQE_CTRL_OWN		= 1 << 31,
 	MLX4_WQE_CTRL_NEC		= 1 << 29,
+	MLX4_WQE_CTRL_RR		= 1 << 6,
 	MLX4_WQE_CTRL_FENCE		= 1 << 6,
 	MLX4_WQE_CTRL_CQ_UPDATE		= 3 << 2,
 	MLX4_WQE_CTRL_SOLICITED		= 1 << 1,
@@ -192,8 +297,12 @@
 	 * [4]   IP checksum
 	 * [3:2] C (generate completion queue entry)
 	 * [1]   SE (solicited event)
+	 * [0]   FL (force loopback)
 	 */
-	__be32			srcrb_flags;
+	union {
+		__be32			srcrb_flags;
+		__be16			srcrb_flags16[2];
+	};
 	/*
 	 * imm is immediate data for send/RDMA write w/ immediate;
 	 * also invalidation key for send with invalidate; input
@@ -204,8 +313,7 @@
 
 enum {
 	MLX4_WQE_MLX_VL15	= 1 << 17,
-	MLX4_WQE_MLX_SLR	= 1 << 16,
-	MLX4_WQE_MLX_ICRC	= 1 << 4
+	MLX4_WQE_MLX_SLR	= 1 << 16
 };
 
 struct mlx4_wqe_mlx_seg {
@@ -212,7 +320,8 @@
 	u8			owner;
 	u8			reserved1[2];
 	u8			opcode;
-	u8			reserved2[3];
+	__be16			sched_prio;
+	u8			reserved2;
 	u8			size;
 	/*
 	 * [17]    VL15
@@ -241,6 +350,11 @@
 	__be32			header[0];
 };
 
+enum mlx4_wqe_bind_seg_flags2 {
+	MLX4_WQE_BIND_TYPE_2     = (1<<31),
+	MLX4_WQE_BIND_ZERO_BASED = (1<<30),
+};
+
 struct mlx4_wqe_bind_seg {
 	__be32			flags1;
 	__be32			flags2;
@@ -253,9 +367,9 @@
 enum {
 	MLX4_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
 	MLX4_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
-	MLX4_WQE_FMR_PERM_REMOTE_READ	= 1 << 29,
-	MLX4_WQE_FMR_PERM_REMOTE_WRITE	= 1 << 30,
-	MLX4_WQE_FMR_PERM_ATOMIC	= 1 << 31
+	MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ	= 1 << 29,
+	MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE	= 1 << 30,
+	MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC	= 1 << 31
 };
 
 struct mlx4_wqe_fmr_seg {
@@ -280,12 +394,10 @@
 };
 
 struct mlx4_wqe_local_inval_seg {
-	__be32			flags;
-	u32			reserved1;
+	u64			reserved1;
 	__be32			mem_key;
-	u32			reserved2[2];
-	__be32			guest_id;
-	__be64			pa;
+	u32			reserved2;
+	u64			reserved3[2];
 };
 
 struct mlx4_wqe_raddr_seg {
@@ -338,9 +450,6 @@
 	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
 }
 
-struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn);
 void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
-int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
-			int *base_qpn, int *cnt);
 
 #endif /* MLX4_QP_H */
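
Note: the control-segment bits above are OR'ed together when a send WQE is built: srcrb_flags carries the completion and solicited-event requests, and MLX4_WQE_CTRL_OWN (bit 31) is folded into the opcode word with a polarity that flips on each wrap of the send queue, handing the descriptor to hardware. A hedged sketch; ctrl points at the full control segment (whose owner_opcode field is not part of this diff), and opcode/head/wqe_cnt are hypothetical SQ bookkeeping values:

	ctrl->srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
					MLX4_WQE_CTRL_SOLICITED);
	wmb();		/* publish the descriptor body before ownership */
	ctrl->owner_opcode = cpu_to_be32(opcode) |
	    ((head & wqe_cnt) ? cpu_to_be32(MLX4_WQE_CTRL_OWN) : 0);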


Modified: trunk/sys/ofed/include/linux/mlx4/srq.h
===================================================================
--- trunk/sys/ofed/include/linux/mlx4/srq.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mlx4/srq.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -33,9 +33,6 @@
 #ifndef MLX4_SRQ_H
 #define MLX4_SRQ_H
 
-#include <linux/types.h>
-#include <linux/mlx4/device.h>
-
 struct mlx4_wqe_srq_next_seg {
 	u16			reserved1;
 	__be16			next_wqe_index;
@@ -42,13 +39,6 @@
 	u32			reserved2[3];
 };
 
-void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq);
-void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq);
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn);
 
-static inline struct mlx4_srq *__mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
-{
-	return radix_tree_lookup(&dev->srq_table_tree,
-				 srqn & (dev->caps.num_srqs - 1));
-}
-
 #endif /* MLX4_SRQ_H */


Modified: trunk/sys/ofed/include/linux/mm.h
===================================================================
--- trunk/sys/ofed/include/linux/mm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,6 +40,7 @@
 	vm_offset_t	vm_end;
 	vm_offset_t	vm_pgoff;
 	vm_paddr_t	vm_pfn;		/* PFN For mmap. */
+	vm_size_t	vm_len;		/* length for mmap. */
 	vm_memattr_t	vm_page_prot;
 };
 
@@ -77,6 +79,7 @@
 {
 	vma->vm_page_prot = prot;
 	vma->vm_pfn = pfn;
+	vma->vm_len = size;
 
 	return (0);
 }

Modified: trunk/sys/ofed/include/linux/module.h
===================================================================
--- trunk/sys/ofed/include/linux/module.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/module.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,13 +38,21 @@
 #define MODULE_AUTHOR(name)
 #define MODULE_DESCRIPTION(name)
 #define MODULE_LICENSE(name)
-#define	MODULE_VERSION(name)
 
+#ifndef MODULE_VERSION
+#define MODULE_VERSION(name)
+#endif
+
 #define	THIS_MODULE	((struct module *)0)
 
 #define	EXPORT_SYMBOL(name)
 #define	EXPORT_SYMBOL_GPL(name)
 
+/* OFED pre-module initialization */
+#define	SI_SUB_OFED_PREINIT	(SI_SUB_ROOT_CONF - 2)
+/* OFED default module initialization */
+#define	SI_SUB_OFED_MODINIT	(SI_SUB_ROOT_CONF - 1)
+
 #include <sys/linker.h>
 
 static inline void
@@ -68,17 +77,20 @@
 }
 
 #define	module_init(fn)							\
-	SYSINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+	SYSINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_FIRST, _module_run, (fn))
 
+#define	module_exit(fn)						\
+	SYSUNINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_SECOND, _module_run, (fn))
+
 /*
- * XXX This is a freebsdism designed to work around not having a module
- * load order resolver built in.
+ * The following two macros are a workaround for not having a module
+ * load and unload order resolver:
  */
 #define	module_init_order(fn, order)					\
-	SYSINIT(fn, SI_SUB_RUN_SCHEDULER, (order), _module_run, (fn))
+	SYSINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn))
 
-#define	module_exit(fn)						\
-	SYSUNINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+#define	module_exit_order(fn, order)				\
+	SYSUNINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn))
 
 #define	module_get(module)
 #define	module_put(module)
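
Note: with the macros above, a Linux-style module skeleton turns into a SYSINIT/SYSUNINIT pair that runs at the new SI_SUB_OFED_MODINIT phase (just ahead of SI_SUB_ROOT_CONF) instead of at scheduler start, giving OFED modules a deterministic load and unload order. A minimal sketch with hypothetical names:

	static int
	mymod_init(void)
	{
		/* register with a subsystem; 0 means success, as in Linux */
		return (0);
	}

	static void
	mymod_exit(void)
	{
		/* undo whatever mymod_init() set up */
	}

	module_init(mymod_init);	/* SYSINIT at SI_SUB_OFED_MODINIT */
	module_exit(mymod_exit);	/* SYSUNINIT, run at module unload */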

Modified: trunk/sys/ofed/include/linux/moduleparam.h
===================================================================
--- trunk/sys/ofed/include/linux/moduleparam.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/moduleparam.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,6 +26,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_LINUX_MODULEPARAM_H_
 #define	_LINUX_MODULEPARAM_H_
 
@@ -81,6 +83,8 @@
 	SYSINIT(name##_param_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST,	\
 	    param_sysinit, &__param_##name);
 
+#define module_param_string(name, string, len, perm)
 #define	module_param_named(name, var, type, mode)			\
 	module_param_call(name, param_set_##type, param_get_##type, &var, mode)
 
@@ -87,6 +91,9 @@
 #define	module_param(var, type, mode)					\
 	module_param_named(var, var, type, mode)
 
+#define module_param_array(var, type, addr_argc, mode)                  \
+        module_param_named(var, var, type, mode)
+
 #define	MODULE_PARM_DESC(name, desc)
 
 static inline int

Modified: trunk/sys/ofed/include/linux/mutex.h
===================================================================
--- trunk/sys/ofed/include/linux/mutex.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/mutex.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/net.h
===================================================================
--- trunk/sys/ofed/include/linux/net.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/net.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -44,23 +45,23 @@
 sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len,
     int peer)
 {
-	struct sockaddr **nam;
+	struct sockaddr *nam;
 	int error;
 
 	nam = NULL;
-	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
-		return (-ENOTCONN);
+	if (peer) {
+		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
+			return (-ENOTCONN);
 
-	if (peer)
-		error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, nam);
-	else
-		error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, nam);
+		error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &nam);
+	} else
+		error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &nam);
 	if (error)
 		return (-error);
-	*addr = **nam;
+	*addr = *nam;
 	*sockaddr_len = addr->sa_len;
 
-	free(*nam, M_SONAME);
+	free(nam, M_SONAME);
 	return (0);
 }
 

Modified: trunk/sys/ofed/include/linux/netdevice.h
===================================================================
--- trunk/sys/ofed/include/linux/netdevice.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/netdevice.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,7 +40,6 @@
 
 #include <linux/completion.h>
 #include <linux/device.h>
-#include <linux/ethtool.h>
 #include <linux/workqueue.h>
 #include <linux/net.h>
 #include <linux/notifier.h>
@@ -97,6 +97,24 @@
 	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
 }
 
+static inline void
+_handle_iflladdr_event(void *arg, struct ifnet *ifp)
+{
+	struct notifier_block *nb;
+
+	nb = arg;
+	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
+}
+
+static inline void
+_handle_ifaddr_event(void *arg, struct ifnet *ifp)
+{
+	struct notifier_block *nb;
+
+	nb = arg;
+	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
+}
+
 static inline int
 register_netdevice_notifier(struct notifier_block *nb)
 {
@@ -107,10 +125,22 @@
 	    ifnet_arrival_event, _handle_ifnet_arrival_event, nb, 0);
 	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
 	    ifnet_departure_event, _handle_ifnet_departure_event, nb, 0);
+	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
+	    iflladdr_event, _handle_iflladdr_event, nb, 0);
+
 	return (0);
 }
 
 static inline int
+register_inetaddr_notifier(struct notifier_block *nb)
+{
+
+        nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
+            ifaddr_event, _handle_ifaddr_event, nb, 0);
+        return (0);
+}
+
+static inline int
 unregister_netdevice_notifier(struct notifier_block *nb)
 {
 
@@ -118,9 +148,23 @@
         EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]);
         EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    nb->tags[NETDEV_UNREGISTER]);
+        EVENTHANDLER_DEREGISTER(iflladdr_event,
+            nb->tags[NETDEV_CHANGEADDR]);
+
 	return (0);
 }
 
+static inline int
+unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+
+        EVENTHANDLER_DEREGISTER(ifaddr_event,
+            nb->tags[NETDEV_CHANGEIFADDR]);
+
+        return (0);
+}
+
+
 #define	rtnl_lock()
 #define	rtnl_unlock()
 

Modified: trunk/sys/ofed/include/linux/notifier.h
===================================================================
--- trunk/sys/ofed/include/linux/notifier.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/notifier.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,7 +36,7 @@
  * Max number of FreeBSD events to map to Linux events per notify type.
  */
 #define	NOTIFY_DONE	0
-#define	_NOTIFY_COUNT	5
+#define	_NOTIFY_COUNT	7
 
 struct notifier_block {
 	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
@@ -49,6 +50,8 @@
 #define	NETDEV_DOWN		0x0002
 #define	NETDEV_REGISTER		0x0003
 #define	NETDEV_UNREGISTER	0x0004
+#define	NETDEV_CHANGEADDR       0x0005
+#define	NETDEV_CHANGEIFADDR     0x0006
 
 
 #endif	/* _LINUX_NOTIFIER_H_ */

Modified: trunk/sys/ofed/include/linux/page.h
===================================================================
--- trunk/sys/ofed/include/linux/page.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/page.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/pci.h
===================================================================
--- trunk/sys/ofed/include/linux/pci.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/pci.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -43,7 +44,6 @@
 
 #include <machine/resource.h>
 
-#include <linux/init.h>
 #include <linux/list.h>
 #include <linux/dmapool.h>
 #include <linux/dma-mapping.h>
@@ -72,19 +72,49 @@
 #define	PCI_DEVICE_ID_MELLANOX_SINAI_OLD	0x5e8c
 #define	PCI_DEVICE_ID_MELLANOX_SINAI		0x6274
 
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn)         (((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn)         ((devfn) & 0x07)
 
-#define PCI_VDEVICE(vendor, device)					\
-	    PCI_VENDOR_ID_##vendor, (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
-#define	PCI_DEVICE(vendor, device)					\
-	    (vendor), (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
+#define PCI_VDEVICE(_vendor, _device)					\
+	    .vendor = PCI_VENDOR_ID_##_vendor, .device = (_device),	\
+	    .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
+#define	PCI_DEVICE(_vendor, _device)					\
+	    .vendor = (_vendor), .device = (_device),			\
+	    .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
 
 #define	to_pci_dev(n)	container_of(n, struct pci_dev, dev)
 
-#define	PCI_VENDOR_ID	PCIR_DEVVENDOR
-#define	PCI_COMMAND	PCIR_COMMAND
-#define	PCI_EXP_DEVCTL	PCIR_EXPRESS_DEVICE_CTL
-#define	PCI_EXP_LNKCTL	PCIR_EXPRESS_LINK_CTL
+#define	PCI_VENDOR_ID		PCIR_DEVVENDOR
+#define	PCI_COMMAND		PCIR_COMMAND
+#define	PCI_EXP_DEVCTL		PCIER_DEVICE_CTL		/* Device Control */
+#define	PCI_EXP_LNKCTL		PCIER_LINK_CTL			/* Link Control */
+#define	PCI_EXP_FLAGS_TYPE	PCIEM_FLAGS_TYPE		/* Device/Port type */
+#define	PCI_EXP_DEVCAP		PCIER_DEVICE_CAP		/* Device capabilities */
+#define	PCI_EXP_DEVSTA		PCIER_DEVICE_STA		/* Device Status */
+#define	PCI_EXP_LNKCAP		PCIER_LINK_CAP			/* Link Capabilities */
+#define	PCI_EXP_LNKSTA		PCIER_LINK_STA			/* Link Status */
+#define	PCI_EXP_SLTCAP		PCIER_SLOT_CAP			/* Slot Capabilities */
+#define	PCI_EXP_SLTCTL		PCIER_SLOT_CTL			/* Slot Control */
+#define	PCI_EXP_SLTSTA		PCIER_SLOT_STA			/* Slot Status */
+#define	PCI_EXP_RTCTL		PCIER_ROOT_CTL			/* Root Control */
+#define	PCI_EXP_RTCAP		PCIER_ROOT_CAP			/* Root Capabilities */
+#define	PCI_EXP_RTSTA		PCIER_ROOT_STA			/* Root Status */
+#define	PCI_EXP_DEVCAP2		PCIER_DEVICE_CAP2		/* Device Capabilities 2 */
+#define	PCI_EXP_DEVCTL2		PCIER_DEVICE_CTL2		/* Device Control 2 */
+#define	PCI_EXP_LNKCAP2		PCIER_LINK_CAP2			/* Link Capabilities 2 */
+#define	PCI_EXP_LNKCTL2		PCIER_LINK_CTL2			/* Link Control 2 */
+#define	PCI_EXP_LNKSTA2		PCIER_LINK_STA2			/* Link Status 2 */
+#define	PCI_EXP_FLAGS		PCIER_FLAGS			/* Capabilities register */
+#define	PCI_EXP_FLAGS_VERS	PCIEM_FLAGS_VERSION		/* Capability version */
+#define	PCI_EXP_TYPE_ROOT_PORT	PCIEM_TYPE_ROOT_PORT		/* Root Port */
+#define	PCI_EXP_TYPE_ENDPOINT	PCIEM_TYPE_ENDPOINT		/* Express Endpoint */
+#define	PCI_EXP_TYPE_LEG_END	PCIEM_TYPE_LEGACY_ENDPOINT	/* Legacy Endpoint */
+#define	PCI_EXP_TYPE_DOWNSTREAM PCIEM_TYPE_DOWNSTREAM_PORT	/* Downstream Port */
+#define	PCI_EXP_FLAGS_SLOT	PCIEM_FLAGS_SLOT		/* Slot implemented */
+#define	PCI_EXP_TYPE_RC_EC	PCIEM_TYPE_ROOT_EC		/* Root Complex Event Collector */
 
+
 #define	IORESOURCE_MEM	SYS_RES_MEMORY
 #define	IORESOURCE_IO	SYS_RES_IOPORT
 #define	IORESOURCE_IRQ	SYS_RES_IRQ
@@ -91,14 +121,18 @@
 
 struct pci_dev;
 
+
 struct pci_driver {
 	struct list_head		links;
 	char				*name;
-	struct pci_device_id		*id_table;
+	const struct pci_device_id		*id_table;
 	int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id);
 	void (*remove)(struct pci_dev *dev);
+        int  (*suspend) (struct pci_dev *dev, pm_message_t state);      /* Device suspended */
+        int  (*resume) (struct pci_dev *dev);                   /* Device woken up */
 	driver_t			driver;
 	devclass_t			bsdclass;
+        const struct pci_error_handlers       *err_handler;
 };
 
 extern struct list_head pci_drivers;
@@ -115,6 +149,8 @@
 	uint16_t		device;
 	uint16_t		vendor;
 	unsigned int		irq;
+	unsigned int		devfn;
+	u8			revision;
 };
 
 static inline struct resource_list_entry *
@@ -234,6 +270,14 @@
 }
 
 static inline int
+pci_clear_master(struct pci_dev *pdev)
+{
+
+	pci_disable_busmaster(pdev->dev.bsddev);
+	return (0);
+}
+
+static inline int
 pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
 {
 	int rid;
@@ -294,6 +338,7 @@
 #define	PCI_CAP_ID_EXP	PCIY_EXPRESS
 #define	PCI_CAP_ID_PCIX	PCIY_PCIX
 
+
 static inline int
 pci_find_capability(struct pci_dev *pdev, int capid)
 {
@@ -304,6 +349,26 @@
 	return (reg);
 }
 
+
+/**
+ * pci_pcie_cap - get the saved PCIe capability offset
+ * @dev: PCI device
+ *
+ * The PCIe capability offset is calculated at PCI device initialization
+ * time and saved in the data structure. This function returns the saved
+ * offset. Using it instead of pci_find_capability() avoids an unnecessary
+ * search of the PCI configuration space. If you need to calculate the
+ * PCIe capability offset from a raw device for some reason, use
+ * pci_find_capability() instead.
+ */
+static inline int pci_pcie_cap(struct pci_dev *dev)
+{
+        return pci_find_capability(dev, PCI_CAP_ID_EXP);
+}
+
+
 static inline int
 pci_read_config_byte(struct pci_dev *pdev, int where, u8 *val)
 {
@@ -353,9 +418,9 @@
 }
 
 static struct pci_driver *
-linux_pci_find(device_t dev, struct pci_device_id **idp)
+linux_pci_find(device_t dev, const struct pci_device_id **idp)
 {
-	struct pci_device_id *id;
+	const struct pci_device_id *id;
 	struct pci_driver *pdrv;
 	uint16_t vendor;
 	uint16_t device;
@@ -380,7 +445,7 @@
 static inline int
 linux_pci_probe(device_t dev)
 {
-	struct pci_device_id *id;
+	const struct pci_device_id *id;
 	struct pci_driver *pdrv;
 
 	if ((pdrv = linux_pci_find(dev, &id)) == NULL)
@@ -397,7 +462,7 @@
 	struct resource_list_entry *rle;
 	struct pci_dev *pdev;
 	struct pci_driver *pdrv;
-	struct pci_device_id *id;
+	const struct pci_device_id *id;
 	int error;
 
 	pdrv = linux_pci_find(dev, &id);
@@ -519,6 +584,14 @@
 	avail = nreq;
 	if ((error = -pci_alloc_msix(pdev->dev.bsddev, &avail)) != 0)
 		return error;
+	/*
+	 * Handle the case where pci_alloc_msix() may allocate fewer
+	 * interrupts than requested and return without an error:
+	 */
+	if (avail < nreq) {
+		pci_release_msi(pdev->dev.bsddev);
+		return avail;
+	}
 	rle = _pci_get_rle(pdev, SYS_RES_IRQ, 1);
 	pdev->dev.msix = rle->start;
 	pdev->dev.msix_max = rle->start + avail;
@@ -527,6 +600,54 @@
 	return (0);
 }
 
+#define	pci_enable_msix_range	linux_pci_enable_msix_range
+static inline int
+pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
+    int minvec, int maxvec)
+{
+	int nvec = maxvec;
+	int rc;
+
+	if (maxvec < minvec)
+		return (-ERANGE);
+
+	do {
+		rc = pci_enable_msix(dev, entries, nvec);
+		if (rc < 0) {
+			return (rc);
+		} else if (rc > 0) {
+			if (rc < minvec)
+				return (-ENOSPC);
+			nvec = rc;
+		}
+	} while (rc);
+	return (nvec);
+}
+
+static inline int pci_channel_offline(struct pci_dev *pdev)
+{
+        return false;
+}
+
+static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+        return -ENODEV;
+}
+static inline void pci_disable_sriov(struct pci_dev *dev)
+{
+}
+
+/**
+ * DEFINE_PCI_DEVICE_TABLE - macro used to describe a pci device table
+ * @_table: device table name
+ *
+ * This macro is used to create a struct pci_device_id array (a device table)
+ * in a generic manner.
+ */
+#define DEFINE_PCI_DEVICE_TABLE(_table) \
+	const struct pci_device_id _table[] __devinitdata
+
+
 /* XXX This should not be necessary. */
 #define	pcix_set_mmrbc(d, v)	0
 #define	pcix_get_max_mmrbc(d)	0
@@ -576,5 +697,173 @@
 #define	pci_unmap_len		dma_unmap_len
 #define	pci_unmap_len_set	dma_unmap_len_set
 
+typedef unsigned int __bitwise pci_channel_state_t;
+typedef unsigned int __bitwise pci_ers_result_t;
 
+enum pci_channel_state {
+        /* I/O channel is in normal state */
+        pci_channel_io_normal = (__force pci_channel_state_t) 1,
+
+        /* I/O to channel is blocked */
+        pci_channel_io_frozen = (__force pci_channel_state_t) 2,
+
+        /* PCI card is dead */
+        pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
+};
+
+enum pci_ers_result {
+        /* no result/none/not supported in device driver */
+        PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
+
+        /* Device driver can recover without slot reset */
+        PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
+
+        /* Device driver wants slot to be reset. */
+        PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
+
+        /* Device has completely failed, is unrecoverable */
+        PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
+
+        /* Device driver is fully recovered and operational */
+        PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
+};
+
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers {
+        /* PCI bus error detected on this device */
+        pci_ers_result_t (*error_detected)(struct pci_dev *dev,
+                        enum pci_channel_state error);
+
+        /* MMIO has been re-enabled, but not DMA */
+        pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
+
+        /* PCI Express link has been reset */
+        pci_ers_result_t (*link_reset)(struct pci_dev *dev);
+
+        /* PCI slot has been reset */
+        pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
+
+        /* Device driver may resume normal operations */
+        void (*resume)(struct pci_dev *dev);
+};
+
+/* FreeBSD does not support SR-IOV - yet */
+static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
+{
+        return dev;
+}
+
+static inline bool pci_is_pcie(struct pci_dev *dev)
+{
+        return !!pci_pcie_cap(dev);
+}
+
+static inline u16 pcie_flags_reg(struct pci_dev *dev)
+{
+        int pos;
+        u16 reg16;
+
+        pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+        if (!pos)
+                return 0;
+
+        pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &reg16);
+
+        return reg16;
+}
+
+
+static inline int pci_pcie_type(struct pci_dev *dev)
+{
+        return (pcie_flags_reg(dev) & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
+static inline int pcie_cap_version(struct pci_dev *dev)
+{
+        return pcie_flags_reg(dev) & PCI_EXP_FLAGS_VERS;
+}
+
+static inline bool pcie_cap_has_lnkctl(struct pci_dev *dev)
+{
+        int type = pci_pcie_type(dev);
+
+        return pcie_cap_version(dev) > 1 ||
+               type == PCI_EXP_TYPE_ROOT_PORT ||
+               type == PCI_EXP_TYPE_ENDPOINT ||
+               type == PCI_EXP_TYPE_LEG_END;
+}
+
+static inline bool pcie_cap_has_devctl(const struct pci_dev *dev)
+{
+        return true;
+}
+
+static inline bool pcie_cap_has_sltctl(struct pci_dev *dev)
+{
+        int type = pci_pcie_type(dev);
+
+        return pcie_cap_version(dev) > 1 ||
+               type == PCI_EXP_TYPE_ROOT_PORT ||
+               (type == PCI_EXP_TYPE_DOWNSTREAM &&
+                pcie_flags_reg(dev) & PCI_EXP_FLAGS_SLOT);
+}
+
+static inline bool pcie_cap_has_rtctl(struct pci_dev *dev)
+{
+        int type = pci_pcie_type(dev);
+
+        return pcie_cap_version(dev) > 1 ||
+               type == PCI_EXP_TYPE_ROOT_PORT ||
+               type == PCI_EXP_TYPE_RC_EC;
+}
+
+static inline bool pcie_capability_reg_implemented(struct pci_dev *dev, int pos)
+{
+        if (!pci_is_pcie(dev))
+                return false;
+
+        switch (pos) {
+        case PCI_EXP_FLAGS_TYPE:
+                return true;
+        case PCI_EXP_DEVCAP:
+        case PCI_EXP_DEVCTL:
+        case PCI_EXP_DEVSTA:
+                return pcie_cap_has_devctl(dev);
+        case PCI_EXP_LNKCAP:
+        case PCI_EXP_LNKCTL:
+        case PCI_EXP_LNKSTA:
+                return pcie_cap_has_lnkctl(dev);
+        case PCI_EXP_SLTCAP:
+        case PCI_EXP_SLTCTL:
+        case PCI_EXP_SLTSTA:
+                return pcie_cap_has_sltctl(dev);
+        case PCI_EXP_RTCTL:
+        case PCI_EXP_RTCAP:
+        case PCI_EXP_RTSTA:
+                return pcie_cap_has_rtctl(dev);
+        case PCI_EXP_DEVCAP2:
+        case PCI_EXP_DEVCTL2:
+        case PCI_EXP_LNKCAP2:
+        case PCI_EXP_LNKCTL2:
+        case PCI_EXP_LNKSTA2:
+                return pcie_cap_version(dev) > 1;
+        default:
+                return false;
+        }
+}
+
+ 
+static inline int pcie_capability_write_word(struct pci_dev *dev, int pos, u16 val)
+{
+        if (pos & 1)
+                return -EINVAL;
+
+        if (!pcie_capability_reg_implemented(dev, pos))
+                return 0;
+
+        return pci_write_config_word(dev, pci_pcie_cap(dev) + pos, val);
+}
+
+
 #endif	/* _LINUX_PCI_H_ */
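
Note: pci_enable_msix_range() above wraps the existing pci_enable_msix() in a retry loop: it starts at maxvec, accepts any partial grant of at least minvec, and returns the number of vectors actually enabled or a negative errno. A hedged usage sketch, assuming a Linux-style struct msix_entry with an .entry index and a hypothetical pdev:

	struct msix_entry entries[16];
	int i, nvec;

	for (i = 0; i < 16; i++)
		entries[i].entry = i;

	/* ask for 16 vectors, but accept as few as 2 */
	nvec = pci_enable_msix_range(pdev, entries, 2, 16);
	if (nvec < 0)
		return (nvec);	/* -ERANGE, -ENOSPC or an allocation error */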

Modified: trunk/sys/ofed/include/linux/poll.h
===================================================================
--- trunk/sys/ofed/include/linux/poll.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/poll.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Added: trunk/sys/ofed/include/linux/printk.h
===================================================================
--- trunk/sys/ofed/include/linux/printk.h	                        (rev 0)
+++ trunk/sys/ofed/include/linux/printk.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FBSD_PRINTK_H_
+#define	_FBSD_PRINTK_H_
+
+/* GID printing macros */
+#define	GID_PRINT_FMT			"%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x"
+#define	GID_PRINT_ARGS(gid_raw)		htons(((u16 *)gid_raw)[0]), htons(((u16 *)gid_raw)[1]),\
+					htons(((u16 *)gid_raw)[2]), htons(((u16 *)gid_raw)[3]),\
+					htons(((u16 *)gid_raw)[4]), htons(((u16 *)gid_raw)[5]),\
+					htons(((u16 *)gid_raw)[6]), htons(((u16 *)gid_raw)[7])
+
+#endif					/* _FBSD_PRINTK_H */
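
Note: GID_PRINT_FMT and GID_PRINT_ARGS render a raw 128-bit InfiniBand GID as eight colon-separated 16-bit groups, byte-swapping each group so it prints in network order. A quick sketch; the gid buffer is a hypothetical value obtained elsewhere (e.g. from an ib_query_gid() call):

	u8 gid[16] = { 0 };	/* raw GID bytes */

	printf("local GID: " GID_PRINT_FMT "\n", GID_PRINT_ARGS(gid));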


Modified: trunk/sys/ofed/include/linux/radix-tree.h
===================================================================
--- trunk/sys/ofed/include/linux/radix-tree.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/radix-tree.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/random.h
===================================================================
--- trunk/sys/ofed/include/linux/random.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/random.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/rbtree.h
===================================================================
--- trunk/sys/ofed/include/linux/rbtree.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/rbtree.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/rwlock.h
===================================================================
--- trunk/sys/ofed/include/linux/rwlock.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/rwlock.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/rwsem.h
===================================================================
--- trunk/sys/ofed/include/linux/rwsem.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/rwsem.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/scatterlist.h
===================================================================
--- trunk/sys/ofed/include/linux/scatterlist.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/scatterlist.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,12 +26,27 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef	_LINUX_SCATTERLIST_H_
 #define	_LINUX_SCATTERLIST_H_
 
-#include <linux/string.h>
 #include <linux/page.h>
+#include <linux/slab.h>
 
+/*
+ * SG table design.
+ *
+ * If flags bit 0 is set, then the sg field contains a pointer to the next sg
+ * table list; otherwise the next entry is at sg + 1. This can be determined
+ * with the sg_is_chain() function.
+ *
+ * If flags bit 1 is set, then this sg entry is the last element in a list;
+ * this can be determined with the sg_is_last() function.
+ *
+ * See sg_next().
+ *
+ */
+
 struct scatterlist {
 	union {
 		struct page		*page;
@@ -42,6 +58,18 @@
 	uint32_t	flags;
 };
 
+struct sg_table {
+	struct scatterlist *sgl;        /* the list */
+	unsigned int nents;             /* number of mapped entries */
+	unsigned int orig_nents;        /* original size of list */
+};
+
+/*
+ * Maximum number of entries that will be allocated in one piece, if
+ * a list larger than this is required then chaining will be utilized.
+ */
+#define SG_MAX_SINGLE_ALLOC             (PAGE_SIZE / sizeof(struct scatterlist))
+
 #define	sg_dma_address(sg)	(sg)->address
 #define	sg_dma_len(sg)		(sg)->length
 #define	sg_page(sg)		(sg)->sl_un.page
@@ -92,6 +120,212 @@
 	return sg_page(sg)->phys_addr + sg->offset;
 }
 
+/**
+ * sg_chain - Chain two sglists together
+ * @prv:        First scatterlist
+ * @prv_nents:  Number of entries in prv
+ * @sgl:        Second scatterlist
+ *
+ * Description:
+ *   Links @prv@ and @sgl@ together, to form a longer scatterlist.
+ *
+ **/
+static inline void
+sg_chain(struct scatterlist *prv, unsigned int prv_nents,
+					struct scatterlist *sgl)
+{
+	/*
+	 * offset and length are unused for chain entry.  Clear them.
+	 */
+	struct scatterlist *sg = &prv[prv_nents - 1];
+
+	sg->offset = 0;
+	sg->length = 0;
+
+	/*
+	 * Indicate a link pointer, and set the link to the second list.
+	 */
+	sg->flags = SG_CHAIN;
+	sg->sl_un.sg = sgl;
+}
+
+/**
+ * sg_mark_end - Mark the end of the scatterlist
+ * @sg:          SG entry
+ *
+ * Description:
+ *   Marks the passed in sg entry as the termination point for the sg
+ *   table. A call to sg_next() on this entry will return NULL.
+ *
+ **/
+static inline void sg_mark_end(struct scatterlist *sg)
+{
+        sg->flags = SG_END;
+}
+
+/**
+ * __sg_free_table - Free a previously mapped sg table
+ * @table:      The sg table header to use
+ * @max_ents:   The maximum number of entries per single scatterlist
+ *
+ *  Description:
+ *    Free an sg table previously allocated and setup with
+ *    __sg_alloc_table().  The @max_ents value must be identical to
+ *    that previously used with __sg_alloc_table().
+ *
+ **/
+static inline void
+__sg_free_table(struct sg_table *table, unsigned int max_ents)
+{
+	struct scatterlist *sgl, *next;
+
+	if (unlikely(!table->sgl))
+		return;
+
+	sgl = table->sgl;
+	while (table->orig_nents) {
+		unsigned int alloc_size = table->orig_nents;
+		unsigned int sg_size;
+
+		/*
+		 * If we have more than max_ents segments left,
+		 * then assign 'next' to the sg table after the current one.
+		 * sg_size is then one less than alloc size, since the last
+		 * element is the chain pointer.
+		 */
+		if (alloc_size > max_ents) {
+			next = sgl[max_ents - 1].sl_un.sg;
+			alloc_size = max_ents;
+			sg_size = alloc_size - 1;
+		} else {
+			sg_size = alloc_size;
+			next = NULL;
+		}
+
+		table->orig_nents -= sg_size;
+		kfree(sgl);
+		sgl = next;
+	}
+
+	table->sgl = NULL;
+}
+
+/**
+ * sg_free_table - Free a previously allocated sg table
+ * @table:      The mapped sg table header
+ *
+ **/
+static inline void
+sg_free_table(struct sg_table *table)
+{
+	__sg_free_table(table, SG_MAX_SINGLE_ALLOC);
+}
+
+/**
+ * __sg_alloc_table - Allocate and initialize an sg table with given allocator
+ * @table:      The sg table header to use
+ * @nents:      Number of entries in sg list
+ * @max_ents:   The maximum number of entries the allocator returns per call
+ * @gfp_mask:   GFP allocation mask
+ *
+ * Description:
+ *   This function allocates a @table that is @nents entries long. The
+ *   allocator is defined to return scatterlist chunks of maximum size
+ *   @max_ents. Thus if @nents is bigger than @max_ents, the scatterlists
+ *   will be chained in units of @max_ents.
+ *
+ * Notes:
+ *   If this function returns non-0 (e.g. on failure), the caller must call
+ *   __sg_free_table() to clean up any leftover allocations.
+ *
+ **/
+static inline int
+__sg_alloc_table(struct sg_table *table, unsigned int nents,
+		unsigned int max_ents, gfp_t gfp_mask)
+{
+	struct scatterlist *sg, *prv;
+	unsigned int left;
+
+	memset(table, 0, sizeof(*table));
+
+	if (nents == 0)
+		return -EINVAL;
+	left = nents;
+	prv = NULL;
+	do {
+		unsigned int sg_size, alloc_size = left;
+
+		if (alloc_size > max_ents) {
+			alloc_size = max_ents;
+			sg_size = alloc_size - 1;
+		} else
+			sg_size = alloc_size;
+
+		left -= sg_size;
+
+		sg = kmalloc(alloc_size * sizeof(struct scatterlist), gfp_mask);
+		if (unlikely(!sg)) {
+			/*
+			 * Adjust entry count to reflect that the last
+			 * entry of the previous table won't be used for
+			 * linkage.  Without this, __sg_free_table() may
+			 * get confused.
+			 */
+			if (prv)
+				table->nents = ++table->orig_nents;
+
+			return -ENOMEM;
+		}
+
+		sg_init_table(sg, alloc_size);
+		table->nents = table->orig_nents += sg_size;
+
+		/*
+		 * If this is the first mapping, assign the sg table header.
+		 * If this is not the first mapping, chain previous part.
+		 */
+		if (prv)
+			sg_chain(prv, max_ents, sg);
+		else
+			table->sgl = sg;
+
+		/*
+		 * If no more entries after this one, mark the end.
+		 */
+		if (!left)
+			sg_mark_end(&sg[sg_size - 1]);
+
+		prv = sg;
+	} while (left);
+
+	return 0;
+}
+
+/**
+ * sg_alloc_table - Allocate and initialize an sg table
+ * @table:      The sg table header to use
+ * @nents:      Number of entries in sg list
+ * @gfp_mask:   GFP allocation mask
+ *
+ *  Description:
+ *    Allocate and initialize an sg table.  If @nents@ is larger than
+ *    SG_MAX_SINGLE_ALLOC, a chained sg table will be set up.
+ *
+ **/
+
+static inline int
+sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
+{
+	int ret;
+
+	ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
+		gfp_mask);
+	if (unlikely(ret))
+		__sg_free_table(table, SG_MAX_SINGLE_ALLOC);
+
+	return ret;
+}
+
 #define	for_each_sg(sglist, sg, sgmax, _itr)				\
 	for (_itr = 0, sg = (sglist); _itr < (sgmax); _itr++, sg = sg_next(sg))
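
For context, the chained-scatterlist helpers added above compose as follows.
A minimal usage sketch, assuming a kernel consumer of this compat header and
that the shim also provides sg_set_page() as the Linux API does;
build_dma_sg() is a hypothetical caller:

    static int
    build_dma_sg(struct page **pages, unsigned int npages, gfp_t gfp)
    {
            struct sg_table table;
            struct scatterlist *sg;
            unsigned int i;
            int error;

            /* Chains transparently once npages exceeds SG_MAX_SINGLE_ALLOC. */
            error = sg_alloc_table(&table, npages, gfp);
            if (error)
                    return (error);

            for_each_sg(table.sgl, sg, table.nents, i)
                    sg_set_page(sg, pages[i], PAGE_SIZE, 0);

            /* ... hand table.sgl to the DMA engine here ... */

            sg_free_table(&table);
            return (0);
    }

On failure, __sg_alloc_table() leaves partial allocations behind by design;
sg_alloc_table() above cleans them up itself via __sg_free_table(), which is
why callers can simply return on error.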
 

Modified: trunk/sys/ofed/include/linux/sched.h
===================================================================
--- trunk/sys/ofed/include/linux/sched.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/sched.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/semaphore.h
===================================================================
--- trunk/sys/ofed/include/linux/semaphore.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/semaphore.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/slab.h
===================================================================
--- trunk/sys/ofed/include/linux/slab.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/slab.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,11 +39,18 @@
 
 MALLOC_DECLARE(M_KMALLOC);
 
-#define	kmalloc(size, flags)	malloc((size), M_KMALLOC, (flags))
-#define	kzalloc(size, flags)	kmalloc((size), (flags) | M_ZERO)
-#define	kfree(ptr)		free(__DECONST(void *, (ptr)), M_KMALLOC)
-#define	krealloc(ptr, size, flags) realloc((ptr), (size), M_KMALLOC, (flags))
-#define	kcalloc(n, size, flags)	kmalloc((n) * (size), flags | M_ZERO)
+#define	kmalloc(size, flags)		malloc((size), M_KMALLOC, (flags))
+#define	kvmalloc(size)			kmalloc((size), 0)
+#define	kzalloc(size, flags)		kmalloc((size), (flags) | M_ZERO)
+#define	kzalloc_node(size, flags, node)	kzalloc(size, flags)
+#define	kfree(ptr)			free(__DECONST(void *, (ptr)), M_KMALLOC)
+#define	krealloc(ptr, size, flags)	realloc((ptr), (size), M_KMALLOC, (flags))
+#define	kcalloc(n, size, flags)		kmalloc((n) * (size), (flags) | M_ZERO)
+#define	vzalloc(size)			kzalloc((size), GFP_KERNEL | __GFP_NOWARN)
+#define	vfree(arg)			kfree(arg)
+#define	kvfree(arg)			kfree(arg)
+#define	vmalloc(size)			kmalloc((size), GFP_KERNEL)
+#define	vmalloc_node(size, node)	kmalloc((size), GFP_KERNEL)
 
 struct kmem_cache {
 	uma_zone_t	cache_zone;
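
The new vmalloc family simply aliases kmalloc() here, since malloc(9) already
returns mapped kernel memory.  A small sketch of what the Linux-style calls
expand to under this shim (alloc_counters() is hypothetical; GFP_KERNEL maps
to M_WAITOK in this compat layer, so these allocations sleep rather than
fail):

    static int *
    alloc_counters(unsigned int n)
    {
            /* Expands to malloc((n) * sizeof(int), M_KMALLOC, M_WAITOK | M_ZERO). */
            return (kcalloc(n, sizeof(int), GFP_KERNEL));
    }

Note that vmalloc() therefore cannot provide large virtually contiguous areas
beyond what malloc(9) can satisfy, which callers ported from Linux may
implicitly rely on.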

Modified: trunk/sys/ofed/include/linux/socket.h
===================================================================
--- trunk/sys/ofed/include/linux/socket.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/socket.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/spinlock.h
===================================================================
--- trunk/sys/ofed/include/linux/spinlock.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/spinlock.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,7 +36,6 @@
 
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-#include <linux/lockdep.h>
 #include <linux/rwlock.h>
 
 typedef struct {

Modified: trunk/sys/ofed/include/linux/string.h
===================================================================
--- trunk/sys/ofed/include/linux/string.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/string.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,6 +36,9 @@
 
 #include <sys/libkern.h>
 
+#define strnicmp strncasecmp
+
+
 static inline void *
 kmemdup(const void *src, size_t len, gfp_t gfp)
 {

Modified: trunk/sys/ofed/include/linux/sysfs.h
===================================================================
--- trunk/sys/ofed/include/linux/sysfs.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/sysfs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -75,39 +76,45 @@
 	struct kobject *kobj;
 	struct attribute *attr;
 	const struct sysfs_ops *ops;
-	void *buf;
+	char *buf;
 	int error;
 	ssize_t len;
 
 	kobj = arg1;
 	attr = (struct attribute *)arg2;
-	buf = (void *)get_zeroed_page(GFP_KERNEL);
-	len = 1;	/* Copy out a NULL byte at least. */
 	if (kobj->ktype == NULL || kobj->ktype->sysfs_ops == NULL)
 		return (ENODEV);
-	ops = kobj->ktype->sysfs_ops;
+	buf = (char *)get_zeroed_page(GFP_KERNEL);
 	if (buf == NULL)
 		return (ENOMEM);
+	ops = kobj->ktype->sysfs_ops;
 	if (ops->show) {
 		len = ops->show(kobj, attr, buf);
 		/*
-		 * It's valid not to have a 'show' so we just return 1 byte
-		 * of NULL.
+		 * It's valid to not have a 'show' so just return an
+		 * empty string.
 	 	 */
 		if (len < 0) {
 			error = -len;
-			len = 1;
 			if (error != EIO)
 				goto out;
+			buf[0] = '\0';
+		} else if (len) {
+			len--;
+			if (len >= PAGE_SIZE)
+				len = PAGE_SIZE - 1;
+			/* Trim trailing newline. */
+			buf[len] = '\0';
 		}
 	}
-	error = SYSCTL_OUT(req, buf, len);
-	if (error || !req->newptr || ops->store == NULL)
+
+	/* Leave one trailing byte to append a newline. */
+	error = sysctl_handle_string(oidp, buf, PAGE_SIZE - 1, req);
+	if (error != 0 || req->newptr == NULL || ops->store == NULL)
 		goto out;
-	error = SYSCTL_IN(req, buf, PAGE_SIZE);
-	if (error)
-		goto out;
-	len = ops->store(kobj, attr, buf, req->newlen);
+	len = strlcat(buf, "\n", PAGE_SIZE);
+	KASSERT(len < PAGE_SIZE, ("new attribute truncated"));
+	len = ops->store(kobj, attr, buf, len);
 	if (len < 0)
 		error = -len;
 out:
@@ -179,4 +186,6 @@
 	sysctl_remove_oid(kobj->oidp, 1, 1);
 }
 
+#define sysfs_attr_init(attr) do {} while(0)
+
 #endif	/* _LINUX_SYSFS_H_ */
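
The rework above routes Linux show()/store() attributes through
sysctl_handle_string(): the trailing newline Linux callbacks emit is trimmed
on output and re-appended before store(), so attribute implementations keep
their Linux conventions unchanged.  A sketch of an attribute pair this
bridges (foo_show/foo_store are hypothetical):

    static ssize_t
    foo_show(struct kobject *kobj, struct attribute *attr, char *buf)
    {
            /* Linux convention: value plus '\n'; the handler trims it. */
            return (snprintf(buf, PAGE_SIZE, "%d\n", 42));
    }

    static ssize_t
    foo_store(struct kobject *kobj, struct attribute *attr, const char *buf,
        size_t count)
    {
            /* buf arrives newline-terminated, as Linux stores expect. */
            return (count);
    }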

Modified: trunk/sys/ofed/include/linux/timer.h
===================================================================
--- trunk/sys/ofed/include/linux/timer.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/timer.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,7 +27,7 @@
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #ifndef _LINUX_TIMER_H_
-#define _LINUX_TIMER_H_
+#define	_LINUX_TIMER_H_
 
 #include <linux/types.h>
 
@@ -35,22 +36,14 @@
 #include <sys/callout.h>
 
 struct timer_list {
-	struct callout	timer_callout;
-	void		(*function)(unsigned long);
-        unsigned long	data;
+	struct callout timer_callout;
+	void    (*function) (unsigned long);
+	unsigned long data;
+	unsigned long expires;
 };
 
-#define	expires	timer_callout.c_time
+extern unsigned long linux_timer_hz_mask;
 
-static inline void
-_timer_fn(void *context)
-{
-	struct timer_list *timer;
-
-	timer = context;
-	timer->function(timer->data);
-}
-
 #define	setup_timer(timer, func, dat)					\
 do {									\
 	(timer)->function = (func);					\
@@ -65,23 +58,15 @@
 	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);		\
 } while (0)
 
-#define	mod_timer(timer, expire)					\
-	callout_reset(&(timer)->timer_callout, (expire) - jiffies,	\
-	    _timer_fn, (timer))
+extern void mod_timer(struct timer_list *, unsigned long);
+extern void add_timer(struct timer_list *);
 
-#define	add_timer(timer)						\
-	callout_reset(&(timer)->timer_callout,				\
-	    (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
-
 #define	del_timer(timer)	callout_stop(&(timer)->timer_callout)
 #define	del_timer_sync(timer)	callout_drain(&(timer)->timer_callout)
-
 #define	timer_pending(timer)	callout_pending(&(timer)->timer_callout)
+#define	round_jiffies(j) \
+	((unsigned long)(((j) + linux_timer_hz_mask) & ~linux_timer_hz_mask))
+#define	round_jiffies_relative(j) \
+	round_jiffies(j)
 
-static inline unsigned long
-round_jiffies(unsigned long j)
-{
-	return roundup(j, hz);
-}
-
-#endif /* _LINUX_TIMER_H_ */
+#endif					/* _LINUX_TIMER_H_ */
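
The timer rework moves expiry tracking into the structure itself and makes
mod_timer()/add_timer() real functions (implemented elsewhere in the compat
layer) so 'expires' can be honored and rounded against linux_timer_hz_mask.
A usage sketch, assuming jiffies is provided by the compat headers as on
Linux (my_timeout/arm_one_second are hypothetical):

    static void
    my_timeout(unsigned long data)
    {
            /* Invoked from callout(9) context via the compat glue. */
    }

    static void
    arm_one_second(struct timer_list *t)
    {
            setup_timer(t, my_timeout, 0);
            /* 'expires' is absolute, in jiffies; round to the hz grid. */
            mod_timer(t, round_jiffies(jiffies + hz));
    }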

Modified: trunk/sys/ofed/include/linux/types.h
===================================================================
--- trunk/sys/ofed/include/linux/types.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/types.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,27 +31,35 @@
 
 #include <sys/cdefs.h>
 #include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
 #include <linux/compiler.h>
 #include <asm/types.h>
 
-typedef __u16 __le16;
-typedef __u16 __be16;
-typedef __u32 __le32;
-typedef __u32 __be32;
-typedef __u64 __le64;
-typedef __u64 __be64;
-#ifndef __bool_true_false_are_defined
-typedef _Bool bool;
-#define	true	TRUE
-#define	false	FALSE
+#define	__read_mostly __attribute__((__section__(".data.read_mostly")))
+
+#ifndef __bitwise__
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
 #endif
+#endif
 
-typedef unsigned long kernel_ulong_t;
+typedef uint16_t __le16;
+typedef uint16_t __be16;
+typedef uint32_t __le32;
+typedef uint32_t __be32;
+typedef uint64_t __le64;
+typedef uint64_t __be64;
+
 typedef unsigned int    uint;
 typedef unsigned gfp_t;
 typedef uint64_t loff_t;
 typedef vm_paddr_t resource_size_t;
 
+typedef u64 phys_addr_t;
+
 #define	DECLARE_BITMAP(n, bits)						\
 	unsigned long n[howmany(bits, sizeof(long) * 8)]
 

Modified: trunk/sys/ofed/include/linux/uaccess.h
===================================================================
--- trunk/sys/ofed/include/linux/uaccess.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/uaccess.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/vmalloc.h
===================================================================
--- trunk/sys/ofed/include/linux/vmalloc.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/vmalloc.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,7 +30,7 @@
 #ifndef _LINUX_VMALLOC_H_
 #define	_LINUX_VMALLOC_H_
 
-#include <asm/page.h>
+#include <linux/page.h>
 
 #define	VM_MAP		0x0000
 #define	PAGE_KERNEL	0x0000

Modified: trunk/sys/ofed/include/linux/wait.h
===================================================================
--- trunk/sys/ofed/include/linux/wait.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/wait.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Modified: trunk/sys/ofed/include/linux/workqueue.h
===================================================================
--- trunk/sys/ofed/include/linux/workqueue.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/linux/workqueue.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -80,7 +81,7 @@
 	callout_init(&(_work)->timer, CALLOUT_MPSAFE);			\
 } while (0)
 
-#define	INIT_DELAYED_WORK_DEFERRABLE	INIT_DELAYED_WORK
+#define	INIT_DEFERRABLE_WORK	INIT_DELAYED_WORK
 
 #define	schedule_work(work)						\
 do {									\
@@ -90,11 +91,12 @@
 
 #define	flush_scheduled_work()	flush_taskqueue(taskqueue_thread)
 
-#define	queue_work(q, work)						\
-do {									\
-	(work)->taskqueue = (q)->taskqueue;				\
-	taskqueue_enqueue((q)->taskqueue, &(work)->work_task);		\
-} while (0)
+static inline int
+queue_work(struct workqueue_struct *q, struct work_struct *work)
+{
+	work->taskqueue = q->taskqueue;
+	/* Invert the return value to match Linux semantics. */
+	return (!taskqueue_enqueue(q->taskqueue, &work->work_task));
+}
 
 static inline void
 _delayed_work_fn(void *arg)
@@ -121,6 +123,14 @@
 	return (!pending);
 }
 
+static inline bool
+schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+{
+	struct workqueue_struct wq;
+	wq.taskqueue = taskqueue_thread;
+	return (queue_delayed_work(&wq, dwork, delay));
+}
+
 static inline struct workqueue_struct *
 _create_workqueue_common(char *name, int cpus)
 {
@@ -129,7 +139,7 @@
 	wq = kmalloc(sizeof(*wq), M_WAITOK);
 	wq->taskqueue = taskqueue_create((name), M_WAITOK,
 	    taskqueue_thread_enqueue,  &wq->taskqueue);
-	taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, (name));
+	taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, "%s", name);
 
 	return (wq);
 }
@@ -184,10 +194,30 @@
 {
 
 	callout_stop(&work->timer);
-	if (work->work.taskqueue &&
-	    taskqueue_cancel(work->work.taskqueue, &work->work.work_task, NULL))
-		taskqueue_drain(work->work.taskqueue, &work->work.work_task);
+	if (work->work.taskqueue)
+		return (taskqueue_cancel(work->work.taskqueue,
+		    &work->work.work_task, NULL) == 0);
 	return 0;
 }
 
+static inline int
+cancel_delayed_work_sync(struct delayed_work *work)
+{
+
+	callout_drain(&work->timer);
+	if (work->work.taskqueue &&
+	    taskqueue_cancel(work->work.taskqueue, &work->work.work_task, NULL))
+		taskqueue_drain(work->work.taskqueue, &work->work.work_task);
+	return 0;
+}
+
+static inline bool
+mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
+    unsigned long delay)
+{
+	cancel_delayed_work(dwork);
+	queue_delayed_work(wq, dwork, delay);
+	return false;
+}
+
 #endif	/* _LINUX_WORKQUEUE_H_ */
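
queue_work() now returns a value with Linux semantics: nonzero if the work
was newly queued, zero if it was already pending, which is the inverse of
what taskqueue_enqueue() reports.  A sketch of a caller relying on that
(kick() is hypothetical):

    static int
    kick(struct workqueue_struct *wq, struct work_struct *w)
    {
            /* Nonzero: newly queued.  Zero: already pending (Linux style). */
            return (queue_work(wq, w));
    }

Note also that cancel_delayed_work() no longer drains; callers that need the
callback to have finished must use the new cancel_delayed_work_sync().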

Added: trunk/sys/ofed/include/net/if_inet6.h
===================================================================
--- trunk/sys/ofed/include/net/if_inet6.h	                        (rev 0)
+++ trunk/sys/ofed/include/net/if_inet6.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_IF_INET6_H_
+#define	_NET_IF_INET6_H_
+
+static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf)
+{
+/*
+ *      +-------+-------+-------+-------+-------+-------+
+ *      |   33  |   33  | DST13 | DST14 | DST15 | DST16 |
+ *      +-------+-------+-------+-------+-------+-------+
+ */
+
+	buf[0] = 0x33;
+	buf[1] = 0x33;
+
+	memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32));
+}
+
+#endif	/* _NET_IF_INET6_H_ */
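
ipv6_eth_mc_map() derives the Ethernet multicast address for an IPv6 group:
the fixed 33:33 prefix followed by the low 32 bits of the group address.  A
worked example (group_to_mac() is a hypothetical wrapper):

    static void
    group_to_mac(const struct in6_addr *grp, char mac[6])
    {
            /* For ff02::1 this yields 33:33:00:00:00:01. */
            ipv6_eth_mc_map(grp, mac);
    }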


Property changes on: trunk/sys/ofed/include/net/if_inet6.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/net/ip.h
===================================================================
--- trunk/sys/ofed/include/net/ip.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/net/ip.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -41,11 +42,17 @@
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
-#ifdef INET
 static inline void inet_get_local_port_range(int *low, int *high)
 {
+#ifdef INET
+	CURVNET_SET_QUIET(TD_TO_VNET(curthread));
 	*low = V_ipport_firstauto;
 	*high = V_ipport_lastauto;
+	CURVNET_RESTORE();
+#else
+	*low = IPPORT_EPHEMERALFIRST;     /* 10000 */
+	*high = IPPORT_EPHEMERALLAST;     /* 65535 */
+#endif
 }
 
 static inline void
@@ -71,11 +78,10 @@
 	buf[13] = 0;
 	buf[14] = 0;
 	buf[15] = 0;
-	buf[16] = (addr >> 24) & 0x0f;
+	buf[16] = (addr >> 24) & 0xff;
 	buf[17] = (addr >> 16) & 0xff;
 	buf[18] = (addr >> 8) & 0xff;
 	buf[19] = addr & 0xff;
 }
-#endif
 
 #endif	/* _LINUX_NET_IP_H_ */
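
Two fixes worth noting here: inet_get_local_port_range() now pins the vnet
before reading the ipport sysctls (and gains an !INET fallback), and the
v4-mapped GID helper's first address octet was being masked with 0x0f instead
of 0xff.  A worked example of the mask fix:

    /* For addr = 0xc0a80001 (192.168.0.1): */
    /*   (addr >> 24) & 0xff == 0xc0  -- correct first octet          */
    /*   (addr >> 24) & 0x0f == 0x00  -- old mask dropped 192 to 0    */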

Modified: trunk/sys/ofed/include/net/ipv6.h
===================================================================
--- trunk/sys/ofed/include/net/ipv6.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/net/ipv6.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -57,4 +58,53 @@
 }
 #endif
 
+static inline void __ipv6_addr_set_half(__be32 *addr,
+                                        __be32 wh, __be32 wl)
+{
+#if BITS_PER_LONG == 64
+#if defined(__BIG_ENDIAN)
+        if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) {
+                *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl));
+                return;
+        }
+#elif defined(__LITTLE_ENDIAN)
+        if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) {
+                *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh));
+                return;
+        }
+#endif
+#endif
+        addr[0] = wh;
+        addr[1] = wl;
+}
+
+static inline void ipv6_addr_set(struct in6_addr *addr,
+                                     __be32 w1, __be32 w2,
+                                     __be32 w3, __be32 w4)
+{
+        __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2);
+        __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4);
+}
+
+static inline void ipv6_addr_set_v4mapped(const __be32 addr,
+					  struct in6_addr *v4mapped)
+{
+	ipv6_addr_set(v4mapped,
+			0, 0,
+			htonl(0x0000FFFF),
+			addr);
+}
+
+static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
+{
+	return ((a->s6_addr32[0] | a->s6_addr32[1] |
+		(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0);
+}
+
+static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2)
+{
+        return memcmp(a1, a2, sizeof(struct in6_addr));
+}
+
+
 #endif	/* _LINUX_NET_IPV6_H_ */
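
The new helpers construct and classify v4-mapped addresses (::ffff:a.b.c.d).
A round-trip sketch (example_v4mapped() is hypothetical):

    static int
    example_v4mapped(__be32 v4)
    {
            struct in6_addr a6;

            ipv6_addr_set_v4mapped(v4, &a6);        /* ::ffff:a.b.c.d */
            return (ipv6_addr_v4mapped(&a6));       /* 1 by construction */
    }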

Modified: trunk/sys/ofed/include/net/netevent.h
===================================================================
--- trunk/sys/ofed/include/net/netevent.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/net/netevent.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -42,7 +43,7 @@
 struct llentry;
 
 static inline void
-_handle_arp_update_event(void *arg, struct llentry *lle)
+_handle_arp_update_event(void *arg, struct llentry *lle, int evt __unused)
 {
 	struct notifier_block *nb;
 
@@ -54,7 +55,7 @@
 register_netevent_notifier(struct notifier_block *nb)
 {
 	nb->tags[NETEVENT_NEIGH_UPDATE] = EVENTHANDLER_REGISTER(
-	    arp_update_event, _handle_arp_update_event, nb, 0);
+	    lle_event, _handle_arp_update_event, nb, 0);
 	return (0);
 }
 
@@ -62,8 +63,7 @@
 unregister_netevent_notifier(struct notifier_block *nb)
 {
 
-	EVENTHANDLER_DEREGISTER(arp_update_event,
-	    nb->tags[NETEVENT_NEIGH_UPDATE]);
+	EVENTHANDLER_DEREGISTER(lle_event, nb->tags[NETEVENT_NEIGH_UPDATE]);
 
 	return (0);
 }

Modified: trunk/sys/ofed/include/net/tcp.h
===================================================================
--- trunk/sys/ofed/include/net/tcp.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/net/tcp.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -2,6 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
+ * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

Index: trunk/sys/ofed/include/rdma/Kbuild
===================================================================
--- trunk/sys/ofed/include/rdma/Kbuild	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/Kbuild	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/Kbuild
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_addr.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_addr.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_addr.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -41,7 +41,6 @@
 #include <linux/socket.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_pack.h>
-#include <linux/ethtool.h>
 #include <linux/if_vlan.h>
 
 struct rdma_addr_client {


Property changes on: trunk/sys/ofed/include/rdma/ib_addr.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_cache.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_cache.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_cache.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_cache.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_cm.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_cm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_cm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -38,6 +38,9 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_sa.h>
 
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
 enum ib_cm_state {
 	IB_CM_IDLE,
 	IB_CM_LISTEN,
@@ -259,6 +262,18 @@
 	void			*private_data;
 };
 
+#define CM_REQ_ATTR_ID		cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID		cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID		cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID		cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID		cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID		cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID		cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID	cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID	cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID		cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID		cpu_to_be16(0x001A)
+
 /**
  * ib_cm_handler - User-defined callback to process communication events.
  * @cm_id: Communication identifier associated with the reported event.


Property changes on: trunk/sys/ofed/include/rdma/ib_cm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_fmr_pool.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_fmr_pool.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_fmr_pool.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_fmr_pool.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_mad.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_mad.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_mad.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -151,7 +151,7 @@
 
 typedef u64 __bitwise ib_sa_comp_mask;
 
-#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n)))
 
 /*
  * ib_sa_hdr and ib_sa_mad structures must be packed because they have


Property changes on: trunk/sys/ofed/include/rdma/ib_mad.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_marshall.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_marshall.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_marshall.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_marshall.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_pack.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_pack.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_pack.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_pack.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Added: trunk/sys/ofed/include/rdma/ib_pma.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_pma.h	                        (rev 0)
+++ trunk/sys/ofed/include/rdma/ib_pma.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_PMA_H)
+#define IB_PMA_H
+
+#include <rdma/ib_mad.h>
+
+/*
+ * PMA class portinfo capability mask bits
+ */
+#define IB_PMA_CLASS_CAP_ALLPORTSELECT  cpu_to_be16(1 << 8)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH      cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_XMIT_WAIT      cpu_to_be16(1 << 12)
+
+#define IB_PMA_CLASS_PORT_INFO          cpu_to_be16(0x0001)
+#define IB_PMA_PORT_SAMPLES_CONTROL     cpu_to_be16(0x0010)
+#define IB_PMA_PORT_SAMPLES_RESULT      cpu_to_be16(0x0011)
+#define IB_PMA_PORT_COUNTERS            cpu_to_be16(0x0012)
+#define IB_PMA_PORT_COUNTERS_EXT        cpu_to_be16(0x001D)
+#define IB_PMA_PORT_SAMPLES_RESULT_EXT  cpu_to_be16(0x001E)
+
+struct ib_pma_mad {
+	struct ib_mad_hdr mad_hdr;
+	u8 reserved[40];
+	u8 data[192];
+} __packed;
+
+struct ib_pma_portsamplescontrol {
+	u8 opcode;
+	u8 port_select;
+	u8 tick;
+	u8 counter_width;		/* resv: 7:3, counter width: 2:0 */
+	__be32 counter_mask0_9;		/* 2, 10 3-bit fields */
+	__be16 counter_mask10_14;	/* 1, 5 3-bit fields */
+	u8 sample_mechanisms;
+	u8 sample_status;		/* only lower 2 bits */
+	__be64 option_mask;
+	__be64 vendor_mask;
+	__be32 sample_start;
+	__be32 sample_interval;
+	__be16 tag;
+	__be16 counter_select[15];
+	__be32 reserved1;
+	__be64 samples_only_option_mask;
+	__be32 reserved2[28];
+};
+
+struct ib_pma_portsamplesresult {
+	__be16 tag;
+	__be16 sample_status;   /* only lower 2 bits */
+	__be32 counter[15];
+};
+
+struct ib_pma_portsamplesresult_ext {
+	__be16 tag;
+	__be16 sample_status;   /* only lower 2 bits */
+	__be32 extended_width;  /* only upper 2 bits */
+	__be64 counter[15];
+};
+
+struct ib_pma_portcounters {
+	u8 reserved;
+	u8 port_select;
+	__be16 counter_select;
+	__be16 symbol_error_counter;
+	u8 link_error_recovery_counter;
+	u8 link_downed_counter;
+	__be16 port_rcv_errors;
+	__be16 port_rcv_remphys_errors;
+	__be16 port_rcv_switch_relay_errors;
+	__be16 port_xmit_discards;
+	u8 port_xmit_constraint_errors;
+	u8 port_rcv_constraint_errors;
+	u8 reserved1;
+	u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+	__be16 reserved2;
+	__be16 vl15_dropped;
+	__be32 port_xmit_data;
+	__be32 port_rcv_data;
+	__be32 port_xmit_packets;
+	__be32 port_rcv_packets;
+	__be32 port_xmit_wait;
+} __packed;
+
+
+#define IB_PMA_SEL_SYMBOL_ERROR                 cpu_to_be16(0x0001)
+#define IB_PMA_SEL_LINK_ERROR_RECOVERY          cpu_to_be16(0x0002)
+#define IB_PMA_SEL_LINK_DOWNED                  cpu_to_be16(0x0004)
+#define IB_PMA_SEL_PORT_RCV_ERRORS              cpu_to_be16(0x0008)
+#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS      cpu_to_be16(0x0010)
+#define IB_PMA_SEL_PORT_XMIT_DISCARDS           cpu_to_be16(0x0040)
+#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS  cpu_to_be16(0x0200)
+#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS    cpu_to_be16(0x0400)
+#define IB_PMA_SEL_PORT_VL15_DROPPED            cpu_to_be16(0x0800)
+#define IB_PMA_SEL_PORT_XMIT_DATA               cpu_to_be16(0x1000)
+#define IB_PMA_SEL_PORT_RCV_DATA                cpu_to_be16(0x2000)
+#define IB_PMA_SEL_PORT_XMIT_PACKETS            cpu_to_be16(0x4000)
+#define IB_PMA_SEL_PORT_RCV_PACKETS             cpu_to_be16(0x8000)
+
+struct ib_pma_portcounters_ext {
+	u8 reserved;
+	u8 port_select;
+	__be16 counter_select;
+	__be32 reserved1;
+	__be64 port_xmit_data;
+	__be64 port_rcv_data;
+	__be64 port_xmit_packets;
+	__be64 port_rcv_packets;
+	__be64 port_unicast_xmit_packets;
+	__be64 port_unicast_rcv_packets;
+	__be64 port_multicast_xmit_packets;
+	__be64 port_multicast_rcv_packets;
+} __packed;
+
+#define IB_PMA_SELX_PORT_XMIT_DATA              cpu_to_be16(0x0001)
+#define IB_PMA_SELX_PORT_RCV_DATA               cpu_to_be16(0x0002)
+#define IB_PMA_SELX_PORT_XMIT_PACKETS           cpu_to_be16(0x0004)
+#define IB_PMA_SELX_PORT_RCV_PACKETS            cpu_to_be16(0x0008)
+#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS       cpu_to_be16(0x0010)
+#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS        cpu_to_be16(0x0020)
+#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS     cpu_to_be16(0x0040)
+#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS      cpu_to_be16(0x0080)
+
+#endif /* IB_PMA_H */
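
All multi-byte fields in these PMA structures are big-endian on the wire, so
consumers must convert.  A sketch of reading one counter out of a
PortCounters response (symbol_errors() is hypothetical):

    static u16
    symbol_errors(const struct ib_pma_mad *mad)
    {
            const struct ib_pma_portcounters *pc =
                (const struct ib_pma_portcounters *)mad->data;

            return (be16_to_cpu(pc->symbol_error_counter));
    }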


Property changes on: trunk/sys/ofed/include/rdma/ib_pma.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_sa.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_sa.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_sa.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -372,6 +372,28 @@
 	u8	padding[49];
 };
 
+#define IB_SA_GUIDINFO_REC_LID		IB_SA_COMP_MASK(0)
+#define IB_SA_GUIDINFO_REC_BLOCK_NUM	IB_SA_COMP_MASK(1)
+#define IB_SA_GUIDINFO_REC_RES1		IB_SA_COMP_MASK(2)
+#define IB_SA_GUIDINFO_REC_RES2		IB_SA_COMP_MASK(3)
+#define IB_SA_GUIDINFO_REC_GID0		IB_SA_COMP_MASK(4)
+#define IB_SA_GUIDINFO_REC_GID1		IB_SA_COMP_MASK(5)
+#define IB_SA_GUIDINFO_REC_GID2		IB_SA_COMP_MASK(6)
+#define IB_SA_GUIDINFO_REC_GID3		IB_SA_COMP_MASK(7)
+#define IB_SA_GUIDINFO_REC_GID4		IB_SA_COMP_MASK(8)
+#define IB_SA_GUIDINFO_REC_GID5		IB_SA_COMP_MASK(9)
+#define IB_SA_GUIDINFO_REC_GID6		IB_SA_COMP_MASK(10)
+#define IB_SA_GUIDINFO_REC_GID7		IB_SA_COMP_MASK(11)
+
+struct ib_sa_guidinfo_rec {
+	__be16	lid;
+	u8	block_num;
+	/* reserved */
+	u8	res1;
+	__be32	res2;
+	u8	guid_info_list[64];
+};
+
 struct ib_sa_client {
 	atomic_t users;
 	struct completion comp;
@@ -556,4 +578,16 @@
  */
 void ib_sa_unregister_inform_info(struct ib_inform_info *info);
 
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+                              struct ib_device *device, u8 port_num,
+                              struct ib_sa_guidinfo_rec *rec,
+                              ib_sa_comp_mask comp_mask, u8 method,
+                              int timeout_ms, gfp_t gfp_mask,
+                              void (*callback)(int status,
+                                               struct ib_sa_guidinfo_rec *resp,
+                                               void *context),
+                              void *context,
+                              struct ib_sa_query **sa_query);
+
+
 #endif /* IB_SA_H */
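
A sketch of driving the new GuidInfoRecord query; the component mask, method,
and timeout are illustrative only (guid_cb/query_guids are hypothetical):

    static void
    guid_cb(int status, struct ib_sa_guidinfo_rec *resp, void *context)
    {
            /* resp is valid only when status == 0 */
    }

    static int
    query_guids(struct ib_sa_client *client, struct ib_device *dev, u8 port)
    {
            struct ib_sa_guidinfo_rec rec = { .block_num = 0 };
            struct ib_sa_query *query;

            return (ib_sa_guid_info_rec_query(client, dev, port, &rec,
                IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM,
                IB_MGMT_METHOD_GET, 1000, GFP_KERNEL, guid_cb, NULL, &query));
    }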


Property changes on: trunk/sys/ofed/include/rdma/ib_sa.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_smi.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_smi.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_smi.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -38,6 +38,7 @@
 #define IB_SMI_H
 
 #include <rdma/ib_mad.h>
+#include <asm/byteorder.h>
 
 #define IB_SMP_DATA_SIZE			64
 #define IB_SMP_MAX_PATH_HOPS			64


Property changes on: trunk/sys/ofed/include/rdma/ib_smi.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_umem.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_umem.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_umem.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -39,6 +39,7 @@
 #include <linux/dma-attrs.h>
 
 struct ib_ucontext;
+struct vm_area_struct;
 
 struct ib_umem {
 	struct ib_ucontext     *context;
@@ -57,6 +58,24 @@
 	unsigned long		diff;
 };
 
+struct ib_cmem {
+	struct ib_ucontext     *context;
+	size_t                  length;
+	/* Linked list of the contiguous blocks making up this cmem. */
+	struct list_head ib_cmem_block;
+
+	/*
+	 * Order of a cmem block; 2^block_order equals the number of
+	 * physical pages per block.
+	 */
+	unsigned long    block_order;
+	/*
+	 * Reference counter for this memory area; when it drops to 0,
+	 * the pages are returned to the kernel.
+	 */
+	struct kref refcount;
+};
+
 struct ib_umem_chunk {
 	struct list_head	list;
 	int                     nents;
@@ -70,4 +89,14 @@
 void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
 
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+        struct vm_area_struct *vma);
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+                                unsigned long total_size,
+                                unsigned long page_size_order);
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
+int ib_umem_map_to_vma(struct ib_umem *umem,
+                                struct vm_area_struct *vma);
+
+
 #endif /* IB_UMEM_H */


Property changes on: trunk/sys/ofed/include/rdma/ib_umem.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_user_cm.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_user_cm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_user_cm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -34,6 +34,7 @@
 #ifndef IB_USER_CM_H
 #define IB_USER_CM_H
 
+#include <linux/types.h>
 #include <rdma/ib_user_sa.h>
 
 #define IB_USER_CM_ABI_VERSION 5


Property changes on: trunk/sys/ofed/include/rdma/ib_user_cm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_user_mad.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_user_mad.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_user_mad.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_user_mad.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/ib_user_sa.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_user_sa.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_user_sa.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/ib_user_sa.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_user_verbs.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_user_verbs.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_user_verbs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -82,9 +82,13 @@
 	IB_USER_VERBS_CMD_QUERY_SRQ,
 	IB_USER_VERBS_CMD_DESTROY_SRQ,
 	IB_USER_VERBS_CMD_POST_SRQ_RECV,
+	IB_USER_VERBS_CMD_OPEN_XRCD,
+	IB_USER_VERBS_CMD_CLOSE_XRCD,
+	IB_USER_VERBS_CMD_CREATE_XSRQ,
+	IB_USER_VERBS_CMD_OPEN_QP,
+	IB_USER_VERBS_CMD_ATTACH_FLOW,
+	IB_USER_VERBS_CMD_DETACH_FLOW,
 	IB_USER_VERBS_CMD_CREATE_XRC_SRQ,
-	IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN,
-	IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN,
 	IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP,
 	IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP,
 	IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP,
@@ -230,6 +234,21 @@
 	__u32 pd_handle;
 };
 
+struct ib_uverbs_open_xrcd {
+	__u64 response;
+	__u32 fd;
+	__u32 oflags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_open_xrcd_resp {
+	__u32 xrcd_handle;
+};
+
+struct ib_uverbs_close_xrcd {
+	__u32 xrcd_handle;
+};
+
 struct ib_uverbs_reg_mr {
 	__u64 response;
 	__u64 start;
@@ -412,6 +431,17 @@
 	__u64 driver_data[0];
 };
 
+struct ib_uverbs_open_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 qpn;
+	__u8  qp_type;
+	__u8  reserved[7];
+	__u64 driver_data[0];
+};
+
+/* also used for open response */
 struct ib_uverbs_create_qp_resp {
 	__u32 qp_handle;
 	__u32 qpn;
@@ -569,6 +599,16 @@
 	} wr;
 };
 
+struct ibv_uverbs_flow_spec {
+	__u32  type;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	__u8   l4_protocol;
+	__u8   block_mc_loopback;
+};
+
 struct ib_uverbs_post_send {
 	__u64 response;
 	__u32 qp_handle;
@@ -646,6 +686,45 @@
 	__u64 driver_data[0];
 };
 
+struct ibv_kern_flow_spec {
+	__u32  type;
+	__u32  reserved1;
+	union {
+		struct {
+			__be16 ethertype;
+			__be16 vlan;
+			__u8 vlan_present;
+			__u8  mac[6];
+			__u8  port;
+		} eth;
+		struct {
+			__be32 qpn;
+		} ib_uc;
+		struct {
+			__u8  mgid[16];
+		} ib_mc;
+	} l2_id;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	__u8   l4_protocol;
+	__u8   block_mc_loopback;
+	__u8   reserved[2];
+};
+
+struct ib_uverbs_attach_flow {
+	__u32 qp_handle;
+	__u32 priority;
+	struct ibv_kern_flow_spec spec;
+};
+
+struct ib_uverbs_detach_flow {
+	__u32 qp_handle;
+	__u32 priority;
+	struct ibv_kern_flow_spec spec;
+};
+
 struct ib_uverbs_create_srq {
 	__u64 response;
 	__u64 user_handle;
@@ -656,15 +735,17 @@
 	__u64 driver_data[0];
 };
 
-struct ib_uverbs_create_xrc_srq {
+struct ib_uverbs_create_xsrq {
 	__u64 response;
 	__u64 user_handle;
+	__u32 srq_type;
 	__u32 pd_handle;
 	__u32 max_wr;
 	__u32 max_sge;
 	__u32 srq_limit;
+	__u32 reserved;
 	__u32 xrcd_handle;
-	__u32 xrc_cq;
+	__u32 cq_handle;
 	__u64 driver_data[0];
 };
 
@@ -672,7 +753,7 @@
 	__u32 srq_handle;
 	__u32 max_wr;
 	__u32 max_sge;
-	__u32 reserved;
+	__u32 srqn;
 };
 
 struct ib_uverbs_modify_srq {


Property changes on: trunk/sys/ofed/include/rdma/ib_user_verbs.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/ib_verbs.h
===================================================================
--- trunk/sys/ofed/include/rdma/ib_verbs.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/ib_verbs.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -47,12 +47,14 @@
 #include <linux/list.h>
 #include <linux/rwsem.h>
 #include <linux/scatterlist.h>
+#include <linux/workqueue.h>
 
-#include <asm/atomic.h>
 #include <asm/uaccess.h>
 #include <linux/rbtree.h>
 #include <linux/mutex.h>
 
+extern struct workqueue_struct *ib_wq;
+
 union ib_gid {
 	u8	raw[16];
 	struct {
@@ -114,6 +116,11 @@
 	IB_DEVICE_XRC			= (1<<20),
 	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<21),
 	IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
+	IB_DEVICE_MR_ALLOCATE		= (1<<23),
+	IB_DEVICE_SHARED_MR             = (1<<24),
+	IB_DEVICE_QPG			= (1<<25),
+	IB_DEVICE_UD_RSS		= (1<<26),
+	IB_DEVICE_UD_TSS		= (1<<27)
 };
 
 enum ib_atomic_cap {
@@ -161,6 +168,7 @@
 	int			max_srq_wr;
 	int			max_srq_sge;
 	unsigned int		max_fast_reg_page_list_len;
+	int			max_rss_tbl_sz;
 	u16			max_pkeys;
 	u8			local_ca_ack_delay;
 };
@@ -207,6 +215,7 @@
 	IB_PORT_SM_DISABLED			= 1 << 10,
 	IB_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
 	IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IB_PORT_EXTENDED_SPEEDS_SUP             = 1 << 14,
 	IB_PORT_CM_SUP				= 1 << 16,
 	IB_PORT_SNMP_TUNNEL_SUP			= 1 << 17,
 	IB_PORT_REINIT_SUP			= 1 << 18,
@@ -237,6 +246,15 @@
 	}
 }
 
+enum ib_port_speed {
+	IB_SPEED_SDR	= 1,
+	IB_SPEED_DDR	= 2,
+	IB_SPEED_QDR	= 4,
+	IB_SPEED_FDR10	= 8,
+	IB_SPEED_FDR	= 16,
+	IB_SPEED_EDR	= 32
+};
+
 struct ib_protocol_stats {
 	/* TBD... */
 };
@@ -421,7 +439,15 @@
 	IB_RATE_40_GBPS  = 7,
 	IB_RATE_60_GBPS  = 8,
 	IB_RATE_80_GBPS  = 9,
-	IB_RATE_120_GBPS = 10
+	IB_RATE_120_GBPS = 10,
+	IB_RATE_14_GBPS  = 11,
+	IB_RATE_56_GBPS  = 12,
+	IB_RATE_112_GBPS = 13,
+	IB_RATE_168_GBPS = 14,
+	IB_RATE_25_GBPS  = 15,
+	IB_RATE_100_GBPS = 16,
+	IB_RATE_200_GBPS = 17,
+	IB_RATE_300_GBPS = 18
 };
 
 /**
@@ -433,6 +459,13 @@
 int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;
 
 /**
+ * ib_rate_to_mbps - Convert the IB rate enum to Mbps.
+ * For example, IB_RATE_2_5_GBPS will be converted to 2500.
+ * @rate: rate to convert.
+ */
+int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__;
+
+/**
  * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
  * enum.
  * @mult: multiple to convert.
@@ -498,6 +531,7 @@
 	IB_WC_GRH		= 1,
 	IB_WC_WITH_IMM		= (1<<1),
 	IB_WC_WITH_INVALIDATE	= (1<<2),
+	IB_WC_IP_CSUM_OK	= (1<<3),
 };
 
 struct ib_wc {
@@ -528,6 +562,11 @@
 	IB_CQ_REPORT_MISSED_EVENTS	= 1 << 2,
 };
 
+enum ib_srq_type {
+	IB_SRQT_BASIC,
+	IB_SRQT_XRC
+};
+
 enum ib_srq_attr_mask {
 	IB_SRQ_MAX_WR	= 1 << 0,
 	IB_SRQ_LIMIT	= 1 << 1,
@@ -543,6 +582,14 @@
 	void		      (*event_handler)(struct ib_event *, void *);
 	void		       *srq_context;
 	struct ib_srq_attr	attr;
+	enum ib_srq_type	srq_type;
+
+	union {
+		struct {
+			struct ib_xrcd *xrcd;
+			struct ib_cq   *cq;
+		} xrc;
+	} ext;
 };
 
 struct ib_qp_cap {
@@ -551,6 +598,7 @@
 	u32	max_send_sge;
 	u32	max_recv_sge;
 	u32	max_inline_data;
+	u32	qpg_tss_mask_sz;
 };
 
 enum ib_sig_type {
@@ -572,15 +620,34 @@
 	IB_QPT_UD,
 	IB_QPT_XRC,
 	IB_QPT_RAW_IPV6,
-	IB_QPT_RAW_ETY,
-	IB_QPT_RAW_ETH
+	IB_QPT_RAW_ETHERTYPE,
+	IB_QPT_RAW_PACKET = 8,
+	IB_QPT_XRC_INI = 9,
+	IB_QPT_XRC_TGT,
+	IB_QPT_MAX,
 };
 
 enum ib_qp_create_flags {
 	IB_QP_CREATE_IPOIB_UD_LSO		= 1 << 0,
 	IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
+	IB_QP_CREATE_NETIF_QP			= 1 << 2,
+	/* reserve bits 26-31 for low level drivers' internal use */
+	IB_QP_CREATE_RESERVED_START		= 1 << 26,
+	IB_QP_CREATE_RESERVED_END		= 1 << 31,
 };
 
+enum ib_qpg_type {
+	IB_QPG_NONE	= 0,
+	IB_QPG_PARENT	= (1<<0),
+	IB_QPG_CHILD_RX = (1<<1),
+	IB_QPG_CHILD_TX = (1<<2)
+};
+
+struct ib_qpg_init_attrib {
+	u32 tss_child_count;
+	u32 rss_child_count;
+};
+
 struct ib_qp_init_attr {
 	void                  (*event_handler)(struct ib_event *, void *);
 	void		       *qp_context;
@@ -587,14 +654,26 @@
 	struct ib_cq	       *send_cq;
 	struct ib_cq	       *recv_cq;
 	struct ib_srq	       *srq;
+	struct ib_xrcd	       *xrcd;     /* XRC TGT QPs only */
 	struct ib_qp_cap	cap;
+	union {
+		struct ib_qp *qpg_parent; /* see qpg_type */
+		struct ib_qpg_init_attrib parent_attrib;
+	} pp;
 	enum ib_sig_type	sq_sig_type;
 	enum ib_qp_type		qp_type;
 	enum ib_qp_create_flags	create_flags;
-	struct ib_xrcd	       *xrc_domain; /* XRC qp's only */
+	enum ib_qpg_type	qpg_type;
 	u8			port_num; /* special QP types only */
 };
 
+struct ib_qp_open_attr {
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+};
+
 enum ib_rnr_timeout {
 	IB_RNR_TIMER_655_36 =  0,
 	IB_RNR_TIMER_000_01 =  1,
@@ -651,7 +730,8 @@
 	IB_QP_MAX_DEST_RD_ATOMIC	= (1<<17),
 	IB_QP_PATH_MIG_STATE		= (1<<18),
 	IB_QP_CAP			= (1<<19),
-	IB_QP_DEST_QPN			= (1<<20)
+	IB_QP_DEST_QPN			= (1<<20),
+	IB_QP_GROUP_RSS			= (1<<21)
 };
 
 enum ib_qp_state {
@@ -724,6 +804,20 @@
 	IB_SEND_IP_CSUM		= (1<<4)
 };
 
+enum ib_flow_types {
+	IB_FLOW_ETH = 0,
+	IB_FLOW_IB_UC = 1,
+	IB_FLOW_IB_MC_IPV4 = 2,
+	IB_FLOW_IB_MC_IPV6 = 3
+};
+
+enum {
+	IB_FLOW_L4_NONE = 0,
+	IB_FLOW_L4_OTHER = 3,
+	IB_FLOW_L4_UDP = 5,
+	IB_FLOW_L4_TCP = 6
+};
+
 struct ib_sge {
 	u64	addr;
 	u32	length;
@@ -785,7 +879,7 @@
 			u8			static_rate;
 		} raw_ety;
 	} wr;
-	u32			xrc_remote_srq_num; /* valid for XRC sends only */
+	u32			xrc_remote_srq_num;	/* XRC TGT QPs only */
 };
 
 struct ib_recv_wr {
@@ -800,7 +894,15 @@
 	IB_ACCESS_REMOTE_WRITE	= (1<<1),
 	IB_ACCESS_REMOTE_READ	= (1<<2),
 	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
-	IB_ACCESS_MW_BIND	= (1<<4)
+	IB_ACCESS_MW_BIND	= (1<<4),
+	IB_ACCESS_ALLOCATE_MR	= (1<<5),
+	IB_ACCESS_SHARED_MR_USER_READ   = (1<<6),
+	IB_ACCESS_SHARED_MR_USER_WRITE  = (1<<7),
+	IB_ACCESS_SHARED_MR_GROUP_READ  = (1<<8),
+	IB_ACCESS_SHARED_MR_GROUP_WRITE = (1<<9),
+	IB_ACCESS_SHARED_MR_OTHER_READ  = (1<<10),
+	IB_ACCESS_SHARED_MR_OTHER_WRITE = (1<<11)
+
 };
 
 struct ib_phys_buf {
@@ -847,7 +949,7 @@
 	struct list_head	qp_list;
 	struct list_head	srq_list;
 	struct list_head	ah_list;
-	struct list_head	xrc_domain_list;
+	struct list_head	xrcd_list;
 	int			closing;
 };
 
@@ -884,12 +986,14 @@
 struct ib_xrcd {
 	struct ib_device       *device;
 	struct ib_uobject      *uobject;
+	atomic_t		usecnt; /* count all exposed resources */
 	struct inode	       *inode;
 	struct rb_node		node;
-	atomic_t		usecnt; /* count all resources */
+
+	struct mutex		tgt_qp_mutex;
+	struct list_head	tgt_qp_list;
 };
 
-
 struct ib_ah {
 	struct ib_device	*device;
 	struct ib_pd		*pd;
@@ -911,13 +1015,19 @@
 struct ib_srq {
 	struct ib_device       *device;
 	struct ib_pd	       *pd;
-	struct ib_cq	       *xrc_cq;
-	struct ib_xrcd	       *xrcd;
 	struct ib_uobject      *uobject;
 	void		      (*event_handler)(struct ib_event *, void *);
 	void		       *srq_context;
+	enum ib_srq_type	srq_type;
 	atomic_t		usecnt;
-	u32			xrc_srq_num;
+
+	union {
+		struct {
+			struct ib_xrcd *xrcd;
+			struct ib_cq   *cq;
+			u32		srq_num;
+		} xrc;
+	} ext;
 };
 
 struct ib_qp {
@@ -926,12 +1036,17 @@
 	struct ib_cq	       *send_cq;
 	struct ib_cq	       *recv_cq;
 	struct ib_srq	       *srq;
+	struct ib_xrcd	       *xrcd; /* XRC TGT QPs only */
+	struct list_head	xrcd_list;
+	atomic_t		usecnt; /* count times opened, mcast attaches */
+	struct list_head	open_list;
+	struct ib_qp           *real_qp;
 	struct ib_uobject      *uobject;
 	void                  (*event_handler)(struct ib_event *, void *);
 	void		       *qp_context;
 	u32			qp_num;
 	enum ib_qp_type		qp_type;
-	struct ib_xrcd	       *xrcd;  /* XRC QPs only */
+	enum ib_qpg_type	qpg_type;
 };
 
 struct ib_mr {
@@ -958,6 +1073,32 @@
 	u32			rkey;
 };
 
+struct ib_flow_spec {
+	enum ib_flow_types type;
+	union {
+		struct {
+			__be16 ethertype;
+			__be16 vlan;
+			u8 vlan_present;
+			u8  mac[6];
+			u8  port;
+		} eth;
+		struct {
+			__be32 qpn;
+		} ib_uc;
+		struct {
+			u8  mgid[16];
+		} ib_mc;
+	} l2_id;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	u8 l4_protocol;
+	u8 block_mc_loopback;
+	u8 rule_type;
+};
+
 struct ib_mad;
 struct ib_grh;
 
@@ -1037,9 +1178,9 @@
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
 
+	spinlock_t                    client_data_lock;
 	struct list_head              core_list;
 	struct list_head              client_data_list;
-	spinlock_t                    client_data_lock;
 
 	struct ib_cache               cache;
 	int                          *pkey_tbl_len;
@@ -1143,7 +1284,8 @@
 						  u64 start, u64 length,
 						  u64 virt_addr,
 						  int mr_access_flags,
-						  struct ib_udata *udata);
+						  struct ib_udata *udata,
+							int mr_id);
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
@@ -1191,7 +1333,7 @@
 						     struct ib_srq_init_attr *srq_init_attr,
 						     struct ib_udata *udata);
 	struct ib_xrcd *	   (*alloc_xrcd)(struct ib_device *device,
-						 struct ib_ucontext *context,
+						 struct ib_ucontext *ucontext,
 						 struct ib_udata *udata);
 	int			   (*dealloc_xrcd)(struct ib_xrcd *xrcd);
 	int			   (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr,
@@ -1211,7 +1353,17 @@
 	int 			   (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd,
 						       void *context,
 						       u32 qp_num);
+	int                        (*attach_flow)(struct ib_qp *qp,
+						  struct ib_flow_spec *spec,
+						  int priority);
+	int                        (*detach_flow)(struct ib_qp *qp,
+						  struct ib_flow_spec *spec,
+						  int priority);
 
+	unsigned long		   (*get_unmapped_area)(struct file *file,
+					unsigned long addr,
+					unsigned long len, unsigned long pgoff,
+					unsigned long flags);
 	struct ib_dma_mapping_ops   *dma_ops;
 
 	struct module               *owner;
@@ -1225,8 +1377,8 @@
 		IB_DEV_UNREGISTERED
 	}                            reg_state;
 
+	int			     uverbs_abi_ver;
 	u64			     uverbs_cmd_mask;
-	int			     uverbs_abi_ver;
 
 	char			     node_desc[64];
 	__be64			     node_guid;
@@ -1248,7 +1400,9 @@
 struct ib_device *ib_alloc_device(size_t size);
 void ib_dealloc_device(struct ib_device *device);
 
-int ib_register_device   (struct ib_device *device);
+int ib_register_device(struct ib_device *device,
+		       int (*port_callback)(struct ib_device *,
+					    u8, struct kobject *));
 void ib_unregister_device(struct ib_device *device);
 
 int ib_register_client   (struct ib_client *client);
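
ib_register_device() now takes the per-port sysfs hook directly, replacing the separate ib_sysfs_create_port_files() removed below. A hypothetical driver's no-op callback matching the new signature:

static int drv_port_files(struct ib_device *dev, u8 port_num,
			  struct kobject *kobj)
{
	return 0;	/* per-port sysfs files would be created here */
}

	/* at init time; dev is a hypothetical driver object */
	err = ib_register_device(&dev->ibdev, drv_port_files);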
@@ -1269,15 +1423,6 @@
 }
 
 /**
- * ib_sysfs_create_port_files - iterate over port sysfs directories
- * @device: the IB device
- * @create: a function to create sysfs files in each port directory
- */
-int ib_sysfs_create_port_files(struct ib_device *device,
-			       int (*create)(struct ib_device *dev, u8 port_num,
-					     struct kobject *kobj));
-
-/**
  * ib_modify_qp_is_ok - Check that the supplied attribute mask
  * contains all required attributes and no attributes not allowed for
  * the given QP state transition.
@@ -1427,8 +1572,8 @@
 				 struct ib_srq_init_attr *srq_init_attr);
 
 /**
- * ib_create_srq - Creates an SRQ associated with the specified
- *   protection domain.
+ * ib_create_srq - Creates a SRQ associated with the specified protection
+ *   domain.
  * @pd: The protection domain associated with the SRQ.
  * @srq_init_attr: A list of initial attributes required to create the
  *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
@@ -1534,6 +1679,25 @@
 int ib_destroy_qp(struct ib_qp *qp);
 
 /**
+ * ib_open_qp - Obtain a reference to an existing sharable QP.
+ * @xrcd - XRC domain
+ * @qp_open_attr: Attributes identifying the QP to open.
+ *
+ * Returns a reference to a sharable QP.
+ */
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+			 struct ib_qp_open_attr *qp_open_attr);
+
+/**
+ * ib_close_qp - Release an external reference to a QP.
+ * @qp: The QP handle to release
+ *
+ * The opened QP handle is released by the caller.  The underlying
+ * shared QP is not destroyed until all internal references are released.
+ */
+int ib_close_qp(struct ib_qp *qp);
+
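A hypothetical consumer of the new open/close pair; struct ib_qp_open_attr and IB_QPT_XRC_TGT are defined elsewhere in this header, and the field names used here follow the upstream definition, so they are an assumption:

struct ib_qp_open_attr oattr = {
	.event_handler = my_event_handler,	/* hypothetical handler */
	.qp_context    = my_ctx,		/* hypothetical context */
	.qp_num        = shared_qp_num,
	.qp_type       = IB_QPT_XRC_TGT,	/* assumed enumerator */
};
struct ib_qp *qp = ib_open_qp(xrcd, &oattr);

if (!IS_ERR(qp)) {
	/* ... post to the shared QP ... */
	ib_close_qp(qp);	/* drops this handle; underlying QP survives */
}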
+/**
  * ib_post_send - Posts a list of work requests to the send queue of
  *   the specified QP.
  * @qp: The QP to post the work request on.
@@ -1540,6 +1704,11 @@
  * @send_wr: A list of work requests to post on the send queue.
  * @bad_send_wr: On an immediate failure, this parameter will reference
  *   the work request that failed to be posted on the QP.
+ *
+ * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate
+ * error is returned, the QP state shall not be affected,
+ * ib_post_send() will return an immediate error after queueing any
+ * earlier work requests in the list.
  */
 static inline int ib_post_send(struct ib_qp *qp,
 			       struct ib_send_wr *send_wr,
@@ -1581,8 +1750,7 @@
  *   the associated completion and event handlers.
  * @cqe: The minimum size of the CQ.
  * @comp_vector - Completion vector used to signal completion events.
- *     Must be >= 0 and < context->num_comp_vectors
- *     or IB_CQ_VECTOR_LEAST_ATTACHED.
+ *     Must be >= 0 and < context->num_comp_vectors.
  *
  * Users can examine the cq structure to determine the actual CQ size.
  */
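
Given the partial-completion semantics documented above for ib_post_send(), a caller can locate the first unposted request through bad_send_wr. A minimal sketch; qp and wr_head are hypothetical:

struct ib_send_wr *bad_wr;
int err = ib_post_send(qp, wr_head, &bad_wr);
if (err) {
	/* every request before bad_wr was queued despite the error;
	 * recovery (resubmit from bad_wr, or flush) is the caller's policy */
}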
@@ -2154,17 +2322,19 @@
  */
 int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
 
-
 /**
- * ib_dealloc_xrcd - Deallocates an extended reliably connected domain.
- * @xrcd: The xrc domain to deallocate.
+ * ib_alloc_xrcd - Allocates an XRC domain.
+ * @device: The device on which to allocate the XRC domain.
  */
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
 
 /**
- * ib_alloc_xrcd - Allocates an extended reliably connected domain.
- * @device: The device on which to allocate the xrcd.
+ * ib_dealloc_xrcd - Deallocates an XRC domain.
+ * @xrcd: The XRC domain to deallocate.
  */
-struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
 
+int ib_attach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority);
+int ib_detach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority);
+
 #endif /* IB_VERBS_H */


Property changes on: trunk/sys/ofed/include/rdma/ib_verbs.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
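
The reordered declarations above pair allocation and teardown of an XRC domain. A typical lifecycle for a hypothetical caller:

struct ib_xrcd *xrcd = ib_alloc_xrcd(device);
if (IS_ERR(xrcd))
	return PTR_ERR(xrcd);
/* ... create XRC SRQs and TGT QPs against xrcd ... */
ib_dealloc_xrcd(xrcd);	/* expected to fail while users remain (assumption) */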
Modified: trunk/sys/ofed/include/rdma/iw_cm.h
===================================================================
--- trunk/sys/ofed/include/rdma/iw_cm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/iw_cm.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -63,6 +63,7 @@
 	void *private_data;
 	u8 private_data_len;
 	void *provider_data;
+	struct socket *so;
 };
 
 /**
@@ -98,6 +99,7 @@
 	/* Used by provider to add and remove refs on IW cm_id */
 	void (*add_ref)(struct iw_cm_id *);
 	void (*rem_ref)(struct iw_cm_id *);
+	struct socket           *so;
 };
 
 struct iw_cm_conn_param {
@@ -139,7 +141,7 @@
  *   returned IW CM identifier.
  * @context: User specified context associated with the id.
  */
-struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so,
 				 iw_cm_handler cm_handler, void *context);
 
 /**


Property changes on: trunk/sys/ofed/include/rdma/iw_cm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
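
Both iw_cm_id and the provider's iw_cm_verbs now carry the FreeBSD socket, and iw_create_cm_id() takes it at creation time. A hypothetical caller, assuming the CM records the socket on the returned id (consistent with the new struct field above):

struct iw_cm_id *cm_id;

/* ibdev, so, my_cm_handler, ctx: hypothetical caller state */
cm_id = iw_create_cm_id(ibdev, so, my_cm_handler, ctx);
if (!IS_ERR(cm_id) && cm_id->so == so) {
	/* provider code can reach the kernel socket through cm_id->so */
}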
Index: trunk/sys/ofed/include/rdma/rdma_cm.h
===================================================================
--- trunk/sys/ofed/include/rdma/rdma_cm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/rdma_cm.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/rdma_cm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/rdma_cm_ib.h
===================================================================
--- trunk/sys/ofed/include/rdma/rdma_cm_ib.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/rdma_cm_ib.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/rdma_cm_ib.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Index: trunk/sys/ofed/include/rdma/rdma_user_cm.h
===================================================================
--- trunk/sys/ofed/include/rdma/rdma_user_cm.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/rdma_user_cm.h	2016-09-14 19:35:22 UTC (rev 7911)

Property changes on: trunk/sys/ofed/include/rdma/rdma_user_cm.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property
Modified: trunk/sys/ofed/include/rdma/sdp_socket.h
===================================================================
--- trunk/sys/ofed/include/rdma/sdp_socket.h	2016-09-14 19:30:19 UTC (rev 7910)
+++ trunk/sys/ofed/include/rdma/sdp_socket.h	2016-09-14 19:35:22 UTC (rev 7911)
@@ -3,11 +3,6 @@
 #ifndef SDP_SOCKET_H
 #define SDP_SOCKET_H
 
-#ifndef AF_INET_SDP
-#define AF_INET_SDP 27
-#define PF_INET_SDP AF_INET_SDP
-#endif
-
 #ifndef SDP_ZCOPY_THRESH
 #define SDP_ZCOPY_THRESH 80
 #endif
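
With the Linux-flavored fallback removed, AF_INET_SDP is expected to come from the native <sys/socket.h> (40 on FreeBSD, rather than the conflicting Linux value 27 deleted above); that origin is an assumption based on the FreeBSD headers, not on this diff. A userland sketch:

#include <sys/socket.h>

int sdp_fd = socket(AF_INET_SDP, SOCK_STREAM, 0);	/* SDP stream socket */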


Property changes on: trunk/sys/ofed/include/rdma/sdp_socket.h
___________________________________________________________________
Deleted: cvs2svn:cvs-rev
## -1 +0,0 ##
-1.1.1.1
\ No newline at end of property

