From f3781d2e89f12dd5afa046dc56032af6e39bd116 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 14 Jul 2008 23:48:44 -0700
Subject: RDMA: Remove subversion $Id tags

They don't get updated by git and so they're worse than useless.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 97e67d36378f..91c959299910 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <rdma/ib_cm.h>
-- 
cgit v1.2.3


From f89271da32bc1a636cf4eb078e615930886cd013 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@dev.mellanox.co.il>
Date: Mon, 14 Jul 2008 23:48:44 -0700
Subject: IPoIB: Copy small received SKBs in connected mode

The connected mode implementation in the IPoIB driver has a large
overhead in the way SKBs are handled in the receive flow.  It usually
allocates an SKB with as big as was used in the currently received SKB
and moves unused fragments from the old SKB to the new one. This
involves a loop on all the remaining fragments and incurs overhead on
the CPU.  This patch, for small SKBs, allocates an SKB just large
enough to contain the received data and copies to it the data from the
received SKB.  The newly allocated SKB is passed to the stack and the
old SKB is reposted.

When running netperf, UDP small messages, without this pach I get:

    UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
    14.4.3.178 (14.4.3.178) port 0 AF_INET
    Socket  Message  Elapsed      Messages
    Size    Size     Time         Okay Errors   Throughput
    bytes   bytes    secs            #      #   10^6bits/sec

    114688     128   10.00     5142034      0     526.31
    114688           10.00     1130489            115.71

With this patch I get both send and receive at ~315 mbps.

The reason that send performance actually slows down is as follows:
When using this patch, the overhead of the CPU for handling RX packets
is dramatically reduced.  As a result, we do not experience RNR NAK
messages from the receiver which cause the connection to be closed and
reopened again; when the patch is not used, the receiver cannot handle
the packets fast enough so there is less time to post new buffers and
hence the mentioned RNR NACKs.  So what happens is that the
application *thinks* it posted a certain number of packets for
transmission but these packets are flushed and do not really get
transmitted.  Since the connection gets opened and closed many times,
each time netperf gets the CPU time that otherwise would have been
given to IPoIB to actually transmit the packets.  This can be verified
when looking at the port counters -- the output of ifconfig and the
oputput of netperf (this is for the case without the patch):

    tx packets
    ==========
    port counter:   1,543,996
    ifconfig:       1,581,426
    netperf:        5,142,034

    rx packets
    ==========
    netperf         1,1304,089

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 91c959299910..6223fc39af70 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -523,6 +523,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	u64 mapping[IPOIB_CM_RX_SG];
 	int frags;
 	int has_srq;
+	struct sk_buff *small_skb;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -577,6 +578,23 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		}
 	}
 
+	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+		int dlen = wc->byte_len;
+
+		small_skb = dev_alloc_skb(dlen + 12);
+		if (small_skb) {
+			skb_reserve(small_skb, 12);
+			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+						   dlen, DMA_FROM_DEVICE);
+			skb_copy_from_linear_data(skb, small_skb->data, dlen);
+			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
+						      dlen, DMA_FROM_DEVICE);
+			skb_put(small_skb, dlen);
+			skb = small_skb;
+			goto copied;
+		}
+	}
+
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
@@ -599,6 +617,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
+copied:
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
-- 
cgit v1.2.3


From a7d834c4bc6be73e8f83eaa5072fac3c5549f7f2 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 14 Jul 2008 23:48:47 -0700
Subject: IPoIB/cm: Fix racy use of receive WR/SGL in
 ipoib_cm_post_receive_nonsrq()

For devices that don't support SRQs, ipoib_cm_post_receive_nonsrq() is
called from both ipoib_cm_handle_rx_wc() and ipoib_cm_nonsrq_init_rx(),
and these two callers are not synchronized against each other.
However, ipoib_cm_post_receive_nonsrq() always reuses the same receive
work request and scatter list structures, so multiple callers can end
up stepping on each other, which leads to posting garbled work
requests.

Fix this by having the caller pass in the ib_recv_wr and ib_sge
structures to use, and allocating new local structures in
ipoib_cm_nonsrq_init_rx().

Based on a patch by Pradeep Satyanarayana <pradeep@us.ibm.com> and
David Wilder <dwilder@us.ibm.com>, with debugging help from Hoang-Nam
Nguyen <hnguyen@de.ibm.com>.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 63 ++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 16 deletions(-)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 6223fc39af70..37bf67b2a26f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -111,18 +111,20 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
 }
 
 static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
-					struct ipoib_cm_rx *rx, int id)
+					struct ipoib_cm_rx *rx,
+					struct ib_recv_wr *wr,
+					struct ib_sge *sge, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
 	int i, ret;
 
-	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
-		priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
+		sge[i].addr = rx->rx_ring[id].mapping[i];
 
-	ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
+	ret = ib_post_recv(rx->qp, wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
 		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
@@ -320,10 +322,33 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 	return 0;
 }
 
+static void ipoib_cm_init_rx_wr(struct net_device *dev,
+				struct ib_recv_wr *wr,
+				struct ib_sge *sge)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < priv->cm.num_frags; ++i)
+		sge[i].lkey = priv->mr->lkey;
+
+	sge[0].length = IPOIB_CM_HEAD_SIZE;
+	for (i = 1; i < priv->cm.num_frags; ++i)
+		sge[i].length = PAGE_SIZE;
+
+	wr->next    = NULL;
+	wr->sg_list = priv->cm.rx_sge;
+	wr->num_sge = priv->cm.num_frags;
+}
+
 static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
 				   struct ipoib_cm_rx *rx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct {
+		struct ib_recv_wr wr;
+		struct ib_sge sge[IPOIB_CM_RX_SG];
+	} *t;
 	int ret;
 	int i;
 
@@ -331,6 +356,14 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 	if (!rx->rx_ring)
 		return -ENOMEM;
 
+	t = kmalloc(sizeof *t, GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
+
 	spin_lock_irq(&priv->lock);
 
 	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
@@ -349,8 +382,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 				ret = -ENOMEM;
 				goto err_count;
-			}
-		ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
+		}
+		ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
 		if (ret) {
 			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
 				   "failed for buf %d\n", i);
@@ -361,6 +394,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 
 	rx->recv_count = ipoib_recvq_size;
 
+	kfree(t);
+
 	return 0;
 
 err_count:
@@ -369,6 +404,7 @@ err_count:
 	spin_unlock_irq(&priv->lock);
 
 err_free:
+	kfree(t);
 	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
 
 	return ret;
@@ -637,7 +673,10 @@ repost:
 			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
 				   "for buf %d\n", wr_id);
 	} else {
-		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
+							  &priv->cm.rx_wr,
+							  priv->cm.rx_sge,
+							  wr_id))) {
 			--p->recv_count;
 			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
 				   "for buf %d\n", wr_id);
@@ -1502,15 +1541,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
 		priv->cm.num_frags  = IPOIB_CM_RX_SG;
 	}
 
-	for (i = 0; i < priv->cm.num_frags; ++i)
-		priv->cm.rx_sge[i].lkey	= priv->mr->lkey;
-
-	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
-	for (i = 1; i < priv->cm.num_frags; ++i)
-		priv->cm.rx_sge[i].length = PAGE_SIZE;
-	priv->cm.rx_wr.next = NULL;
-	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
-	priv->cm.rx_wr.num_sge = priv->cm.num_frags;
+	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
 
 	if (ipoib_cm_has_srq(dev)) {
 		for (i = 0; i < ipoib_recvq_size; ++i) {
-- 
cgit v1.2.3


From c8c2afe360b7366f586f6bece1109a72ea334876 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.co.il>
Date: Mon, 14 Jul 2008 23:48:51 -0700
Subject: IPoIB: Use rtnl lock/unlock when changing device flags

Use of this lock is required to synchronize changes to the netdvice's
data structs.  Also move the call to ipoib_flush_paths() after the
modification of the netdevice flags in set_mode().

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 37bf67b2a26f..b4269139135b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1440,7 +1440,9 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 		ipoib_warn(priv, "enabling connected mode "
 			   "will cause multicast packet drops\n");
 
+		rtnl_lock();
 		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
+		rtnl_unlock();
 		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 		ipoib_flush_paths(dev);
@@ -1449,14 +1451,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 
 	if (!strcmp(buf, "datagram\n")) {
 		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
-		dev->mtu = min(priv->mcast_mtu, dev->mtu);
-		ipoib_flush_paths(dev);
 
+		rtnl_lock();
 		if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) {
 			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
 			if (priv->hca_caps & IB_DEVICE_UD_TSO)
 				dev->features |= NETIF_F_TSO;
 		}
+		dev->mtu = min(priv->mcast_mtu, dev->mtu);
+		rtnl_unlock();
+		ipoib_flush_paths(dev);
 
 		return count;
 	}
-- 
cgit v1.2.3


From bd3606715effbf37df986548c43bbed0842b49d5 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.co.il>
Date: Mon, 14 Jul 2008 23:48:51 -0700
Subject: IPoIB: Use dev_set_mtu() to change mtu

When the driver sets the MTU of the net device outside of its
change_mtu method, it should make use of dev_set_mtu() instead of
directly setting the mtu field of struct netdevice.  Otherwise
functions registered to be called upon MTU change will not get called
(this is done through call_netdevice_notifiers() in dev_set_mtu()).

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index b4269139135b..87f9f3ef3b2d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1458,7 +1458,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 			if (priv->hca_caps & IB_DEVICE_UD_TSO)
 				dev->features |= NETIF_F_TSO;
 		}
-		dev->mtu = min(priv->mcast_mtu, dev->mtu);
+		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
 		rtnl_unlock();
 		ipoib_flush_paths(dev);
 
-- 
cgit v1.2.3


From e112373fd6aa280bd2cbc0d5cc3809115325a1be Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@dev.mellanox.co.il>
Date: Mon, 14 Jul 2008 23:48:52 -0700
Subject: IPoIB/cm: Reduce connected mode TX object size

Since IPoIB connected mode does not NETIF_F_SG, we only have one DMA
mapping per send, so we don't need a mapping[] array.  Define a new
struct with a single u64 mapping member and use it for the CM tx_ring.

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers/infiniband/ulp/ipoib/ipoib_cm.c')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 87f9f3ef3b2d..0f2d3045061a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -703,7 +703,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	u64 addr;
 
 	if (unlikely(skb->len > tx->mtu)) {
@@ -734,7 +734,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		return;
 	}
 
-	tx_req->mapping[0] = addr;
+	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
 			       addr, skb->len))) {
@@ -759,7 +759,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx *tx = wc->qp->qp_context;
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 
 	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -773,7 +773,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
+	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->stats.tx_packets;
@@ -1143,7 +1143,7 @@ err_tx:
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 	unsigned long begin;
 
@@ -1171,7 +1171,7 @@ timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
-- 
cgit v1.2.3