From 1d2e3f396c532b95a37bbee92269f37efe908457 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:22 -0700 Subject: [PATCH 01/14] RDS: restore return value in rds_cmsg_rdma_args() In rds_cmsg_rdma_args() 'ret' is used by rds_pin_pages() which returns number of pinned pages on success. And the same value is returned to the caller of rds_cmsg_rdma_args() on success which is not intended. Commit f4a3fc03c1d7 ("RDS: Clean up error handling in rds_cmsg_rdma_args") removed the 'ret = 0' line which broke RDS RDMA mode. Fix it by restoring the return value on rds_pin_pages() success keeping the clean-up in place. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 40084d843e9f..6401b501a215 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -658,6 +658,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; + else + ret = 0; rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); From 43962dd7ee192299c6e0c6cd7f0a65997308f1f4 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:23 -0700 Subject: [PATCH 02/14] RDS: always free recv frag as we free its ring entry We were still seeing rare occurrences of the WARN_ON(recv->r_frag) which indicates that the recv refill path was finding allocated frags in ring entries that were marked free. These were usually followed by OOM crashes. They only seem to be occurring in the presence of completion errors and connection resets. This patch ensures that we free the frag as we mark the ring entry free. This should stop the refill path from finding allocated frags in ring entries that were marked free. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index cac5b4506ee3..2a6a75c59943 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -982,10 +982,17 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, } /* - * It's very important that we only free this ring entry if we've truly - * freed the resources allocated to the entry. The refilling path can - * leak if we don't. + * rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } rds_ib_ring_free(&ic->i_recv_ring, 1); } } From 1bc7b863f230e429dd7a06c0956ada7933d69f50 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:24 -0700 Subject: [PATCH 03/14] RDS: destroy the ib state earlier during shutdown Destroy ib state early during shutdown. Otherwise we can get callbacks after the QP isn't really able to handle them. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_cm.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f40d8f52b753..94d4427377b2 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -640,6 +640,16 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) (atomic_read(&ic->i_signaled_sends) == 0)); tasklet_kill(&ic->i_recv_tasklet); + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * @@ -663,14 +673,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - rdma_destroy_id(ic->i_cm_id); - /* * Move connection back to the nodev list. */ From e1f475a738e4c861d8aff84b737a0538680cbe05 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:25 -0700 Subject: [PATCH 04/14] RDS: don't update ip address tables if the address hasn't changed If the ip address table hasn't changed, there is no need to remove the entry only to add it back again. Let's fix it. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 657ba9f5d308..e49c9568b4a5 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) From 73ce4317bf983282593aff710b112a7e705620c3 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:26 -0700 Subject: [PATCH 05/14] RDS: make sure we post recv buffers If we get an ENOMEM during rds_ib_recv_refill, we might never come back and refill again later. This patch makes sure to kick krdsd into helping out. To achieve this we add an RDS_RECV_REFILL flag and update it in the refill path so that at least one thread will keep posting receive buffers. Since krdsd and softirq both might race for the refill, we decide to schedule on the work queue based on ring_low instead of ring_empty. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/connection.c | 2 ++ net/rds/ib.h | 2 +- net/rds/ib_cm.c | 2 +- net/rds/ib_recv.c | 58 +++++++++++++++++++++++++++++++++++++++----- net/rds/rds.h | 1 + 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index d4fecb21ca25..a50e652eb269 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -301,6 +301,8 @@ void rds_conn_shutdown(struct rds_connection *conn) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + wait_event(conn->c_waitq, + !test_bit(RDS_RECV_REFILL, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); diff --git a/net/rds/ib.h b/net/rds/ib.h index 86d88ec5d556..6422c52682e5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -320,7 +320,7 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 94d4427377b2..04243dd1c2ea 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, 1); + rds_ib_recv_refill(conn, 1, GFP_KERNEL); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 2a6a75c59943..3afdcbdd06b4 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, int prefill) + struct rds_ib_recv_work *recv, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (prefill) { + if (gfp & __GFP_WAIT) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -347,6 +347,24 @@ out: return ret; } +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + + /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into @@ -354,15 +372,23 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. 
*/ -void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; + int can_wait = gfp & __GFP_WAIT; u32 pos; + /* the goal here is to just make sure that someone, somewhere + * is posting buffers. If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + while ((prefill || rds_conn_up(conn)) && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { @@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, prefill); + ret = rds_ib_recv_refill_one(conn, recv, gfp); if (ret) { break; } @@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. + */ + if (rds_conn_up(conn) && + ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } } /* @@ -1023,7 +1067,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data) rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) - rds_ib_recv_refill(conn, 0); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } int rds_ib_recv(struct rds_connection *conn) @@ -1032,8 +1076,10 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - if (rds_conn_up(conn)) + if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + } return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 9005fb0586f6..afb4048d0cfd 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -80,6 +80,7 @@ enum { #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 struct rds_connection { struct hlist_node c_hash_node; From 0c484240210a8c835a216dbca6f1d690e4094a7c Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:27 -0700 Subject: [PATCH 06/14] RDS: check for congestion updates during rds_send_xmit Ensure we don't keep sending the data if the link is congested. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rds/send.c b/net/rds/send.c index 2581b8e3dbe7..dd901c324edf 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -411,7 +411,8 @@ over_batch: */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue) && + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); goto restart; From 0df5f9a68a79f31e608b69a9369d6415a38843e1 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:28 -0700 Subject: [PATCH 07/14] RDS: add a sock_destruct callback debug aid This helps to detect the accidental processes/apps trying to destroy the RDS socket which they are sharing with other processes/apps. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/af_rds.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 896834cd3b9a..a2f28a6d4dc5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -438,6 +438,14 @@ static const struct proto_ops rds_proto_ops = { .sendpage = sock_no_sendpage, }; +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + WARN_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; @@ -445,6 +453,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); From 4f73113c63bae5cd092c93c2a9835b9d3c7f0b54 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:29 -0700 Subject: [PATCH 08/14] RDS: Mark message mapped before transmit rds_send_xmit() marks the rds message map flag after xmit_[rdma/atomic]() which is clearly wrong. We need to maintain the ownership between transport and rds. Also take care of error path. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/net/rds/send.c b/net/rds/send.c index dd901c324edf..faad1be61ab6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -282,26 +282,34 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_rdma_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. 
+ * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_atomic_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } /* From 3049147ca7c8426268433d16d7546a51dede316d Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:30 -0700 Subject: [PATCH 09/14] RDS: Make sure we do a signaled send for large-send WRs (Work Requests) always generate a WC (Work Completion) with a signaled send. The default RDS ib code is set up for unsignaled completion. Since the RDS connection is persistent, we can end up sending data even after a large-send when the remote end is not active (for any reason). By doing a signaled send at least once per large-send, we can at least detect the problem in the work completion handler and avoid sending more data to an inactive remote. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_send.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 5d0a704fa039..c576ebeb4115 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -709,6 +709,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } From 5c240fa2ab394af1dbde280e00cc038cbc7f0409 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:31 -0700 Subject: [PATCH 10/14] RDS: Fix assertion level from fatal to warning Fix the assertion level since it's not fatal and can be hit in normal execution paths. There is no need to take the system down. We keep the WARN_ON() to detect the condition if we get here with bad pages. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_rdma.c | 2 +- net/rds/rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e49c9568b4a5..7b7aac8cdb56 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -490,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 6401b501a215..c1df9b1cf3b2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); } put_page(page); From 1c3be624f40acd6f2bd0f22ade081ac2467e8617 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 22 Aug 2015 15:45:32 -0700 Subject: [PATCH 11/14] RDS: Don't destroy the rdma id until after we're done using it During connection resets, we are destroying the rdma id too soon. We can't destroy it while it is still in use. So let's move rdma_destroy_id() to after we clear the rings. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 04243dd1c2ea..d150bb4aa3cb 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -647,7 +647,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ib_destroy_cq(ic->i_send_cq); if (ic->i_recv_cq) ib_destroy_cq(ic->i_recv_cq); - rdma_destroy_id(ic->i_cm_id); /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) @@ -673,6 +672,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); + rdma_destroy_id(ic->i_cm_id); + /* * Move connection back to the nodev list. */ From dfcec251d22bab947d10cf37e9ad67085cf7f097 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:33 -0700 Subject: [PATCH 12/14] RDS: make sure rds_send_drop_to properly takes the m_rs_lock rds_send_drop_to() is used during socket teardown to find all the messages on the socket and flush them. It can race with the acking code unless it takes the m_rs_lock on each and every message. This plugs a hole where we didn't take m_rs_lock on any message that didn't have the RDS_MSG_ON_CONN flag set. Taking m_rs_lock avoids double frees and other memory corruptions, since otherwise the ack code could trust the message's m_rs pointer even after the socket it points to has been freed. We must take m_rs_lock to access m_rs. Because of lock nesting and rs access, we also need to acquire rs_lock. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/send.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/rds/send.c b/net/rds/send.c index faad1be61ab6..b40c2ea8e89a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -778,8 +778,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); - rds_message_wait(rm); + + /* just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); } } From 06e8941e22f5cbaa4051f06d7ac99e3f302f6d48 Mon Sep 17 00:00:00 2001 From: Mukesh Kacker Date: Sat, 22 Aug 2015 15:45:34 -0700 Subject: [PATCH 13/14] RDS: return EMSGSIZE for oversize requests before processing/queueing rds_send_queue_rm() allows the "current datagram" being queued to exceed the SO_SNDBUF threshold because it checks the bytes already queued without counting the length of the current datagram. (Since sk_sndbuf is set to twice the requested SO_SNDBUF value as a kernel heuristic, this is usually fine.) If the "current datagram" squeezing past the threshold is itself many times the size of sk_sndbuf, then even twice the SO_SNDBUF does not save us: it gets queued but can never be transmitted. Threads block, deadlock, and the device becomes unusable. The EMSGSIZE check against the SNDBUF threshold is never applied to this datagram, because that check only runs when a queueing attempt fails. (Datagrams that follow it fail their queueing attempts, go through the check and eventually trip the EMSGSIZE error, but zero-length datagrams silently fail!) This fix moves the check for datagrams exceeding the SNDBUF limit before any processing or queueing is attempted and returns EMSGSIZE early in the rds_sendmsg() code. This change also ensures that all datagrams are checked against the SNDBUF/sk_sndbuf size limit, and large datagrams that exceed it never reach rds_send_queue_rm() for processing. Signed-off-by: Mukesh Kacker Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/rds/send.c b/net/rds/send.c index b40c2ea8e89a..4df61a515b83 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1015,6 +1015,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + /* size of rm including all sgs */ ret = rds_rm_size(msg, payload_len); if (ret < 0) @@ -1087,11 +1092,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); - /* XXX make sure this is reasonable */ - if (payload_len > rds_sk_sndbuf(rs)) { - ret = -EMSGSIZE; - goto out; - } + if (nonblock) { ret = -EAGAIN; goto out; From ae05368afa7ea27fd906477874ed3ebc7df7e53c Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:35 -0700 Subject: [PATCH 14/14] RDS: check for valid cm_id before initiating connection Connection could have been dropped while the route is being resolved so check for valid cm_id before initiating the connection. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma_transport.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 208240836043..b9b40af5345b 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -34,6 +34,7 @@ #include #include "rdma_transport.h" +#include "ib.h" static struct rdma_cm_id *rds_rdma_listen_id; @@ -82,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - /* XXX worry about racing with listen acceptance */ - ret = trans->cm_initiate_connect(cm_id); + /* Connection could have been dropped so make sure the + * cm_id is valid before proceeding + */ + if (conn) { + struct rds_ib_connection *ibic; + + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ret = trans->cm_initiate_connect(cm_id); + else + rds_conn_drop(conn); + } break; case RDMA_CM_EVENT_ESTABLISHED:
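
A few illustrative userspace sketches of the patterns used in this series follow; none of the identifiers in them are kernel APIs. The return-value fix in patch 01 boils down to a common pattern: a helper that reports success as a positive count must not have that count leak out of a caller whose contract is 0-on-success. A minimal sketch with made-up stand-ins for rds_pin_pages() and its caller:

#include <stdio.h>

/* Hypothetical stand-in for rds_pin_pages(): returns the number of pinned
 * pages on success, or a negative errno-style value on failure. */
static int pin_pages(int nr)
{
	return nr > 0 ? nr : -22;	/* -EINVAL */
}

/* The caller's contract is 0 on success and negative on error, so the
 * positive page count has to be normalized before returning; returning it
 * as-is is exactly the bug patch 01 fixes. */
static int setup_rdma_args(int nr)
{
	int ret = pin_pages(nr);

	if (ret < 0)
		return ret;	/* propagate the error */
	ret = 0;		/* success: don't leak the page count */
	/* ... continue building the request ... */
	return ret;
}

int main(void)
{
	printf("success -> %d, error -> %d\n",
	       setup_rdma_args(4), setup_rdma_args(0));
	return 0;
}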
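
Patch 04's change to rds_ib_update_ipaddr() is essentially an idempotence check: only touch the address table when the device backing the address actually changed. A small sketch of that control flow, using an invented array-based lookup/add/remove table rather than the kernel's data structures:

#include <stdio.h>

struct dev { int id; };

/* Invented helpers: the "table" is just an array indexed by address. */
static struct dev *lookup(struct dev *tbl[], unsigned int addr) { return tbl[addr]; }
static void add_addr(struct dev *tbl[], unsigned int addr, struct dev *d) { tbl[addr] = d; }
static void del_addr(struct dev *tbl[], unsigned int addr) { tbl[addr] = NULL; }

/* Same shape as the patched flow: add when the address is unknown, move it
 * when it points at a different device, and leave the table alone when
 * nothing changed. */
static int update_ipaddr(struct dev *tbl[], unsigned int addr, struct dev *d)
{
	struct dev *old = lookup(tbl, addr);

	if (!old) {
		add_addr(tbl, addr, d);
		return 0;
	}
	if (old != d) {
		del_addr(tbl, addr);
		add_addr(tbl, addr, d);
		return 0;
	}
	return 0;	/* unchanged: no remove/re-add churn */
}

int main(void)
{
	struct dev a = { 1 }, b = { 2 };
	struct dev *tbl[4] = { 0 };

	update_ipaddr(tbl, 1, &a);	/* first add */
	update_ipaddr(tbl, 1, &a);	/* no-op */
	update_ipaddr(tbl, 1, &b);	/* device changed */
	printf("addr 1 -> dev %d\n", tbl[1]->id);
	return 0;
}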
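
Patch 05's refill scheme relies on a try-lock style bit (RDS_RECV_REFILL) so that softirq and krdsd can both attempt a refill without stepping on each other, plus a requeue decision based on ring_low vs. ring_empty. A loose userspace sketch of that pattern using a C11 atomic flag in place of test_and_set_bit(); all names here are illustrative, not the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the RDS_RECV_REFILL bit on conn->c_flags. */
static atomic_flag refill_busy = ATOMIC_FLAG_INIT;

static bool acquire_refill(void)
{
	/* test_and_set_bit() analogue: only one caller wins the flag. */
	return !atomic_flag_test_and_set(&refill_busy);
}

static void release_refill(void)
{
	atomic_flag_clear(&refill_busy);
	/* The kernel code additionally wakes conn->c_waitq here, but only
	 * if waitqueue_active() says someone is actually waiting, so the
	 * hot path never walks the hashed waitqueues. */
}

/* Whoever wins the flag posts buffers; everyone else returns immediately,
 * trusting that somebody is already refilling the ring. */
static bool recv_refill(bool can_sleep, int *to_post)
{
	if (!acquire_refill())
		return false;
	while (*to_post > 0)
		(*to_post)--;		/* "post" one receive buffer */
	release_refill();
	/* A sleeping (GFP_KERNEL) caller re-queues the work if the ring is
	 * merely low, a GFP_NOWAIT caller only when it is empty; modeled
	 * here by the return value. */
	return can_sleep;
}

int main(void)
{
	int to_post = 8;
	bool requeue = recv_refill(true, &to_post);

	printf("posted all buffers, requeue=%d left=%d\n", requeue, to_post);
	return 0;
}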
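
Patch 08 is about ownership ordering: the RDS_MSG_MAPPED flag must be set before the transport is handed the message, and taken back (with waiters woken) if the transport call fails. A compact sketch of that set-before-call / clear-on-error pattern with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

struct msg {
	bool mapped;		/* stand-in for RDS_MSG_MAPPED */
};

/* Hypothetical transport hook; returns 0 on success, negative on error. */
static int xmit_rdma(struct msg *m, bool fail)
{
	/* By the time the transport runs it may already be using the
	 * mapping, which is why the flag must be set before this call. */
	if (!m->mapped)
		return -22;		/* -EINVAL: caller broke the contract */
	return fail ? -12 : 0;		/* -ENOMEM on simulated failure */
}

static int send_one(struct msg *m, bool fail)
{
	int ret;

	m->mapped = true;		/* transport owns the mapping now */
	ret = xmit_rdma(m, fail);
	if (ret) {
		m->mapped = false;	/* error: give ownership back */
		/* the kernel code also wakes anyone in rds_message_wait() */
	}
	return ret;
}

int main(void)
{
	struct msg m = { false };

	printf("ok=%d mapped=%d\n", send_one(&m, false), m.mapped);
	printf("err=%d mapped=%d\n", send_one(&m, true), m.mapped);
	return 0;
}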
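
Patch 09 forces at least one signaled work request per large send so a dead peer is eventually noticed through the completion path. A toy sketch of the counting logic; MAX_UNSIGNALED and should_signal() are invented stand-ins for rds_ib_sysctl_max_unsig_wrs and the in-kernel bookkeeping:

#include <stdio.h>

#define MAX_UNSIGNALED 16	/* stand-in for rds_ib_sysctl_max_unsig_wrs */

static int unsignaled_left = MAX_UNSIGNALED;

/* Returns 1 if this work request should carry IB_SEND_SIGNALED. Normally
 * only every MAX_UNSIGNALED-th WR is signaled; the patch adds a second
 * rule: the last WR of a message is always signaled, so an unresponsive
 * peer shows up in the work completion handler. */
static int should_signal(int last_wr_of_message)
{
	if (--unsignaled_left == 0 || last_wr_of_message) {
		unsignaled_left = MAX_UNSIGNALED;
		return 1;
	}
	return 0;
}

int main(void)
{
	for (int wr = 0; wr < 5; wr++)
		printf("wr %d signaled=%d\n", wr, should_signal(wr == 4));
	return 0;
}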
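
Patch 12's rule is a lock-ordering one: visit each message under its own m_rs_lock, and nest rs_lock inside it before touching completion state, so the ack path can never act on a half-torn-down message. A rough pthread-based analogy; kernel spinlocks and __rds_send_complete() are only mimicked here:

#include <pthread.h>
#include <stdio.h>

struct sockst { pthread_mutex_t rs_lock; int completed; };
struct message { pthread_mutex_t m_rs_lock; struct sockst *m_rs; };

static void drop_message(struct message *rm)
{
	pthread_mutex_lock(&rm->m_rs_lock);	/* serializes with acking */
	if (rm->m_rs) {
		pthread_mutex_lock(&rm->m_rs->rs_lock);
		rm->m_rs->completed++;		/* "__rds_send_complete()" */
		pthread_mutex_unlock(&rm->m_rs->rs_lock);
		rm->m_rs = NULL;		/* message no longer on the socket */
	}
	pthread_mutex_unlock(&rm->m_rs_lock);
}

int main(void)
{
	struct sockst rs = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct message rm = { PTHREAD_MUTEX_INITIALIZER, &rs };

	drop_message(&rm);
	printf("completed=%d m_rs=%p\n", rs.completed, (void *)rm.m_rs);
	return 0;
}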
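
Patch 13 moves the oversize check to the very front of the send path so every datagram, including the first oversized one, gets an EMSGSIZE instead of being queued unsendably. A minimal sketch of that ordering, assuming a simple sndbuf limit:

#include <errno.h>
#include <stdio.h>

/* Illustrative sendmsg-style front end: reject a datagram that can never
 * fit the send buffer before allocating or queueing anything, instead of
 * only noticing once a later queueing attempt fails. */
static int send_datagram(size_t payload_len, size_t sndbuf)
{
	if (payload_len > sndbuf)
		return -EMSGSIZE;	/* early, and for every datagram */

	/* ... allocate the message, copy the payload, queue for transmit ... */
	return 0;
}

int main(void)
{
	size_t sndbuf = 64 * 1024;

	printf("small: %d\n", send_datagram(4096, sndbuf));
	printf("huge:  %d\n", send_datagram(1 << 20, sndbuf));
	return 0;
}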
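
Patch 14 guards against acting on a stale rdma_cm_id: on ROUTE_RESOLVED, connect only if the event's cm_id still matches the one the connection holds, otherwise drop the connection. A tiny sketch of that compare-before-act check with invented types:

#include <stdio.h>

struct cm_id { int n; };
struct conn  { struct cm_id *cur_cm_id; };

/* Only initiate the connection if the cm_id the event arrived on is still
 * the one the connection is using; otherwise the connection was dropped or
 * reset meanwhile and should be dropped again rather than connected on a
 * stale id. (Names are illustrative.) */
static const char *route_resolved(struct conn *c, struct cm_id *event_id)
{
	if (c && c->cur_cm_id == event_id)
		return "initiate connect";
	return "drop connection";
}

int main(void)
{
	struct cm_id old = { 1 }, cur = { 2 };
	struct conn c = { &cur };

	printf("current id: %s\n", route_resolved(&c, &cur));
	printf("stale id:   %s\n", route_resolved(&c, &old));
	return 0;
}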