From 05dcc5aa4dcced4f59f925625cea669e82b75519 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Fri, 13 Mar 2015 16:08:10 -0400
Subject: tipc: split link outqueue

struct tipc_link contains one single queue for outgoing packets, where
both transmitted and waiting packets are queued. This infrastructure is
hard to maintain, because we need to keep a number of fields to keep
track of which packets are sent or unsent, and the number of packets in
each category.

A lot of code becomes simpler if we split this queue into a transmission
queue, where sent/unacknowledged packets are kept, and a backlog queue,
where we keep the not yet sent packets.

In this commit we do this separation.

Reviewed-by: Erik Hugne
Reviewed-by: Ying Xue
Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/tipc/node.c')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 86152de8248d..26d1de1bf34d 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -111,7 +111,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
 	INIT_LIST_HEAD(&n_ptr->list);
 	INIT_LIST_HEAD(&n_ptr->publ_list);
 	INIT_LIST_HEAD(&n_ptr->conn_sks);
-	__skb_queue_head_init(&n_ptr->bclink.deferred_queue);
+	__skb_queue_head_init(&n_ptr->bclink.deferdq);
 	hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
 	list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
 		if (n_ptr->addr < temp_node->addr)
@@ -354,7 +354,7 @@ static void node_lost_contact(struct tipc_node *n_ptr)
 
 	/* Flush broadcast link info associated with lost node */
 	if (n_ptr->bclink.recv_permitted) {
-		__skb_queue_purge(&n_ptr->bclink.deferred_queue);
+		__skb_queue_purge(&n_ptr->bclink.deferdq);
 
 		if (n_ptr->bclink.reasm_buf) {
 			kfree_skb(n_ptr->bclink.reasm_buf);
--
cgit v1.2.3
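[Editorial illustration] The queue split above is a generic pattern, so a minimal,
self-contained user-space sketch may help. This is not the TIPC code; struct pkt,
struct queue and link_push() are hypothetical names. The point it demonstrates:
once sent and unsent packets live in separate queues, each queue's own length
replaces the extra bookkeeping fields the commit message complains about.

#include <stdio.h>

/* One packet in a singly linked queue. */
struct pkt {
	int seqno;
	struct pkt *next;
};

struct queue {
	struct pkt *head, *tail;
	int len;
};

static void enqueue(struct queue *q, struct pkt *p)
{
	p->next = NULL;
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	q->len++;
}

static struct pkt *dequeue(struct queue *q)
{
	struct pkt *p = q->head;

	if (!p)
		return NULL;
	q->head = p->next;
	if (!q->head)
		q->tail = NULL;
	q->len--;
	return p;
}

/* Move packets from the backlog to the transmit queue while the send
 * window allows; sent/unsent counts are just the two queue lengths. */
static void link_push(struct queue *transmq, struct queue *backlogq, int window)
{
	struct pkt *p;

	while (transmq->len < window && (p = dequeue(backlogq))) {
		enqueue(transmq, p);
		printf("sent packet %d\n", p->seqno);
	}
}

int main(void)
{
	struct queue transmq = { 0 }, backlogq = { 0 };
	struct pkt pkts[5];
	int i;

	for (i = 0; i < 5; i++) {
		pkts[i].seqno = i;
		enqueue(&backlogq, &pkts[i]);
	}
	link_push(&transmq, &backlogq, 3); /* window of 3: sends 0, 1, 2 */
	printf("unsent: %d, unacked: %d\n", backlogq.len, transmq.len);
	return 0;
}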
From b952b2befb6f6b009e91f087285b9a0a6beb1cc8 Mon Sep 17 00:00:00 2001
From: Ying Xue
Date: Thu, 26 Mar 2015 18:10:23 +0800
Subject: tipc: fix potential deadlock when all links are reset

[ 60.988363] ======================================================
[ 60.988754] [ INFO: possible circular locking dependency detected ]
[ 60.989152] 3.19.0+ #194 Not tainted
[ 60.989377] -------------------------------------------------------
[ 60.989781] swapper/3/0 is trying to acquire lock:
[ 60.990079]  (&(&n_ptr->lock)->rlock){+.-...}, at: [] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.990743]
[ 60.990743] but task is already holding lock:
[ 60.991106]  (&(&bclink->lock)->rlock){+.-...}, at: [] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.991738]
[ 60.991738] which lock already depends on the new lock.
[ 60.991738]
[ 60.992174]
[ 60.992174] the existing dependency chain (in reverse order) is:
[ 60.992174] -> #1 (&(&bclink->lock)->rlock){+.-...}:
[ 60.992174]        [] lock_acquire+0x9c/0x140
[ 60.992174]        [] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174]        [] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.992174]        [] tipc_bclink_add_node+0x97/0xf0 [tipc]
[ 60.992174]        [] tipc_node_link_up+0xf5/0x110 [tipc]
[ 60.992174]        [] link_state_event+0x2b3/0x4f0 [tipc]
[ 60.992174]        [] tipc_link_proto_rcv+0x24c/0x418 [tipc]
[ 60.992174]        [] tipc_rcv+0x827/0xac0 [tipc]
[ 60.992174]        [] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174]        [] __netif_receive_skb_core+0x746/0x980
[ 60.992174]        [] __netif_receive_skb+0x21/0x70
[ 60.992174]        [] netif_receive_skb_internal+0x35/0x130
[ 60.992174]        [] napi_gro_receive+0x158/0x1d0
[ 60.992174]        [] e1000_clean_rx_irq+0x155/0x490
[ 60.992174]        [] e1000_clean+0x267/0x990
[ 60.992174]        [] net_rx_action+0x150/0x360
[ 60.992174]        [] __do_softirq+0x123/0x360
[ 60.992174]        [] irq_exit+0x8e/0xb0
[ 60.992174]        [] do_IRQ+0x65/0x110
[ 60.992174]        [] ret_from_intr+0x0/0x13
[ 60.992174]        [] arch_cpu_idle+0xf/0x20
[ 60.992174]        [] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174]        [] start_secondary+0x13a/0x150
[ 60.992174] -> #0 (&(&n_ptr->lock)->rlock){+.-...}:
[ 60.992174]        [] __lock_acquire+0x163d/0x1ca0
[ 60.992174]        [] lock_acquire+0x9c/0x140
[ 60.992174]        [] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174]        [] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.992174]        [] tipc_bclink_rcv+0x611/0x640 [tipc]
[ 60.992174]        [] tipc_rcv+0x616/0xac0 [tipc]
[ 60.992174]        [] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174]        [] __netif_receive_skb_core+0x746/0x980
[ 60.992174]        [] __netif_receive_skb+0x21/0x70
[ 60.992174]        [] netif_receive_skb_internal+0x35/0x130
[ 60.992174]        [] napi_gro_receive+0x158/0x1d0
[ 60.992174]        [] e1000_clean_rx_irq+0x155/0x490
[ 60.992174]        [] e1000_clean+0x267/0x990
[ 60.992174]        [] net_rx_action+0x150/0x360
[ 60.992174]        [] __do_softirq+0x123/0x360
[ 60.992174]        [] irq_exit+0x8e/0xb0
[ 60.992174]        [] do_IRQ+0x65/0x110
[ 60.992174]        [] ret_from_intr+0x0/0x13
[ 60.992174]        [] arch_cpu_idle+0xf/0x20
[ 60.992174]        [] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174]        [] start_secondary+0x13a/0x150
[ 60.992174]
[ 60.992174] other info that might help us debug this:
[ 60.992174]
[ 60.992174]  Possible unsafe locking scenario:
[ 60.992174]
[ 60.992174]        CPU0                    CPU1
[ 60.992174]        ----                    ----
[ 60.992174]   lock(&(&bclink->lock)->rlock);
[ 60.992174]                                lock(&(&n_ptr->lock)->rlock);
[ 60.992174]                                lock(&(&bclink->lock)->rlock);
[ 60.992174]   lock(&(&n_ptr->lock)->rlock);
[ 60.992174]
[ 60.992174]  *** DEADLOCK ***
[ 60.992174]
[ 60.992174] 3 locks held by swapper/3/0:
[ 60.992174]  #0: (rcu_read_lock){......}, at: [] __netif_receive_skb_core+0x71/0x980
[ 60.992174]  #1: (rcu_read_lock){......}, at: [] tipc_l2_rcv_msg+0x5/0xd0 [tipc]
[ 60.992174]  #2: (&(&bclink->lock)->rlock){+.-...}, at: [] tipc_bclink_lock+0x8e/0xa0 [tipc]

The correct sequence for grabbing n_ptr->lock and bclink->lock is to hold
the former first and then take the latter, which is exactly what happens
on CPU1. But when retransmission on the broadcast link fails, bclink->lock
is held first in tipc_bclink_rcv(), and n_ptr->lock is subsequently taken
in link_retransmit_failure(), called from tipc_link_retransmit(), as
demonstrated on CPU0. As a result, deadlock occurs. Reversing the order
in which the two locks are taken on CPU0 removes the deadlock risk.

Therefore, the node lock originally taken in link_retransmit_failure() is
moved to tipc_bclink_rcv(), so that it is obtained before the bclink lock.
The precondition for this adjustment is that the response to the bclink
reset event must be moved from tipc_bclink_unlock() to tipc_node_unlock().

Reviewed-by: Erik Hugne
Signed-off-by: Ying Xue
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/tipc/node.c')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 26d1de1bf34d..5cc43d34ad0a 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -459,7 +459,7 @@ void tipc_node_unlock(struct tipc_node *node)
 				TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
 				TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP |
 				TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT |
-				TIPC_NAMED_MSG_EVT);
+				TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET);
 
 	spin_unlock_bh(&node->lock);
 
@@ -488,6 +488,9 @@ void tipc_node_unlock(struct tipc_node *node)
 
 	if (flags & TIPC_BCAST_MSG_EVT)
 		tipc_bclink_input(net);
+
+	if (flags & TIPC_BCAST_RESET)
+		tipc_link_reset_all(node);
 }
 
 /* Caller should hold node lock for the passed node */
--
cgit v1.2.3
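[Editorial illustration] The diff above records the reset event as an action flag
under the node lock and acts on it only after the lock is dropped, which is what
makes the lock-order reversal possible. Below is a minimal user-space sketch of
that deferred-action pattern, using pthreads rather than kernel spinlocks; the
names EVT_RESET, struct node and node_unlock() are illustrative, not TIPC's.
Compile with -pthread.

#include <pthread.h>
#include <stdio.h>

#define EVT_RESET (1 << 0)

struct node {
	pthread_mutex_t lock;
	unsigned int action_flags;
};

static void handle_reset(struct node *n)
{
	/* Safe to take other locks here: n->lock is no longer held,
	 * so no inverted lock order can arise. */
	printf("resetting all links for node %p\n", (void *)n);
}

static void node_unlock(struct node *n)
{
	unsigned int flags = n->action_flags;

	n->action_flags = 0;           /* consume events while still locked */
	pthread_mutex_unlock(&n->lock);

	if (flags & EVT_RESET)         /* act on them lock-free */
		handle_reset(n);
}

int main(void)
{
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER };

	pthread_mutex_lock(&n.lock);
	n.action_flags |= EVT_RESET;   /* event raised under the lock */
	node_unlock(&n);               /* handled only after release */
	return 0;
}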
From 8a0f6ebe8494c5c6ccfe12264385b64c280e3241 Mon Sep 17 00:00:00 2001
From: Ying Xue
Date: Thu, 26 Mar 2015 18:10:24 +0800
Subject: tipc: involve reference counter for node structure

The TIPC node hash table is protected with an RCU lock on the read side.
tipc_node_find() looks up a node object by node address, iterating over
the hash table. As the entire traversal in tipc_node_find() is guarded
by the RCU read lock, the lookup itself is safe. However, when callers
use the node object returned by tipc_node_find(), no RCU read lock is
held, which is unsafe.

Now we introduce a reference counter for the node structure. Before
tipc_node_find() returns a node object to its caller, it first increases
the reference counter. Accordingly, after the caller is done with the
object, it decreases the counter again. This prevents a node being used
by one thread from being freed by another thread.

Reviewed-by: Erik Hugne
Reviewed-by: Jon Maloy
Signed-off-by: Ying Xue
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 85 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 24 deletions(-)

(limited to 'net/tipc/node.c')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 5cc43d34ad0a..3e4f04897c03 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -42,6 +42,7 @@
 
 static void node_lost_contact(struct tipc_node *n_ptr);
 static void node_established_contact(struct tipc_node *n_ptr);
+static void tipc_node_delete(struct tipc_node *node);
 
 struct tipc_sock_conn {
 	u32 port;
@@ -67,6 +68,23 @@ static unsigned int tipc_hashfn(u32 addr)
 	return addr & (NODE_HTABLE_SIZE - 1);
 }
 
+static void tipc_node_kref_release(struct kref *kref)
+{
+	struct tipc_node *node = container_of(kref, struct tipc_node, kref);
+
+	tipc_node_delete(node);
+}
+
+void tipc_node_put(struct tipc_node *node)
+{
+	kref_put(&node->kref, tipc_node_kref_release);
+}
+
+static void tipc_node_get(struct tipc_node *node)
+{
+	kref_get(&node->kref);
+}
+
 /*
  * tipc_node_find - locate specified node object, if it exists
  */
@@ -82,6 +100,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr)
 	hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)],
 				 hash) {
 		if (node->addr == addr) {
+			tipc_node_get(node);
 			rcu_read_unlock();
 			return node;
 		}
@@ -106,6 +125,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
 	}
 	n_ptr->addr = addr;
 	n_ptr->net = net;
+	kref_init(&n_ptr->kref);
 	spin_lock_init(&n_ptr->lock);
 	INIT_HLIST_NODE(&n_ptr->hash);
 	INIT_LIST_HEAD(&n_ptr->list);
@@ -120,16 +140,17 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
 		list_add_tail_rcu(&n_ptr->list, &temp_node->list);
 	n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN;
 	n_ptr->signature = INVALID_NODE_SIG;
+	tipc_node_get(n_ptr);
 exit:
 	spin_unlock_bh(&tn->node_list_lock);
 	return n_ptr;
 }
 
-static void tipc_node_delete(struct tipc_net *tn, struct tipc_node *n_ptr)
+static void tipc_node_delete(struct tipc_node *node)
 {
-	list_del_rcu(&n_ptr->list);
-	hlist_del_rcu(&n_ptr->hash);
-	kfree_rcu(n_ptr, rcu);
+	list_del_rcu(&node->list);
+	hlist_del_rcu(&node->hash);
+	kfree_rcu(node, rcu);
 }
 
 void tipc_node_stop(struct net *net)
@@ -139,7 +160,7 @@ void tipc_node_stop(struct net *net)
 
 	spin_lock_bh(&tn->node_list_lock);
 	list_for_each_entry_safe(node, t_node, &tn->node_list, list)
-		tipc_node_delete(tn, node);
+		tipc_node_put(node);
 	spin_unlock_bh(&tn->node_list_lock);
 }
 
@@ -147,6 +168,7 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
 {
 	struct tipc_node *node;
 	struct tipc_sock_conn *conn;
+	int err = 0;
 
 	if (in_own_node(net, dnode))
 		return 0;
@@ -157,8 +179,10 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
 		return -EHOSTUNREACH;
 	}
 	conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
-	if (!conn)
-		return -EHOSTUNREACH;
+	if (!conn) {
+		err = -EHOSTUNREACH;
+		goto exit;
+	}
 	conn->peer_node = dnode;
 	conn->port = port;
 	conn->peer_port = peer_port;
@@ -166,7 +190,9 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
 	tipc_node_lock(node);
 	list_add_tail(&conn->list, &node->conn_sks);
 	tipc_node_unlock(node);
-	return 0;
+exit:
+	tipc_node_put(node);
+	return err;
 }
 
 void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
@@ -189,6 +215,7 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
 		kfree(conn);
 	}
 	tipc_node_unlock(node);
+	tipc_node_put(node);
 }
 
 /**
@@ -417,19 +444,25 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
 			   char *linkname, size_t len)
 {
 	struct tipc_link *link;
+	int err = -EINVAL;
 	struct tipc_node *node = tipc_node_find(net, addr);
 
-	if ((bearer_id >= MAX_BEARERS) || !node)
-		return -EINVAL;
+	if (!node)
+		return err;
+
+	if (bearer_id >= MAX_BEARERS)
+		goto exit;
+
 	tipc_node_lock(node);
 	link = node->links[bearer_id];
 	if (link) {
 		strncpy(linkname, link->name, len);
-		tipc_node_unlock(node);
-		return 0;
+		err = 0;
 	}
+exit:
 	tipc_node_unlock(node);
-	return -EINVAL;
+	tipc_node_put(node);
+	return err;
 }
 
 void tipc_node_unlock(struct tipc_node *node)
@@ -545,17 +578,21 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	msg.seq = cb->nlh->nlmsg_seq;
 
 	rcu_read_lock();
-
-	if (last_addr && !tipc_node_find(net, last_addr)) {
-		rcu_read_unlock();
-		/* We never set seq or call nl_dump_check_consistent() this
-		 * means that setting prev_seq here will cause the consistence
-		 * check to fail in the netlink callback handler. Resulting in
-		 * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if
-		 * the node state changed while we released the lock.
-		 */
-		cb->prev_seq = 1;
-		return -EPIPE;
+	if (last_addr) {
+		node = tipc_node_find(net, last_addr);
+		if (!node) {
+			rcu_read_unlock();
+			/* We never set seq or call nl_dump_check_consistent()
+			 * this means that setting prev_seq here will cause the
+			 * consistence check to fail in the netlink callback
+			 * handler. Resulting in the NLMSG_DONE message having
+			 * the NLM_F_DUMP_INTR flag set if the node state
+			 * changed while we released the lock.
+			 */
+			cb->prev_seq = 1;
+			return -EPIPE;
+		}
+		tipc_node_put(node);
 	}
 
 	list_for_each_entry_rcu(node, &tn->node_list, list) {
--
cgit v1.2.3
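[Editorial illustration] For readers unfamiliar with the kref pattern this commit
applies, here is a rough user-space approximation using C11 atomics in place of
the kernel's kref and RCU machinery. All names (node_get, node_put, node_find,
node_create) are illustrative stand-ins, and the "table" is reduced to a single
entry; the point is only that the lookup returns the object with a reference
already taken, so it cannot be freed under the caller's feet.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	atomic_int refcount;
	unsigned int addr;
};

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcount, 1);
}

static void node_put(struct node *n)
{
	/* Free only when the last reference is dropped. */
	if (atomic_fetch_sub(&n->refcount, 1) == 1) {
		printf("freeing node 0x%x\n", n->addr);
		free(n);
	}
}

static struct node *node_create(unsigned int addr)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return NULL;
	atomic_init(&n->refcount, 1);	/* reference held by the "table" */
	n->addr = addr;
	return n;
}

/* Stand-in for tipc_node_find(): take a reference before returning. */
static struct node *node_find(struct node *table_entry, unsigned int addr)
{
	if (table_entry && table_entry->addr == addr) {
		node_get(table_entry);
		return table_entry;
	}
	return NULL;
}

int main(void)
{
	struct node *table = node_create(0x1001);
	struct node *n;

	if (!table)
		return 1;
	n = node_find(table, 0x1001);
	if (n) {
		printf("using node 0x%x\n", n->addr);
		node_put(n);	/* caller drops its reference when done */
	}
	node_put(table);	/* table's reference: now the node is freed */
	return 0;
}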
From dff29b1a88524fe6afe296d6c477c491d1e02af0 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Thu, 2 Apr 2015 09:33:01 -0400
Subject: tipc: eliminate delayed link deletion at link failover

When a bearer is disabled manually, all its links have to be reset
and deleted. However, if there is a remaining, parallel link ready
to take over a deleted link's traffic, we currently delay the delete
of the removed link until the failover procedure is finished. This
is because the remaining link needs to access state from the reset
link, such as the last received packet number, and any partially
reassembled buffer, in order to perform a successful failover.

In this commit, we do instead move the state data over to the new
link, so that it can fulfill the procedure autonomously, without
accessing any data on the old link. This means that we can now
proceed and delete all pertaining links immediately when a bearer
is disabled. This saves us from some unnecessary complexity in such
situations.

We also choose to change the confusing definitions CHANGEOVER_PROTOCOL,
ORIGINAL_MSG and DUPLICATE_MSG to the more descriptive TUNNEL_PROTOCOL,
FAILOVER_MSG and SYNCH_MSG respectively.

Reviewed-by: Ying Xue
Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net/tipc/node.c')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 3e4f04897c03..f3d522c2881a 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -394,18 +394,17 @@ static void node_lost_contact(struct tipc_node *n_ptr)
 		n_ptr->bclink.recv_permitted = false;
 	}
 
-	/* Abort link changeover */
+	/* Abort any ongoing link failover */
 	for (i = 0; i < MAX_BEARERS; i++) {
 		struct tipc_link *l_ptr = n_ptr->links[i];
 		if (!l_ptr)
 			continue;
-		l_ptr->reset_checkpoint = l_ptr->next_in_no;
-		l_ptr->exp_msg_count = 0;
+		l_ptr->flags &= ~LINK_FAILINGOVER;
+		l_ptr->failover_checkpt = 0;
+		l_ptr->failover_pkts = 0;
+		kfree_skb(l_ptr->failover_skb);
+		l_ptr->failover_skb = NULL;
 		tipc_link_reset_fragments(l_ptr);
-
-		/* Link marked for deletion after failover? => do it now */
-		if (l_ptr->flags & LINK_STOPPED)
-			tipc_link_delete(l_ptr);
 	}
 
 	n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN;
--
cgit v1.2.3
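[Editorial illustration] The essence of this change is an ownership transfer: the
failover-relevant state moves to the surviving link up front, so the old link can
be deleted at once instead of lingering until failover completes. A simplified
user-space sketch of that idea follows; the field and function names are
hypothetical, and the partially reassembled buffer is modeled as a plain heap
string rather than an sk_buff.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct link {
	const char *name;
	unsigned int last_rcv_no;	/* last received packet number */
	char *reasm_buf;		/* partially reassembled buffer */
	unsigned int failover_checkpt;
	char *failover_skb;
};

/* Move failover state from the dying link to the survivor, transferring
 * ownership of the reassembly buffer so nothing on the old link is needed. */
static void link_failover_prepare(struct link *dying, struct link *survivor)
{
	survivor->failover_checkpt = dying->last_rcv_no;
	survivor->failover_skb = dying->reasm_buf;
	dying->reasm_buf = NULL;	/* ownership transferred */
}

static void link_delete(struct link *l)
{
	free(l->reasm_buf);	/* NULL after the transfer, so a no-op */
	printf("deleted %s immediately\n", l->name);
}

int main(void)
{
	struct link old = { .name = "old-link", .last_rcv_no = 42,
			    .reasm_buf = strdup("partial-msg") };
	struct link parallel = { .name = "parallel-link" };

	link_failover_prepare(&old, &parallel);
	link_delete(&old);	/* no need to wait for failover to finish */
	printf("%s continues failover from pkt %u\n",
	       parallel.name, parallel.failover_checkpt);
	free(parallel.failover_skb);
	return 0;
}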
From ed193ece2649c194a87a9d8470195760d367c075 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Thu, 2 Apr 2015 09:33:02 -0400
Subject: tipc: simplify link mtu negotiation

When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.

However, as a remnant of earlier attempts to introduce TIPC level
routing, there also exists an MTU discovery mechanism. If an
intermediate node has a lower MTU than the two endpoints, they will
discover this through a bisectional approach, and finally adopt this
MTU for common use.

Since there is no TIPC level routing, and probably never will be, this
mechanism doesn't make any sense, and only serves to make the link
level protocol unnecessarily complex.

In this commit, we eliminate the MTU discovery algorithm, and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.

Reviewed-by: Ying Xue
Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/tipc/node.c')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index f3d522c2881a..22c059ad2999 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -254,8 +254,8 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
 	active[0] = active[1] = l_ptr;
 exit:
 	/* Leave room for changeover header when returning 'mtu' to users: */
-	n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE;
-	n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE;
+	n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
+	n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
 }
 
 /**
@@ -319,11 +319,10 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
 
 	/* Leave room for changeover header when returning 'mtu' to users: */
 	if (active[0]) {
-		n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE;
-		n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE;
+		n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
+		n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
 		return;
 	}
-
 	/* Loopback link went down? No fragmentation needed from now on. */
 	if (n_ptr->addr == tn->own_addr) {
 		n_ptr->act_mtus[0] = MAX_MSG_SIZE;
--
cgit v1.2.3
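[Editorial illustration] The advertising scheme the commit falls back to is easy
to state in code: both endpoints apply the same min() rule to their own interface
MTU and the peer's advertised one, so they converge on a common value without any
discovery probes. A minimal sketch follows; the message struct is invented for
illustration (max_pkt only mirrors the old field name, not TIPC's wire format).

#include <stdio.h>

struct link_proto_msg {
	unsigned int max_pkt;	/* peer's advertised interface MTU */
};

static unsigned int negotiate_mtu(unsigned int own_mtu,
				  const struct link_proto_msg *peer)
{
	/* Both endpoints run the same rule, so they converge on one MTU. */
	return own_mtu < peer->max_pkt ? own_mtu : peer->max_pkt;
}

int main(void)
{
	struct link_proto_msg from_peer = { .max_pkt = 1500 };
	unsigned int mtu = negotiate_mtu(9000, &from_peer);

	printf("negotiated link MTU: %u\n", mtu);	/* prints 1500 */
	return 0;
}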