From 92ef12b32feab8f277b69e9fb89ede2796777f4d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 18:21:58 +0200 Subject: tipc: fix flow control accounting for implicit connect In the case of implicit connect message with data > 1K, the flow control accounting is incorrect. At this state, the socket does not know the peer nodes capability and falls back to legacy flow control by return 1, however the receiver of this message will perform the new block accounting. This leads to a slack and eventually traffic disturbance. In this commit, we perform tipc_node_get_capabilities() at implicit connect and perform accounting based on the peer's capability. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..b6f99b021d09 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1419,8 +1419,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); - if (dlen && (dlen == rc)) + if (dlen && dlen == rc) { + tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); + } return rc; } -- cgit v1.2.3 From afe8792fec69c693060c0db43ed4bf05d92ce45b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 28 Sep 2018 20:23:19 +0200 Subject: tipc: refactor function tipc_sk_timeout() We refactor this function as a preparation for the coming commits in the same series. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 62 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 24 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..8152a8135422 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2545,43 +2545,57 @@ static int tipc_shutdown(struct socket *sock, int how) return res; } +static void tipc_sk_check_probing_state(struct sock *sk, + struct sk_buff_head *list) +{ + struct tipc_sock *tsk = tipc_sk(sk); + u32 pnode = tsk_peer_node(tsk); + u32 pport = tsk_peer_port(tsk); + u32 self = tsk_own_node(tsk); + u32 oport = tsk->portid; + struct sk_buff *skb; + + if (tsk->probe_unacked) { + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNABORTED; + tipc_node_remove_conn(sock_net(sk), pnode, pport); + sk->sk_state_change(sk); + return; + } + /* Prepare new probe */ + skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, + pnode, self, pport, oport, TIPC_OK); + if (skb) + __skb_queue_tail(list, skb); + tsk->probe_unacked = true; + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); +} + static void tipc_sk_timeout(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct tipc_sock *tsk = tipc_sk(sk); - u32 peer_port = tsk_peer_port(tsk); - u32 peer_node = tsk_peer_node(tsk); - u32 own_node = tsk_own_node(tsk); - u32 own_port = tsk->portid; - struct net *net = sock_net(sk); - struct sk_buff *skb = NULL; + u32 pnode = tsk_peer_node(tsk); + struct sk_buff_head list; + skb_queue_head_init(&list); bh_lock_sock(sk); - if (!tipc_sk_connected(sk)) - goto exit; /* Try again later if socket is busy */ if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); - goto exit; + bh_unlock_sock(sk); + return; } - if (tsk->probe_unacked) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(net, peer_node, peer_port); - sk->sk_state_change(sk); - goto exit; - } - /* Send new probe */ - skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, - peer_node, own_node, peer_port, own_port, - TIPC_OK); - tsk->probe_unacked = true; - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); -exit: + if (sk->sk_state == TIPC_ESTABLISHED) + tipc_sk_check_probing_state(sk, &list); + bh_unlock_sock(sk); - if (skb) - tipc_node_xmit_skb(net, skb, peer_node, own_port); + + if (!skb_queue_empty(&list)) + tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid); + sock_put(sk); } -- cgit v1.2.3 From 39fdc9c71f652cdf056ec3341581bde805444b57 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 28 Sep 2018 20:23:20 +0200 Subject: tipc: refactor function tipc_sk_filter_connect() We refactor the function tipc_sk_filter_connect(), both to make it more readable and as a preparation for the next commit. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 101 +++++++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 58 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8152a8135422..094cfdc1e2b6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1959,91 +1959,76 @@ static void tipc_sk_proto_rcv(struct sock *sk, } /** - * tipc_filter_connect - Handle incoming message for a connection-based socket + * tipc_sk_filter_connect - check incoming message for a connection-based socket * @tsk: TIPC socket - * @skb: pointer to message buffer. Set to NULL if buffer is consumed - * - * Returns true if everything ok, false otherwise + * @skb: pointer to message buffer. + * Returns true if message should be added to receive queue, false otherwise */ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *hdr = buf_msg(skb); - u32 pport = msg_origport(hdr); - u32 pnode = msg_orignode(hdr); + bool con_msg = msg_connected(hdr); + u32 pport = tsk_peer_port(tsk); + u32 pnode = tsk_peer_node(tsk); + u32 oport = msg_origport(hdr); + u32 onode = msg_orignode(hdr); + int err = msg_errcode(hdr); if (unlikely(msg_mcast(hdr))) return false; switch (sk->sk_state) { case TIPC_CONNECTING: - /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(hdr))) { - if (pport != tsk_peer_port(tsk) || - pnode != tsk_peer_node(tsk)) - return false; - - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = ECONNREFUSED; - sk->sk_state_change(sk); - return true; - } - - if (unlikely(msg_errcode(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = ECONNREFUSED; - sk->sk_state_change(sk); - return true; - } - - if (unlikely(!msg_isdata(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = EINVAL; - sk->sk_state_change(sk); - return true; + /* Setup ACK */ + if (likely(con_msg)) { + if (err) + break; + tipc_sk_finish_conn(tsk, oport, onode); + msg_set_importance(&tsk->phdr, msg_importance(hdr)); + /* ACK+ message with data is added to receive queue */ + if (msg_data_sz(hdr)) + return true; + /* Empty ACK-, - wake up sleeping connect() and drop */ + sk->sk_data_ready(sk); + msg_set_dest_droppable(hdr, 1); + return false; } + /* Ignore connectionless message if not from listening socket */ + if (oport != pport || onode != pnode) + return false; - tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); - msg_set_importance(&tsk->phdr, msg_importance(hdr)); - - /* If 'ACK+' message, add to socket receive queue */ - if (msg_data_sz(hdr)) - return true; - - /* If empty 'ACK-' message, wake up sleeping connect() */ - sk->sk_data_ready(sk); - - /* 'ACK-' message is neither accepted nor rejected: */ - msg_set_dest_droppable(hdr, 1); - return false; - + /* Rejected SYN - abort */ + break; case TIPC_OPEN: case TIPC_DISCONNECTING: - break; + return false; case TIPC_LISTEN: /* Accept only SYN message */ - if (!msg_connected(hdr) && !(msg_errcode(hdr))) + if (!con_msg && !err) return true; - break; + return false; case TIPC_ESTABLISHED: /* Accept only connection-based messages sent by peer */ - if (unlikely(!tsk_peer_msg(tsk, hdr))) + if (likely(con_msg && !err && pport == oport && pnode == onode)) + return true; + if (!tsk_peer_msg(tsk, hdr)) return false; - - if (unlikely(msg_errcode(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - /* Let timer expire on it's own */ - tipc_node_remove_conn(net, tsk_peer_node(tsk), - tsk->portid); - sk->sk_state_change(sk); - } + if (!err) + return true; + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, pnode, tsk->portid); + sk->sk_state_change(sk); return true; default: pr_err("Unknown sk_state %u\n", sk->sk_state); } - - return false; + /* Abort connection setup attempt */ + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); + return true; } /** -- cgit v1.2.3 From 25b9221b959483f17c2964d0922869e16caa86b5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 28 Sep 2018 20:23:21 +0200 Subject: tipc: add SYN bit to connection setup messages Messages intended for intitating a connection are currently indistinguishable from regular datagram messages. The TIPC protocol specification defines bit 17 in word 0 as a SYN bit to allow sanity check of such messages in the listening socket, but this has so far never been implemented. We do that in this commit. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 094cfdc1e2b6..89d6dc0f456d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1319,6 +1319,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } + msg_set_syn(hdr, 1); } seq = &dest->addr.nameseq; @@ -1478,6 +1479,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, struct net *net = sock_net(sk); struct tipc_msg *msg = &tsk->phdr; + msg_set_syn(msg, 0); msg_set_destnode(msg, peer_node); msg_set_destport(msg, peer_port); msg_set_type(msg, TIPC_CONN_MSG); @@ -2006,6 +2008,9 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return false; case TIPC_LISTEN: /* Accept only SYN message */ + if (!msg_is_syn(hdr) && + tipc_node_get_capabilities(net, onode) & TIPC_SYN_BIT) + return false; if (!con_msg && !err) return true; return false; -- cgit v1.2.3 From 6787927475e52f6933e3affce365dabb2aa2fadf Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Fri, 28 Sep 2018 20:23:22 +0200 Subject: tipc: buffer overflow handling in listener socket Default socket receive buffer size for a listener socket is 2Mb. For each arriving empty SYN, the linux kernel allocates a 768 bytes buffer. This means that a listener socket can serve maximum 2700 simultaneous empty connection setup requests before it hits a receive buffer overflow, and much fewer if the SYN is carrying any significant amount of data. When this happens the setup request is rejected, and the client receives an ECONNREFUSED error. This commit mitigates this problem by letting the client socket try to retransmit the SYN message multiple times when it sees it rejected with the code TIPC_ERR_OVERLOAD. Retransmission is done at random intervals in the range of [100 ms, setup_timeout / 4], as many times as there is room for within the setup timeout limit. Signed-off-by: Tung Nguyen Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 89d6dc0f456d..595c5001b28d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -47,7 +47,7 @@ #include "netlink.h" #include "group.h" -#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff @@ -80,7 +80,6 @@ struct sockaddr_pair { * @publications: list of publications for port * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime - * @probing_state: * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links @@ -102,8 +101,8 @@ struct tipc_sock { struct list_head cong_links; struct list_head publications; u32 pub_count; - uint conn_timeout; atomic_t dupl_rcvcnt; + u16 conn_timeout; bool probe_unacked; u16 cong_link_cnt; u16 snt_unacked; @@ -507,6 +506,9 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); + /* Remove any pending SYN message */ + __skb_queue_purge(&sk->sk_write_queue); + /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). */ @@ -1362,6 +1364,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + return -ENOMEM; rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { @@ -1491,6 +1495,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1977,6 +1982,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) u32 oport = msg_origport(hdr); u32 onode = msg_orignode(hdr); int err = msg_errcode(hdr); + unsigned long delay; if (unlikely(msg_mcast(hdr))) return false; @@ -2001,8 +2007,18 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (oport != pport || onode != pnode) return false; - /* Rejected SYN - abort */ - break; + /* Rejected SYN */ + if (err != TIPC_ERR_OVERLOAD) + break; + + /* Prepare for new setup attempt if we have a SYN clone */ + if (skb_queue_empty(&sk->sk_write_queue)) + break; + get_random_bytes(&delay, 2); + delay %= (tsk->conn_timeout / 4); + delay = msecs_to_jiffies(delay + 100); + sk_reset_timer(sk, &sk->sk_timer, jiffies + delay); + return false; case TIPC_OPEN: case TIPC_DISCONNECTING: return false; @@ -2561,12 +2577,26 @@ static void tipc_sk_check_probing_state(struct sock *sk, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); } +static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list) +{ + struct tipc_sock *tsk = tipc_sk(sk); + + /* Try again later if dest link is congested */ + if (tsk->cong_link_cnt) { + sk_reset_timer(sk, &sk->sk_timer, msecs_to_jiffies(100)); + return; + } + /* Prepare SYN for retransmit */ + tipc_msg_skb_clone(&sk->sk_write_queue, list); +} + static void tipc_sk_timeout(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct tipc_sock *tsk = tipc_sk(sk); u32 pnode = tsk_peer_node(tsk); struct sk_buff_head list; + int rc = 0; skb_queue_head_init(&list); bh_lock_sock(sk); @@ -2580,12 +2610,19 @@ static void tipc_sk_timeout(struct timer_list *t) if (sk->sk_state == TIPC_ESTABLISHED) tipc_sk_check_probing_state(sk, &list); + else if (sk->sk_state == TIPC_CONNECTING) + tipc_sk_retry_connect(sk, &list); bh_unlock_sock(sk); if (!skb_queue_empty(&list)) - tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid); + rc = tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid); + /* SYN messages may cause link congestion */ + if (rc == -ELINKCONG) { + tipc_dest_push(&tsk->cong_links, pnode, 0); + tsk->cong_link_cnt = 1; + } sock_put(sk); } -- cgit v1.2.3 From e7eb05823806502747eadc31039cecfd7836ddeb Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Wed, 10 Oct 2018 17:50:23 +0200 Subject: tipc: queue socket protocol error messages into socket receive buffer In tipc_sk_filter_rcv(), when we detect protocol messages with error we call tipc_sk_conn_proto_rcv() and let it reset the connection and notify the socket by calling sk->sk_state_change(). However, tipc_sk_filter_rcv() may have been called from the function tipc_backlog_rcv(), in which case the socket lock is held and the socket already awake. This means that the sk_state_change() call is ignored and the error notification lost. Now the receive queue will remain empty and the socket sleeps forever. In this commit, we convert the protocol message into a connection abort message and enqueue it into the socket's receive queue. By this addition to the above state change we cover all conditions. Acked-by: Ying Xue Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b6f99b021d09..49810fdff4c5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1196,6 +1196,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, * @skb: pointer to message buffer. */ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); @@ -1213,7 +1214,16 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), tsk_peer_port(tsk)); sk->sk_state_change(sk); - goto exit; + + /* State change is ignored if socket already awake, + * - convert msg to abort msg and add to inqueue + */ + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_CONN_MSG); + msg_set_size(hdr, BASIC_H_SIZE); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); + __skb_queue_tail(inputq, skb); + return; } tsk->probe_unacked = false; @@ -1936,7 +1946,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, switch (msg_user(hdr)) { case CONN_MANAGER: - tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + tipc_sk_conn_proto_rcv(tsk, skb, inputq, xmitq); return; case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); -- cgit v1.2.3 From 89ab066d4229acd32e323f1569833302544a4186 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 23 Oct 2018 13:40:39 +0200 Subject: Revert "net: simplify sock_poll_wait" This reverts commit dd979b4df817e9976f18fb6f9d134d6bc4a3c317. This broke tcp_poll for SMC fallback: An AF_SMC socket establishes an internal TCP socket for the initial handshake with the remote peer. Whenever the SMC connection can not be established this TCP socket is used as a fallback. All socket operations on the SMC socket are then forwarded to the TCP socket. In case of poll, the file->private_data pointer references the SMC socket because the TCP socket has no file assigned. This causes tcp_poll to wait on the wrong socket. Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index de09f514428c..636e6131769d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -717,7 +717,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; -- cgit v1.2.3