From a399a8053164ec8bcb06fed52be9941a26ecde11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Aug 2012 21:13:53 +0000 Subject: time: jiffies_delta_to_clock_t() helper to the rescue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various /proc/net files sometimes report crazy timer values, expressed in clock_t units. This happens when an expired timer delta (expires - jiffies) is passed to jiffies_to_clock_t(). This function has an overflow in : return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); commit cbbc719fccdb8cb (time: Change jiffies_to_clock_t() argument type to unsigned long) only got around the problem. As we cant output negative values in /proc/net/tcp without breaking various tools, I suggest adding a jiffies_delta_to_clock_t() wrapper that caps the negative delta to a 0 value. Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Cc: Thomas Gleixner Cc: Paul Gortmaker Cc: Andrew Morton Cc: hank Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a73092..c660d2c19a2b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2385,7 +2385,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, struct seq_file *f, int i, int uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); - int ttd = req->expires - jiffies; + long delta = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", @@ -2397,7 +2397,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ - jiffies_to_clock_t(ttd), + jiffies_delta_to_clock_t(delta), req->retrans, uid, 0, /* non standard timer */ @@ -2448,7 +2448,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) tp->write_seq - tp->snd_una, rx_queue, timer_active, - jiffies_to_clock_t(timer_expires - jiffies), + jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sk), icsk->icsk_probes_out, @@ -2467,10 +2467,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, { __be32 dest, src; __u16 destp, srcp; - int ttd = tw->tw_ttd - jiffies; - - if (ttd < 0) - ttd = 0; + long delta = tw->tw_ttd - jiffies; dest = tw->tw_daddr; src = tw->tw_rcv_saddr; @@ -2480,7 +2477,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, - 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw, len); } -- cgit v1.2.3 From a7cb5a49bf64ba64864ae16a6be028f8b0d3cc06 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 May 2012 01:10:10 -0600 Subject: userns: Print out socket uids in a user namespace aware fashion. Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Arnaldo Carvalho de Melo Cc: Sridhar Samudrala Acked-by: Vlad Yasevich Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a73092..642be8a4c6a3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2382,7 +2382,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, - struct seq_file *f, int i, int uid, int *len) + struct seq_file *f, int i, kuid_t uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); int ttd = req->expires - jiffies; @@ -2399,7 +2399,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->retrans, - uid, + from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), @@ -2450,7 +2450,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sk), + from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, -- cgit v1.2.3 From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 36f02f954ac1..bb148dee1edd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, rvp); + skb = tcp_make_synack(sk, dst, req, rvp, NULL); if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); @@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, iph->saddr, iph->daddr); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); -- cgit v1.2.3 From 168a8f58059a22feb9e9a2dcc1b8053dbbbc12ef Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:13 +0000 Subject: tcp: TCP Fast Open Server - main code path This patch adds the main processing path to complete the TFO server patches. A TFO request (i.e., SYN+data packet with a TFO cookie option) first gets processed in tcp_v4_conn_request(). If it passes the various TFO checks by tcp_fastopen_check(), a child socket will be created right away to be accepted by applications, rather than waiting for the 3WHS to finish. In additon to the use of TFO cookie, a simple max_qlen based scheme is put in place to fend off spoofed TFO attack. When a valid ACK comes back to tcp_rcv_state_process(), it will cause the state of the child socket to switch from either TCP_SYN_RECV to TCP_ESTABLISHED, or TCP_FIN_WAIT1 to TCP_FIN_WAIT2. At this time retransmission will resume for any unack'ed (data, FIN,...) segments. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 265 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 251 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bb148dee1edd..e64abed249cc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; + struct request_sock *req; __u32 seq; __u32 remaining; int err; @@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); + req = tp->fastopen_rsk; seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { + !between(seq, tp->snd_una, tp->snd_nxt) && + (req == NULL || seq != tcp_rsk(req)->snt_isn)) { + /* For a Fast Open socket, allow seq to be snt_isn. */ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) !icsk->icsk_backoff) break; + /* XXX (TFO) - revisit the following logic for TFO */ + if (sock_owned_by_user(sk)) break; @@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } + /* XXX (TFO) - if it's a TFO socket and has been accepted, rather + * than following the TCP_SYN_RECV case and closing the socket, + * we ignore the ICMP error and keep trying like a fully established + * socket. Is this the right thing to do? + */ + if (req && req->sk == NULL) + goto out; + switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: @@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. - It can f.e. if SYNs crossed. + It can f.e. if SYNs crossed, + or Fast Open. */ if (!sock_owned_by_user(sk)) { sk->sk_err = err; @@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, - tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, req->rcv_wnd, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { }; #endif +static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct tcp_fastopen_cookie *valid_foc) +{ + bool skip_cookie = false; + struct fastopen_queue *fastopenq; + + if (likely(!fastopen_cookie_present(foc))) { + /* See include/net/tcp.h for the meaning of these knobs */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || + ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) + skip_cookie = true; /* no cookie to validate */ + else + return false; + } + fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; + /* A FO option is present; bump the counter. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + + /* Make sure the listener has enabled fastopen, and we don't + * exceed the max # of pending TFO requests allowed before trying + * to validating the cookie in order to avoid burning CPU cycles + * unnecessarily. + * + * XXX (TFO) - The implication of checking the max_qlen before + * processing a cookie request is that clients can't differentiate + * between qlen overflow causing Fast Open to be disabled + * temporarily vs a server not supporting Fast Open at all. + */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || + fastopenq == NULL || fastopenq->max_qlen == 0) + return false; + + if (fastopenq->qlen >= fastopenq->max_qlen) { + struct request_sock *req1; + spin_lock(&fastopenq->lock); + req1 = fastopenq->rskq_rst_head; + if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + spin_unlock(&fastopenq->lock); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); + /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ + foc->len = -1; + return false; + } + fastopenq->rskq_rst_head = req1->dl_next; + fastopenq->qlen--; + spin_unlock(&fastopenq->lock); + reqsk_free(req1); + } + if (skip_cookie) { + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + return true; + } + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { + if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || + memcmp(&foc->val[0], &valid_foc->val[0], + TCP_FASTOPEN_COOKIE_SIZE) != 0) + return false; + valid_foc->len = -1; + } + /* Acknowledge the data received from the peer. */ + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + return true; + } else if (foc->len == 0) { /* Client requesting a cookie */ + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + } else { + /* Client sent a cookie with wrong size. Treat it + * the same as invalid and return a valid one. + */ + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + } + return false; +} + +static int tcp_v4_conn_req_fastopen(struct sock *sk, + struct sk_buff *skb, + struct sk_buff *skb_synack, + struct request_sock *req, + struct request_values *rvp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + const struct inet_request_sock *ireq = inet_rsk(req); + struct sock *child; + + req->retrans = 0; + req->sk = NULL; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + kfree_skb(skb_synack); + return -1; + } + ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); + /* XXX (TFO) - is it ok to ignore error and continue? */ + + spin_lock(&queue->fastopenq->lock); + queue->fastopenq->qlen++; + spin_unlock(&queue->fastopenq->lock); + + /* Initialize the child socket. Have to fix some values to take + * into account the child is a Fast Open socket and is created + * only out of the bits carried in the SYN packet. + */ + tp = tcp_sk(child); + + tp->fastopen_rsk = req; + /* Do a hold on the listner sk so that if the listener is being + * closed, the child that has been accepted can live on and still + * access listen_lock. + */ + sock_hold(sk); + tcp_rsk(req)->listener = sk; + + /* RFC1323: The window in SYN & SYN/ACK segments is never + * scaled. So correct it appropriately. + */ + tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + + /* Activate the retrans timer so that SYNACK can be retransmitted. + * The request socket is not added to the SYN table of the parent + * because it's been added to the accept queue directly. + */ + inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); + + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, child); + + /* Now finish processing the fastopen child socket. */ + inet_csk(child)->icsk_af_ops->rebuild_header(child); + tcp_init_congestion_control(child); + tcp_mtup_init(child); + tcp_init_buffer_space(child); + tcp_init_metrics(child); + + /* Queue the data carried in the SYN packet. We need to first + * bump skb's refcnt because the caller will attempt to free it. + * + * XXX (TFO) - we honor a zero-payload TFO request for now. + * (Any reason not to?) + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { + /* Don't queue the skb if there is no payload in SYN. + * XXX (TFO) - How about SYN+FIN? + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } else { + skb = skb_get(skb); + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + skb_set_owner_r(skb, child); + __skb_queue_tail(&child->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } + sk->sk_data_ready(sk, 0); + bh_unlock_sock(child); + sock_put(child); + WARN_ON(req->sk == NULL); + return 0; +} + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_extend_values tmp_ext; @@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; bool want_cookie = false; + struct flowi4 fl4; + struct tcp_fastopen_cookie foc = { .len = -1 }; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + struct sk_buff *skb_synack; + int do_fastopen; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, + want_cookie ? NULL : &foc); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && @@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { - struct flowi4 fl4; - /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; - if (tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext, - skb_get_queue_mapping(skb), - want_cookie) || - want_cookie) + if (dst == NULL) { + dst = inet_csk_route_req(sk, &fl4, req); + if (dst == NULL) + goto drop_and_free; + } + do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); + + /* We don't call tcp_v4_send_synack() directly because we need + * to make sure a child socket can be created successfully before + * sending back synack! + * + * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() + * (or better yet, call tcp_send_synack() in the child context + * directly, but will have to fix bunch of other code first) + * after syn_recv_sock() except one will need to first fix the + * latter to remove its dependency on the current implementation + * of tcp_v4_send_synack()->tcp_select_initial_window(). + */ + skb_synack = tcp_make_synack(sk, dst, req, + (struct request_values *)&tmp_ext, + fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); + + if (skb_synack) { + __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); + skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); + } else + goto drop_and_free; + + if (likely(!do_fastopen)) { + int err; + err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); + err = net_xmit_eval(err); + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + /* Add the request_sock to the SYN table */ + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (fastopen_cookie_present(&foc) && foc.len != 0) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, + (struct request_values *)&tmp_ext)) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop_and_release: @@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cookie_values_release); tp->cookie_values = NULL; } + BUG_ON(tp->fastopen_rsk != NULL); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); @@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); @@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, + sk->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), len); } -- cgit v1.2.3 From 623df484a777f3c00c1ea3d6a7565b8d8ac688a1 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Sep 2012 04:18:54 +0000 Subject: tcp: extract code to compute SYNACK RTT In preparation for adding another spot where we compute the SYNACK RTT, extract this code so that it can be shared. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e64abed249cc..1e66f7fb4fe6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1733,9 +1733,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - if (tcp_rsk(req)->snt_synack) - tcp_valid_rtt_meas(newsk, - tcp_time_stamp - tcp_rsk(req)->snt_synack); + tcp_synack_rtt_meas(newsk, req); newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 016818d076871c4ee34db1e8d74dc17ac1de626a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Sep 2012 04:18:55 +0000 Subject: tcp: TCP Fast Open Server - take SYNACK RTT after completing 3WHS When taking SYNACK RTT samples for servers using TCP Fast Open, fix the code to ensure that we only call tcp_valid_rtt_meas() after we receive the ACK that completes the 3-way handshake. Previously we were always taking an RTT sample in tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we receive the SYN. So for TFO we must wait until tcp_rcv_state_process() to take the RTT sample. To fix this, we wait until after TFO calls tcp_v4_syn_recv_sock() before we set the snt_synack timestamp, since tcp_synack_rtt_meas() already ensures that we only take a SYNACK RTT sample if snt_synack is non-zero. To be careful, we only take a snt_synack timestamp when a SYNACK transmit or retransmit succeeds. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e66f7fb4fe6..0a7e020f16b5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -868,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); + if (!tcp_rsk(req)->snt_synack && !err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; } return err; @@ -1382,6 +1384,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const struct inet_request_sock *ireq = inet_rsk(req); struct sock *child; + int err; req->retrans = 0; req->sk = NULL; @@ -1393,8 +1396,11 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, kfree_skb(skb_synack); return -1; } - ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, - ireq->rmt_addr, ireq->opt); + err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); + err = net_xmit_eval(err); + if (!err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; /* XXX (TFO) - is it ok to ignore error and continue? */ spin_lock(&queue->fastopenq->lock); @@ -1612,7 +1618,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; if (dst == NULL) { dst = inet_csk_route_req(sk, &fl4, req); @@ -1650,6 +1655,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (err || want_cookie) goto drop_and_free; + tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -- cgit v1.2.3 From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0a7e020f16b5..93406c583f43 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2200,14 +2200,6 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - /* - * If sendmsg cached page exists, toss it. - */ - if (sk->sk_sndmsg_page) { - __free_page(sk->sk_sndmsg_page); - sk->sk_sndmsg_page = NULL; - } - /* TCP Cookie Transactions */ if (tp->cookie_values != NULL) { kref_put(&tp->cookie_values->kref, -- cgit v1.2.3 From 5dff747b7038d10f9c174a1245263fd1c36a644d Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 26 Sep 2012 11:59:09 +0000 Subject: tcp: Remove unused parameter from tcp_v4_save_options struct sock *sk is not used inside tcp_v4_save_options. Thus it can be removed. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93406c583f43..385eb79cf6aa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -925,8 +925,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. */ -static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { const struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options_rcu *dopt = NULL; @@ -1568,7 +1567,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->loc_addr = daddr; ireq->rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(sk, skb); + ireq->opt = tcp_v4_save_options(skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; -- cgit v1.2.3 From 861b650101eb0c627d171eb18de81dddb93d395e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2012 02:14:53 +0000 Subject: tcp: gro: add checksuming helpers skb with CHECKSUM_NONE cant currently be handled by GRO, and we notice this deep in GRO stack in tcp[46]_gro_receive() But there are cases where GRO can be a benefit, even with a lack of checksums. This preliminary work is needed to add GRO support to tunnels. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 385eb79cf6aa..75735c9a6a9d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2803,6 +2803,8 @@ void tcp4_proc_exit(void) struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { const struct iphdr *iph = skb_gro_network_header(skb); + __wsum wsum; + __sum16 sum; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -2811,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - - /* fall through */ - case CHECKSUM_NONE: +flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; + + case CHECKSUM_NONE: + wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), IPPROTO_TCP, 0); + sum = csum_fold(skb_checksum(skb, + skb_gro_offset(skb), + skb_gro_len(skb), + wsum)); + if (sum) + goto flush; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; } return tcp_gro_receive(head, skb); -- cgit v1.2.3