summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorAjay Nandakumar <anandakumarm@nvidia.com>2013-12-16 19:11:53 +0530
committerAjay Nandakumar <anandakumarm@nvidia.com>2013-12-16 19:11:53 +0530
commit4186e42658f8f3e95fa4f87f917872e8a3431057 (patch)
tree88d96f1893b395c38c246e79e75cbcf088678441 /net/ipv4
parent6bbaea6ef1c82c1d4a94b655ecf2e2b08164545f (diff)
parent05bcf8f867f4af11c93395d4a6dd1dd52d8904ea (diff)
Merge tag 'v3.10.24' into HEAD
This is the 3.10.24 stable release Change-Id: Ibd2734f93d44385ab86867272a1359158635133b
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/ip_tunnel.c4
-rw-r--r--net/ipv4/ip_vti.c15
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/ping.c19
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c10
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
-rw-r--r--net/ipv4/tcp.c39
-rw-r--r--net/ipv4/tcp_input.c41
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_metrics.c5
-rw-r--r--net/ipv4/tcp_output.c33
-rw-r--r--net/ipv4/udp.c17
18 files changed, 151 insertions, 63 deletions
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b28e863fe0a7..19e36376d2a0 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -57,7 +57,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
goto out;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6af375afeeef..c95848d00039 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -287,7 +287,7 @@ begintw:
if (unlikely(!INET_TW_MATCH(sk, net, acookie,
saddr, daddr, ports,
dif))) {
- sock_put(sk);
+ inet_twsk_put(inet_twsk(sk));
goto begintw;
}
goto out;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c52fee0976da..64e4e98c8786 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -335,7 +335,7 @@ static int ipgre_rcv(struct sk_buff *skb)
iph->saddr, iph->daddr, tpi.key);
if (tunnel) {
- ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ ip_tunnel_rcv(tunnel, skb, &tpi, hdr_len, log_ecn_error);
return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ec2d430a6a55..6ca5873d6175 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -844,7 +844,7 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ if (((length > mtu) || (skb && skb_has_frags(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d9c4f113d709..23e6ab0a2dc0 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -368,7 +368,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
/*
* Handle MSG_ERRQUEUE
*/
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct sock_exterr_skb *serr;
struct sk_buff *skb, *skb2;
@@ -405,6 +405,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
serr->addr_offset);
sin->sin_port = serr->port;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 92d2f0f5d7bf..46dcf32c012e 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -402,7 +402,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
}
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
- const struct tnl_ptk_info *tpi, bool log_ecn_error)
+ const struct tnl_ptk_info *tpi, int hdr_len, bool log_ecn_error)
{
struct pcpu_tstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
@@ -413,7 +413,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->protocol = tpi->proto;
skb->mac_header = skb->network_header;
- __pskb_pull(skb, tunnel->hlen);
+ __pskb_pull(skb, hdr_len);
skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ffa8c0d..feb19db62359 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -285,8 +285,17 @@ static int vti_rcv(struct sk_buff *skb)
tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
if (tunnel != NULL) {
struct pcpu_tstats *tstats;
+ u32 oldmark = skb->mark;
+ int ret;
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+
+ /* temporarily mark the skb with the tunnel o_key, to
+ * only match policies with this mark.
+ */
+ skb->mark = be32_to_cpu(tunnel->parms.o_key);
+ ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
+ skb->mark = oldmark;
+ if (!ret)
return -1;
tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -295,7 +304,6 @@ static int vti_rcv(struct sk_buff *skb)
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
- skb->mark = 0;
secpath_reset(skb);
skb->dev = tunnel->dev;
return 1;
@@ -327,7 +335,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
memset(&fl4, 0, sizeof(fl4));
flowi4_init_output(&fl4, tunnel->parms.link,
- be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
+ be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
RT_SCOPE_UNIVERSE,
IPPROTO_IPIP, 0,
dst, tiph->saddr, 0, 0);
@@ -342,6 +350,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (!rt->dst.xfrm ||
rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
dev->stats.tx_carrier_errors++;
+ ip_rt_put(rt);
goto tx_error_icmp;
}
tdev = rt->dst.dev;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7cfc45624b6d..f5cc7b331511 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)
if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, 0, log_ecn_error);
}
return -1;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 3d6cf7f5d54d..0b26c09e5af7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -775,7 +775,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -853,7 +853,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_ERRQUEUE) {
if (family == AF_INET) {
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
return pingv6_ops.ipv6_recv_error(sk, msg, len);
@@ -881,10 +881,17 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Copy the address and add cmsg data. */
if (family == AF_INET) {
sin = (struct sockaddr_in *) msg->msg_name;
- sin->sin_family = AF_INET;
- sin->sin_port = 0 /* skb->h.uh->source */;
- sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ /* Copy the address. */
+ if (msg->msg_name) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
if (isk->cmsg_flags)
ip_cmsg_recv(msg, skb);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6fb233772f79..402870fdfa0e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -692,11 +692,8 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE) {
- err = ip_recv_error(sk, msg, len);
+ err = ip_recv_error(sk, msg, len, addr_len);
goto out;
}
@@ -722,6 +719,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d35bbf0cf404..f6c6ab14da41 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1720,8 +1720,12 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
- if (do_cache)
- rt_cache_route(&FIB_RES_NH(res), rth);
+ if (do_cache) {
+ if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ rth->dst.flags |= DST_NOCACHE;
+ rt_add_uncached_list(rth);
+ }
+ }
skb_dst_set(skb, &rth->dst);
err = 0;
goto out;
@@ -2020,7 +2024,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
RT_SCOPE_LINK);
goto make_route;
}
- if (fl4->saddr) {
+ if (!fl4->saddr) {
if (ipv4_is_multicast(fl4->daddr))
fl4->saddr = inet_select_addr(dev_out, 0,
fl4->flowi4_scope);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f25e75ae692..90b26beb84d4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -753,6 +754,15 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &four,
},
{
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a5b4f535a519..78411dad59ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -286,6 +286,8 @@
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -790,14 +792,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = mss_now;
if (large_allowed && sk_can_gso(sk)) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
+ u32 gso_size, hlen;
+
+ /* Maybe we should/could use sk->sk_prot->max_header here ? */
+ hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+ inet_csk(sk)->icsk_ext_hdr_len +
+ tp->tcp_header_len;
- /* TSQ : try to have two TSO segments in flight */
- xmit_size_goal = min_t(u32, xmit_size_goal,
- sysctl_tcp_limit_output_bytes >> 1);
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+ gso_size = max_t(u32, gso_size,
+ sysctl_tcp_min_tso_segs * mss_now);
+
+ xmit_size_goal = min_t(u32, gso_size,
+ sk->sk_gso_max_size - 1 - hlen);
xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
@@ -2900,6 +2912,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
@@ -2983,13 +2996,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
- /* {tcp|sock}_wfree() use exact truesize accounting :
- * sum(skb->truesize) MUST be exactly be gso_skb->truesize
- * So we account mss bytes of 'true size' for each segment.
- * The last segment will contain the remaining.
- */
- skb->truesize = mss;
- gso_skb->truesize -= mss;
+ sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
@@ -3006,7 +3013,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
- swap(gso_skb->truesize, skb->truesize);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb->tail - skb->transport_header) +
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4b75aad14b04..e15d330919af 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -699,6 +699,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
}
}
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+ * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+ * We probably need usec resolution in the future.
+ * Note: This also takes care of possible srtt=0 case,
+ * when tcp_rtt_estimator() was not yet called.
+ */
+ if (tp->srtt > 8 + 2)
+ do_div(rate, tp->srtt);
+
+ sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
@@ -1264,7 +1292,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
- TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
@@ -3314,7 +3345,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_init_cwnd_reduction(sk, true);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
- tcp_set_ca_state(sk, TCP_CA_Open);
+ tcp_try_keep_open(sk);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSSPROBERECOVERY);
}
@@ -3330,7 +3361,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_in_flight;
+ u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_fackets;
int prior_packets = tp->packets_out;
int prior_sacked = tp->sacked_out;
@@ -3438,6 +3469,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
+ if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+ tcp_update_pacing_rate(sk);
return 1;
no_queue:
@@ -5736,6 +5769,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
} else
tcp_init_metrics(sk);
+ tcp_update_pacing_rate(sk);
+
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1fb157f0957d..bc172b836ed6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -178,7 +178,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..306dbd9a9441 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -665,10 +665,13 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
+ struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_metrics_block *tm;
+ if (!dst)
+ return;
rcu_read_lock();
- tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+ tm = tcp_get_metrics(sk, dst, true);
if (tm) {
struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0145ce7e6098..5560abfe6d30 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -887,8 +887,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
- tcp_wfree : sock_wfree;
+ skb->destructor = tcp_wfree;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
@@ -977,6 +976,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
if (skb->len <= mss_now || !sk_can_gso(sk) ||
skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
@@ -1058,9 +1060,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
/* Get a new skb... force flag on. */
@@ -1623,7 +1623,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- sk->sk_gso_max_segs * tp->mss_cache))
+ tp->xmit_size_goal_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
@@ -1832,7 +1832,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
-
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
@@ -1861,13 +1860,24 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- /* TSQ : sk_wmem_alloc accounts skb truesize,
- * including skb overhead. But thats OK.
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
*/
- if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+ limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
+ sk->sk_pacing_rate >> 10);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
break;
}
+
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
@@ -2329,6 +2339,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int oldpcount = tcp_skb_pcount(skb);
if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
tcp_init_tso_segs(sk, skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
@@ -3090,7 +3102,6 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
tcp_xmit_probe_skb(sk, 0);
}
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6c9ab80efb52..5ed9a7a09f04 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -973,7 +973,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -1072,6 +1072,12 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
struct udp_sock *up = udp_sk(sk);
int ret;
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
if (!up->pending) {
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
@@ -1209,14 +1215,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int is_udplite = IS_UDPLITE(sk);
bool slow;
- /*
- * Check any passed addresses
- */
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
try_again:
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
@@ -1276,6 +1276,7 @@ try_again:
sin->sin_port = udp_hdr(skb)->source;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);