From 5d4c9bfbabdb1d497f21afd81501e5c54b0c85d9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 21:03:33 -0800
Subject: tcp: fix potential huge kmalloc() calls in TCP_REPAIR

tcp_send_rcvq() is used for re-injecting data into tcp receive queue.

Problems :

- No check against size is performed, allowed user to fool kernel in
  attempting very large memory allocations, eventually triggering
  OOM when memory is fragmented.

- In case of fault during the copy we do not return correct errno.

Lets use alloc_skb_with_frags() to cook optimal skbs.

Fixes: 292e8d8c8538 ("tcp: Move rcvq sending to tcp_input.c")
Fixes: c0e88ff0f256 ("tcp: Repair socket queues")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fdd88c3803a6..a4a0b6b3bcf2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4481,19 +4481,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct sk_buff *skb;
+	int err = -ENOMEM;
+	int data_len = 0;
 	bool fragstolen;
 
 	if (size == 0)
 		return 0;
 
-	skb = alloc_skb(size, sk->sk_allocation);
+	if (size > PAGE_SIZE) {
+		int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+		data_len = npages << PAGE_SHIFT;
+		size = data_len + (size & ~PAGE_MASK);
+	}
+	skb = alloc_skb_with_frags(size - data_len, data_len,
+				   PAGE_ALLOC_COSTLY_ORDER,
+				   &err, sk->sk_allocation);
 	if (!skb)
 		goto err;
 
+	skb_put(skb, size - data_len);
+	skb->data_len = data_len;
+	skb->len = size;
+
 	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	if (memcpy_from_msg(skb_put(skb, size), msg, size))
+	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+	if (err)
 		goto err_free;
 
 	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4509,7 +4524,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 err_free:
 	kfree_skb(skb);
 err:
-	return -ENOMEM;
+	return err;
+
 }
 
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3


From 142a2e7ece8d8ac0e818eb2c91f99ca894730e2a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Nov 2015 08:18:14 -0800
Subject: tcp: initialize tp->copied_seq in case of cross SYN connection

Dmitry provided a syzkaller (http://github.com/google/syzkaller)
generated program that triggers the WARNING at
net/ipv4/tcp.c:1729 in tcp_recvmsg() :

WARN_ON(tp->copied_seq != tp->rcv_nxt &&
        !(flags & (MSG_PEEK | MSG_TRUNC)));

His program is specifically attempting a Cross SYN TCP exchange,
that we support (for the pleasure of hackers ?), but it looks we
lack proper tcp->copied_seq initialization.

Thanks again Dmitry for your report and testings.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a4a0b6b3bcf2..2d656eef7f8e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5683,6 +5683,7 @@ discard:
 		}
 
 		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+		tp->copied_seq = tp->rcv_nxt;
 		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
 
 		/* RFC1323: The window in SYN & SYN/ACK segments is
-- 
cgit v1.2.3


From 6dd9a14e92e54895e143f10fef4d0b9abe109aa9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 16 Dec 2015 13:20:44 -0800
Subject: net: Allow accepted sockets to be bound to l3mdev domain

Allow accepted sockets to derive their sk_bound_dev_if setting from the
l3mdev domain in which the packets originated. A sysctl setting is added
to control the behavior which is similar to sk_mark and
sysctl_tcp_fwmark_accept.

This effectively allow a process to have a "VRF-global" listen socket,
with child sockets bound to the VRF device in which the packet originated.
A similar behavior can be achieved using sk_mark, but a solution using marks
is incomplete as it does not handle duplicate addresses in different L3
domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev
domain provides a complete solution.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d656eef7f8e..7b1fddc47019 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
-	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
 	af_ops->init_req(req, sk, skb);
 
-- 
cgit v1.2.3


From 8b8a321ff72c785ed5e8b4cf6eda20b35d427390 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 6 Jan 2016 12:42:38 -0800
Subject: tcp: fix zero cwnd in tcp_cwnd_reduction

Patch 3759824da87b ("tcp: PRR uses CRB mode by default and SS mode
conditionally") introduced a bug that cwnd may become 0 when both
inflight and sndcnt are 0 (cwnd = inflight + sndcnt). This may lead
to a div-by-zero if the connection starts another cwnd reduction
phase by setting tp->prior_cwnd to the current cwnd (0) in
tcp_init_cwnd_reduction().

To prevent this we skip PRR operation when nothing is acked or
sacked. Then cwnd must be positive in all cases as long as ssthresh
is positive:

1) The proportional reduction mode
   inflight > ssthresh > 0

2) The reduction bound mode
  a) inflight == ssthresh > 0

  b) inflight < ssthresh
     sndcnt > 0 since newly_acked_sacked > 0 and inflight < ssthresh

Therefore in all cases inflight and sndcnt can not both be 0.
We check invalid tp->prior_cwnd to avoid potential div0 bugs.

In reality this bug is triggered only with a sequence of less common
events.  For example, the connection is terminating an ECN-triggered
cwnd reduction with an inflight 0, then it receives reordered/old
ACKs or DSACKs from prior transmission (which acks nothing). Or the
connection is in fast recovery stage that marks everything lost,
but fails to retransmit due to local issues, then receives data
packets from other end which acks nothing.

Fixes: 3759824da87b ("tcp: PRR uses CRB mode by default and SS mode conditionally")
Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d656eef7f8e..d4c51158470f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2478,6 +2478,9 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
 	int newly_acked_sacked = prior_unsacked -
 				 (tp->packets_out - tp->sacked_out);
 
+	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+		return;
+
 	tp->prr_delivered += newly_acked_sacked;
 	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
-- 
cgit v1.2.3