linux_rockchip_%.bbappend: Add NFLX-2019-001 series of patches #20

Open · wants to merge 1 commit into base: master
@@ -0,0 +1,155 @@
Date: Sat, 8 Jun 2019 10:38:05 -0700
Subject: [PATCH net 1/4] tcp: limit payload size of sacked skbs
From: Eric Dumazet <[email protected]>

Jonathan Looney reported that TCP can trigger the following crash
in tcp_shifted_skb() :

BUG_ON(tcp_skb_pcount(skb) < pcount);

This can happen if the remote peer has advertised the smallest
MSS that linux TCP accepts: 48.

An skb can hold 17 fragments, and each fragment can hold 32KB
on x86, or 64KB on PowerPC.

This means that the 16-bit width of TCP_SKB_CB(skb)->tcp_gso_segs
can overflow.

Note that tcp_sendmsg() builds skbs with less than 64KB
of payload, so this problem needs SACK to be enabled.
SACK blocks allow TCP to coalesce multiple skbs in the retransmit
queue, thus filling the 17 fragments to maximal capacity.

Fixes: 832d11c5cd07 ("tcp: Try to restore large SKBs while SACK processing")
Signed-off-by: Eric Dumazet <[email protected]>
Reported-by: Jonathan Looney <[email protected]>
Acked-by: Neal Cardwell <[email protected]>
Reviewed-by: Tyler Hicks <[email protected]>
Cc: Yuchung Cheng <[email protected]>
Cc: Bruce Curtis <[email protected]>
Cc: Jonathan Lemon <[email protected]>

Upstream-Status: Inappropriate [not author]
Signed-off-by: Vicentiu Galanopulo <[email protected]>
---
include/linux/tcp.h | 3 +++
include/net/tcp.h | 2 ++
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_input.c | 26 ++++++++++++++++++++------
net/ipv4/tcp_output.c | 4 ++--
5 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361..cfbe3c4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -410,4 +410,7 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
tp->saved_syn = NULL;
}

+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
+ int shiftlen);
+
#endif /* _LINUX_TCP_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4ea3739..03c6f68 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -54,6 +54,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);

#define MAX_TCP_HEADER (128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
+#define TCP_MIN_SND_MSS 48
+#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)

/*
* Never offer a window over 32767 without using window scaling. Some
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dd2a41b..367dc51 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3170,6 +3170,7 @@ void __init tcp_init(void)
int max_rshare, max_wshare, cnt;
unsigned int i;

+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));

percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 35e97ff..467d414 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1267,7 +1267,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(skb)->seq += shifted;

tcp_skb_pcount_add(prev, pcount);
- BUG_ON(tcp_skb_pcount(skb) < pcount);
+ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);

/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1329,6 +1329,21 @@ static int skb_can_shift(const struct sk_buff *skb)
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}

+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+ int pcount, int shiftlen)
+{
+ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+ * to make sure not storing more than 65535 * 8 bytes per skb,
+ * even if current MSS is bigger.
+ */
+ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+ return 0;
+ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+ return 0;
+ return skb_shift(to, from, shiftlen);
+}
+
/* Try collapsing SACK blocks spanning across multiple skbs to a single
* skb.
*/
@@ -1434,7 +1449,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
goto fallback;

- if (!skb_shift(prev, skb, len))
+ if (!tcp_skb_shift(prev, skb, pcount, len))
goto fallback;
if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
goto out;
@@ -1453,10 +1468,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto out;

len = skb->len;
- if (skb_shift(prev, skb, len)) {
- pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
- }
+ pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, pcount, len))
+ tcp_shifted_skb(sk, skb, state, pcount, len, mss, 0);

out:
state->fack_count += pcount;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3e52a48..34042e0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1337,8 +1337,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
mss_now -= icsk->icsk_ext_hdr_len;

/* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
+ if (mss_now < TCP_MIN_SND_MSS)
+ mss_now = TCP_MIN_SND_MSS;
return mss_now;
}

--
2.7.4
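
To see why the 16-bit gso_segs field overflows here, it helps to run the arithmetic from the commit message. Below is a minimal standalone sketch, ordinary userspace C rather than kernel code, using the same constants this patch introduces; the fragment count and size are the x86 figures quoted above.

/* Standalone sketch: why tcp_gso_segs (a u16) can overflow when the
 * peer advertises the smallest MSS Linux accepts (48 bytes). */
#include <stdio.h>
#include <stdint.h>

#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS      48
#define TCP_MIN_GSO_SIZE     (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* 8 bytes */

int main(void)
{
	/* An skb holds up to 17 fragments of 32KB each on x86. */
	unsigned long max_payload = 17UL * 32 * 1024;        /* 557056 bytes   */
	unsigned long segs = max_payload / TCP_MIN_GSO_SIZE; /* 69632 segments */

	printf("segments: %lu, u16 max: %u\n", segs, (unsigned)UINT16_MAX);
	return 0;
}

69632 exceeds 65535, which is exactly the pair of bounds tcp_skb_shift() enforces above: it refuses a shift once to->len would reach 65535 * TCP_MIN_GSO_SIZE bytes or the combined pcount would exceed 65535.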

@@ -0,0 +1,62 @@
From cd4ffa93f16efea290bb70537f98f518e1927e63 Mon Sep 17 00:00:00 2001
From: Joao Martins <[email protected]>
Date: Mon, 10 Jun 2019 23:12:39 +0100
Subject: [PATCH 5/5] tcp: fix fack_count accounting on tcp_shift_skb_data()

v4.15, since commit 737ff314563 ("tcp: use sequence distance to
detect reordering"), switched from packet-based FACK tracking to
sequence-based tracking.

v4.14 and older still have the old logic, hence tcp_shift_skb_data()
needs to retain its original behaviour and keep @fack_count in sync.
In other words, we keep incrementing pcount by tcp_skb_pcount(skb)
and later use it to update fack_count. To make this more explicit we
track the segment count of the newly shifted skb in @next_pcount,
which also lets us avoid repeated calls to tcp_skb_pcount(skb).

Fixes: a5f1faa40101 ("tcp: limit payload size of sacked skbs")
Reported-by: Alexey Kodanev <[email protected]>
Reviewed-by: Jack Vogel <[email protected]>
Reviewed-by: John Haxby <[email protected]>
Reviewed-by: Rao Shoaib <[email protected]>
Signed-off-by: Joao Martins <[email protected]>
Signed-off-by: Konrad Rzeszutek Wilk <[email protected]>

Upstream-Status: Inappropriate [not author]
Signed-off-by: Vicentiu Galanopulo <[email protected]>
---
net/ipv4/tcp_input.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c092c7c..6c7190c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1422,6 +1422,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
int mss;
+ int next_pcount;
int pcount = 0;
int len;
int in_sack;
@@ -1538,10 +1539,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto out;

len = skb->len;
- pcount = tcp_skb_pcount(skb);
- if (tcp_skb_shift(prev, skb, pcount, len))
- tcp_shifted_skb(sk, skb, state, pcount, len, mss, 0);
-
+ next_pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, next_pcount, len)) {
+ pcount += next_pcount;
+ tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
+ }
out:
state->fack_count += pcount;
return prev;
--
2.7.4
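
The behavioural difference is easiest to see with concrete numbers. Here is a toy model of the bookkeeping before and after this fix, in plain C with made-up segment counts, not kernel code:

/* Toy model of the fack_count bookkeeping. In the buggy backport,
 * pcount was overwritten before the trailing shift, discarding the
 * segments already accumulated in the main shifting loop. */
#include <stdio.h>

int main(void)
{
	int pcount = 5;      /* segments shifted in the main loop (assumed) */
	int next_pcount = 3; /* segments in the trailing skb (assumed)      */

	int buggy = next_pcount;          /* pcount = tcp_skb_pcount(skb); */
	int fixed = pcount + next_pcount; /* pcount += next_pcount;        */

	printf("state->fack_count advances by: buggy=%d fixed=%d\n",
	       buggy, fixed);
	return 0;
}

With the overwrite, state->fack_count advances by only the trailing skb's segments; with the fix it advances by all shifted segments, matching the original v4.14 accounting.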

@@ -0,0 +1,77 @@
Date: Sat, 8 Jun 2019 10:38:06 -0700
Subject: [PATCH net 2/4] tcp: tcp_fragment() should apply sane memory limits
From: Eric Dumazet <[email protected]>

Jonathan Looney reported that a malicious peer can force a sender
to fragment its retransmit queue into tiny skbs, inflating memory
usage and/or overflowing 32-bit counters.

TCP allows an application to queue up to sk_sndbuf bytes,
so we need to give some allowance for non-malicious splitting
of the retransmit queue.

A new SNMP counter is added to monitor how many times TCP
did not allow an skb to be split because the allowance was exceeded.

Note that this counter might increase when applications use
the SO_SNDBUF socket option to lower sk_sndbuf.

Signed-off-by: Eric Dumazet <[email protected]>
Reported-by: Jonathan Looney <[email protected]>
Acked-by: Neal Cardwell <[email protected]>
Acked-by: Yuchung Cheng <[email protected]>
Reviewed-by: Tyler Hicks <[email protected]>
Cc: Bruce Curtis <[email protected]>
Cc: Jonathan Lemon <[email protected]>

Upstream-Status: Inappropriate [not author]
Signed-off-by: Vicentiu Galanopulo <[email protected]>
---
include/uapi/linux/snmp.h | 1 +
net/ipv4/proc.c | 1 +
net/ipv4/tcp_output.c | 5 +++++
3 files changed, 7 insertions(+)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index f5d753e..bf31965 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -278,6 +278,7 @@ enum
LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */
LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */
LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
+ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */
__LINUX_MIB_MAX
};

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3fbf688..88aaf14 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -299,6 +299,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
SNMP_MIB_SENTINEL
};

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index def09d1..36d1945 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1274,6 +1274,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;

+ if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+ return -ENOMEM;
+ }
+
if (skb_unclone(skb, gfp))
return -ENOMEM;

--
2.7.4
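
The new guard refuses to fragment once the write queue holds more than twice sk_sndbuf bytes. A standalone sketch of that check follows, in plain C rather than kernel code; the 212992-byte sndbuf is just a common Linux default picked for illustration:

/* Sketch of the allowance check added to tcp_fragment() above:
 * fragmenting is refused once queued bytes exceed twice the send
 * buffer limit, and the TCPWqueueTooBig counter is bumped. */
#include <stdbool.h>
#include <stdio.h>

static bool fragment_allowed(long wmem_queued, long sndbuf)
{
	/* Mirrors: if ((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf) ... */
	return (wmem_queued >> 1) <= sndbuf;
}

int main(void)
{
	long sndbuf = 212992; /* illustrative default SO_SNDBUF */

	printf("queued=300000 -> %s\n",
	       fragment_allowed(300000, sndbuf) ? "split allowed" : "ENOMEM");
	printf("queued=500000 -> %s\n",
	       fragment_allowed(500000, sndbuf) ? "split allowed" : "ENOMEM");
	return 0;
}

A legitimate application that shrinks its buffer via SO_SNDBUF after queuing data can trip this too, which is why the commit message notes the counter may increase for non-malicious traffic.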
