From d964ff491ba6a02713b8aa4002c143e5aa28ef26 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Mon, 21 Jan 2019 00:59:55 +0300 Subject: [PATCH] Fix #984: 1. accurately fix skb->truesize and TCP write memory in kernel by tcp_skb_unclone(); 2. __split_pgfrag_del() if we just move pointers, then we do not free TCP write memory, so do not change skb->truesize. 3. ss_skb_unroll(): truesize and data_len/len are completely different counters, so do not mix them in ss_skb_adjust_data_len(). By the way, during the tests I saw crazy skb overheads - truesize can be larger than len in tens kilobytes. The explanation for such overheads is various fragments stoling (e.g. our __split_pgfrag_del) and cloning. 4. cleanup: move ss_skb coalescing functions closer to their calls. --- linux-4.14.32.patch | 83 +++++++++++++++++++++++++--------- tempesta_fw/sock.c | 13 +++--- tempesta_fw/ss_skb.c | 105 ++++++++++++++++++++++--------------------- 3 files changed, 123 insertions(+), 78 deletions(-) diff --git a/linux-4.14.32.patch b/linux-4.14.32.patch index c8864d67b1..75c6f6eec8 100644 --- a/linux-4.14.32.patch +++ b/linux-4.14.32.patch @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644 void tcp_twsk_destructor(struct sock *sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 83d11cd2..b676c6a5 100644 +index 83d11cd2..14918c14 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,9 @@ @@ -1862,7 +1862,58 @@ index 83d11cd2..b676c6a5 100644 /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) -@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk) +@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) + TCP_SKB_CB(skb)->eor = 0; + } + ++/** ++ * Tempesta uses page fragments for all skb allocations, so if an skb was ++ * allocated in standard Linux way, then pskb_expand_head( , 0, 0, ) may ++ * return larger skb and we have to adjust skb->truesize and memory accounting ++ * for TCP write queue. ++ */ ++static int ++tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri) ++{ ++ int r, delta_truesize = skb->truesize; ++ ++ if ((r = skb_unclone(skb, pri))) ++ return r; ++ ++ delta_truesize -= skb->truesize; ++ sk->sk_wmem_queued -= delta_truesize; ++ if (delta_truesize > 0) { ++ sk_mem_uncharge(sk, delta_truesize); ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); ++ } else { ++ sk_mem_charge(sk, -delta_truesize); ++ } ++ ++ return 0; ++} ++ + /* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. +@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + if (nsize < 0) + nsize = 0; + +- if (skb_unclone(skb, gfp)) ++ if (tcp_skb_unclone(sk, skb, gfp)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ +@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) + { + u32 delta_truesize; + +- if (skb_unclone(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) + return -ENOMEM; + + delta_truesize = __pskb_trim_head(skb, len); +@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } @@ -1870,7 +1921,7 @@ index 83d11cd2..b676c6a5 100644 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, -@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); @@ -1892,7 +1943,7 @@ index 83d11cd2..b676c6a5 100644 if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2336,7 +2355,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2336,7 +2381,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -1919,12 +1970,15 @@ index 83d11cd2..b676c6a5 100644 + tcp_reset(sk); + break; + } ++ /* We must not break TSO. */ ++ WARN_ON_ONCE(tcp_skb_pcount(skb) ++ != DIV_ROUND_UP(skb->len, mss_now)); + } +#endif if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2518,6 +2560,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2518,6 +2589,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1932,27 +1986,16 @@ index 83d11cd2..b676c6a5 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -2839,9 +2882,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -2839,7 +2911,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { -+ int delta_truesize = skb->truesize; -+ - if (skb_unclone(skb, GFP_ATOMIC)) +- if (skb_unclone(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) return -ENOMEM; -+ delta_truesize -= skb->truesize; -+ sk->sk_wmem_queued -= delta_truesize; -+ if (delta_truesize > 0) { -+ sk_mem_uncharge(sk, delta_truesize); -+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -+ } else { -+ sk_mem_charge(sk, -delta_truesize); -+ } diff = tcp_skb_pcount(skb); - tcp_set_skb_tso_segs(skb, cur_mss); - diff -= tcp_skb_pcount(skb); -@@ -3129,6 +3182,7 @@ int tcp_send_synack(struct sock *sk) +@@ -3129,6 +3201,7 @@ int tcp_send_synack(struct sock *sk) } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c index 1a4e0427e2..9e3ad5581e 100644 --- a/tempesta_fw/sock.c +++ b/tempesta_fw/sock.c @@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) /* Propagate mark of message head skb.*/ skb->mark = mark; - TFW_DBG3("[%d]: %s: entail skb=%pK data_len=%u len=%u mark=%u" - " tls_type=%x\n", smp_processor_id(), __func__, - skb, skb->data_len, skb->len, skb->mark, + TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" + " truesize=%u mark=%u tls_type=%x\n", + smp_processor_id(), __func__, sk, + skb, skb->data_len, skb->len, skb->truesize, skb->mark, tempesta_tls_skb_type(skb)); skb_entail(sk, skb); @@ -449,11 +450,7 @@ ss_send(struct sock *sk, struct sk_buff **skb_head, int flags) * or copy them if they're going to be used by Tempesta during * and after the transmission. */ - /* - * FIXME #984 the `true ||` statement at the below fixes the issue - * (at least basic tests are passed now). - */ - if (/*true ||*/ flags & SS_F_KEEP_SKB) { + if (flags & SS_F_KEEP_SKB) { skb = *skb_head; do { /* tcp_transmit_skb() will clone the skb. */ diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c index 2756939b53..f94aac5209 100644 --- a/tempesta_fw/ss_skb.c +++ b/tempesta_fw/ss_skb.c @@ -634,7 +634,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off if (likely(!off)) { frag->page_offset += len; skb_frag_size_sub(frag, len); - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; it->ptr = skb_frag_address(frag); it->skb = skb; return 0; @@ -642,7 +643,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off /* Fast path (e.g. TLS tag): delete the tail part of a fragment. */ if (likely(off + len == skb_frag_size(frag))) { skb_frag_size_sub(frag, len); - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; __it_next_data(skb, i + 1, it); return 0; } @@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off ss_skb_adjust_data_len(skb, -tail_len); ss_skb_adjust_data_len(skb_dst, tail_len); } - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; /* Get the SKB and the address for data after the deleted data. */ it->ptr = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]); @@ -1138,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len) return buff; } -static inline int -__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag, - const struct sk_buff *orig_skb) -{ - struct sk_buff *skb = ss_skb_peek_tail(skb_head); - - if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { - skb = ss_skb_alloc(0); - if (!skb) - return -ENOMEM; - ss_skb_queue_tail(skb_head, skb); - skb->mark = orig_skb->mark; - } - - skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag; - ss_skb_adjust_data_len(skb, frag->size); - __skb_frag_ref(frag); - - return 0; -} - -static int -ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) -{ - int i; - skb_frag_t head_frag; - unsigned int headlen = skb_headlen(skb); - - if (headlen) { - BUG_ON(!skb->head_frag); - head_frag.size = headlen; - head_frag.page.p = virt_to_page(skb->head); - head_frag.page_offset = skb->data - - (unsigned char *)page_address(head_frag.page.p); - if (__coalesce_frag(skb_head, &head_frag, skb)) - return -ENOMEM; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb)) - return -ENOMEM; - } - - return 0; -} - /** * Tempesta FW forwards skbs with application and transport payload as is, * so initialize such skbs such that TCP/IP stack won't stumble on dirty @@ -1236,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb) skb->ip_summed = CHECKSUM_PARTIAL; } +static inline int +__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag, + const struct sk_buff *orig_skb) +{ + struct sk_buff *skb = ss_skb_peek_tail(skb_head); + + if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { + skb = ss_skb_alloc(0); + if (!skb) + return -ENOMEM; + ss_skb_queue_tail(skb_head, skb); + skb->mark = orig_skb->mark; + } + + skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag; + ss_skb_adjust_data_len(skb, frag->size); + __skb_frag_ref(frag); + + return 0; +} + +static int +ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) +{ + int i; + skb_frag_t head_frag; + unsigned int headlen = skb_headlen(skb); + + if (headlen) { + BUG_ON(!skb->head_frag); + head_frag.size = headlen; + head_frag.page.p = virt_to_page(skb->head); + head_frag.page_offset = skb->data - + (unsigned char *)page_address(head_frag.page.p); + if (__coalesce_frag(skb_head, &head_frag, skb)) + return -ENOMEM; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb)) + return -ENOMEM; + } + + return 0; +} + /* * When the original SKB is a clone then its shinfo and payload cannot be * modified as they are shared with other SKB users. As the SKB is unrolled, @@ -1328,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb) * when we track whitelist requests during HTTP processing. */ f_skb->mark = skb->mark; - ss_skb_adjust_data_len(skb, -f_skb->len); + skb->len -= f_skb->len; + skb->data_len -= f_skb->len; + skb->truesize -= f_skb->truesize; f_skb->prev = prev_skb; prev_skb = f_skb; }