Skip to content

Commit

Permalink
Fix #984:
Browse files Browse the repository at this point in the history
1. accurately fix skb->truesize and TCP write memory in the kernel
   by tcp_skb_unclone();

2. __split_pgfrag_del(): if we just move pointers, then we do not free
   TCP write memory, so do not change skb->truesize.

3. ss_skb_unroll(): truesize and data_len/len are completely different counters,
   so do not mix them in ss_skb_adjust_data_len(). By the way, during the tests
   I saw crazy skb overheads - truesize can be larger than len by tens of
   kilobytes. The explanation for such overheads is various fragment stealing
   (e.g. our __split_pgfrag_del) and cloning.

4. cleanup: move ss_skb coalescing functions closer to their calls.
  • Loading branch information
krizhanovsky committed Jan 20, 2019
1 parent 9862238 commit d964ff4
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 78 deletions.
83 changes: 63 additions & 20 deletions linux-4.14.32.patch
Original file line number Diff line number Diff line change
Expand Up @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644
void tcp_twsk_destructor(struct sock *sk)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 83d11cd2..b676c6a5 100644
index 83d11cd2..14918c14 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,9 @@
Expand Down Expand Up @@ -1862,15 +1862,66 @@ index 83d11cd2..b676c6a5 100644

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk)
@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
TCP_SKB_CB(skb)->eor = 0;
}

+/**
+ * Tempesta uses page fragments for all skb allocations, so if an skb was
+ * allocated in standard Linux way, then pskb_expand_head( , 0, 0, ) may
+ * return larger skb and we have to adjust skb->truesize and memory accounting
+ * for TCP write queue.
+ */
+static int
+tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri)
+{
+ int r, delta_truesize = skb->truesize;
+
+ if ((r = skb_unclone(skb, pri)))
+ return r;
+
+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
+
+ return 0;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;

- if (skb_unclone(skb, gfp))
+ if (tcp_skb_unclone(sk, skb, gfp))
return -ENOMEM;

/* Get a new skb... force flag on. */
@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;

- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

delta_truesize = __pskb_trim_head(skb, len);
@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk)

return mss_now;
}
+EXPORT_SYMBOL(tcp_current_mss);

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota,
max_segs),
nonagle);
Expand All @@ -1892,7 +1943,7 @@ index 83d11cd2..b676c6a5 100644
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
@@ -2336,7 +2355,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2336,7 +2381,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
Expand All @@ -1919,40 +1970,32 @@ index 83d11cd2..b676c6a5 100644
+ tcp_reset(sk);
+ break;
+ }
+ /* We must not break TSO. */
+ WARN_ON_ONCE(tcp_skb_pcount(skb)
+ != DIV_ROUND_UP(skb->len, mss_now));
+ }
+#endif
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;

@@ -2518,6 +2560,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
@@ -2518,6 +2589,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
+EXPORT_SYMBOL(__tcp_push_pending_frames);

/* Send _single_ skb sitting at the send head. This function requires
* true push pending frames to setup probe timer etc.
@@ -2839,9 +2882,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
@@ -2839,7 +2911,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
+ int delta_truesize = skb->truesize;
+
if (skb_unclone(skb, GFP_ATOMIC))
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
diff -= tcp_skb_pcount(skb);
@@ -3129,6 +3182,7 @@ int tcp_send_synack(struct sock *sk)
@@ -3129,6 +3201,7 @@ int tcp_send_synack(struct sock *sk)
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
Expand Down
13 changes: 5 additions & 8 deletions tempesta_fw/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
/* Propagate mark of message head skb.*/
skb->mark = mark;

TFW_DBG3("[%d]: %s: entail skb=%pK data_len=%u len=%u mark=%u"
" tls_type=%x\n", smp_processor_id(), __func__,
skb, skb->data_len, skb->len, skb->mark,
TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u"
" truesize=%u mark=%u tls_type=%x\n",
smp_processor_id(), __func__, sk,
skb, skb->data_len, skb->len, skb->truesize, skb->mark,
tempesta_tls_skb_type(skb));

skb_entail(sk, skb);
Expand Down Expand Up @@ -449,11 +450,7 @@ ss_send(struct sock *sk, struct sk_buff **skb_head, int flags)
* or copy them if they're going to be used by Tempesta during
* and after the transmission.
*/
/*
* FIXME #984 the `true ||` statement at the below fixes the issue
* (at least basic tests are passed now).
*/
if (/*true ||*/ flags & SS_F_KEEP_SKB) {
if (flags & SS_F_KEEP_SKB) {
skb = *skb_head;
do {
/* tcp_transmit_skb() will clone the skb. */
Expand Down
105 changes: 55 additions & 50 deletions tempesta_fw/ss_skb.c
Original file line number Diff line number Diff line change
Expand Up @@ -634,15 +634,17 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
if (likely(!off)) {
frag->page_offset += len;
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
it->ptr = skb_frag_address(frag);
it->skb = skb;
return 0;
}
/* Fast path (e.g. TLS tag): delete the tail part of a fragment. */
if (likely(off + len == skb_frag_size(frag))) {
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
__it_next_data(skb, i + 1, it);
return 0;
}
Expand Down Expand Up @@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
ss_skb_adjust_data_len(skb, -tail_len);
ss_skb_adjust_data_len(skb_dst, tail_len);
}
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;

/* Get the SKB and the address for data after the deleted data. */
it->ptr = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]);
Expand Down Expand Up @@ -1138,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len)
return buff;
}

static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
__skb_frag_ref(frag);

return 0;
}

static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/**
* Tempesta FW forwards skbs with application and transport payload as is,
* so initialize such skbs such that TCP/IP stack won't stumble on dirty
Expand Down Expand Up @@ -1236,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
skb->ip_summed = CHECKSUM_PARTIAL;
}

static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
__skb_frag_ref(frag);

return 0;
}

static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/*
* When the original SKB is a clone then its shinfo and payload cannot be
* modified as they are shared with other SKB users. As the SKB is unrolled,
Expand Down Expand Up @@ -1328,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb)
* when we track whitelist requests during HTTP processing.
*/
f_skb->mark = skb->mark;
ss_skb_adjust_data_len(skb, -f_skb->len);
skb->len -= f_skb->len;
skb->data_len -= f_skb->len;
skb->truesize -= f_skb->truesize;
f_skb->prev = prev_skb;
prev_skb = f_skb;
}
Expand Down

0 comments on commit d964ff4

Please sign in to comment.