Skip to content

Commit

Permalink
Merge pull request #1161 from tempesta-tech/ak-984
Browse files Browse the repository at this point in the history
Fix #984
  • Loading branch information
krizhanovsky authored Jan 28, 2019
2 parents fe8e37d + 6c5e735 commit ef02fd1
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 99 deletions.
86 changes: 62 additions & 24 deletions linux-4.14.32.patch
Original file line number Diff line number Diff line change
Expand Up @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644
void tcp_twsk_destructor(struct sock *sk)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 83d11cd2..4e79cb5e 100644
index 83d11cd2..3066c3ac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,9 @@
Expand Down Expand Up @@ -1862,15 +1862,66 @@ index 83d11cd2..4e79cb5e 100644

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk)
@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
TCP_SKB_CB(skb)->eor = 0;
}

+/**
+ * Tempesta uses page fragments for all skb allocations, so if an skb was
+ * allocated in the standard Linux way, then pskb_expand_head( , 0, 0, ) may
+ * return a larger skb and we have to adjust skb->truesize and memory
+ * accounting for the TCP write queue.
+ */
+static int
+tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri)
+{
+ int r, delta_truesize = skb->truesize;
+
+ if ((r = skb_unclone(skb, pri)))
+ return r;
+
+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
+
+ return 0;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;

- if (skb_unclone(skb, gfp))
+ if (tcp_skb_unclone(sk, skb, gfp))
return -ENOMEM;

/* Get a new skb... force flag on. */
@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;

- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

delta_truesize = __pskb_trim_head(skb, len);
@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk)

return mss_now;
}
+EXPORT_SYMBOL(tcp_current_mss);

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota,
max_segs),
nonagle);
Expand All @@ -1892,7 +1943,7 @@ index 83d11cd2..4e79cb5e 100644
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
@@ -2336,7 +2355,34 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2336,7 +2381,32 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
Expand All @@ -1916,47 +1967,34 @@ index 83d11cd2..4e79cb5e 100644
+ net_warn_ratelimited(
+ "Tempesta: cannot encrypt data (%d),"
+ " reset a TLS connection.\n", result);
+ /*
+ * FIXME #984 WARNING: at net/core/stream.c:205
+ * sk_stream_kill_queues+0x106/0x120
+ */
+ tcp_reset(sk);
+ break;
+ }
+ /* Fix up TSO segments after TLS overhead. */
+ tcp_set_skb_tso_segs(skb, mss_now);
+ }
+#endif
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;

@@ -2518,6 +2564,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
@@ -2518,6 +2588,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
+EXPORT_SYMBOL(__tcp_push_pending_frames);

/* Send _single_ skb sitting at the send head. This function requires
* true push pending frames to setup probe timer etc.
@@ -2839,9 +2886,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
@@ -2839,7 +2910,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
+ int delta_truesize = skb->truesize;
+
if (skb_unclone(skb, GFP_ATOMIC))
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
diff -= tcp_skb_pcount(skb);
@@ -3129,6 +3186,7 @@ int tcp_send_synack(struct sock *sk)
@@ -3129,6 +3200,7 @@ int tcp_send_synack(struct sock *sk)
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
Expand Down
3 changes: 1 addition & 2 deletions tempesta_db/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
# Temple Place - Suite 330, Boston, MA 02111-1307, USA.

all: libtdb tdbq
.PHONY: all

.PHONY: libtdb
libtdb:
$(MAKE) -C libtdb

.PHONY: tdbq
tdbq: libtdb
$(MAKE) -C tdbq

Expand Down
11 changes: 6 additions & 5 deletions tempesta_fw/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT);
unsigned int mark = (*skb_head)->mark;

TFW_DBG3("[%d]: %s: sk=%p queue_empty=%d send_head=%p"
TFW_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK"
" sk_state=%d mss=%d size=%d\n",
smp_processor_id(), __func__,
sk, tcp_write_queue_empty(sk), tcp_send_head(sk),
Expand All @@ -369,7 +369,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
* these SKBs.
*/
if (!skb->len) {
TFW_DBG3("[%d]: %s: drop skb=%p data_len=%u len=%u\n",
TFW_DBG3("[%d]: %s: drop skb=%pK data_len=%u len=%u\n",
smp_processor_id(), __func__,
skb, skb->data_len, skb->len);
kfree_skb(skb);
Expand All @@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
/* Propagate mark of message head skb.*/
skb->mark = mark;

TFW_DBG3("[%d]: %s: entail skb=%p data_len=%u len=%u mark=%u"
" tls_type=%x\n", smp_processor_id(), __func__,
skb, skb->data_len, skb->len, skb->mark,
TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u"
" truesize=%u mark=%u tls_type=%x\n",
smp_processor_id(), __func__, sk,
skb, skb->data_len, skb->len, skb->truesize, skb->mark,
tempesta_tls_skb_type(skb));

skb_entail(sk, skb);
Expand Down
133 changes: 70 additions & 63 deletions tempesta_fw/ss_skb.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* on top on native Linux socket buffers. The helpers provide common and
* convenient wrappers for skb processing.
*
* Copyright (C) 2015-2018 Tempesta Technologies, Inc.
* Copyright (C) 2015-2019 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -634,15 +634,17 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
if (likely(!off)) {
frag->page_offset += len;
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
it->data = skb_frag_address(frag);
it->skb = skb;
return 0;
}
/* Fast path (e.g. TLS tag): delete the tail part of a fragment. */
if (likely(off + len == skb_frag_size(frag))) {
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
__it_next_data(skb, i + 1, it);
return 0;
}
Expand Down Expand Up @@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
ss_skb_adjust_data_len(skb, -tail_len);
ss_skb_adjust_data_len(skb_dst, tail_len);
}
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;

/* Get the SKB and the address for data after the deleted data. */
it->data = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]);
Expand Down Expand Up @@ -1102,27 +1105,29 @@ struct sk_buff *
ss_skb_split(struct sk_buff *skb, int len)
{
struct sk_buff *buff;
int nsize, asize, nlen;
int n = 0;

/* Assert that the SKB is orphaned. */
WARN_ON_ONCE(skb->destructor);

nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
asize = ALIGN(nsize, 4);
if (len < skb_headlen(skb))
n = skb_headlen(skb) - len;

buff = alloc_skb_fclone(asize + MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL)
buff = alloc_skb_fclone(ALIGN(n, 4) + MAX_TCP_HEADER, GFP_ATOMIC);
if (!buff)
return NULL;

skb_reserve(buff, MAX_TCP_HEADER);
/* Make sure there's exactly asize bytes available. */
buff->reserved_tailroom = buff->end - buff->tail - asize;

nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
/* @buff already accounts @n in truesize. */
buff->truesize += skb->len - len - n;
skb->truesize -= skb->len - len;

/*
* Initialize GSO segments counter to let TCP set it according to
* the current MSS on egress path.
*/
tcp_skb_pcount_set(skb, 0);

/*
* These are orphaned SKBs that are taken out of the TCP/IP
Expand All @@ -1136,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len)
return buff;
}

static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
__skb_frag_ref(frag);

return 0;
}

static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/**
* Tempesta FW forwards skbs with application and transport payload as is,
* so initialize such skbs such that TCP/IP stack won't stumble on dirty
Expand Down Expand Up @@ -1234,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
skb->ip_summed = CHECKSUM_PARTIAL;
}

static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
__skb_frag_ref(frag);

return 0;
}

static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/*
* When the original SKB is a clone then its shinfo and payload cannot be
* modified as they are shared with other SKB users. As the SKB is unrolled,
Expand Down Expand Up @@ -1326,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb)
* when we track whitelist requests during HTTP processing.
*/
f_skb->mark = skb->mark;
ss_skb_adjust_data_len(skb, -f_skb->len);
skb->len -= f_skb->len;
skb->data_len -= f_skb->len;
skb->truesize -= f_skb->truesize;
f_skb->prev = prev_skb;
prev_skb = f_skb;
}
Expand Down
Loading

0 comments on commit ef02fd1

Please sign in to comment.