diff --git a/linux-4.14.32.patch b/linux-4.14.32.patch
index a25e2255d3..8c77a0e5e3 100644
--- a/linux-4.14.32.patch
+++ b/linux-4.14.32.patch
@@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644
  void tcp_twsk_destructor(struct sock *sk)
  {
 diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
-index 83d11cd2..4e79cb5e 100644
+index 83d11cd2..3066c3ac 100644
 --- a/net/ipv4/tcp_output.c
 +++ b/net/ipv4/tcp_output.c
 @@ -37,6 +37,9 @@
@@ -1862,7 +1862,58 @@ index 83d11cd2..4e79cb5e 100644
 
  /* Initialize TSO segments for a packet. */
  static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
-@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk)
+@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+ 	TCP_SKB_CB(skb)->eor = 0;
+ }
+ 
++/**
++ * Tempesta uses page fragments for all skb allocations, so if an skb was
++ * allocated in the standard Linux way, then pskb_expand_head( , 0, 0, ) may
++ * return a larger skb, and we have to adjust skb->truesize and the memory
++ * accounting for the TCP write queue.
++ */
++static int
++tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri)
++{
++	int r, delta_truesize = skb->truesize;
++
++	if ((r = skb_unclone(skb, pri)))
++		return r;
++
++	delta_truesize -= skb->truesize;
++	sk->sk_wmem_queued -= delta_truesize;
++	if (delta_truesize > 0) {
++		sk_mem_uncharge(sk, delta_truesize);
++		sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
++	} else {
++		sk_mem_charge(sk, -delta_truesize);
++	}
++
++	return 0;
++}
++
+ /* Function to create two new TCP segments. Shrinks the given segment
+  * to the specified size and appends a new segment with the rest of the
+  * packet to the list. This won't be called frequently, I hope.
+@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ 	if (nsize < 0)
+ 		nsize = 0;
+ 
+-	if (skb_unclone(skb, gfp))
++	if (tcp_skb_unclone(sk, skb, gfp))
+ 		return -ENOMEM;
+ 
+ 	/* Get a new skb... force flag on. */
+@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+ {
+ 	u32 delta_truesize;
+ 
+-	if (skb_unclone(skb, GFP_ATOMIC))
++	if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
+ 		return -ENOMEM;
+ 
+ 	delta_truesize = __pskb_trim_head(skb, len);
+@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk)
  	return mss_now;
  }
 
@@ -1870,7 +1921,7 @@ index 83d11cd2..4e79cb5e 100644
  /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases, -@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); @@ -1892,7 +1943,7 @@ index 83d11cd2..4e79cb5e 100644 if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2336,7 +2355,34 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2336,7 +2381,32 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -1916,19 +1967,17 @@ index 83d11cd2..4e79cb5e 100644 + net_warn_ratelimited( + "Tempesta: cannot encrypt data (%d)," + " reset a TLS connection.\n", result); -+ /* -+ * FIXME #984 WARNING: at net/core/stream.c:205 -+ * sk_stream_kill_queues+0x106/0x120 -+ */ + tcp_reset(sk); + break; + } ++ /* Fix up TSO segments after TLS overhead. */ ++ tcp_set_skb_tso_segs(skb, mss_now); + } +#endif if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2518,6 +2564,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2518,6 +2588,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1936,27 +1985,16 @@ index 83d11cd2..4e79cb5e 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -2839,9 +2886,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -2839,7 +2910,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { -+ int delta_truesize = skb->truesize; -+ - if (skb_unclone(skb, GFP_ATOMIC)) +- if (skb_unclone(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) return -ENOMEM; -+ delta_truesize -= skb->truesize; -+ sk->sk_wmem_queued -= delta_truesize; -+ if (delta_truesize > 0) { -+ sk_mem_uncharge(sk, delta_truesize); -+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -+ } else { -+ sk_mem_charge(sk, -delta_truesize); -+ } diff = tcp_skb_pcount(skb); - tcp_set_skb_tso_segs(skb, cur_mss); - diff -= tcp_skb_pcount(skb); -@@ -3129,6 +3186,7 @@ int tcp_send_synack(struct sock *sk) +@@ -3129,6 +3200,7 @@ int tcp_send_synack(struct sock *sk) } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } diff --git a/tempesta_db/Makefile b/tempesta_db/Makefile index 4cc09b3d8f..15dd9e2754 100644 --- a/tempesta_db/Makefile +++ b/tempesta_db/Makefile @@ -18,12 +18,11 @@ # Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
 all: libtdb tdbq
+.PHONY: all
 
-.PHONY: libtdb
 libtdb:
 	$(MAKE) -C libtdb
 
-.PHONY: tdbq
 tdbq: libtdb
 	$(MAKE) -C tdbq
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 74d03dba74..9e3ad5581e 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -350,7 +350,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 	int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT);
 	unsigned int mark = (*skb_head)->mark;
 
-	TFW_DBG3("[%d]: %s: sk=%p queue_empty=%d send_head=%p"
+	TFW_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK"
 		 " sk_state=%d mss=%d size=%d\n",
 		 smp_processor_id(), __func__,
 		 sk, tcp_write_queue_empty(sk), tcp_send_head(sk),
@@ -369,7 +369,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 	 * these SKBs.
 	 */
 	if (!skb->len) {
-		TFW_DBG3("[%d]: %s: drop skb=%p data_len=%u len=%u\n",
+		TFW_DBG3("[%d]: %s: drop skb=%pK data_len=%u len=%u\n",
 			 smp_processor_id(), __func__, skb,
 			 skb->data_len, skb->len);
 		kfree_skb(skb);
@@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 		/* Propagate mark of message head skb.*/
 		skb->mark = mark;
 
-		TFW_DBG3("[%d]: %s: entail skb=%p data_len=%u len=%u mark=%u"
-			 " tls_type=%x\n", smp_processor_id(), __func__,
-			 skb, skb->data_len, skb->len, skb->mark,
+		TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u"
+			 " truesize=%u mark=%u tls_type=%x\n",
+			 smp_processor_id(), __func__, sk,
+			 skb, skb->data_len, skb->len, skb->truesize, skb->mark,
 			 tempesta_tls_skb_type(skb));
 
 		skb_entail(sk, skb);
diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c
index a1abd88ef9..973003b44c 100644
--- a/tempesta_fw/ss_skb.c
+++ b/tempesta_fw/ss_skb.c
@@ -7,7 +7,7 @@
  * on top on native Linux socket buffers. The helpers provide common and
  * convenient wrappers for skb processing.
  *
- * Copyright (C) 2015-2018 Tempesta Technologies, Inc.
+ * Copyright (C) 2015-2019 Tempesta Technologies, Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by
@@ -634,7 +634,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
 	if (likely(!off)) {
 		frag->page_offset += len;
 		skb_frag_size_sub(frag, len);
-		ss_skb_adjust_data_len(skb, -len);
+		skb->len -= len;
+		skb->data_len -= len;
 		it->data = skb_frag_address(frag);
 		it->skb = skb;
 		return 0;
@@ -642,7 +643,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
 	/* Fast path (e.g. TLS tag): delete the tail part of a fragment. */
 	if (likely(off + len == skb_frag_size(frag))) {
 		skb_frag_size_sub(frag, len);
-		ss_skb_adjust_data_len(skb, -len);
+		skb->len -= len;
+		skb->data_len -= len;
 		__it_next_data(skb, i + 1, it);
 		return 0;
 	}
@@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
 		ss_skb_adjust_data_len(skb, -tail_len);
 		ss_skb_adjust_data_len(skb_dst, tail_len);
 	}
-	ss_skb_adjust_data_len(skb, -len);
+	skb->len -= len;
+	skb->data_len -= len;
 
 	/* Get the SKB and the address for data after the deleted data. */
 	it->data = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]);
@@ -1102,27 +1105,29 @@ struct sk_buff *
 ss_skb_split(struct sk_buff *skb, int len)
 {
 	struct sk_buff *buff;
-	int nsize, asize, nlen;
+	int n = 0;
 
 	/* Assert that the SKB is orphaned. */
 	WARN_ON_ONCE(skb->destructor);
 
-	nsize = skb_headlen(skb) - len;
-	if (nsize < 0)
-		nsize = 0;
-	asize = ALIGN(nsize, 4);
+	if (len < skb_headlen(skb))
+		n = skb_headlen(skb) - len;
 
-	buff = alloc_skb_fclone(asize + MAX_TCP_HEADER, GFP_ATOMIC);
-	if (buff == NULL)
+	buff = alloc_skb_fclone(ALIGN(n, 4) + MAX_TCP_HEADER, GFP_ATOMIC);
+	if (!buff)
 		return NULL;
 
 	skb_reserve(buff, MAX_TCP_HEADER);
-	/* Make sure there's exactly asize bytes available. */
-	buff->reserved_tailroom = buff->end - buff->tail - asize;
-	nlen = skb->len - len - nsize;
-	buff->truesize += nlen;
-	skb->truesize -= nlen;
+	/* @buff already accounts for @n in its truesize. */
+	buff->truesize += skb->len - len - n;
+	skb->truesize -= skb->len - len;
+
+	/*
+	 * Initialize the GSO segments counter to let TCP set it according
+	 * to the current MSS on the egress path.
+	 */
+	tcp_skb_pcount_set(skb, 0);
 
 	/*
 	 * These are orphaned SKBs that are taken out of the TCP/IP
@@ -1136,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len)
 	return buff;
 }
 
-static inline int
-__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
-		const struct sk_buff *orig_skb)
-{
-	struct sk_buff *skb = ss_skb_peek_tail(skb_head);
-
-	if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
-		skb = ss_skb_alloc(0);
-		if (!skb)
-			return -ENOMEM;
-		ss_skb_queue_tail(skb_head, skb);
-		skb->mark = orig_skb->mark;
-	}
-
-	skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
-	ss_skb_adjust_data_len(skb, frag->size);
-	__skb_frag_ref(frag);
-
-	return 0;
-}
-
-static int
-ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
-{
-	int i;
-	skb_frag_t head_frag;
-	unsigned int headlen = skb_headlen(skb);
-
-	if (headlen) {
-		BUG_ON(!skb->head_frag);
-		head_frag.size = headlen;
-		head_frag.page.p = virt_to_page(skb->head);
-		head_frag.page_offset = skb->data -
-			(unsigned char *)page_address(head_frag.page.p);
-		if (__coalesce_frag(skb_head, &head_frag, skb))
-			return -ENOMEM;
-	}
-
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
-			return -ENOMEM;
-	}
-
-	return 0;
-}
-
 /**
  * Tempesta FW forwards skbs with application and transport payload as is,
  * so initialize such skbs such that TCP/IP stack won't stumble on dirty
@@ -1234,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
 	skb->ip_summed = CHECKSUM_PARTIAL;
 }
 
+static inline int
+__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
+		const struct sk_buff *orig_skb)
+{
+	struct sk_buff *skb = ss_skb_peek_tail(skb_head);
+
+	if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
+		skb = ss_skb_alloc(0);
+		if (!skb)
+			return -ENOMEM;
+		ss_skb_queue_tail(skb_head, skb);
+		skb->mark = orig_skb->mark;
+	}
+
+	skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
+	ss_skb_adjust_data_len(skb, frag->size);
+	__skb_frag_ref(frag);
+
+	return 0;
+}
+
+static int
+ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
+{
+	int i;
+	skb_frag_t head_frag;
+	unsigned int headlen = skb_headlen(skb);
+
+	if (headlen) {
+		BUG_ON(!skb->head_frag);
+		head_frag.size = headlen;
+		head_frag.page.p = virt_to_page(skb->head);
+		head_frag.page_offset = skb->data -
+			(unsigned char *)page_address(head_frag.page.p);
+		if (__coalesce_frag(skb_head, &head_frag, skb))
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /*
  * When the original SKB is a clone then its shinfo and payload cannot be
  * modified as they are shared with other SKB users. As the SKB is unrolled,
@@ -1326,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb)
 		 * when we track whitelist requests during HTTP processing.
 		 */
 		f_skb->mark = skb->mark;
-		ss_skb_adjust_data_len(skb, -f_skb->len);
+		skb->len -= f_skb->len;
+		skb->data_len -= f_skb->len;
+		skb->truesize -= f_skb->truesize;
 		f_skb->prev = prev_skb;
 		prev_skb = f_skb;
 	}
diff --git a/tempesta_fw/tls.c b/tempesta_fw/tls.c
index f9a5583c47..0e64ba2594 100644
--- a/tempesta_fw/tls.c
+++ b/tempesta_fw/tls.c
@@ -3,7 +3,7 @@
  *
  * Transport Layer Security (TLS) interfaces to Tempesta TLS.
  *
- * Copyright (C) 2015-2018 Tempesta Technologies, Inc.
+ * Copyright (C) 2015-2019 Tempesta Technologies, Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by
@@ -183,7 +183,6 @@ tfw_tls_tcp_add_overhead(struct sock *sk, unsigned int overhead)
 {
 	sk->sk_wmem_queued += overhead;
 	sk_mem_charge(sk, overhead);
-	tcp_sk(sk)->write_seq += overhead;
 }
 
 /**
@@ -230,7 +229,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 #define AUTO_SEGS_N	8
 
 	int r = -ENOMEM;
-	unsigned int head_sz, tag_sz, len, frags;
+	unsigned int head_sz, tag_sz, len, frags, t_sz;
 	unsigned char type;
 	struct sk_buff *next = skb, *skb_tail = skb;
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -297,6 +296,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * if there is no free frag slot in skb_tail, a new skb is allocated.
 	 */
 	next = skb_tail->next;
+	t_sz = skb_tail->truesize;
+	WARN_ON_ONCE(next == skb);
 	if (skb_tail == skb) {
 		r = ss_skb_expand_head_tail(skb->next, skb, head_sz, tag_sz);
 		if (r < 0)
@@ -318,12 +319,31 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * The last skb in our list will bring TLS tag - add it to end_seqno.
 	 * Otherwise (in worst case), a new skb was inserted to fit TLS tag
 	 * - fix end_seqno's for @skb_tail and this new skb.
+	 *
+	 * @limit = mss_now - tls_overhead, so {tso,tcp}_fragment() called from
+	 * tcp_write_xmit() should set the proper skb->tcp_gso_segs.
 	 */
 	if (likely(skb_tail->next == next)) {
 		TCP_SKB_CB(skb_tail)->end_seq += tag_sz;
-	} else {
+
+		/* A new frag is added to the end of the current skb. */
+		WARN_ON_ONCE(t_sz >= skb_tail->truesize);
+		t_sz = skb_tail->truesize - t_sz;
+	}
+	else {
 		WARN_ON_ONCE(skb_tail->next->len != tag_sz);
+		WARN_ON_ONCE(skb_tail->truesize != t_sz);
+
 		tfw_tls_tcp_propagate_dseq(sk, skb_tail);
+
+		/*
+		 * A new skb is added to the socket wmem.
+		 *
+		 * pcount for a new skb is zero, so tcp_write_xmit() will
+		 * set the TSO segs to a proper value on the next iteration.
+		 */
+		t_sz = skb_tail->next->truesize;
 		skb_tail = skb_tail->next;
 	}
 
@@ -336,8 +356,17 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * consistent state.
 	 */
 	tfw_tls_tcp_propagate_dseq(sk, skb_tail);
+	tcp_sk(sk)->write_seq += head_sz + tag_sz;
 
-	tfw_tls_tcp_add_overhead(sk, head_sz + tag_sz);
+	/*
+	 * The TLS record header is always allocated from the reserved skb
+	 * headroom. The room for the tag may also be allocated from the
+	 * reserved tailroom, or in a new page fragment in skb_tail or the
+	 * next, possibly new, skb. So to adjust the socket write memory we
+	 * have to check both skbs, and only for tag_sz.
+	 */
+	WARN_ON_ONCE(t_sz < tag_sz);
+	tfw_tls_tcp_add_overhead(sk, t_sz);
 
 	if (likely(sgt.nents <= AUTO_SEGS_N)) {
 		sgt.sgl = sg;
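
The truesize bookkeeping that this patch centralizes in tcp_skb_unclone() and reuses in tfw_tls_tcp_add_overhead() can be checked in isolation. Below is a minimal user-space sketch of the same arithmetic; struct mock_sock, struct mock_skb, and the single sk_forward_alloc counter are illustrative stand-ins (not kernel API) for struct sock, struct sk_buff, and the sk_mem_charge()/sk_mem_uncharge() helpers.

/*
 * Minimal model of the truesize accounting in tcp_skb_unclone() above.
 * Build with: cc -o unclone_model unclone_model.c
 */
#include <stdio.h>

struct mock_sock {
	int sk_wmem_queued;	/* bytes charged to the send queue */
	int sk_forward_alloc;	/* forward-allocated memory pool */
	int queue_shrunk;	/* models the SOCK_QUEUE_SHRUNK flag */
};

struct mock_skb {
	int truesize;		/* true memory footprint of the skb */
};

/*
 * skb_unclone() may leave the skb with a larger (or smaller) truesize,
 * so propagate the delta to the socket's write-queue accounting.
 */
static void
unclone_account(struct mock_sock *sk, struct mock_skb *skb, int new_truesize)
{
	int delta_truesize = skb->truesize;

	skb->truesize = new_truesize;	/* what skb_unclone() left behind */
	delta_truesize -= skb->truesize;

	sk->sk_wmem_queued -= delta_truesize;
	if (delta_truesize > 0) {
		/* The skb shrank: return the memory and flag the shrink. */
		sk->sk_forward_alloc += delta_truesize;
		sk->queue_shrunk = 1;
	} else {
		/* The skb grew: charge the extra bytes to the socket. */
		sk->sk_forward_alloc -= -delta_truesize;
	}
}

int
main(void)
{
	struct mock_sock sk = { .sk_wmem_queued = 4096,
				.sk_forward_alloc = 8192 };
	struct mock_skb skb = { .truesize = 2048 };

	/* pskb_expand_head() returned a larger buffer: 2048 -> 3072. */
	unclone_account(&sk, &skb, 3072);

	/* Expect: wmem_queued=5120 forward_alloc=7168 shrunk=0 */
	printf("wmem_queued=%d forward_alloc=%d shrunk=%d\n",
	       sk.sk_wmem_queued, sk.sk_forward_alloc, sk.queue_shrunk);
	return 0;
}

The sign convention is the point: delta_truesize is old minus new, so a grown skb yields a negative delta, which increases sk_wmem_queued and charges the difference, exactly mirroring the two call sites (tcp_fragment()/tcp_trim_head() via tcp_skb_unclone(), and the TLS tag room accounted via tfw_tls_tcp_add_overhead()).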