diff --git a/linux-4.9.35.patch b/linux-4.9.35.patch index 17b8df05c5..aad637e865 100644 --- a/linux-4.9.35.patch +++ b/linux-4.9.35.patch @@ -147,7 +147,7 @@ index cd0c8bd..df4de18 100644 struct kvec; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index 32810f2..3c5bd09 100644 +index 32810f2..107e395 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -652,8 +652,12 @@ struct sk_buff { @@ -198,7 +198,20 @@ index 32810f2..3c5bd09 100644 #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ -@@ -1791,7 +1802,11 @@ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) +@@ -961,12 +972,6 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, + return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); + } + +-struct sk_buff *__alloc_skb_head(gfp_t priority, int node); +-static inline struct sk_buff *alloc_skb_head(gfp_t priority) +-{ +- return __alloc_skb_head(priority, -1); +-} +- + struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); + int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); + struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); +@@ -1791,7 +1796,11 @@ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) static inline bool skb_is_nonlinear(const struct sk_buff *skb) { @@ -210,7 +223,7 @@ index 32810f2..3c5bd09 100644 } static inline unsigned int skb_headlen(const struct sk_buff *skb) -@@ -1977,6 +1992,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) +@@ -1977,6 +1986,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) return skb->data - skb->head; } @@ -231,7 +244,7 @@ index 32810f2..3c5bd09 100644 /** * skb_tailroom - bytes at buffer end * @skb: buffer to check -@@ -3873,5 +3902,29 @@ static inline __wsum lco_csum(struct sk_buff *skb) +@@ -3873,5 +3896,29 @@ static inline __wsum lco_csum(struct sk_buff *skb) return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } @@ -449,7 +462,7 @@ index 744fa61..da91fc7 100644 restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); -@@ -305,6 +310,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) +@@ -305,6 +310,9 @@ restart: wakeup_softirqd(); } @@ -838,7 +851,7 @@ index 5d26056..ee02253 100644 } +EXPORT_SYMBOL(reqsk_fastopen_remove); diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index fe008f1..f232d88 100644 +index fe008f1..a0d768f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,7 +79,9 @@ @@ -859,7 +872,7 @@ index fe008f1..f232d88 100644 /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and -@@ -151,6 +154,173 @@ static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, +@@ -151,22 +154,179 @@ out: return obj; } @@ -876,13 +889,17 @@ index fe008f1..f232d88 100644 +#define PG_ALLOC_SZ(s) (((s) + (PG_CHUNK_SZ - 1)) & PG_CHUNK_MASK) +#define PG_CHUNK_NUM(s) (PG_ALLOC_SZ(s) >> PG_CHUNK_BITS) +#define PG_POOL_HLIM_BASE 256 -+ + +-/* Allocate a new skbuff. We do this ourselves so we can fill in a few +- * 'private' fields and also do memory statistics to find all the +- * [BEEP] leaks. +- * +/** + * @lh - list head of chunk pool; + * @count - current number of chunks in @lh; + * @h_limit - hard limit for size of @lh; + * @max - current maximum allowed size of the list, can be 0. -+ */ + */ +typedef struct { + struct list_head lh; + unsigned int count; @@ -905,10 +922,12 @@ index fe008f1..f232d88 100644 + ++pool->max; + return true; +} -+ + +-struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +static bool +__pg_pool_shrink(TfwSkbMemPool *pool) -+{ + { +- struct sk_buff *skb; + if (unlikely(pool->count >= pool->max)) { + /* Producers are much faster consumers right now. */ + pool->max >>= 1; @@ -928,7 +947,12 @@ index fe008f1..f232d88 100644 + --pool->max; + return true; +} -+ + +- /* Get the HEAD */ +- skb = kmem_cache_alloc_node(skbuff_head_cache, +- gfp_mask & ~__GFP_DMA, node); +- if (!skb) +- goto out; +static void * +__pg_skb_alloc(unsigned int size, gfp_t gfp_mask, int node) +{ @@ -1030,34 +1054,33 @@ index fe008f1..f232d88 100644 +#undef PREEMPT_CTX_ENABLE +} +#endif - - /* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the -@@ -183,6 +353,54 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) - return skb; - } - ++ +static void +__alloc_skb_init(struct sk_buff *skb, u8 *data, unsigned int size, + int flags, bool pfmemalloc) +{ + struct skb_shared_info *shinfo; -+ -+ /* -+ * Only clear those fields we need to clear, not those that we will -+ * actually initialise below. Hence, don't put any more fields after -+ * the tail pointer in struct sk_buff! -+ */ -+ memset(skb, 0, offsetof(struct sk_buff, tail)); + + /* + * Only clear those fields we need to clear, not those that we will +@@ -174,13 +334,40 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); +- skb->head = NULL; +- skb->truesize = sizeof(struct sk_buff); + /* Account for allocated memory : skb + skb->head */ + skb->truesize = SKB_TRUESIZE(size); + skb->pfmemalloc = pfmemalloc; -+ atomic_set(&skb->users, 1); + atomic_set(&skb->users, 1); +- + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; -+ skb->mac_header = (typeof(skb->mac_header))~0U; + skb->mac_header = (typeof(skb->mac_header))~0U; +-out: +- return skb; + skb->transport_header = (typeof(skb->transport_header))~0U; + + /* make sure we initialize shinfo sequentially */ @@ -1083,12 +1106,10 @@ index fe008f1..f232d88 100644 + fclones->skb2.head_frag = 1; +#endif + } -+} -+ + } + /** - * __alloc_skb - allocate a network buffer - * @size: size to allocate -@@ -200,11 +418,11 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +@@ -200,11 +387,11 @@ out: * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ @@ -1101,7 +1122,7 @@ index fe008f1..f232d88 100644 struct sk_buff *skb; u8 *data; bool pfmemalloc; -@@ -238,41 +456,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, +@@ -238,41 +425,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, size = SKB_WITH_OVERHEAD(ksize(data)); prefetchw(data + size); @@ -1144,7 +1165,7 @@ index fe008f1..f232d88 100644 out: return skb; nodata: -@@ -280,6 +464,42 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, +@@ -280,6 +433,42 @@ nodata: skb = NULL; goto out; } @@ -1187,7 +1208,7 @@ index fe008f1..f232d88 100644 EXPORT_SYMBOL(__alloc_skb); /** -@@ -620,7 +840,12 @@ static void kfree_skbmem(struct sk_buff *skb) +@@ -620,7 +809,12 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -1201,7 +1222,7 @@ index fe008f1..f232d88 100644 return; case SKB_FCLONE_ORIG: -@@ -641,7 +866,12 @@ static void kfree_skbmem(struct sk_buff *skb) +@@ -641,7 +835,12 @@ static void kfree_skbmem(struct sk_buff *skb) if (!atomic_dec_and_test(&fclones->fclone_ref)) return; fastpath: @@ -1214,7 +1235,7 @@ index fe008f1..f232d88 100644 } static void skb_release_head_state(struct sk_buff *skb) -@@ -777,6 +1007,17 @@ static inline void _kfree_skb_defer(struct sk_buff *skb) +@@ -777,6 +976,17 @@ static inline void _kfree_skb_defer(struct sk_buff *skb) /* drop skb->head and call any destructors for packet */ skb_release_all(skb); @@ -1232,7 +1253,7 @@ index fe008f1..f232d88 100644 /* record skb to CPU local list */ nc->skb_cache[nc->skb_count++] = skb; -@@ -837,7 +1078,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +@@ -837,7 +1047,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; @@ -1245,17 +1266,7 @@ index fe008f1..f232d88 100644 skb_dst_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); -@@ -932,6 +1178,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) - struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) - { - skb_release_all(dst); -+#ifdef CONFIG_SECURITY_TEMPESTA -+ dst->skb_page = src->skb_page; -+#endif - return __skb_clone(dst, src); - } - EXPORT_SYMBOL_GPL(skb_morph); -@@ -1025,6 +1274,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +@@ -1025,6 +1240,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) atomic_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; atomic_set(&fclones->fclone_ref, 2); @@ -1266,7 +1277,7 @@ index fe008f1..f232d88 100644 } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -@@ -1035,6 +1288,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +@@ -1035,6 +1254,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; @@ -1276,7 +1287,7 @@ index fe008f1..f232d88 100644 } return __skb_clone(n, skb); -@@ -1205,15 +1461,22 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1205,15 +1427,22 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_shared(skb)) BUG(); @@ -1302,7 +1313,7 @@ index fe008f1..f232d88 100644 /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. -@@ -1246,7 +1509,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1246,7 +1475,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, off = (data + nhead) - skb->head; skb->head = data; @@ -1315,7 +1326,7 @@ index fe008f1..f232d88 100644 skb->data += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->end = size; -@@ -1263,7 +1531,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1263,7 +1497,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, return 0; nofrags: @@ -1327,7 +1338,7 @@ index fe008f1..f232d88 100644 nodata: return -ENOMEM; } -@@ -1372,7 +1644,11 @@ int skb_pad(struct sk_buff *skb, int pad) +@@ -1372,7 +1610,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; } @@ -1339,7 +1350,7 @@ index fe008f1..f232d88 100644 if (likely(skb_cloned(skb) || ntail > 0)) { err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); if (unlikely(err)) -@@ -1607,7 +1883,13 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +@@ -1607,7 +1849,13 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) * plus 128 bytes for future expansions. If we have enough * room at tail, reallocate without expansion only if skb is cloned. */ @@ -1354,7 +1365,7 @@ index fe008f1..f232d88 100644 if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, -@@ -3463,16 +3745,31 @@ EXPORT_SYMBOL_GPL(skb_gro_receive); +@@ -3463,16 +3711,31 @@ EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -1391,21 +1402,23 @@ index fe008f1..f232d88 100644 } /** -@@ -4262,7 +4559,12 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) +@@ -4262,7 +4525,15 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); -- kmem_cache_free(skbuff_head_cache, skb); +#ifdef CONFIG_SECURITY_TEMPESTA -+ if (skb->skb_page) -+ put_page(virt_to_page(skb)); -+ else ++ /* ++ * fclones are possible here with Tempesta due to using ++ * pskb_copy_for_clone() in ss_send(). ++ */ ++ kfree_skbmem(skb); ++#else + kmem_cache_free(skbuff_head_cache, skb); +#endif -+ kmem_cache_free(skbuff_head_cache, skb); } else { __kfree_skb(skb); } -@@ -4704,13 +5006,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, +@@ -4704,13 +4975,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1427,7 +1440,31 @@ index fe008f1..f232d88 100644 /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); -@@ -4828,13 +5137,20 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, +@@ -4723,7 +5001,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_free_frag(data); ++#else + kfree(data); ++#endif + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -4740,7 +5022,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + + skb->head = data; + skb->data = data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++#else + skb->head_frag = 0; ++#endif + #ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + #else +@@ -4828,19 +5114,30 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1449,31 +1486,76 @@ index fe008f1..f232d88 100644 memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_orphan_frags(skb, gfp_mask)) { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_free_frag(data); ++#else + kfree(data); ++#endif + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); +@@ -4878,8 +5175,12 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + skb_release_data(skb); + + skb->head = data; +- skb->head_frag = 0; + skb->data = data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++#else ++ skb->head_frag = 0; ++#endif + #ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + #else +diff --git a/net/core/sock.c b/net/core/sock.c +index 1989b3d..471c4f6 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1642,8 +1642,16 @@ void sock_wfree(struct sk_buff *skb) + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets + */ +- if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) ++ if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) { ++ /* ++ * We don't bother with Tempesta socket memory limitations ++ * since in proxy mode we just forward packets instead of real ++ * allocations. Probably this is an issue. Probably sockets ++ * can be freed from under us. ++ */ ++ WARN_ON(sock_flag(sk, SOCK_TEMPESTA)); + __sk_free(sk); ++ } + } + EXPORT_SYMBOL(sock_wfree); + diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c -index cf3d567..e93129c 100644 +index cf3d567..0f00e28 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c -@@ -801,7 +801,14 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, - if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_child_forget(sk, req, child); - child = NULL; -- } else { -+ } +@@ -797,6 +797,14 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + +#ifdef CONFIG_SECURITY_TEMPESTA -+ else if (sock_flag(sk, SOCK_TEMPESTA)) { ++ if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_TEMPESTA)) { + /* Tempesta doesn't use accept queue, just put the request. */ + reqsk_put(req); ++ return child; + } +#endif -+ else { - req->sk = child; - req->dl_next = NULL; - if (queue->rskq_accept_head == NULL) ++ + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ca97835..8427f32 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c -@@ -621,7 +621,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, +@@ -621,7 +621,8 @@ other_parity_scan: goto ok; next_port: spin_unlock_bh(&head->lock); @@ -1484,7 +1566,7 @@ index ca97835..8427f32 100644 offset++; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 86fbf0f..6944a4a 100644 +index 86fbf0f..628b5df 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -592,18 +592,19 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) @@ -1509,7 +1591,20 @@ index 86fbf0f..6944a4a 100644 { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -@@ -621,6 +622,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) +@@ -612,7 +613,11 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) + tcb->seq = tcb->end_seq = tp->write_seq; + tcb->tcp_flags = TCPHDR_ACK; + tcb->sacked = 0; +- __skb_header_release(skb); ++ /* ++ * fclones are possible here, so accurately update ++ * skb_shinfo(skb)->dataref. ++ */ ++ skb_header_release(skb); + tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +@@ -621,6 +626,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcp_slow_start_after_idle_check(sk); } @@ -1517,7 +1612,7 @@ index 86fbf0f..6944a4a 100644 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { -@@ -647,8 +649,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, +@@ -647,8 +653,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, atomic_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -1528,7 +1623,7 @@ index 86fbf0f..6944a4a 100644 { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; -@@ -681,6 +683,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, +@@ -681,6 +687,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, __tcp_push_pending_frames(sk, mss_now, nonagle); } @@ -1536,7 +1631,7 @@ index 86fbf0f..6944a4a 100644 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -@@ -871,7 +874,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, +@@ -871,7 +878,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } @@ -1545,7 +1640,7 @@ index 86fbf0f..6944a4a 100644 { int mss_now; -@@ -880,6 +883,7 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +@@ -880,6 +887,7 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } @@ -1553,7 +1648,7 @@ index 86fbf0f..6944a4a 100644 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) -@@ -1422,7 +1426,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) +@@ -1422,7 +1430,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ @@ -1562,7 +1657,7 @@ index 86fbf0f..6944a4a 100644 { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; -@@ -1479,6 +1483,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) +@@ -1479,6 +1487,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) if (time_to_ack) tcp_send_ack(sk); } @@ -1570,7 +1665,7 @@ index 86fbf0f..6944a4a 100644 static void tcp_prequeue_process(struct sock *sk) { -@@ -2012,7 +2017,7 @@ static const unsigned char new_state[16] = { +@@ -2012,7 +2021,7 @@ static const unsigned char new_state[16] = { [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; @@ -1579,7 +1674,7 @@ index 86fbf0f..6944a4a 100644 { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; -@@ -2021,6 +2026,7 @@ static int tcp_close_state(struct sock *sk) +@@ -2021,6 +2030,7 @@ static int tcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } @@ -1587,7 +1682,7 @@ index 86fbf0f..6944a4a 100644 /* * Shutdown the sending side of a connection. Much like close except -@@ -2060,6 +2066,7 @@ bool tcp_check_oom(struct sock *sk, int shift) +@@ -2060,6 +2070,7 @@ bool tcp_check_oom(struct sock *sk, int shift) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } @@ -1599,7 +1694,7 @@ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01336aa..3e47409 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -642,6 +642,7 @@ void tcp_rcv_space_adjust(struct sock *sk) +@@ -642,6 +642,7 @@ new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tcp_time_stamp; } diff --git a/tempesta_fw/classifier.h b/tempesta_fw/classifier.h index 9a24ee59b3..3299eb6f89 100644 --- a/tempesta_fw/classifier.h +++ b/tempesta_fw/classifier.h @@ -2,7 +2,7 @@ * Tempesta FW * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2016 Tempesta Technologies, Inc. + * Copyright (C) 2015-2017 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by diff --git a/tempesta_fw/http.c b/tempesta_fw/http.c index 1e45ba1b77..0fb48a3dc1 100644 --- a/tempesta_fw/http.c +++ b/tempesta_fw/http.c @@ -1544,7 +1544,7 @@ tfw_http_conn_send(TfwConn *conn, TfwMsg *msg) /** * Create a sibling for @msg message. - * Siblings in HTTP are pipelined requests that share the same SKB. + * Siblings in HTTP are pipelined HTTP messages that share the same SKB. */ static TfwHttpMsg * tfw_http_msg_create_sibling(TfwHttpMsg *hm, struct sk_buff **skb, diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c index 03a75d8e51..6bd86495ce 100644 --- a/tempesta_fw/sock.c +++ b/tempesta_fw/sock.c @@ -126,14 +126,6 @@ ss_sk_incoming_cpu_update(struct sock *sk) sk->sk_incoming_cpu = raw_smp_processor_id(); } -static inline void -skb_sender_cpu_clear(struct sk_buff *skb) -{ -#ifdef CONFIG_XPS - skb->sender_cpu = 0; -#endif -} - /** * Enters critical section synchronized with ss_synchronize(). * Active networking operations which involves SS callback calls must be @@ -360,13 +352,7 @@ ss_do_send(struct sock *sk, SsSkbList *skb_list, int flags) continue; } - skb->ip_summed = CHECKSUM_PARTIAL; - tcp_skb_pcount_set(skb, 0); - - /* @skb should be rerouted on forwarding. */ - skb_dst_drop(skb); - /* Clear sender_cpu so flow_disscector can set it properly. */ - skb_sender_cpu_clear(skb); + ss_skb_init_for_xmit(skb); TFW_DBG("[%d]: %s: entail skb=%p data_len=%u len=%u\n", smp_processor_id(), __func__, @@ -411,7 +397,7 @@ ss_send(struct sock *sk, SsSkbList *skb_list, int flags) }; BUG_ON(!sk); - BUG_ON(ss_skb_queue_empty(skb_list)); + WARN_ON_ONCE(ss_skb_queue_empty(skb_list)); cpu = sk->sk_incoming_cpu; @@ -506,7 +492,6 @@ ss_do_close(struct sock *sk) { struct sk_buff *skb; int data_was_unread = 0; - int state; if (unlikely(!sk)) return; @@ -514,13 +499,14 @@ ss_do_close(struct sock *sk) smp_processor_id(), sk, ss_statename[sk->sk_state], sk_has_account(sk), atomic_read(&sk->sk_refcnt)); assert_spin_locked(&sk->sk_lock.slock); - BUG_ON(sk->sk_state == TCP_LISTEN); + BUG_ON(sk->sk_lock.slock.rlock.owner_cpu != raw_smp_processor_id()); + WARN_ON_ONCE(sk->sk_state == TCP_LISTEN); /* We must return immediately, so LINGER option is meaningless. */ - WARN_ON(sock_flag(sk, SOCK_LINGER)); + WARN_ON_ONCE(sock_flag(sk, SOCK_LINGER)); /* We don't support virtual containers, so TCP_REPAIR is prohibited. */ - WARN_ON(tcp_sk(sk)->repair); + WARN_ON_ONCE(tcp_sk(sk)->repair); /* The socket must have atomic allocation mask. */ - WARN_ON(!(sk->sk_allocation & GFP_ATOMIC)); + WARN_ON_ONCE(!(sk->sk_allocation & GFP_ATOMIC)); /* The below is mostly copy-paste from tcp_close(). */ sk->sk_shutdown = SHUTDOWN_MASK; @@ -550,7 +536,7 @@ ss_do_close(struct sock *sk) skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk) != NULL) { + if (skb && tcp_send_head(sk)) { /* Send FIN with data if we have any. */ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(skb)->end_seq++; @@ -573,7 +559,6 @@ ss_do_close(struct sock *sk) } adjudge_to_death: - state = sk->sk_state; sock_hold(sk); sock_orphan(sk); @@ -585,9 +570,6 @@ ss_do_close(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); - if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) - return; - if (sk->sk_state == TCP_FIN_WAIT2) { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { @@ -707,9 +689,10 @@ ss_tcp_process_skb(struct sock *sk, struct sk_buff *skb, int *processed) while ((skb = ss_skb_dequeue(&skb_list))) { int off; - - /* We don't expect to see such SKBs here */ - WARN_ON(skb->tail_lock); + + WARN_ON_ONCE(skb->tail_lock); + WARN_ON_ONCE(skb_has_frag_list(skb)); + WARN_ON_ONCE(skb->sk || skb->destructor); if (unlikely(offset >= skb->len)) { offset -= skb->len; @@ -787,9 +770,7 @@ ss_tcp_process_data(struct sock *sk) __skb_unlink(skb, &sk->sk_receive_queue); skb_orphan(skb); - /* Shared SKBs shouldn't be seen here. */ - if (skb_shared(skb)) - BUG(); + WARN_ON_ONCE(skb_shared(skb)); /* Save the original len and seq for reporting. */ skb_len = skb->len; @@ -835,6 +816,7 @@ ss_tcp_data_ready(struct sock *sk) TFW_DBG("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); assert_spin_locked(&sk->sk_lock.slock); + BUG_ON(sk->sk_lock.slock.rlock.owner_cpu != raw_smp_processor_id()); if (!skb_queue_empty(&sk->sk_error_queue)) { /* @@ -884,6 +866,7 @@ ss_tcp_state_change(struct sock *sk) smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); ss_sk_incoming_cpu_update(sk); assert_spin_locked(&sk->sk_lock.slock); + BUG_ON(sk->sk_lock.slock.rlock.owner_cpu != raw_smp_processor_id()); if (sk->sk_state == TCP_ESTABLISHED) { /* Process the new TCP connection. */ @@ -999,7 +982,7 @@ ss_proto_init(SsProto *proto, const SsHooks *hooks, int type) * The memory allocated for @proto should be already zero'ed, so don't * initialize this field to NULL, but instead check the invariant. */ - BUG_ON(proto->listener); + WARN_ON_ONCE(proto->listener); } EXPORT_SYMBOL(ss_proto_init); @@ -1026,7 +1009,7 @@ ss_set_callbacks(struct sock *sk) * ss_tcp_state_change() dereferences sk->sk_user_data as SsProto, * so the caller must initialize it before setting callbacks. */ - BUG_ON(!sk->sk_user_data); + WARN_ON_ONCE(!sk->sk_user_data); sk->sk_data_ready = ss_tcp_data_ready; sk->sk_state_change = ss_tcp_state_change; @@ -1068,7 +1051,7 @@ ss_inet_create(struct net *net, int family, struct proto *answer_prot; /* TCP only is supported for now. */ - BUG_ON(type != SOCK_STREAM || protocol != IPPROTO_TCP); + WARN_ON_ONCE(type != SOCK_STREAM || protocol != IPPROTO_TCP); /* * Get socket properties. @@ -1173,12 +1156,12 @@ EXPORT_SYMBOL(ss_sock_create); /* * The original functions are inet_release() and inet6_release(). - * NOTE: Rework this function if/when Tempesta needs multicast support. + * Executes tcp_close(), so must be called from process context only. */ void ss_release(struct sock *sk) { - BUG_ON(sock_flag(sk, SOCK_LINGER)); + WARN_ON_ONCE(sock_flag(sk, SOCK_LINGER)); sk->sk_prot->close(sk, 0); } @@ -1193,8 +1176,9 @@ ss_connect(struct sock *sk, struct sockaddr *uaddr, int uaddr_len, int flags) { int r; - BUG_ON((sk->sk_family != AF_INET) && (sk->sk_family != AF_INET6)); - BUG_ON((uaddr->sa_family != AF_INET) && (uaddr->sa_family != AF_INET6)); + WARN_ON_ONCE((sk->sk_family != AF_INET) && (sk->sk_family != AF_INET6)); + WARN_ON_ONCE((uaddr->sa_family != AF_INET) + && (uaddr->sa_family != AF_INET6)); if (uaddr_len < sizeof(uaddr->sa_family)) return -EINVAL; @@ -1204,7 +1188,9 @@ ss_connect(struct sock *sk, struct sockaddr *uaddr, int uaddr_len, int flags) if (ss_active_guard_enter(SS_V_ACT_LIVECONN)) return SS_SHUTDOWN; + bh_lock_sock(sk); r = sk->sk_prot->connect(sk, uaddr, uaddr_len); + bh_unlock_sock(sk); /* * If connect() successfully returns, then the soket is living somewhere @@ -1229,8 +1215,10 @@ ss_bind(struct sock *sk, struct sockaddr *uaddr, int uaddr_len) .sk = sk, .type = sk->sk_type }; - BUG_ON((sk->sk_family != AF_INET) && (sk->sk_family != AF_INET6)); - BUG_ON(sk->sk_type != SOCK_STREAM); + + WARN_ON_ONCE((sk->sk_family != AF_INET) && (sk->sk_family != AF_INET6)); + WARN_ON_ONCE(sk->sk_type != SOCK_STREAM); + if (sk->sk_family == AF_INET) return inet_bind(&sock, uaddr, uaddr_len); else @@ -1250,7 +1238,9 @@ ss_listen(struct sock *sk, int backlog) .type = sk->sk_type, .state = SS_UNCONNECTED }; - BUG_ON(sk->sk_type != SOCK_STREAM); + + WARN_ON_ONCE(sk->sk_type != SOCK_STREAM); + return inet_listen(&sock, backlog); } EXPORT_SYMBOL(ss_listen); @@ -1347,7 +1337,6 @@ ss_tx_action(void) sock_put(sk); /* paired with push() calls */ while ((skb = ss_skb_dequeue(&sw.skb_list))) kfree_skb(skb); - } /* diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c index 7670332c37..f91e687f27 100644 --- a/tempesta_fw/ss_skb.c +++ b/tempesta_fw/ss_skb.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "addr.h" #include "procfs.h" @@ -66,6 +67,9 @@ ss_skb_fmt_src_addr(const struct sk_buff *skb, char *out_buf) * * Similar to alloc_skb_with_frags() except it doesn't allocate multi-page * fragments, and it sets up fragments with zero size. + * + * TODO #391: use less pages by allocating the skb from ss_skb_alloc() + * with maximum page header to fully utilize the page. */ struct sk_buff * ss_skb_alloc_pages(size_t len) @@ -200,10 +204,10 @@ __skb_data_address(struct sk_buff *skb) return NULL; if (skb_headlen(skb)) return skb->data; - BUG_ON(!skb_is_nonlinear(skb)); + WARN_ON_ONCE(!skb_is_nonlinear(skb)); if (skb_shinfo(skb)->nr_frags) return skb_frag_address(&skb_shinfo(skb)->frags[0]); - BUG_ON(skb_has_frag_list(skb)); + WARN_ON_ONCE(skb_has_frag_list(skb)); return NULL; } @@ -253,7 +257,7 @@ __skb_skblist_fixup(SsSkbList *skb_list) if (lscb->next) skb_list->last = lscb->next; - BUG_ON(TFW_SKB_CB(skb_list->last)->next); + WARN_ON_ONCE(TFW_SKB_CB(skb_list->last)->next); } /** @@ -929,14 +933,6 @@ ss_skb_process(struct sk_buff *skb, unsigned int *off, ss_skb_actor_t actor, } } - /* - * If paged fragments are full, in case of GRO skb_gro_receive() - * adds SKBs to frag_list from gro_list. However, SKBs that have - * frag_list are split into separate SKBs before they get to - * Tempesta for processing. - */ - BUG_ON(skb_has_frag_list(skb)); - return r; } EXPORT_SYMBOL(ss_skb_process); @@ -957,7 +953,7 @@ ss_skb_split(struct sk_buff *skb, int len) int nsize, asize, nlen; /* Assert that the SKB is orphaned. */ - BUG_ON(skb->destructor); + WARN_ON_ONCE(skb->destructor); nsize = skb_headlen(skb) - len; if (nsize < 0) @@ -1056,6 +1052,58 @@ __copy_ip_header(struct sk_buff *to, const struct sk_buff *from) memcpy(skb_network_header(to), ip4, sizeof(*ip4)); } +/** + * Tempesta FW forwards skbs with application and transport payload as is, + * so initialize such skbs such that TCP/IP stack won't stumble on dirty + * data. + */ +void +ss_skb_init_for_xmit(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + __u8 pfmemalloc = skb->pfmemalloc; + + WARN_ON_ONCE(skb->next || skb->prev); + WARN_ON_ONCE(skb->sk); + WARN_ON_ONCE(skb->destructor); + + if (!skb_transport_header_was_set(skb)) { + /* Quick path for new skbs. */ + skb->ip_summed = CHECKSUM_PARTIAL; + return; + } + + memset(&skb->skb_mstamp, 0, sizeof(skb->skb_mstamp)); + skb->dev = NULL; + memset(skb->cb, 0, sizeof(skb->cb)); + skb_dst_drop(skb); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + nf_reset(skb); + skb->mac_len = 0; + skb->queue_mapping = 0; + skb->peeked = 0; + skb->xmit_more = 0; + memset(&skb->headers_start, 0, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + skb->pfmemalloc = pfmemalloc; + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + + shinfo->tx_flags = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); + shinfo->tskey = 0; + shinfo->ip6_frag_id = 0; + shinfo->destructor_arg = NULL; + + skb->ip_summed = CHECKSUM_PARTIAL; +} + /* * When the original SKB is a clone then its shinfo and payload cannot be * modified as they are shared with other SKB users. As the SKB is unrolled, @@ -1127,14 +1175,14 @@ ss_skb_unroll(SsSkbList *skb_list, struct sk_buff *skb) if (unlikely(skb_cloned(skb))) return ss_skb_unroll_slow(skb_list, skb); - /* - * Note, that skb_gro_receive() drops reference to the - * SKB's header via the __skb_header_release(). So, to - * not break the things we must take reference back. - */ ss_skb_queue_tail(skb_list, skb); skb_walk_frags(skb, f_skb) { if (f_skb->nohdr) { + /* + * skb_gro_receive() drops reference to the SKB's header + * via the __skb_header_release(). So, to not break the + * things we must take reference back. + */ f_skb->nohdr = 0; atomic_sub(1 << SKB_DATAREF_SHIFT, &skb_shinfo(f_skb)->dataref); diff --git a/tempesta_fw/ss_skb.h b/tempesta_fw/ss_skb.h index e16f08fa2c..804d772034 100644 --- a/tempesta_fw/ss_skb.h +++ b/tempesta_fw/ss_skb.h @@ -73,7 +73,8 @@ ss_skb_queue_tail(SsSkbList *list, struct sk_buff *skb) { SsSkbCb *scb = TFW_SKB_CB(skb); - BUG_ON(scb->next || scb->prev); + /* The skb shouldn't be in any other queue. */ + WARN_ON_ONCE(scb->next || scb->prev); scb->prev = list->last; if (ss_skb_queue_empty(list)) @@ -81,8 +82,6 @@ ss_skb_queue_tail(SsSkbList *list, struct sk_buff *skb) else TFW_SKB_CB(list->last)->next = skb; list->last = skb; - - BUG_ON(scb->next); } static inline void @@ -183,10 +182,12 @@ static inline unsigned char * ss_skb_put(struct sk_buff *skb, const int len) { unsigned char *tmp = skb_tail_pointer(skb); + skb->tail += len; skb->len += len; - if (unlikely(skb->tail > skb->end)) - BUG(); + + WARN_ON_ONCE(skb->tail > skb->end); + return tmp; } @@ -217,6 +218,7 @@ int ss_skb_process(struct sk_buff *skb, unsigned int *off, ss_skb_actor_t actor, void *objdata); int ss_skb_unroll(SsSkbList *skb_list, struct sk_buff *skb); +void ss_skb_init_for_xmit(struct sk_buff *skb); void ss_skb_dump(struct sk_buff *skb); #endif /* __TFW_SS_SKB_H__ */