diff --git a/linux-4.8.15.patch b/linux-4.8.15.patch
index d5581f22e..c21cf8888 100644
--- a/linux-4.8.15.patch
+++ b/linux-4.8.15.patch
@@ -318,7 +318,7 @@ index 0000000..55049bd
 +#endif /* __TEMPESTA_H__ */
 +
 diff --git a/include/net/sock.h b/include/net/sock.h
-index c26eab9..196f9f9 100644
+index c26eab9..84bb38b 100644
 --- a/include/net/sock.h
 +++ b/include/net/sock.h
 @@ -741,6 +741,9 @@ enum sock_flags {
@@ -331,23 +331,7 @@ index c26eab9..196f9f9 100644
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-@@ -872,9 +875,14 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
- 	return sk->sk_backlog_rcv(sk, skb);
- }
- 
-+#define TFW_SK_CPU_INIT	USHRT_MAX
-+
- static inline void sk_incoming_cpu_update(struct sock *sk)
- {
--	sk->sk_incoming_cpu = raw_smp_processor_id();
-+#ifdef CONFIG_SECURITY_TEMPESTA
-+	if (sk->sk_incoming_cpu == TFW_SK_CPU_INIT)
-+#endif
-+		sk->sk_incoming_cpu = raw_smp_processor_id();
- }
- 
- static inline void sock_rps_record_flow_hash(__u32 hash)
-@@ -1670,8 +1678,7 @@ static inline void sk_rethink_txhash(struct sock *sk)
+@@ -1670,8 +1673,7 @@ static inline void sk_rethink_txhash(struct sock *sk)
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
@@ -848,7 +832,7 @@ index 5d26056..ee02253 100644
 }
 +EXPORT_SYMBOL(reqsk_fastopen_remove);
 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
-index 3864b4b..90bf764 100644
+index 3864b4b..d2bd601 100644
 --- a/net/core/skbuff.c
 +++ b/net/core/skbuff.c
 @@ -79,7 +79,9 @@
@@ -869,7 +853,7 @@ index 3864b4b..90bf764 100644
 /*
  * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
  * the caller if emergency pfmemalloc reserves are being used. If it is and
-@@ -151,6 +154,161 @@ out:
+@@ -151,6 +154,166 @@ out:
 	return obj;
 }
 
@@ -999,8 +983,13 @@ index 3864b4b..90bf764 100644
 +	if (!pg)
 +		return NULL;
 +	ptr = (char *)page_address(pg);
++	/*
++	 * Don't try to split compound page.
++	 * TODO compound pages can be split as __alloc_page_frag() does it
++	 * using fragment size in page reference counter.
++	 */
 +	if (po)
-+		return ptr; /* don't try to split compound page */
++		return ptr;
 +	o = PAGE_SHIFT - PG_CHUNK_BITS;
 +
 +	PREEMPT_CTX_DISABLE();
@@ -1031,7 +1020,7 @@
 
 /* Allocate a new skbuff. We do this ourselves so we can fill in a few
  * 'private' fields and also do memory statistics to find all the
-@@ -183,6 +341,54 @@ out:
+@@ -183,6 +346,54 @@ out:
 	return skb;
 }
 
@@ -1086,7 +1075,7 @@
 /**
  * __alloc_skb - allocate a network buffer
  * @size: size to allocate
-@@ -200,11 +406,11 @@ out:
+@@ -200,11 +411,11 @@ out:
  * Buffers may only be allocated from interrupts using a @gfp_mask of
  * %GFP_ATOMIC.
*/ @@ -1099,7 +1088,7 @@ index 3864b4b..90bf764 100644 struct sk_buff *skb; u8 *data; bool pfmemalloc; -@@ -238,41 +444,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, +@@ -238,41 +449,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, size = SKB_WITH_OVERHEAD(ksize(data)); prefetchw(data + size); @@ -1142,7 +1131,7 @@ index 3864b4b..90bf764 100644 out: return skb; nodata: -@@ -280,6 +452,42 @@ nodata: +@@ -280,6 +457,42 @@ nodata: skb = NULL; goto out; } @@ -1185,7 +1174,7 @@ index 3864b4b..90bf764 100644 EXPORT_SYMBOL(__alloc_skb); /** -@@ -620,7 +828,12 @@ static void kfree_skbmem(struct sk_buff *skb) +@@ -620,7 +833,12 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -1199,7 +1188,7 @@ index 3864b4b..90bf764 100644 return; case SKB_FCLONE_ORIG: -@@ -641,7 +854,12 @@ static void kfree_skbmem(struct sk_buff *skb) +@@ -641,7 +859,12 @@ static void kfree_skbmem(struct sk_buff *skb) if (!atomic_dec_and_test(&fclones->fclone_ref)) return; fastpath: @@ -1212,7 +1201,7 @@ index 3864b4b..90bf764 100644 } static void skb_release_head_state(struct sk_buff *skb) -@@ -777,6 +995,17 @@ static inline void _kfree_skb_defer(struct sk_buff *skb) +@@ -777,6 +1000,17 @@ static inline void _kfree_skb_defer(struct sk_buff *skb) /* drop skb->head and call any destructors for packet */ skb_release_all(skb); @@ -1230,7 +1219,7 @@ index 3864b4b..90bf764 100644 /* record skb to CPU local list */ nc->skb_cache[nc->skb_count++] = skb; -@@ -837,7 +1066,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +@@ -837,7 +1071,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; @@ -1243,7 +1232,7 @@ index 3864b4b..90bf764 100644 skb_dst_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); -@@ -932,6 +1166,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) +@@ -932,6 +1171,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { skb_release_all(dst); @@ -1253,7 +1242,7 @@ index 3864b4b..90bf764 100644 return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); -@@ -1025,6 +1262,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +@@ -1025,6 +1267,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) atomic_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; atomic_set(&fclones->fclone_ref, 2); @@ -1264,7 +1253,7 @@ index 3864b4b..90bf764 100644 } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -@@ -1035,6 +1276,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +@@ -1035,6 +1281,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; @@ -1274,7 +1263,7 @@ index 3864b4b..90bf764 100644 } return __skb_clone(n, skb); -@@ -1205,15 +1449,22 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1205,15 +1454,22 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_shared(skb)) BUG(); @@ -1300,7 +1289,7 @@ index 3864b4b..90bf764 100644 /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. 
-@@ -1246,7 +1497,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1246,7 +1502,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, off = (data + nhead) - skb->head; skb->head = data; @@ -1313,7 +1302,7 @@ index 3864b4b..90bf764 100644 skb->data += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->end = size; -@@ -1263,7 +1519,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, +@@ -1263,7 +1524,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, return 0; nofrags: @@ -1325,7 +1314,7 @@ index 3864b4b..90bf764 100644 nodata: return -ENOMEM; } -@@ -1372,7 +1632,11 @@ int skb_pad(struct sk_buff *skb, int pad) +@@ -1372,7 +1637,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; } @@ -1337,7 +1326,7 @@ index 3864b4b..90bf764 100644 if (likely(skb_cloned(skb) || ntail > 0)) { err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); if (unlikely(err)) -@@ -1607,7 +1871,13 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +@@ -1607,7 +1876,13 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) * plus 128 bytes for future expansions. If we have enough * room at tail, reallocate without expansion only if skb is cloned. */ @@ -1352,7 +1341,7 @@ index 3864b4b..90bf764 100644 if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, -@@ -3429,16 +3699,31 @@ EXPORT_SYMBOL_GPL(skb_gro_receive); +@@ -3429,16 +3704,31 @@ EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -1389,7 +1378,7 @@ index 3864b4b..90bf764 100644 } /** -@@ -4225,7 +4510,12 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) +@@ -4225,7 +4515,12 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); @@ -1403,7 +1392,7 @@ index 3864b4b..90bf764 100644 } else { __kfree_skb(skb); } -@@ -4661,13 +4951,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, +@@ -4661,13 +4956,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1425,7 +1414,7 @@ index 3864b4b..90bf764 100644 /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); -@@ -4785,13 +5082,20 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, +@@ -4785,13 +5087,20 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1590,7 +1579,7 @@ index a756b87..1eade37 100644 /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index 7b235fa..b2169a4 100644 +index 7b235fa..9d4101d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -62,6 +62,7 @@ @@ -1636,6 +1625,23 @@ index 7b235fa..b2169a4 100644 if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); +@@ -1677,7 +1684,16 @@ process: + + sk_incoming_cpu_update(sk); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * The socket is just retrieved by __inet_lookup_skb(), so there is ++ * no real nested locking yet. Leave the nested locking possiblity to ++ * Tempesta. 
++	 */
++	bh_lock_sock(sk);
++#else
+ 	bh_lock_sock_nested(sk);
++#endif
+ 	tcp_segs_in(tcp_sk(sk), skb);
+ 	ret = 0;
+ 	if (!sock_owned_by_user(sk)) {
 diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
 index 4b95ec4..8f5407d 100644
 --- a/net/ipv4/tcp_minisocks.c
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 32af75456..582743251 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -82,18 +82,27 @@ static DEFINE_PER_CPU(struct irq_work, ipi_work);
 	? ((SsProto *)(sk)->sk_user_data)->hooks->f(__VA_ARGS__)	\
 	: 0)
 
+/* TODO #668 remove the function after the tests. */
 static void
 ss_sock_cpu_check(struct sock *sk, const char *op)
 {
-	if (unlikely(sk->sk_incoming_cpu != TFW_SK_CPU_INIT
+	if (unlikely(sk->sk_incoming_cpu != -1
 		     && sk->sk_incoming_cpu != smp_processor_id()))
 	{
-		SS_DBG("Bad socket cpu locality on <%s>:"
-		       " sk=%p old_cpu=%d curr_cpu=%d\n",
-		       op, sk, sk->sk_incoming_cpu, smp_processor_id());
+		SS_WARN("Bad socket cpu locality on <%s>:"
+			" sk=%p (peer=%x:%x) old_cpu=%d curr_cpu=%d\n",
+			op, sk, sk->sk_daddr, sk->sk_dport,
+			sk->sk_incoming_cpu, smp_processor_id());
 	}
 }
 
+static void
+ss_sk_incoming_cpu_update(struct sock *sk)
+{
+	if (sk->sk_incoming_cpu == -1)
+		sk->sk_incoming_cpu = raw_smp_processor_id();
+}
+
 static inline void
 skb_sender_cpu_clear(struct sk_buff *skb)
 {
@@ -526,7 +535,7 @@ __ss_close(struct sock *sk, int flags)
 {
 	if (unlikely(!sk))
 		return SS_OK;
-	sk_incoming_cpu_update(sk);
+	ss_sk_incoming_cpu_update(sk);
 
 	if (!(flags & SS_F_SYNC) || !in_serving_softirq()
 	    || smp_processor_id() != sk->sk_incoming_cpu
@@ -541,10 +550,10 @@ __ss_close(struct sock *sk, int flags)
 	}
 
 	/*
-	 * Don't put the work to work queue if we should execute it on current
-	 * CPU and we're in softirq now. We avoid overhead on work queue
-	 * operations and prevent infinite loop on synchronous push() if a
-	 * consumer is actually the same softirq context.
+	 * Don't put the work to work queue if we should execute it
+	 * synchronously on current CPU and we're in softirq now.
+	 * We avoid overhead on work queue operations and prevent infinite loop
+	 * on synchronous push() if a consumer is actually the same softirq.
 	 *
 	 * Keep in mind possible ordering problem: the socket can already have
 	 * a queued work when we close it synchronously, so the socket can be
@@ -556,23 +565,15 @@ __ss_close(struct sock *sk, int flags)
 	 * if it's live. However, in some cases this may be called multiple
 	 * times on the same socket. Do it only once for the socket.
 	 *
-	 * We can be called from tcp_v4_rcv() under the socket lock, so lock
-	 * the socket only if it isn't locked. It safe because we just checked
-	 * the socket's CPU.
+	 * We can be called from tcp_v4_rcv() under the socket lock.
 	 */
-	if (raw_spin_is_locked(&sk->sk_lock.slock.rlock)) {
-		if (unlikely(!ss_sock_live(sk)))
-			return SS_OK;
-		ss_do_close(sk);
-	} else {
-		bh_lock_sock(sk);
-		if (unlikely(!ss_sock_live(sk))) {
-			bh_unlock_sock(sk);
-			return SS_OK;
-		}
-		ss_do_close(sk);
+	bh_lock_sock_nested(sk);
+	if (unlikely(!ss_sock_live(sk))) {
 		bh_unlock_sock(sk);
+		return SS_OK;
 	}
+	ss_do_close(sk);
+	bh_unlock_sock(sk);
 	if (flags & SS_F_CONN_CLOSE)
 		SS_CALL_GUARD_EXIT(connection_drop, sk);
 	sock_put(sk);	/* paired with ss_do_close() */
@@ -725,7 +726,7 @@ ss_tcp_process_data(struct sock *sk)
  */
 /*
  * Called when a new data received on the socket.
- * Called under bh_lock_sock_nested(sk) (see tcp_v4_rcv()).
+ * Called under bh_lock_sock(sk) (see tcp_v4_rcv()).
  */
 static void
 ss_tcp_data_ready(struct sock *sk)
@@ -778,7 +779,7 @@ ss_tcp_state_change(struct sock *sk)
 {
 	SS_DBG("[%d]: %s: sk=%p state=%s\n",
 	       smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]);
-	sk_incoming_cpu_update(sk);
+	ss_sk_incoming_cpu_update(sk);
 	assert_spin_locked(&sk->sk_lock.slock);
 
 	if (sk->sk_state == TCP_ESTABLISHED) {
@@ -994,7 +995,7 @@ ss_inet_create(struct net *net, int family,
 	sock_init_data(NULL, sk);
 	sk->sk_type = type;
 	sk->sk_allocation = GFP_ATOMIC;
-	sk->sk_incoming_cpu = TFW_SK_CPU_INIT;
+	sk->sk_incoming_cpu = -1; /* same as in sock_init_data() */
 	sk->sk_destruct = inet_sock_destruct;
 	sk->sk_protocol = protocol;
 	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
diff --git a/tempesta_fw/work_queue.c b/tempesta_fw/work_queue.c
index 970397f82..1635c0e44 100644
--- a/tempesta_fw/work_queue.c
+++ b/tempesta_fw/work_queue.c
@@ -118,6 +118,11 @@ __update_guards(TfwRBQueue *q)
 	atomic64_set(&q->last_tail, last_tail);
 }
 
+/**
+ * FIXME A caller must be very careful with @sync: if two softirqs are running
+ * the operation to add an item to queues of each other, then they can spin
+ * forever (i.e. deadlock is possible).
+ */
 int
 __tfw_wq_push(TfwRBQueue *q, void *ptr, bool sync)
 {