diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 80900d9109920f..ce660d51549b46 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -474,6 +474,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); + int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ed17208907578a..35e10c5a766d55 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -43,6 +43,10 @@ #include #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + static unsigned int brnf_net_id __read_mostly; struct brnf_net { @@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv, return NF_STOLEN; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +/* conntracks' nf_confirm logic cannot handle cloned skbs referencing + * the same nf_conn entry, which will happen for multicast (broadcast) + * Frames on bridges. + * + * Example: + * macvlan0 + * br0 + * ethX ethY + * + * ethX (or Y) receives multicast or broadcast packet containing + * an IP packet, not yet in conntrack table. + * + * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. + * -> skb->_nfct now references a unconfirmed entry + * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge + * interface. + * 3. skb gets passed up the stack. + * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb + * and schedules a work queue to send them out on the lower devices. + * + * The clone skb->_nfct is not a copy, it is the same entry as the + * original skb. The macvlan rx handler then returns RX_HANDLER_PASS. + * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. + * + * The Macvlan broadcast worker and normal confirm path will race. + * + * This race will not happen if step 2 already confirmed a clone. In that + * case later steps perform skb_clone() with skb->_nfct already confirmed (in + * hash table). This works fine. + * + * But such confirmation won't happen when eb/ip/nftables rules dropped the + * packets before they reached the nf_confirm step in postrouting. + * + * Work around this problem by explicit confirmation of the entry at + * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed + * entry. + * + */ +static unsigned int br_nf_local_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conntrack *nfct = skb_nfct(skb); + const struct nf_ct_hook *ct_hook; + struct nf_conn *ct; + int ret; + + if (!nfct || skb->pkt_type == PACKET_HOST) + return NF_ACCEPT; + + ct = container_of(nfct, struct nf_conn, ct_general); + if (likely(nf_ct_is_confirmed(ct))) + return NF_ACCEPT; + + WARN_ON_ONCE(skb_shared(skb)); + WARN_ON_ONCE(refcount_read(&nfct->use) != 1); + + /* We can't call nf_confirm here, it would create a dependency + * on nf_conntrack module. + */ + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) { + skb->_nfct = 0ul; + nf_conntrack_put(nfct); + return NF_ACCEPT; + } + + nf_bridge_pull_encap_header(skb); + ret = ct_hook->confirm(skb); + switch (ret & NF_VERDICT_MASK) { + case NF_STOLEN: + return NF_STOLEN; + default: + nf_bridge_push_encap_header(skb); + break; + } + + ct = container_of(nfct, struct nf_conn, ct_general); + WARN_ON_ONCE(!nf_ct_is_confirmed(ct)); + + return ret; +} +#endif /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -964,6 +1052,14 @@ static const struct nf_hook_ops br_nf_ops[] = { .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + { + .hook = br_nf_local_in, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_LAST, + }, +#endif { .hook = br_nf_forward, .pf = NFPROTO_BRIDGE, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index abb090f94ed260..6f877e31709bad 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, return nf_conntrack_in(skb, &bridge_state); } +static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (skb->pkt_type == PACKET_HOST) + return NF_ACCEPT; + + /* nf_conntrack_confirm() cannot handle concurrent clones, + * this happens for broad/multicast frames with e.g. macvlan on top + * of the bridge device. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + return NF_ACCEPT; + + /* let inet prerouting call conntrack again */ + skb->_nfct = 0; + nf_ct_put(ct); + + return NF_ACCEPT; +} + static void nf_ct_bridge_frag_save(struct sk_buff *skb, struct nf_bridge_frag_data *data) { @@ -385,6 +409,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = { .hooknum = NF_BR_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, + { + .hook = nf_ct_bridge_in, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, { .hook = nf_ct_bridge_post, .pf = NFPROTO_BRIDGE, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2e5f3864d353a3..5b876fa7f9af9e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2756,6 +2756,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, + .confirm = __nf_conntrack_confirm, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1f9474fefe8492..d3d11dede54507 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -359,10 +359,20 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -610,10 +620,20 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index db27153eb4a02c..936c3085bb8373 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -7,7 +7,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \ - conntrack_sctp_collision.sh xt_string.sh + conntrack_sctp_collision.sh xt_string.sh \ + bridge_netfilter.sh HOSTPKG_CONFIG := pkg-config diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh new file mode 100644 index 00000000000000..659b3ab02c8beb --- /dev/null +++ b/tools/testing/selftests/netfilter/bridge_netfilter.sh @@ -0,0 +1,188 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bridge netfilter + conntrack, a combination that doesn't really work, +# with multicast/broadcast packets racing for hash table insertion. + +# eth0 br0 eth0 +# setup is: ns1 <->,ns0 <-> ns3 +# ns2 <-' `'-> ns4 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" +ns1="ns1-$sfx" +ns2="ns2-$sfx" +ns3="ns3-$sfx" +ns4="ns4-$sfx" + +ebtables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +for i in $(seq 0 4); do + eval ip netns add \$ns$i +done + +cleanup() { + for i in $(seq 0 4); do eval ip netns del \$ns$i;done +} + +trap cleanup EXIT + +do_ping() +{ + fromns="$1" + dstip="$2" + + ip netns exec $fromns ping -c 1 -q $dstip > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: ping from $fromns to $dstip" + ip netns exec ${ns0} nft list ruleset + ret=1 + fi +} + +bcast_ping() +{ + fromns="$1" + dstip="$2" + + for i in $(seq 1 1000); do + ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "ERROR: ping -b from $fromns to $dstip" + ip netns exec ${ns0} nft list ruleset + fi + done +} + +ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} +if [ $? -ne 0 ]; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi + +ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 +ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 +ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 + +ip -net ${ns0} link set lo up + +for i in $(seq 1 4); do + ip -net ${ns0} link set veth$i up +done + +ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 +if [ $? -ne 0 ]; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +# make veth0,1,2 part of bridge. +for i in $(seq 1 3); do + ip -net ${ns0} link set veth$i master br0 +done + +# add a macvlan on top of the bridge. +MACVLAN_ADDR=ba:f3:13:37:42:23 +ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private +ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} +ip -net ${ns0} link set macvlan0 up +ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 + +# add a macvlan on top of veth4. +MACVLAN_ADDR=ba:f3:13:37:42:24 +ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa +ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} +ip -net ${ns0} link set macvlan4 up + +# make the macvlan part of the bridge. +# veth4 is not a bridge port, only the macvlan on top of it. +ip -net ${ns0} link set macvlan4 master br0 + +ip -net ${ns0} link set br0 up +ip -net ${ns0} addr add 10.0.0.1/24 dev br0 +ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 +ret=$? +if [ $ret -ne 0 ] ; then + echo "SKIP: bridge netfilter not available" + ret=$ksft_skip +fi + +# for testing, so namespaces will reply to ping -b probes. +ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 + +# enable conntrack in ns0 and drop broadcast packets in forward to +# avoid them from getting confirmed in the postrouting hook before +# the cloned skb is passed up the stack. +ip netns exec ${ns0} nft -f - <