Skip to content

Commit

Permalink
Collect MTU from skb->_skb_refdst
Browse files Browse the repository at this point in the history
According to the kernel source, the kernel takes the MTU from
skb->_skb_refdst. Make pwru collect the MTU from there using the same logic.

This requires casting skb->_skb_refdst to a dst_entry*, then reading
dst_metric_raw(dst, RTAX_MTU) and, if that is zero, falling back to dst->dev->mtu.

```
// https://elixir.bootlin.com/linux/v6.5/source/net/ipv4/ip_forward.c#L86
// net/ipv4/ip_forward.c
int ip_forward(struct sk_buff *skb)
{
[...]
	rt = skb_rtable(skb);
[...]
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
	if (ip_exceeds_mtu(skb, mtu)) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		SKB_DR_SET(reason, PKT_TOO_BIG);
		goto drop;
	}
[...]
}

// include/linux/skbuff.h
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
	return (struct rtable *)skb_dst(skb);
}

// include/linux/skbuff.h
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
[...]
	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}

// include/net/ip.h
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
						    bool forwarding)
{
[...]
	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = READ_ONCE(dst->dev->mtu);
[...]
}

// include/net/dst.h
#define __DST_METRICS_PTR(Y)	\
	((u32 *)((Y) & ~DST_METRICS_FLAGS))
```

With this patch, pwru outputs the MTU that the kernel actually uses.

Case 1: Cilium can reduce the route MTU to 1423 inside a pod. Previously
pwru could not detect this because it only checked the link MTU.

Case 2: Xfrm can reduce the route MTU to 1446 in ip_forward(). Pwru
must inspect the route MTU via skb->_skb_refdst to see that.

Signed-off-by: gray <[email protected]>
  • Loading branch information
jschwinger233 authored and brb committed Jul 7, 2024
1 parent 438ea99 commit b33c3bb
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions bpf/kprobe_pwru.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@
#define ETH_P_IPV6 0x86dd
#define ETH_P_8021Q 0x8100

/* Metric id for the route MTU; the metrics array is indexed by (id - 1). */
#define RTAX_MTU 2

/*
 * skb->_skb_refdst holds a dst_entry pointer with a flag in bit 0.
 * Clearing that bit yields the pointer itself.
 */
#define SKB_DST_NOREF 1UL
#define SKB_DST_PTRMASK (~(SKB_DST_NOREF))
#define __SKB_DST_PTR(refdst) ((struct dst_entry *)((refdst) & SKB_DST_PTRMASK))

/*
 * dst->_metrics holds a u32 array pointer with flags in its two low bits.
 * Clearing those bits yields the array pointer.
 */
#define DST_METRICS_FLAGS 0x3UL
#define __DST_METRICS_PTR(raw) ((u32 *)((raw) & ~DST_METRICS_FLAGS))


const static bool TRUE = true;

volatile const static __u64 BPF_PROG_ADDR = 0;
Expand Down Expand Up @@ -238,6 +249,13 @@ set_meta(struct sk_buff *skb, struct skb_meta *meta) {
meta->protocol = BPF_CORE_READ(skb, protocol);
meta->ifindex = BPF_CORE_READ(skb, dev, ifindex);
meta->mtu = BPF_CORE_READ(skb, dev, mtu);
struct dst_entry *dst = __SKB_DST_PTR(BPF_CORE_READ(skb, _skb_refdst));
if (dst) {
u32 *metrics = __DST_METRICS_PTR(BPF_CORE_READ(dst, _metrics));
bpf_probe_read_kernel(&meta->mtu, sizeof(meta->mtu), metrics + RTAX_MTU - 1);
if (!meta->mtu)
meta->mtu = BPF_CORE_READ(dst, dev, mtu);
}
}

static __always_inline void
Expand Down

0 comments on commit b33c3bb

Please sign in to comment.