bpf: Alloc bpf_async_cb by using bpf_global_ma under PREEMPT_RT
Under PREEMPT_RT, it is not safe to use GFP_ATOMIC kmalloc while
preemption or irqs are disabled. The following warning is reported when
running test_progs under PREEMPT_RT:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 675, name: test_progs
  preempt_count: 1, expected: 0
  RCU nest depth: 0, expected: 0
  2 locks held by test_progs/675:
   #0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
   #1: ffff8881f4ec40c8 ((&c->lock)){....}-{2:2}, at: ___slab_alloc+0xbc/0x1280
  Preemption disabled at:
  [<ffffffff8175ae2b>] __bpf_async_init+0xbb/0xb10
  CPU: 1 UID: 0 PID: 675 Comm: test_progs Tainted: G           O       6.12.0+ #11
  Tainted: [O]=OOT_MODULE
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x70
   dump_stack+0x10/0x20
   __might_resched+0x337/0x4d0
   rt_spin_lock+0xd4/0x230
   ___slab_alloc+0xbc/0x1280
   __slab_alloc.isra.0+0x5d/0xa0
   __kmalloc_node_noprof+0xf7/0x4f0
   bpf_map_kmalloc_node+0xf5/0x6b0
   __bpf_async_init+0x20e/0xb10
   bpf_timer_init+0x30/0x40
   bpf_prog_c7e2dc9ff3d5ba62_start_cb+0x55/0x85
   bpf_prog_4eb421be69ae82fa_start_timer+0x5d/0x7e
   bpf_prog_test_run_syscall+0x322/0x830
   __sys_bpf+0x135d/0x3ca0
   __x64_sys_bpf+0x75/0xb0
   x64_sys_call+0x1b5/0xa10
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53
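
As an illustration, the splat boils down to the following pattern (a
minimal sketch, not the actual kernel code; lock, obj and
broken_pattern are made-up names). Under PREEMPT_RT the slab
allocator's internal locks become sleeping rt_spin_locks, so even a
GFP_ATOMIC allocation may sleep and must not happen in a section that
runs with preemption or irqs disabled:

  static DEFINE_RAW_SPINLOCK(lock);

  static void broken_pattern(void)
  {
          unsigned long flags;
          void *obj;

          /* disables preemption and irqs, even on PREEMPT_RT */
          raw_spin_lock_irqsave(&lock, flags);
          /* under PREEMPT_RT, ___slab_alloc() may take a sleeping
           * rt_spin_lock here, producing the warning above
           */
          obj = kmalloc(64, GFP_ATOMIC);
          raw_spin_unlock_irqrestore(&lock, flags);
          kfree(obj);
  }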

Fix the problem by using bpf_global_ma to allocate bpf_async_cb when
PREEMPT_RT is enabled. The reason for still using kmalloc in the
non-PREEMPT_RT case is that bpf_global_ma doesn't support accounting
the allocated memory to a specific memcg. Also do the memory allocation
before invoking __bpf_spin_lock_irqsave() to reduce the possibility of
-ENOMEM from bpf_global_ma.
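
For reference, a BPF program along the following lines reaches
__bpf_async_init() via bpf_timer_init() as in the trace above. This is
a hypothetical reconstruction of the trigger, not the actual
test_progs source; timer_map, elem, timer_cb and start_timer are
made-up names:

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  #define CLOCK_MONOTONIC 1	/* not provided by vmlinux.h */

  struct elem {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } timer_map SEC(".maps");

  static int timer_cb(void *map, int *key, struct bpf_timer *timer)
  {
          return 0;
  }

  SEC("syscall")
  int start_timer(void *ctx)
  {
          int key = 0;
          struct elem *e = bpf_map_lookup_elem(&timer_map, &key);

          if (!e)
                  return 1;
          /* lands in __bpf_async_init(), which did the GFP_ATOMIC
           * allocation under async->lock before this fix
           */
          bpf_timer_init(&e->t, &timer_map, CLOCK_MONOTONIC);
          bpf_timer_set_callback(&e->t, timer_cb);
          bpf_timer_start(&e->t, 0, 0);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";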

Signed-off-by: Hou Tao <[email protected]>
Hou Tao authored and intel-lab-lkp committed Jan 14, 2025
1 parent be339dd commit b9dea4b
---
 kernel/bpf/helpers.c | 48 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1109,12 +1109,14 @@ struct bpf_async_cb {
  * freeing the timers when inner map is replaced or deleted by user space.
  */
 struct bpf_hrtimer {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
 	atomic_t cancelling;
 };
 
 struct bpf_work {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct work_struct work;
 	struct work_struct delete_work;
@@ -1141,6 +1143,34 @@ enum bpf_async_type {
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
+static void bpf_async_free(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free(&bpf_global_ma, cb);
+	else
+		kfree(cb);
+}
+
+static void bpf_async_free_rcu(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free_rcu(&bpf_global_ma, cb);
+	else
+		kfree_rcu(cb, rcu);
+}
+
+static struct bpf_async_cb *bpf_async_alloc(struct bpf_map *map, size_t size)
+{
+	struct bpf_async_cb *cb;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		cb = bpf_mem_alloc(&bpf_global_ma, size);
+	else
+		/* allocate hrtimer via map_kmalloc to use memcg accounting */
+		cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	return cb;
+}
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1221,7 +1251,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_async_free_rcu(&w->cb);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1236,7 +1266,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_async_free_rcu(&t->cb);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1263,20 +1293,18 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
+	cb = bpf_async_alloc(map, size);
+	if (!cb)
+		return -ENOMEM;
+
 	__bpf_spin_lock_irqsave(&async->lock);
 	t = async->timer;
 	if (t) {
+		bpf_async_free(cb);
 		ret = -EBUSY;
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
 		clockid = flags & (MAX_CLOCKS - 1);
@@ -1313,7 +1341,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	 * or pinned in bpffs.
 	 */
 		WRITE_ONCE(async->cb, NULL);
-		kfree(cb);
+		bpf_async_free(cb);
 		ret = -EPERM;
 	}
 out: