Commit c3c7041
scx: Avoid possible deadlock with cpus_read_lock()
Han Xing Yi reported a syzbot lockdep error over the weekend:

======================================================
WARNING: possible circular locking dependency detected
6.6.0-g2f6ba98e2d3d #4 Not tainted
------------------------------------------------------
syz-executor.0/2181 is trying to acquire lock:
ffffffff84772410 (pernet_ops_rwsem){++++}-{3:3}, at: copy_net_ns+0x216/0x590 net/core/net_namespace.c:487

but task is already holding lock:
ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (scx_fork_rwsem){++++}-{0:0}:
       percpu_down_write+0x51/0x210 kernel/locking/percpu-rwsem.c:227
       scx_ops_enable+0x230/0xf90 kernel/sched/ext.c:3271
       bpf_struct_ops_link_create+0x1b9/0x220 kernel/bpf/bpf_struct_ops.c:914
       link_create kernel/bpf/syscall.c:4938 [inline]
       __sys_bpf+0x35af/0x4ac0 kernel/bpf/syscall.c:5453
       __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
       __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
       __x64_sys_bpf+0x48/0x60 kernel/bpf/syscall.c:5485
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76

-> #2 (cpu_hotplug_lock){++++}-{0:0}:
       percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
       cpus_read_lock+0x42/0x1b0 kernel/cpu.c:489
       flush_all_backlogs net/core/dev.c:5885 [inline]
       unregister_netdevice_many_notify+0x30a/0x1070 net/core/dev.c:10965
       unregister_netdevice_many+0x19/0x20 net/core/dev.c:11039
       sit_exit_batch_net+0x433/0x460 net/ipv6/sit.c:1887
       ops_exit_list+0xc5/0xe0 net/core/net_namespace.c:175
       cleanup_net+0x3e2/0x750 net/core/net_namespace.c:614
       process_one_work+0x50d/0xc20 kernel/workqueue.c:2630
       process_scheduled_works kernel/workqueue.c:2703 [inline]
       worker_thread+0x50b/0x950 kernel/workqueue.c:2784
       kthread+0x1fa/0x250 kernel/kthread.c:388
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242

-> #1 (rtnl_mutex){+.+.}-{3:3}:
       __mutex_lock_common kernel/locking/mutex.c:603 [inline]
       __mutex_lock+0xc1/0xea0 kernel/locking/mutex.c:747
       mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:799
       rtnl_lock+0x17/0x20 net/core/rtnetlink.c:79
       register_netdevice_notifier+0x25/0x1c0 net/core/dev.c:1741
       rtnetlink_init+0x3a/0x6e0 net/core/rtnetlink.c:6657
       netlink_proto_init+0x23d/0x2f0 net/netlink/af_netlink.c:2946
       do_one_initcall+0xb3/0x5f0 init/main.c:1232
       do_initcall_level init/main.c:1294 [inline]
       do_initcalls init/main.c:1310 [inline]
       do_basic_setup init/main.c:1329 [inline]
       kernel_init_freeable+0x40c/0x5d0 init/main.c:1547
       kernel_init+0x1d/0x350 init/main.c:1437
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242

-> #0 (pernet_ops_rwsem){++++}-{3:3}:
       check_prev_add kernel/locking/lockdep.c:3134 [inline]
       check_prevs_add kernel/locking/lockdep.c:3253 [inline]
       validate_chain kernel/locking/lockdep.c:3868 [inline]
       __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
       lock_acquire kernel/locking/lockdep.c:5753 [inline]
       lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
       down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
       copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
       create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
       copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
       copy_process+0x1b52/0x4680 kernel/fork.c:2504
       kernel_clone+0x116/0x660 kernel/fork.c:2914
       __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
       __se_sys_clone3 kernel/fork.c:3199 [inline]
       __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76

other info that might help us debug this:

Chain exists of:
  pernet_ops_rwsem --> cpu_hotplug_lock --> scx_fork_rwsem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  rlock(scx_fork_rwsem);
                               lock(cpu_hotplug_lock);
                               lock(scx_fork_rwsem);
  rlock(pernet_ops_rwsem);

 *** DEADLOCK ***

1 lock held by syz-executor.0/2181:
 #0: ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810

stack backtrace:
CPU: 0 PID: 2181 Comm: syz-executor.0 Not tainted 6.6.0-g2f6ba98e2d3d #4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
Sched_ext: serialise (enabled), task: runnable_at=-6ms
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:89 [inline]
 dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:107
 dump_stack+0x15/0x20 lib/dump_stack.c:114
 check_noncircular+0x134/0x150 kernel/locking/lockdep.c:2187
 check_prev_add kernel/locking/lockdep.c:3134 [inline]
 check_prevs_add kernel/locking/lockdep.c:3253 [inline]
 validate_chain kernel/locking/lockdep.c:3868 [inline]
 __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
 lock_acquire kernel/locking/lockdep.c:5753 [inline]
 lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
 down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
 copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
 create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
 copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
 copy_process+0x1b52/0x4680 kernel/fork.c:2504
 kernel_clone+0x116/0x660 kernel/fork.c:2914
 __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
 __se_sys_clone3 kernel/fork.c:3199 [inline]
 __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x7f9f764e240d
Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f9f75851ee8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b3
RAX: ffffffffffffffda RBX: 00007f9f7661ef80 RCX: 00007f9f764e240d
RDX: 0000000000000100 RSI: 0000000000000058 RDI: 00007f9f75851f00
RBP: 00007f9f765434a6 R08: 0000000000000000 R09: 0000000000000058
R10: 00007f9f75851f00 R11: 0000000000000246 R12: 0000000000000058
R13: 0000000000000006 R14: 00007f9f7661ef80 R15: 00007f9f75832000
 </TASK>

The issue is that we're acquiring the cpus_read_lock() _before_ we
acquire scx_fork_rwsem in scx_ops_enable() and scx_ops_disable(), but
we acquire and hold scx_fork_rwsem around basically the whole fork()
path. I don't see how a deadlock could actually occur in practice, but
it should be safe to acquire the scx_fork_rwsem and scx_cgroup_rwsem
semaphores before the hotplug lock, so let's do that.

Reported-by: Han Xing Yi <hxingyi104@gmail.com>
Signed-off-by: David Vernet <void@manifault.com>
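To make the report easier to follow, the sketch below condenses the four
lockdep frames into the before/after acquisition orders. It reuses the real
lock and function names from the diff below, but it is abbreviated pseudocode
for illustration, not the actual kernel source:

/* Before this patch -- scx_ops_enable() recorded
 * cpu_hotplug_lock -> scx_fork_rwsem (frame #3): */
cpus_read_lock();                      /* read side of cpu_hotplug_lock */
percpu_down_write(&scx_fork_rwsem);

/* fork(CLONE_NEWNET) -- sched_fork() holds scx_fork_rwsem while
 * copy_net_ns() takes pernet_ops_rwsem (frame #0):
 *     scx_fork_rwsem -> pernet_ops_rwsem
 * and frames #1/#2 had already recorded
 *     pernet_ops_rwsem -> rtnl_mutex -> cpu_hotplug_lock,
 * closing the cycle. */

/* After this patch -- scx_ops_enable() and scx_ops_disable_workfn()
 * take the rwsems first, so the cpu_hotplug_lock -> scx_fork_rwsem
 * edge is never created: */
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
cpus_read_lock();
	/* ... enable/disable work ... */
cpus_read_unlock();
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);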
1 parent 8ccf9d7 commit c3c7041

File tree: 1 file changed (+20 −19)

kernel/sched/ext.c

Lines changed: 20 additions & 19 deletions
@@ -3155,9 +3155,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, false);
 
 	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
+	cpus_read_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
@@ -3196,9 +3196,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_cgroup_exit();
 
+	cpus_read_unlock();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
-	cpus_read_unlock();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3353,9 +3353,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	/*
-	 * Keep CPUs stable during enable so that the BPF scheduler can track
-	 * online CPUs by watching ->on/offline_cpu() after ->init().
+	 * Lock out forks, cgroup on/offlining and moves before opening the
+	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * Also keep CPUs stable during enable so that the BPF scheduler can
+	 * track online CPUs by watching ->on/offline_cpu() after ->init().
+	 *
+	 * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock.
+	 * cpus_read_lock() is acquired in a ton of places, so let's be a bit
+	 * cautious to avoid possible deadlock.
 	 */
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
 	cpus_read_lock();
 
 	scx_switch_all_req = false;
@@ -3399,13 +3408,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
-	/*
-	 * Lock out forks, cgroup on/offlining and moves before opening the
-	 * floodgate so that they don't wander into the operations prematurely.
-	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_cgroup_lock();
-
 	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -3431,7 +3433,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3457,7 +3459,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		spin_unlock_irq(&scx_tasks_lock);
 		pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 		       ret, p->comm, p->pid);
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	put_task_struct(p);
@@ -3481,7 +3483,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	/*
@@ -3515,8 +3517,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3527,6 +3527,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		static_branch_enable_cpuslocked(&__scx_switched_all);
 
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
@@ -3537,11 +3539,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 err_disable:
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
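The rule the patch enforces is the classic one: every path must take the same
locks in the same global order. As a standalone illustration (a toy userspace
analogue using plain pthreads; fork_lock and hotplug_lock are stand-in names,
not kernel APIs), the program below can deadlock when built with -DBUGGY,
where the second thread nests the locks in the opposite order, and always
completes once both threads honor one order:

/* Toy analogue of the ordering fix -- not kernel code.
 * Build: cc -pthread order.c -o order   (add -DBUGGY for the bad order) */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t fork_lock = PTHREAD_MUTEX_INITIALIZER;    /* plays scx_fork_rwsem   */
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* plays cpu_hotplug_lock */

static void *thread_a(void *arg)  /* mimics fork(): fork_lock first */
{
	pthread_mutex_lock(&fork_lock);
	usleep(1000);                 /* widen the race window */
	pthread_mutex_lock(&hotplug_lock);
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&fork_lock);
	return NULL;
}

static void *thread_b(void *arg)  /* mimics scheduler enable/disable */
{
#ifdef BUGGY
	pthread_mutex_lock(&hotplug_lock);  /* old order: hotplug lock first */
	usleep(1000);
	pthread_mutex_lock(&fork_lock);     /* ABBA: may deadlock vs thread_a */
#else
	pthread_mutex_lock(&fork_lock);     /* fixed order: fork_lock first */
	usleep(1000);
	pthread_mutex_lock(&hotplug_lock);
#endif
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&fork_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, thread_a, NULL);
	pthread_create(&b, NULL, thread_b, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("no deadlock");
	return 0;
}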
