Commit c3c7041
scx: Avoid possible deadlock with cpus_read_lock()
Han Xing Yi reported a syzbot lockdep error over the weekend:

======================================================
WARNING: possible circular locking dependency detected
6.6.0-g2f6ba98e2d3d #4 Not tainted
------------------------------------------------------
syz-executor.0/2181 is trying to acquire lock:
ffffffff84772410 (pernet_ops_rwsem){++++}-{3:3}, at: copy_net_ns+0x216/0x590 net/core/net_namespace.c:487

but task is already holding lock:
ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (scx_fork_rwsem){++++}-{0:0}:
       percpu_down_write+0x51/0x210 kernel/locking/percpu-rwsem.c:227
       scx_ops_enable+0x230/0xf90 kernel/sched/ext.c:3271
       bpf_struct_ops_link_create+0x1b9/0x220 kernel/bpf/bpf_struct_ops.c:914
       link_create kernel/bpf/syscall.c:4938 [inline]
       __sys_bpf+0x35af/0x4ac0 kernel/bpf/syscall.c:5453
       __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
       __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
       __x64_sys_bpf+0x48/0x60 kernel/bpf/syscall.c:5485
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76

-> #2 (cpu_hotplug_lock){++++}-{0:0}:
       percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
       cpus_read_lock+0x42/0x1b0 kernel/cpu.c:489
       flush_all_backlogs net/core/dev.c:5885 [inline]
       unregister_netdevice_many_notify+0x30a/0x1070 net/core/dev.c:10965
       unregister_netdevice_many+0x19/0x20 net/core/dev.c:11039
       sit_exit_batch_net+0x433/0x460 net/ipv6/sit.c:1887
       ops_exit_list+0xc5/0xe0 net/core/net_namespace.c:175
       cleanup_net+0x3e2/0x750 net/core/net_namespace.c:614
       process_one_work+0x50d/0xc20 kernel/workqueue.c:2630
       process_scheduled_works kernel/workqueue.c:2703 [inline]
       worker_thread+0x50b/0x950 kernel/workqueue.c:2784
       kthread+0x1fa/0x250 kernel/kthread.c:388
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242

-> #1 (rtnl_mutex){+.+.}-{3:3}:
       __mutex_lock_common kernel/locking/mutex.c:603 [inline]
       __mutex_lock+0xc1/0xea0 kernel/locking/mutex.c:747
       mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:799
       rtnl_lock+0x17/0x20 net/core/rtnetlink.c:79
       register_netdevice_notifier+0x25/0x1c0 net/core/dev.c:1741
       rtnetlink_init+0x3a/0x6e0 net/core/rtnetlink.c:6657
       netlink_proto_init+0x23d/0x2f0 net/netlink/af_netlink.c:2946
       do_one_initcall+0xb3/0x5f0 init/main.c:1232
       do_initcall_level init/main.c:1294 [inline]
       do_initcalls init/main.c:1310 [inline]
       do_basic_setup init/main.c:1329 [inline]
       kernel_init_freeable+0x40c/0x5d0 init/main.c:1547
       kernel_init+0x1d/0x350 init/main.c:1437
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242

-> #0 (pernet_ops_rwsem){++++}-{3:3}:
       check_prev_add kernel/locking/lockdep.c:3134 [inline]
       check_prevs_add kernel/locking/lockdep.c:3253 [inline]
       validate_chain kernel/locking/lockdep.c:3868 [inline]
       __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
       lock_acquire kernel/locking/lockdep.c:5753 [inline]
       lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
       down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
       copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
       create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
       copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
       copy_process+0x1b52/0x4680 kernel/fork.c:2504
       kernel_clone+0x116/0x660 kernel/fork.c:2914
       __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
       __se_sys_clone3 kernel/fork.c:3199 [inline]
       __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76

other info that might help us debug this:

Chain exists of:
  pernet_ops_rwsem --> cpu_hotplug_lock --> scx_fork_rwsem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  rlock(scx_fork_rwsem);
                               lock(cpu_hotplug_lock);
                               lock(scx_fork_rwsem);
  rlock(pernet_ops_rwsem);

 *** DEADLOCK ***

1 lock held by syz-executor.0/2181:
 #0: ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810

stack backtrace:
CPU: 0 PID: 2181 Comm: syz-executor.0 Not tainted 6.6.0-g2f6ba98e2d3d #4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
Sched_ext: serialise (enabled), task: runnable_at=-6ms
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:89 [inline]
 dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:107
 dump_stack+0x15/0x20 lib/dump_stack.c:114
 check_noncircular+0x134/0x150 kernel/locking/lockdep.c:2187
 check_prev_add kernel/locking/lockdep.c:3134 [inline]
 check_prevs_add kernel/locking/lockdep.c:3253 [inline]
 validate_chain kernel/locking/lockdep.c:3868 [inline]
 __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
 lock_acquire kernel/locking/lockdep.c:5753 [inline]
 lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
 down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
 copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
 create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
 copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
 copy_process+0x1b52/0x4680 kernel/fork.c:2504
 kernel_clone+0x116/0x660 kernel/fork.c:2914
 __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
 __se_sys_clone3 kernel/fork.c:3199 [inline]
 __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x7f9f764e240d
Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f9f75851ee8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b3
RAX: ffffffffffffffda RBX: 00007f9f7661ef80 RCX: 00007f9f764e240d
RDX: 0000000000000100 RSI: 0000000000000058 RDI: 00007f9f75851f00
RBP: 00007f9f765434a6 R08: 0000000000000000 R09: 0000000000000058
R10: 00007f9f75851f00 R11: 0000000000000246 R12: 0000000000000058
R13: 0000000000000006 R14: 00007f9f7661ef80 R15: 00007f9f75832000
 </TASK>

The issue is that we're acquiring the cpus_read_lock() _before_ we
acquire scx_fork_rwsem in scx_ops_enable() and scx_ops_disable(), but
we acquire and hold scx_fork_rwsem around basically the whole fork()
path. I don't see how a deadlock could actually occur in practice, but
it should be safe to acquire the scx_fork_rwsem and scx_cgroup_rwsem
semaphores before the hotplug lock, so let's do that.

Reported-by: Han Xing Yi <hxingyi104@gmail.com>
Signed-off-by: David Vernet <void@manifault.com>
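To make the report easier to follow, the sketch below condenses the four
lockdep frames into the before/after acquisition orders. It reuses the real
lock and function names from the diff below, but it is abbreviated pseudocode
for illustration, not the actual kernel source:

/* Before this patch -- scx_ops_enable() recorded
 * cpu_hotplug_lock -> scx_fork_rwsem (frame #3): */
cpus_read_lock();                      /* read side of cpu_hotplug_lock */
percpu_down_write(&scx_fork_rwsem);

/* fork(CLONE_NEWNET) -- sched_fork() holds scx_fork_rwsem while
 * copy_net_ns() takes pernet_ops_rwsem (frame #0):
 *     scx_fork_rwsem -> pernet_ops_rwsem
 * and frames #1/#2 had already recorded
 *     pernet_ops_rwsem -> rtnl_mutex -> cpu_hotplug_lock,
 * closing the cycle. */

/* After this patch -- scx_ops_enable() and scx_ops_disable_workfn()
 * take the rwsems first, so the cpu_hotplug_lock -> scx_fork_rwsem
 * edge is never created: */
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
cpus_read_lock();
	/* ... enable/disable work ... */
cpus_read_unlock();
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);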
1 parent 8ccf9d7 commit c3c7041

File tree: 1 file changed (+20 −19)

kernel/sched/ext.c

Lines changed: 20 additions & 19 deletions
@@ -3155,9 +3155,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, false);
 
 	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
+	cpus_read_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
@@ -3196,9 +3196,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_cgroup_exit();
 
+	cpus_read_unlock();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
-	cpus_read_unlock();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3353,9 +3353,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	/*
-	 * Keep CPUs stable during enable so that the BPF scheduler can track
-	 * online CPUs by watching ->on/offline_cpu() after ->init().
+	 * Lock out forks, cgroup on/offlining and moves before opening the
+	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * Also keep CPUs stable during enable so that the BPF scheduler can
+	 * track online CPUs by watching ->on/offline_cpu() after ->init().
+	 *
+	 * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock.
+	 * cpus_read_lock() is acquired in a ton of places, so let's be a bit
+	 * cautious to avoid possible deadlock.
 	 */
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
 	cpus_read_lock();
 
 	scx_switch_all_req = false;
@@ -3399,13 +3408,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
-	/*
-	 * Lock out forks, cgroup on/offlining and moves before opening the
-	 * floodgate so that they don't wander into the operations prematurely.
-	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_cgroup_lock();
-
 	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -3431,7 +3433,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3457,7 +3459,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		spin_unlock_irq(&scx_tasks_lock);
 		pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 		       ret, p->comm, p->pid);
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	put_task_struct(p);
@@ -3481,7 +3483,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	/*
@@ -3515,8 +3517,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3527,6 +3527,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		static_branch_enable_cpuslocked(&__scx_switched_all);
 
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
@@ -3537,11 +3539,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 err_disable:
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
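The rule the patch enforces is the classic one: every path must take the same
locks in the same global order. As a standalone illustration (a toy userspace
analogue using plain pthreads; fork_lock and hotplug_lock are stand-in names,
not kernel APIs), the program below can deadlock when built with -DBUGGY,
where the second thread nests the locks in the opposite order, and always
completes once both threads honor one order:

/* Toy analogue of the ordering fix -- not kernel code.
 * Build: cc -pthread order.c -o order   (add -DBUGGY for the bad order) */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t fork_lock = PTHREAD_MUTEX_INITIALIZER;    /* plays scx_fork_rwsem   */
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* plays cpu_hotplug_lock */

static void *thread_a(void *arg)  /* mimics fork(): fork_lock first */
{
	pthread_mutex_lock(&fork_lock);
	usleep(1000);                 /* widen the race window */
	pthread_mutex_lock(&hotplug_lock);
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&fork_lock);
	return NULL;
}

static void *thread_b(void *arg)  /* mimics scheduler enable/disable */
{
#ifdef BUGGY
	pthread_mutex_lock(&hotplug_lock);  /* old order: hotplug lock first */
	usleep(1000);
	pthread_mutex_lock(&fork_lock);     /* ABBA: may deadlock vs thread_a */
#else
	pthread_mutex_lock(&fork_lock);     /* fixed order: fork_lock first */
	usleep(1000);
	pthread_mutex_lock(&hotplug_lock);
#endif
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&fork_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, thread_a, NULL);
	pthread_create(&b, NULL, thread_b, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("no deadlock");
	return 0;
}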
