Commit 87579e9

dschatzberg authored and torvalds committed
loop: use worker per cgroup instead of kworker
Patch series "Charge loop device i/o to issuing cgroup", v14.

The loop device runs all i/o to the backing file on a separate kworker
thread which results in all i/o being charged to the root cgroup. This
allows a loop device to be used to trivially bypass resource limits
and other policy. This patch series fixes this gap in accounting.

A simple script to demonstrate this behavior on cgroupv2 machine:

'''
#!/bin/bash
set -e
CGROUP=/sys/fs/cgroup/test.slice
LOOP_DEV=/dev/loop0

if [[ ! -d $CGROUP ]]
then
    sudo mkdir $CGROUP
fi

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit to tmpfs -> OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
dd if=/dev/zero of=/tmp/file bs=1M count=256" || true

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit through loopback
# device -> no OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
truncate -s 512m /tmp/backing_file
losetup $LOOP_DEV /tmp/backing_file
dd if=/dev/zero of=$LOOP_DEV bs=1M count=256;
losetup -D $LOOP_DEV" || true

grep oom_kill $CGROUP/memory.events
'''

Naively charging cgroups could result in priority inversions through
the single kworker thread in the case where multiple cgroups are
reading/writing to the same loop device. This patch series does some
minor modification to the loop driver so that each cgroup can make
forward progress independently to avoid this inversion.

With this patch series applied, the above script triggers OOM kills
when writing through the loop device as expected.

This patch (of 3):

Existing uses of loop device may have multiple cgroups reading/writing
to the same device. Simply charging resources for I/O to the backing
file could result in priority inversion where one cgroup gets
synchronously blocked, holding up all other I/O to the loop device.

In order to avoid this priority inversion, we use a single workqueue
where each work item is a "struct loop_worker" which contains a queue
of struct loop_cmds to issue. The loop device maintains a tree mapping
blk css_id -> loop_worker. This allows each cgroup to independently
make forward progress issuing I/O to the backing file.

There is also a single queue for I/O associated with the rootcg which
can be used in cases of extreme memory shortage where we cannot
allocate a loop_worker.

The locking for the tree and queues is fairly heavy handed - we
acquire a per-loop-device spinlock any time either is accessed. The
existing implementation serializes all I/O through a single thread
anyways, so I don't believe this is any worse.

[colin.king@canonical.com: fixes]
Link: https://lkml.kernel.org/r/20210610173944.1203706-1-schatzberg.dan@gmail.com
Link: https://lkml.kernel.org/r/20210610173944.1203706-2-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent b51478a commit 87579e9

2 files changed: 187 additions, 34 deletions

drivers/block/loop.c

Lines changed: 179 additions & 30 deletions
@@ -71,7 +71,6 @@
 #include <linux/writeback.h>
 #include <linux/completion.h>
 #include <linux/highmem.h>
-#include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
@@ -84,6 +83,8 @@
 
 #include <linux/uaccess.h>
 
+#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
+
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_ctl_mutex);

@@ -921,27 +922,95 @@ static void loop_config_discard(struct loop_device *lo)
 	q->limits.discard_alignment = 0;
 }
 
-static void loop_unprepare_queue(struct loop_device *lo)
+struct loop_worker {
+	struct rb_node rb_node;
+	struct work_struct work;
+	struct list_head cmd_list;
+	struct list_head idle_list;
+	struct loop_device *lo;
+	struct cgroup_subsys_state *css;
+	unsigned long last_ran_at;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+static void loop_free_idle_workers(struct timer_list *timer);
+
+#ifdef CONFIG_BLK_CGROUP
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
-	kthread_flush_worker(&lo->worker);
-	kthread_stop(lo->worker_task);
+	return !css || css == blkcg_root_css;
 }
-
-static int loop_kthread_worker_fn(void *worker_ptr)
+#else
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
-	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
-	return kthread_worker_fn(worker_ptr);
+	return !css;
 }
+#endif
 
-static int loop_prepare_queue(struct loop_device *lo)
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
-	kthread_init_worker(&lo->worker);
-	lo->worker_task = kthread_run(loop_kthread_worker_fn,
-			&lo->worker, "loop%d", lo->lo_number);
-	if (IS_ERR(lo->worker_task))
-		return -ENOMEM;
-	set_user_nice(lo->worker_task, MIN_NICE);
-	return 0;
+	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+	struct loop_worker *cur_worker, *worker = NULL;
+	struct work_struct *work;
+	struct list_head *cmd_list;
+
+	spin_lock_irq(&lo->lo_work_lock);
+
+	if (queue_on_root_worker(cmd->css))
+		goto queue_work;
+
+	node = &lo->worker_tree.rb_node;
+
+	while (*node) {
+		parent = *node;
+		cur_worker = container_of(*node, struct loop_worker, rb_node);
+		if (cur_worker->css == cmd->css) {
+			worker = cur_worker;
+			break;
+		} else if ((long)cur_worker->css < (long)cmd->css) {
+			node = &(*node)->rb_left;
+		} else {
+			node = &(*node)->rb_right;
+		}
+	}
+	if (worker)
+		goto queue_work;
+
+	worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+	/*
+	 * In the event we cannot allocate a worker, just queue on the
+	 * rootcg worker
+	 */
+	if (!worker)
+		goto queue_work;
+
+	worker->css = cmd->css;
+	css_get(worker->css);
+	INIT_WORK(&worker->work, loop_workfn);
+	INIT_LIST_HEAD(&worker->cmd_list);
+	INIT_LIST_HEAD(&worker->idle_list);
+	worker->lo = lo;
+	rb_link_node(&worker->rb_node, parent, node);
+	rb_insert_color(&worker->rb_node, &lo->worker_tree);
+queue_work:
+	if (worker) {
+		/*
+		 * We need to remove from the idle list here while
+		 * holding the lock so that the idle timer doesn't
+		 * free the worker
+		 */
+		if (!list_empty(&worker->idle_list))
+			list_del_init(&worker->idle_list);
+		work = &worker->work;
+		cmd_list = &worker->cmd_list;
+	} else {
+		work = &lo->rootcg_work;
+		cmd_list = &lo->rootcg_cmd_list;
+	}
+	list_add_tail(&cmd->list_entry, cmd_list);
+	queue_work(lo->workqueue, work);
+	spin_unlock_irq(&lo->lo_work_lock);
 }
 
 static void loop_update_rotational(struct loop_device *lo)
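
The core of loop_queue_work() above is a lookup-or-insert on a tree keyed by the raw css pointer, falling back to the shared rootcg queue when a worker cannot be allocated. A rough userspace sketch of that control flow follows; the names are hypothetical, a plain unbalanced binary search tree stands in for the kernel rb-tree, calloc() for kzalloc(GFP_NOWAIT | __GFP_NOWARN), and the lo_work_lock locking is omitted:

#include <stdio.h>
#include <stdlib.h>

struct worker {
	const void *key;			/* plays the role of cmd->css */
	struct worker *left, *right;
};

/*
 * Find the worker for @key, creating one if needed.  NULL on allocation
 * failure means "queue on the shared root worker", mirroring the rootcg
 * fallback in the patch.
 */
static struct worker *find_or_create_worker(struct worker **tree, const void *key)
{
	struct worker **node = tree;
	struct worker *w;

	while (*node) {
		if ((*node)->key == key)
			return *node;		/* existing per-cgroup worker */
		/* same (inverted but consistent) ordering as the patch */
		if ((long)(*node)->key < (long)key)
			node = &(*node)->left;
		else
			node = &(*node)->right;
	}

	w = calloc(1, sizeof(*w));		/* ~kzalloc(GFP_NOWAIT | __GFP_NOWARN) */
	if (!w)
		return NULL;			/* caller uses the root queue */
	w->key = key;
	*node = w;				/* ~rb_link_node + rb_insert_color */
	return w;
}

int main(void)
{
	struct worker *tree = NULL;
	int css_a, css_b;	/* stand-ins for two distinct css pointers */

	struct worker *wa = find_or_create_worker(&tree, &css_a);
	struct worker *wb = find_or_create_worker(&tree, &css_b);

	/* a second submission from the same "cgroup" reuses its worker */
	printf("%d\n", wa == find_or_create_worker(&tree, &css_a));	/* 1 */
	printf("%d\n", wa == wb);					/* 0 */
	return 0;
}

Because the lookup and the insert use the same comparison, the reversed rb_left/rb_right convention in the patch is still a consistent total order over css pointers.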
@@ -1127,12 +1196,23 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	error = loop_prepare_queue(lo);
-	if (error)
+	lo->workqueue = alloc_workqueue("loop%d",
+					WQ_UNBOUND | WQ_FREEZABLE,
+					0,
+					lo->lo_number);
+	if (!lo->workqueue) {
+		error = -ENOMEM;
 		goto out_unlock;
+	}
 
 	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	lo->worker_tree = RB_ROOT;
+	timer_setup(&lo->timer, loop_free_idle_workers,
+		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1200,6 +1280,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	int err = 0;
 	bool partscan = false;
 	int lo_number;
+	struct loop_worker *pos, *worker;
 
 	mutex_lock(&lo->lo_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
@@ -1219,6 +1300,18 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	/* freeze request queue during the transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
+	destroy_workqueue(lo->workqueue);
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->css);
+		kfree(worker);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	del_timer_sync(&lo->timer);
+
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
@@ -1255,7 +1348,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
 	lo_number = lo->lo_number;
-	loop_unprepare_queue(lo);
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 	if (partscan) {
@@ -2015,7 +2107,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	} else
 #endif
 		cmd->css = NULL;
-	kthread_queue_work(&lo->worker, &cmd->work);
+	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
 }
@@ -2045,26 +2137,82 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	}
 }
 
-static void loop_queue_work(struct kthread_work *work)
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
 {
-	struct loop_cmd *cmd =
-		container_of(work, struct loop_cmd, work);
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;
 
-	loop_handle_cmd(cmd);
+	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+	spin_lock_irq(&lo->lo_work_lock);
+	while (!list_empty(cmd_list)) {
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_work_lock);
+
+		loop_handle_cmd(cmd);
+		cond_resched();
+
+		spin_lock_irq(&lo->lo_work_lock);
+	}
+
+	/*
+	 * We only add to the idle list if there are no pending cmds
+	 * *and* the worker will not run again which ensures that it
+	 * is safe to free any worker on the idle list
+	 */
+	if (worker && !work_pending(&worker->work)) {
+		worker->last_ran_at = jiffies;
+		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
+		loop_set_timer(lo);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	current->flags = orig_flags;
 }
 
-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
 {
-	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}
 
-	kthread_init_work(&cmd->work, loop_queue_work);
-	return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
+}
+
+static void loop_free_idle_workers(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (time_is_after_jiffies(worker->last_ran_at +
+						LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
 }
 
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
-	.init_request	= loop_init_request,
 	.complete	= lo_complete_rq,
 };
 
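A rough userspace sketch of the drain loop in loop_process_work() above, with hypothetical names; a pthread mutex stands in for lo->lo_work_lock, and the blocking backing-file I/O done by loop_handle_cmd() is elided:

#include <pthread.h>
#include <stdio.h>

struct cmd { struct cmd *next; int id; };

struct queue {
	pthread_mutex_t lock;		/* plays the role of lo->lo_work_lock */
	struct cmd *head;
};

static void handle(struct cmd *c)	/* blocking backing-file I/O elided */
{
	printf("handling cmd %d\n", c->id);
}

static void process_work(struct queue *q)
{
	pthread_mutex_lock(&q->lock);
	while (q->head) {
		struct cmd *c = q->head;
		q->head = c->next;
		/*
		 * Drop the lock around the (possibly blocking) work so that
		 * submitters and other per-cgroup workers keep moving.
		 */
		pthread_mutex_unlock(&q->lock);
		handle(c);
		pthread_mutex_lock(&q->lock);
	}
	/*
	 * The driver additionally parks the worker on the idle list here
	 * (only when no new work is pending) and arms the reaper timer.
	 */
	pthread_mutex_unlock(&q->lock);
}

int main(void)
{
	struct cmd c2 = { NULL, 2 }, c1 = { &c2, 1 };
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, &c1 };

	process_work(&q);
	return 0;
}

Dropping the lock around each command is what lets one cgroup's synchronously blocked I/O coexist with progress by other cgroups, which is the inversion the series sets out to avoid.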

@@ -2153,6 +2301,7 @@ static int loop_add(struct loop_device **l, int i)
 	mutex_init(&lo->lo_mutex);
 	lo->lo_number = i;
 	spin_lock_init(&lo->lo_lock);
+	spin_lock_init(&lo->lo_work_lock);
 	disk->major = LOOP_MAJOR;
 	disk->first_minor = i << part_shift;
 	disk->fops = &lo_fops;

drivers/block/loop.h

Lines changed: 8 additions & 4 deletions
@@ -14,7 +14,6 @@
 #include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <uapi/linux/loop.h>
 
 /* Possible states of device */
@@ -55,8 +54,13 @@ struct loop_device {
 
 	spinlock_t		lo_lock;
 	int			lo_state;
-	struct kthread_worker	worker;
-	struct task_struct	*worker_task;
+	spinlock_t		lo_work_lock;
+	struct workqueue_struct *workqueue;
+	struct work_struct	rootcg_work;
+	struct list_head	rootcg_cmd_list;
+	struct list_head	idle_worker_list;
+	struct rb_root		worker_tree;
+	struct timer_list	timer;
 	bool			use_dio;
 	bool			sysfs_inited;
 
@@ -67,7 +71,7 @@ struct loop_device {
 };
 
 struct loop_cmd {
-	struct kthread_work work;
+	struct list_head list_entry;
 	bool use_aio; /* use AIO interface to handle I/O */
 	atomic_t ref; /* only for aio */
 	long ret;
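
struct loop_cmd now embeds a plain list_head, and loop_process_work() recovers the enclosing command via container_of(cmd_list->next, struct loop_cmd, list_entry). A self-contained userspace illustration of that intrusive-list idiom, with hypothetical struct names; the offsetof-based container_of below is equivalent to the kernel macro:

#include <stddef.h>
#include <stdio.h>

/* container_of as commonly defined, equivalent to the kernel macro */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_entry { struct list_entry *next; };

struct cmd {
	struct list_entry list;	/* plays the role of loop_cmd.list_entry */
	int id;
};

int main(void)
{
	struct cmd c = { .id = 42 };
	struct list_entry *node = &c.list;

	/*
	 * Given only the embedded node, recover the enclosing command,
	 * as loop_process_work() does with cmd_list->next.
	 */
	struct cmd *back = container_of(node, struct cmd, list);
	printf("%d\n", back->id);	/* prints 42 */
	return 0;
}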
