Skip to content

Commit

Permalink
writeback: bdi_writeback iteration must not skip dying ones
Browse files Browse the repository at this point in the history
bdi_for_each_wb() is used in several places to wake up or issue
writeback work items to all wb's (bdi_writeback's) on a given bdi.
The iteration is performed by walking bdi->cgwb_tree; however, the
tree only indexes wb's which are currently active.

For example, when a memcg gets associated with a different blkcg, the
old wb is removed from the tree so that the new one can be indexed.
The old wb starts dying from then on but will linger till all its
inodes are drained.  As these dying wb's may still host dirty inodes,
writeback operations which affect all wb's must include them.
bdi_for_each_wb() skipping dying wb's led to sync(2) missing and
failing to sync the inodes belonging to those wb's.

This patch adds a RCU protected @bdi->wb_list which lists all wb's
beloinging to that bdi.  wb's are added on creation and removed on
release rather than on the start of destruction.  bdi_for_each_wb()
usages are replaced with list_for_each[_continue]_rcu() iterations
over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed.

v2: Updated as per Jan.  last_wb ref leak in bdi_split_work_to_wbs()
    fixed and unnecessary list head severing in cgwb_bdi_destroy()
    removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Fixes: ebe41ab ("writeback: implement bdi_for_each_wb()")
Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
  • Loading branch information
htejun authored and axboe committed Oct 12, 2015
1 parent 6fdf860 commit b817525
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 75 deletions.
31 changes: 22 additions & 9 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
struct bdi_writeback *last_wb = NULL;
struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
struct bdi_writeback, bdi_node);

might_sleep();
restart:
rcu_read_lock();
bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;

if (last_wb) {
wb_put(last_wb);
last_wb = NULL;
}

/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
Expand Down Expand Up @@ -819,12 +824,22 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,

wb_queue_work(wb, work);

next_memcg_id = wb->memcg_css->id + 1;
/*
* Pin @wb so that it stays on @bdi->wb_list. This allows
* continuing iteration from @wb after dropping and
* regrabbing rcu read lock.
*/
wb_get(wb);
last_wb = wb;

rcu_read_unlock();
wb_wait_for_completion(bdi, &fallback_work_done);
goto restart;
}
rcu_read_unlock();

if (last_wb)
wb_put(last_wb);
}

#else /* CONFIG_CGROUP_WRITEBACK */
Expand Down Expand Up @@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
struct wb_iter iter;

if (!bdi_has_dirty_io(bdi))
continue;

bdi_for_each_wb(wb, bdi, &iter, 0)
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
false, reason);
}
Expand Down Expand Up @@ -1894,9 +1908,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
struct wb_iter iter;

bdi_for_each_wb(wb, bdi, &iter, 0)
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
if (!list_empty(&wb->b_dirty_time))
wb_wakeup(wb);
}
Expand Down
3 changes: 3 additions & 0 deletions include/linux/backing-dev-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ struct bdi_writeback {
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */

struct list_head bdi_node; /* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct fprop_local_percpu memcg_completions;
Expand Down Expand Up @@ -150,6 +152,7 @@ struct backing_dev_info {
atomic_long_t tot_write_bandwidth;

struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
Expand Down
63 changes: 0 additions & 63 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,61 +408,6 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
rcu_read_unlock();
}

struct wb_iter {
int start_memcg_id;
struct radix_tree_iter tree_iter;
void **slot;
};

static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
struct backing_dev_info *bdi)
{
struct radix_tree_iter *titer = &iter->tree_iter;

WARN_ON_ONCE(!rcu_read_lock_held());

if (iter->start_memcg_id >= 0) {
iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
iter->start_memcg_id = -1;
} else {
iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
}

if (!iter->slot)
iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
if (iter->slot)
return *iter->slot;
return NULL;
}

static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
struct backing_dev_info *bdi,
int start_memcg_id)
{
iter->start_memcg_id = start_memcg_id;

if (start_memcg_id)
return __wb_iter_next(iter, bdi);
else
return &bdi->wb;
}

/**
* bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
* @wb_cur: cursor struct bdi_writeback pointer
* @bdi: bdi to walk wb's of
* @iter: pointer to struct wb_iter to be used as iteration buffer
* @start_memcg_id: memcg ID to start iteration from
*
* Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
* memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
* to be used as temp storage during iteration. rcu_read_lock() must be
* held throughout iteration.
*/
#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
(wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))

#else /* CONFIG_CGROUP_WRITEBACK */

static inline bool inode_cgwb_enabled(struct inode *inode)
Expand Down Expand Up @@ -522,14 +467,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}

struct wb_iter {
int next_id;
};

#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
for ((iter)->next_id = (start_blkcg_id); \
({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )

static inline int inode_congested(struct inode *inode, int cong_bits)
{
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
Expand Down
14 changes: 13 additions & 1 deletion mm/backing-dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct work_struct *work)
release_work);
struct backing_dev_info *bdi = wb->bdi;

spin_lock_irq(&cgwb_lock);
list_del_rcu(&wb->bdi_node);
spin_unlock_irq(&cgwb_lock);

wb_shutdown(wb);

css_put(wb->memcg_css);
Expand Down Expand Up @@ -575,6 +579,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
css_get(memcg_css);
Expand Down Expand Up @@ -764,15 +769,22 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }

int bdi_init(struct backing_dev_info *bdi)
{
int ret;

bdi->dev = NULL;

bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);

return cgwb_bdi_init(bdi);
ret = cgwb_bdi_init(bdi);

list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);

return ret;
}
EXPORT_SYMBOL(bdi_init);

Expand Down
3 changes: 1 addition & 2 deletions mm/page-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -1956,7 +1956,6 @@ void laptop_mode_timer_fn(unsigned long data)
int nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
struct bdi_writeback *wb;
struct wb_iter iter;

/*
* We want to write everything out, not just down to the dirty
Expand All @@ -1966,7 +1965,7 @@ void laptop_mode_timer_fn(unsigned long data)
return;

rcu_read_lock();
bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
Expand Down

0 comments on commit b817525

Please sign in to comment.