Skip to content

Commit

Permalink
UPSTREAM: mm: don't cap request size based on read-ahead setting
Browse files Browse the repository at this point in the history
We ran into a funky issue, where someone doing 256K buffered reads saw
128K requests at the device level.  Turns out it is read-ahead capping
the request size, since we use 128K as the default setting.  This
doesn't make a lot of sense - if someone is issuing 256K reads, they
should see 256K reads, regardless of the read-ahead setting, if the
underlying device can support a 256K read in a single command.

This patch introduces a bdi hint, io_pages.  This is the soft max IO
size for the lower level; I've hooked it up to the bdev settings here.
Read-ahead is modified to issue the maximum of the user request size,
and the read-ahead max size, but capped to the max request size on the
device side.  The latter is done to avoid reading ahead too much, if the
application asks for a huge read.  With this patch, the kernel behaves
like the application expects.

Change-Id: Ibe52ffac7a6e1ac86ed0c6a59a0f7a32d651ee5f
Link: http://lkml.kernel.org/r/1479498073-8657-1-git-send-email-axboe@fb.com
Signed-off-by: Jens Axboe <axboe@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@google.com>
  • Loading branch information
axboe authored and Alistair Strachan committed Jan 23, 2019
1 parent a9e8246 commit 8d71d62
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 11 deletions.
2 changes: 2 additions & 0 deletions block/blk-settings.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ EXPORT_SYMBOL(blk_limits_max_hw_sectors);
/*
 * Apply @max_hw_sectors to the limits of @q via blk_limits_max_hw_sectors(),
 * then refresh the backing device's io_pages hint from the resulting
 * q->limits.max_sectors value.
 */
void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
	/* max_sectors is in 512-byte (1 << 9) units; this shift yields pages. */
	const int sectors_to_pages_shift = PAGE_SHIFT - 9;

	blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);

	/* Read max_sectors only after the limits have been updated above. */
	q->backing_dev_info.io_pages =
		q->limits.max_sectors >> sectors_to_pages_shift;
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);

Expand Down
1 change: 1 addition & 0 deletions block/blk-sysfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)

spin_lock_irq(q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(q->queue_lock);

return ret;
Expand Down
1 change: 1 addition & 0 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct bdi_writeback {
struct backing_dev_info {
struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long io_pages; /* max allowed IO size */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
Expand Down
51 changes: 40 additions & 11 deletions mm/readahead.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,17 +203,38 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
return ret;
}

/*
 * Look up the backing_dev_info for @inode.
 *
 * Inodes on the block-device superblock use the bdi attached to their own
 * mapping; every other inode uses the bdi of its superblock.
 *
 * Copied from fs/fs-writeback.c to avoid backport conflict.
 */
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *super = inode->i_sb;

	return sb_is_blkdev_sb(super) ? inode->i_mapping->backing_dev_info
				      : super->s_bdi;
}

/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
pgoff_t offset, unsigned long nr_to_read)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct file_ra_state *ra = &filp->f_ra;
unsigned long max_pages;

if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;

nr_to_read = max_sane_readahead(nr_to_read);
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min(nr_to_read, max_pages);

while (nr_to_read) {
int err;

Expand Down Expand Up @@ -380,9 +401,17 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = max_sane_readahead(ra->ra_pages);
pgoff_t prev_offset;

/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
if (req_size > max_pages && bdi->io_pages > max_pages)
max_pages = min(req_size, bdi->io_pages);

/*
* start of file
*/
Expand All @@ -396,7 +425,7 @@ ondemand_readahead(struct address_space *mapping,
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max);
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
Expand All @@ -411,24 +440,24 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;

rcu_read_lock();
start = page_cache_next_hole(mapping, offset + 1, max);
start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();

if (!start || start - offset > max)
if (!start || start - offset > max_pages)
return 0;

ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max);
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}

/*
* oversize read
*/
if (req_size > max)
if (req_size > max_pages)
goto initial_readahead;

/*
Expand All @@ -444,7 +473,7 @@ ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(mapping, ra, offset, req_size, max))
if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;

/*
Expand All @@ -455,7 +484,7 @@ ondemand_readahead(struct address_space *mapping,

initial_readahead:
ra->start = offset;
ra->size = get_init_ra_size(req_size, max);
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
Expand All @@ -465,7 +494,7 @@ ondemand_readahead(struct address_space *mapping,
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
ra->async_size = get_next_ra_size(ra, max);
ra->async_size = get_next_ra_size(ra, max_pages);
ra->size += ra->async_size;
}

Expand Down

0 comments on commit 8d71d62

Please sign in to comment.