Skip to content

Commit c9de560

Browse files
Alex Tomastytso
Alex Tomas
authored and committed
ext4: Add multi block allocator for ext4
Signed-off-by: Alex Tomas <alex@clusterfs.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
1 parent 1988b51 commit c9de560

File tree

13 files changed

+4900
-38
lines changed

13 files changed

+4900
-38
lines changed

Documentation/filesystems/ext4.txt

+9-1
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,11 @@ Alex is working on a new set of patches right now.
8686
When mounting an ext4 filesystem, the following options are accepted:
8787
(*) == default
8888

89-
extents ext4 will use extents to address file data. The
89+
extents (*) ext4 will use extents to address file data. The
9090
file system will no longer be mountable by ext3.
9191

92+
noextents ext4 will not use extents for newly created files
93+
9294
journal_checksum Enable checksumming of the journal transactions.
9395
This will allow the recovery code in e2fsck and the
9496
kernel to detect corruption in the kernel. It is a
@@ -206,6 +208,12 @@ nobh (a) cache disk block mapping information
206208
"nobh" option tries to avoid associating buffer
207209
heads (supported only for "writeback" mode).
208210

211+
mballoc (*) Use the multiple block allocator for block allocation
212+
nomballoc Disable the multiple block allocator for block allocation.
213+
stripe=n Number of filesystem blocks that mballoc will try
214+
to use for allocation size and alignment. For RAID5/6
215+
systems this should be the number of data
216+
disks * RAID chunk size in file system blocks.
209217

210218
Data Mode
211219
---------

Documentation/filesystems/proc.txt

+39
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,45 @@ CPUs.
857857
The "procs_blocked" line gives the number of processes currently blocked,
858858
waiting for I/O to complete.
859859

860+
1.9 Ext4 file system parameters
861+
------------------------------
862+
Ext4 file systems have one directory per partition under /proc/fs/ext4/
863+
# ls /proc/fs/ext4/hdc/
864+
group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
865+
stats stream_req
866+
867+
mb_groups:
868+
This file gives the details of the multiblock allocator buddy cache of free blocks.
869+
870+
mb_history:
871+
Multiblock allocation history.
872+
873+
stats:
874+
This file indicates whether the multiblock allocator should start collecting
875+
statistics. The statistics are shown during unmount
876+
877+
group_prealloc:
878+
The multiblock allocator normalizes the block allocation request to
879+
group_prealloc filesystem blocks if we don't have a stripe value set.
880+
The stripe value can be specified at mount time or during mke2fs.
881+
882+
max_to_scan:
883+
How long the multiblock allocator can look for a best extent (in found extents).
884+
885+
min_to_scan:
886+
How long the multiblock allocator must look for a best extent.
887+
888+
order2_req:
889+
The multiblock allocator uses 2^N search using buddies only for requests greater
890+
than or equal to order2_req. The request size is specified in file system
891+
blocks. A value of 2 indicates that this applies only to requests greater than or equal
892+
to 4 blocks.
893+
894+
stream_req:
895+
Files smaller than stream_req are served by the stream allocator, whose
896+
purpose is to pack requests as close to each other as possible to
897+
produce smooth I/O traffic. A value of 16 indicates that files smaller than 16
898+
filesystem blocks in size will use group based preallocation.
860899

861900
------------------------------------------------------------------------------
862901
Summary

fs/ext4/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
66

77
ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
88
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9-
ext4_jbd2.o migrate.o
9+
ext4_jbd2.o migrate.o mballoc.o
1010

1111
ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
1212
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o

fs/ext4/balloc.c

+56-11
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
577577
struct ext4_reserve_window_node *rsv;
578578
spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
579579

580+
ext4_mb_discard_inode_preallocations(inode);
581+
580582
if (!block_i)
581583
return;
582584

@@ -785,19 +787,29 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
785787
* @inode: inode
786788
* @block: start physical block to free
787789
* @count: number of blocks to free
790+
* @metadata: Are these metadata blocks
788791
*/
789792
void ext4_free_blocks(handle_t *handle, struct inode *inode,
790-
ext4_fsblk_t block, unsigned long count)
793+
ext4_fsblk_t block, unsigned long count,
794+
int metadata)
791795
{
792796
struct super_block * sb;
793797
unsigned long dquot_freed_blocks;
794798

799+
/* this isn't the right place to decide whether block is metadata
800+
* inode.c/extents.c knows better, but for safety ... */
801+
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
802+
ext4_should_journal_data(inode))
803+
metadata = 1;
804+
795805
sb = inode->i_sb;
796-
if (!sb) {
797-
printk ("ext4_free_blocks: nonexistent device");
798-
return;
799-
}
800-
ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
806+
807+
if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
808+
ext4_free_blocks_sb(handle, sb, block, count,
809+
&dquot_freed_blocks);
810+
else
811+
ext4_mb_free_blocks(handle, inode, block, count,
812+
metadata, &dquot_freed_blocks);
801813
if (dquot_freed_blocks)
802814
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
803815
return;
@@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
15761588
}
15771589

15781590
/**
1579-
* ext4_new_blocks() -- core block(s) allocation function
1591+
* ext4_new_blocks_old() -- core block(s) allocation function
15801592
* @handle: handle to this transaction
15811593
* @inode: file inode
15821594
* @goal: given target block(filesystem wide)
@@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
15891601
* any specific goal block.
15901602
*
15911603
*/
1592-
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1604+
ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
15931605
ext4_fsblk_t goal, unsigned long *count, int *errp)
15941606
{
15951607
struct buffer_head *bitmap_bh = NULL;
@@ -1849,13 +1861,46 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
18491861
}
18501862

18511863
ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1852-
ext4_fsblk_t goal, int *errp)
1864+
ext4_fsblk_t goal, int *errp)
1865+
{
1866+
struct ext4_allocation_request ar;
1867+
ext4_fsblk_t ret;
1868+
1869+
if (!test_opt(inode->i_sb, MBALLOC)) {
1870+
unsigned long count = 1;
1871+
ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1872+
return ret;
1873+
}
1874+
1875+
memset(&ar, 0, sizeof(ar));
1876+
ar.inode = inode;
1877+
ar.goal = goal;
1878+
ar.len = 1;
1879+
ret = ext4_mb_new_blocks(handle, &ar, errp);
1880+
return ret;
1881+
}
1882+
1883+
ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1884+
ext4_fsblk_t goal, unsigned long *count, int *errp)
18531885
{
1854-
unsigned long count = 1;
1886+
struct ext4_allocation_request ar;
1887+
ext4_fsblk_t ret;
18551888

1856-
return ext4_new_blocks(handle, inode, goal, &count, errp);
1889+
if (!test_opt(inode->i_sb, MBALLOC)) {
1890+
ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
1891+
return ret;
1892+
}
1893+
1894+
memset(&ar, 0, sizeof(ar));
1895+
ar.inode = inode;
1896+
ar.goal = goal;
1897+
ar.len = *count;
1898+
ret = ext4_mb_new_blocks(handle, &ar, errp);
1899+
*count = ar.len;
1900+
return ret;
18571901
}
18581902

1903+
18591904
/**
18601905
* ext4_count_free_blocks() -- count filesystem free blocks
18611906
* @sb: superblock

fs/ext4/extents.c

+36-9
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
853853
for (i = 0; i < depth; i++) {
854854
if (!ablocks[i])
855855
continue;
856-
ext4_free_blocks(handle, inode, ablocks[i], 1);
856+
ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
857857
}
858858
}
859859
kfree(ablocks);
@@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
16981698
ext_debug("index is empty, remove it, free block %llu\n", leaf);
16991699
bh = sb_find_get_block(inode->i_sb, leaf);
17001700
ext4_forget(handle, 1, inode, bh, leaf);
1701-
ext4_free_blocks(handle, inode, leaf, 1);
1701+
ext4_free_blocks(handle, inode, leaf, 1, 1);
17021702
return err;
17031703
}
17041704

@@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
17591759
{
17601760
struct buffer_head *bh;
17611761
unsigned short ee_len = ext4_ext_get_actual_len(ex);
1762-
int i;
1762+
int i, metadata = 0;
17631763

1764+
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1765+
metadata = 1;
17641766
#ifdef EXTENTS_STATS
17651767
{
17661768
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
17891791
bh = sb_find_get_block(inode->i_sb, start + i);
17901792
ext4_forget(handle, 0, inode, bh, start + i);
17911793
}
1792-
ext4_free_blocks(handle, inode, start, num);
1794+
ext4_free_blocks(handle, inode, start, num, metadata);
17931795
} else if (from == le32_to_cpu(ex->ee_block)
17941796
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
17951797
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
22872289
ext4_fsblk_t goal, newblock;
22882290
int err = 0, depth, ret;
22892291
unsigned long allocated = 0;
2292+
struct ext4_allocation_request ar;
22902293

22912294
__clear_bit(BH_New, &bh_result->b_state);
22922295
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
23972400
if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
23982401
ext4_init_block_alloc_info(inode);
23992402

2400-
/* allocate new block */
2401-
goal = ext4_ext_find_goal(inode, path, iblock);
2403+
/* find neighbour allocated blocks */
2404+
ar.lleft = iblock;
2405+
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2406+
if (err)
2407+
goto out2;
2408+
ar.lright = iblock;
2409+
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2410+
if (err)
2411+
goto out2;
24022412

24032413
/*
24042414
* See if request is beyond maximum number of blocks we can have in
@@ -2421,22 +2431,36 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
24212431
allocated = le16_to_cpu(newex.ee_len);
24222432
else
24232433
allocated = max_blocks;
2424-
newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
2434+
2435+
/* allocate new block */
2436+
ar.inode = inode;
2437+
ar.goal = ext4_ext_find_goal(inode, path, iblock);
2438+
ar.logical = iblock;
2439+
ar.len = allocated;
2440+
if (S_ISREG(inode->i_mode))
2441+
ar.flags = EXT4_MB_HINT_DATA;
2442+
else
2443+
/* disable in-core preallocation for non-regular files */
2444+
ar.flags = 0;
2445+
newblock = ext4_mb_new_blocks(handle, &ar, &err);
24252446
if (!newblock)
24262447
goto out2;
24272448
ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
24282449
goal, newblock, allocated);
24292450

24302451
/* try to insert new extent into found leaf and return */
24312452
ext4_ext_store_pblock(&newex, newblock);
2432-
newex.ee_len = cpu_to_le16(allocated);
2453+
newex.ee_len = cpu_to_le16(ar.len);
24332454
if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
24342455
ext4_ext_mark_uninitialized(&newex);
24352456
err = ext4_ext_insert_extent(handle, inode, path, &newex);
24362457
if (err) {
24372458
/* free data blocks we just allocated */
2459+
/* not a good idea to call discard here directly,
2460+
* but otherwise we'd need to call it every free() */
2461+
ext4_mb_discard_inode_preallocations(inode);
24382462
ext4_free_blocks(handle, inode, ext_pblock(&newex),
2439-
le16_to_cpu(newex.ee_len));
2463+
le16_to_cpu(newex.ee_len), 0);
24402464
goto out2;
24412465
}
24422466

@@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
24452469

24462470
/* previous routine could use block we allocated */
24472471
newblock = ext_pblock(&newex);
2472+
allocated = le16_to_cpu(newex.ee_len);
24482473
outnew:
24492474
__set_bit(BH_New, &bh_result->b_state);
24502475

@@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
24962521
down_write(&EXT4_I(inode)->i_data_sem);
24972522
ext4_ext_invalidate_cache(inode);
24982523

2524+
ext4_mb_discard_inode_preallocations(inode);
2525+
24992526
/*
25002527
* TODO: optimization is possible here.
25012528
* Probably we need not scan at all,

fs/ext4/inode.c

+8-7
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
551551
return ret;
552552
failed_out:
553553
for (i = 0; i <index; i++)
554-
ext4_free_blocks(handle, inode, new_blocks[i], 1);
554+
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
555555
return ret;
556556
}
557557

@@ -650,9 +650,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
650650
ext4_journal_forget(handle, branch[i].bh);
651651
}
652652
for (i = 0; i <indirect_blks; i++)
653-
ext4_free_blocks(handle, inode, new_blocks[i], 1);
653+
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
654654

655-
ext4_free_blocks(handle, inode, new_blocks[i], num);
655+
ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
656656

657657
return err;
658658
}
@@ -749,9 +749,10 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
749749
for (i = 1; i <= num; i++) {
750750
BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
751751
ext4_journal_forget(handle, where[i].bh);
752-
ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
752+
ext4_free_blocks(handle, inode,
753+
le32_to_cpu(where[i-1].key), 1, 0);
753754
}
754-
ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
755+
ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
755756

756757
return err;
757758
}
@@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
20522053
}
20532054
}
20542055

2055-
ext4_free_blocks(handle, inode, block_to_free, count);
2056+
ext4_free_blocks(handle, inode, block_to_free, count, 0);
20562057
}
20572058

20582059
/**
@@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
22252226
ext4_journal_test_restart(handle, inode);
22262227
}
22272228

2228-
ext4_free_blocks(handle, inode, nr, 1);
2229+
ext4_free_blocks(handle, inode, nr, 1, 1);
22292230

22302231
if (parent_bh) {
22312232
/*

0 commit comments

Comments
 (0)