Skip to content

Commit c83f6bf

Browse files
rhvgoyalaxboe
authored andcommitted
block: add partition resize function to blkpg ioctl
Add a new operation code (BLKPG_RESIZE_PARTITION) to the BLKPG ioctl that allows altering the size of an existing partition, even if it is currently in use. This patch protects hd_struct->nr_sects with a sequence counter because one might extend a partition while IO is happening to it, and the update of nr_sects can be non-atomic on 32bit machines with 64bit sector_t. This can lead to issues like reading an inconsistent size of a partition. A sequence counter has been used so that readers don't have to take the bdev mutex lock, as we call sector_in_part() very frequently. Now all accesses to hd_struct->nr_sects should happen using the sequence counter read/update helper functions part_nr_sects_read/part_nr_sects_write. There is one exception though: set_capacity()/get_capacity(). I think theoretically a race should exist there too, but this patch does not modify set_capacity()/get_capacity() due to the sheer number of call sites, and I am afraid that change might break something. I have left that as a TODO item. We can handle it later if need be. This patch does not introduce any new races as such w.r.t. set_capacity()/get_capacity(). v2: Add CONFIG_LBDAF test to UP preempt case as suggested by Phillip. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Phillip Susi <psusi@ubuntu.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 4638a83 commit c83f6bf

File tree

5 files changed

+132
-9
lines changed

5 files changed

+132
-9
lines changed

block/genhd.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
154154
part = rcu_dereference(ptbl->part[piter->idx]);
155155
if (!part)
156156
continue;
157-
if (!part->nr_sects &&
157+
if (!part_nr_sects_read(part) &&
158158
!(piter->flags & DISK_PITER_INCL_EMPTY) &&
159159
!(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
160160
piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191191
static inline int sector_in_part(struct hd_struct *part, sector_t sector)
192192
{
193193
return part->start_sect <= sector &&
194-
sector < part->start_sect + part->nr_sects;
194+
sector < part->start_sect + part_nr_sects_read(part);
195195
}
196196

197197
/**
@@ -769,8 +769,8 @@ void __init printk_all_partitions(void)
769769

770770
printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
771771
bdevt_str(part_devt(part), devt_buf),
772-
(unsigned long long)part->nr_sects >> 1,
773-
disk_name(disk, part->partno, name_buf),
772+
(unsigned long long)part_nr_sects_read(part) >> 1
773+
, disk_name(disk, part->partno, name_buf),
774774
uuid_buf);
775775
if (is_part0) {
776776
if (disk->driverfs_dev != NULL &&
@@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v)
862862
while ((part = disk_part_iter_next(&piter)))
863863
seq_printf(seqf, "%4d %7d %10llu %s\n",
864864
MAJOR(part_devt(part)), MINOR(part_devt(part)),
865-
(unsigned long long)part->nr_sects >> 1,
865+
(unsigned long long)part_nr_sects_read(part) >> 1,
866866
disk_name(sgp, part->partno, buf));
867867
disk_part_iter_exit(&piter);
868868

@@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
12681268
}
12691269
disk->part_tbl->part[0] = &disk->part0;
12701270

1271+
/*
1272+
* set_capacity() and get_capacity() currently don't use
1273+
* seqcounter to read/update the part0->nr_sects. Still init
1274+
* the counter as we can read the sectors in IO submission
1275+
* path using sequence counters.
1276+
*
1277+
* TODO: Ideally set_capacity() and get_capacity() should be
1278+
* converted to make use of bd_mutex and sequence counters.
1279+
*/
1280+
seqcount_init(&disk->part0.nr_sects_seq);
12711281
hd_ref_init(&disk->part0);
12721282

12731283
disk->minors = minors;

block/ioctl.c

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
1313
{
1414
struct block_device *bdevp;
1515
struct gendisk *disk;
16-
struct hd_struct *part;
16+
struct hd_struct *part, *lpart;
1717
struct blkpg_ioctl_arg a;
1818
struct blkpg_partition p;
1919
struct disk_part_iter piter;
@@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
3636
case BLKPG_ADD_PARTITION:
3737
start = p.start >> 9;
3838
length = p.length >> 9;
39-
/* check for fit in a hd_struct */
40-
if (sizeof(sector_t) == sizeof(long) &&
39+
/* check for fit in a hd_struct */
40+
if (sizeof(sector_t) == sizeof(long) &&
4141
sizeof(long long) > sizeof(long)) {
4242
long pstart = start, plength = length;
4343
if (pstart != start || plength != length
@@ -91,6 +91,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
9191
mutex_unlock(&bdevp->bd_mutex);
9292
bdput(bdevp);
9393

94+
return 0;
95+
case BLKPG_RESIZE_PARTITION:
96+
start = p.start >> 9;
97+
/* new length of partition, converted from bytes to 512-byte sectors */
98+
length = p.length >> 9;
99+
/* check for fit in a hd_struct */
100+
if (sizeof(sector_t) == sizeof(long) &&
101+
sizeof(long long) > sizeof(long)) {
102+
long pstart = start, plength = length;
103+
if (pstart != start || plength != length
104+
|| pstart < 0 || plength < 0)
105+
return -EINVAL;
106+
}
107+
part = disk_get_part(disk, partno);
108+
if (!part)
109+
return -ENXIO;
110+
bdevp = bdget(part_devt(part));
111+
if (!bdevp) {
112+
disk_put_part(part);
113+
return -ENOMEM;
114+
}
115+
mutex_lock(&bdevp->bd_mutex);
116+
mutex_lock_nested(&bdev->bd_mutex, 1);
117+
if (start != part->start_sect) {
118+
mutex_unlock(&bdevp->bd_mutex);
119+
mutex_unlock(&bdev->bd_mutex);
120+
bdput(bdevp);
121+
disk_put_part(part);
122+
return -EINVAL;
123+
}
124+
/* overlap? */
125+
disk_part_iter_init(&piter, disk,
126+
DISK_PITER_INCL_EMPTY);
127+
while ((lpart = disk_part_iter_next(&piter))) {
128+
if (lpart->partno != partno &&
129+
!(start + length <= lpart->start_sect ||
130+
start >= lpart->start_sect + lpart->nr_sects)
131+
) {
132+
disk_part_iter_exit(&piter);
133+
mutex_unlock(&bdevp->bd_mutex);
134+
mutex_unlock(&bdev->bd_mutex);
135+
bdput(bdevp);
136+
disk_put_part(part);
137+
return -EBUSY;
138+
}
139+
}
140+
disk_part_iter_exit(&piter);
141+
part_nr_sects_write(part, (sector_t)length);
142+
i_size_write(bdevp->bd_inode, p.length);
143+
mutex_unlock(&bdevp->bd_mutex);
144+
mutex_unlock(&bdev->bd_mutex);
145+
bdput(bdevp);
146+
disk_put_part(part);
94147
return 0;
95148
default:
96149
return -EINVAL;

block/partition-generic.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev,
8484
struct device_attribute *attr, char *buf)
8585
{
8686
struct hd_struct *p = dev_to_part(dev);
87-
return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
87+
return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
8888
}
8989

9090
static ssize_t part_ro_show(struct device *dev,
@@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
294294
err = -ENOMEM;
295295
goto out_free;
296296
}
297+
298+
seqcount_init(&p->nr_sects_seq);
297299
pdev = part_to_dev(p);
298300

299301
p->start_sect = start;

include/linux/blkpg.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct blkpg_ioctl_arg {
4040
/* The subfunctions (for the op field) */
4141
#define BLKPG_ADD_PARTITION 1
4242
#define BLKPG_DEL_PARTITION 2
43+
#define BLKPG_RESIZE_PARTITION 3
4344

4445
/* Sizes of name fields. Unused at present. */
4546
#define BLKPG_DEVNAMELTH 64

include/linux/genhd.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,13 @@ struct partition_meta_info {
9898

9999
struct hd_struct {
100100
sector_t start_sect;
101+
/*
102+
* nr_sects is protected by sequence counter. One might extend a
103+
* partition while IO is happening to it and update of nr_sects
104+
* can be non-atomic on 32bit machines with 64bit sector_t.
105+
*/
101106
sector_t nr_sects;
107+
seqcount_t nr_sects_seq;
102108
sector_t alignment_offset;
103109
unsigned int discard_alignment;
104110
struct device __dev;
@@ -648,6 +654,57 @@ static inline void hd_struct_put(struct hd_struct *part)
648654
__delete_partition(part);
649655
}
650656

657+
/*
658+
* Any access of part->nr_sects which is not protected by partition
659+
* bd_mutex or gendisk bdev bd_mutex, should be done using this
660+
* accessor function.
661+
*
662+
* Code written along the lines of i_size_read() and i_size_write().
663+
* CONFIG_PREEMPT case optimizes the case of UP kernel with preemption
664+
* on.
665+
*/
666+
static inline sector_t part_nr_sects_read(struct hd_struct *part)
667+
{
668+
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
669+
sector_t nr_sects;
670+
unsigned seq;
671+
do {
672+
seq = read_seqcount_begin(&part->nr_sects_seq);
673+
nr_sects = part->nr_sects;
674+
} while (read_seqcount_retry(&part->nr_sects_seq, seq));
675+
return nr_sects;
676+
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
677+
sector_t nr_sects;
678+
679+
preempt_disable();
680+
nr_sects = part->nr_sects;
681+
preempt_enable();
682+
return nr_sects;
683+
#else
684+
return part->nr_sects;
685+
#endif
686+
}
687+
688+
/*
689+
* Should be called with mutex lock held (typically bd_mutex) of partition
690+
* to provide mutual exclusion among writers; otherwise seqcount might be
691+
* left in wrong state leaving the readers spinning infinitely.
692+
*/
693+
static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
694+
{
695+
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
696+
write_seqcount_begin(&part->nr_sects_seq);
697+
part->nr_sects = size;
698+
write_seqcount_end(&part->nr_sects_seq);
699+
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
700+
preempt_disable();
701+
part->nr_sects = size;
702+
preempt_enable();
703+
#else
704+
part->nr_sects = size;
705+
#endif
706+
}
707+
651708
#else /* CONFIG_BLOCK */
652709

653710
static inline void printk_all_partitions(void) { }

0 commit comments

Comments
 (0)