
Commit 8b4822d

Merge tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD fixes from Shaohua Li:

 - several bug fixes for raid5-cache from Song Liu, mainly handling
   journal disk errors

 - fix bad-block handling when choosing a raid1 disk, from Tomasz
   Majchrzak

 - simplify external metadata array sysfs handling, from Artur
   Paszkiewicz

 - optimize raid0 discard handling, from me: raid0 now dispatches large
   discard IO directly to the underlying disks

* tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid1: prefer disk without bad blocks
  md/r5cache: handle sync with data in write back cache
  md/r5cache: gracefully handle journal device errors for writeback mode
  md/raid1/10: avoid unnecessary locking
  md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first
  md/md0: optimize raid0 discard handling
  md: don't return -EAGAIN in md_allow_write for external metadata arrays
  md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock
2 parents 667f867 + d82dd0e commit 8b4822d

8 files changed: +209 additions, -86 deletions

drivers/md/md.c

Lines changed: 8 additions & 12 deletions
@@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
  * may proceed without blocking. It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
-		return 0;
+		return;
 	if (mddev->ro)
-		return 0;
+		return;
 	if (!mddev->pers->sync_request)
-		return 0;
+		return;
 
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
@@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
 		spin_unlock(&mddev->lock);
 		md_update_sb(mddev, 0);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
-
-	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
-		return -EAGAIN;
-	else
-		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
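
The user-visible change is that md_allow_write() no longer reports -EAGAIN for external-metadata arrays; it now waits on mddev->sb_wait until the dirty state is recorded, so callers can drop their error handling. The call-site pattern looks like this (the raid1_reshape() hunk further down shows the real instance):

	/* before: -EAGAIN had to be propagated */
	err = md_allow_write(mddev);
	if (err)
		return err;

	/* after: the wait happens inside md_allow_write() */
	md_allow_write(mddev);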

drivers/md/md.h

Lines changed: 1 addition & 1 deletion
@@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);

drivers/md/raid0.c

Lines changed: 102 additions & 14 deletions
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end is the offset in zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
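
The heart of raid0_handle_discard() is the mapping from a zone-relative discard range to one [dev_start, dev_end) range per member disk. The following standalone userspace sketch reproduces just that arithmetic under simplifying assumptions (a single zone, plain division instead of sector_div(), made-up geometry and offsets); it compiles and prints how one large discard fans out across the disks:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical geometry: 3 disks in one zone, 128-sector chunks. */
		unsigned long long chunk = 128;
		unsigned int nb_dev = 3;
		unsigned long long stripe_size = nb_dev * chunk;

		/* Hypothetical zone-relative discard range [start, end), in sectors. */
		unsigned long long start = 100, end = 1000;

		unsigned long long first_stripe = start / stripe_size;
		unsigned long long last_stripe = end / stripe_size;
		unsigned int start_disk = (start - first_stripe * stripe_size) / chunk;
		unsigned long long start_off = (start - first_stripe * stripe_size) % chunk
					       + first_stripe * chunk;
		unsigned int end_disk = (end - last_stripe * stripe_size) / chunk;
		unsigned long long end_off = (end - last_stripe * stripe_size) % chunk
					     + last_stripe * chunk;

		for (unsigned int disk = 0; disk < nb_dev; disk++) {
			unsigned long long dev_start, dev_end;

			/* Disks before start_disk begin at the next stripe; disks
			 * after it begin at the current stripe; start_disk itself
			 * begins mid-chunk. The same logic, mirrored, gives the end. */
			if (disk < start_disk)
				dev_start = (first_stripe + 1) * chunk;
			else if (disk > start_disk)
				dev_start = first_stripe * chunk;
			else
				dev_start = start_off;

			if (disk < end_disk)
				dev_end = (last_stripe + 1) * chunk;
			else if (disk > end_disk)
				dev_end = last_stripe * chunk;
			else
				dev_end = end_off;

			if (dev_end <= dev_start)
				continue;
			printf("disk %u: discard device sectors [%llu, %llu)\n",
			       disk, dev_start, dev_end);
		}
		return 0;
	}

With these values the per-disk ranges sum to exactly end - start = 900 sectors, and each disk receives one large contiguous discard instead of one per chunk, which is the point of the optimization.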

drivers/md/raid1.c

Lines changed: 10 additions & 11 deletions
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 				break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
@@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
@@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
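
The read_balance() hunk changes disk selection so that a disk limited by bad blocks does not stay "best" once a disk that can serve the whole range turns up: if a later disk offers more good sectors while best_disk is already set, best_disk is reset and re-chosen. A loose toy model of just that reset (hypothetical sector counts; the real loop has many more selection criteria):

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical per-disk limits: sectors each disk can serve
		 * from the requested offset before a bad block intervenes. */
		long long good[3] = { 512, 2048, 2048 };
		long long best_good_sectors = 0;
		int best_disk = -1;

		for (int disk = 0; disk < 3; disk++) {
			long long sectors = good[disk];

			/* The new check: a disk serving the longer range unseats
			 * a previously chosen, bad-block-limited disk. */
			if (sectors > best_good_sectors && best_disk >= 0)
				best_disk = -1;
			best_good_sectors = sectors;

			if (best_disk < 0)
				best_disk = disk;
		}
		printf("chose disk %d (%lld good sectors)\n",
		       best_disk, best_good_sectors);
		return 0;
	}

Here disk 0 would have been picked first, but disk 1 can serve the full 2048 sectors without a bad block, so the reset lets it win.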

drivers/md/raid10.c

Lines changed: 3 additions & 4 deletions
@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		plug = container_of(cb, struct raid10_plug_cb, cb);
 	else
 		plug = NULL;
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (plug) {
 		bio_list_add(&plug->pending, mbio);
 		plug->pending_cnt++;
 	} else {
+		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
-	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	if (!plug)
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
+	}
 }
 
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
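
The raid1 and raid10 hunks are the same optimization: the plug list belongs to the current task (it hangs off the task's blk_plug callback), so nothing else can touch it and no lock is needed to append to it; conf->device_lock and the wakeup are only required on the shared pending_bio_list path. A minimal userspace sketch of the pattern (toy list type, pthreads standing in for the kernel's locking; not the md code itself):

	#include <pthread.h>

	struct node { struct node *next; };

	/* Shared queue: stands in for conf->pending_bio_list. */
	static struct node *shared_pending;
	static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Per-task queue: stands in for the plug callback's pending list.
	 * Only the owning thread can touch it, hence no lock. */
	static __thread struct node *plug_pending;

	static void wake_worker(void) { /* md_wakeup_thread() stand-in */ }

	static void queue_item(struct node *n, int have_plug)
	{
		if (have_plug) {
			/* thread-local list: no locking required */
			n->next = plug_pending;
			plug_pending = n;
		} else {
			/* shared list: lock held only for this branch,
			 * exactly as the patch narrows it */
			pthread_mutex_lock(&pending_lock);
			n->next = shared_pending;
			shared_pending = n;
			pthread_mutex_unlock(&pending_lock);
			wake_worker();
		}
	}

	int main(void)
	{
		struct node a, b;
		queue_item(&a, 1);   /* plugged: stays on this thread's list */
		queue_item(&b, 0);   /* unplugged: goes to the shared list */
		return 0;
	}

Before the patch the lock was taken unconditionally and the wakeup issued after dropping it; now the plugged fast path is lock-free and the wakeup happens only when work actually lands on the shared list.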

drivers/md/raid5-cache.c

Lines changed: 35 additions & 12 deletions
@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+	/*
+	 * In case of journal device failures, submit_bio will get error
+	 * and calls endio, then active stripes will continue write
+	 * process. Therefore, it is not necessary to check Faulty bit
+	 * of journal device here.
+	 *
+	 * We can't check split_bio after current_bio is submitted. If
+	 * io->split_bio is null, after current_bio is submitted, current_bio
+	 * might already be completed and the io_unit is freed. We submit
+	 * split_bio first to avoid the issue.
+	 */
+	if (io->split_bio) {
+		if (io->has_flush)
+			io->split_bio->bi_opf |= REQ_PREFLUSH;
+		if (io->has_fua)
+			io->split_bio->bi_opf |= REQ_FUA;
+		submit_bio(io->split_bio);
+	}
+
 	if (io->has_flush)
 		io->current_bio->bi_opf |= REQ_PREFLUSH;
 	if (io->has_fua)
 		io->current_bio->bi_opf |= REQ_FUA;
 	submit_bio(io->current_bio);
-
-	if (!io->split_bio)
-		return;
-
-	if (io->has_flush)
-		io->split_bio->bi_opf |= REQ_PREFLUSH;
-	if (io->has_fua)
-		io->split_bio->bi_opf |= REQ_FUA;
-	submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 		return;
 	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 		mdname(mddev));
+
+	/* wait superblock change before suspend */
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
 	mddev_suspend(mddev);
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	mddev_resume(mddev);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 
 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flushed to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,15 +2995,16 @@ static int r5l_load_log(struct r5l_log *log)
 	return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
 
 	if (!log)
 		return;
 
-	if (raid5_calc_degraded(conf) > 0 &&
+	if ((raid5_calc_degraded(conf) > 0 ||
+	     test_bit(Journal, &rdev->flags)) &&
 	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		schedule_work(&log->disable_writeback_work);
 }
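
The split_bio reordering in r5l_do_submit_io() is a lifetime fix: each chained bio pins the io_unit, and the completion of the last one can free it, so io->split_bio must be read before the bio holding the final reference is submitted. A standalone toy model of the hazard (stub types and names invented for illustration; submit is modeled as completing immediately, the worst case, since submit_bio() is asynchronous):

	#include <stdio.h>
	#include <stdlib.h>

	struct bio_stub;

	struct io_unit {
		int refs;                    /* one per in-flight chained bio */
		struct bio_stub *split_bio;  /* optional second bio, may be NULL */
	};

	struct bio_stub { struct io_unit *owner; };

	/* Model submit as asynchronous but completing at once: the last
	 * completion frees the io_unit. */
	static void submit_stub(struct bio_stub *b)
	{
		struct io_unit *io = b->owner;
		if (--io->refs == 0)
			free(io);
	}

	int main(void)
	{
		struct io_unit *io = calloc(1, sizeof(*io));
		struct bio_stub current_bio = { io };

		io->refs = 1;          /* only current_bio pins the io_unit */
		io->split_bio = NULL;

		/* Patched order: inspect io->split_bio while io is certainly
		 * alive, i.e. before the bio holding the last reference is
		 * handed to the block layer. */
		if (io->split_bio)
			submit_stub(io->split_bio);
		submit_stub(&current_bio);   /* io may be freed from here on */

		/* The pre-patch order read io->split_bio *after* this point:
		 * a use-after-free whenever the journal bio completed early. */
		puts("done");
		return 0;
	}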

drivers/md/raid5-log.h

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
 extern struct dma_async_tx_descriptor *
