From dc58266385e51420298275c90a616c34f1473a73 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:18:43 +1000 Subject: [PATCH 01/39] md: raid0: Replace hash table lookup by looping over all strip_zones. The number of strip_zones of a raid0 array is bounded by the number of drives in the array and is in fact much smaller for typical setups. For example, any raid0 array containing identical disks will have only a single strip_zone. Therefore, the hash tables which are used for quickly finding the strip_zone that holds a particular sector are of questionable value and add quite a bit of unnecessary complexity. This patch replaces the hash table lookup by equivalent code which simply loops over all strip zones to find the zone that holds the given sector. In order to make this loop as fast as possible, the zone_start field of struct strip_zone has been renamed to zone_end, and it now stores the beginning of the next zone in sectors. This saves one addition in the loop. Subsequent cleanup patches will remove the hash table structure. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 40 ++++++++++++++++++++-------------------- drivers/md/raid0.h | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 925507e7d673..bb245a6d16c8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -52,7 +52,6 @@ static int raid0_congested(void *data, int bits) return ret; } - static int create_strip_zones (mddev_t *mddev) { int i, c, j; @@ -158,7 +157,7 @@ static int create_strip_zones (mddev_t *mddev) } zone->nb_dev = cnt; zone->sectors = smallest->sectors * cnt; - zone->zone_start = 0; + zone->zone_end = zone->sectors; current_start = smallest->sectors; curr_zone_start = zone->sectors; @@ -198,14 +197,13 @@ static int create_strip_zones (mddev_t *mddev) printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", zone->nb_dev, (unsigned long long)zone->sectors); - zone->zone_start = curr_zone_start; + zone->zone_end = curr_zone_start + zone->sectors; curr_zone_start += zone->sectors; current_start = smallest->sectors; printk(KERN_INFO "raid0: current zone start: %llu\n", (unsigned long long)current_start); } - /* Now find appropriate hash spacing. 
* We want a number which causes most hash entries to cover * at most two strips, but the hash table must be at most @@ -398,6 +396,19 @@ static int raid0_stop (mddev_t *mddev) return 0; } +/* Find the zone which holds a particular offset */ +static struct strip_zone *find_zone(struct raid0_private_data *conf, + sector_t sector) +{ + int i; + struct strip_zone *z = conf->strip_zone; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) + return z + i; + BUG(); +} + static int raid0_make_request (struct request_queue *q, struct bio *bio) { mddev_t *mddev = q->queuedata; @@ -443,22 +454,11 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) bio_pair_release(bp); return 0; } - - - { - sector_t x = sector >> conf->sector_shift; - sector_div(x, (u32)conf->spacing); - zone = conf->hash_table[x]; - } - - while (sector >= zone->zone_start + zone->sectors) - zone++; - + zone = find_zone(conf, sector); sect_in_chunk = bio->bi_sector & (chunk_sects - 1); - - { - sector_t x = (sector - zone->zone_start) >> chunksect_bits; + sector_t x = (zone->sectors + sector - zone->zone_end) + >> chunksect_bits; sector_div(x, zone->nb_dev); chunk = x; @@ -503,8 +503,8 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "%s/", bdevname( conf->strip_zone[j].dev[k]->bdev,b)); - seq_printf(seq, "] zs=%d ds=%d s=%d\n", - conf->strip_zone[j].zone_start, + seq_printf(seq, "] ze=%d ds=%d s=%d\n", + conf->strip_zone[j].zone_end, conf->strip_zone[j].dev_start, conf->strip_zone[j].sectors); } diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 824b12eb1d4f..556666fec3a5 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -3,7 +3,7 @@ struct strip_zone { - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ sector_t sectors; /* Zone size in sectors */ int nb_dev; /* # of devices 
attached to the zone */ From d27a43abd7be0ab4b2337e4587feca8c7340e5f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Jun 2009 16:46:46 +1000 Subject: [PATCH 02/39] md/raid0: two cleanups in create_stripe_zones. 1/ remove current_start. The same value is available in zone->dev_start and storing it separately doesn't gain anything. 2/ rename curr_zone_start to curr_zone_end as we are now more focused on the 'end' of each zone. We end up storing the same number though - the old name was a little confusing (and what does 'current' mean in this context anyway). Signed-off-by: NeilBrown --- drivers/md/raid0.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bb245a6d16c8..1afdfd120bba 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -55,7 +55,7 @@ static int raid0_congested(void *data, int bits) static int create_strip_zones (mddev_t *mddev) { int i, c, j; - sector_t current_start, curr_zone_start; + sector_t curr_zone_end; sector_t min_spacing; raid0_conf_t *conf = mddev_to_conf(mddev); mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; @@ -159,8 +159,7 @@ static int create_strip_zones (mddev_t *mddev) zone->sectors = smallest->sectors * cnt; zone->zone_end = zone->sectors; - current_start = smallest->sectors; - curr_zone_start = zone->sectors; + curr_zone_end = zone->sectors; /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) @@ -169,7 +168,7 @@ static int create_strip_zones (mddev_t *mddev) zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; printk(KERN_INFO "raid0: zone %d\n", i); - zone->dev_start = current_start; + zone->dev_start = smallest->sectors; smallest = NULL; c = 0; @@ -178,7 +177,7 @@ static int create_strip_zones (mddev_t *mddev) rdev = conf->strip_zone[0].dev[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); - if (rdev->sectors <= current_start) { + if (rdev->sectors <= zone->dev_start) { printk(KERN_INFO 
" nope.\n"); continue; } @@ -193,16 +192,15 @@ static int create_strip_zones (mddev_t *mddev) } zone->nb_dev = c; - zone->sectors = (smallest->sectors - current_start) * c; + zone->sectors = (smallest->sectors - zone->dev_start) * c; printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", zone->nb_dev, (unsigned long long)zone->sectors); - zone->zone_end = curr_zone_start + zone->sectors; - curr_zone_start += zone->sectors; + curr_zone_end += zone->sectors; + zone->zone_end = curr_zone_end; - current_start = smallest->sectors; printk(KERN_INFO "raid0: current zone start: %llu\n", - (unsigned long long)current_start); + (unsigned long long)smallest->sectors); } /* Now find appropriate hash spacing. * We want a number which causes most hash entries to cover @@ -212,8 +210,8 @@ static int create_strip_zones (mddev_t *mddev) * strip though as it's size has no bearing on the efficacy of the hash * table. */ - conf->spacing = curr_zone_start; - min_spacing = curr_zone_start; + conf->spacing = curr_zone_end; + min_spacing = curr_zone_end; sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); for (i=0; i < conf->nr_strip_zones-1; i++) { sector_t s = 0; From 09770e0b6ee649313611a2d6a9b44f456072dbd6 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:46:48 +1000 Subject: [PATCH 03/39] md: raid0: Remove hash table. The raid0 hash table has become unused due to the changes in the previous patch. This patch removes the hash table allocation and setup code and kills the hash_table field of struct raid0_private_data. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 12 ------------ drivers/md/raid0.h | 1 - 2 files changed, 13 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 1afdfd120bba..d4c9c5d5d7f5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -326,22 +326,14 @@ static int raid0_run (mddev_t *mddev) nb_zone = s + round; } printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); - - printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", - nb_zone*sizeof(struct strip_zone*)); - conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); - if (!conf->hash_table) - goto out_free_conf; sectors = conf->strip_zone[cur].sectors; - conf->hash_table[0] = conf->strip_zone + cur; for (i=1; i< nb_zone; i++) { while (sectors <= conf->spacing) { cur++; sectors += conf->strip_zone[cur].sectors; } sectors -= conf->spacing; - conf->hash_table[i] = conf->strip_zone + cur; } if (conf->sector_shift) { conf->spacing >>= conf->sector_shift; @@ -384,8 +376,6 @@ static int raid0_stop (mddev_t *mddev) raid0_conf_t *conf = mddev_to_conf(mddev); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf->hash_table); - conf->hash_table = NULL; kfree(conf->strip_zone); conf->strip_zone = NULL; kfree(conf); @@ -494,8 +484,6 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) h = 0; for (j = 0; j < conf->nr_strip_zones; j++) { seq_printf(seq, " z%d", j); - if (conf->hash_table[h] == conf->strip_zone+j) - seq_printf(seq, "(h%d)", h++); seq_printf(seq, "=["); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) seq_printf(seq, "%s/", bdevname( diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 556666fec3a5..a14630a25aa4 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -12,7 +12,6 @@ struct strip_zone struct raid0_private_data { - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, 
pointed to by strip_zone->dev */ int nr_strip_zones; From 8f79cfcdb65472f1504ade2f53e5f2bfdaeb95da Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:47:10 +1000 Subject: [PATCH 04/39] md: raid0: Remove hash spacing and sector shift. The "sector_shift" and "spacing" fields of struct raid0_private_data were only used for the hash table lookups. So the removal of the hash table allows us to get rid of these fields as well, which simplifies create_strip_zones() and raid0_run() quite a bit. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 63 +--------------------------------------------- drivers/md/raid0.h | 3 --- 2 files changed, 1 insertion(+), 65 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d4c9c5d5d7f5..edffc4940b49 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -56,7 +56,6 @@ static int create_strip_zones (mddev_t *mddev) { int i, c, j; sector_t curr_zone_end; - sector_t min_spacing; raid0_conf_t *conf = mddev_to_conf(mddev); mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; struct strip_zone *zone; @@ -202,28 +201,7 @@ static int create_strip_zones (mddev_t *mddev) printk(KERN_INFO "raid0: current zone start: %llu\n", (unsigned long long)smallest->sectors); } - /* Now find appropriate hash spacing. - * We want a number which causes most hash entries to cover - * at most two strips, but the hash table must be at most - * 1 PAGE. We choose the smallest strip, or contiguous collection - * of strips, that has big enough size. We never consider the last - * strip though as it's size has no bearing on the efficacy of the hash - * table. 
- */ - conf->spacing = curr_zone_end; - min_spacing = curr_zone_end; - sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); - for (i=0; i < conf->nr_strip_zones-1; i++) { - sector_t s = 0; - for (j = i; j < conf->nr_strip_zones - 1 && - s < min_spacing; j++) - s += conf->strip_zone[j].sectors; - if (s >= min_spacing && s < conf->spacing) - conf->spacing = s; - } - mddev->queue->unplug_fn = raid0_unplug; - mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; @@ -273,10 +251,8 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) return array_sectors; } -static int raid0_run (mddev_t *mddev) +static int raid0_run(mddev_t *mddev) { - unsigned cur=0, i=0, nb_zone; - s64 sectors; raid0_conf_t *conf; if (mddev->chunk_size == 0) { @@ -306,43 +282,6 @@ static int raid0_run (mddev_t *mddev) printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); - printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", - (unsigned long long)conf->spacing); - { - sector_t s = raid0_size(mddev, 0, 0); - sector_t space = conf->spacing; - int round; - conf->sector_shift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - /*shift down space and s so that sector_div will work */ - while (space > (sector_t) (~(u32)0)) { - s >>= 1; - space >>= 1; - s += 1; /* force round-up */ - conf->sector_shift++; - } - } - round = sector_div(s, (u32)space) ? 
1 : 0; - nb_zone = s + round; - } - printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); - sectors = conf->strip_zone[cur].sectors; - - for (i=1; i< nb_zone; i++) { - while (sectors <= conf->spacing) { - cur++; - sectors += conf->strip_zone[cur].sectors; - } - sectors -= conf->spacing; - } - if (conf->sector_shift) { - conf->spacing >>= conf->sector_shift; - /* round spacing up so when we divide by it, we - * err on the side of too-low, which is safest - */ - conf->spacing++; - } - /* calculate the max read-ahead size. * For read-ahead of large files to be effective, we need to * readahead at least twice a whole stripe. i.e. number of devices diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index a14630a25aa4..dbcf1da916b7 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -15,9 +15,6 @@ struct raid0_private_data struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ }; typedef struct raid0_private_data raid0_conf_t; From 5568a6035d9fca2cd8f1ef7005e215eae4e65fab Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:47:21 +1000 Subject: [PATCH 05/39] md: raid0: Make raid0_run() return a proper error code. Currently raid0_run() always returns -ENOMEM on errors. This is incorrect as running the array might fail for other reasons, for example because not all component devices were available. This patch changes create_strip_zones() so that it returns a proper error code (either -ENOMEM or -EINVAL) rather than 1 on errors and makes raid0_run(), its single caller, return that value instead of -ENOMEM. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index edffc4940b49..e5648b660e75 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -105,12 +105,12 @@ static int create_strip_zones (mddev_t *mddev) conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); if (!conf->strip_zone) - return 1; + return -ENOMEM; conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* conf->nr_strip_zones*mddev->raid_disks, GFP_KERNEL); if (!conf->devlist) - return 1; + return -ENOMEM; /* The first zone must contain all devices, so here we check that * there is a proper alignment of slots to devices and find them all @@ -207,8 +207,8 @@ static int create_strip_zones (mddev_t *mddev) printk(KERN_INFO "raid0: done.\n"); return 0; - abort: - return 1; +abort: + return -EINVAL; } /** @@ -254,6 +254,7 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int raid0_run(mddev_t *mddev) { raid0_conf_t *conf; + int ret; if (mddev->chunk_size == 0) { printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); @@ -269,12 +270,13 @@ static int raid0_run(mddev_t *mddev) conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); if (!conf) - goto out; + return -ENOMEM; mddev->private = (void *)conf; conf->strip_zone = NULL; conf->devlist = NULL; - if (create_strip_zones (mddev)) + ret = create_strip_zones(mddev); + if (ret < 0) goto out_free_conf; /* calculate array device size */ @@ -306,8 +308,7 @@ out_free_conf: kfree(conf->devlist); kfree(conf); mddev->private = NULL; -out: - return -ENOMEM; + return ret; } static int raid0_stop (mddev_t *mddev) From ed7b00380d957ec770b5e90380d012c6062c13cc Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:47:36 +1000 Subject: [PATCH 06/39] md: raid0: Allocate all buffers for the raid0 configuration in one function. 
Currently the raid0 configuration is allocated in raid0_run() while the buffers for the strip_zone and the dev_list arrays are allocated in create_strip_zones(). On errors, all three buffers are freed in raid0_run(). It's easier and more readable to do the allocation and cleanup within a single function. So move that code into create_strip_zones(). Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 47 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e5648b660e75..99cee51734e5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -52,21 +52,18 @@ static int raid0_congested(void *data, int bits) return ret; } -static int create_strip_zones (mddev_t *mddev) +static int create_strip_zones(mddev_t *mddev) { - int i, c, j; + int i, c, j, err; sector_t curr_zone_end; - raid0_conf_t *conf = mddev_to_conf(mddev); mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; struct strip_zone *zone; int cnt; char b[BDEVNAME_SIZE]; - - /* - * The number of 'same size groups' - */ - conf->nr_strip_zones = 0; - + raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + + if (!conf) + return -ENOMEM; list_for_each_entry(rdev1, &mddev->disks, same_set) { printk(KERN_INFO "raid0: looking at %s\n", bdevname(rdev1->bdev,b)); @@ -101,16 +98,16 @@ static int create_strip_zones (mddev_t *mddev) } } printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); - + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); if (!conf->strip_zone) - return -ENOMEM; + goto abort; conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* conf->nr_strip_zones*mddev->raid_disks, GFP_KERNEL); if (!conf->devlist) - return -ENOMEM; + goto abort; /* The first zone must contain all devices, so here we check that * there is a proper alignment of slots to devices and find them all @@ -119,6 +116,7 @@ static int create_strip_zones (mddev_t 
*mddev) cnt = 0; smallest = NULL; zone->dev = conf->devlist; + err = -EINVAL; list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; @@ -206,9 +204,14 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; printk(KERN_INFO "raid0: done.\n"); + mddev->private = conf; return 0; abort: - return -EINVAL; + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + mddev->private = NULL; + return err; } /** @@ -253,7 +256,6 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int raid0_run(mddev_t *mddev) { - raid0_conf_t *conf; int ret; if (mddev->chunk_size == 0) { @@ -268,16 +270,9 @@ static int raid0_run(mddev_t *mddev) blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); mddev->queue->queue_lock = &mddev->queue->__queue_lock; - conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); - if (!conf) - return -ENOMEM; - mddev->private = (void *)conf; - - conf->strip_zone = NULL; - conf->devlist = NULL; ret = create_strip_zones(mddev); if (ret < 0) - goto out_free_conf; + return ret; /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -299,16 +294,8 @@ static int raid0_run(mddev_t *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); return 0; - -out_free_conf: - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); - mddev->private = NULL; - return ret; } static int raid0_stop (mddev_t *mddev) From fb5ab4b5d6e16fd5006c9f800d0116f3547cb760 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 16 Jun 2009 16:48:19 +1000 Subject: [PATCH 07/39] md: raid0: Fix a memory leak when stopping a raid0 array. raid0_stop() removes all references to the raid0 configuration but misses to free the ->devlist buffer. This patch closes this leak, removes a pointless initialization and fixes a coding style issue in raid0_stop(). 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 99cee51734e5..0d62ad6df212 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -298,16 +298,15 @@ static int raid0_run(mddev_t *mddev) return 0; } -static int raid0_stop (mddev_t *mddev) +static int raid0_stop(mddev_t *mddev) { raid0_conf_t *conf = mddev_to_conf(mddev); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); - conf->strip_zone = NULL; + kfree(conf->devlist); kfree(conf); mddev->private = NULL; - return 0; } From 49f357a22b3fa3eeac042dfa0a6cae920c174e48 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Jun 2009 16:50:35 +1000 Subject: [PATCH 08/39] md: raid0: remove ->sectors from the strip_zone structure. Storing ->sectors is redundant as it can be computed from the difference z->zone_end - (z-1)->zone_end. In the one place where it is used, it is just as efficient to use a zone_end value instead. And removing it makes strip_zone smaller, so the array of these that is searched on every request has a better chance to stay in cache. So discard the field and get the value from elsewhere. 
Signed-off-by: NeilBrown --- drivers/md/raid0.c | 33 +++++++++++++++++++-------------- drivers/md/raid0.h | 1 - 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 0d62ad6df212..07ef936afc71 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -55,7 +55,7 @@ static int raid0_congested(void *data, int bits) static int create_strip_zones(mddev_t *mddev) { int i, c, j, err; - sector_t curr_zone_end; + sector_t curr_zone_end, sectors; mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; struct strip_zone *zone; int cnt; @@ -153,10 +153,9 @@ static int create_strip_zones(mddev_t *mddev) goto abort; } zone->nb_dev = cnt; - zone->sectors = smallest->sectors * cnt; - zone->zone_end = zone->sectors; + zone->zone_end = smallest->sectors * cnt; - curr_zone_end = zone->sectors; + curr_zone_end = zone->zone_end; /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) @@ -189,11 +188,11 @@ static int create_strip_zones(mddev_t *mddev) } zone->nb_dev = c; - zone->sectors = (smallest->sectors - zone->dev_start) * c; + sectors = (smallest->sectors - zone->dev_start) * c; printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", - zone->nb_dev, (unsigned long long)zone->sectors); + zone->nb_dev, (unsigned long long)sectors); - curr_zone_end += zone->sectors; + curr_zone_end += sectors; zone->zone_end = curr_zone_end; printk(KERN_INFO "raid0: current zone start: %llu\n", @@ -310,16 +309,22 @@ static int raid0_stop(mddev_t *mddev) return 0; } -/* Find the zone which holds a particular offset */ +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ static struct strip_zone *find_zone(struct raid0_private_data *conf, - sector_t sector) + sector_t *sectorp) { int i; struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; for (i = 0; i < conf->nr_strip_zones; i++) - if (sector < z[i].zone_end) + if (sector < z[i].zone_end) { + if (i) + *sectorp = 
sector - z[i-1].zone_end; return z + i; + } BUG(); } @@ -331,7 +336,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) struct strip_zone *zone; mdk_rdev_t *tmp_dev; sector_t chunk; - sector_t sector, rsect; + sector_t sector, rsect, sector_offset; const int rw = bio_data_dir(bio); int cpu; @@ -368,11 +373,11 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) bio_pair_release(bp); return 0; } - zone = find_zone(conf, sector); + sector_offset = sector; + zone = find_zone(conf, &sector_offset); sect_in_chunk = bio->bi_sector & (chunk_sects - 1); { - sector_t x = (zone->sectors + sector - zone->zone_end) - >> chunksect_bits; + sector_t x = sector_offset >> chunksect_bits; sector_div(x, zone->nb_dev); chunk = x; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index dbcf1da916b7..124ba34c8eed 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -5,7 +5,6 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ int nb_dev; /* # of devices attached to the zone */ mdk_rdev_t **dev; /* Devices attached to the zone */ }; From b414579f4573b6dc8583e31b01dcffd13f49fd62 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Jun 2009 16:50:52 +1000 Subject: [PATCH 09/39] md: raid0: remove ->dev pointer from strip_zone structure If we treat conf->devlist more like a 2 dimensional array, we can get the devlist for a particular zone simply by indexing that array, so we don't need to store the pointers to subarrays in strip_zone. This makes strip_zone smaller and so (hopefully) searches faster. 
Signed-off-by: NeilBrown --- drivers/md/raid0.c | 21 +++++++++++---------- drivers/md/raid0.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 07ef936afc71..af0df78223b1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -27,7 +27,7 @@ static void raid0_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; + mdk_rdev_t **devlist = conf->devlist; int i; for (i=0; i<mddev->raid_disks; i++) { @@ -41,7 +41,7 @@ static int raid0_congested(void *data, int bits) { mddev_t *mddev = data; raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; + mdk_rdev_t **devlist = conf->devlist; int i, ret = 0; for (i = 0; i < mddev->raid_disks && !ret ; i++) { @@ -56,7 +56,7 @@ static int create_strip_zones(mddev_t *mddev) { int i, c, j, err; sector_t curr_zone_end, sectors; - mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; struct strip_zone *zone; int cnt; char b[BDEVNAME_SIZE]; @@ -115,7 +115,7 @@ static int create_strip_zones(mddev_t *mddev) zone = &conf->strip_zone[0]; cnt = 0; smallest = NULL; - zone->dev = conf->devlist; + dev = conf->devlist; err = -EINVAL; list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; @@ -125,12 +125,12 @@ static int create_strip_zones(mddev_t *mddev) "aborting!\n", j); goto abort; } - if (zone->dev[j]) { + if (dev[j]) { printk(KERN_ERR "raid0: multiple devices for %d - " "aborting!\n", j); goto abort; } - zone->dev[j] = rdev1; + dev[j] = rdev1; blk_queue_stack_limits(mddev->queue, rdev1->bdev->bd_disk->queue); @@ -161,7 +161,7 @@ static int create_strip_zones(mddev_t *mddev) for (i = 1; i < conf->nr_strip_zones; i++) { zone = conf->strip_zone + i; - zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; + dev = conf->devlist + i * mddev->raid_disks; printk(KERN_INFO 
"raid0: zone %d\n", i); zone->dev_start = smallest->sectors; @@ -170,7 +170,7 @@ static int create_strip_zones(mddev_t *mddev) for (j=0; j<cnt; j++) { char b[BDEVNAME_SIZE]; - rdev = conf->strip_zone[0].dev[j]; + rdev = conf->devlist[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); if (rdev->sectors <= zone->dev_start) { @@ -178,7 +178,7 @@ static int create_strip_zones(mddev_t *mddev) continue; } printk(KERN_INFO " contained as device %d\n", c); - zone->dev[c] = rdev; + dev[c] = rdev; c++; if (!smallest || rdev->sectors < smallest->sectors) { smallest = rdev; @@ -383,7 +383,8 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) chunk = x; x = sector >> chunksect_bits; - tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; + tmp_dev = conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks + + sector_div(x, zone->nb_dev)]; } rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 124ba34c8eed..7b3605e570c0 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -6,7 +6,6 @@ struct strip_zone sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ }; struct raid0_private_data From a6b3deafe0c50e3e873e8ed5cc8abfcb25c05eff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Jun 2009 16:54:07 +1000 Subject: [PATCH 10/39] md: raid0: remove setting of segment boundary. This setting doesn't seem to make sense (half the chunk size??) and shouldn't be needed. The segment boundary exported by raid0 should simply be the minimum of the segment boundary of all component devices. And we already get that right. 
Signed-off-by: NeilBrown --- drivers/md/raid0.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index af0df78223b1..e2e9c1833336 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -261,12 +261,7 @@ static int raid0_run(mddev_t *mddev) printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); return -EINVAL; } - printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", - mdname(mddev), - mddev->chunk_size >> 9, - (mddev->chunk_size>>1)-1); blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); - blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); mddev->queue->queue_lock = &mddev->queue->__queue_lock; ret = create_strip_zones(mddev); From 070ec55d07157a3041f92654135c3c6e2eaaf901 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Jun 2009 16:54:21 +1000 Subject: [PATCH 11/39] md: remove mddev_to_conf "helper" macro Having a macro just to cast a void* isn't really helpful. I would much rather see that we are simply de-referencing ->private, than have to know what the macro does. So open code the macro everywhere and remove the pointless cast. 
Signed-off-by: NeilBrown --- drivers/md/linear.c | 12 ++++++------ drivers/md/linear.h | 2 -- drivers/md/multipath.c | 20 ++++++++++---------- drivers/md/multipath.h | 6 ------ drivers/md/raid0.c | 10 +++++----- drivers/md/raid0.h | 2 -- drivers/md/raid1.c | 38 +++++++++++++++++++------------------- drivers/md/raid1.h | 6 ------ drivers/md/raid10.c | 42 +++++++++++++++++++++--------------------- drivers/md/raid10.h | 6 ------ drivers/md/raid5.c | 36 ++++++++++++++++++------------------ drivers/md/raid5.h | 2 -- 12 files changed, 79 insertions(+), 103 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 64f1f3e046e0..31f8ec7131bd 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -28,7 +28,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { dev_info_t *hash; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf = mddev->private; sector_t idx = sector >> conf->sector_shift; /* @@ -79,7 +79,7 @@ static int linear_mergeable_bvec(struct request_queue *q, static void linear_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf = mddev->private; int i; for (i=0; i < mddev->raid_disks; i++) { @@ -91,7 +91,7 @@ static void linear_unplug(struct request_queue *q) static int linear_congested(void *data, int bits) { mddev_t *mddev = data; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf = mddev->private; int i, ret = 0; for (i = 0; i < mddev->raid_disks && !ret ; i++) { @@ -103,7 +103,7 @@ static int linear_congested(void *data, int bits) static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) { - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf = mddev->private; WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); @@ -294,7 +294,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) if (!newconf) return -ENOMEM; - newconf->prev = 
mddev_to_conf(mddev); + newconf->prev = mddev->private; mddev->private = newconf; mddev->raid_disks++; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); @@ -304,7 +304,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) static int linear_stop (mddev_t *mddev) { - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf = mddev->private; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ do { diff --git a/drivers/md/linear.h b/drivers/md/linear.h index bf8179587f95..76078f1cded0 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -24,6 +24,4 @@ struct linear_private_data typedef struct linear_private_data linear_conf_t; -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - #endif diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4ee31aa13c40..c1ca63f278a9 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) { unsigned long flags; mddev_t *mddev = mp_bh->mddev; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&mp_bh->retry_list, &conf->retry_list); @@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) { struct bio *bio = mp_bh->master_bio; - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + multipath_conf_t *conf = mp_bh->mddev->private; bio_endio(bio, err); mempool_free(mp_bh, conf->pool); @@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + multipath_conf_t *conf = mp_bh->mddev->private; mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; if (uptodate) @@ -107,7 +107,7 @@ 
static void multipath_end_request(struct bio *bio, int error) static void unplug_slaves(mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q) static int multipath_make_request (struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; const int rw = bio_data_dir(bio); @@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) static void multipath_status (struct seq_file *seq, mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, @@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) static int multipath_congested(void *data, int bits) { mddev_t *mddev = data; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits) */ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; if (conf->working_disks <= 1) { /* @@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev) struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; md_check_recovery(mddev); @@ -531,7 +531,7 @@ out: static int multipath_stop (mddev_t *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + multipath_conf_t *conf = mddev->private; md_unregister_thread(mddev->thread); mddev->thread = NULL; diff --git 
a/drivers/md/multipath.h b/drivers/md/multipath.h index 6fa70b400cda..d1c2a8d78395 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -18,12 +18,6 @@ struct multipath_private_data { typedef struct multipath_private_data multipath_conf_t; -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - /* * this is our 'private' 'collective' MULTIPATH buffer head. * it contains information about what kind of IO operations were started diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e2e9c1833336..77764dad1bcb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -26,7 +26,7 @@ static void raid0_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - raid0_conf_t *conf = mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; int i; @@ -40,7 +40,7 @@ static void raid0_unplug(struct request_queue *q) static int raid0_congested(void *data, int bits) { mddev_t *mddev = data; - raid0_conf_t *conf = mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; int i, ret = 0; @@ -294,7 +294,7 @@ static int raid0_run(mddev_t *mddev) static int raid0_stop(mddev_t *mddev) { - raid0_conf_t *conf = mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); @@ -327,7 +327,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) { mddev_t *mddev = q->queuedata; unsigned int sect_in_chunk, chunksect_bits, chunk_sects; - raid0_conf_t *conf = mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; struct strip_zone *zone; mdk_rdev_t *tmp_dev; sector_t chunk; @@ -406,7 +406,7 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) #ifdef MD_DEBUG int j, k, h; char b[BDEVNAME_SIZE]; - raid0_conf_t *conf 
= mddev_to_conf(mddev); + raid0_conf_t *conf = mddev->private; h = 0; for (j = 0; j < conf->nr_strip_zones; j++) { diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 7b3605e570c0..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -17,6 +17,4 @@ struct raid0_private_data typedef struct raid0_private_data raid0_conf_t; -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e23758b4a34e..5ea5bca53a5e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) static void free_r1bio(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio) static void put_buf(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; int i; for (i=0; iraid_disks; i++) { @@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) { unsigned long flags; mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); @@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) */ static inline void update_head_pos(int disk, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; conf->mirrors[disk].head_position = r1_bio->sector + (r1_bio->sectors); @@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror; - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; mirror = r1_bio->read_disk; /* @@ -309,7 +309,7 @@ static void 
raid1_end_write_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; struct bio *to_put = NULL; @@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q) static int raid1_congested(void *data, int bits) { mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -772,7 +772,7 @@ do_sync_io: static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; @@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, @@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int mirror=0; @@ -1248,7 +1248,7 @@ static void end_sync_write(struct 
bio *bio, int error) static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int disks = conf->raid_disks; struct bio *bio, *wbio; @@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev) r1bio_t *r1_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; @@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r1bio_t *r1_bio; struct bio *bio; sector_t max_sector, nr_sectors; @@ -2087,7 +2087,7 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; int behind_wait = 0; @@ -2155,7 +2155,7 @@ static int raid1_reshape(mddev_t *mddev) mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; mirror_info_t *newmirrors; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2, err; @@ -2252,7 +2252,7 @@ static int raid1_reshape(mddev_t *mddev) static void raid1_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; switch(state) { case 1: diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 1620eea3d57c..e87b84deff68 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -63,12 +63,6 @@ struct r1_private_data_s { typedef struct r1_private_data_s conf_t; -/* - * this is the only 
point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - /* * this is our 'private' RAID1 bio. * diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 750550c1166f..9a5beb4fd954 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) static void free_r10bio(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio) static void put_buf(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; mempool_free(r10_bio, conf->r10buf_pool); @@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio) { unsigned long flags; mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r10_bio->retry_list, &conf->retry_list); @@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) */ static inline void update_head_pos(int slot, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; conf->mirrors[r10_bio->devs[slot].devnum].head_position = r10_bio->devs[slot].addr + (r10_bio->sectors); @@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; @@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t 
*)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; for (slot = 0; slot < conf->copies; slot++) if (r10_bio->devs[slot].bio == bio) @@ -596,7 +596,7 @@ rb_out: static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q) static int raid10_congested(void *data, int bits) { mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, ret = 0; rcu_read_lock(); @@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf) static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; @@ -981,7 +981,7 @@ static int make_request(struct request_queue *q, struct bio * bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; if (conf->near_copies < conf->raid_disks) @@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -1215,7 +1215,7 @@ abort: static void end_sync_read(struct bio *bio, int error) { r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; int i,d; for (i=0; icopies; i++) @@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); mddev_t *mddev = r10_bio->mddev; - conf_t *conf = 
mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i,d; for (i = 0; i < conf->copies; i++) @@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error) */ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, first; struct bio *tbio, *fbio; @@ -1400,7 +1400,7 @@ done: static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, d; struct bio *bio, *wbio; @@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev) r10bio_t *r10_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R10BIO_IsSync, &r10_bio->state)) { sync_request_write(mddev, r10_bio); unplug = 1; @@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r10bio_t *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; @@ -2026,7 +2026,7 @@ static sector_t raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) { sector_t size; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; if (!raid_disks) raid_disks = mddev->raid_disks; @@ -2227,7 +2227,7 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; raise_barrier(conf, 0); lower_barrier(conf); @@ -2245,7 +2245,7 @@ static int stop(mddev_t *mddev) static void raid10_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t 
*conf = mddev->private; switch(state) { case 1: diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 244dbe507a54..59cd1efb8d30 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -61,12 +61,6 @@ struct r10_private_data_s { typedef struct r10_private_data_s conf_t; -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - /* * this is our 'private' RAID10 bio. * diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bef876698232..7fb97c65ad37 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3284,7 +3284,7 @@ static void activate_bit_delay(raid5_conf_t *conf) static void unplug_slaves(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int i; rcu_read_lock(); @@ -3308,7 +3308,7 @@ static void unplug_slaves(mddev_t *mddev) static void raid5_unplug_device(struct request_queue *q) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -3327,7 +3327,7 @@ static void raid5_unplug_device(struct request_queue *q) static int raid5_congested(void *data, int bits) { mddev_t *mddev = data; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; /* No difference between reads and writes. 
Just check * how busy the stripe_cache is @@ -3440,7 +3440,7 @@ static void raid5_align_endio(struct bio *bi, int error) bio_put(bi); mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev_to_conf(mddev); + conf = mddev->private; rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; @@ -3482,7 +3482,7 @@ static int bio_fits_rdev(struct bio *bi) static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3599,7 +3599,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; @@ -4129,7 +4129,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) static void raid5d(mddev_t *mddev) { struct stripe_head *sh; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; int handled; pr_debug("+++ raid5d active\n"); @@ -4185,7 +4185,7 @@ static void raid5d(mddev_t *mddev) static ssize_t raid5_show_stripe_cache_size(mddev_t *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->max_nr_stripes); else @@ -4195,7 +4195,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long new; int err; @@ -4233,7 +4233,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(mddev_t *mddev, char *page) { - raid5_conf_t 
*conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->bypass_threshold); else @@ -4243,7 +4243,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) static ssize_t raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; unsigned long new; if (len >= PAGE_SIZE) return -EINVAL; @@ -4267,7 +4267,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t stripe_cache_active_show(mddev_t *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (conf) return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); else @@ -4291,7 +4291,7 @@ static struct attribute_group raid5_attrs_group = { static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (!sectors) sectors = mddev->dev_sectors; @@ -4845,7 +4845,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) static int raid5_check_reshape(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && @@ -4890,7 +4890,7 @@ static int raid5_check_reshape(mddev_t *mddev) static int raid5_start_reshape(mddev_t *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; mdk_rdev_t *rdev; int spares = 0; int added_devices = 0; @@ -5022,7 +5022,7 @@ static void end_reshape(raid5_conf_t *conf) static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5061,7 +5061,7 @@ static void raid5_finish_reshape(mddev_t *mddev) static void raid5_quiesce(mddev_t *mddev, int 
state) { - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; switch(state) { case 2: /* resume for a suspend */ @@ -5157,7 +5157,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) * For larger arrays we record the new value - after validation * to be used by a reshape pass. */ - raid5_conf_t *conf = mddev_to_conf(mddev); + raid5_conf_t *conf = mddev->private; if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) return -EINVAL; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 52ba99954dec..1a25c9e252b4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -408,8 +408,6 @@ struct raid5_private_data { typedef struct raid5_private_data raid5_conf_t; -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - /* * Our supported algorithms */ From 45d4582f219619e368ea91ea1189085e1c5f1969 Mon Sep 17 00:00:00 2001 From: Sandeep K Sinha Date: Tue, 16 Jun 2009 16:55:26 +1000 Subject: [PATCH 12/39] md: Removal of hash table in linear raid Get rid of sector_div and hash table for linear raid and replace with a linear search in which_dev. The hash table adds a lot of complexity for little if any gain. Ultimately a binary search will be used which will have a smaller cache footprint, a similar number of memory accesses, and no divisions. 
Signed-off-by: Sandeep K Sinha Signed-off-by: NeilBrown --- drivers/md/linear.c | 93 ++------------------------------------------- drivers/md/linear.h | 5 --- 2 files changed, 3 insertions(+), 95 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 31f8ec7131bd..92bcd3dd52cc 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -29,13 +29,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { dev_info_t *hash; linear_conf_t *conf = mddev->private; - sector_t idx = sector >> conf->sector_shift; - /* - * sector_div(a,b) returns the remainer and sets a to a/b - */ - (void)sector_div(idx, conf->spacing); - hash = conf->hash_table[idx]; + hash = conf->disks; while (sector >= hash->num_sectors + hash->start_sector) hash++; @@ -114,11 +109,8 @@ static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; - dev_info_t **table; mdk_rdev_t *rdev; - int i, nb_zone, cnt; - sector_t min_sectors; - sector_t curr_sector; + int i, cnt; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), GFP_KERNEL); @@ -159,63 +151,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) goto out; } - min_sectors = conf->array_sectors; - sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *)); - if (min_sectors == 0) - min_sectors = 1; - - /* min_sectors is the minimum spacing that will fit the hash - * table in one PAGE. This may be much smaller than needed. 
- * We find the smallest non-terminal set of consecutive devices - * that is larger than min_sectors and use the size of that as - * the actual spacing - */ - conf->spacing = conf->array_sectors; - for (i=0; i < cnt-1 ; i++) { - sector_t tmp = 0; - int j; - for (j = i; j < cnt - 1 && tmp < min_sectors; j++) - tmp += conf->disks[j].num_sectors; - if (tmp >= min_sectors && tmp < conf->spacing) - conf->spacing = tmp; - } - - /* spacing may be too large for sector_div to work with, - * so we might need to pre-shift - */ - conf->sector_shift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - sector_t space = conf->spacing; - while (space > (sector_t)(~(u32)0)) { - space >>= 1; - conf->sector_shift++; - } - } /* - * This code was restructured to work around a gcc-2.95.3 internal - * compiler error. Alter it with care. - */ - { - sector_t sz; - unsigned round; - unsigned long base; - - sz = conf->array_sectors >> conf->sector_shift; - sz += 1; /* force round-up */ - base = conf->spacing >> conf->sector_shift; - round = sector_div(sz, base); - nb_zone = sz + (round ? 1 : 0); - } - BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); - - conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, - GFP_KERNEL); - if (!conf->hash_table) - goto out; - - /* - * Here we generate the linear hash table - * First calculate the device offsets. + * Here we calculate the device offsets. 
*/ conf->disks[0].start_sector = 0; for (i = 1; i < raid_disks; i++) @@ -223,29 +160,6 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) conf->disks[i-1].start_sector + conf->disks[i-1].num_sectors; - table = conf->hash_table; - i = 0; - for (curr_sector = 0; - curr_sector < conf->array_sectors; - curr_sector += conf->spacing) { - - while (i < raid_disks-1 && - curr_sector >= conf->disks[i+1].start_sector) - i++; - - *table ++ = conf->disks + i; - } - - if (conf->sector_shift) { - conf->spacing >>= conf->sector_shift; - /* round spacing up so that when we divide by it, - * we err on the side of "too-low", which is safest. - */ - conf->spacing++; - } - - BUG_ON(table - conf->hash_table > nb_zone); - return conf; out: @@ -309,7 +223,6 @@ static int linear_stop (mddev_t *mddev) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ do { linear_conf_t *t = conf->prev; - kfree(conf->hash_table); kfree(conf); conf = t; } while (conf); diff --git a/drivers/md/linear.h b/drivers/md/linear.h index 76078f1cded0..721a878403d1 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -12,12 +12,7 @@ typedef struct dev_info dev_info_t; struct linear_private_data { struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ dev_info_t disks[0]; }; From 4db7cdc859f56ecf0a186e0cfb238b5bb3af2efb Mon Sep 17 00:00:00 2001 From: Sandeep K Sinha Date: Tue, 16 Jun 2009 16:56:13 +1000 Subject: [PATCH 13/39] md: Removing num_sector and replacing start_sector with end_sector Remove num_sectors from dev_info and replace start_sector with end_sector. This makes a lot of comparisons much simpler. 
Signed-off-by: Sandeep K Sinha Signed-off-by: NeilBrown --- drivers/md/linear.c | 37 ++++++++++++++++++------------------- drivers/md/linear.h | 3 +-- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 92bcd3dd52cc..529a3d37e3fe 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -32,7 +32,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) hash = conf->disks; - while (sector >= hash->num_sectors + hash->start_sector) + while (sector >= hash->end_sector) hash++; return hash; } @@ -55,7 +55,7 @@ static int linear_mergeable_bvec(struct request_queue *q, sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); - maxsectors = dev0->num_sectors - (sector - dev0->start_sector); + maxsectors = dev0->end_sector - sector; if (maxsectors < bio_sectors) maxsectors = 0; @@ -141,10 +141,9 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->sectors; conf->array_sectors += rdev->sectors; - cnt++; + } if (cnt != raid_disks) { printk("linear: not enough drives present. Aborting!\n"); @@ -154,11 +153,12 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) /* * Here we calculate the device offsets. 
*/ - conf->disks[0].start_sector = 0; + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + for (i = 1; i < raid_disks; i++) - conf->disks[i].start_sector = - conf->disks[i-1].start_sector + - conf->disks[i-1].num_sectors; + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; return conf; @@ -235,6 +235,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; + sector_t start_sector; int cpu; if (unlikely(bio_barrier(bio))) { @@ -249,32 +250,30 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) part_stat_unlock(); tmp_dev = which_dev(mddev, bio->bi_sector); - - if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + - tmp_dev->start_sector) - || (bio->bi_sector < - tmp_dev->start_sector))) { + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + + if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) + || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; printk("linear_make_request: Sector %llu out of bounds on " "dev %s: %llu sectors, offset %llu\n", (unsigned long long)bio->bi_sector, bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->num_sectors, - (unsigned long long)tmp_dev->start_sector); + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); bio_io_error(bio); return 0; } if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->start_sector + tmp_dev->num_sectors)) { + tmp_dev->end_sector)) { /* This bio crosses a device boundary, so we have to * split it. 
*/ struct bio_pair *bp; bp = bio_split(bio, - tmp_dev->start_sector + tmp_dev->num_sectors - - bio->bi_sector); + tmp_dev->end_sector - bio->bi_sector); if (linear_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -285,7 +284,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) } bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_sector = bio->bi_sector - tmp_dev->start_sector + bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; return 1; diff --git a/drivers/md/linear.h b/drivers/md/linear.h index 721a878403d1..599e5c1bbb01 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -3,8 +3,7 @@ struct dev_info { mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; + sector_t end_sector; }; typedef struct dev_info dev_info_t; From aece3d1f40879759f641dfbfdbb9e2593adeb43c Mon Sep 17 00:00:00 2001 From: Sandeep K Sinha Date: Tue, 16 Jun 2009 16:57:08 +1000 Subject: [PATCH 14/39] md: Binary search in linear raid Replace the linear search with binary search in which_dev. 
Signed-off-by: Sandeep K Sinha Signed-off-by: NeilBrown --- drivers/md/linear.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 529a3d37e3fe..9b02a73fbc6b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -27,14 +27,26 @@ */ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { - dev_info_t *hash; + int lo, mid, hi; linear_conf_t *conf = mddev->private; - hash = conf->disks; + lo = 0; + hi = mddev->raid_disks - 1; - while (sector >= hash->end_sector) - hash++; - return hash; + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; } /** From 1b9614291eb319fad96de45392eb4452ad39f0ee Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 16:57:40 +1000 Subject: [PATCH 15/39] md: have raid0 compile with MD_DEBUG on Because of the removal of the device list from the strips raid0 did not compile with MD_DEBUG flag on Signed-off-by: NeilBrown --- drivers/md/raid0.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 77764dad1bcb..d8692fc17963 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -400,7 +400,7 @@ bad_map: return 0; } -static void raid0_status (struct seq_file *seq, mddev_t *mddev) +static void raid0_status(struct seq_file *seq, mddev_t *mddev) { #undef MD_DEBUG #ifdef MD_DEBUG @@ -408,18 +408,24 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev->private; + sector_t zone_size; + sector_t zone_start = 0; h = 0; + for (j = 0; j < conf->nr_strip_zones; j++) { seq_printf(seq, " z%d", j); seq_printf(seq, "=["); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) seq_printf(seq, "%s/", bdevname( - conf->strip_zone[j].dev[k]->bdev,b)); + 
conf->devlist[j*mddev->raid_disks + k] + ->bdev, b)); - seq_printf(seq, "] ze=%d ds=%d s=%d\n", - conf->strip_zone[j].zone_end, - conf->strip_zone[j].dev_start, - conf->strip_zone[j].sectors); + zone_size = conf->strip_zone[j].zone_end - zone_start; + seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; } #endif seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); From 46994191ae8fdf1cbcc1f29282576b269a638c69 Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:00:54 +1000 Subject: [PATCH 16/39] md: have raid0 report its formation Report to the user what are the raid zones Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/raid0.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d8692fc17963..62fde23bf281 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -52,6 +52,38 @@ static int raid0_congested(void *data, int bits) return ret; } +/* + * inform the user of the raid configuration +*/ +static void dump_zones(mddev_t *mddev) +{ + int j, k, h; + sector_t zone_size = 0; + sector_t zone_start = 0; + char b[BDEVNAME_SIZE]; + raid0_conf_t *conf = mddev->private; + printk(KERN_INFO "******* %s configuration *********\n", + mdname(mddev)); + h = 0; + for (j = 0; j < conf->nr_strip_zones; j++) { + printk(KERN_INFO "zone%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + printk("%s/", + bdevname(conf->devlist[j*mddev->raid_disks + + k]->bdev, b)); + printk("]\n"); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + printk(KERN_INFO " zone offset=%llukb " + "device offset=%llukb size=%llukb\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = 
conf->strip_zone[j].zone_end; + } + printk(KERN_INFO "**********************************\n\n"); +} + static int create_strip_zones(mddev_t *mddev) { int i, c, j, err; @@ -289,6 +321,7 @@ static int raid0_run(mddev_t *mddev) } blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); + dump_zones(mddev); return 0; } From 92e59b6ba21845fadd2cce725010a9351740b76e Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:00:57 +1000 Subject: [PATCH 17/39] md: raid0: chunk size check in raid0_run have raid0 check chunk size in run method instead of in md. This is part of a series moving the checks from common code to the personalities where they belong. hardsect is short and chunksize is an int, so it is safe to use %. Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/raid0.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 62fde23bf281..39936a217f95 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -234,6 +234,16 @@ static int create_strip_zones(mddev_t *mddev) mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if (mddev->chunk_size % queue_logical_block_size(mddev->queue)) { + printk(KERN_ERR "%s chunk_size of %d not valid\n", + mdname(mddev), + mddev->chunk_size); + goto abort; + } printk(KERN_INFO "raid0: done.\n"); mddev->private = conf; return 0; @@ -289,8 +299,9 @@ static int raid0_run(mddev_t *mddev) { int ret; - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); + if (mddev->chunk_size == 0 || + !is_power_of_2(mddev->chunk_size)) { + printk(KERN_ERR "md/raid0: chunk size must be a power of 2.\n"); return -EINVAL; } blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); From 
964e7913b0d25b988e27a7cd9378bc55cc572bb4 Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:01:22 +1000 Subject: [PATCH 18/39] md: raid10: chunk size check in run have raid10 check chunk size in run method instead of in md Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/raid10.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9a5beb4fd954..06bef686f91b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev) int nc, fc, fo; sector_t stride, size; - if (mddev->chunk_size < PAGE_SIZE) { + if (mddev->chunk_size < PAGE_SIZE || + !is_power_of_2(mddev->chunk_size)) { printk(KERN_ERR "md/raid10: chunk size must be " - "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); + "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); return -EINVAL; } From 740da44918680a0c72411ae4ccdd1861069afcc4 Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:01:36 +1000 Subject: [PATCH 19/39] md: raid5: chunk size check in setup_conf have raid5 check chunk size in run/reshape method instead of in md Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7fb97c65ad37..be4e62f611bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4336,7 +4336,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) return ERR_PTR(-EINVAL); } - if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk)) { printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", mddev->new_chunk, mdname(mddev)); return ERR_PTR(-EINVAL); From 2ac06c3332898103210b478c5a17c20e28929287 Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:01:42 +1000 Subject: [PATCH 20/39] md: prepare for 
non-power-of-two chunk sizes Remove chunk size check from md as this is now performed in the run function in each personality. Replace chunk size power 2 code calculations by a regular division. Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/md.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 20f6ac338349..a02bde70874b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -444,8 +444,11 @@ static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) { sector_t num_sectors = rdev->sb_start; - if (chunk_size) - num_sectors &= ~((sector_t)chunk_size/512 - 1); + if (chunk_size) { + unsigned chunk_sects = chunk_size>>9; + sector_div(num_sectors, chunk_sects); + num_sectors *= chunk_sects; + } return num_sectors; } @@ -1248,8 +1251,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le32_to_cpu(sb->chunksize)) - rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); + if (le32_to_cpu(sb->chunksize)) { + int chunk_sects = le32_to_cpu(sb->chunksize); + sector_t chunks = rdev->sectors; + sector_div(chunks, chunk_sects); + rdev->sectors = chunks * chunk_sects; + } if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; @@ -3528,7 +3535,8 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) /* Must be a multiple of chunk_size */ if (mddev->chunk_size) { - if (min & (sector_t)((mddev->chunk_size>>9)-1)) + sector_t temp = min; + if (sector_div(temp, (mddev->chunk_size>>9))) return -EINVAL; } mddev->resync_min = min; @@ -3565,7 +3573,8 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) /* Must be a multiple of chunk_size */ if (mddev->chunk_size) { - if (max & (sector_t)((mddev->chunk_size>>9)-1)) + sector_t temp = max; + if (sector_div(temp, (mddev->chunk_size>>9))) return 
-EINVAL; } mddev->resync_max = max; @@ -4006,14 +4015,6 @@ static int do_md_run(mddev_t * mddev) chunk_size, MAX_CHUNK_SIZE); return -EINVAL; } - /* - * chunk-size has to be a power of 2 - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); - return -EINVAL; - } - /* devices must have minimum size of one chunk */ list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) From fbb704efb784e2c8418e34dc3013af76bdd58101 Mon Sep 17 00:00:00 2001 From: raz ben yehuda Date: Tue, 16 Jun 2009 17:02:05 +1000 Subject: [PATCH 21/39] md: raid0 :Enables chunk size other than powers of 2. Maintain two flows, one for pow2 chunk sizes (which uses masks and shift), and a flow for the general case (which uses sector_div). This is for the sake of performance. - introduce map_sector and is_io_in_chunk_boundary to encapsulate those two flows better for raid0_make_request - fix blk_mergeable to support the two flows. Signed-off-by: raziebe@gmail.com Signed-off-by: NeilBrown --- drivers/md/raid0.c | 107 ++++++++++++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 30 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 39936a217f95..7cd2671cc794 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -273,7 +273,12 @@ static int raid0_mergeable_bvec(struct request_queue *q, unsigned int chunk_sectors = mddev->chunk_size >> 9; unsigned int bio_sectors = bvm->bi_size >> 9; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (is_power_of_2(mddev->chunk_size)) + max = (chunk_sectors - ((sector & (chunk_sectors-1)) + + bio_sectors)) << 9; + else + max = (chunk_sectors - (sector_div(sector, chunk_sectors) + + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max <= biovec->bv_len && bio_sectors == 0) return biovec->bv_len; @@ -299,9 +304,8 @@ static int raid0_run(mddev_t *mddev) { int ret; - 
if (mddev->chunk_size == 0 || - !is_power_of_2(mddev->chunk_size)) { - printk(KERN_ERR "md/raid0: chunk size must be a power of 2.\n"); + if (mddev->chunk_size == 0) { + printk(KERN_ERR "md/raid0: chunk size must be set.\n"); return -EINVAL; } blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); @@ -367,15 +371,65 @@ static struct strip_zone *find_zone(struct raid0_private_data *conf, BUG(); } -static int raid0_make_request (struct request_queue *q, struct bio *bio) +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of perfromance +*/ +static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + raid0_conf_t *conf = mddev->private; + unsigned int chunk_sects = mddev->chunk_size >> 9; + + if (is_power_of_2(mddev->chunk_size)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks + + sector_div(sector, zone->nb_dev)]; +} + +/* + * Is io distribute over 1 or more chunks ? 
+*/ +static inline int is_io_in_chunk_boundary(mddev_t *mddev, + unsigned int chunk_sects, struct bio *bio) +{ + if (likely(is_power_of_2(mddev->chunk_size))) { + return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + + (bio->bi_size >> 9)); + } else{ + sector_t sector = bio->bi_sector; + return chunk_sects >= (sector_div(sector, chunk_sects) + + (bio->bi_size >> 9)); + } +} + +static int raid0_make_request(struct request_queue *q, struct bio *bio) { mddev_t *mddev = q->queuedata; - unsigned int sect_in_chunk, chunksect_bits, chunk_sects; - raid0_conf_t *conf = mddev->private; + unsigned int chunk_sects; + sector_t sector_offset; struct strip_zone *zone; mdk_rdev_t *tmp_dev; - sector_t chunk; - sector_t sector, rsect, sector_offset; const int rw = bio_data_dir(bio); int cpu; @@ -391,10 +445,8 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) part_stat_unlock(); chunk_sects = mddev->chunk_size >> 9; - chunksect_bits = ffz(~chunk_sects); - sector = bio->bi_sector; - - if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { + if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { + sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ if (bio->bi_vcnt != 1 || @@ -403,7 +455,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
*/ - bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); + if (likely(is_power_of_2(mddev->chunk_size))) + bp = bio_split(bio, chunk_sects - (sector & + (chunk_sects-1))); + else + bp = bio_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); if (raid0_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); if (raid0_make_request(q, &bp->bio2)) @@ -412,24 +469,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) bio_pair_release(bp); return 0; } - sector_offset = sector; - zone = find_zone(conf, &sector_offset); - sect_in_chunk = bio->bi_sector & (chunk_sects - 1); - { - sector_t x = sector_offset >> chunksect_bits; - sector_div(x, zone->nb_dev); - chunk = x; - - x = sector >> chunksect_bits; - tmp_dev = conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks - + sector_div(x, zone->nb_dev)]; - } - rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; - + sector_offset = bio->bi_sector; + zone = find_zone(mddev->private, &sector_offset); + tmp_dev = map_sector(mddev, zone, bio->bi_sector, + &sector_offset); bio->bi_bdev = tmp_dev->bdev; - bio->bi_sector = rsect + tmp_dev->data_offset; - + bio->bi_sector = sector_offset + zone->dev_start + + tmp_dev->data_offset; /* * Let the main block layer submit the IO and resolve recursion: */ From 9d8f0363623b3da12c43007cf77f5e1a4e8a5964 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:45:01 +1000 Subject: [PATCH 22/39] md: Make mddev->chunk_size sector-based. This patch renames the chunk_size field to chunk_sectors with the implied change of semantics. Since is_power_of_2(chunk_size) = is_power_of_2(chunk_sectors << 9) = is_power_of_2(chunk_sectors) these bits don't need an adjustment for the shift. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/linear.c | 2 +- drivers/md/md.c | 51 +++++++++++++++++++++++---------------------- drivers/md/md.h | 2 +- drivers/md/raid0.c | 27 ++++++++++++------------ drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 15 ++++++------- drivers/md/raid5.c | 41 +++++++++++++++++++----------------- 7 files changed, 74 insertions(+), 68 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 9b02a73fbc6b..9f7cec42dd8e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -305,7 +305,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) static void linear_status (struct seq_file *seq, mddev_t *mddev) { - seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } diff --git a/drivers/md/md.c b/drivers/md/md.c index a02bde70874b..abcc0fef30e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -869,7 +869,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; - mddev->chunk_size = sb->chunk_size; + mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; @@ -892,7 +892,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; } if (sb->state & (1<recovery_cp = 0; sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; + sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_file == NULL) sb->state |= (1<major_version = 1; mddev->patch_version = 0; mddev->external = 0; - mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = 
le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); @@ -1310,7 +1310,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; } } else if (mddev->pers == NULL) { @@ -1382,7 +1382,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); - sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -2753,7 +2753,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; module_put(pers->owner); @@ -2771,7 +2771,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; - mddev->chunk_size = mddev->new_chunk; + mddev->chunk_sectors = mddev->new_chunk >> 9; mddev->delta_disks = 0; pers->run(mddev); mddev_resume(mddev); @@ -2864,10 +2864,10 @@ static ssize_t chunk_size_show(mddev_t *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_size != mddev->new_chunk) + mddev->chunk_sectors << 9 != mddev->new_chunk) return sprintf(page, "%d (%d)\n", mddev->new_chunk, - mddev->chunk_size); - return sprintf(page, "%d\n", mddev->chunk_size); + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t @@ -2889,7 +2889,7 @@ 
chunk_size_store(mddev_t *mddev, const char *buf, size_t len) } else { mddev->new_chunk = n; if (mddev->reshape_position == MaxSector) - mddev->chunk_size = n; + mddev->chunk_sectors = n >> 9; } return len; } @@ -3534,9 +3534,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { + if (mddev->chunk_sectors) { sector_t temp = min; - if (sector_div(temp, (mddev->chunk_size>>9))) + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_min = min; @@ -3572,9 +3572,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { + if (mddev->chunk_sectors) { sector_t temp = max; - if (sector_div(temp, (mddev->chunk_size>>9))) + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_max = max; @@ -3665,7 +3665,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; return len; } @@ -4007,7 +4007,7 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_size; + chunk_size = mddev->chunk_sectors << 9; if (chunk_size) { if (chunk_size > MAX_CHUNK_SIZE) { @@ -4406,7 +4406,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->flags = 0; mddev->ro = 0; mddev->metadata_type[0] = 0; - mddev->chunk_size = 0; + mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; mddev->max_disks = 0; @@ -4619,7 +4619,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.spare_disks = spare; info.layout = mddev->layout; - info.chunk_size = mddev->chunk_size; + info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4844,7 +4844,8 @@ static int 
add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = calc_num_sectors(rdev, + mddev->chunk_sectors << 9); err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4914,7 +4915,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_sectors << 9); if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -5063,7 +5064,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; - mddev->chunk_size = info->chunk_size; + mddev->chunk_sectors = info->chunk_size >> 9; mddev->max_disks = MD_SB_DISKS; @@ -5082,7 +5083,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; @@ -5192,7 +5193,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size || + mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) diff --git a/drivers/md/md.h b/drivers/md/md.h index 8227ab909d44..5d78830043d0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -145,7 +145,7 @@ struct mddev_s int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ - int chunk_size; + int chunk_sectors; time_t ctime, utime; int level, layout; char clevel[16]; 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7cd2671cc794..f20b18ff7969 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -238,10 +238,10 @@ static int create_strip_zones(mddev_t *mddev) * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size */ - if (mddev->chunk_size % queue_logical_block_size(mddev->queue)) { + if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { printk(KERN_ERR "%s chunk_size of %d not valid\n", mdname(mddev), - mddev->chunk_size); + mddev->chunk_sectors << 9); goto abort; } printk(KERN_INFO "raid0: done.\n"); @@ -270,10 +270,10 @@ static int raid0_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - if (is_power_of_2(mddev->chunk_size)) + if (is_power_of_2(mddev->chunk_sectors)) max = (chunk_sectors - ((sector & (chunk_sectors-1)) + bio_sectors)) << 9; else @@ -304,11 +304,11 @@ static int raid0_run(mddev_t *mddev) { int ret; - if (mddev->chunk_size == 0) { + if (mddev->chunk_sectors == 0) { printk(KERN_ERR "md/raid0: chunk size must be set.\n"); return -EINVAL; } - blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); + blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); mddev->queue->queue_lock = &mddev->queue->__queue_lock; ret = create_strip_zones(mddev); @@ -330,7 +330,8 @@ static int raid0_run(mddev_t *mddev) * chunksize should be used in that case. 
*/ { - int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; + int stripe = mddev->raid_disks * + (mddev->chunk_sectors << 9) / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } @@ -381,9 +382,9 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, unsigned int sect_in_chunk; sector_t chunk; raid0_conf_t *conf = mddev->private; - unsigned int chunk_sects = mddev->chunk_size >> 9; + unsigned int chunk_sects = mddev->chunk_sectors; - if (is_power_of_2(mddev->chunk_size)) { + if (is_power_of_2(mddev->chunk_sectors)) { int chunksect_bits = ffz(~chunk_sects); /* find the sector offset inside the chunk */ sect_in_chunk = sector & (chunk_sects - 1); @@ -413,7 +414,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, static inline int is_io_in_chunk_boundary(mddev_t *mddev, unsigned int chunk_sects, struct bio *bio) { - if (likely(is_power_of_2(mddev->chunk_size))) { + if (likely(is_power_of_2(mddev->chunk_sectors))) { return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + (bio->bi_size >> 9)); } else{ @@ -444,7 +445,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) bio_sectors(bio)); part_stat_unlock(); - chunk_sects = mddev->chunk_size >> 9; + chunk_sects = mddev->chunk_sectors; if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { sector_t sector = bio->bi_sector; struct bio_pair *bp; @@ -455,7 +456,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
*/ - if (likely(is_power_of_2(mddev->chunk_size))) + if (likely(is_power_of_2(mddev->chunk_sectors))) bp = bio_split(bio, chunk_sects - (sector & (chunk_sects-1))); else @@ -519,7 +520,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) zone_start = conf->strip_zone[j].zone_end; } #endif - seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); + seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); return; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5ea5bca53a5e..388635735ae5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2161,10 +2161,10 @@ static int raid1_reshape(mddev_t *mddev) int d, d2, err; /* Cannot change chunk_size, layout, or level */ - if (mddev->chunk_size != mddev->new_chunk || + if (mddev->chunk_sectors << 9 != mddev->new_chunk || mddev->layout != mddev->new_layout || mddev->level != mddev->new_level) { - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk = mddev->chunk_sectors << 9; mddev->new_layout = mddev->layout; mddev->new_level = mddev->level; return -EINVAL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 06bef686f91b..30029a312cf5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@ -985,7 +985,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; if (conf->near_copies < conf->raid_disks) - seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); if (conf->near_copies > 1) seq_printf(seq, " %d near-copies", conf->near_copies); if (conf->far_copies > 1) { @@ -2050,8 +2050,8 @@ static 
int run(mddev_t *mddev) int nc, fc, fo; sector_t stride, size; - if (mddev->chunk_size < PAGE_SIZE || - !is_power_of_2(mddev->chunk_size)) { + if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->chunk_sectors)) { printk(KERN_ERR "md/raid10: chunk size must be " "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); return -EINVAL; @@ -2096,8 +2096,8 @@ static int run(mddev_t *mddev) conf->far_copies = fc; conf->copies = nc*fc; conf->far_offset = fo; - conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; - conf->chunk_shift = ffz(~mddev->chunk_size) - 9; + conf->chunk_mask = mddev->chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->chunk_sectors); size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; @@ -2205,7 +2205,8 @@ static int run(mddev_t *mddev) * maybe... */ { - int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); + int stripe = conf->raid_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index be4e62f611bc..1e4fd5e8bfdd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3352,13 +3352,13 @@ static int raid5_mergeable_bvec(struct request_queue *q, mddev_t *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ - if (mddev->new_chunk < mddev->chunk_size) + if (mddev->new_chunk < mddev->chunk_sectors << 9) chunk_sectors = mddev->new_chunk >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; @@ -3372,10 +3372,10 @@ static int 
raid5_mergeable_bvec(struct request_queue *q, static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) { sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio->bi_size >> 9; - if (mddev->new_chunk < mddev->chunk_size) + if (mddev->new_chunk < mddev->chunk_sectors << 9) chunk_sectors = mddev->new_chunk >> 9; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); @@ -3791,10 +3791,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk > mddev->chunk_size) + if (mddev->new_chunk > mddev->chunk_sectors << 9) reshape_sectors = mddev->new_chunk / 512; else - reshape_sectors = mddev->chunk_size / 512; + reshape_sectors = mddev->chunk_sectors; /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should @@ -4303,7 +4303,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) raid_disks = conf->previous_raid_disks; } - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->chunk_sectors - 1); sectors &= ~((sector_t)mddev->new_chunk/512 - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -4412,7 +4412,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { - conf->prev_chunk = mddev->chunk_size; + conf->prev_chunk = mddev->chunk_sectors << 9; conf->prev_algo = mddev->layout; } @@ -4484,7 +4484,7 @@ static int run(mddev_t *mddev) } /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* + sector_div(here_old, mddev->chunk_sectors * (old_disks-max_degraded)); /* here_old is the first 
stripe that we might need to read * from */ @@ -4499,7 +4499,7 @@ static int run(mddev_t *mddev) } else { BUG_ON(mddev->level != mddev->new_level); BUG_ON(mddev->layout != mddev->new_layout); - BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->chunk_sectors << 9 != mddev->new_chunk); BUG_ON(mddev->delta_disks != 0); } @@ -4533,7 +4533,7 @@ static int run(mddev_t *mddev) } /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > 0 && @@ -4582,7 +4582,7 @@ static int run(mddev_t *mddev) { int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * - (mddev->chunk_size / PAGE_SIZE); + ((mddev->chunk_sectors << 9) / PAGE_SIZE); if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -4679,7 +4679,8 @@ static void status(struct seq_file *seq, mddev_t *mddev) raid5_conf_t *conf = (raid5_conf_t *) mddev->private; int i; - seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, + mddev->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -4827,7 +4828,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. 
*/ - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->chunk_sectors - 1); md_set_array_sectors(mddev, raid5_size(mddev, sectors, mddev->raid_disks)); if (mddev->array_sectors > @@ -4850,7 +4851,7 @@ static int raid5_check_reshape(mddev_t *mddev) if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && - mddev->new_chunk == mddev->chunk_size) + mddev->new_chunk == mddev->chunk_sectors << 9) return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ @@ -4878,10 +4879,11 @@ static int raid5_check_reshape(mddev_t *mddev) * If the chunk size is greater, user-space should request more * stripe_heads first. */ - if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes || (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (max(mddev->chunk_size, mddev->new_chunk) + (max(mddev->chunk_sectors << 9, mddev->new_chunk) / STRIPE_SIZE)*4); return -ENOSPC; } @@ -5054,7 +5056,7 @@ static void raid5_finish_reshape(mddev_t *mddev) raid5_remove_disk(mddev, d); } mddev->layout = conf->algorithm; - mddev->chunk_size = conf->chunk_size; + mddev->chunk_sectors = conf->chunk_size >> 9; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; } @@ -5183,7 +5185,8 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) } if (new_chunk > 0) { conf->chunk_size = new_chunk; - mddev->chunk_size = mddev->new_chunk = new_chunk; + mddev->new_chunk = new_chunk; + mddev->chunk_sectors = new_chunk >> 9; } set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); From 664e7c413f1e90eceb0b2596dd73a0832faec058 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:45:27 +1000 Subject: [PATCH 23/39] md: Convert mddev->new_chunk to sectors. 
A straight-forward conversion which gets rid of some multiplications/divisions/shifts. The patch also introduces a couple of new ones, most of which are due to conf->chunk_size still being represented in bytes. This will be cleaned up in subsequent patches. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/md.c | 29 +++++++++++++++-------------- drivers/md/md.h | 3 ++- drivers/md/raid1.c | 4 ++-- drivers/md/raid5.c | 45 ++++++++++++++++++++++++--------------------- 4 files changed, 43 insertions(+), 38 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index abcc0fef30e3..f996d8342a85 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -886,13 +886,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; - mddev->new_chunk = sb->new_chunk; + mddev->new_chunk_sectors = sb->new_chunk >> 9; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_sectors << 9; + mddev->new_chunk_sectors = mddev->chunk_sectors; } if (sb->state & (1<new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk; + sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) @@ -1304,13 +1304,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_sectors << 9; + 
mddev->new_chunk_sectors = mddev->chunk_sectors; } } else if (mddev->pers == NULL) { @@ -1409,7 +1409,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); } max_dev = 0; @@ -2753,7 +2753,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_sectors << 9; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; module_put(pers->owner); @@ -2771,7 +2771,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk >> 9; + mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; pers->run(mddev); mddev_resume(mddev); @@ -2864,8 +2864,9 @@ static ssize_t chunk_size_show(mddev_t *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_sectors << 9 != mddev->new_chunk) - return sprintf(page, "%d (%d)\n", mddev->new_chunk, + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, mddev->chunk_sectors << 9); return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } @@ -2887,7 +2888,7 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; } else { - mddev->new_chunk = n; + mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } @@ -3665,7 +3666,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->new_level = mddev->level; 
mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_sectors << 9; + mddev->new_chunk_sectors = mddev->chunk_sectors; return len; } @@ -4414,7 +4415,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->delta_disks = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; - mddev->new_chunk = 0; + mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; mddev->resync_mismatches = 0; mddev->suspend_lo = mddev->suspend_hi = 0; @@ -5083,7 +5084,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_sectors << 9; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 5d78830043d0..e0a2b8e3985d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -166,7 +166,8 @@ struct mddev_s * If reshape_position is MaxSector, then no reshape is happening (yet). 
*/ sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; + int delta_disks, new_level, new_layout; + int new_chunk_sectors; struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 388635735ae5..12f8f34f17ae 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2161,10 +2161,10 @@ static int raid1_reshape(mddev_t *mddev) int d, d2, err; /* Cannot change chunk_size, layout, or level */ - if (mddev->chunk_sectors << 9 != mddev->new_chunk || + if (mddev->chunk_sectors != mddev->new_chunk_sectors || mddev->layout != mddev->new_layout || mddev->level != mddev->new_level) { - mddev->new_chunk = mddev->chunk_sectors << 9; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->new_level = mddev->level; return -EINVAL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1e4fd5e8bfdd..bc3564cfbba0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3358,8 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ - if (mddev->new_chunk < mddev->chunk_sectors << 9) - chunk_sectors = mddev->new_chunk >> 9; + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3375,8 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio->bi_size >> 9; - if (mddev->new_chunk < mddev->chunk_sectors << 9) - chunk_sectors = mddev->new_chunk >> 9; + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; return chunk_sectors >= ((sector & (chunk_sectors - 1)) 
+ bio_sectors); } @@ -3791,8 +3791,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk > mddev->chunk_sectors << 9) - reshape_sectors = mddev->new_chunk / 512; + if (mddev->new_chunk_sectors > mddev->chunk_sectors) + reshape_sectors = mddev->new_chunk_sectors; else reshape_sectors = mddev->chunk_sectors; @@ -4304,7 +4304,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) } sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk/512 - 1); + sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -4336,10 +4336,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) return ERR_PTR(-EINVAL); } - if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE || - !is_power_of_2(mddev->new_chunk)) { + if (!mddev->new_chunk_sectors || + (mddev->new_chunk_sectors << 9) % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk_sectors)) { printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - mddev->new_chunk, mdname(mddev)); + mddev->new_chunk_sectors << 9, mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -4402,7 +4403,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->fullsync = 1; } - conf->chunk_size = mddev->new_chunk; + conf->chunk_size = mddev->new_chunk_sectors << 9; conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; @@ -4476,7 +4477,7 @@ static int run(mddev_t *mddev) * geometry. 
*/ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->new_chunk>>9)* + if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { printk(KERN_ERR "raid5: reshape_position not " "on a stripe boundary\n"); @@ -4499,7 +4500,7 @@ static int run(mddev_t *mddev) } else { BUG_ON(mddev->level != mddev->new_level); BUG_ON(mddev->layout != mddev->new_layout); - BUG_ON(mddev->chunk_sectors << 9 != mddev->new_chunk); + BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); BUG_ON(mddev->delta_disks != 0); } @@ -4851,7 +4852,7 @@ static int raid5_check_reshape(mddev_t *mddev) if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && - mddev->new_chunk == mddev->chunk_sectors << 9) + mddev->new_chunk_sectors == mddev->chunk_sectors) return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ @@ -4881,9 +4882,11 @@ static int raid5_check_reshape(mddev_t *mddev) */ if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->max_nr_stripes || - (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. 
Needed %lu\n", - (max(mddev->chunk_sectors << 9, mddev->new_chunk) + (max(mddev->chunk_sectors << 9, + mddev->new_chunk_sectors << 9) / STRIPE_SIZE)*4); return -ENOSPC; } @@ -4929,7 +4932,7 @@ static int raid5_start_reshape(mddev_t *mddev) conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; conf->prev_chunk = conf->chunk_size; - conf->chunk_size = mddev->new_chunk; + conf->chunk_size = mddev->new_chunk_sectors << 9; conf->prev_algo = conf->algorithm; conf->algorithm = mddev->new_layout; if (mddev->delta_disks < 0) @@ -5114,7 +5117,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev) mddev->new_level = 5; mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; - mddev->new_chunk = chunksect << 9; + mddev->new_chunk_sectors = chunksect; return setup_conf(mddev); } @@ -5185,7 +5188,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) } if (new_chunk > 0) { conf->chunk_size = new_chunk; - mddev->new_chunk = new_chunk; + mddev->new_chunk_sectors = new_chunk >> 9; mddev->chunk_sectors = new_chunk >> 9; } set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -5194,7 +5197,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) if (new_layout >= 0) mddev->new_layout = new_layout; if (new_chunk > 0) - mddev->new_chunk = new_chunk; + mddev->new_chunk_sectors = new_chunk >> 9; } return 0; } @@ -5219,7 +5222,7 @@ static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) if (new_layout >= 0) mddev->new_layout = new_layout; if (new_chunk > 0) - mddev->new_chunk = new_chunk; + mddev->new_chunk_sectors = new_chunk >> 9; return 0; } From 09c9e5fa1b93ad5b81c9dcf8ce3a5b9ae2ac31e4 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:45:55 +1000 Subject: [PATCH 24/39] md: convert conf->chunk_size and conf->prev_chunk to sectors. This kills some more shifts. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid5.c | 27 +++++++++++++-------------- drivers/md/raid5.h | 6 ++++-- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bc3564cfbba0..eaa2d3ee2b5d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1274,8 +1274,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, sector_t new_sector; int algorithm = previous ? conf->prev_algo : conf->algorithm; - int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; int raid_disks = previous ? conf->previous_raid_disks : conf->raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -1480,8 +1480,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; int algorithm = previous ? conf->prev_algo : conf->algorithm; sector_t stripe; @@ -1997,8 +1997,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { int sectors_per_chunk = - previous ? (conf->prev_chunk >> 9) - : (conf->chunk_size >> 9); + previous ? conf->prev_chunk_sectors : conf->chunk_sectors; int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; @@ -3917,7 +3916,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped 1, &dd_idx, NULL); last_sector = raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) - *(new_data_disks) - 1), + * new_data_disks - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; @@ -4403,7 +4402,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->fullsync = 1; } - conf->chunk_size = mddev->new_chunk_sectors << 9; + conf->chunk_sectors = mddev->new_chunk_sectors; conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; @@ -4413,7 +4412,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { - conf->prev_chunk = mddev->chunk_sectors << 9; + conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; } @@ -4931,8 +4930,8 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->prev_chunk = conf->chunk_size; - conf->chunk_size = mddev->new_chunk_sectors << 9; + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->chunk_sectors = mddev->new_chunk_sectors; conf->prev_algo = conf->algorithm; conf->algorithm = mddev->new_layout; if (mddev->delta_disks < 0) @@ -5014,7 +5013,7 @@ static void end_reshape(raid5_conf_t *conf) */ { int data_disks = conf->raid_disks - conf->max_degraded; - int stripe = data_disks * (conf->chunk_size + int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; @@ -5059,7 +5058,7 @@ static void raid5_finish_reshape(mddev_t *mddev) raid5_remove_disk(mddev, d); } mddev->layout = conf->algorithm; - mddev->chunk_sectors = 
conf->chunk_size >> 9; + mddev->chunk_sectors = conf->chunk_sectors; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; } @@ -5187,7 +5186,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) mddev->layout = mddev->new_layout = new_layout; } if (new_chunk > 0) { - conf->chunk_size = new_chunk; + conf->chunk_sectors = new_chunk >> 9; mddev->new_chunk_sectors = new_chunk >> 9; mddev->chunk_sectors = new_chunk >> 9; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1a25c9e252b4..9459689c4ea0 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -334,7 +334,8 @@ struct raid5_private_data { struct hlist_head *stripe_hashtbl; mddev_t *mddev; struct disk_info *spare; - int chunk_size, level, algorithm; + int chunk_sectors; + int level, algorithm; int max_degraded; int raid_disks; int max_nr_stripes; @@ -350,7 +351,8 @@ struct raid5_private_data { */ sector_t reshape_safe; int previous_raid_disks; - int prev_chunk, prev_algo; + int prev_chunk_sectors; + int prev_algo; short generation; /* increments with every reshape */ unsigned long reshape_checkpoint; /* Time we last updated * metadata */ From 0ba459d26260d4d13346c76642f461b2bf607eef Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:46:10 +1000 Subject: [PATCH 25/39] md/raid5: Use is_power_of_2() in raid5_reconfig()/raid6_reconfig(). 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index eaa2d3ee2b5d..72e8a324dcfb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5167,8 +5167,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) return -EINVAL; if (new_chunk > 0) { - if (new_chunk & (new_chunk-1)) - /* not a power of 2 */ + if (!is_power_of_2(new_chunk)) return -EINVAL; if (new_chunk < PAGE_SIZE) return -EINVAL; @@ -5206,8 +5205,7 @@ static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) return -EINVAL; if (new_chunk > 0) { - if (new_chunk & (new_chunk-1)) - /* not a power of 2 */ + if (!is_power_of_2(new_chunk)) return -EINVAL; if (new_chunk < PAGE_SIZE) return -EINVAL; From cdc2ae6d6a30df8fd92c5e300d0e3005e13eb6b0 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:46:47 +1000 Subject: [PATCH 26/39] md: fix some comments. 1/ Raid5 has learned to take over also raid4 and raid6 arrays. 2/ new_chunk in mdp_superblock_1 is in sectors, not bytes. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 -- include/linux/raid/md_p.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 72e8a324dcfb..009344156751 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5231,8 +5231,6 @@ static void *raid5_takeover(mddev_t *mddev) * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. 
* raid6 - Providing it is a *_6 layout - * - * For now, just do raid1 */ if (mddev->level == 1) diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 6ba830fa8538..ffa2efbbe382 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -232,7 +232,7 @@ struct mdp_superblock_1 { __le64 reshape_position; /* next address in array-space for reshape */ __le32 delta_disks; /* change in number of raid_disks */ __le32 new_layout; /* new layout */ - __le32 new_chunk; /* new chunk size (bytes) */ + __le32 new_chunk; /* new chunk size (512byte sectors) */ __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ From d6e412eaa52db82010f12ea7d2c9b9468e933c44 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:47:00 +1000 Subject: [PATCH 27/39] md: raid0: chunk_sectors cleanups. following the conversion to chunk_sectors, there is room for cleaning up a little. Signed-off-by: NeilBrown --- drivers/md/raid0.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f20b18ff7969..11e384253718 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -273,7 +273,7 @@ static int raid0_mergeable_bvec(struct request_queue *q, unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - if (is_power_of_2(mddev->chunk_sectors)) + if (is_power_of_2(chunk_sectors)) max = (chunk_sectors - ((sector & (chunk_sectors-1)) + bio_sectors)) << 9; else @@ -384,7 +384,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, raid0_conf_t *conf = mddev->private; unsigned int chunk_sects = mddev->chunk_sectors; - if (is_power_of_2(mddev->chunk_sectors)) { + if (is_power_of_2(chunk_sects)) { int chunksect_bits = ffz(~chunk_sects); /* find the sector offset inside the chunk */ sect_in_chunk = sector & (chunk_sects - 1); @@ -414,7 +414,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct 
strip_zone *zone, static inline int is_io_in_chunk_boundary(mddev_t *mddev, unsigned int chunk_sects, struct bio *bio) { - if (likely(is_power_of_2(mddev->chunk_sectors))) { + if (likely(is_power_of_2(chunk_sects))) { return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + (bio->bi_size >> 9)); } else{ @@ -456,7 +456,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - if (likely(is_power_of_2(mddev->chunk_sectors))) + if (likely(is_power_of_2(chunk_sects))) bp = bio_split(bio, chunk_sects - (sector & (chunk_sects-1))); else From 01ee22b496c41384eaa6dcae983c86d8bc32fbb8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:47:20 +1000 Subject: [PATCH 28/39] md: raid5: check stripe cache is large enough in start_reshape In reshape cases that do not change the number of devices, start_reshape is called without first calling check_reshape. Currently, the check that the stripe_cache is large enough is only done in check_reshape. It should be in start_reshape too. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 009344156751..b84766e347c3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4845,6 +4845,29 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +static int check_stripe_cache(mddev_t *mddev) +{ + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. 
+ */ + raid5_conf_t *conf = mddev->private; + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes || + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes) { + printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); + return 0; + } + return 1; +} + static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev->private; @@ -4871,24 +4894,8 @@ static int raid5_check_reshape(mddev_t *mddev) return -EINVAL; } - /* Can only proceed if there are plenty of stripe_heads. - * We need a minimum of one full stripe,, and for sensible progress - * it is best to have about 4 times that. - * If we require 4 times, then the default 256 4K stripe_heads will - * allow for chunk sizes up to 256K, which is probably OK. - * If the chunk size is greater, user-space should request more - * stripe_heads first. - */ - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (max(mddev->chunk_sectors << 9, - mddev->new_chunk_sectors << 9) - / STRIPE_SIZE)*4); + if (!check_stripe_cache(mddev)) return -ENOSPC; - } return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); } @@ -4904,6 +4911,9 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; + if (!check_stripe_cache(mddev)) + return -ENOSPC; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) From 597a711b69cfff95c4b8f6069037e7ad3fc71f56 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:47:42 +1000 Subject: [PATCH 29/39] md: remove unnecessary arguments from ->reconfig method. 
Passing the new layout and chunksize as args is not necessary as the mddev has fields for new_chunk_sectors and new_layout. This is preparation for combining the check_reshape and reconfig methods. Signed-off-by: NeilBrown --- drivers/md/faulty.c | 13 +++++++------ drivers/md/md.c | 23 +++++++++++++++++------ drivers/md/md.h | 2 +- drivers/md/raid5.c | 42 ++++++++++++++++-------------------------- 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8695809b24b0..6513b7b3e379 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) } -static int reconfig(mddev_t *mddev, int layout, int chunk_size) +static int reconfig(mddev_t *mddev) { - int mode = layout & ModeMask; - int count = layout >> ModeShift; + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; conf_t *conf = mddev->private; - if (chunk_size != -1) - return -EINVAL; + if (mddev->new_layout < 0) + return 0; /* new layout */ if (mode == ClearFaults) @@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) atomic_set(&conf->counters[mode], count); } else return -EINVAL; + mddev->new_layout = -1; mddev->layout = -1; /* makes sure further changes come through */ return 0; } @@ -315,7 +316,7 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; - reconfig(mddev, mddev->layout, -1); + reconfig(mddev); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index f996d8342a85..5caa421c2367 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2809,9 +2809,12 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) int err; if (mddev->pers->reconfig == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, n, -1); - if (err) + mddev->new_layout = n; + err = mddev->pers->reconfig(mddev); + if (err) { + mddev->new_layout = mddev->layout; return err; + } }
else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) @@ -2884,9 +2887,12 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) int err; if (mddev->pers->reconfig == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, -1, n); - if (err) + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->reconfig(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; return err; + } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) @@ -5220,8 +5226,13 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) */ if (mddev->pers->reconfig == NULL) return -EINVAL; - else - return mddev->pers->reconfig(mddev, info->layout, -1); + else { + mddev->new_layout = info->layout; + rv = mddev->pers->reconfig(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } } if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); diff --git a/drivers/md/md.h b/drivers/md/md.h index e0a2b8e3985d..815013f8da6c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -326,7 +326,7 @@ struct mdk_personality int (*check_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev); void (*finish_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + int (*reconfig) (mddev_t *mddev); /* quiesce moves between quiescence states * 0 - fully active * 1 - no new requests allowed diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b84766e347c3..136051bc6725 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5165,7 +5165,7 @@ static void *raid5_takeover_raid6(mddev_t *mddev) } -static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +static int raid5_reconfig(mddev_t *mddev) { /* For a 2-drive array, the layout and chunk size can be changed * immediately as not restriping is needed. 
@@ -5173,15 +5173,16 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) * to be used by a reshape pass. */ raid5_conf_t *conf = mddev->private; + int new_chunk = mddev->new_chunk_sectors; - if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) return -EINVAL; if (new_chunk > 0) { if (!is_power_of_2(new_chunk)) return -EINVAL; - if (new_chunk < PAGE_SIZE) + if (new_chunk < (PAGE_SIZE>>9)) return -EINVAL; - if (mddev->array_sectors & ((new_chunk>>9)-1)) + if (mddev->array_sectors & (new_chunk-1)) /* not factor of array size */ return -EINVAL; } @@ -5189,48 +5190,37 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) /* They look valid */ if (mddev->raid_disks == 2) { - - if (new_layout >= 0) { - conf->algorithm = new_layout; - mddev->layout = mddev->new_layout = new_layout; + /* can make the change immediately */ + if (mddev->new_layout >= 0) { + conf->algorithm = mddev->new_layout; + mddev->layout = mddev->new_layout; } if (new_chunk > 0) { - conf->chunk_sectors = new_chunk >> 9; - mddev->new_chunk_sectors = new_chunk >> 9; - mddev->chunk_sectors = new_chunk >> 9; + conf->chunk_sectors = new_chunk ; + mddev->chunk_sectors = new_chunk; } set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); - } else { - if (new_layout >= 0) - mddev->new_layout = new_layout; - if (new_chunk > 0) - mddev->new_chunk_sectors = new_chunk >> 9; } return 0; } -static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +static int raid6_reconfig(mddev_t *mddev) { - if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + int new_chunk = mddev->new_chunk_sectors; + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) return -EINVAL; if (new_chunk > 0) { if (!is_power_of_2(new_chunk)) return -EINVAL; - if (new_chunk < PAGE_SIZE) + if (new_chunk < (PAGE_SIZE >> 9)) return -EINVAL; - if (mddev->array_sectors & 
((new_chunk>>9)-1)) + if (mddev->array_sectors & (new_chunk-1)) /* not factor of array size */ return -EINVAL; } /* They look valid */ - - if (new_layout >= 0) - mddev->new_layout = new_layout; - if (new_chunk > 0) - mddev->new_chunk_sectors = new_chunk >> 9; - return 0; } From 50ac168a6e0a061bf5346d53aa9e7beb94c97527 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:47:55 +1000 Subject: [PATCH 30/39] md: merge reconfig and check_reshape methods. The difference between these two methods is artificial. Both check that a pending reshape is valid, and perform any aspect of it that can be done immediately. 'reconfig' handles chunk size and layout. 'check_reshape' handles raid_disks. So make them just one method. Signed-off-by: NeilBrown --- drivers/md/faulty.c | 6 +++--- drivers/md/md.c | 15 ++++++++------- drivers/md/md.h | 1 - drivers/md/raid5.c | 17 ++++++++--------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 6513b7b3e379..6e83b38d931d 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -255,7 +255,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) } -static int reconfig(mddev_t *mddev) +static int reshape(mddev_t *mddev) { int mode = mddev->new_layout & ModeMask; int count = mddev->new_layout >> ModeShift; @@ -316,7 +316,7 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; - reconfig(mddev); + reshape(mddev); return 0; } @@ -339,7 +339,7 @@ static struct mdk_personality faulty_personality = .run = run, .stop = stop, .status = status, - .reconfig = reconfig, + .check_reshape = reshape, .size = faulty_size, }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 5caa421c2367..80f039ec3ac2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2807,10 +2807,10 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if 
(mddev->pers->check_reshape == NULL) return -EBUSY; mddev->new_layout = n; - err = mddev->pers->reconfig(mddev); + err = mddev->pers->check_reshape(mddev); if (err) { mddev->new_layout = mddev->layout; return err; @@ -2885,10 +2885,10 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EBUSY; mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->reconfig(mddev); + err = mddev->pers->check_reshape(mddev); if (err) { mddev->new_chunk_sectors = mddev->chunk_sectors; return err; @@ -5224,11 +5224,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) * we don't need to do anything at the md level, the * personality will take care of it all. */ - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; else { mddev->new_layout = info->layout; - rv = mddev->pers->reconfig(mddev); + rv = mddev->pers->check_reshape(mddev); if (rv) mddev->new_layout = mddev->layout; return rv; @@ -6731,7 +6731,8 @@ void md_check_recovery(mddev_t *mddev) */ if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape(mddev) != 0) + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index 815013f8da6c..bac7c2bf8616 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -326,7 +326,6 @@ struct mdk_personality int (*check_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev); void (*finish_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev); /* quiesce moves between quiescence states * 0 - fully active * 1 - no new requests allowed diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 136051bc6725..5ea2bdece278 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4868,14 +4868,14 @@ static int 
check_stripe_cache(mddev_t *mddev) return 1; } -static int raid5_check_reshape(mddev_t *mddev) +static int check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev->private; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) - return -EINVAL; /* nothing to do */ + return 0; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; @@ -5165,7 +5165,7 @@ static void *raid5_takeover_raid6(mddev_t *mddev) } -static int raid5_reconfig(mddev_t *mddev) +static int raid5_check_reshape(mddev_t *mddev) { /* For a 2-drive array, the layout and chunk size can be changed * immediately as not restriping is needed. @@ -5202,12 +5202,13 @@ static int raid5_reconfig(mddev_t *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); } - return 0; + return check_reshape(mddev); } -static int raid6_reconfig(mddev_t *mddev) +static int raid6_check_reshape(mddev_t *mddev) { int new_chunk = mddev->new_chunk_sectors; + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) return -EINVAL; if (new_chunk > 0) { @@ -5221,7 +5222,7 @@ static int raid6_reconfig(mddev_t *mddev) } /* They look valid */ - return 0; + return check_reshape(mddev); } static void *raid5_takeover(mddev_t *mddev) @@ -5312,12 +5313,11 @@ static struct mdk_personality raid6_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, - .check_reshape = raid5_check_reshape, + .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, - .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { @@ -5340,7 +5340,6 @@ static struct mdk_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, - .reconfig = raid5_reconfig, }; static struct mdk_personality 
raid4_personality = From 8c6ac868b107ed50a46204f6d14e2ad9443ff146 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:48:06 +1000 Subject: [PATCH 31/39] md: Push down reconstruction log message to personality code. Currently, the md layer checks in analyze_sbs() if the raid level supports reconstruction (mddev->level >= 1) and if reconstruction is in progress (mddev->recovery_cp != MaxSector). Move that printk into the personality code of those raid levels that care (levels 1, 4, 5, 6, 10). Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/md.c | 9 --------- drivers/md/raid1.c | 4 ++++ drivers/md/raid10.c | 4 ++++ drivers/md/raid5.c | 4 ++++ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 80f039ec3ac2..90147370bfd7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2604,15 +2604,6 @@ static void analyze_sbs(mddev_t * mddev) clear_bit(In_sync, &rdev->flags); } } - - - - if (mddev->recovery_cp != MaxSector && - mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - } static void md_safemode_timeout(unsigned long data); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 12f8f34f17ae..89939a7aef57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid1: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 30029a312cf5..ae12ceafe10c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2186,6 +2186,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid10: %s is 
not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid10: raid set %s active with %d out of %d devices\n", mdname(mddev), mddev->raid_disks - mddev->degraded, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5ea2bdece278..8f2e3740a05a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4454,6 +4454,10 @@ static int run(mddev_t *mddev) int working_disks = 0; mdk_rdev_t *rdev; + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid5: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Currently only disks can change, it must From 1b57f132231593923cb4ab99943ddd777e8745bc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:48:19 +1000 Subject: [PATCH 32/39] md: move assignment of ->utime so that it never gets skipped. Currently the assignment to utime gets skipped for 'external' metadata. So move it to the top of the function so that it always gets effected. This is of largely cosmetic interest. Nothing actually depends on ->utime being right for external arrays. "mdadm --monitor" does use it for 0.90 and 1.x arrays, but with mdadm-3.0, this is not important for external metadata. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 90147370bfd7..d9cce54cc2d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1904,6 +1904,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; + mddev->utime = get_seconds(); if (mddev->external) return; repeat: @@ -1933,7 +1934,6 @@ repeat: nospares = 0; sync_req = mddev->in_sync; - mddev->utime = get_seconds(); /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ From 13f2682b7216ebebd72b3d5868fe7fccec91a92d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:48:55 +1000 Subject: [PATCH 33/39] md: raid0/linear: ensure device sizes are rounded to chunk size. This is currently ensured by common code, but it is more reliable to ensure it where it is needed in personality code. All the other personalities that care already round the size to the chunk_size. raid0 and linear are the only hold-outs. Signed-off-by: NeilBrown --- drivers/md/linear.c | 6 ++++++ drivers/md/raid0.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 9f7cec42dd8e..dda2f1b64a6d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -135,6 +135,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) list_for_each_entry(rdev, &mddev->disks, same_set) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; + sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { printk("linear: disk numbering problem. 
Aborting!\n"); @@ -142,6 +143,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) } disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 11e384253718..717e64a4af9a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -100,6 +100,12 @@ static int create_strip_zones(mddev_t *mddev) printk(KERN_INFO "raid0: looking at %s\n", bdevname(rdev1->bdev,b)); c = 0; + + /* round size to chunk_size */ + sectors = rdev1->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev1->sectors = sectors * mddev->chunk_sectors; + list_for_each_entry(rdev2, &mddev->disks, same_set) { printk(KERN_INFO "raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), From 8190e754e0723de7cecb80bdd9eb93911dfa04a1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:48:58 +1000 Subject: [PATCH 34/39] md: remove chunksize rounding from common code. It is easiest to round sizes to multiples of chunk size in the personality code for those personalities which care. Those personalities now do the rounding, so we can remove that function from common code. Also remove the upper bound on the size of a chunk, and the lower bound on the size of a device (1 chunk), neither of which really buy us anything. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 52 +++---------------------------------------------- drivers/md/md.h | 7 ------- 2 files changed, 3 insertions(+), 56 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d9cce54cc2d5..0f11fd1417ab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -440,18 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) -{ - sector_t num_sectors = rdev->sb_start; - - if (chunk_size) { - unsigned chunk_sects = chunk_size>>9; - sector_div(num_sectors, chunk_sects); - num_sectors *= chunk_sects; - } - return num_sectors; -} - static int alloc_disk_sb(mdk_rdev_t * rdev) { if (rdev->sb_page) @@ -839,7 +827,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); + rdev->sectors = rdev->sb_start; if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... */ @@ -1251,13 +1239,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le32_to_cpu(sb->chunksize)) { - int chunk_sects = le32_to_cpu(sb->chunksize); - sector_t chunks = rdev->sectors; - sector_div(chunks, chunk_sects); - rdev->sectors = chunks * chunk_sects; - } - if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; @@ -3983,11 +3964,9 @@ static int start_dirty_degraded; static int do_md_run(mddev_t * mddev) { int err; - int chunk_size; mdk_rdev_t *rdev; struct gendisk *disk; struct mdk_personality *pers; - char b[BDEVNAME_SIZE]; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. 
*/ @@ -4005,30 +3984,6 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_sectors << 9; - - if (chunk_size) { - if (chunk_size > MAX_CHUNK_SIZE) { - printk(KERN_ERR "too big chunk_size: %d > %d\n", - chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* devices must have minimum size of one chunk */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->sectors < chunk_size / 512) { - printk(KERN_WARNING - "md: Dev %s smaller than chunk_size:" - " %llu < %d\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sectors, - chunk_size / 512); - return -EINVAL; - } - } - } - if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) @@ -4842,8 +4797,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->sectors = calc_num_sectors(rdev, - mddev->chunk_sectors << 9); + rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4913,7 +4867,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_sectors << 9); + rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING diff --git a/drivers/md/md.h b/drivers/md/md.h index bac7c2bf8616..ea2c441449d4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,13 +29,6 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; -/* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - /* * MD's 'extended' device */ From 0894cc3066aaa3e75a99383c0d25feebf9b688ac Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:49:23 +1000 Subject: [PATCH 35/39] md: Move check for bitmap presence to 
personality code. If the superblock of a component device indicates the presence of a bitmap but the corresponding raid personality does not support bitmaps (raid0, linear, multipath, faulty), then something is seriously wrong and we'd better refuse to run such an array. Currently, this check is performed while the superblocks are examined, i.e. before entering personality code. Therefore the generic md layer must know which raid levels support bitmaps and which do not. This patch avoids this layer violation without adding identical code to various personalities. This is accomplished by introducing a new public function to md.c, md_check_no_bitmap(), which replaces the hard-coded checks in the superblock loading functions. A call to md_check_no_bitmap() is added to the ->run method of each personality which does not support bitmaps and assembly is aborted if at least one component device contains a bitmap. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/faulty.c | 6 +++++- drivers/md/linear.c | 2 ++ drivers/md/md.c | 40 ++++++++++++++++++---------------------- drivers/md/md.h | 1 + drivers/md/multipath.c | 3 +++ drivers/md/raid0.c | 2 ++ 6 files changed, 31 insertions(+), 23 deletions(-) diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 6e83b38d931d..87d88dbb667f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -299,8 +299,12 @@ static int run(mddev_t *mddev) { mdk_rdev_t *rdev; int i; + conf_t *conf; - conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); if (!conf) return -ENOMEM; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index dda2f1b64a6d..564c390f8a1b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -189,6 +189,8 @@ static int linear_run (mddev_t *mddev) { linear_conf_t *conf; + if (md_check_no_bitmap(mddev)) + return -EINVAL; mddev->queue->queue_lock = &mddev->queue->__queue_lock; conf = 
linear_conf(mddev, mddev->raid_disks); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f11fd1417ab..09be637d52cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -735,6 +735,24 @@ struct super_type { sector_t num_sectors); }; +/* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(mddev_t *mddev) +{ + if (!mddev->bitmap_file && !mddev->bitmap_offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + /* * load_super for 0.90.0 */ @@ -788,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; - if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { - if (sb->level != 1 && sb->level != 4 - && sb->level != 5 && sb->level != 6 - && sb->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - goto abort; - } - } - if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else @@ -1176,17 +1183,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { - if (sb->level != cpu_to_le32(1) && - sb->level != cpu_to_le32(4) && - sb->level != cpu_to_le32(5) && - sb->level != cpu_to_le32(6) && - sb->level != cpu_to_le32(10)) { - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } - } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); diff --git a/drivers/md/md.h b/drivers/md/md.h index ea2c441449d4..9430a110db93 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -430,5 +430,6 @@ extern void md_new_event(mddev_t *mddev); 
extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); +extern int md_check_no_bitmap(mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c1ca63f278a9..cbe368fa6598 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev) struct multipath_info *disk; mdk_rdev_t *rdev; + if (md_check_no_bitmap(mddev)) + return -EINVAL; + if (mddev->level != LEVEL_MULTIPATH) { printk("multipath: %s: raid level not set to multipath IO (%d)\n", mdname(mddev), mddev->level); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 717e64a4af9a..ab4a489d8695 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -314,6 +314,8 @@ static int raid0_run(mddev_t *mddev) printk(KERN_ERR "md/raid0: chunk size must be set.\n"); return -EINVAL; } + if (md_check_no_bitmap(mddev)) + return -EINVAL; blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); mddev->queue->queue_lock = &mddev->queue->__queue_lock; From af11c397fd8835c70ec0bb777104e4ab98b2d660 Mon Sep 17 00:00:00 2001 From: SandeepKsinha Date: Thu, 18 Jun 2009 08:49:35 +1000 Subject: [PATCH 36/39] md linear: Protecting mddev with rcu locks to avoid races Due to the lack of memory ordering guarantees, we may have races around mddev->conf. In particular, the correct contents of the structure we get from dereferencing ->private might not be visible to this CPU yet, and they might not be correct w.r.t mddev->raid_disks. This patch addresses the problem using rcu protection to avoid such race conditions. 
Signed-off-by: SandeepKsinha Signed-off-by: NeilBrown --- drivers/md/linear.c | 47 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 564c390f8a1b..93f2b1d18398 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -28,10 +28,11 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { int lo, mid, hi; - linear_conf_t *conf = mddev->private; + linear_conf_t *conf; lo = 0; hi = mddev->raid_disks - 1; + conf = rcu_dereference(mddev->private); /* * Binary Search @@ -66,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q, unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; + rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -86,36 +89,50 @@ static int linear_mergeable_bvec(struct request_queue *q, static void linear_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev->private; + linear_conf_t *conf; int i; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i=0; i < mddev->raid_disks; i++) { struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); blk_unplug(r_queue); } + rcu_read_unlock(); } static int linear_congested(void *data, int bits) { mddev_t *mddev = data; - linear_conf_t *conf = mddev->private; + linear_conf_t *conf; int i, ret = 0; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + + rcu_read_unlock(); return ret; } static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) { - linear_conf_t *conf = mddev->private; + linear_conf_t *conf; + sector_t 
array_sectors; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + rcu_read_unlock(); - return conf->array_sectors; + return array_sectors; } static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) @@ -229,8 +246,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) return -ENOMEM; newconf->prev = mddev->private; - mddev->private = newconf; mddev->raid_disks++; + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; @@ -239,7 +256,13 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) static int linear_stop (mddev_t *mddev) { linear_conf_t *conf = mddev->private; - + + /* + * We do not require rcu protection here since + * we hold reconfig_mutex for both linear_add and + * linear_stop, so they cannot race. + */ + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ do { linear_conf_t *t = conf->prev; @@ -269,9 +292,11 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bio_sectors(bio)); part_stat_unlock(); + rcu_read_lock(); tmp_dev = which_dev(mddev, bio->bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; @@ -282,6 +307,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bdevname(tmp_dev->rdev->bdev, b), (unsigned long long)tmp_dev->rdev->sectors, (unsigned long long)start_sector); + rcu_read_unlock(); bio_io_error(bio); return 0; } @@ -291,9 +317,11 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) * split it. 
*/ struct bio_pair *bp; + sector_t end_sector = tmp_dev->end_sector; - bp = bio_split(bio, - tmp_dev->end_sector - bio->bi_sector); + rcu_read_unlock(); + + bp = bio_split(bio, end_sector - bio->bi_sector); if (linear_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -306,6 +334,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bio->bi_bdev = tmp_dev->rdev->bdev; bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; + rcu_read_unlock(); return 1; } From 495d357301e1de01fabe30ce9a555301fb4675c3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 08:49:42 +1000 Subject: [PATCH 37/39] md/linear: use call_rcu to free obsolete 'conf' structures. Currently, when we update the 'conf' structure, when adding a drive to a linear array, we keep the old version around until the array is finally stopped, as it is not safe to free it immediately. Now that we have rcu protection on all accesses to 'conf', we can use call_rcu to free it more promptly. Signed-off-by: NeilBrown --- drivers/md/linear.c | 21 +++++++++++++-------- drivers/md/linear.h | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 93f2b1d18398..15c8b7b25a9b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -223,6 +223,12 @@ static int linear_run (mddev_t *mddev) return 0; } +static void free_conf(struct rcu_head *head) +{ + linear_conf_t *conf = container_of(head, linear_conf_t, rcu); + kfree(conf); +} + static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) { /* Adding a drive to a linear array allows the array to grow. @@ -233,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) * The current one is never freed until the array is stopped. * This avoids races. 
*/ - linear_conf_t *newconf; + linear_conf_t *newconf, *oldconf; if (rdev->saved_raid_disk != mddev->raid_disks) return -EINVAL; @@ -245,11 +251,12 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) if (!newconf) return -ENOMEM; - newconf->prev = mddev->private; + oldconf = rcu_dereference(mddev->private); mddev->raid_disks++; rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + call_rcu(&oldconf->rcu, free_conf); return 0; } @@ -261,14 +268,12 @@ static int linear_stop (mddev_t *mddev) * We do not require rcu protection here since * we hold reconfig_mutex for both linear_add and * linear_stop, so they cannot race. + * We should make sure any old 'conf's are properly + * freed though. */ - + rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - do { - linear_conf_t *t = conf->prev; - kfree(conf); - conf = t; - } while (conf); + kfree(conf); return 0; } diff --git a/drivers/md/linear.h b/drivers/md/linear.h index 599e5c1bbb01..0ce29b61605a 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t; struct linear_private_data { - struct linear_private_data *prev; /* earlier version */ sector_t array_sectors; dev_info_t disks[0]; + struct rcu_head rcu; }; From 7a3ab908948b6296ee7e81d42f7c176361c51975 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 16 Jun 2009 16:00:33 -0700 Subject: [PATCH 38/39] md/raid5: add missing call to schedule() after prepare_to_wait() In the unlikely event that reshape progresses past the current request while it is waiting for a stripe we need to schedule() before retrying for 2 reasons: 1/ Prevent list corruption from duplicated list_add() calls without intervening list_del(). 2/ Give the reshape code a chance to make some progress to resolve the conflict. 
Cc: Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8f2e3740a05a..59e29c2983cb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3695,6 +3695,7 @@ static int make_request(struct request_queue *q, struct bio * bi) spin_unlock_irq(&conf->device_lock); if (must_retry) { release_stripe(sh); + schedule(); goto retry; } } From 48606a9f2fc034f0b308d088c1f7ab6d407c462c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 18 Jun 2009 09:14:12 +1000 Subject: [PATCH 39/39] md/raid5: correctly update sync_completed when we reach max_resync At the end of reshape_request we update curr_resync_completed if we are about to pause due to reaching resync_max. However we update it to the wrong value. We need to add the "reshape_sectors" that have just been reshaped. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59e29c2983cb..f9f991e6e138 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3946,7 +3946,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread);