Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP1:Update
mdadm.5365
0232-Add-failfast-support.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 0232-Add-failfast-support.patch of Package mdadm.5365
From 71574efb077131701b3da874df0045f259ca3448 Mon Sep 17 00:00:00 2001 From: NeilBrown <neilb@suse.com> Date: Fri, 25 Nov 2016 10:55:49 +1100 Subject: [PATCH 329/359] Add failfast support. References: Fate#311379 Allow per-device "failfast" flag to be set when creating an array or adding devices to an array. When re-adding a device which had the failfast flag, it can be removed using --nofailfast. failfast status is printed in --detail and --examine output. Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com> --- Create.c | 2 ++ Detail.c | 1 + Incremental.c | 1 + Manage.c | 20 +++++++++++++++++++- ReadMe.c | 2 ++ md.4 | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ md_p.h | 1 + mdadm.8.in | 32 +++++++++++++++++++++++++++++++- mdadm.c | 11 +++++++++++ mdadm.h | 5 +++++ super0.c | 12 ++++++++---- super1.c | 13 +++++++++++++ 12 files changed, 148 insertions(+), 6 deletions(-) mode change 100755 => 100644 mdadm.h Index: mdadm-3.3.1/Create.c =================================================================== --- mdadm-3.3.1.orig/Create.c +++ mdadm-3.3.1/Create.c @@ -890,6 +890,8 @@ int Create(struct supertype *st, char *m if (dv->writemostly == 1) inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY); + if (dv->failfast == 1) + inf->disk.state |= (1<<MD_DISK_FAILFAST); if (have_container) fd = -1; Index: mdadm-3.3.1/Detail.c =================================================================== --- mdadm-3.3.1.orig/Detail.c +++ mdadm-3.3.1/Detail.c @@ -657,6 +657,7 @@ This is pretty boring } if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed"); if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly"); + if (disk.state & (1<<MD_DISK_FAILFAST)) printf(" failfast"); if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal"); if ((disk.state & ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC) Index: mdadm-3.3.1/Incremental.c =================================================================== --- mdadm-3.3.1.orig/Incremental.c +++ 
mdadm-3.3.1/Incremental.c @@ -1036,6 +1036,7 @@ static int array_try_spare(char *devname devlist.next = NULL; devlist.used = 0; devlist.writemostly = 0; + devlist.failfast = 0; devlist.devname = chosen_devname; sprintf(chosen_devname, "%d:%d", major(stb.st_rdev), minor(stb.st_rdev)); Index: mdadm-3.3.1/Manage.c =================================================================== --- mdadm-3.3.1.orig/Manage.c +++ mdadm-3.3.1/Manage.c @@ -683,8 +683,13 @@ int attempt_re_add(int fd, int tfd, stru disc.state |= 1 << MD_DISK_WRITEMOSTLY; if (dv->writemostly == 2) disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == 1) + disc.state |= 1 << MD_DISK_FAILFAST; + if (dv->failfast == 2) + disc.state &= ~(1 << MD_DISK_FAILFAST); remove_partitions(tfd); - if (update || dv->writemostly > 0) { + if (update || dv->writemostly > 0 + || dv->failfast > 0) { int rv = -1; tfd = dev_open(dv->devname, O_RDWR); if (tfd < 0) { @@ -700,6 +705,14 @@ int attempt_re_add(int fd, int tfd, stru rv = dev_st->ss->update_super( dev_st, NULL, "readwrite", devname, verbose, 0, NULL); + if (dv->failfast == 1) + rv = dev_st->ss->update_super( + dev_st, NULL, "failfast", + devname, verbose, 0, NULL); + if (dv->failfast == 2) + rv = dev_st->ss->update_super( + dev_st, NULL, "nofailfast", + devname, verbose, 0, NULL); if (update) rv = dev_st->ss->update_super( dev_st, NULL, update, @@ -964,6 +977,8 @@ int Manage_add(int fd, int tfd, struct m disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC); if (dv->writemostly == 1) disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->failfast == 1) + disc.state |= 1 << MD_DISK_FAILFAST; dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); if (tst->ss->add_to_super(tst, &disc, dfd, dv->devname, INVALID_SECTORS)) @@ -1009,6 +1024,8 @@ int Manage_add(int fd, int tfd, struct m if (dv->writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == 1) + disc.state |= (1 << MD_DISK_FAILFAST); if (tst->ss->external) { /* add a disk * to an 
external metadata container */ @@ -1785,6 +1802,7 @@ int move_spare(char *from_devname, char devlist.next = NULL; devlist.used = 0; devlist.writemostly = 0; + devlist.failfast = 0; devlist.devname = devname; sprintf(devname, "%d:%d", major(devid), minor(devid)); Index: mdadm-3.3.1/ReadMe.c =================================================================== --- mdadm-3.3.1.orig/ReadMe.c +++ mdadm-3.3.1/ReadMe.c @@ -136,6 +136,8 @@ struct option long_options[] = { {"bitmap-chunk", 1, 0, BitmapChunk}, {"write-behind", 2, 0, WriteBehind}, {"write-mostly",0, 0, WriteMostly}, + {"failfast", 0, 0, FailFast}, + {"nofailfast",0, 0, NoFailFast}, {"re-add", 0, 0, ReAdd}, {"homehost", 1, 0, HomeHost}, {"symlinks", 1, 0, Symlinks}, Index: mdadm-3.3.1/md.4 =================================================================== --- mdadm-3.3.1.orig/md.4 +++ mdadm-3.3.1/md.4 @@ -621,6 +621,60 @@ slow). The extra latency of the remote operations, but the remote system will still have a reasonably up-to-date copy of all data. +.SS FAILFAST + +From Linux 4.10, +.I +md +supports FAILFAST for RAID1 and RAID10 arrays. This is a flag that +can be set on individual drives, though it is usually set on all +drives, or no drives. + +When +.I md +sends an I/O request to a drive that is marked as FAILFAST, and when +the array could survive the loss of that drive without losing data, +.I md +will request that the underlying device does not perform any retries. +This means that a failure will be reported to +.I md +promptly, and it can mark the device as faulty and continue using the +other device(s). +.I md +cannot control the timeout that the underlying devices use to +determine failure. Any changes desired to that timeout must be set +explicitly on the underlying device, separately from using +.IR mdadm . 
+ +If a FAILFAST request does fail, and if it is still safe to mark the +device as faulty without data loss, that will be done and the array +will continue functioning on a reduced number of devices. If it is not +possible to safely mark the device as faulty, +.I md +will retry the request without disabling retries in the underlying +device. In any case, +.I md +will not attempt to repair read errors on a device marked as FAILFAST +by writing out the correct data. It will just mark the device as faulty. + +FAILFAST is appropriate for storage arrays that have a low probability +of true failure, but will sometimes introduce unacceptable delays to +I/O requests while performing internal maintenance. The value of +setting FAILFAST involves a trade-off. The gain is that the chance of +unacceptable delays is substantially reduced. The cost is that the +unlikely event of data-loss on one device is slightly more likely to +result in data-loss for the array. + +When a device in an array using FAILFAST is marked as faulty, it will +usually become usable again in a short while. +.I mdadm +makes no attempt to detect that possibility. Some separate +mechanism, tuned to the specific details of the expected failure modes, +needs to be created to monitor devices to see when they return to full +functionality, and to then re-add them to the array. In order for +this "re-add" functionality to be effective, an array using FAILFAST +should always have a write-intent bitmap. 
+ .SS RESTRIPING .IR Restriping , Index: mdadm-3.3.1/md_p.h =================================================================== --- mdadm-3.3.1.orig/md_p.h +++ mdadm-3.3.1/md_p.h @@ -89,6 +89,7 @@ * read requests will only be sent here in * dire need */ +#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */ #define MD_DISK_REPLACEMENT 17 #define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ Index: mdadm-3.3.1/mdadm.8.in =================================================================== --- mdadm-3.3.1.orig/mdadm.8.in +++ mdadm-3.3.1/mdadm.8.in @@ -747,7 +747,7 @@ subsequent devices listed in a .BR \-\-create , or .B \-\-add -command will be flagged as 'write-mostly'. This is valid for RAID1 +command will be flagged as 'write\-mostly'. This is valid for RAID1 only and means that the 'md' driver will avoid reading from these devices if at all possible. This can be useful if mirroring over a slow link. @@ -762,6 +762,25 @@ mode, and write-behind is only attempted .IR write-mostly . .TP +.BR \-\-failfast +subsequent devices listed in a +.B \-\-create +or +.B \-\-add +command will be flagged as 'failfast'. This is valid for RAID1 and +RAID10 only. IO requests to these devices will be encouraged to fail +quickly rather than cause long delays due to error handling. Also no +attempt is made to repair a read error on these devices. + +If an array becomes degraded so that the 'failfast' device is the only +usable device, the 'failfast' flag will then be ignored and extended +delays will be preferred to complete failure. + +The 'failfast' flag is appropriate for storage arrays which have a +low probability of true failure, but which may sometimes +cause unacceptable delays due to internal maintenance functions. + +.TP .BR \-\-assume\-clean Tell .I mdadm @@ -1444,6 +1463,17 @@ number. The receiving node must acknowle with \-\-cluster\-confirm. 
Valid arguments are <slot>:<devicename> in case the device is found or <slot>:missing in case the device is not found. +.TP +.BR \-\-failfast +Subsequent devices that are added or re\-added will have +the 'failfast' flag set. This is only valid for RAID1 and RAID10 and +means that the 'md' driver will avoid long timeouts on error handling +where possible. +.TP +.BR \-\-nofailfast +Subsequent devices that are re\-added will be re\-added without +the 'failfast' flag set. + .P Each of these options requires that the first device listed is the array to be acted upon, and the remainder are component devices to be added, Index: mdadm-3.3.1/mdadm.c =================================================================== --- mdadm-3.3.1.orig/mdadm.c +++ mdadm-3.3.1/mdadm.c @@ -90,6 +90,7 @@ int main(int argc, char *argv[]) int spare_sharing = 1; struct supertype *ss = NULL; int writemostly = 0; + int failfast = 0; char *shortopt = short_options; int dosyslog = 0; int rebuild_map = 0; @@ -295,6 +296,7 @@ int main(int argc, char *argv[]) dv->devname = optarg; dv->disposition = devmode; dv->writemostly = writemostly; + dv->failfast = failfast; dv->used = 0; dv->next = NULL; *devlistend = dv; @@ -351,6 +353,7 @@ int main(int argc, char *argv[]) dv->devname = optarg; dv->disposition = devmode; dv->writemostly = writemostly; + dv->failfast = failfast; dv->used = 0; dv->next = NULL; *devlistend = dv; @@ -417,6 +420,14 @@ int main(int argc, char *argv[]) writemostly = 2; continue; + case O(MANAGE,FailFast): + case O(CREATE,FailFast): + failfast = 1; + continue; + case O(MANAGE,NoFailFast): + failfast = 2; + continue; + case O(GROW,'z'): case O(CREATE,'z'): case O(BUILD,'z'): /* size */ Index: mdadm-3.3.1/mdadm.h =================================================================== --- mdadm-3.3.1.orig/mdadm.h +++ mdadm-3.3.1/mdadm.h @@ -379,6 +379,8 @@ enum special_options { ConfigFile, ChunkSize, WriteMostly, + FailFast, + NoFailFast, Layout, Auto, Force, @@ -512,6 +514,7 @@ struct 
mddev_dev { * Not set for names read from .config */ char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */ + char failfast; /* Ditto but for 'failfast' flag */ int used; /* set when used */ long long data_offset; struct mddev_dev *next; @@ -817,6 +820,8 @@ extern struct superswitch { * linear-grow-update - now change the size of the array. * writemostly - set the WriteMostly1 bit in the superblock devflags * readwrite - clear the WriteMostly1 bit in the superblock devflags + * failfast - set the FailFast1 bit in the superblock + * nofailfast - clear the FailFast1 bit * no-bitmap - clear any record that a bitmap is present. * bbl - add a bad-block-log if possible * no-bbl - remove any bad-block-log is it is empty. Index: mdadm-3.3.1/super0.c =================================================================== --- mdadm-3.3.1.orig/super0.c +++ mdadm-3.3.1/super0.c @@ -216,19 +216,21 @@ static void examine_super0(struct supert mdp_disk_t *dp; char *dv; char nb[5]; - int wonly; + int wonly, failfast; if (d>=0) dp = &sb->disks[d]; else dp = &sb->this_disk; snprintf(nb, sizeof(nb), "%4d", d); printf("%4s %5d %5d %5d %5d ", d < 0 ? 
"this" : nb, dp->number, dp->major, dp->minor, dp->raid_disk); wonly = dp->state & (1<<MD_DISK_WRITEMOSTLY); - dp->state &= ~(1<<MD_DISK_WRITEMOSTLY); + failfast = dp->state & (1<<MD_DISK_FAILFAST); + dp->state &= ~(wonly | failfast); if (dp->state & (1<<MD_DISK_FAULTY)) printf(" faulty"); if (dp->state & (1<<MD_DISK_ACTIVE)) printf(" active"); if (dp->state & (1<<MD_DISK_SYNC)) printf(" sync"); if (dp->state & (1<<MD_DISK_REMOVED)) printf(" removed"); if (wonly) printf(" write-mostly"); + if (failfast) printf(" failfast"); if (dp->state == 0) printf(" spare"); if ((dv=map_dev(dp->major, dp->minor, 0))) printf(" %s", dv); @@ -558,7 +560,8 @@ static int update_super0(struct supertyp } else if (strcmp(update, "assemble")==0) { int d = info->disk.number; int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY); - int mask = (1<<MD_DISK_WRITEMOSTLY); + int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST); + int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST); int add = 0; if (sb->minor_version >= 91) /* During reshape we don't insist on everything @@ -567,7 +570,7 @@ static int update_super0(struct supertyp add = (1<<MD_DISK_SYNC); if (((sb->disks[d].state & ~mask) | add) != (unsigned)info->disk.state) { - sb->disks[d].state = info->disk.state | wonly; + sb->disks[d].state = info->disk.state | wonly |failfast; rv = 1; } if (info->reshape_active && Index: mdadm-3.3.1/super1.c =================================================================== --- mdadm-3.3.1.orig/super1.c +++ mdadm-3.3.1/super1.c @@ -77,6 +77,7 @@ struct mdp_superblock_1 { __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ __u8 devflags; /* per-device flags. Only one defined...*/ #define WriteMostly1 1 /* mask for writemostly flag in above */ +#define FailFast1 2 /* Device should get FailFast requests */ /* bad block log. If there are any bad blocks the feature flag is set. * if offset and size are non-zero, that space is reserved and available. 
*/ @@ -430,6 +431,8 @@ static void examine_super1(struct supert printf(" Flags :"); if (sb->devflags & WriteMostly1) printf(" write-mostly"); + if (sb->devflags & FailFast1) + printf(" failfast"); printf("\n"); } @@ -1020,6 +1023,8 @@ static void getinfo_super1(struct supert } if (sb->devflags & WriteMostly1) info->disk.state |= (1 << MD_DISK_WRITEMOSTLY); + if (sb->devflags & FailFast1) + info->disk.state |= (1 << MD_DISK_FAILFAST); info->events = __le64_to_cpu(sb->events); sprintf(info->text_version, "1.%d", st->minor_version); info->safe_mode_delay = 200; @@ -1383,6 +1388,10 @@ static int update_super1(struct supertyp sb->devflags |= WriteMostly1; else if (strcmp(update, "readwrite")==0) sb->devflags &= ~WriteMostly1; + else if (strcmp(update, "failfast") == 0) + sb->devflags |= FailFast1; + else if (strcmp(update, "nofailfast") == 0) + sb->devflags &= ~FailFast1; else rv = -1; @@ -1726,6 +1735,10 @@ static int write_init_super1(struct supe sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; + if (di->disk.state & (1<<MD_DISK_FAILFAST)) + sb->devflags |= FailFast1; + else + sb->devflags &= ~FailFast1; if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || read(rfd, sb->device_uuid, 16) != 16) {
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor