Line data Source code
1 : /* $OpenBSD: softraid_raid5.c,v 1.27 2016/10/07 19:17:50 krw Exp $ */
2 : /*
3 : * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4 : * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5 : * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6 : *
7 : * Permission to use, copy, modify, and distribute this software for any
8 : * purpose with or without fee is hereby granted, provided that the above
9 : * copyright notice and this permission notice appear in all copies.
10 : *
11 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 : */
19 :
20 : #include "bio.h"
21 :
22 : #include <sys/param.h>
23 : #include <sys/systm.h>
24 : #include <sys/buf.h>
25 : #include <sys/device.h>
26 : #include <sys/ioctl.h>
27 : #include <sys/malloc.h>
28 : #include <sys/kernel.h>
29 : #include <sys/disk.h>
30 : #include <sys/rwlock.h>
31 : #include <sys/queue.h>
32 : #include <sys/fcntl.h>
33 : #include <sys/mount.h>
34 : #include <sys/sensors.h>
35 : #include <sys/stat.h>
36 : #include <sys/task.h>
37 : #include <sys/pool.h>
38 : #include <sys/conf.h>
39 : #include <sys/uio.h>
40 :
41 : #include <scsi/scsi_all.h>
42 : #include <scsi/scsiconf.h>
43 : #include <scsi/scsi_disk.h>
44 :
45 : #include <dev/softraidvar.h>
46 :
47 : /* RAID 5 functions. */
48 : int sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
49 : int, int64_t);
50 : int sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
51 : int, void *);
52 : int sr_raid5_init(struct sr_discipline *);
53 : int sr_raid5_rw(struct sr_workunit *);
54 : int sr_raid5_openings(struct sr_discipline *);
55 : void sr_raid5_intr(struct buf *);
56 : int sr_raid5_wu_done(struct sr_workunit *);
57 : void sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
58 : void sr_raid5_set_vol_state(struct sr_discipline *);
59 :
60 : int sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
61 : void *, int, int, void *);
62 : int sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
63 : void *);
64 : int sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
65 : daddr_t, long, void *, int, int);
66 : void sr_raid5_xor(void *, void *, int);
67 :
68 : void sr_raid5_rebuild(struct sr_discipline *);
69 : void sr_raid5_scrub(struct sr_discipline *);
70 :
71 : /* discipline initialisation. */
72 : void
73 0 : sr_raid5_discipline_init(struct sr_discipline *sd)
74 : {
75 : /* Fill out discipline members. */
76 0 : sd->sd_type = SR_MD_RAID5;
77 0 : strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
78 0 : sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
79 : SR_CAP_REBUILD | SR_CAP_REDUNDANT;
80 0 : sd->sd_max_wu = SR_RAID5_NOWU + 2; /* Two for scrub/rebuild. */
81 :
82 : /* Setup discipline specific function pointers. */
83 0 : sd->sd_assemble = sr_raid5_assemble;
84 0 : sd->sd_create = sr_raid5_create;
85 0 : sd->sd_openings = sr_raid5_openings;
86 0 : sd->sd_rebuild = sr_raid5_rebuild;
87 0 : sd->sd_scsi_rw = sr_raid5_rw;
88 0 : sd->sd_scsi_intr = sr_raid5_intr;
89 0 : sd->sd_scsi_wu_done = sr_raid5_wu_done;
90 0 : sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
91 0 : sd->sd_set_vol_state = sr_raid5_set_vol_state;
92 0 : }
93 :
94 : int
95 0 : sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
96 : int no_chunk, int64_t coerced_size)
97 : {
98 0 : if (no_chunk < 3) {
99 0 : sr_error(sd->sd_sc, "%s requires three or more chunks",
100 0 : sd->sd_name);
101 0 : return EINVAL;
102 : }
103 :
104 : /*
105 : * XXX add variable strip size later even though MAXPHYS is really
106 : * the clever value, users like to tinker with that type of stuff.
107 : */
108 0 : sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
109 0 : sd->sd_meta->ssdi.ssd_size = (coerced_size &
110 0 : ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
111 0 : DEV_BSHIFT) - 1)) * (no_chunk - 1);
112 :
113 0 : return sr_raid5_init(sd);
114 0 : }
115 :
/*
 * Assemble an existing RAID 5 volume.  No discipline-specific metadata
 * needs to be read back; only the runtime state must be initialised.
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid5_init(sd);
}
122 :
123 : int
124 0 : sr_raid5_init(struct sr_discipline *sd)
125 : {
126 : /* Initialise runtime values. */
127 0 : sd->mds.mdd_raid5.sr5_strip_bits =
128 0 : sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
129 0 : if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
130 0 : sr_error(sd->sd_sc, "invalid strip size");
131 0 : return EINVAL;
132 : }
133 :
134 0 : sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
135 :
136 0 : return 0;
137 0 : }
138 :
139 : int
140 0 : sr_raid5_openings(struct sr_discipline *sd)
141 : {
142 : /* Two work units per I/O, two for rebuild/scrub. */
143 0 : return ((sd->sd_max_wu - 2) >> 1);
144 : }
145 :
/*
 * Transition chunk c of the volume to new_state, validating the
 * transition against the allowed chunk state machine.  An invalid
 * transition panics.  On a successful change the volume state is
 * recalculated and a metadata save is scheduled.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/* Validate old_state -> new_state against the allowed transitions. */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk may only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	/* Commit the chunk state and recompute the volume state. */
	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the state change to the on-disk metadata via the task queue. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
220 :
/*
 * Recompute the volume state from the states of its chunks and apply
 * it, validating the old -> new volume state transition.  RAID 5
 * tolerates a single missing chunk: all chunks online is ONLINE, one
 * missing is DEGRADED (or SCRUB/REBUILD while one is in that state),
 * more than one missing is OFFLINE.  Invalid states or transitions
 * panic.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int states[SR_MAX_STATES];
	int new_state, i, s, nd;
	int old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Tally how many chunks are in each state. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Derive the new volume state from the chunk tallies. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate old_state -> new_state against the volume state machine. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
345 :
346 : static inline int
347 0 : sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
348 : {
349 0 : switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
350 : case BIOC_SDONLINE:
351 : case BIOC_SDSCRUB:
352 0 : return 1;
353 : default:
354 0 : return 0;
355 : }
356 0 : }
357 :
358 : static inline int
359 0 : sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
360 : {
361 0 : switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
362 : case BIOC_SDREBUILD:
363 0 : return 1;
364 : default:
365 0 : return 0;
366 : }
367 0 : }
368 :
369 : int
370 0 : sr_raid5_rw(struct sr_workunit *wu)
371 : {
372 : struct sr_workunit *wu_r = NULL;
373 0 : struct sr_discipline *sd = wu->swu_dis;
374 0 : struct scsi_xfer *xs = wu->swu_xs;
375 : struct sr_chunk *scp;
376 0 : daddr_t blkno, lba;
377 : int64_t chunk_offs, lbaoffs, offset, strip_offs;
378 : int64_t strip_bits, strip_no, strip_size;
379 : int64_t chunk, no_chunk;
380 : int64_t parity, row_size;
381 : long length, datalen;
382 : void *data;
383 : int s;
384 :
385 : /* blkno and scsi error will be handled by sr_validate_io */
386 0 : if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
387 : goto bad;
388 :
389 : DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
390 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
391 : (xs->flags & SCSI_DATA_IN) ? "read" : "write",
392 : (long long)blkno, xs->datalen);
393 :
394 0 : strip_size = sd->sd_meta->ssdi.ssd_strip_size;
395 0 : strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
396 0 : no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
397 0 : row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
398 :
399 0 : data = xs->data;
400 0 : datalen = xs->datalen;
401 0 : lbaoffs = blkno << DEV_BSHIFT;
402 :
403 0 : if (xs->flags & SCSI_DATA_OUT) {
404 0 : if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
405 0 : printf("%s: %s failed to get read work unit",
406 0 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
407 0 : goto bad;
408 : }
409 0 : wu_r->swu_state = SR_WU_INPROGRESS;
410 0 : wu_r->swu_flags |= SR_WUF_DISCIPLINE;
411 0 : }
412 :
413 0 : wu->swu_blk_start = 0;
414 0 : while (datalen != 0) {
415 0 : strip_no = lbaoffs >> strip_bits;
416 0 : strip_offs = lbaoffs & (strip_size - 1);
417 0 : chunk_offs = (strip_no / no_chunk) << strip_bits;
418 0 : offset = chunk_offs + strip_offs;
419 :
420 : /* get size remaining in this stripe */
421 0 : length = MIN(strip_size - strip_offs, datalen);
422 :
423 : /*
424 : * Map disk offset to data and parity chunks, using a left
425 : * asymmetric algorithm for the parity assignment.
426 : */
427 0 : chunk = strip_no % no_chunk;
428 0 : parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
429 0 : if (chunk >= parity)
430 0 : chunk++;
431 :
432 0 : lba = offset >> DEV_BSHIFT;
433 :
434 : /* XXX big hammer.. exclude I/O from entire stripe */
435 0 : if (wu->swu_blk_start == 0)
436 0 : wu->swu_blk_start = (strip_no / no_chunk) * row_size;
437 0 : wu->swu_blk_end = (strip_no / no_chunk) * row_size +
438 0 : (row_size - 1);
439 :
440 0 : scp = sd->sd_vol.sv_chunks[chunk];
441 0 : if (xs->flags & SCSI_DATA_IN) {
442 0 : switch (scp->src_meta.scm_status) {
443 : case BIOC_SDONLINE:
444 : case BIOC_SDSCRUB:
445 : /*
446 : * Chunk is online, issue a single read
447 : * request.
448 : */
449 0 : if (sr_raid5_addio(wu, chunk, lba, length,
450 0 : data, xs->flags, 0, NULL))
451 : goto bad;
452 : break;
453 : case BIOC_SDOFFLINE:
454 : case BIOC_SDREBUILD:
455 : case BIOC_SDHOTSPARE:
456 0 : if (sr_raid5_regenerate(wu, chunk, lba,
457 : length, data))
458 : goto bad;
459 : break;
460 : default:
461 0 : printf("%s: is offline, can't read\n",
462 0 : DEVNAME(sd->sd_sc));
463 0 : goto bad;
464 : }
465 : } else {
466 0 : if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
467 : length, data, xs->flags, 0))
468 : goto bad;
469 : }
470 :
471 : /* advance to next block */
472 0 : lbaoffs += length;
473 0 : datalen -= length;
474 0 : data += length;
475 : }
476 :
477 0 : s = splbio();
478 0 : if (wu_r) {
479 0 : if (wu_r->swu_io_count > 0) {
480 : /* collide write request with reads */
481 0 : wu_r->swu_blk_start = wu->swu_blk_start;
482 0 : wu_r->swu_blk_end = wu->swu_blk_end;
483 :
484 0 : wu->swu_state = SR_WU_DEFERRED;
485 0 : wu_r->swu_collider = wu;
486 0 : TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
487 :
488 : wu = wu_r;
489 0 : } else {
490 0 : sr_scsi_wu_put(sd, wu_r);
491 : }
492 : }
493 0 : splx(s);
494 :
495 0 : sr_schedule_wu(wu);
496 :
497 0 : return (0);
498 :
499 : bad:
500 : /* wu is unwound by sr_wu_put */
501 0 : if (wu_r)
502 0 : sr_scsi_wu_put(sd, wu_r);
503 0 : return (1);
504 0 : }
505 :
/*
 * Regenerate len bytes at blkno of an offline chunk into data by
 * queueing reads of the same range from every other chunk; each read
 * completion XORs its buffer into data (via ccb_opaque, see
 * sr_raid5_intr).  Requires the parity to already be correct and all
 * other chunks to be readable.  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
    long len, void *data)
{
	struct sr_discipline *sd = wu->swu_dis;
	int i;

	/*
	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
	 * from all of the remaining online chunks. This requires the parity
	 * to already be correct.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
	    "regenerating block %llu\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);

	/* Start from zero so the XOR accumulation yields the missing data. */
	memset(data, 0, len);
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk)
			continue;
		/* A second unreadable chunk makes regeneration impossible. */
		if (!sr_raid5_chunk_online(sd, i))
			goto bad;
		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
		    0, data))
			goto bad;
	}
	return (0);

bad:
	return (1);
}
538 :
/*
 * Queue the I/Os for a RAID 5 write of len bytes at blkno: reads (on
 * wu_r) to gather the data needed for the parity calculation, then the
 * parity and data writes (on wu).  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, long len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;
	void *xorbuf;
	int chunk_online, chunk_rebuild;
	int parity_online, parity_rebuild;
	int other_offline = 0, other_rebuild = 0;
	int i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	/* Classify the target data chunk, the parity chunk and the rest. */
	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): parity chunk unusable, just write the data. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* Seed the parity buffer with the new data being written. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Read in existing data and existing parity. */
		/* Case (4): read-modify-write using the old data + parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Read in existing data from all other chunks. */
		/* Cases (1) and (3): reconstruct parity from all data chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}
658 :
/*
 * Per-buffer I/O completion handler, called at interrupt time.  Folds
 * a completed read into its XOR target (if any), frees temporary
 * buffers and hands the CCB back to the work unit accounting.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb *ccb = (struct sr_ccb *)bp;
	struct sr_workunit *wu = ccb->ccb_wu;
	struct sr_discipline *sd = wu->swu_dis;
	int s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	/* ccb_opaque is the accumulation buffer set up by sr_raid5_addio. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
689 :
/*
 * Decide the fate of a completed work unit: success, a retried read,
 * or permanent failure.  Discipline-internal and rebuild work units
 * are never retried here.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		/* Re-issue the whole read; regeneration may take over. */
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
724 :
/*
 * Build a CCB for a len-byte transfer at blkno on the given chunk and
 * queue it on wu.  A NULL data pointer allocates a temporary buffer
 * that is freed on completion (SR_CCBF_FREEBUF).  xorbuf, if set, is
 * stored in ccb_opaque so sr_raid5_intr XORs the completed read into
 * it.  Returns 0 on success, -1 on allocation failure.
 */
int
sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
    long len, void *data, int xsflags, int ccbflags, void *xorbuf)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct sr_ccb *ccb;

	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
	    "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
	    chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");

	/* Allocate temporary buffer. */
	if (data == NULL) {
		data = sr_block_get(sd, len);
		if (data == NULL)
			return (-1);
		ccbflags |= SR_CCBF_FREEBUF;
	}

	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
	if (ccb == NULL) {
		/* Only free the buffer if this function allocated it. */
		if (ccbflags & SR_CCBF_FREEBUF)
			sr_block_put(sd, data, len);
		return (-1);
	}
	ccb->ccb_opaque = xorbuf;
	sr_wu_enqueue_ccb(wu, ccb);

	return (0);
}
755 :
/*
 * XOR len bytes of b into a, 32 bits at a time.  len is expected to be
 * a multiple of four; any remainder bytes are ignored.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t *dst = a;
	uint32_t *src = b;
	int words;

	for (words = len >> 2; words > 0; words--)
		*dst++ ^= *src++;
}
765 :
/*
 * Rebuild the single chunk currently marked BIOC_SDREBUILD, one strip
 * at a time: regenerate the strip from the other chunks (read work
 * unit), then write it to the rebuild chunk (write work unit collided
 * with the read).  Progress is checkpointed in ssd_rebuild so an
 * interrupted rebuild can resume.  Runs in process context and sleeps
 * for each strip.
 */
void
sr_raid5_rebuild(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, strip_bits, i, restart;
	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept, percent = 0, old_percent = -1;
	int rebuild_chunk = -1;
	void *xorbuf;

	/* Find the rebuild chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sr_raid5_chunk_rebuild(sd, i)) {
			rebuild_chunk = i;
			break;
		}
	}
	if (rebuild_chunk == -1)
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;

	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
	    row_size);

	/* Resume from the last checkpointed position, if sane. */
	restart = sd->sd_meta->ssd_rebuild / row_size;
	if (restart > chunk_strips) {
		printf("%s: bogus rebuild restart offset, starting from 0\n",
		    DEVNAME(sd->sd_sc));
		restart = 0;
	}
	if (restart != 0) {
		percent = sr_rebuild_percent(sd);
		printf("%s: resuming rebuild on %s at %d%%\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
	}

	for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);

		wu_w = sr_scsi_wu_get(sd, 0);
		wu_r = sr_scsi_wu_get(sd, 0);

		/*
		 * XXX - sr_block_get() result is not checked for NULL here,
		 * and wu_w/wu_r are not released on the bad path; presumably
		 * relies on rebuild being torn down with the volume - verify.
		 */
		xorbuf = sr_block_get(sd, strip_size);
		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
		    strip_size, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
			goto bad;

		/* Collide write work unit with read work unit. */
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		/* Block I/O to this strip while we rebuild it. */
		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
		wu_w->swu_blk_start = wu_r->swu_blk_start;
		wu_w->swu_blk_end = wu_r->swu_blk_end;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    wu_r->swu_blk_start, wu_r->swu_blk_end);

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		sr_schedule_wu(wu_r);

		/* Wait for the collided write to complete. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
			slept = 1;
		}
		/* Yield at least once per strip so normal I/O can progress. */
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/* Checkpoint progress in volume (not chunk) blocks. */
		sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;

		/* Only write metadata when the reported percentage changes. */
		percent = sr_rebuild_percent(sd);
		if (percent != old_percent && strip_no != chunk_strips - 1) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sd->sd_sc),
				    sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
	    sd->sd_meta->ssd_devname);

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
			break;
		}
	}

	return;

abort:
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
bad:
	return;
}
901 :
#if 0
/*
 * Disabled (never compiled) prototype of a RAID 5 scrub: for each
 * strip, read all data chunks into an XOR buffer and rewrite the
 * parity chunk.  Block numbers are placeholders (0xBADCAFE), return
 * values of sr_raid5_addio() are unchecked and the `done:' label is
 * unused - this is scaffolding, not working code.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Same left asymmetric parity placement as sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif
|