Line data Source code
1 : /* $OpenBSD: softraid_raid6.c,v 1.71 2016/04/12 16:26:54 krw Exp $ */
2 : /*
3 : * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4 : * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5 : *
6 : * Permission to use, copy, modify, and distribute this software for any
7 : * purpose with or without fee is hereby granted, provided that the above
8 : * copyright notice and this permission notice appear in all copies.
9 : *
10 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 : */
18 :
19 : #include "bio.h"
20 :
21 : #include <sys/param.h>
22 : #include <sys/systm.h>
23 : #include <sys/buf.h>
24 : #include <sys/device.h>
25 : #include <sys/ioctl.h>
26 : #include <sys/malloc.h>
27 : #include <sys/kernel.h>
28 : #include <sys/disk.h>
29 : #include <sys/rwlock.h>
30 : #include <sys/queue.h>
31 : #include <sys/fcntl.h>
32 : #include <sys/mount.h>
33 : #include <sys/sensors.h>
34 : #include <sys/stat.h>
35 : #include <sys/task.h>
36 : #include <sys/conf.h>
37 : #include <sys/uio.h>
38 :
39 : #include <scsi/scsi_all.h>
40 : #include <scsi/scsiconf.h>
41 : #include <scsi/scsi_disk.h>
42 :
43 : #include <dev/softraidvar.h>
44 :
45 : uint8_t *gf_map[256]; /* per-multiplier product tables, allocated on demand by gf_premul() */
46 : uint8_t gf_pow[768]; /* antilog table; gf_init() fills [0..510], the rest stays zero */
47 : int gf_log[256]; /* log table; gf_init() sets gf_log[0] to the sentinel 512 */
48 :
49 : /* RAID 6 functions. */
50 : int sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
51 : int, int64_t);
52 : int sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
53 : int, void *);
54 : int sr_raid6_init(struct sr_discipline *);
55 : int sr_raid6_rw(struct sr_workunit *);
56 : int sr_raid6_openings(struct sr_discipline *);
57 : void sr_raid6_intr(struct buf *);
58 : int sr_raid6_wu_done(struct sr_workunit *);
59 : void sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
60 : void sr_raid6_set_vol_state(struct sr_discipline *);
61 :
62 : void sr_raid6_xorp(void *, void *, int);
63 : void sr_raid6_xorq(void *, void *, int, int);
64 : int sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, long,
65 : void *, int, int, void *, void *, int);
66 : void sr_raid6_scrub(struct sr_discipline *);
67 : int sr_failio(struct sr_workunit *);
68 :
69 : void gf_init(void);
70 : uint8_t gf_inv(uint8_t);
71 : int gf_premul(uint8_t);
72 : uint8_t gf_mul(uint8_t, uint8_t);
73 :
74 : #define SR_NOFAIL 0x00
75 : #define SR_FAILX (1L << 0) /* the data chunk addressed by this I/O is unavailable */
76 : #define SR_FAILY (1L << 1) /* some other data chunk in the row is unavailable */
77 : #define SR_FAILP (1L << 2) /* the xor-parity (P) chunk is unavailable */
78 : #define SR_FAILQ (1L << 3) /* the GF(256) parity (Q) chunk is unavailable */
79 :
80 : struct sr_raid6_opaque { /* per-ccb descriptor built by sr_raid6_addio(), consumed by sr_raid6_intr() */
81 : int gn; /* multiplier used when folding the read data into qbuf */
82 : void *pbuf; /* if set, completed data is xored into this buffer */
83 : void *qbuf; /* if set, completed data times gn is xored into this buffer */
84 : };
85 :
86 : /* discipline initialisation. */
87 : void
88 0 : sr_raid6_discipline_init(struct sr_discipline *sd)
89 : {
90 : /* Initialize GF256 tables. */
91 0 : gf_init();
92 :
93 : /* Fill out discipline members. */
94 0 : sd->sd_type = SR_MD_RAID6;
95 0 : strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
96 0 : sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
97 : SR_CAP_REDUNDANT;
98 0 : sd->sd_max_wu = SR_RAID6_NOWU;
99 :
100 : /* Setup discipline specific function pointers. */
101 0 : sd->sd_assemble = sr_raid6_assemble;
102 0 : sd->sd_create = sr_raid6_create;
103 0 : sd->sd_openings = sr_raid6_openings;
104 0 : sd->sd_scsi_rw = sr_raid6_rw;
105 0 : sd->sd_scsi_intr = sr_raid6_intr;
106 0 : sd->sd_scsi_wu_done = sr_raid6_wu_done;
107 0 : sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
108 0 : sd->sd_set_vol_state = sr_raid6_set_vol_state;
109 0 : }
110 :
111 : int
112 0 : sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
113 : int no_chunk, int64_t coerced_size)
114 : {
115 0 : if (no_chunk < 4) {
116 0 : sr_error(sd->sd_sc, "%s requires four or more chunks",
117 0 : sd->sd_name);
118 0 : return EINVAL;
119 : }
120 :
121 : /*
122 : * XXX add variable strip size later even though MAXPHYS is really
123 : * the clever value, users like * to tinker with that type of stuff.
124 : */
125 0 : sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
126 0 : sd->sd_meta->ssdi.ssd_size = (coerced_size &
127 0 : ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
128 0 : DEV_BSHIFT) - 1)) * (no_chunk - 2);
129 :
130 0 : return sr_raid6_init(sd);
131 0 : }
132 :
/*
 * Assemble an existing RAID 6 volume.  Nothing beyond the common runtime
 * initialisation is needed; bc, no_chunk and data are unused.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	int			error;

	error = sr_raid6_init(sd);
	return error;
}
139 :
140 : int
141 0 : sr_raid6_init(struct sr_discipline *sd)
142 : {
143 : /* Initialise runtime values. */
144 0 : sd->mds.mdd_raid6.sr6_strip_bits =
145 0 : sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
146 0 : if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
147 0 : sr_error(sd->sd_sc, "invalid strip size");
148 0 : return EINVAL;
149 : }
150 :
151 : /* only if stripsize <= MAXPHYS */
152 0 : sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
153 :
154 0 : return 0;
155 0 : }
156 :
157 : int
158 0 : sr_raid6_openings(struct sr_discipline *sd)
159 : {
160 0 : return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
161 : }
162 :
163 : void
164 0 : sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
165 : {
166 : int old_state, s;
167 :
168 : /* XXX this is for RAID 0 */
169 : DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
170 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
171 : sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
172 :
173 : /* ok to go to splbio since this only happens in error path */
174 0 : s = splbio();
175 0 : old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
176 :
177 : /* multiple IOs to the same chunk that fail will come through here */
178 0 : if (old_state == new_state)
179 : goto done;
180 :
181 0 : switch (old_state) {
182 : case BIOC_SDONLINE:
183 0 : switch (new_state) {
184 : case BIOC_SDOFFLINE:
185 : case BIOC_SDSCRUB:
186 : break;
187 : default:
188 : goto die;
189 : }
190 : break;
191 :
192 : case BIOC_SDOFFLINE:
193 0 : if (new_state == BIOC_SDREBUILD) {
194 : ;
195 : } else
196 : goto die;
197 : break;
198 :
199 : case BIOC_SDSCRUB:
200 0 : switch (new_state) {
201 : case BIOC_SDONLINE:
202 : case BIOC_SDOFFLINE:
203 : break;
204 : default:
205 : goto die;
206 : }
207 : break;
208 :
209 : case BIOC_SDREBUILD:
210 0 : switch (new_state) {
211 : case BIOC_SDONLINE:
212 : case BIOC_SDOFFLINE:
213 : break;
214 : default:
215 : goto die;
216 : }
217 : break;
218 :
219 : default:
220 : die:
221 0 : splx(s); /* XXX */
222 0 : panic("%s: %s: %s: invalid chunk state transition "
223 0 : "%d -> %d", DEVNAME(sd->sd_sc),
224 0 : sd->sd_meta->ssd_devname,
225 0 : sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
226 : old_state, new_state);
227 : /* NOTREACHED */
228 : }
229 :
230 0 : sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
231 0 : sd->sd_set_vol_state(sd);
232 :
233 0 : sd->sd_must_flush = 1;
234 0 : task_add(systq, &sd->sd_meta_save_task);
235 : done:
236 0 : splx(s);
237 0 : }
238 :
239 : void
240 0 : sr_raid6_set_vol_state(struct sr_discipline *sd)
241 : {
242 0 : int states[SR_MAX_STATES];
243 : int new_state, i, s, nd;
244 0 : int old_state = sd->sd_vol_status;
245 :
246 : /* XXX this is for RAID 0 */
247 :
248 : DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
249 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
250 :
251 0 : nd = sd->sd_meta->ssdi.ssd_chunk_no;
252 :
253 0 : for (i = 0; i < SR_MAX_STATES; i++)
254 0 : states[i] = 0;
255 :
256 0 : for (i = 0; i < nd; i++) {
257 0 : s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
258 0 : if (s >= SR_MAX_STATES)
259 0 : panic("%s: %s: %s: invalid chunk state",
260 0 : DEVNAME(sd->sd_sc),
261 0 : sd->sd_meta->ssd_devname,
262 0 : sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
263 0 : states[s]++;
264 : }
265 :
266 0 : if (states[BIOC_SDONLINE] == nd)
267 0 : new_state = BIOC_SVONLINE;
268 0 : else if (states[BIOC_SDONLINE] < nd - 2)
269 0 : new_state = BIOC_SVOFFLINE;
270 0 : else if (states[BIOC_SDSCRUB] != 0)
271 0 : new_state = BIOC_SVSCRUB;
272 0 : else if (states[BIOC_SDREBUILD] != 0)
273 0 : new_state = BIOC_SVREBUILD;
274 0 : else if (states[BIOC_SDONLINE] < nd)
275 : new_state = BIOC_SVDEGRADED;
276 : else {
277 0 : printf("old_state = %d, ", old_state);
278 0 : for (i = 0; i < nd; i++)
279 0 : printf("%d = %d, ", i,
280 0 : sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
281 0 : panic("invalid new_state");
282 : }
283 :
284 : DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
285 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
286 : old_state, new_state);
287 :
288 0 : switch (old_state) {
289 : case BIOC_SVONLINE:
290 0 : switch (new_state) {
291 : case BIOC_SVONLINE: /* can go to same state */
292 : case BIOC_SVOFFLINE:
293 : case BIOC_SVDEGRADED:
294 : case BIOC_SVREBUILD: /* happens on boot */
295 : break;
296 : default:
297 : goto die;
298 : }
299 : break;
300 :
301 : case BIOC_SVOFFLINE:
302 : /* XXX this might be a little too much */
303 : goto die;
304 :
305 : case BIOC_SVDEGRADED:
306 0 : switch (new_state) {
307 : case BIOC_SVOFFLINE:
308 : case BIOC_SVREBUILD:
309 : case BIOC_SVDEGRADED: /* can go to the same state */
310 : break;
311 : default:
312 : goto die;
313 : }
314 : break;
315 :
316 : case BIOC_SVBUILDING:
317 0 : switch (new_state) {
318 : case BIOC_SVONLINE:
319 : case BIOC_SVOFFLINE:
320 : case BIOC_SVBUILDING: /* can go to the same state */
321 : break;
322 : default:
323 : goto die;
324 : }
325 : break;
326 :
327 : case BIOC_SVSCRUB:
328 0 : switch (new_state) {
329 : case BIOC_SVONLINE:
330 : case BIOC_SVOFFLINE:
331 : case BIOC_SVDEGRADED:
332 : case BIOC_SVSCRUB: /* can go to same state */
333 : break;
334 : default:
335 : goto die;
336 : }
337 : break;
338 :
339 : case BIOC_SVREBUILD:
340 0 : switch (new_state) {
341 : case BIOC_SVONLINE:
342 : case BIOC_SVOFFLINE:
343 : case BIOC_SVDEGRADED:
344 : case BIOC_SVREBUILD: /* can go to the same state */
345 : break;
346 : default:
347 : goto die;
348 : }
349 : break;
350 :
351 : default:
352 : die:
353 0 : panic("%s: %s: invalid volume state transition %d -> %d",
354 0 : DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
355 : old_state, new_state);
356 : /* NOTREACHED */
357 : }
358 :
359 0 : sd->sd_vol_status = new_state;
360 0 : }
361 :
362 : /* modes:
363 : * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
364 : * 0, qbuf, NULL, 0);
365 : * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
366 : * 0, pbuf, NULL, 0);
367 : * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
368 : * 0, pbuf, qbuf, gf_pow[i]);
369 : */
370 :
371 : int
372 0 : sr_raid6_rw(struct sr_workunit *wu)
373 : {
374 : struct sr_workunit *wu_r = NULL;
375 0 : struct sr_discipline *sd = wu->swu_dis;
376 0 : struct scsi_xfer *xs = wu->swu_xs;
377 : struct sr_chunk *scp;
378 : int s, fail, i, gxinv, pxinv;
379 0 : daddr_t blkno, lba;
380 : int64_t chunk_offs, lbaoffs, offset, strip_offs;
381 : int64_t strip_no, strip_size, strip_bits, row_size;
382 : int64_t fchunk, no_chunk, chunk, qchunk, pchunk;
383 : long length, datalen;
384 : void *pbuf, *data, *qbuf;
385 :
386 : /* blkno and scsi error will be handled by sr_validate_io */
387 0 : if (sr_validate_io(wu, &blkno, "sr_raid6_rw"))
388 : goto bad;
389 :
390 0 : strip_size = sd->sd_meta->ssdi.ssd_strip_size;
391 0 : strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
392 0 : no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
393 0 : row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
394 :
395 0 : data = xs->data;
396 0 : datalen = xs->datalen;
397 0 : lbaoffs = blkno << DEV_BSHIFT;
398 :
399 0 : if (xs->flags & SCSI_DATA_OUT) {
400 0 : if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
401 0 : printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
402 0 : goto bad;
403 : }
404 0 : wu_r->swu_state = SR_WU_INPROGRESS;
405 0 : wu_r->swu_flags |= SR_WUF_DISCIPLINE;
406 0 : }
407 :
408 0 : wu->swu_blk_start = 0;
409 0 : while (datalen != 0) {
410 0 : strip_no = lbaoffs >> strip_bits;
411 0 : strip_offs = lbaoffs & (strip_size - 1);
412 0 : chunk_offs = (strip_no / no_chunk) << strip_bits;
413 0 : offset = chunk_offs + strip_offs;
414 :
415 : /* get size remaining in this stripe */
416 0 : length = MIN(strip_size - strip_offs, datalen);
417 :
418 : /* map disk offset to parity/data drive */
419 0 : chunk = strip_no % no_chunk;
420 :
421 0 : qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
422 0 : if (qchunk == 0)
423 0 : pchunk = no_chunk + 1;
424 : else
425 0 : pchunk = qchunk - 1;
426 0 : if (chunk >= pchunk)
427 0 : chunk++;
428 0 : if (chunk >= qchunk)
429 0 : chunk++;
430 :
431 0 : lba = offset >> DEV_BSHIFT;
432 :
433 : /* XXX big hammer.. exclude I/O from entire stripe */
434 0 : if (wu->swu_blk_start == 0)
435 0 : wu->swu_blk_start = (strip_no / no_chunk) * row_size;
436 0 : wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);
437 :
438 : fail = 0;
439 : fchunk = -1;
440 :
441 : /* Get disk-fail flags */
442 0 : for (i=0; i< no_chunk+2; i++) {
443 0 : scp = sd->sd_vol.sv_chunks[i];
444 0 : switch (scp->src_meta.scm_status) {
445 : case BIOC_SDOFFLINE:
446 : case BIOC_SDREBUILD:
447 : case BIOC_SDHOTSPARE:
448 0 : if (i == qchunk)
449 0 : fail |= SR_FAILQ;
450 0 : else if (i == pchunk)
451 0 : fail |= SR_FAILP;
452 0 : else if (i == chunk)
453 0 : fail |= SR_FAILX;
454 : else {
455 : /* dual data-disk failure */
456 0 : fail |= SR_FAILY;
457 : fchunk = i;
458 : }
459 : break;
460 : }
461 : }
462 0 : if (xs->flags & SCSI_DATA_IN) {
463 0 : if (!(fail & SR_FAILX)) {
464 : /* drive is good. issue single read request */
465 0 : if (sr_raid6_addio(wu, chunk, lba, length,
466 : data, xs->flags, 0, NULL, NULL, 0))
467 : goto bad;
468 0 : } else if (fail & SR_FAILP) {
469 : /* Dx, P failed */
470 0 : printf("Disk %llx offline, "
471 : "regenerating Dx+P\n", chunk);
472 :
473 0 : gxinv = gf_inv(gf_pow[chunk]);
474 :
475 : /* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
476 0 : memset(data, 0, length);
477 0 : if (sr_raid6_addio(wu, qchunk, lba, length,
478 : NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
479 : goto bad;
480 :
481 : /* Read Dz * gz * inv(gx) */
482 0 : for (i = 0; i < no_chunk+2; i++) {
483 0 : if (i == qchunk || i == pchunk || i == chunk)
484 : continue;
485 :
486 0 : if (sr_raid6_addio(wu, i, lba, length,
487 : NULL, SCSI_DATA_IN, 0, NULL, data,
488 0 : gf_mul(gf_pow[i], gxinv)))
489 : goto bad;
490 : }
491 :
492 : /* data will contain correct value on completion */
493 0 : } else if (fail & SR_FAILY) {
494 : /* Dx, Dy failed */
495 0 : printf("Disk %llx & %llx offline, "
496 : "regenerating Dx+Dy\n", chunk, fchunk);
497 :
498 0 : gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
499 0 : pxinv = gf_mul(gf_pow[fchunk], gxinv);
500 :
501 : /* read Q * inv(gx + gy) */
502 0 : memset(data, 0, length);
503 0 : if (sr_raid6_addio(wu, qchunk, lba, length,
504 : NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
505 : goto bad;
506 :
507 : /* read P * gy * inv(gx + gy) */
508 0 : if (sr_raid6_addio(wu, pchunk, lba, length,
509 : NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
510 : goto bad;
511 :
512 : /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
513 : * Q: sr_raid6_xorp(qbuf, --, length);
514 : * P: sr_raid6_xorp(pbuf, --, length);
515 : * Dz: sr_raid6_xorp(pbuf, --, length);
516 : * sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
517 : */
518 0 : for (i = 0; i < no_chunk+2; i++) {
519 0 : if (i == qchunk || i == pchunk ||
520 0 : i == chunk || i == fchunk)
521 : continue;
522 :
523 : /* read Dz * (gz + gy) * inv(gx + gy) */
524 0 : if (sr_raid6_addio(wu, i, lba, length,
525 : NULL, SCSI_DATA_IN, 0, NULL, data,
526 0 : pxinv ^ gf_mul(gf_pow[i], gxinv)))
527 : goto bad;
528 : }
529 : } else {
530 : /* Two cases: single disk (Dx) or (Dx+Q)
531 : * Dx = Dz ^ P (same as RAID5)
532 : */
533 0 : printf("Disk %llx offline, "
534 : "regenerating Dx%s\n", chunk,
535 0 : fail & SR_FAILQ ? "+Q" : " single");
536 :
537 : /* Calculate: Dx = P^Dz
538 : * P: sr_raid6_xorp(data, ---, length);
539 : * Dz: sr_raid6_xorp(data, ---, length);
540 : */
541 0 : memset(data, 0, length);
542 0 : for (i = 0; i < no_chunk+2; i++) {
543 0 : if (i != chunk && i != qchunk) {
544 : /* Read Dz */
545 0 : if (sr_raid6_addio(wu, i, lba,
546 : length, NULL, SCSI_DATA_IN,
547 : 0, data, NULL, 0))
548 : goto bad;
549 : }
550 : }
551 :
552 : /* data will contain correct value on completion */
553 : }
554 : } else {
555 : /* XXX handle writes to failed/offline disk? */
556 0 : if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
557 : goto bad;
558 :
559 : /*
560 : * initialize pbuf with contents of new data to be
561 : * written. This will be XORed with old data and old
562 : * parity in the intr routine. The result in pbuf
563 : * is the new parity data.
564 : */
565 0 : qbuf = sr_block_get(sd, length);
566 0 : if (qbuf == NULL)
567 : goto bad;
568 :
569 0 : pbuf = sr_block_get(sd, length);
570 0 : if (pbuf == NULL)
571 : goto bad;
572 :
573 : /* Calculate P = Dn; Q = gn * Dn */
574 0 : if (gf_premul(gf_pow[chunk]))
575 : goto bad;
576 0 : sr_raid6_xorp(pbuf, data, length);
577 0 : sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
578 :
579 : /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
580 0 : if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
581 0 : SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
582 : goto bad;
583 :
584 : /* Read old xor-parity: P ^= P' */
585 0 : if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
586 : SCSI_DATA_IN, 0, pbuf, NULL, 0))
587 : goto bad;
588 :
589 : /* Read old q-parity: Q ^= Q' */
590 0 : if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
591 : SCSI_DATA_IN, 0, qbuf, NULL, 0))
592 : goto bad;
593 :
594 : /* write new data */
595 0 : if (sr_raid6_addio(wu, chunk, lba, length, data,
596 0 : xs->flags, 0, NULL, NULL, 0))
597 : goto bad;
598 :
599 : /* write new xor-parity */
600 0 : if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
601 0 : xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
602 : goto bad;
603 :
604 : /* write new q-parity */
605 0 : if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
606 0 : xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
607 : goto bad;
608 : }
609 :
610 : /* advance to next block */
611 0 : lbaoffs += length;
612 0 : datalen -= length;
613 0 : data += length;
614 : }
615 :
616 0 : s = splbio();
617 0 : if (wu_r) {
618 : /* collide write request with reads */
619 0 : wu_r->swu_blk_start = wu->swu_blk_start;
620 0 : wu_r->swu_blk_end = wu->swu_blk_end;
621 :
622 0 : wu->swu_state = SR_WU_DEFERRED;
623 0 : wu_r->swu_collider = wu;
624 0 : TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
625 :
626 : wu = wu_r;
627 0 : }
628 0 : splx(s);
629 :
630 0 : sr_schedule_wu(wu);
631 :
632 0 : return (0);
633 : bad:
634 : /* XXX - can leak pbuf/qbuf on error. */
635 : /* wu is unwound by sr_wu_put */
636 0 : if (wu_r)
637 0 : sr_scsi_wu_put(sd, wu_r);
638 0 : return (1);
639 0 : }
640 :
641 : /* Handle failure I/O completion */
642 : int
643 0 : sr_failio(struct sr_workunit *wu)
644 : {
645 0 : struct sr_discipline *sd = wu->swu_dis;
646 : struct sr_ccb *ccb;
647 :
648 0 : if (!(wu->swu_flags & SR_WUF_FAIL))
649 0 : return (0);
650 :
651 : /* Wu is a 'fake'.. don't do real I/O just intr */
652 0 : TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
653 0 : TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
654 0 : sr_raid6_intr(&ccb->ccb_buf);
655 0 : return (1);
656 0 : }
657 :
658 : void
659 0 : sr_raid6_intr(struct buf *bp)
660 : {
661 0 : struct sr_ccb *ccb = (struct sr_ccb *)bp;
662 0 : struct sr_workunit *wu = ccb->ccb_wu;
663 0 : struct sr_discipline *sd = wu->swu_dis;
664 0 : struct sr_raid6_opaque *pq = ccb->ccb_opaque;
665 : int s;
666 :
667 : DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
668 : DEVNAME(sd->sd_sc), bp, wu->swu_xs);
669 :
670 0 : s = splbio();
671 0 : sr_ccb_done(ccb);
672 :
673 : /* XOR data to result. */
674 0 : if (ccb->ccb_state == SR_CCB_OK && pq) {
675 0 : if (pq->pbuf)
676 : /* Calculate xor-parity */
677 0 : sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
678 0 : ccb->ccb_buf.b_bcount);
679 0 : if (pq->qbuf)
680 : /* Calculate q-parity */
681 0 : sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
682 0 : ccb->ccb_buf.b_bcount, pq->gn);
683 0 : free(pq, M_DEVBUF, 0);
684 0 : ccb->ccb_opaque = NULL;
685 0 : }
686 :
687 : /* Free allocated data buffer. */
688 0 : if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
689 0 : sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
690 0 : ccb->ccb_buf.b_data = NULL;
691 0 : }
692 :
693 0 : sr_wu_done(wu);
694 0 : splx(s);
695 0 : }
696 :
697 : int
698 0 : sr_raid6_wu_done(struct sr_workunit *wu)
699 : {
700 0 : struct sr_discipline *sd = wu->swu_dis;
701 0 : struct scsi_xfer *xs = wu->swu_xs;
702 :
703 : /* XXX - we have no way of propagating errors... */
704 0 : if (wu->swu_flags & SR_WUF_DISCIPLINE)
705 0 : return SR_WU_OK;
706 :
707 : /* XXX - This is insufficient for RAID 6. */
708 0 : if (wu->swu_ios_succeeded > 0) {
709 0 : xs->error = XS_NOERROR;
710 0 : return SR_WU_OK;
711 : }
712 :
713 0 : if (xs->flags & SCSI_DATA_IN) {
714 0 : printf("%s: retrying read on block %lld\n",
715 : sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
716 0 : sr_wu_release_ccbs(wu);
717 0 : wu->swu_state = SR_WU_RESTART;
718 0 : if (sd->sd_scsi_rw(wu) == 0)
719 0 : return SR_WU_RESTART;
720 : } else {
721 0 : printf("%s: permanently fail write on block %lld\n",
722 : sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
723 : }
724 :
725 0 : wu->swu_state = SR_WU_FAILED;
726 0 : xs->error = XS_DRIVER_STUFFUP;
727 :
728 0 : return SR_WU_FAILED;
729 0 : }
730 :
731 : int
732 0 : sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
733 : long len, void *data, int xsflags, int ccbflags, void *pbuf,
734 : void *qbuf, int gn)
735 : {
736 0 : struct sr_discipline *sd = wu->swu_dis;
737 : struct sr_ccb *ccb;
738 : struct sr_raid6_opaque *pqbuf;
739 :
740 : DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%lld %ld %p:%p\n",
741 : (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
742 : (long long)blkno, len, pbuf, qbuf);
743 :
744 : /* Allocate temporary buffer. */
745 0 : if (data == NULL) {
746 0 : data = sr_block_get(sd, len);
747 0 : if (data == NULL)
748 0 : return (-1);
749 0 : ccbflags |= SR_CCBF_FREEBUF;
750 0 : }
751 :
752 0 : ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
753 0 : if (ccb == NULL) {
754 0 : if (ccbflags & SR_CCBF_FREEBUF)
755 0 : sr_block_put(sd, data, len);
756 0 : return (-1);
757 : }
758 0 : if (pbuf || qbuf) {
759 : /* XXX - can leak data and ccb on failure. */
760 0 : if (qbuf && gf_premul(gn))
761 0 : return (-1);
762 :
763 : /* XXX - should be preallocated? */
764 0 : pqbuf = malloc(sizeof(struct sr_raid6_opaque),
765 : M_DEVBUF, M_ZERO | M_NOWAIT);
766 0 : if (pqbuf == NULL) {
767 0 : sr_ccb_put(ccb);
768 0 : return (-1);
769 : }
770 0 : pqbuf->pbuf = pbuf;
771 0 : pqbuf->qbuf = qbuf;
772 0 : pqbuf->gn = gn;
773 0 : ccb->ccb_opaque = pqbuf;
774 0 : }
775 0 : sr_wu_enqueue_ccb(wu, ccb);
776 :
777 0 : return (0);
778 0 : }
779 :
/*
 * RAID 6 parity helpers.
 * P = xor parity, Q = GF(256) parity, D = data, gn = multiplier.
 *
 * sr_raid6_xorp: P ^= D, processed as 32-bit words.  len is in bytes;
 * bytes beyond the last whole word are not touched.
 */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*dst = p;
	uint32_t	*src = d;
	int		 words;

	for (words = len >> 2; words != 0; words--) {
		*dst ^= *src;
		dst++;
		src++;
	}
}
791 :
792 : void
793 0 : sr_raid6_xorq(void *q, void *d, int len, int gn)
794 : {
795 0 : uint32_t *qbuf = q, *data = d, x;
796 0 : uint8_t *gn_map = gf_map[gn];
797 :
798 0 : len >>= 2;
799 0 : while (len--) {
800 0 : x = *data++;
801 0 : *qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
802 0 : ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
803 0 : ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
804 0 : ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
805 : }
806 0 : }
807 :
808 : /* Create GF256 log/pow tables: polynomial = 0x11D */
809 : void
810 0 : gf_init(void)
811 : {
812 : int i;
813 : uint8_t p = 1;
814 :
815 : /* use 2N pow table to avoid using % in multiply */
816 0 : for (i=0; i<256; i++) {
817 0 : gf_log[p] = i;
818 0 : gf_pow[i] = gf_pow[i+255] = p;
819 0 : p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
820 : }
821 0 : gf_log[0] = 512;
822 0 : }
823 :
824 : uint8_t
825 0 : gf_inv(uint8_t a)
826 : {
827 0 : return gf_pow[255 - gf_log[a]];
828 : }
829 :
830 : uint8_t
831 0 : gf_mul(uint8_t a, uint8_t b)
832 : {
833 0 : return gf_pow[gf_log[a] + gf_log[b]];
834 : }
835 :
836 : /* Precalculate multiplication tables for drive gn */
837 : int
838 0 : gf_premul(uint8_t gn)
839 : {
840 : int i;
841 :
842 0 : if (gf_map[gn] != NULL)
843 0 : return (0);
844 :
845 0 : if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
846 0 : return (-1);
847 :
848 0 : for (i=0; i<256; i++)
849 0 : gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
850 0 : return (0);
851 0 : }
|