Line data Source code
1 : /* $OpenBSD: ffs_softdep.c,v 1.143 2018/07/02 20:56:22 bluhm Exp $ */
2 :
3 : /*
4 : * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
5 : *
6 : * The soft updates code is derived from the appendix of a University
7 : * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 : * "Soft Updates: A Solution to the Metadata Update Problem in File
9 : * Systems", CSE-TR-254-95, August 1995).
10 : *
11 : * Further information about soft updates can be obtained from:
12 : *
13 : * Marshall Kirk McKusick http://www.mckusick.com/softdep/
14 : * 1614 Oxford Street mckusick@mckusick.com
15 : * Berkeley, CA 94709-1608 +1-510-843-9542
16 : * USA
17 : *
18 : * Redistribution and use in source and binary forms, with or without
19 : * modification, are permitted provided that the following conditions
20 : * are met:
21 : *
22 : * 1. Redistributions of source code must retain the above copyright
23 : * notice, this list of conditions and the following disclaimer.
24 : * 2. Redistributions in binary form must reproduce the above copyright
25 : * notice, this list of conditions and the following disclaimer in the
26 : * documentation and/or other materials provided with the distribution.
27 : *
28 : * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
29 : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30 : * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 : * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
32 : * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 : * SUCH DAMAGE.
39 : *
40 : * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
41 : * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.86 2001/02/04 16:08:18 phk Exp $
42 : */
43 :
44 : #include <sys/param.h>
45 : #include <sys/buf.h>
46 : #include <sys/kernel.h>
47 : #include <sys/malloc.h>
48 : #include <sys/mount.h>
49 : #include <sys/proc.h>
50 : #include <sys/pool.h>
51 : #include <sys/syslog.h>
52 : #include <sys/systm.h>
53 : #include <sys/vnode.h>
54 : #include <sys/specdev.h>
55 : #include <crypto/siphash.h>
56 : #include <ufs/ufs/dir.h>
57 : #include <ufs/ufs/quota.h>
58 : #include <ufs/ufs/inode.h>
59 : #include <ufs/ufs/ufsmount.h>
60 : #include <ufs/ffs/fs.h>
61 : #include <ufs/ffs/softdep.h>
62 : #include <ufs/ffs/ffs_extern.h>
63 : #include <ufs/ufs/ufs_extern.h>
64 :
65 : #define STATIC
66 :
67 : /*
68 : * Mapping of dependency structure types to malloc types.
69 : */
70 : #define D_PAGEDEP 0
71 : #define D_INODEDEP 1
72 : #define D_NEWBLK 2
73 : #define D_BMSAFEMAP 3
74 : #define D_ALLOCDIRECT 4
75 : #define D_INDIRDEP 5
76 : #define D_ALLOCINDIR 6
77 : #define D_FREEFRAG 7
78 : #define D_FREEBLKS 8
79 : #define D_FREEFILE 9
80 : #define D_DIRADD 10
81 : #define D_MKDIR 11
82 : #define D_DIRREM 12
83 : #define D_NEWDIRBLK 13
84 : #define D_LAST 13
85 : /*
86 : * Names of softdep types.
87 : */
88 : const char *softdep_typenames[] = {
89 : "pagedep",
90 : "inodedep",
91 : "newblk",
92 : "bmsafemap",
93 : "allocdirect",
94 : "indirdep",
95 : "allocindir",
96 : "freefrag",
97 : "freeblks",
98 : "freefile",
99 : "diradd",
100 : "mkdir",
101 : "dirrem",
102 : "newdirblk",
103 : };
104 : #define TYPENAME(type) \
105 : ((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???")
106 : /*
107 : * Finding the current process.
108 : */
109 : #define CURPROC curproc
110 : /*
111 : * End system adaptation definitions.
112 : */
113 :
114 : /*
115 : * Internal function prototypes.
116 : */
117 : STATIC void softdep_error(char *, int);
118 : STATIC void drain_output(struct vnode *, int);
119 : STATIC int getdirtybuf(struct buf *, int);
120 : STATIC void clear_remove(struct proc *);
121 : STATIC void clear_inodedeps(struct proc *);
122 : STATIC int flush_pagedep_deps(struct vnode *, struct mount *,
123 : struct diraddhd *);
124 : STATIC int flush_inodedep_deps(struct fs *, ufsino_t);
125 : STATIC int handle_written_filepage(struct pagedep *, struct buf *);
126 : STATIC void diradd_inode_written(struct diradd *, struct inodedep *);
127 : STATIC int handle_written_inodeblock(struct inodedep *, struct buf *);
128 : STATIC void handle_allocdirect_partdone(struct allocdirect *);
129 : STATIC void handle_allocindir_partdone(struct allocindir *);
130 : STATIC void initiate_write_filepage(struct pagedep *, struct buf *);
131 : STATIC void handle_written_mkdir(struct mkdir *, int);
132 : STATIC void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
133 : #ifdef FFS2
134 : STATIC void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
135 : #endif
136 : STATIC void handle_workitem_freefile(struct freefile *);
137 : STATIC void handle_workitem_remove(struct dirrem *);
138 : STATIC struct dirrem *newdirrem(struct buf *, struct inode *,
139 : struct inode *, int, struct dirrem **);
140 : STATIC void free_diradd(struct diradd *);
141 : STATIC void free_allocindir(struct allocindir *, struct inodedep *);
142 : STATIC void free_newdirblk(struct newdirblk *);
143 : STATIC int indir_trunc(struct inode *, daddr_t, int, daddr_t, long *);
144 : STATIC void deallocate_dependencies(struct buf *, struct inodedep *);
145 : STATIC void free_allocdirect(struct allocdirectlst *,
146 : struct allocdirect *, int);
147 : STATIC int check_inode_unwritten(struct inodedep *);
148 : STATIC int free_inodedep(struct inodedep *);
149 : STATIC void handle_workitem_freeblocks(struct freeblks *);
150 : STATIC void merge_inode_lists(struct inodedep *);
151 : STATIC void setup_allocindir_phase2(struct buf *, struct inode *,
152 : struct allocindir *);
153 : STATIC struct allocindir *newallocindir(struct inode *, int, daddr_t,
154 : daddr_t);
155 : STATIC void handle_workitem_freefrag(struct freefrag *);
156 : STATIC struct freefrag *newfreefrag(struct inode *, daddr_t, long);
157 : STATIC void allocdirect_merge(struct allocdirectlst *,
158 : struct allocdirect *, struct allocdirect *);
159 : STATIC struct bmsafemap *bmsafemap_lookup(struct buf *);
160 : STATIC int newblk_lookup(struct fs *, daddr_t, int,
161 : struct newblk **);
162 : STATIC int inodedep_lookup(struct fs *, ufsino_t, int, struct inodedep **);
163 : STATIC int pagedep_lookup(struct inode *, daddr_t, int, struct pagedep **);
164 : STATIC void pause_timer(void *);
165 : STATIC int request_cleanup(int, int);
166 : STATIC int process_worklist_item(struct mount *, int);
167 : STATIC void add_to_worklist(struct worklist *);
168 :
169 : /*
170 : * Exported softdep operations.
171 : */
172 : void softdep_disk_io_initiation(struct buf *);
173 : void softdep_disk_write_complete(struct buf *);
174 : void softdep_deallocate_dependencies(struct buf *);
175 : void softdep_move_dependencies(struct buf *, struct buf *);
176 : int softdep_count_dependencies(struct buf *bp, int, int);
177 :
178 : /*
179 : * Locking primitives.
180 : *
181 : * For a uniprocessor, all we need to do is protect against disk
182 : * interrupts. For a multiprocessor, this lock would have to be
183 : * a mutex. A single mutex is used throughout this file, though
184 : * finer grain locking could be used if contention warranted it.
185 : *
186 : * For a multiprocessor, the sleep call would accept a lock and
187 : * release it after the sleep processing was complete. In a uniprocessor
188 : * implementation there is no such interlock, so we simple mark
189 : * the places where it needs to be done with the `interlocked' form
190 : * of the lock calls. Since the uniprocessor sleep already interlocks
191 : * the spl, there is nothing that really needs to be done.
192 : */
193 : #ifndef /* NOT */ DEBUG
194 : STATIC struct lockit {
195 : int lkt_spl;
196 : } lk = { 0 };
197 : #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
198 : #define FREE_LOCK(lk) splx((lk)->lkt_spl)
199 : #define ACQUIRE_LOCK_INTERLOCKED(lk,s) (lk)->lkt_spl = (s)
200 : #define FREE_LOCK_INTERLOCKED(lk) ((lk)->lkt_spl)
201 :
202 : #else /* DEBUG */
203 : STATIC struct lockit {
204 : int lkt_spl;
205 : pid_t lkt_held;
206 : int lkt_line;
207 : } lk = { 0, -1 };
208 : STATIC int lockcnt;
209 :
210 : STATIC void acquire_lock(struct lockit *, int);
211 : STATIC void free_lock(struct lockit *, int);
212 : STATIC void acquire_lock_interlocked(struct lockit *, int, int);
213 : STATIC int free_lock_interlocked(struct lockit *, int);
214 :
215 : #define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__)
216 : #define FREE_LOCK(lk) free_lock(lk, __LINE__)
217 : #define ACQUIRE_LOCK_INTERLOCKED(lk,s) acquire_lock_interlocked(lk, (s), __LINE__)
218 : #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__)
219 :
220 : STATIC void
221 : acquire_lock(struct lockit *lk, int line)
222 : {
223 : pid_t holder;
224 : int original_line;
225 :
226 : if (lk->lkt_held != -1) {
227 : holder = lk->lkt_held;
228 : original_line = lk->lkt_line;
229 : FREE_LOCK(lk);
230 : if (holder == CURPROC->p_tid)
231 : panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
232 : else
233 : panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
234 : }
235 : lk->lkt_spl = splbio();
236 : lk->lkt_held = CURPROC->p_tid;
237 : lk->lkt_line = line;
238 : lockcnt++;
239 : }
240 :
241 : STATIC void
242 : free_lock(struct lockit *lk, int line)
243 : {
244 :
245 : if (lk->lkt_held == -1)
246 : panic("softdep_unlock: lock not held at line %d", line);
247 : lk->lkt_held = -1;
248 : splx(lk->lkt_spl);
249 : }
250 :
251 : STATIC void
252 : acquire_lock_interlocked(struct lockit *lk, int s, int line)
253 : {
254 : pid_t holder;
255 : int original_line;
256 :
257 : if (lk->lkt_held != -1) {
258 : holder = lk->lkt_held;
259 : original_line = lk->lkt_line;
260 : FREE_LOCK_INTERLOCKED(lk);
261 : if (holder == CURPROC->p_tid)
262 : panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
263 : else
264 : panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
265 : }
266 : lk->lkt_held = CURPROC->p_tid;
267 : lk->lkt_line = line;
268 : lk->lkt_spl = s;
269 : lockcnt++;
270 : }
271 :
272 : STATIC int
273 : free_lock_interlocked(struct lockit *lk, int line)
274 : {
275 :
276 : if (lk->lkt_held == -1)
277 : panic("softdep_unlock_interlocked: lock not held at line %d", line);
278 : lk->lkt_held = -1;
279 :
280 : return (lk->lkt_spl);
281 : }
282 : #endif /* DEBUG */
283 :
284 : /*
285 : * Place holder for real semaphores.
286 : */
287 : struct sema {
288 : int value;
289 : pid_t holder;
290 : char *name;
291 : int prio;
292 : int timo;
293 : };
294 : STATIC void sema_init(struct sema *, char *, int, int);
295 : STATIC int sema_get(struct sema *, struct lockit *);
296 : STATIC void sema_release(struct sema *);
297 :
298 : STATIC void
299 0 : sema_init(struct sema *semap, char *name, int prio, int timo)
300 : {
301 :
302 0 : semap->holder = -1;
303 0 : semap->value = 0;
304 0 : semap->name = name;
305 0 : semap->prio = prio;
306 0 : semap->timo = timo;
307 0 : }
308 :
309 : STATIC int
310 0 : sema_get(struct sema *semap, struct lockit *interlock)
311 : {
312 : int s;
313 :
314 0 : if (semap->value++ > 0) {
315 0 : if (interlock != NULL)
316 0 : s = FREE_LOCK_INTERLOCKED(interlock);
317 0 : tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
318 0 : if (interlock != NULL) {
319 0 : ACQUIRE_LOCK_INTERLOCKED(interlock, s);
320 0 : FREE_LOCK(interlock);
321 0 : }
322 0 : return (0);
323 : }
324 0 : semap->holder = CURPROC->p_tid;
325 0 : if (interlock != NULL)
326 0 : FREE_LOCK(interlock);
327 0 : return (1);
328 0 : }
329 :
330 : STATIC void
331 0 : sema_release(struct sema *semap)
332 : {
333 :
334 0 : if (semap->value <= 0 || semap->holder != CURPROC->p_tid) {
335 : #ifdef DEBUG
336 : if (lk.lkt_held != -1)
337 : FREE_LOCK(&lk);
338 : #endif
339 0 : panic("sema_release: not held");
340 : }
341 0 : if (--semap->value > 0) {
342 0 : semap->value = 0;
343 0 : wakeup(semap);
344 0 : }
345 0 : semap->holder = -1;
346 0 : }
347 :
348 : /*
349 : * Memory management.
350 : */
351 : STATIC struct pool pagedep_pool;
352 : STATIC struct pool inodedep_pool;
353 : STATIC struct pool newblk_pool;
354 : STATIC struct pool bmsafemap_pool;
355 : STATIC struct pool allocdirect_pool;
356 : STATIC struct pool indirdep_pool;
357 : STATIC struct pool allocindir_pool;
358 : STATIC struct pool freefrag_pool;
359 : STATIC struct pool freeblks_pool;
360 : STATIC struct pool freefile_pool;
361 : STATIC struct pool diradd_pool;
362 : STATIC struct pool mkdir_pool;
363 : STATIC struct pool dirrem_pool;
364 : STATIC struct pool newdirblk_pool;
365 :
366 : static __inline void
367 0 : softdep_free(struct worklist *item, int type)
368 : {
369 :
370 0 : switch (type) {
371 : case D_PAGEDEP:
372 0 : pool_put(&pagedep_pool, item);
373 0 : break;
374 :
375 : case D_INODEDEP:
376 0 : pool_put(&inodedep_pool, item);
377 0 : break;
378 :
379 : case D_BMSAFEMAP:
380 0 : pool_put(&bmsafemap_pool, item);
381 0 : break;
382 :
383 : case D_ALLOCDIRECT:
384 0 : pool_put(&allocdirect_pool, item);
385 0 : break;
386 :
387 : case D_INDIRDEP:
388 0 : pool_put(&indirdep_pool, item);
389 0 : break;
390 :
391 : case D_ALLOCINDIR:
392 0 : pool_put(&allocindir_pool, item);
393 0 : break;
394 :
395 : case D_FREEFRAG:
396 0 : pool_put(&freefrag_pool, item);
397 0 : break;
398 :
399 : case D_FREEBLKS:
400 0 : pool_put(&freeblks_pool, item);
401 0 : break;
402 :
403 : case D_FREEFILE:
404 0 : pool_put(&freefile_pool, item);
405 0 : break;
406 :
407 : case D_DIRADD:
408 0 : pool_put(&diradd_pool, item);
409 0 : break;
410 :
411 : case D_MKDIR:
412 0 : pool_put(&mkdir_pool, item);
413 0 : break;
414 :
415 : case D_DIRREM:
416 0 : pool_put(&dirrem_pool, item);
417 0 : break;
418 :
419 : case D_NEWDIRBLK:
420 0 : pool_put(&newdirblk_pool, item);
421 0 : break;
422 :
423 : default:
424 : #ifdef DEBUG
425 : if (lk.lkt_held != -1)
426 : FREE_LOCK(&lk);
427 : #endif
428 0 : panic("softdep_free: unknown type %d", type);
429 : }
430 0 : }
431 :
432 : struct workhead softdep_freequeue;
433 :
434 : static __inline void
435 0 : softdep_freequeue_add(struct worklist *item)
436 : {
437 : int s;
438 :
439 0 : s = splbio();
440 0 : LIST_INSERT_HEAD(&softdep_freequeue, item, wk_list);
441 0 : splx(s);
442 0 : }
443 :
444 : static __inline void
445 0 : softdep_freequeue_process(void)
446 : {
447 : struct worklist *wk;
448 :
449 0 : splassert(IPL_BIO);
450 :
451 0 : while ((wk = LIST_FIRST(&softdep_freequeue)) != NULL) {
452 0 : LIST_REMOVE(wk, wk_list);
453 0 : FREE_LOCK(&lk);
454 0 : softdep_free(wk, wk->wk_type);
455 0 : ACQUIRE_LOCK(&lk);
456 : }
457 0 : }
458 :
459 : /*
460 : * Worklist queue management.
461 : * These routines require that the lock be held.
462 : */
463 : #ifndef /* NOT */ DEBUG
464 : #define WORKLIST_INSERT(head, item) do { \
465 : (item)->wk_state |= ONWORKLIST; \
466 : LIST_INSERT_HEAD(head, item, wk_list); \
467 : } while (0)
468 : #define WORKLIST_REMOVE(item) do { \
469 : (item)->wk_state &= ~ONWORKLIST; \
470 : LIST_REMOVE(item, wk_list); \
471 : } while (0)
472 : #define WORKITEM_FREE(item, type) softdep_freequeue_add((struct worklist *)item)
473 :
474 : #else /* DEBUG */
475 : STATIC void worklist_insert(struct workhead *, struct worklist *);
476 : STATIC void worklist_remove(struct worklist *);
477 : STATIC void workitem_free(struct worklist *);
478 :
479 : #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
480 : #define WORKLIST_REMOVE(item) worklist_remove(item)
481 : #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item)
482 :
483 : STATIC void
484 : worklist_insert(struct workhead *head, struct worklist *item)
485 : {
486 :
487 : if (lk.lkt_held == -1)
488 : panic("worklist_insert: lock not held");
489 : if (item->wk_state & ONWORKLIST) {
490 : FREE_LOCK(&lk);
491 : panic("worklist_insert: already on list");
492 : }
493 : item->wk_state |= ONWORKLIST;
494 : LIST_INSERT_HEAD(head, item, wk_list);
495 : }
496 :
497 : STATIC void
498 : worklist_remove(struct worklist *item)
499 : {
500 :
501 : if (lk.lkt_held == -1)
502 : panic("worklist_remove: lock not held");
503 : if ((item->wk_state & ONWORKLIST) == 0) {
504 : FREE_LOCK(&lk);
505 : panic("worklist_remove: not on list");
506 : }
507 : item->wk_state &= ~ONWORKLIST;
508 : LIST_REMOVE(item, wk_list);
509 : }
510 :
511 : STATIC void
512 : workitem_free(struct worklist *item)
513 : {
514 :
515 : if (item->wk_state & ONWORKLIST) {
516 : if (lk.lkt_held != -1)
517 : FREE_LOCK(&lk);
518 : panic("workitem_free: still on list");
519 : }
520 : softdep_freequeue_add(item);
521 : }
522 : #endif /* DEBUG */
523 :
524 : /*
525 : * Workitem queue management
526 : */
527 : STATIC struct workhead softdep_workitem_pending;
528 : STATIC struct worklist *worklist_tail;
529 : STATIC int num_on_worklist; /* number of worklist items to be processed */
530 : STATIC int softdep_worklist_busy; /* 1 => trying to do unmount */
531 : STATIC int softdep_worklist_req; /* serialized waiters */
532 : STATIC int max_softdeps; /* maximum number of structs before slowdown */
533 : STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */
534 : STATIC int proc_waiting; /* tracks whether we have a timeout posted */
535 : STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */
536 : STATIC struct timeout proc_waiting_timeout;
537 : STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */
538 : STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */
539 : #define FLUSH_INODES 1
540 : STATIC int req_clear_remove; /* syncer process flush some freeblks */
541 : #define FLUSH_REMOVE 2
542 : /*
543 : * runtime statistics
544 : */
545 : STATIC int stat_worklist_push; /* number of worklist cleanups */
546 : STATIC int stat_blk_limit_push; /* number of times block limit neared */
547 : STATIC int stat_ino_limit_push; /* number of times inode limit neared */
548 : STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */
549 : STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */
550 : STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
551 : STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
552 : STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
553 : STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
554 : STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
555 :
556 : /*
557 : * Add an item to the end of the work queue.
558 : * This routine requires that the lock be held.
559 : * This is the only routine that adds items to the list.
560 : * The following routine is the only one that removes items
561 : * and does so in order from first to last.
562 : */
563 : STATIC void
564 0 : add_to_worklist(struct worklist *wk)
565 : {
566 :
567 0 : if (wk->wk_state & ONWORKLIST) {
568 : #ifdef DEBUG
569 : if (lk.lkt_held != -1)
570 : FREE_LOCK(&lk);
571 : #endif
572 0 : panic("add_to_worklist: already on list");
573 : }
574 0 : wk->wk_state |= ONWORKLIST;
575 0 : if (LIST_FIRST(&softdep_workitem_pending) == NULL)
576 0 : LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
577 : else
578 0 : LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
579 0 : worklist_tail = wk;
580 0 : num_on_worklist += 1;
581 0 : }
582 :
583 : /*
584 : * Process that runs once per second to handle items in the background queue.
585 : *
586 : * Note that we ensure that everything is done in the order in which they
587 : * appear in the queue. The code below depends on this property to ensure
588 : * that blocks of a file are freed before the inode itself is freed. This
589 : * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
590 : * until all the old ones have been purged from the dependency lists.
591 : */
592 : int
593 0 : softdep_process_worklist(struct mount *matchmnt)
594 : {
595 0 : struct proc *p = CURPROC;
596 : int matchcnt, loopcount;
597 0 : struct timeval starttime;
598 :
599 : /*
600 : * First process any items on the delayed-free queue.
601 : */
602 0 : ACQUIRE_LOCK(&lk);
603 0 : softdep_freequeue_process();
604 0 : FREE_LOCK(&lk);
605 :
606 : /*
607 : * Record the process identifier of our caller so that we can give
608 : * this process preferential treatment in request_cleanup below.
609 : * We can't do this in softdep_initialize, because the syncer doesn't
610 : * have to run then.
611 : * NOTE! This function _could_ be called with a curproc != syncerproc.
612 : */
613 0 : filesys_syncer = syncerproc;
614 : matchcnt = 0;
615 :
616 : /*
617 : * There is no danger of having multiple processes run this
618 : * code, but we have to single-thread it when softdep_flushfiles()
619 : * is in operation to get an accurate count of the number of items
620 : * related to its mount point that are in the list.
621 : */
622 0 : if (matchmnt == NULL) {
623 0 : if (softdep_worklist_busy < 0)
624 0 : return(-1);
625 0 : softdep_worklist_busy += 1;
626 0 : }
627 :
628 : /*
629 : * If requested, try removing inode or removal dependencies.
630 : */
631 0 : if (req_clear_inodedeps) {
632 0 : clear_inodedeps(p);
633 0 : req_clear_inodedeps -= 1;
634 0 : wakeup_one(&proc_waiting);
635 0 : }
636 0 : if (req_clear_remove) {
637 0 : clear_remove(p);
638 0 : req_clear_remove -= 1;
639 0 : wakeup_one(&proc_waiting);
640 0 : }
641 : loopcount = 1;
642 0 : getmicrouptime(&starttime);
643 0 : while (num_on_worklist > 0) {
644 0 : matchcnt += process_worklist_item(matchmnt, 0);
645 :
646 : /*
647 : * If a umount operation wants to run the worklist
648 : * accurately, abort.
649 : */
650 0 : if (softdep_worklist_req && matchmnt == NULL) {
651 : matchcnt = -1;
652 0 : break;
653 : }
654 :
655 : /*
656 : * If requested, try removing inode or removal dependencies.
657 : */
658 0 : if (req_clear_inodedeps) {
659 0 : clear_inodedeps(p);
660 0 : req_clear_inodedeps -= 1;
661 0 : wakeup_one(&proc_waiting);
662 0 : }
663 0 : if (req_clear_remove) {
664 0 : clear_remove(p);
665 0 : req_clear_remove -= 1;
666 0 : wakeup_one(&proc_waiting);
667 0 : }
668 : /*
669 : * We do not generally want to stop for buffer space, but if
670 : * we are really being a buffer hog, we will stop and wait.
671 : */
672 : #if 0
673 : if (loopcount++ % 128 == 0)
674 : bwillwrite();
675 : #endif
676 : /*
677 : * Never allow processing to run for more than one
678 : * second. Otherwise the other syncer tasks may get
679 : * excessively backlogged.
680 : */
681 : {
682 : struct timeval diff;
683 0 : struct timeval tv;
684 :
685 0 : getmicrouptime(&tv);
686 0 : timersub(&tv, &starttime, &diff);
687 0 : if (diff.tv_sec != 0 && matchmnt == NULL) {
688 : matchcnt = -1;
689 0 : break;
690 : }
691 0 : }
692 :
693 : /*
694 : * Process any new items on the delayed-free queue.
695 : */
696 0 : ACQUIRE_LOCK(&lk);
697 0 : softdep_freequeue_process();
698 0 : FREE_LOCK(&lk);
699 : }
700 0 : if (matchmnt == NULL) {
701 0 : softdep_worklist_busy -= 1;
702 0 : if (softdep_worklist_req && softdep_worklist_busy == 0)
703 0 : wakeup(&softdep_worklist_req);
704 : }
705 0 : return (matchcnt);
706 0 : }
707 :
708 : /*
709 : * Process one item on the worklist.
710 : */
711 : STATIC int
712 0 : process_worklist_item(struct mount *matchmnt, int flags)
713 : {
714 : struct worklist *wk, *wkend;
715 : struct dirrem *dirrem;
716 : struct mount *mp;
717 : struct vnode *vp;
718 : int matchcnt = 0;
719 :
720 0 : ACQUIRE_LOCK(&lk);
721 : /*
722 : * Normally we just process each item on the worklist in order.
723 : * However, if we are in a situation where we cannot lock any
724 : * inodes, we have to skip over any dirrem requests whose
725 : * vnodes are resident and locked.
726 : */
727 0 : LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
728 0 : if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
729 : break;
730 0 : dirrem = WK_DIRREM(wk);
731 0 : vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
732 0 : dirrem->dm_oldinum);
733 0 : if (vp == NULL || !VOP_ISLOCKED(vp))
734 : break;
735 : }
736 0 : if (wk == NULL) {
737 0 : FREE_LOCK(&lk);
738 0 : return (0);
739 : }
740 : /*
741 : * Remove the item to be processed. If we are removing the last
742 : * item on the list, we need to recalculate the tail pointer.
743 : * As this happens rarely and usually when the list is short,
744 : * we just run down the list to find it rather than tracking it
745 : * in the above loop.
746 : */
747 0 : WORKLIST_REMOVE(wk);
748 0 : if (wk == worklist_tail) {
749 0 : LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
750 0 : if (LIST_NEXT(wkend, wk_list) == NULL)
751 : break;
752 0 : worklist_tail = wkend;
753 0 : }
754 0 : num_on_worklist -= 1;
755 0 : FREE_LOCK(&lk);
756 0 : switch (wk->wk_type) {
757 :
758 : case D_DIRREM:
759 : /* removal of a directory entry */
760 0 : mp = WK_DIRREM(wk)->dm_mnt;
761 : #if 0
762 : if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
763 : panic("%s: dirrem on suspended filesystem",
764 : "process_worklist_item");
765 : #endif
766 0 : if (mp == matchmnt)
767 0 : matchcnt += 1;
768 0 : handle_workitem_remove(WK_DIRREM(wk));
769 0 : break;
770 :
771 : case D_FREEBLKS:
772 : /* releasing blocks and/or fragments from a file */
773 0 : mp = WK_FREEBLKS(wk)->fb_mnt;
774 : #if 0
775 : if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
776 : panic("%s: freeblks on suspended filesystem",
777 : "process_worklist_item");
778 : #endif
779 0 : if (mp == matchmnt)
780 0 : matchcnt += 1;
781 0 : handle_workitem_freeblocks(WK_FREEBLKS(wk));
782 0 : break;
783 :
784 : case D_FREEFRAG:
785 : /* releasing a fragment when replaced as a file grows */
786 0 : mp = WK_FREEFRAG(wk)->ff_mnt;
787 : #if 0
788 : if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
789 : panic("%s: freefrag on suspended filesystem",
790 : "process_worklist_item");
791 : #endif
792 0 : if (mp == matchmnt)
793 0 : matchcnt += 1;
794 0 : handle_workitem_freefrag(WK_FREEFRAG(wk));
795 0 : break;
796 :
797 : case D_FREEFILE:
798 : /* releasing an inode when its link count drops to 0 */
799 0 : mp = WK_FREEFILE(wk)->fx_mnt;
800 : #if 0
801 : if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
802 : panic("%s: freefile on suspended filesystem",
803 : "process_worklist_item");
804 : #endif
805 0 : if (mp == matchmnt)
806 0 : matchcnt += 1;
807 0 : handle_workitem_freefile(WK_FREEFILE(wk));
808 0 : break;
809 :
810 : default:
811 0 : panic("%s_process_worklist: Unknown type %s",
812 0 : "softdep", TYPENAME(wk->wk_type));
813 : /* NOTREACHED */
814 : }
815 0 : return (matchcnt);
816 0 : }
817 :
818 : /*
819 : * Move dependencies from one buffer to another.
820 : */
821 : void
822 0 : softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
823 : {
824 : struct worklist *wk, *wktail;
825 :
826 0 : if (LIST_FIRST(&newbp->b_dep) != NULL)
827 0 : panic("softdep_move_dependencies: need merge code");
828 : wktail = NULL;
829 0 : ACQUIRE_LOCK(&lk);
830 0 : while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
831 0 : LIST_REMOVE(wk, wk_list);
832 0 : if (wktail == NULL)
833 0 : LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
834 : else
835 0 : LIST_INSERT_AFTER(wktail, wk, wk_list);
836 : wktail = wk;
837 : }
838 0 : FREE_LOCK(&lk);
839 0 : }
840 :
841 : /*
842 : * Purge the work list of all items associated with a particular mount point.
843 : */
844 : int
845 0 : softdep_flushworklist(struct mount *oldmnt, int *countp, struct proc *p)
846 : {
847 : struct vnode *devvp;
848 : int count, error = 0;
849 :
850 : /*
851 : * Await our turn to clear out the queue, then serialize access.
852 : */
853 0 : while (softdep_worklist_busy) {
854 0 : softdep_worklist_req += 1;
855 0 : tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
856 0 : softdep_worklist_req -= 1;
857 : }
858 0 : softdep_worklist_busy = -1;
859 : /*
860 : * Alternately flush the block device associated with the mount
861 : * point and process any dependencies that the flushing
862 : * creates. We continue until no more worklist dependencies
863 : * are found.
864 : */
865 0 : *countp = 0;
866 0 : devvp = VFSTOUFS(oldmnt)->um_devvp;
867 0 : while ((count = softdep_process_worklist(oldmnt)) > 0) {
868 0 : *countp += count;
869 0 : vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
870 0 : error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
871 0 : VOP_UNLOCK(devvp);
872 0 : if (error)
873 : break;
874 : }
875 0 : softdep_worklist_busy = 0;
876 0 : if (softdep_worklist_req)
877 0 : wakeup(&softdep_worklist_req);
878 0 : return (error);
879 : }
880 :
881 : /*
882 : * Flush all vnodes and worklist items associated with a specified mount point.
883 : */
884 : int
885 0 : softdep_flushfiles(struct mount *oldmnt, int flags, struct proc *p)
886 : {
887 0 : int error, count, loopcnt;
888 :
889 : /*
890 : * Alternately flush the vnodes associated with the mount
891 : * point and process any dependencies that the flushing
892 : * creates. In theory, this loop can happen at most twice,
893 : * but we give it a few extra just to be sure.
894 : */
895 0 : for (loopcnt = 10; loopcnt > 0; loopcnt--) {
896 : /*
897 : * Do another flush in case any vnodes were brought in
898 : * as part of the cleanup operations.
899 : */
900 0 : if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
901 : break;
902 0 : if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
903 0 : count == 0)
904 : break;
905 : }
906 : /*
907 : * If the reboot process sleeps during the loop, the update
908 : * process may call softdep_process_worklist() and create
909 : * new dirty vnodes at the mount point. Call ffs_flushfiles()
910 : * again after the loop has flushed all soft dependencies.
911 : */
912 0 : if (error == 0)
913 0 : error = ffs_flushfiles(oldmnt, flags, p);
914 : /*
915 : * If we are unmounting then it is an error to fail. If we
916 : * are simply trying to downgrade to read-only, then filesystem
917 : * activity can keep us busy forever, so we just fail with EBUSY.
918 : */
919 0 : if (loopcnt == 0) {
920 : error = EBUSY;
921 0 : }
922 0 : return (error);
923 0 : }
924 :
925 : /*
926 : * Structure hashing.
927 : *
928 : * There are three types of structures that can be looked up:
929 : * 1) pagedep structures identified by mount point, inode number,
930 : * and logical block.
931 : * 2) inodedep structures identified by mount point and inode number.
932 : * 3) newblk structures identified by mount point and
933 : * physical block number.
934 : *
935 : * The "pagedep" and "inodedep" dependency structures are hashed
936 : * separately from the file blocks and inodes to which they correspond.
937 : * This separation helps when the in-memory copy of an inode or
938 : * file block must be replaced. It also obviates the need to access
939 : * an inode or file page when simply updating (or de-allocating)
940 : * dependency structures. Lookup of newblk structures is needed to
941 : * find newly allocated blocks when trying to associate them with
942 : * their allocdirect or allocindir structure.
943 : *
944 : * The lookup routines optionally create and hash a new instance when
945 : * an existing entry is not found.
946 : */
947 : #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
948 : #define NODELAY 0x0002 /* cannot do background work */
949 :
950 : SIPHASH_KEY softdep_hashkey;
951 :
952 : /*
953 : * Structures and routines associated with pagedep caching.
954 : */
955 : LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
956 : u_long pagedep_hash; /* size of hash table - 1 */
957 : STATIC struct sema pagedep_in_progress;
958 :
959 : /*
960 : * Look up a pagedep. Return 1 if found, 0 if not found or found
961 : * when asked to allocate but not associated with any buffer.
962 : * If not found, allocate if DEPALLOC flag is passed.
963 : * Found or allocated entry is returned in pagedeppp.
964 : * This routine must be called with splbio interrupts blocked.
965 : */
966 : STATIC int
967 0 : pagedep_lookup(struct inode *ip, daddr_t lbn, int flags,
968 : struct pagedep **pagedeppp)
969 : {
970 0 : SIPHASH_CTX ctx;
971 : struct pagedep *pagedep;
972 : struct pagedep_hashhead *pagedephd;
973 0 : struct mount *mp;
974 : int i;
975 :
976 0 : splassert(IPL_BIO);
977 :
978 : #ifdef DEBUG
979 : if (lk.lkt_held == -1)
980 : panic("pagedep_lookup: lock not held");
981 : #endif
982 0 : mp = ITOV(ip)->v_mount;
983 :
984 0 : SipHash24_Init(&ctx, &softdep_hashkey);
985 0 : SipHash24_Update(&ctx, &mp, sizeof(mp));
986 0 : SipHash24_Update(&ctx, &ip->i_number, sizeof(ip->i_number));
987 0 : SipHash24_Update(&ctx, &lbn, sizeof(lbn));
988 0 : pagedephd = &pagedep_hashtbl[SipHash24_End(&ctx) & pagedep_hash];
989 : top:
990 0 : LIST_FOREACH(pagedep, pagedephd, pd_hash)
991 0 : if (ip->i_number == pagedep->pd_ino &&
992 0 : lbn == pagedep->pd_lbn &&
993 0 : mp == pagedep->pd_mnt)
994 : break;
995 0 : if (pagedep) {
996 0 : *pagedeppp = pagedep;
997 0 : if ((flags & DEPALLOC) != 0 &&
998 0 : (pagedep->pd_state & ONWORKLIST) == 0)
999 0 : return (0);
1000 0 : return (1);
1001 : }
1002 0 : if ((flags & DEPALLOC) == 0) {
1003 0 : *pagedeppp = NULL;
1004 0 : return (0);
1005 : }
1006 0 : if (sema_get(&pagedep_in_progress, &lk) == 0) {
1007 0 : ACQUIRE_LOCK(&lk);
1008 0 : goto top;
1009 : }
1010 0 : pagedep = pool_get(&pagedep_pool, PR_WAITOK | PR_ZERO);
1011 0 : pagedep->pd_list.wk_type = D_PAGEDEP;
1012 0 : pagedep->pd_mnt = mp;
1013 0 : pagedep->pd_ino = ip->i_number;
1014 0 : pagedep->pd_lbn = lbn;
1015 0 : LIST_INIT(&pagedep->pd_dirremhd);
1016 0 : LIST_INIT(&pagedep->pd_pendinghd);
1017 0 : for (i = 0; i < DAHASHSZ; i++)
1018 0 : LIST_INIT(&pagedep->pd_diraddhd[i]);
1019 0 : ACQUIRE_LOCK(&lk);
1020 0 : LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1021 0 : sema_release(&pagedep_in_progress);
1022 0 : *pagedeppp = pagedep;
1023 0 : return (0);
1024 0 : }
1025 :
1026 : /*
1027 : * Structures and routines associated with inodedep caching.
1028 : */
1029 : LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1030 : STATIC u_long inodedep_hash; /* size of hash table - 1 */
1031 : STATIC long num_inodedep; /* number of inodedep allocated */
1032 : STATIC struct sema inodedep_in_progress;
1033 :
1034 : /*
1035 : * Look up a inodedep. Return 1 if found, 0 if not found.
1036 : * If not found, allocate if DEPALLOC flag is passed.
1037 : * Found or allocated entry is returned in inodedeppp.
1038 : * This routine must be called with splbio interrupts blocked.
1039 : */
1040 : STATIC int
1041 0 : inodedep_lookup(struct fs *fs, ufsino_t inum, int flags,
1042 : struct inodedep **inodedeppp)
1043 : {
1044 0 : SIPHASH_CTX ctx;
1045 : struct inodedep *inodedep;
1046 : struct inodedep_hashhead *inodedephd;
1047 : int firsttry;
1048 :
1049 0 : splassert(IPL_BIO);
1050 :
1051 : #ifdef DEBUG
1052 : if (lk.lkt_held == -1)
1053 : panic("inodedep_lookup: lock not held");
1054 : #endif
1055 : firsttry = 1;
1056 0 : SipHash24_Init(&ctx, &softdep_hashkey);
1057 0 : SipHash24_Update(&ctx, &fs, sizeof(fs));
1058 0 : SipHash24_Update(&ctx, &inum, sizeof(inum));
1059 0 : inodedephd = &inodedep_hashtbl[SipHash24_End(&ctx) & inodedep_hash];
1060 : top:
1061 0 : LIST_FOREACH(inodedep, inodedephd, id_hash)
1062 0 : if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1063 : break;
1064 0 : if (inodedep) {
1065 0 : *inodedeppp = inodedep;
1066 0 : return (1);
1067 : }
1068 0 : if ((flags & DEPALLOC) == 0) {
1069 0 : *inodedeppp = NULL;
1070 0 : return (0);
1071 : }
1072 : /*
1073 : * If we are over our limit, try to improve the situation.
1074 : */
1075 0 : if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1076 0 : request_cleanup(FLUSH_INODES, 1)) {
1077 : firsttry = 0;
1078 0 : goto top;
1079 : }
1080 0 : if (sema_get(&inodedep_in_progress, &lk) == 0) {
1081 0 : ACQUIRE_LOCK(&lk);
1082 0 : goto top;
1083 : }
1084 0 : num_inodedep += 1;
1085 0 : inodedep = pool_get(&inodedep_pool, PR_WAITOK);
1086 0 : inodedep->id_list.wk_type = D_INODEDEP;
1087 0 : inodedep->id_fs = fs;
1088 0 : inodedep->id_ino = inum;
1089 0 : inodedep->id_state = ALLCOMPLETE;
1090 0 : inodedep->id_nlinkdelta = 0;
1091 0 : inodedep->id_savedino1 = NULL;
1092 0 : inodedep->id_savedsize = -1;
1093 0 : inodedep->id_buf = NULL;
1094 0 : LIST_INIT(&inodedep->id_pendinghd);
1095 0 : LIST_INIT(&inodedep->id_inowait);
1096 0 : LIST_INIT(&inodedep->id_bufwait);
1097 0 : TAILQ_INIT(&inodedep->id_inoupdt);
1098 0 : TAILQ_INIT(&inodedep->id_newinoupdt);
1099 0 : ACQUIRE_LOCK(&lk);
1100 0 : LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1101 0 : sema_release(&inodedep_in_progress);
1102 0 : *inodedeppp = inodedep;
1103 0 : return (0);
1104 0 : }
1105 :
1106 : /*
1107 : * Structures and routines associated with newblk caching.
1108 : */
1109 : LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1110 : u_long newblk_hash; /* size of hash table - 1 */
1111 : STATIC struct sema newblk_in_progress;
1112 :
1113 : /*
1114 : * Look up a newblk. Return 1 if found, 0 if not found.
1115 : * If not found, allocate if DEPALLOC flag is passed.
1116 : * Found or allocated entry is returned in newblkpp.
1117 : */
1118 : STATIC int
1119 0 : newblk_lookup(struct fs *fs, daddr_t newblkno, int flags,
1120 : struct newblk **newblkpp)
1121 : {
1122 0 : SIPHASH_CTX ctx;
1123 : struct newblk *newblk;
1124 : struct newblk_hashhead *newblkhd;
1125 :
1126 0 : SipHash24_Init(&ctx, &softdep_hashkey);
1127 0 : SipHash24_Update(&ctx, &fs, sizeof(fs));
1128 0 : SipHash24_Update(&ctx, &newblkno, sizeof(newblkno));
1129 0 : newblkhd = &newblk_hashtbl[SipHash24_End(&ctx) & newblk_hash];
1130 : top:
1131 0 : LIST_FOREACH(newblk, newblkhd, nb_hash)
1132 0 : if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1133 : break;
1134 0 : if (newblk) {
1135 0 : *newblkpp = newblk;
1136 0 : return (1);
1137 : }
1138 0 : if ((flags & DEPALLOC) == 0) {
1139 0 : *newblkpp = NULL;
1140 0 : return (0);
1141 : }
1142 0 : if (sema_get(&newblk_in_progress, NULL) == 0)
1143 0 : goto top;
1144 0 : newblk = pool_get(&newblk_pool, PR_WAITOK);
1145 0 : newblk->nb_state = 0;
1146 0 : newblk->nb_fs = fs;
1147 0 : newblk->nb_newblkno = newblkno;
1148 0 : LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1149 0 : sema_release(&newblk_in_progress);
1150 0 : *newblkpp = newblk;
1151 0 : return (0);
1152 0 : }
1153 :
1154 : /*
1155 : * Executed during filesystem system initialization before
1156 : * mounting any file systems.
1157 : */
1158 : void
1159 0 : softdep_initialize(void)
1160 : {
1161 :
1162 0 : bioops.io_start = softdep_disk_io_initiation;
1163 0 : bioops.io_complete = softdep_disk_write_complete;
1164 0 : bioops.io_deallocate = softdep_deallocate_dependencies;
1165 0 : bioops.io_movedeps = softdep_move_dependencies;
1166 0 : bioops.io_countdeps = softdep_count_dependencies;
1167 :
1168 0 : LIST_INIT(&mkdirlisthd);
1169 0 : LIST_INIT(&softdep_workitem_pending);
1170 : #ifdef KMEMSTATS
1171 0 : max_softdeps = min (initialvnodes * 8,
1172 0 : kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep)));
1173 : #else
1174 : max_softdeps = initialvnodes * 4;
1175 : #endif
1176 0 : arc4random_buf(&softdep_hashkey, sizeof(softdep_hashkey));
1177 0 : pagedep_hashtbl = hashinit(initialvnodes / 5, M_PAGEDEP, M_WAITOK,
1178 : &pagedep_hash);
1179 0 : sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1180 0 : inodedep_hashtbl = hashinit(initialvnodes, M_INODEDEP, M_WAITOK,
1181 : &inodedep_hash);
1182 0 : sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1183 0 : newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash);
1184 0 : sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1185 0 : timeout_set(&proc_waiting_timeout, pause_timer, NULL);
1186 0 : pool_init(&pagedep_pool, sizeof(struct pagedep), 0, IPL_NONE,
1187 : PR_WAITOK, "pagedep", NULL);
1188 0 : pool_init(&inodedep_pool, sizeof(struct inodedep), 0, IPL_NONE,
1189 : PR_WAITOK, "inodedep", NULL);
1190 0 : pool_init(&newblk_pool, sizeof(struct newblk), 0, IPL_NONE,
1191 : PR_WAITOK, "newblk", NULL);
1192 0 : pool_init(&bmsafemap_pool, sizeof(struct bmsafemap), 0, IPL_NONE,
1193 : PR_WAITOK, "bmsafemap", NULL);
1194 0 : pool_init(&allocdirect_pool, sizeof(struct allocdirect), 0, IPL_NONE,
1195 : PR_WAITOK, "allocdir", NULL);
1196 0 : pool_init(&indirdep_pool, sizeof(struct indirdep), 0, IPL_NONE,
1197 : PR_WAITOK, "indirdep", NULL);
1198 0 : pool_init(&allocindir_pool, sizeof(struct allocindir), 0, IPL_NONE,
1199 : PR_WAITOK, "allocindir", NULL);
1200 0 : pool_init(&freefrag_pool, sizeof(struct freefrag), 0, IPL_NONE,
1201 : PR_WAITOK, "freefrag", NULL);
1202 0 : pool_init(&freeblks_pool, sizeof(struct freeblks), 0, IPL_NONE,
1203 : PR_WAITOK, "freeblks", NULL);
1204 0 : pool_init(&freefile_pool, sizeof(struct freefile), 0, IPL_NONE,
1205 : PR_WAITOK, "freefile", NULL);
1206 0 : pool_init(&diradd_pool, sizeof(struct diradd), 0, IPL_NONE,
1207 : PR_WAITOK, "diradd", NULL);
1208 0 : pool_init(&mkdir_pool, sizeof(struct mkdir), 0, IPL_NONE,
1209 : PR_WAITOK, "mkdir", NULL);
1210 0 : pool_init(&dirrem_pool, sizeof(struct dirrem), 0, IPL_NONE,
1211 : PR_WAITOK, "dirrem", NULL);
1212 0 : pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, IPL_NONE,
1213 : PR_WAITOK, "newdirblk", NULL);
1214 0 : }
1215 :
1216 : /*
1217 : * Called at mount time to notify the dependency code that a
1218 : * filesystem wishes to use it.
1219 : */
1220 : int
1221 0 : softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
1222 : struct ucred *cred)
1223 : {
1224 0 : struct csum_total cstotal;
1225 : struct cg *cgp;
1226 0 : struct buf *bp;
1227 : int error, cyl;
1228 :
1229 : /*
1230 : * When doing soft updates, the counters in the
1231 : * superblock may have gotten out of sync, so we have
1232 : * to scan the cylinder groups and recalculate them.
1233 : */
1234 0 : if ((fs->fs_flags & FS_UNCLEAN) == 0)
1235 0 : return (0);
1236 0 : memset(&cstotal, 0, sizeof(cstotal));
1237 0 : for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1238 0 : if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1239 0 : fs->fs_cgsize, &bp)) != 0) {
1240 0 : brelse(bp);
1241 0 : return (error);
1242 : }
1243 0 : cgp = (struct cg *)bp->b_data;
1244 0 : cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1245 0 : cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1246 0 : cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1247 0 : cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1248 0 : fs->fs_cs(fs, cyl) = cgp->cg_cs;
1249 0 : brelse(bp);
1250 : }
1251 : #ifdef DEBUG
1252 : if (memcmp(&cstotal, &fs->fs_cstotal, sizeof(cstotal)))
1253 : printf("ffs_mountfs: superblock updated for soft updates\n");
1254 : #endif
1255 0 : memcpy(&fs->fs_cstotal, &cstotal, sizeof(cstotal));
1256 0 : return (0);
1257 0 : }
1258 :
1259 : /*
1260 : * Protecting the freemaps (or bitmaps).
1261 : *
1262 : * To eliminate the need to execute fsck before mounting a file system
1263 : * after a power failure, one must (conservatively) guarantee that the
1264 : * on-disk copy of the bitmaps never indicate that a live inode or block is
1265 : * free. So, when a block or inode is allocated, the bitmap should be
1266 : * updated (on disk) before any new pointers. When a block or inode is
1267 : * freed, the bitmap should not be updated until all pointers have been
1268 : * reset. The latter dependency is handled by the delayed de-allocation
1269 : * approach described below for block and inode de-allocation. The former
1270 : * dependency is handled by calling the following procedure when a block or
1271 : * inode is allocated. When an inode is allocated an "inodedep" is created
1272 : * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1273 : * Each "inodedep" is also inserted into the hash indexing structure so
1274 : * that any additional link additions can be made dependent on the inode
1275 : * allocation.
1276 : *
1277 : * The ufs file system maintains a number of free block counts (e.g., per
1278 : * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1279 : * in addition to the bitmaps. These counts are used to improve efficiency
1280 : * during allocation and therefore must be consistent with the bitmaps.
1281 : * There is no convenient way to guarantee post-crash consistency of these
1282 : * counts with simple update ordering, for two main reasons: (1) The counts
1283 : * and bitmaps for a single cylinder group block are not in the same disk
1284 : * sector. If a disk write is interrupted (e.g., by power failure), one may
1285 : * be written and the other not. (2) Some of the counts are located in the
1286 : * superblock rather than the cylinder group block. So, we focus our soft
1287 : * updates implementation on protecting the bitmaps. When mounting a
1288 : * filesystem, we recompute the auxiliary counts from the bitmaps.
1289 : */
1290 :
1291 : /*
1292 : * Called just after updating the cylinder group block to allocate an inode.
1293 : */
1294 : /* buffer for cylgroup block with inode map */
1295 : /* inode related to allocation */
1296 : /* new inode number being allocated */
1297 : void
1298 0 : softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ufsino_t newinum)
1299 : {
1300 0 : struct inodedep *inodedep;
1301 : struct bmsafemap *bmsafemap;
1302 :
1303 : /*
1304 : * Create a dependency for the newly allocated inode.
1305 : * Panic if it already exists as something is seriously wrong.
1306 : * Otherwise add it to the dependency list for the buffer holding
1307 : * the cylinder group map from which it was allocated.
1308 : */
1309 0 : ACQUIRE_LOCK(&lk);
1310 0 : if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep)
1311 0 : != 0) {
1312 0 : FREE_LOCK(&lk);
1313 0 : panic("softdep_setup_inomapdep: found inode");
1314 : }
1315 0 : inodedep->id_buf = bp;
1316 0 : inodedep->id_state &= ~DEPCOMPLETE;
1317 0 : bmsafemap = bmsafemap_lookup(bp);
1318 0 : LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1319 0 : FREE_LOCK(&lk);
1320 0 : }
1321 :
1322 : /*
1323 : * Called just after updating the cylinder group block to
1324 : * allocate block or fragment.
1325 : */
1326 : /* buffer for cylgroup block with block map */
1327 : /* filesystem doing allocation */
1328 : /* number of newly allocated block */
1329 : void
1330 0 : softdep_setup_blkmapdep(struct buf *bp, struct fs *fs, daddr_t newblkno)
1331 : {
1332 0 : struct newblk *newblk;
1333 : struct bmsafemap *bmsafemap;
1334 :
1335 : /*
1336 : * Create a dependency for the newly allocated block.
1337 : * Add it to the dependency list for the buffer holding
1338 : * the cylinder group map from which it was allocated.
1339 : */
1340 0 : if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1341 0 : panic("softdep_setup_blkmapdep: found block");
1342 0 : ACQUIRE_LOCK(&lk);
1343 0 : newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1344 0 : LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1345 0 : FREE_LOCK(&lk);
1346 0 : }
1347 :
1348 : /*
1349 : * Find the bmsafemap associated with a cylinder group buffer.
1350 : * If none exists, create one. The buffer must be locked when
1351 : * this routine is called and this routine must be called with
1352 : * splbio interrupts blocked.
1353 : */
1354 : STATIC struct bmsafemap *
1355 0 : bmsafemap_lookup(struct buf *bp)
1356 : {
1357 : struct bmsafemap *bmsafemap;
1358 : struct worklist *wk;
1359 :
1360 0 : splassert(IPL_BIO);
1361 :
1362 : #ifdef DEBUG
1363 : if (lk.lkt_held == -1)
1364 : panic("bmsafemap_lookup: lock not held");
1365 : #endif
1366 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list)
1367 0 : if (wk->wk_type == D_BMSAFEMAP)
1368 0 : return (WK_BMSAFEMAP(wk));
1369 0 : FREE_LOCK(&lk);
1370 0 : bmsafemap = pool_get(&bmsafemap_pool, PR_WAITOK);
1371 0 : bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1372 0 : bmsafemap->sm_list.wk_state = 0;
1373 0 : bmsafemap->sm_buf = bp;
1374 0 : LIST_INIT(&bmsafemap->sm_allocdirecthd);
1375 0 : LIST_INIT(&bmsafemap->sm_allocindirhd);
1376 0 : LIST_INIT(&bmsafemap->sm_inodedephd);
1377 0 : LIST_INIT(&bmsafemap->sm_newblkhd);
1378 0 : ACQUIRE_LOCK(&lk);
1379 0 : WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1380 0 : return (bmsafemap);
1381 0 : }
1382 :
1383 : /*
1384 : * Direct block allocation dependencies.
1385 : *
1386 : * When a new block is allocated, the corresponding disk locations must be
1387 : * initialized (with zeros or new data) before the on-disk inode points to
1388 : * them. Also, the freemap from which the block was allocated must be
1389 : * updated (on disk) before the inode's pointer. These two dependencies are
1390 : * independent of each other and are needed for all file blocks and indirect
1391 : * blocks that are pointed to directly by the inode. Just before the
1392 : * "in-core" version of the inode is updated with a newly allocated block
1393 : * number, a procedure (below) is called to setup allocation dependency
1394 : * structures. These structures are removed when the corresponding
1395 : * dependencies are satisfied or when the block allocation becomes obsolete
1396 : * (i.e., the file is deleted, the block is de-allocated, or the block is a
1397 : * fragment that gets upgraded). All of these cases are handled in
1398 : * procedures described later.
1399 : *
1400 : * When a file extension causes a fragment to be upgraded, either to a larger
1401 : * fragment or to a full block, the on-disk location may change (if the
1402 : * previous fragment could not simply be extended). In this case, the old
1403 : * fragment must be de-allocated, but not until after the inode's pointer has
1404 : * been updated. In most cases, this is handled by later procedures, which
1405 : * will construct a "freefrag" structure to be added to the workitem queue
1406 : * when the inode update is complete (or obsolete). The main exception to
1407 : * this is when an allocation occurs while a pending allocation dependency
1408 : * (for the same block pointer) remains. This case is handled in the main
1409 : * allocation dependency setup procedure by immediately freeing the
1410 : * unreferenced fragments.
1411 : */
1412 : /* inode to which block is being added */
1413 : /* block pointer within inode */
1414 : /* disk block number being added */
1415 : /* previous block number, 0 unless frag */
1416 : /* size of new block */
1417 : /* size of new block */
1418 : /* bp for allocated block */
1419 : void
1420 0 : softdep_setup_allocdirect(struct inode *ip, daddr_t lbn, daddr_t newblkno,
1421 : daddr_t oldblkno, long newsize, long oldsize, struct buf *bp)
1422 : {
1423 : struct allocdirect *adp, *oldadp;
1424 : struct allocdirectlst *adphead;
1425 : struct bmsafemap *bmsafemap;
1426 0 : struct inodedep *inodedep;
1427 0 : struct pagedep *pagedep;
1428 0 : struct newblk *newblk;
1429 :
1430 0 : adp = pool_get(&allocdirect_pool, PR_WAITOK | PR_ZERO);
1431 0 : adp->ad_list.wk_type = D_ALLOCDIRECT;
1432 0 : adp->ad_lbn = lbn;
1433 0 : adp->ad_newblkno = newblkno;
1434 0 : adp->ad_oldblkno = oldblkno;
1435 0 : adp->ad_newsize = newsize;
1436 0 : adp->ad_oldsize = oldsize;
1437 0 : adp->ad_state = ATTACHED;
1438 0 : LIST_INIT(&adp->ad_newdirblk);
1439 0 : if (newblkno == oldblkno)
1440 0 : adp->ad_freefrag = NULL;
1441 : else
1442 0 : adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1443 :
1444 0 : if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1445 0 : panic("softdep_setup_allocdirect: lost block");
1446 :
1447 0 : ACQUIRE_LOCK(&lk);
1448 0 : inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1449 0 : adp->ad_inodedep = inodedep;
1450 :
1451 0 : if (newblk->nb_state == DEPCOMPLETE) {
1452 0 : adp->ad_state |= DEPCOMPLETE;
1453 0 : adp->ad_buf = NULL;
1454 0 : } else {
1455 0 : bmsafemap = newblk->nb_bmsafemap;
1456 0 : adp->ad_buf = bmsafemap->sm_buf;
1457 0 : LIST_REMOVE(newblk, nb_deps);
1458 0 : LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1459 : }
1460 0 : LIST_REMOVE(newblk, nb_hash);
1461 0 : pool_put(&newblk_pool, newblk);
1462 :
1463 0 : if (bp == NULL) {
1464 : /*
1465 : * XXXUBC - Yes, I know how to fix this, but not right now.
1466 : */
1467 0 : panic("softdep_setup_allocdirect: Bonk art in the head");
1468 : }
1469 0 : WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1470 0 : if (lbn >= NDADDR) {
1471 : /* allocating an indirect block */
1472 0 : if (oldblkno != 0) {
1473 0 : FREE_LOCK(&lk);
1474 0 : panic("softdep_setup_allocdirect: non-zero indir");
1475 : }
1476 : } else {
1477 : /*
1478 : * Allocating a direct block.
1479 : *
1480 : * If we are allocating a directory block, then we must
1481 : * allocate an associated pagedep to track additions and
1482 : * deletions.
1483 : */
1484 0 : if ((DIP(ip, mode) & IFMT) == IFDIR &&
1485 0 : pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1486 0 : WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1487 : }
1488 : /*
1489 : * The list of allocdirects must be kept in sorted and ascending
1490 : * order so that the rollback routines can quickly determine the
1491 : * first uncommitted block (the size of the file stored on disk
1492 : * ends at the end of the lowest committed fragment, or if there
1493 : * are no fragments, at the end of the highest committed block).
1494 : * Since files generally grow, the typical case is that the new
1495 : * block is to be added at the end of the list. We speed this
1496 : * special case by checking against the last allocdirect in the
1497 : * list before laboriously traversing the list looking for the
1498 : * insertion point.
1499 : */
1500 0 : adphead = &inodedep->id_newinoupdt;
1501 0 : oldadp = TAILQ_LAST(adphead, allocdirectlst);
1502 0 : if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1503 : /* insert at end of list */
1504 0 : TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1505 0 : if (oldadp != NULL && oldadp->ad_lbn == lbn)
1506 0 : allocdirect_merge(adphead, adp, oldadp);
1507 0 : FREE_LOCK(&lk);
1508 0 : return;
1509 : }
1510 0 : TAILQ_FOREACH(oldadp, adphead, ad_next) {
1511 0 : if (oldadp->ad_lbn >= lbn)
1512 : break;
1513 : }
1514 0 : if (oldadp == NULL) {
1515 0 : FREE_LOCK(&lk);
1516 0 : panic("softdep_setup_allocdirect: lost entry");
1517 : }
1518 : /* insert in middle of list */
1519 0 : TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1520 0 : if (oldadp->ad_lbn == lbn)
1521 0 : allocdirect_merge(adphead, adp, oldadp);
1522 0 : FREE_LOCK(&lk);
1523 0 : }
1524 :
1525 : /*
1526 : * Replace an old allocdirect dependency with a newer one.
1527 : * This routine must be called with splbio interrupts blocked.
1528 : */
1529 : /* head of list holding allocdirects */
1530 : /* allocdirect being added */
1531 : /* existing allocdirect being checked */
1532 : STATIC void
1533 0 : allocdirect_merge(struct allocdirectlst *adphead, struct allocdirect *newadp,
1534 : struct allocdirect *oldadp)
1535 : {
1536 : struct worklist *wk;
1537 : struct freefrag *freefrag;
1538 : struct newdirblk *newdirblk;
1539 :
1540 0 : splassert(IPL_BIO);
1541 :
1542 : #ifdef DEBUG
1543 : if (lk.lkt_held == -1)
1544 : panic("allocdirect_merge: lock not held");
1545 : #endif
1546 0 : if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1547 0 : newadp->ad_oldsize != oldadp->ad_newsize ||
1548 0 : newadp->ad_lbn >= NDADDR) {
1549 0 : FREE_LOCK(&lk);
1550 0 : panic("allocdirect_merge: old %lld != new %lld || lbn %lld >= "
1551 0 : "%d", (long long)newadp->ad_oldblkno,
1552 0 : (long long)oldadp->ad_newblkno, (long long)newadp->ad_lbn,
1553 : NDADDR);
1554 : }
1555 0 : newadp->ad_oldblkno = oldadp->ad_oldblkno;
1556 0 : newadp->ad_oldsize = oldadp->ad_oldsize;
1557 : /*
1558 : * If the old dependency had a fragment to free or had never
1559 : * previously had a block allocated, then the new dependency
1560 : * can immediately post its freefrag and adopt the old freefrag.
1561 : * This action is done by swapping the freefrag dependencies.
1562 : * The new dependency gains the old one's freefrag, and the
1563 : * old one gets the new one and then immediately puts it on
1564 : * the worklist when it is freed by free_allocdirect. It is
1565 : * not possible to do this swap when the old dependency had a
1566 : * non-zero size but no previous fragment to free. This condition
1567 : * arises when the new block is an extension of the old block.
1568 : * Here, the first part of the fragment allocated to the new
1569 : * dependency is part of the block currently claimed on disk by
1570 : * the old dependency, so cannot legitimately be freed until the
1571 : * conditions for the new dependency are fulfilled.
1572 : */
1573 0 : if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1574 0 : freefrag = newadp->ad_freefrag;
1575 0 : newadp->ad_freefrag = oldadp->ad_freefrag;
1576 0 : oldadp->ad_freefrag = freefrag;
1577 0 : }
1578 : /*
1579 : * If we are tracking a new directory-block allocation,
1580 : * move it from the old allocdirect to the new allocdirect.
1581 : */
1582 0 : if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1583 0 : newdirblk = WK_NEWDIRBLK(wk);
1584 0 : WORKLIST_REMOVE(&newdirblk->db_list);
1585 0 : if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1586 0 : panic("allocdirect_merge: extra newdirblk");
1587 0 : WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1588 0 : }
1589 0 : free_allocdirect(adphead, oldadp, 0);
1590 0 : }
1591 :
1592 : /*
1593 : * Allocate a new freefrag structure if needed.
1594 : */
1595 : STATIC struct freefrag *
1596 0 : newfreefrag(struct inode *ip, daddr_t blkno, long size)
1597 : {
1598 : struct freefrag *freefrag;
1599 : struct fs *fs;
1600 :
1601 0 : if (blkno == 0)
1602 0 : return (NULL);
1603 0 : fs = ip->i_fs;
1604 0 : if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1605 0 : panic("newfreefrag: frag size");
1606 0 : freefrag = pool_get(&freefrag_pool, PR_WAITOK);
1607 0 : freefrag->ff_list.wk_type = D_FREEFRAG;
1608 0 : freefrag->ff_state = DIP(ip, uid) & ~ONWORKLIST; /* used below */
1609 0 : freefrag->ff_inum = ip->i_number;
1610 0 : freefrag->ff_mnt = ITOV(ip)->v_mount;
1611 0 : freefrag->ff_devvp = ip->i_devvp;
1612 0 : freefrag->ff_blkno = blkno;
1613 0 : freefrag->ff_fragsize = size;
1614 0 : return (freefrag);
1615 0 : }
1616 :
1617 : /*
1618 : * This workitem de-allocates fragments that were replaced during
1619 : * file block allocation.
1620 : */
1621 : STATIC void
1622 0 : handle_workitem_freefrag(struct freefrag *freefrag)
1623 : {
1624 0 : struct inode tip;
1625 0 : struct ufs1_dinode dtip1;
1626 :
1627 0 : tip.i_vnode = NULL;
1628 0 : tip.i_din1 = &dtip1;
1629 0 : tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1630 0 : tip.i_ump = VFSTOUFS(freefrag->ff_mnt);
1631 0 : tip.i_dev = freefrag->ff_devvp->v_rdev;
1632 0 : tip.i_number = freefrag->ff_inum;
1633 0 : tip.i_ffs1_uid = freefrag->ff_state & ~ONWORKLIST; /* set above */
1634 0 : ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1635 0 : pool_put(&freefrag_pool, freefrag);
1636 0 : }
1637 :
1638 : /*
1639 : * Indirect block allocation dependencies.
1640 : *
1641 : * The same dependencies that exist for a direct block also exist when
1642 : * a new block is allocated and pointed to by an entry in a block of
1643 : * indirect pointers. The undo/redo states described above are also
1644 : * used here. Because an indirect block contains many pointers that
1645 : * may have dependencies, a second copy of the entire in-memory indirect
1646 : * block is kept. The buffer cache copy is always completely up-to-date.
1647 : * The second copy, which is used only as a source for disk writes,
1648 : * contains only the safe pointers (i.e., those that have no remaining
1649 : * update dependencies). The second copy is freed when all pointers
1650 : * are safe. The cache is not allowed to replace indirect blocks with
1651 : * pending update dependencies. If a buffer containing an indirect
1652 : * block with dependencies is written, these routines will mark it
1653 : * dirty again. It can only be successfully written once all the
1654 : * dependencies are removed. The ffs_fsync routine in conjunction with
1655 : * softdep_sync_metadata work together to get all the dependencies
1656 : * removed so that a file can be successfully written to disk. Three
1657 : * procedures are used when setting up indirect block pointer
1658 : * dependencies. The division is necessary because of the organization
1659 : * of the "balloc" routine and because of the distinction between file
1660 : * pages and file metadata blocks.
1661 : */
1662 :
1663 : /*
1664 : * Allocate a new allocindir structure.
1665 : */
1666 : /* inode for file being extended */
1667 : /* offset of pointer in indirect block */
1668 : /* disk block number being added */
1669 : /* previous block number, 0 if none */
1670 : STATIC struct allocindir *
1671 0 : newallocindir(struct inode *ip, int ptrno, daddr_t newblkno,
1672 : daddr_t oldblkno)
1673 : {
1674 : struct allocindir *aip;
1675 :
1676 0 : aip = pool_get(&allocindir_pool, PR_WAITOK | PR_ZERO);
1677 0 : aip->ai_list.wk_type = D_ALLOCINDIR;
1678 0 : aip->ai_state = ATTACHED;
1679 0 : aip->ai_offset = ptrno;
1680 0 : aip->ai_newblkno = newblkno;
1681 0 : aip->ai_oldblkno = oldblkno;
1682 0 : aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1683 0 : return (aip);
1684 : }
1685 :
1686 : /*
1687 : * Called just before setting an indirect block pointer
1688 : * to a newly allocated file page.
1689 : */
1690 : /* inode for file being extended */
1691 : /* allocated block number within file */
1692 : /* buffer with indirect blk referencing page */
1693 : /* offset of pointer in indirect block */
1694 : /* disk block number being added */
1695 : /* previous block number, 0 if none */
1696 : /* buffer holding allocated page */
1697 : void
1698 0 : softdep_setup_allocindir_page(struct inode *ip, daddr_t lbn, struct buf *bp,
1699 : int ptrno, daddr_t newblkno, daddr_t oldblkno, struct buf *nbp)
1700 : {
1701 : struct allocindir *aip;
1702 0 : struct pagedep *pagedep;
1703 :
1704 0 : aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1705 0 : ACQUIRE_LOCK(&lk);
1706 : /*
1707 : * If we are allocating a directory page, then we must
1708 : * allocate an associated pagedep to track additions and
1709 : * deletions.
1710 : */
1711 0 : if ((DIP(ip, mode) & IFMT) == IFDIR &&
1712 0 : pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1713 0 : WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1714 0 : if (nbp == NULL) {
1715 : /*
1716 : * XXXUBC - Yes, I know how to fix this, but not right now.
1717 : */
1718 0 : panic("softdep_setup_allocindir_page: Bonk art in the head");
1719 : }
1720 0 : WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1721 0 : FREE_LOCK(&lk);
1722 0 : setup_allocindir_phase2(bp, ip, aip);
1723 0 : }
1724 :
1725 : /*
1726 : * Called just before setting an indirect block pointer to a
1727 : * newly allocated indirect block.
1728 : */
1729 : /* newly allocated indirect block */
1730 : /* inode for file being extended */
1731 : /* indirect block referencing allocated block */
1732 : /* offset of pointer in indirect block */
1733 : /* disk block number being added */
1734 : void
1735 0 : softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
1736 : struct buf *bp, int ptrno, daddr_t newblkno)
1737 : {
1738 : struct allocindir *aip;
1739 :
1740 0 : aip = newallocindir(ip, ptrno, newblkno, 0);
1741 0 : ACQUIRE_LOCK(&lk);
1742 0 : WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1743 0 : FREE_LOCK(&lk);
1744 0 : setup_allocindir_phase2(bp, ip, aip);
1745 0 : }
1746 :
1747 : /*
1748 : * Called to finish the allocation of the "aip" allocated
1749 : * by one of the two routines above.
1750 : */
1751 : /* in-memory copy of the indirect block */
1752 : /* inode for file being extended */
1753 : /* allocindir allocated by the above routines */
1754 : STATIC void
1755 0 : setup_allocindir_phase2(struct buf *bp, struct inode *ip,
1756 : struct allocindir *aip)
1757 : {
1758 : struct worklist *wk;
1759 : struct indirdep *indirdep, *newindirdep;
1760 : struct bmsafemap *bmsafemap;
1761 : struct allocindir *oldaip;
1762 : struct freefrag *freefrag;
1763 0 : struct newblk *newblk;
1764 :
1765 0 : if (bp->b_lblkno >= 0)
1766 0 : panic("setup_allocindir_phase2: not indir blk");
1767 0 : for (indirdep = NULL, newindirdep = NULL; ; ) {
1768 0 : ACQUIRE_LOCK(&lk);
1769 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1770 0 : if (wk->wk_type != D_INDIRDEP)
1771 : continue;
1772 0 : indirdep = WK_INDIRDEP(wk);
1773 0 : break;
1774 : }
1775 0 : if (indirdep == NULL && newindirdep) {
1776 : indirdep = newindirdep;
1777 0 : WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1778 : newindirdep = NULL;
1779 0 : }
1780 0 : FREE_LOCK(&lk);
1781 0 : if (indirdep) {
1782 0 : if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1783 0 : &newblk) == 0)
1784 0 : panic("setup_allocindir: lost block");
1785 0 : ACQUIRE_LOCK(&lk);
1786 0 : if (newblk->nb_state == DEPCOMPLETE) {
1787 0 : aip->ai_state |= DEPCOMPLETE;
1788 0 : aip->ai_buf = NULL;
1789 0 : } else {
1790 0 : bmsafemap = newblk->nb_bmsafemap;
1791 0 : aip->ai_buf = bmsafemap->sm_buf;
1792 0 : LIST_REMOVE(newblk, nb_deps);
1793 0 : LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1794 : aip, ai_deps);
1795 : }
1796 0 : LIST_REMOVE(newblk, nb_hash);
1797 0 : pool_put(&newblk_pool, newblk);
1798 0 : aip->ai_indirdep = indirdep;
1799 : /*
1800 : * Check to see if there is an existing dependency
1801 : * for this block. If there is, merge the old
1802 : * dependency into the new one.
1803 : */
1804 0 : if (aip->ai_oldblkno == 0)
1805 0 : oldaip = NULL;
1806 : else
1807 :
1808 0 : LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1809 0 : if (oldaip->ai_offset == aip->ai_offset)
1810 : break;
1811 : freefrag = NULL;
1812 0 : if (oldaip != NULL) {
1813 0 : if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1814 0 : FREE_LOCK(&lk);
1815 0 : panic("setup_allocindir_phase2: blkno");
1816 : }
1817 0 : aip->ai_oldblkno = oldaip->ai_oldblkno;
1818 0 : freefrag = aip->ai_freefrag;
1819 0 : aip->ai_freefrag = oldaip->ai_freefrag;
1820 0 : oldaip->ai_freefrag = NULL;
1821 0 : free_allocindir(oldaip, NULL);
1822 0 : }
1823 0 : LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1824 0 : if (ip->i_ump->um_fstype == UM_UFS1)
1825 0 : ((int32_t *)indirdep->ir_savebp->b_data)
1826 0 : [aip->ai_offset] = aip->ai_oldblkno;
1827 : else
1828 0 : ((int64_t *)indirdep->ir_savebp->b_data)
1829 0 : [aip->ai_offset] = aip->ai_oldblkno;
1830 0 : FREE_LOCK(&lk);
1831 0 : if (freefrag != NULL)
1832 0 : handle_workitem_freefrag(freefrag);
1833 : }
1834 0 : if (newindirdep) {
1835 0 : if (indirdep->ir_savebp != NULL)
1836 0 : brelse(newindirdep->ir_savebp);
1837 0 : WORKITEM_FREE(newindirdep, D_INDIRDEP);
1838 0 : }
1839 0 : if (indirdep)
1840 : break;
1841 0 : newindirdep = pool_get(&indirdep_pool, PR_WAITOK);
1842 0 : newindirdep->ir_list.wk_type = D_INDIRDEP;
1843 0 : newindirdep->ir_state = ATTACHED;
1844 0 : if (ip->i_ump->um_fstype == UM_UFS1)
1845 0 : newindirdep->ir_state |= UFS1FMT;
1846 0 : LIST_INIT(&newindirdep->ir_deplisthd);
1847 0 : LIST_INIT(&newindirdep->ir_donehd);
1848 0 : if (bp->b_blkno == bp->b_lblkno) {
1849 0 : VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1850 : NULL);
1851 0 : }
1852 0 : newindirdep->ir_savebp =
1853 0 : getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1854 : #if 0
1855 : BUF_KERNPROC(newindirdep->ir_savebp);
1856 : #endif
1857 0 : memcpy(newindirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
1858 : }
1859 0 : }
1860 :
1861 : /*
1862 : * Block de-allocation dependencies.
1863 : *
1864 : * When blocks are de-allocated, the on-disk pointers must be nullified before
1865 : * the blocks are made available for use by other files. (The true
1866 : * requirement is that old pointers must be nullified before new on-disk
1867 : * pointers are set. We chose this slightly more stringent requirement to
1868 : * reduce complexity.) Our implementation handles this dependency by updating
1869 : * the inode (or indirect block) appropriately but delaying the actual block
1870 : * de-allocation (i.e., freemap and free space count manipulation) until
1871 : * after the updated versions reach stable storage. After the disk is
1872 : * updated, the blocks can be safely de-allocated whenever it is convenient.
1873 : * This implementation handles only the common case of reducing a file's
1874 : * length to zero. Other cases are handled by the conventional synchronous
1875 : * write approach.
1876 : *
1877 : * The ffs implementation with which we worked double-checks
1878 : * the state of the block pointers and file size as it reduces
1879 : * a file's length. Some of this code is replicated here in our
1880 : * soft updates implementation. The freeblks->fb_chkcnt field is
1881 : * used to transfer a part of this information to the procedure
1882 : * that eventually de-allocates the blocks.
1883 : *
1884 : * This routine should be called from the routine that shortens
1885 : * a file's length, before the inode's size or block pointers
1886 : * are modified. It will save the block pointer information for
1887 : * later release and zero the inode so that the calling routine
1888 : * can release it.
1889 : */
1890 : /* The inode whose length is to be reduced */
1891 : /* The new length for the file */
1892 : void
1893 0 : softdep_setup_freeblocks(struct inode *ip, off_t length)
1894 : {
1895 : struct freeblks *freeblks;
1896 0 : struct inodedep *inodedep;
1897 : struct allocdirect *adp;
1898 : struct vnode *vp;
1899 0 : struct buf *bp;
1900 : struct fs *fs;
1901 : int i, delay, error;
1902 :
1903 0 : fs = ip->i_fs;
1904 0 : if (length != 0)
1905 0 : panic("softdep_setup_freeblocks: non-zero length");
1906 0 : freeblks = pool_get(&freeblks_pool, PR_WAITOK | PR_ZERO);
1907 0 : freeblks->fb_list.wk_type = D_FREEBLKS;
1908 0 : freeblks->fb_state = ATTACHED;
1909 0 : freeblks->fb_uid = DIP(ip, uid);
1910 0 : freeblks->fb_previousinum = ip->i_number;
1911 0 : freeblks->fb_devvp = ip->i_devvp;
1912 0 : freeblks->fb_mnt = ITOV(ip)->v_mount;
1913 0 : freeblks->fb_oldsize = DIP(ip, size);
1914 0 : freeblks->fb_newsize = length;
1915 0 : freeblks->fb_chkcnt = DIP(ip, blocks);
1916 :
1917 0 : for (i = 0; i < NDADDR; i++) {
1918 0 : freeblks->fb_dblks[i] = DIP(ip, db[i]);
1919 0 : DIP_ASSIGN(ip, db[i], 0);
1920 : }
1921 :
1922 0 : for (i = 0; i < NIADDR; i++) {
1923 0 : freeblks->fb_iblks[i] = DIP(ip, ib[i]);
1924 0 : DIP_ASSIGN(ip, ib[i], 0);
1925 : }
1926 :
1927 0 : DIP_ASSIGN(ip, blocks, 0);
1928 0 : DIP_ASSIGN(ip, size, 0);
1929 :
1930 : /*
1931 : * Push the zero'ed inode to to its disk buffer so that we are free
1932 : * to delete its dependencies below. Once the dependencies are gone
1933 : * the buffer can be safely released.
1934 : */
1935 0 : if ((error = bread(ip->i_devvp,
1936 0 : fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1937 0 : (int)fs->fs_bsize, &bp)) != 0)
1938 0 : softdep_error("softdep_setup_freeblocks", error);
1939 :
1940 0 : if (ip->i_ump->um_fstype == UM_UFS1)
1941 0 : *((struct ufs1_dinode *) bp->b_data +
1942 0 : ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
1943 : else
1944 0 : *((struct ufs2_dinode *) bp->b_data +
1945 0 : ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
1946 :
1947 : /*
1948 : * Find and eliminate any inode dependencies.
1949 : */
1950 0 : ACQUIRE_LOCK(&lk);
1951 0 : (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1952 0 : if ((inodedep->id_state & IOSTARTED) != 0) {
1953 0 : FREE_LOCK(&lk);
1954 0 : panic("softdep_setup_freeblocks: inode busy");
1955 : }
1956 : /*
1957 : * Add the freeblks structure to the list of operations that
1958 : * must await the zero'ed inode being written to disk. If we
1959 : * still have a bitmap dependency (delay == 0), then the inode
1960 : * has never been written to disk, so we can process the
1961 : * freeblks below once we have deleted the dependencies.
1962 : */
1963 0 : delay = (inodedep->id_state & DEPCOMPLETE);
1964 0 : if (delay)
1965 0 : WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1966 : /*
1967 : * Because the file length has been truncated to zero, any
1968 : * pending block allocation dependency structures associated
1969 : * with this inode are obsolete and can simply be de-allocated.
1970 : * We must first merge the two dependency lists to get rid of
1971 : * any duplicate freefrag structures, then purge the merged list.
1972 : * If we still have a bitmap dependency, then the inode has never
1973 : * been written to disk, so we can free any fragments without delay.
1974 : */
1975 0 : merge_inode_lists(inodedep);
1976 0 : while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
1977 0 : free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1978 0 : FREE_LOCK(&lk);
1979 0 : bdwrite(bp);
1980 : /*
1981 : * We must wait for any I/O in progress to finish so that
1982 : * all potential buffers on the dirty list will be visible.
1983 : * Once they are all there, walk the list and get rid of
1984 : * any dependencies.
1985 : */
1986 0 : vp = ITOV(ip);
1987 0 : ACQUIRE_LOCK(&lk);
1988 0 : drain_output(vp, 1);
1989 0 : while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
1990 0 : if (getdirtybuf(bp, MNT_WAIT) <= 0)
1991 : break;
1992 0 : (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1993 0 : deallocate_dependencies(bp, inodedep);
1994 0 : bp->b_flags |= B_INVAL | B_NOCACHE;
1995 0 : FREE_LOCK(&lk);
1996 0 : brelse(bp);
1997 0 : ACQUIRE_LOCK(&lk);
1998 : }
1999 0 : if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2000 0 : (void) free_inodedep(inodedep);
2001 :
2002 0 : if (delay) {
2003 0 : freeblks->fb_state |= DEPCOMPLETE;
2004 : /*
2005 : * If the inode with zeroed block pointers is now on disk we
2006 : * can start freeing blocks. Add freeblks to the worklist
2007 : * instead of calling handle_workitem_freeblocks() directly as
2008 : * it is more likely that additional IO is needed to complete
2009 : * the request than in the !delay case.
2010 : */
2011 0 : if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2012 0 : add_to_worklist(&freeblks->fb_list);
2013 : }
2014 :
2015 0 : FREE_LOCK(&lk);
2016 : /*
2017 : * If the inode has never been written to disk (delay == 0),
2018 : * then we can process the freeblks now that we have deleted
2019 : * the dependencies.
2020 : */
2021 0 : if (!delay)
2022 0 : handle_workitem_freeblocks(freeblks);
2023 0 : }
2024 :
2025 : /*
2026 : * Reclaim any dependency structures from a buffer that is about to
2027 : * be reallocated to a new vnode. The buffer must be locked, thus,
2028 : * no I/O completion operations can occur while we are manipulating
2029 : * its associated dependencies. The mutex is held so that other I/O's
2030 : * associated with related dependencies do not occur.
2031 : */
2032 : STATIC void
2033 0 : deallocate_dependencies(struct buf *bp, struct inodedep *inodedep)
2034 : {
2035 : struct worklist *wk;
2036 : struct indirdep *indirdep;
2037 : struct allocindir *aip;
2038 : struct pagedep *pagedep;
2039 : struct dirrem *dirrem;
2040 : struct diradd *dap;
2041 : int i;
2042 :
2043 0 : while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2044 0 : switch (wk->wk_type) {
2045 :
2046 : case D_INDIRDEP:
2047 0 : indirdep = WK_INDIRDEP(wk);
2048 : /*
2049 : * None of the indirect pointers will ever be visible,
2050 : * so they can simply be tossed. GOINGAWAY ensures
2051 : * that allocated pointers will be saved in the buffer
2052 : * cache until they are freed. Note that they will
2053 : * only be able to be found by their physical address
2054 : * since the inode mapping the logical address will
2055 : * be gone. The save buffer used for the safe copy
2056 : * was allocated in setup_allocindir_phase2 using
2057 : * the physical address so it could be used for this
2058 : * purpose. Hence we swap the safe copy with the real
2059 : * copy, allowing the safe copy to be freed and holding
2060 : * on to the real copy for later use in indir_trunc.
2061 : */
2062 0 : if (indirdep->ir_state & GOINGAWAY) {
2063 0 : FREE_LOCK(&lk);
2064 0 : panic("deallocate_dependencies: already gone");
2065 : }
2066 0 : indirdep->ir_state |= GOINGAWAY;
2067 0 : while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)))
2068 0 : free_allocindir(aip, inodedep);
2069 0 : if (bp->b_lblkno >= 0 ||
2070 0 : bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
2071 0 : FREE_LOCK(&lk);
2072 0 : panic("deallocate_dependencies: not indir");
2073 : }
2074 0 : memcpy(indirdep->ir_savebp->b_data, bp->b_data,
2075 : bp->b_bcount);
2076 0 : WORKLIST_REMOVE(wk);
2077 0 : WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2078 0 : continue;
2079 :
2080 : case D_PAGEDEP:
2081 0 : pagedep = WK_PAGEDEP(wk);
2082 : /*
2083 : * None of the directory additions will ever be
2084 : * visible, so they can simply be tossed.
2085 : */
2086 0 : for (i = 0; i < DAHASHSZ; i++)
2087 0 : while ((dap =
2088 0 : LIST_FIRST(&pagedep->pd_diraddhd[i])))
2089 0 : free_diradd(dap);
2090 0 : while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)))
2091 0 : free_diradd(dap);
2092 : /*
2093 : * Copy any directory remove dependencies to the list
2094 : * to be processed after the zero'ed inode is written.
2095 : * If the inode has already been written, then they
2096 : * can be dumped directly onto the work list.
2097 : */
2098 0 : while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
2099 0 : LIST_REMOVE(dirrem, dm_next);
2100 0 : dirrem->dm_dirinum = pagedep->pd_ino;
2101 0 : if (inodedep == NULL ||
2102 0 : (inodedep->id_state & ALLCOMPLETE) ==
2103 : ALLCOMPLETE)
2104 0 : add_to_worklist(&dirrem->dm_list);
2105 : else
2106 0 : WORKLIST_INSERT(&inodedep->id_bufwait,
2107 : &dirrem->dm_list);
2108 : }
2109 0 : if ((pagedep->pd_state & NEWBLOCK) != 0) {
2110 0 : LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2111 0 : if (wk->wk_type == D_NEWDIRBLK &&
2112 0 : WK_NEWDIRBLK(wk)->db_pagedep ==
2113 : pagedep)
2114 : break;
2115 0 : if (wk != NULL) {
2116 0 : WORKLIST_REMOVE(wk);
2117 0 : free_newdirblk(WK_NEWDIRBLK(wk));
2118 : } else {
2119 0 : FREE_LOCK(&lk);
2120 0 : panic("deallocate_dependencies: "
2121 : "lost pagedep");
2122 : }
2123 0 : }
2124 0 : WORKLIST_REMOVE(&pagedep->pd_list);
2125 0 : LIST_REMOVE(pagedep, pd_hash);
2126 0 : WORKITEM_FREE(pagedep, D_PAGEDEP);
2127 0 : continue;
2128 :
2129 : case D_ALLOCINDIR:
2130 0 : free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2131 0 : continue;
2132 :
2133 : case D_ALLOCDIRECT:
2134 : case D_INODEDEP:
2135 0 : FREE_LOCK(&lk);
2136 0 : panic("deallocate_dependencies: Unexpected type %s",
2137 0 : TYPENAME(wk->wk_type));
2138 : /* NOTREACHED */
2139 :
2140 : default:
2141 0 : FREE_LOCK(&lk);
2142 0 : panic("deallocate_dependencies: Unknown type %s",
2143 0 : TYPENAME(wk->wk_type));
2144 : /* NOTREACHED */
2145 : }
2146 : }
2147 0 : }
2148 :
2149 : /*
2150 : * Free an allocdirect. Generate a new freefrag work request if appropriate.
2151 : * This routine must be called with splbio interrupts blocked.
2152 : */
2153 : STATIC void
2154 0 : free_allocdirect(struct allocdirectlst *adphead, struct allocdirect *adp,
2155 : int delay)
2156 : {
2157 : struct newdirblk *newdirblk;
2158 : struct worklist *wk;
2159 :
2160 0 : splassert(IPL_BIO);
2161 :
2162 : #ifdef DEBUG
2163 : if (lk.lkt_held == -1)
2164 : panic("free_allocdirect: lock not held");
2165 : #endif
2166 0 : if ((adp->ad_state & DEPCOMPLETE) == 0)
2167 0 : LIST_REMOVE(adp, ad_deps);
2168 0 : TAILQ_REMOVE(adphead, adp, ad_next);
2169 0 : if ((adp->ad_state & COMPLETE) == 0)
2170 0 : WORKLIST_REMOVE(&adp->ad_list);
2171 0 : if (adp->ad_freefrag != NULL) {
2172 0 : if (delay)
2173 0 : WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2174 : &adp->ad_freefrag->ff_list);
2175 : else
2176 0 : add_to_worklist(&adp->ad_freefrag->ff_list);
2177 : }
2178 0 : if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2179 0 : newdirblk = WK_NEWDIRBLK(wk);
2180 0 : WORKLIST_REMOVE(&newdirblk->db_list);
2181 0 : if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2182 0 : panic("free_allocdirect: extra newdirblk");
2183 0 : if (delay)
2184 0 : WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2185 : &newdirblk->db_list);
2186 : else
2187 0 : free_newdirblk(newdirblk);
2188 : }
2189 0 : WORKITEM_FREE(adp, D_ALLOCDIRECT);
2190 0 : }
2191 :
2192 : /*
2193 : * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2194 : * This routine must be called with splbio interrupts blocked.
2195 : */
2196 : void
2197 0 : free_newdirblk(struct newdirblk *newdirblk)
2198 : {
2199 : struct pagedep *pagedep;
2200 : struct diradd *dap;
2201 : int i;
2202 :
2203 0 : splassert(IPL_BIO);
2204 :
2205 : #ifdef DEBUG
2206 : if (lk.lkt_held == -1)
2207 : panic("free_newdirblk: lock not held");
2208 : #endif
2209 : /*
2210 : * If the pagedep is still linked onto the directory buffer
2211 : * dependency chain, then some of the entries on the
2212 : * pd_pendinghd list may not be committed to disk yet. In
2213 : * this case, we will simply clear the NEWBLOCK flag and
2214 : * let the pd_pendinghd list be processed when the pagedep
2215 : * is next written. If the pagedep is no longer on the buffer
2216 : * dependency chain, then all the entries on the pd_pending
2217 : * list are committed to disk and we can free them here.
2218 : */
2219 0 : pagedep = newdirblk->db_pagedep;
2220 0 : pagedep->pd_state &= ~NEWBLOCK;
2221 0 : if ((pagedep->pd_state & ONWORKLIST) == 0)
2222 0 : while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2223 0 : free_diradd(dap);
2224 : /*
2225 : * If no dependencies remain, the pagedep will be freed.
2226 : */
2227 0 : for (i = 0; i < DAHASHSZ; i++)
2228 0 : if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2229 : break;
2230 0 : if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2231 0 : LIST_REMOVE(pagedep, pd_hash);
2232 0 : WORKITEM_FREE(pagedep, D_PAGEDEP);
2233 0 : }
2234 0 : WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2235 0 : }
2236 :
2237 : /*
2238 : * Prepare an inode to be freed. The actual free operation is not
2239 : * done until the zero'ed inode has been written to disk.
2240 : */
2241 : void
2242 0 : softdep_freefile(struct vnode *pvp, ufsino_t ino, mode_t mode)
2243 : {
2244 0 : struct inode *ip = VTOI(pvp);
2245 0 : struct inodedep *inodedep;
2246 : struct freefile *freefile;
2247 :
2248 : /*
2249 : * This sets up the inode de-allocation dependency.
2250 : */
2251 0 : freefile = pool_get(&freefile_pool, PR_WAITOK);
2252 0 : freefile->fx_list.wk_type = D_FREEFILE;
2253 0 : freefile->fx_list.wk_state = 0;
2254 0 : freefile->fx_mode = mode;
2255 0 : freefile->fx_oldinum = ino;
2256 0 : freefile->fx_devvp = ip->i_devvp;
2257 0 : freefile->fx_mnt = ITOV(ip)->v_mount;
2258 :
2259 : /*
2260 : * If the inodedep does not exist, then the zero'ed inode has
2261 : * been written to disk. If the allocated inode has never been
2262 : * written to disk, then the on-disk inode is zero'ed. In either
2263 : * case we can free the file immediately.
2264 : */
2265 0 : ACQUIRE_LOCK(&lk);
2266 0 : if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2267 0 : check_inode_unwritten(inodedep)) {
2268 0 : FREE_LOCK(&lk);
2269 0 : handle_workitem_freefile(freefile);
2270 0 : return;
2271 : }
2272 0 : WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2273 0 : FREE_LOCK(&lk);
2274 0 : }
2275 :
2276 : /*
2277 : * Check to see if an inode has never been written to disk. If
2278 : * so free the inodedep and return success, otherwise return failure.
2279 : * This routine must be called with splbio interrupts blocked.
2280 : *
2281 : * If we still have a bitmap dependency, then the inode has never
2282 : * been written to disk. Drop the dependency as it is no longer
2283 : * necessary since the inode is being deallocated. We set the
2284 : * ALLCOMPLETE flags since the bitmap now properly shows that the
2285 : * inode is not allocated. Even if the inode is actively being
2286 : * written, it has been rolled back to its zero'ed state, so we
2287 : * are ensured that a zero inode is what is on the disk. For short
2288 : * lived files, this change will usually result in removing all the
2289 : * dependencies from the inode so that it can be freed immediately.
2290 : */
2291 : STATIC int
2292 0 : check_inode_unwritten(struct inodedep *inodedep)
2293 : {
2294 0 : splassert(IPL_BIO);
2295 :
2296 0 : if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2297 0 : LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2298 0 : LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2299 0 : LIST_FIRST(&inodedep->id_inowait) != NULL ||
2300 0 : TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2301 0 : TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2302 0 : inodedep->id_nlinkdelta != 0)
2303 0 : return (0);
2304 0 : inodedep->id_state |= ALLCOMPLETE;
2305 0 : LIST_REMOVE(inodedep, id_deps);
2306 0 : inodedep->id_buf = NULL;
2307 0 : if (inodedep->id_state & ONWORKLIST)
2308 0 : WORKLIST_REMOVE(&inodedep->id_list);
2309 0 : if (inodedep->id_savedino1 != NULL) {
2310 0 : free(inodedep->id_savedino1, M_INODEDEP, inodedep->id_unsize);
2311 0 : inodedep->id_savedino1 = NULL;
2312 0 : }
2313 0 : if (free_inodedep(inodedep) == 0) {
2314 0 : FREE_LOCK(&lk);
2315 0 : panic("check_inode_unwritten: busy inode");
2316 : }
2317 0 : return (1);
2318 0 : }
2319 :
2320 : /*
2321 : * Try to free an inodedep structure. Return 1 if it could be freed.
2322 : */
2323 : STATIC int
2324 0 : free_inodedep(struct inodedep *inodedep)
2325 : {
2326 :
2327 0 : if ((inodedep->id_state & ONWORKLIST) != 0 ||
2328 0 : (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2329 0 : LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2330 0 : LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2331 0 : LIST_FIRST(&inodedep->id_inowait) != NULL ||
2332 0 : TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2333 0 : TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2334 0 : inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2335 0 : return (0);
2336 0 : LIST_REMOVE(inodedep, id_hash);
2337 0 : WORKITEM_FREE(inodedep, D_INODEDEP);
2338 0 : num_inodedep -= 1;
2339 0 : return (1);
2340 0 : }
2341 :
2342 : /*
2343 : * This workitem routine performs the block de-allocation.
2344 : * The workitem is added to the pending list after the updated
2345 : * inode block has been written to disk. As mentioned above,
2346 : * checks regarding the number of blocks de-allocated (compared
2347 : * to the number of blocks allocated for the file) are also
2348 : * performed in this function.
2349 : */
2350 : STATIC void
2351 0 : handle_workitem_freeblocks(struct freeblks *freeblks)
2352 : {
2353 0 : struct inode tip;
2354 : daddr_t bn;
2355 0 : union {
2356 : struct ufs1_dinode di1;
2357 : struct ufs2_dinode di2;
2358 : } di;
2359 : struct fs *fs;
2360 : int i, level, bsize;
2361 0 : long nblocks, blocksreleased = 0;
2362 : int error, allerror = 0;
2363 0 : daddr_t baselbns[NIADDR], tmpval;
2364 :
2365 0 : if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UM_UFS1)
2366 0 : tip.i_din1 = &di.di1;
2367 : else
2368 0 : tip.i_din2 = &di.di2;
2369 :
2370 0 : tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2371 0 : tip.i_number = freeblks->fb_previousinum;
2372 0 : tip.i_ump = VFSTOUFS(freeblks->fb_mnt);
2373 0 : tip.i_dev = freeblks->fb_devvp->v_rdev;
2374 0 : DIP_ASSIGN(&tip, size, freeblks->fb_oldsize);
2375 0 : DIP_ASSIGN(&tip, uid, freeblks->fb_uid);
2376 0 : tip.i_vnode = NULL;
2377 : tmpval = 1;
2378 0 : baselbns[0] = NDADDR;
2379 0 : for (i = 1; i < NIADDR; i++) {
2380 0 : tmpval *= NINDIR(fs);
2381 0 : baselbns[i] = baselbns[i - 1] + tmpval;
2382 : }
2383 0 : nblocks = btodb(fs->fs_bsize);
2384 0 : blocksreleased = 0;
2385 : /*
2386 : * Indirect blocks first.
2387 : */
2388 0 : for (level = (NIADDR - 1); level >= 0; level--) {
2389 0 : if ((bn = freeblks->fb_iblks[level]) == 0)
2390 : continue;
2391 0 : if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2392 0 : baselbns[level], &blocksreleased)) != 0)
2393 0 : allerror = error;
2394 0 : ffs_blkfree(&tip, bn, fs->fs_bsize);
2395 0 : blocksreleased += nblocks;
2396 0 : }
2397 : /*
2398 : * All direct blocks or frags.
2399 : */
2400 0 : for (i = (NDADDR - 1); i >= 0; i--) {
2401 0 : if ((bn = freeblks->fb_dblks[i]) == 0)
2402 : continue;
2403 0 : bsize = blksize(fs, &tip, i);
2404 0 : ffs_blkfree(&tip, bn, bsize);
2405 0 : blocksreleased += btodb(bsize);
2406 0 : }
2407 :
2408 : #ifdef DIAGNOSTIC
2409 0 : if (freeblks->fb_chkcnt != blocksreleased)
2410 0 : printf("handle_workitem_freeblocks: block count\n");
2411 0 : if (allerror)
2412 0 : softdep_error("handle_workitem_freeblks", allerror);
2413 : #endif /* DIAGNOSTIC */
2414 0 : WORKITEM_FREE(freeblks, D_FREEBLKS);
2415 0 : }
2416 :
2417 : /*
2418 : * Release blocks associated with the inode ip and stored in the indirect
2419 : * block dbn. If level is greater than SINGLE, the block is an indirect block
2420 : * and recursive calls to indirtrunc must be used to cleanse other indirect
2421 : * blocks.
2422 : */
2423 : STATIC int
2424 0 : indir_trunc(struct inode *ip, daddr_t dbn, int level, daddr_t lbn,
2425 : long *countp)
2426 : {
2427 0 : struct buf *bp;
2428 : int32_t *bap1 = NULL;
2429 : int64_t nb, *bap2 = NULL;
2430 : struct fs *fs;
2431 : struct worklist *wk;
2432 : struct indirdep *indirdep;
2433 : int i, lbnadd, nblocks, ufs1fmt;
2434 : int error, allerror = 0;
2435 :
2436 0 : fs = ip->i_fs;
2437 : lbnadd = 1;
2438 0 : for (i = level; i > 0; i--)
2439 0 : lbnadd *= NINDIR(fs);
2440 : /*
2441 : * Get buffer of block pointers to be freed. This routine is not
2442 : * called until the zero'ed inode has been written, so it is safe
2443 : * to free blocks as they are encountered. Because the inode has
2444 : * been zero'ed, calls to bmap on these blocks will fail. So, we
2445 : * have to use the on-disk address and the block device for the
2446 : * filesystem to look them up. If the file was deleted before its
2447 : * indirect blocks were all written to disk, the routine that set
2448 : * us up (deallocate_dependencies) will have arranged to leave
2449 : * a complete copy of the indirect block in memory for our use.
2450 : * Otherwise we have to read the blocks in from the disk.
2451 : */
2452 0 : ACQUIRE_LOCK(&lk);
2453 0 : if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2454 0 : (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2455 0 : if (wk->wk_type != D_INDIRDEP ||
2456 0 : (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2457 0 : (indirdep->ir_state & GOINGAWAY) == 0) {
2458 0 : FREE_LOCK(&lk);
2459 0 : panic("indir_trunc: lost indirdep");
2460 : }
2461 0 : WORKLIST_REMOVE(wk);
2462 0 : WORKITEM_FREE(indirdep, D_INDIRDEP);
2463 0 : if (LIST_FIRST(&bp->b_dep) != NULL) {
2464 : FREE_LOCK(&lk);
2465 0 : panic("indir_trunc: dangling dep");
2466 : }
2467 : FREE_LOCK(&lk);
2468 : } else {
2469 0 : FREE_LOCK(&lk);
2470 0 : error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, &bp);
2471 0 : if (error)
2472 0 : return (error);
2473 : }
2474 : /*
2475 : * Recursively free indirect blocks.
2476 : */
2477 0 : if (ip->i_ump->um_fstype == UM_UFS1) {
2478 : ufs1fmt = 1;
2479 0 : bap1 = (int32_t *)bp->b_data;
2480 0 : } else {
2481 : ufs1fmt = 0;
2482 0 : bap2 = (int64_t *)bp->b_data;
2483 : }
2484 0 : nblocks = btodb(fs->fs_bsize);
2485 0 : for (i = NINDIR(fs) - 1; i >= 0; i--) {
2486 0 : if (ufs1fmt)
2487 0 : nb = bap1[i];
2488 : else
2489 0 : nb = bap2[i];
2490 0 : if (nb == 0)
2491 : continue;
2492 0 : if (level != 0) {
2493 0 : if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2494 0 : level - 1, lbn + (i * lbnadd), countp)) != 0)
2495 0 : allerror = error;
2496 : }
2497 0 : ffs_blkfree(ip, nb, fs->fs_bsize);
2498 0 : *countp += nblocks;
2499 0 : }
2500 0 : bp->b_flags |= B_INVAL | B_NOCACHE;
2501 0 : brelse(bp);
2502 0 : return (allerror);
2503 0 : }
2504 :
2505 : /*
2506 : * Free an allocindir.
2507 : * This routine must be called with splbio interrupts blocked.
2508 : */
2509 : STATIC void
2510 0 : free_allocindir(struct allocindir *aip, struct inodedep *inodedep)
2511 : {
2512 : struct freefrag *freefrag;
2513 :
2514 0 : splassert(IPL_BIO);
2515 :
2516 : #ifdef DEBUG
2517 : if (lk.lkt_held == -1)
2518 : panic("free_allocindir: lock not held");
2519 : #endif
2520 0 : if ((aip->ai_state & DEPCOMPLETE) == 0)
2521 0 : LIST_REMOVE(aip, ai_deps);
2522 0 : if (aip->ai_state & ONWORKLIST)
2523 0 : WORKLIST_REMOVE(&aip->ai_list);
2524 0 : LIST_REMOVE(aip, ai_next);
2525 0 : if ((freefrag = aip->ai_freefrag) != NULL) {
2526 0 : if (inodedep == NULL)
2527 0 : add_to_worklist(&freefrag->ff_list);
2528 : else
2529 0 : WORKLIST_INSERT(&inodedep->id_bufwait,
2530 : &freefrag->ff_list);
2531 : }
2532 0 : WORKITEM_FREE(aip, D_ALLOCINDIR);
2533 0 : }
2534 :
2535 : /*
2536 : * Directory entry addition dependencies.
2537 : *
2538 : * When adding a new directory entry, the inode (with its incremented link
2539 : * count) must be written to disk before the directory entry's pointer to it.
2540 : * Also, if the inode is newly allocated, the corresponding freemap must be
2541 : * updated (on disk) before the directory entry's pointer. These requirements
2542 : * are met via undo/redo on the directory entry's pointer, which consists
2543 : * simply of the inode number.
2544 : *
2545 : * As directory entries are added and deleted, the free space within a
2546 : * directory block can become fragmented. The ufs file system will compact
2547 : * a fragmented directory block to make space for a new entry. When this
2548 : * occurs, the offsets of previously added entries change. Any "diradd"
2549 : * dependency structures corresponding to these entries must be updated with
2550 : * the new offsets.
2551 : */
2552 :
2553 : /*
2554 : * This routine is called after the in-memory inode's link
2555 : * count has been incremented, but before the directory entry's
2556 : * pointer to the inode has been set.
2557 : */
2558 : /* buffer containing directory block */
2559 : /* inode for directory */
2560 : /* offset of new entry in directory */
2561 : /* inode referenced by new directory entry */
2562 : /* non-NULL => contents of new mkdir */
2563 : /* entry is in a newly allocated block */
2564 : int
2565 0 : softdep_setup_directory_add(struct buf *bp, struct inode *dp, off_t diroffset,
2566 : long newinum, struct buf *newdirbp, int isnewblk)
2567 : {
2568 : int offset; /* offset of new entry within directory block */
2569 : daddr_t lbn; /* block in directory containing new entry */
2570 : struct fs *fs;
2571 : struct diradd *dap;
2572 : struct allocdirect *adp;
2573 0 : struct pagedep *pagedep;
2574 0 : struct inodedep *inodedep;
2575 : struct newdirblk *newdirblk = NULL;
2576 : struct mkdir *mkdir1, *mkdir2;
2577 :
2578 :
2579 0 : fs = dp->i_fs;
2580 0 : lbn = lblkno(fs, diroffset);
2581 0 : offset = blkoff(fs, diroffset);
2582 0 : dap = pool_get(&diradd_pool, PR_WAITOK | PR_ZERO);
2583 0 : dap->da_list.wk_type = D_DIRADD;
2584 0 : dap->da_offset = offset;
2585 0 : dap->da_newinum = newinum;
2586 0 : dap->da_state = ATTACHED;
2587 0 : if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2588 0 : newdirblk = pool_get(&newdirblk_pool, PR_WAITOK);
2589 0 : newdirblk->db_list.wk_type = D_NEWDIRBLK;
2590 0 : newdirblk->db_state = 0;
2591 0 : }
2592 0 : if (newdirbp == NULL) {
2593 0 : dap->da_state |= DEPCOMPLETE;
2594 0 : ACQUIRE_LOCK(&lk);
2595 0 : } else {
2596 0 : dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2597 0 : mkdir1 = pool_get(&mkdir_pool, PR_WAITOK);
2598 0 : mkdir1->md_list.wk_type = D_MKDIR;
2599 0 : mkdir1->md_state = MKDIR_BODY;
2600 0 : mkdir1->md_diradd = dap;
2601 0 : mkdir2 = pool_get(&mkdir_pool, PR_WAITOK);
2602 0 : mkdir2->md_list.wk_type = D_MKDIR;
2603 0 : mkdir2->md_state = MKDIR_PARENT;
2604 0 : mkdir2->md_diradd = dap;
2605 : /*
2606 : * Dependency on "." and ".." being written to disk.
2607 : */
2608 0 : mkdir1->md_buf = newdirbp;
2609 0 : ACQUIRE_LOCK(&lk);
2610 0 : LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2611 0 : WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2612 0 : FREE_LOCK(&lk);
2613 0 : bdwrite(newdirbp);
2614 : /*
2615 : * Dependency on link count increase for parent directory
2616 : */
2617 0 : ACQUIRE_LOCK(&lk);
2618 0 : if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2619 0 : || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2620 0 : dap->da_state &= ~MKDIR_PARENT;
2621 0 : WORKITEM_FREE(mkdir2, D_MKDIR);
2622 0 : } else {
2623 0 : LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2624 0 : WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2625 : }
2626 : }
2627 : /*
2628 : * Link into parent directory pagedep to await its being written.
2629 : */
2630 0 : if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2631 0 : WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2632 0 : dap->da_pagedep = pagedep;
2633 0 : LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2634 : da_pdlist);
2635 : /*
2636 : * Link into its inodedep. Put it on the id_bufwait list if the inode
2637 : * is not yet written. If it is written, do the post-inode write
2638 : * processing to put it on the id_pendinghd list.
2639 : */
2640 0 : (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2641 0 : if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2642 0 : diradd_inode_written(dap, inodedep);
2643 : else
2644 0 : WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2645 0 : if (isnewblk) {
2646 : /*
2647 : * Directories growing into indirect blocks are rare
2648 : * enough and the frequency of new block allocation
2649 : * in those cases even more rare, that we choose not
2650 : * to bother tracking them. Rather we simply force the
2651 : * new directory entry to disk.
2652 : */
2653 0 : if (lbn >= NDADDR) {
2654 0 : FREE_LOCK(&lk);
2655 : /*
2656 : * We only have a new allocation when at the
2657 : * beginning of a new block, not when we are
2658 : * expanding into an existing block.
2659 : */
2660 0 : if (blkoff(fs, diroffset) == 0)
2661 0 : return (1);
2662 0 : return (0);
2663 : }
2664 : /*
2665 : * We only have a new allocation when at the beginning
2666 : * of a new fragment, not when we are expanding into an
2667 : * existing fragment. Also, there is nothing to do if we
2668 : * are already tracking this block.
2669 : */
2670 0 : if (fragoff(fs, diroffset) != 0) {
2671 0 : FREE_LOCK(&lk);
2672 0 : return (0);
2673 : }
2674 :
2675 0 : if ((pagedep->pd_state & NEWBLOCK) != 0) {
2676 0 : WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2677 0 : FREE_LOCK(&lk);
2678 0 : return (0);
2679 : }
2680 : /*
2681 : * Find our associated allocdirect and have it track us.
2682 : */
2683 0 : if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2684 0 : panic("softdep_setup_directory_add: lost inodedep");
2685 0 : adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2686 0 : if (adp == NULL || adp->ad_lbn != lbn) {
2687 0 : FREE_LOCK(&lk);
2688 0 : panic("softdep_setup_directory_add: lost entry");
2689 : }
2690 0 : pagedep->pd_state |= NEWBLOCK;
2691 0 : newdirblk->db_pagedep = pagedep;
2692 0 : WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2693 0 : }
2694 0 : FREE_LOCK(&lk);
2695 0 : return (0);
2696 0 : }
2697 :
2698 : /*
2699 : * This procedure is called to change the offset of a directory
2700 : * entry when compacting a directory block which must be owned
2701 : * exclusively by the caller. Note that the actual entry movement
2702 : * must be done in this procedure to ensure that no I/O completions
2703 : * occur while the move is in progress.
2704 : */
2705 : /* inode for directory */
2706 : /* address of dp->i_offset */
2707 : /* address of old directory location */
2708 : /* address of new directory location */
2709 : /* size of directory entry */
2710 : void
2711 0 : softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
2712 : caddr_t oldloc, caddr_t newloc, int entrysize)
2713 : {
2714 : int offset, oldoffset, newoffset;
2715 0 : struct pagedep *pagedep;
2716 : struct diradd *dap;
2717 : daddr_t lbn;
2718 :
2719 0 : ACQUIRE_LOCK(&lk);
2720 0 : lbn = lblkno(dp->i_fs, dp->i_offset);
2721 0 : offset = blkoff(dp->i_fs, dp->i_offset);
2722 0 : if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2723 : goto done;
2724 0 : oldoffset = offset + (oldloc - base);
2725 0 : newoffset = offset + (newloc - base);
2726 :
2727 0 : LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2728 0 : if (dap->da_offset != oldoffset)
2729 : continue;
2730 0 : dap->da_offset = newoffset;
2731 0 : if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2732 : break;
2733 0 : LIST_REMOVE(dap, da_pdlist);
2734 0 : LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2735 : dap, da_pdlist);
2736 0 : break;
2737 : }
2738 0 : if (dap == NULL) {
2739 :
2740 0 : LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2741 0 : if (dap->da_offset == oldoffset) {
2742 0 : dap->da_offset = newoffset;
2743 0 : break;
2744 : }
2745 : }
2746 : }
2747 : done:
2748 0 : memmove(newloc, oldloc, entrysize);
2749 0 : FREE_LOCK(&lk);
2750 0 : }
2751 :
2752 : /*
2753 : * Free a diradd dependency structure. This routine must be called
2754 : * with splbio interrupts blocked.
2755 : */
2756 : STATIC void
2757 0 : free_diradd(struct diradd *dap)
2758 : {
2759 : struct dirrem *dirrem;
2760 : struct pagedep *pagedep;
2761 0 : struct inodedep *inodedep;
2762 : struct mkdir *mkdir, *nextmd;
2763 :
2764 0 : splassert(IPL_BIO);
2765 :
2766 : #ifdef DEBUG
2767 : if (lk.lkt_held == -1)
2768 : panic("free_diradd: lock not held");
2769 : #endif
2770 0 : WORKLIST_REMOVE(&dap->da_list);
2771 0 : LIST_REMOVE(dap, da_pdlist);
2772 0 : if ((dap->da_state & DIRCHG) == 0) {
2773 0 : pagedep = dap->da_pagedep;
2774 0 : } else {
2775 0 : dirrem = dap->da_previous;
2776 0 : pagedep = dirrem->dm_pagedep;
2777 0 : dirrem->dm_dirinum = pagedep->pd_ino;
2778 0 : add_to_worklist(&dirrem->dm_list);
2779 : }
2780 0 : if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2781 0 : 0, &inodedep) != 0)
2782 0 : (void) free_inodedep(inodedep);
2783 0 : if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2784 0 : for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2785 0 : nextmd = LIST_NEXT(mkdir, md_mkdirs);
2786 0 : if (mkdir->md_diradd != dap)
2787 : continue;
2788 0 : dap->da_state &= ~mkdir->md_state;
2789 0 : WORKLIST_REMOVE(&mkdir->md_list);
2790 0 : LIST_REMOVE(mkdir, md_mkdirs);
2791 0 : WORKITEM_FREE(mkdir, D_MKDIR);
2792 0 : }
2793 0 : if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2794 0 : FREE_LOCK(&lk);
2795 0 : panic("free_diradd: unfound ref");
2796 : }
2797 : }
2798 0 : WORKITEM_FREE(dap, D_DIRADD);
2799 0 : }
2800 :
2801 : /*
2802 : * Directory entry removal dependencies.
2803 : *
2804 : * When removing a directory entry, the entry's inode pointer must be
2805 : * zero'ed on disk before the corresponding inode's link count is decremented
2806 : * (possibly freeing the inode for re-use). This dependency is handled by
2807 : * updating the directory entry but delaying the inode count reduction until
2808 : * after the directory block has been written to disk. After this point, the
2809 : * inode count can be decremented whenever it is convenient.
2810 : */
2811 :
2812 : /*
2813 : * This routine should be called immediately after removing
2814 : * a directory entry. The inode's link count should not be
2815 : * decremented by the calling procedure -- the soft updates
2816 : * code will do this task when it is safe.
2817 : */
2818 : /* buffer containing directory block */
2819 : /* inode for the directory being modified */
2820 : /* inode for directory entry being removed */
2821 : /* indicates if doing RMDIR */
2822 : void
2823 0 : softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
2824 : int isrmdir)
2825 : {
2826 0 : struct dirrem *dirrem, *prevdirrem;
2827 :
2828 : /*
2829 : * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2830 : */
2831 0 : dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2832 :
2833 : /*
2834 : * If the COMPLETE flag is clear, then there were no active
2835 : * entries and we want to roll back to a zeroed entry until
2836 : * the new inode is committed to disk. If the COMPLETE flag is
2837 : * set then we have deleted an entry that never made it to
2838 : * disk. If the entry we deleted resulted from a name change,
2839 : * then the old name still resides on disk. We cannot delete
2840 : * its inode (returned to us in prevdirrem) until the zeroed
2841 : * directory entry gets to disk. The new inode has never been
2842 : * referenced on the disk, so can be deleted immediately.
2843 : */
2844 0 : if ((dirrem->dm_state & COMPLETE) == 0) {
2845 0 : LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2846 : dm_next);
2847 0 : FREE_LOCK(&lk);
2848 0 : } else {
2849 0 : if (prevdirrem != NULL)
2850 0 : LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2851 : prevdirrem, dm_next);
2852 0 : dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2853 0 : FREE_LOCK(&lk);
2854 0 : handle_workitem_remove(dirrem);
2855 : }
2856 0 : }
2857 :
2858 : STATIC long num_dirrem; /* number of dirrem allocated */
2859 : /*
2860 : * Allocate a new dirrem if appropriate and return it along with
2861 : * its associated pagedep. Called without a lock, returns with lock.
2862 : */
2863 : /* buffer containing directory block */
2864 : /* inode for the directory being modified */
2865 : /* inode for directory entry being removed */
2866 : /* indicates if doing RMDIR */
2867 : /* previously referenced inode, if any */
2868 : STATIC struct dirrem *
2869 0 : newdirrem(struct buf *bp, struct inode *dp, struct inode *ip, int isrmdir,
2870 : struct dirrem **prevdirremp)
2871 : {
2872 : int offset;
2873 : daddr_t lbn;
2874 : struct diradd *dap;
2875 : struct dirrem *dirrem;
2876 0 : struct pagedep *pagedep;
2877 :
2878 : /*
2879 : * Whiteouts have no deletion dependencies.
2880 : */
2881 0 : if (ip == NULL)
2882 0 : panic("newdirrem: whiteout");
2883 : /*
2884 : * If we are over our limit, try to improve the situation.
2885 : * Limiting the number of dirrem structures will also limit
2886 : * the number of freefile and freeblks structures.
2887 : */
2888 0 : if (num_dirrem > max_softdeps / 2)
2889 0 : (void) request_cleanup(FLUSH_REMOVE, 0);
2890 0 : num_dirrem += 1;
2891 0 : dirrem = pool_get(&dirrem_pool, PR_WAITOK | PR_ZERO);
2892 0 : dirrem->dm_list.wk_type = D_DIRREM;
2893 0 : dirrem->dm_state = isrmdir ? RMDIR : 0;
2894 0 : dirrem->dm_mnt = ITOV(ip)->v_mount;
2895 0 : dirrem->dm_oldinum = ip->i_number;
2896 0 : *prevdirremp = NULL;
2897 :
2898 0 : ACQUIRE_LOCK(&lk);
2899 0 : lbn = lblkno(dp->i_fs, dp->i_offset);
2900 0 : offset = blkoff(dp->i_fs, dp->i_offset);
2901 0 : if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2902 0 : WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2903 0 : dirrem->dm_pagedep = pagedep;
2904 : /*
2905 : * Check for a diradd dependency for the same directory entry.
2906 : * If present, then both dependencies become obsolete and can
2907 : * be de-allocated. Check for an entry on both the pd_dirraddhd
2908 : * list and the pd_pendinghd list.
2909 : */
2910 :
2911 0 : LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2912 0 : if (dap->da_offset == offset)
2913 : break;
2914 0 : if (dap == NULL) {
2915 :
2916 0 : LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2917 0 : if (dap->da_offset == offset)
2918 : break;
2919 0 : if (dap == NULL)
2920 0 : return (dirrem);
2921 : }
2922 : /*
2923 : * Must be ATTACHED at this point.
2924 : */
2925 0 : if ((dap->da_state & ATTACHED) == 0) {
2926 0 : FREE_LOCK(&lk);
2927 0 : panic("newdirrem: not ATTACHED");
2928 : }
2929 0 : if (dap->da_newinum != ip->i_number) {
2930 0 : FREE_LOCK(&lk);
2931 0 : panic("newdirrem: inum %u should be %u",
2932 0 : ip->i_number, dap->da_newinum);
2933 : }
2934 : /*
2935 : * If we are deleting a changed name that never made it to disk,
2936 : * then return the dirrem describing the previous inode (which
2937 : * represents the inode currently referenced from this entry on disk).
2938 : */
2939 0 : if ((dap->da_state & DIRCHG) != 0) {
2940 0 : *prevdirremp = dap->da_previous;
2941 0 : dap->da_state &= ~DIRCHG;
2942 0 : dap->da_pagedep = pagedep;
2943 0 : }
2944 : /*
2945 : * We are deleting an entry that never made it to disk.
2946 : * Mark it COMPLETE so we can delete its inode immediately.
2947 : */
2948 0 : dirrem->dm_state |= COMPLETE;
2949 0 : free_diradd(dap);
2950 0 : return (dirrem);
2951 0 : }
2952 :
2953 : /*
2954 : * Directory entry change dependencies.
2955 : *
2956 : * Changing an existing directory entry requires that an add operation
2957 : * be completed first followed by a deletion. The semantics for the addition
2958 : * are identical to the description of adding a new entry above except
2959 : * that the rollback is to the old inode number rather than zero. Once
2960 : * the addition dependency is completed, the removal is done as described
2961 : * in the removal routine above.
2962 : */
2963 :
2964 : /*
2965 : * This routine should be called immediately after changing
2966 : * a directory entry. The inode's link count should not be
2967 : * decremented by the calling procedure -- the soft updates
2968 : * code will perform this task when it is safe.
2969 : */
2970 : /* buffer containing directory block */
2971 : /* inode for the directory being modified */
2972 : /* inode for directory entry being removed */
2973 : /* new inode number for changed entry */
2974 : /* indicates if doing RMDIR */
2975 : void
2976 0 : softdep_setup_directory_change(struct buf *bp, struct inode *dp,
2977 : struct inode *ip, long newinum, int isrmdir)
2978 : {
2979 : int offset;
2980 : struct diradd *dap;
2981 0 : struct dirrem *dirrem, *prevdirrem;
2982 : struct pagedep *pagedep;
2983 0 : struct inodedep *inodedep;
2984 :
2985 0 : offset = blkoff(dp->i_fs, dp->i_offset);
2986 0 : dap = pool_get(&diradd_pool, PR_WAITOK | PR_ZERO);
2987 0 : dap->da_list.wk_type = D_DIRADD;
2988 0 : dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2989 0 : dap->da_offset = offset;
2990 0 : dap->da_newinum = newinum;
2991 :
2992 : /*
2993 : * Allocate a new dirrem and ACQUIRE_LOCK.
2994 : */
2995 0 : dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2996 0 : pagedep = dirrem->dm_pagedep;
2997 : /*
2998 : * The possible values for isrmdir:
2999 : * 0 - non-directory file rename
3000 : * 1 - directory rename within same directory
3001 : * inum - directory rename to new directory of given inode number
3002 : * When renaming to a new directory, we are both deleting and
3003 : * creating a new directory entry, so the link count on the new
3004 : * directory should not change. Thus we do not need the followup
3005 : * dirrem which is usually done in handle_workitem_remove. We set
3006 : * the DIRCHG flag to tell handle_workitem_remove to skip the
3007 : * followup dirrem.
3008 : */
3009 0 : if (isrmdir > 1)
3010 0 : dirrem->dm_state |= DIRCHG;
3011 :
3012 : /*
3013 : * If the COMPLETE flag is clear, then there were no active
3014 : * entries and we want to roll back to the previous inode until
3015 : * the new inode is committed to disk. If the COMPLETE flag is
3016 : * set, then we have deleted an entry that never made it to disk.
3017 : * If the entry we deleted resulted from a name change, then the old
3018 : * inode reference still resides on disk. Any rollback that we do
3019 : * needs to be to that old inode (returned to us in prevdirrem). If
3020 : * the entry we deleted resulted from a create, then there is
3021 : * no entry on the disk, so we want to roll back to zero rather
3022 : * than the uncommitted inode. In either of the COMPLETE cases we
3023 : * want to immediately free the unwritten and unreferenced inode.
3024 : */
3025 0 : if ((dirrem->dm_state & COMPLETE) == 0) {
3026 0 : dap->da_previous = dirrem;
3027 0 : } else {
3028 0 : if (prevdirrem != NULL) {
3029 0 : dap->da_previous = prevdirrem;
3030 0 : } else {
3031 0 : dap->da_state &= ~DIRCHG;
3032 0 : dap->da_pagedep = pagedep;
3033 : }
3034 0 : dirrem->dm_dirinum = pagedep->pd_ino;
3035 0 : add_to_worklist(&dirrem->dm_list);
3036 : }
3037 : /*
3038 : * Link into its inodedep. Put it on the id_bufwait list if the inode
3039 : * is not yet written. If it is written, do the post-inode write
3040 : * processing to put it on the id_pendinghd list.
3041 : */
3042 0 : if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3043 0 : (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3044 0 : dap->da_state |= COMPLETE;
3045 0 : LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3046 0 : WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3047 0 : } else {
3048 0 : LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3049 : dap, da_pdlist);
3050 0 : WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3051 : }
3052 0 : FREE_LOCK(&lk);
3053 0 : }
3054 :
3055 : /*
3056 : * Called whenever the link count on an inode is changed.
3057 : * It creates an inode dependency so that the new reference(s)
3058 : * to the inode cannot be committed to disk until the updated
3059 : * inode has been written.
3060 : */
3061 : /* the inode with the increased link count */
3062 : /* do background work or not */
3063 : void
3064 0 : softdep_change_linkcnt(struct inode *ip, int nodelay)
3065 : {
3066 0 : struct inodedep *inodedep;
3067 : int flags;
3068 :
3069 : /*
3070 : * If requested, do not allow background work to happen.
3071 : */
3072 : flags = DEPALLOC;
3073 0 : if (nodelay)
3074 0 : flags |= NODELAY;
3075 :
3076 0 : ACQUIRE_LOCK(&lk);
3077 :
3078 0 : (void) inodedep_lookup(ip->i_fs, ip->i_number, flags, &inodedep);
3079 0 : if (DIP(ip, nlink) < ip->i_effnlink) {
3080 0 : FREE_LOCK(&lk);
3081 0 : panic("softdep_change_linkcnt: bad delta");
3082 : }
3083 :
3084 0 : inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3085 :
3086 0 : FREE_LOCK(&lk);
3087 0 : }
3088 :
3089 : /*
3090 : * This workitem decrements the inode's link count.
3091 : * If the link count reaches zero, the file is removed.
3092 : */
3093 : STATIC void
3094 0 : handle_workitem_remove(struct dirrem *dirrem)
3095 : {
3096 0 : struct proc *p = CURPROC; /* XXX */
3097 0 : struct inodedep *inodedep;
3098 0 : struct vnode *vp;
3099 : struct inode *ip;
3100 : ufsino_t oldinum;
3101 : int error;
3102 :
3103 0 : if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
3104 0 : softdep_error("handle_workitem_remove: vget", error);
3105 0 : return;
3106 : }
3107 0 : ip = VTOI(vp);
3108 0 : ACQUIRE_LOCK(&lk);
3109 0 : if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep))
3110 0 : == 0) {
3111 0 : FREE_LOCK(&lk);
3112 0 : panic("handle_workitem_remove: lost inodedep");
3113 : }
3114 : /*
3115 : * Normal file deletion.
3116 : */
3117 0 : if ((dirrem->dm_state & RMDIR) == 0) {
3118 0 : DIP_ADD(ip, nlink, -1);
3119 0 : ip->i_flag |= IN_CHANGE;
3120 0 : if (DIP(ip, nlink) < ip->i_effnlink) {
3121 0 : FREE_LOCK(&lk);
3122 0 : panic("handle_workitem_remove: bad file delta");
3123 : }
3124 0 : inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3125 0 : FREE_LOCK(&lk);
3126 0 : vput(vp);
3127 0 : num_dirrem -= 1;
3128 0 : WORKITEM_FREE(dirrem, D_DIRREM);
3129 0 : return;
3130 : }
3131 : /*
3132 : * Directory deletion. Decrement reference count for both the
3133 : * just deleted parent directory entry and the reference for ".".
3134 : * Next truncate the directory to length zero. When the
3135 : * truncation completes, arrange to have the reference count on
3136 : * the parent decremented to account for the loss of "..".
3137 : */
3138 0 : DIP_ADD(ip, nlink, -2);
3139 0 : ip->i_flag |= IN_CHANGE;
3140 0 : if (DIP(ip, nlink) < ip->i_effnlink)
3141 0 : panic("handle_workitem_remove: bad dir delta");
3142 0 : inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3143 0 : FREE_LOCK(&lk);
3144 0 : if ((error = UFS_TRUNCATE(ip, (off_t)0, 0, p->p_ucred)) != 0)
3145 0 : softdep_error("handle_workitem_remove: truncate", error);
3146 : /*
3147 : * Rename a directory to a new parent. Since, we are both deleting
3148 : * and creating a new directory entry, the link count on the new
3149 : * directory should not change. Thus we skip the followup dirrem.
3150 : */
3151 0 : if (dirrem->dm_state & DIRCHG) {
3152 0 : vput(vp);
3153 0 : num_dirrem -= 1;
3154 0 : WORKITEM_FREE(dirrem, D_DIRREM);
3155 0 : return;
3156 : }
3157 : /*
3158 : * If the inodedep does not exist, then the zero'ed inode has
3159 : * been written to disk. If the allocated inode has never been
3160 : * written to disk, then the on-disk inode is zero'ed. In either
3161 : * case we can remove the file immediately.
3162 : */
3163 0 : ACQUIRE_LOCK(&lk);
3164 0 : dirrem->dm_state = 0;
3165 0 : oldinum = dirrem->dm_oldinum;
3166 0 : dirrem->dm_oldinum = dirrem->dm_dirinum;
3167 0 : if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3168 0 : check_inode_unwritten(inodedep)) {
3169 0 : FREE_LOCK(&lk);
3170 0 : vput(vp);
3171 0 : handle_workitem_remove(dirrem);
3172 0 : return;
3173 : }
3174 0 : WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3175 0 : FREE_LOCK(&lk);
3176 0 : ip->i_flag |= IN_CHANGE;
3177 0 : UFS_UPDATE(VTOI(vp), 0);
3178 0 : vput(vp);
3179 0 : }
3180 :
3181 : /*
3182 : * Inode de-allocation dependencies.
3183 : *
3184 : * When an inode's link count is reduced to zero, it can be de-allocated. We
3185 : * found it convenient to postpone de-allocation until after the inode is
3186 : * written to disk with its new link count (zero). At this point, all of the
3187 : * on-disk inode's block pointers are nullified and, with careful dependency
3188 : * list ordering, all dependencies related to the inode will be satisfied and
3189 : * the corresponding dependency structures de-allocated. So, if/when the
3190 : * inode is reused, there will be no mixing of old dependencies with new
3191 : * ones. This artificial dependency is set up by the block de-allocation
3192 : * procedure above (softdep_setup_freeblocks) and completed by the
3193 : * following procedure.
3194 : */
3195 : STATIC void
3196 0 : handle_workitem_freefile(struct freefile *freefile)
3197 : {
3198 : struct fs *fs;
3199 0 : struct vnode vp;
3200 0 : struct inode tip;
3201 : #ifdef DEBUG
3202 : struct inodedep *idp;
3203 : #endif
3204 : int error;
3205 :
3206 0 : fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3207 : #ifdef DEBUG
3208 : ACQUIRE_LOCK(&lk);
3209 : error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3210 : FREE_LOCK(&lk);
3211 : if (error)
3212 : panic("handle_workitem_freefile: inodedep survived");
3213 : #endif
3214 0 : tip.i_ump = VFSTOUFS(freefile->fx_mnt);
3215 0 : tip.i_dev = freefile->fx_devvp->v_rdev;
3216 0 : tip.i_fs = fs;
3217 0 : tip.i_vnode = &vp;
3218 0 : vp.v_data = &tip;
3219 :
3220 0 : if ((error = ffs_freefile(&tip, freefile->fx_oldinum,
3221 0 : freefile->fx_mode)) != 0) {
3222 0 : softdep_error("handle_workitem_freefile", error);
3223 0 : }
3224 0 : WORKITEM_FREE(freefile, D_FREEFILE);
3225 0 : }
3226 :
3227 : /*
3228 : * Disk writes.
3229 : *
3230 : * The dependency structures constructed above are most actively used when file
3231 : * system blocks are written to disk. No constraints are placed on when a
3232 : * block can be written, but unsatisfied update dependencies are made safe by
3233 : * modifying (or replacing) the source memory for the duration of the disk
3234 : * write. When the disk write completes, the memory block is again brought
3235 : * up-to-date.
3236 : *
3237 : * In-core inode structure reclamation.
3238 : *
3239 : * Because there are a finite number of "in-core" inode structures, they are
3240 : * reused regularly. By transferring all inode-related dependencies to the
3241 : * in-memory inode block and indexing them separately (via "inodedep"s), we
3242 : * can allow "in-core" inode structures to be reused at any time and avoid
3243 : * any increase in contention.
3244 : *
3245 : * Called just before entering the device driver to initiate a new disk I/O.
3246 : * The buffer must be locked, thus, no I/O completion operations can occur
3247 : * while we are manipulating its associated dependencies.
3248 : */
3249 : /* structure describing disk write to occur */
3250 : void
3251 0 : softdep_disk_io_initiation(struct buf *bp)
3252 : {
3253 : struct worklist *wk, *nextwk;
3254 : struct indirdep *indirdep;
3255 : struct inodedep *inodedep;
3256 : struct buf *sbp;
3257 :
3258 : /*
3259 : * We only care about write operations. There should never
3260 : * be dependencies for reads.
3261 : */
3262 0 : if (bp->b_flags & B_READ)
3263 0 : panic("softdep_disk_io_initiation: read");
3264 :
3265 0 : ACQUIRE_LOCK(&lk);
3266 :
3267 : /*
3268 : * Do any necessary pre-I/O processing.
3269 : */
3270 0 : for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3271 0 : nextwk = LIST_NEXT(wk, wk_list);
3272 0 : switch (wk->wk_type) {
3273 :
3274 : case D_PAGEDEP:
3275 0 : initiate_write_filepage(WK_PAGEDEP(wk), bp);
3276 0 : continue;
3277 :
3278 : case D_INODEDEP:
3279 0 : inodedep = WK_INODEDEP(wk);
3280 0 : if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3281 0 : initiate_write_inodeblock_ufs1(inodedep, bp);
3282 : #ifdef FFS2
3283 : else
3284 0 : initiate_write_inodeblock_ufs2(inodedep, bp);
3285 : #endif
3286 : continue;
3287 :
3288 : case D_INDIRDEP:
3289 0 : indirdep = WK_INDIRDEP(wk);
3290 0 : if (indirdep->ir_state & GOINGAWAY)
3291 0 : panic("disk_io_initiation: indirdep gone");
3292 : /*
3293 : * If there are no remaining dependencies, this
3294 : * will be writing the real pointers, so the
3295 : * dependency can be freed.
3296 : */
3297 0 : if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3298 0 : sbp = indirdep->ir_savebp;
3299 0 : sbp->b_flags |= B_INVAL | B_NOCACHE;
3300 : /* inline expand WORKLIST_REMOVE(wk); */
3301 0 : wk->wk_state &= ~ONWORKLIST;
3302 0 : LIST_REMOVE(wk, wk_list);
3303 0 : WORKITEM_FREE(indirdep, D_INDIRDEP);
3304 0 : FREE_LOCK(&lk);
3305 0 : brelse(sbp);
3306 0 : ACQUIRE_LOCK(&lk);
3307 0 : continue;
3308 : }
3309 : /*
3310 : * Replace up-to-date version with safe version.
3311 : */
3312 0 : FREE_LOCK(&lk);
3313 0 : indirdep->ir_saveddata = malloc(bp->b_bcount,
3314 : M_INDIRDEP, M_WAITOK);
3315 0 : ACQUIRE_LOCK(&lk);
3316 0 : indirdep->ir_state &= ~ATTACHED;
3317 0 : indirdep->ir_state |= UNDONE;
3318 0 : memcpy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3319 0 : memcpy(bp->b_data, indirdep->ir_savebp->b_data,
3320 : bp->b_bcount);
3321 0 : continue;
3322 :
3323 : case D_MKDIR:
3324 : case D_BMSAFEMAP:
3325 : case D_ALLOCDIRECT:
3326 : case D_ALLOCINDIR:
3327 : continue;
3328 :
3329 : default:
3330 0 : FREE_LOCK(&lk);
3331 0 : panic("handle_disk_io_initiation: Unexpected type %s",
3332 0 : TYPENAME(wk->wk_type));
3333 : /* NOTREACHED */
3334 : }
3335 : }
3336 :
3337 0 : FREE_LOCK(&lk);
3338 0 : }
3339 :
3340 : /*
3341 : * Called from within the procedure above to deal with unsatisfied
3342 : * allocation dependencies in a directory. The buffer must be locked,
3343 : * thus, no I/O completion operations can occur while we are
3344 : * manipulating its associated dependencies.
3345 : */
3346 : STATIC void
3347 0 : initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
3348 : {
3349 : struct diradd *dap;
3350 : struct direct *ep;
3351 : int i;
3352 :
3353 0 : if (pagedep->pd_state & IOSTARTED) {
3354 : /*
3355 : * This can only happen if there is a driver that does not
3356 : * understand chaining. Here biodone will reissue the call
3357 : * to strategy for the incomplete buffers.
3358 : */
3359 0 : printf("initiate_write_filepage: already started\n");
3360 0 : return;
3361 : }
3362 0 : pagedep->pd_state |= IOSTARTED;
3363 0 : for (i = 0; i < DAHASHSZ; i++) {
3364 0 : LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3365 0 : ep = (struct direct *)
3366 0 : ((char *)bp->b_data + dap->da_offset);
3367 0 : if (ep->d_ino != dap->da_newinum) {
3368 0 : FREE_LOCK(&lk);
3369 0 : panic("%s: dir inum %u != new %u",
3370 : "initiate_write_filepage",
3371 0 : ep->d_ino, dap->da_newinum);
3372 : }
3373 0 : if (dap->da_state & DIRCHG)
3374 0 : ep->d_ino = dap->da_previous->dm_oldinum;
3375 : else
3376 0 : ep->d_ino = 0;
3377 0 : dap->da_state &= ~ATTACHED;
3378 0 : dap->da_state |= UNDONE;
3379 : }
3380 : }
3381 0 : }
3382 :
3383 : /*
3384 : * Called from within the procedure above to deal with unsatisfied
3385 : * allocation dependencies in an inodeblock. The buffer must be
3386 : * locked, thus, no I/O completion operations can occur while we
3387 : * are manipulating its associated dependencies.
3388 : */
3389 : /* The inode block */
3390 : STATIC void
3391 0 : initiate_write_inodeblock_ufs1(struct inodedep *inodedep, struct buf *bp)
3392 : {
3393 : struct allocdirect *adp, *lastadp;
3394 : struct ufs1_dinode *dp;
3395 : struct fs *fs;
3396 : #ifdef DIAGNOSTIC
3397 : daddr_t prevlbn = 0;
3398 : int32_t d1, d2;
3399 : #endif
3400 : int i, deplist;
3401 :
3402 0 : if (inodedep->id_state & IOSTARTED) {
3403 0 : FREE_LOCK(&lk);
3404 0 : panic("initiate_write_inodeblock: already started");
3405 : }
3406 0 : inodedep->id_state |= IOSTARTED;
3407 0 : fs = inodedep->id_fs;
3408 0 : dp = (struct ufs1_dinode *)bp->b_data +
3409 0 : ino_to_fsbo(fs, inodedep->id_ino);
3410 : /*
3411 : * If the bitmap is not yet written, then the allocated
3412 : * inode cannot be written to disk.
3413 : */
3414 0 : if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3415 0 : if (inodedep->id_savedino1 != NULL) {
3416 : FREE_LOCK(&lk);
3417 0 : panic("initiate_write_inodeblock: already doing I/O");
3418 : }
3419 : FREE_LOCK(&lk);
3420 0 : inodedep->id_savedino1 = malloc(sizeof(struct ufs1_dinode),
3421 : M_INODEDEP, M_WAITOK);
3422 0 : inodedep->id_unsize = sizeof(struct ufs1_dinode);
3423 0 : ACQUIRE_LOCK(&lk);
3424 0 : *inodedep->id_savedino1 = *dp;
3425 0 : memset(dp, 0, sizeof(struct ufs1_dinode));
3426 0 : return;
3427 : }
3428 : /*
3429 : * If no dependencies, then there is nothing to roll back.
3430 : */
3431 0 : inodedep->id_savedsize = dp->di_size;
3432 0 : if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3433 0 : return;
3434 : /*
3435 : * Set the dependencies to busy.
3436 : */
3437 0 : for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3438 0 : adp = TAILQ_NEXT(adp, ad_next)) {
3439 : #ifdef DIAGNOSTIC
3440 0 : if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3441 0 : FREE_LOCK(&lk);
3442 0 : panic("softdep_write_inodeblock: lbn order");
3443 : }
3444 0 : prevlbn = adp->ad_lbn;
3445 0 : if (adp->ad_lbn < NDADDR &&
3446 0 : (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3447 0 : FREE_LOCK(&lk);
3448 0 : panic("%s: direct pointer #%lld mismatch %d != %d",
3449 0 : "softdep_write_inodeblock", (long long)adp->ad_lbn,
3450 : d1, d2);
3451 : }
3452 0 : if (adp->ad_lbn >= NDADDR &&
3453 0 : (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3454 0 : (d2 = adp->ad_newblkno)) {
3455 0 : FREE_LOCK(&lk);
3456 0 : panic("%s: indirect pointer #%lld mismatch %d != %d",
3457 0 : "softdep_write_inodeblock", (long long)(adp->ad_lbn -
3458 : NDADDR), d1, d2);
3459 : }
3460 0 : deplist |= 1 << adp->ad_lbn;
3461 0 : if ((adp->ad_state & ATTACHED) == 0) {
3462 0 : FREE_LOCK(&lk);
3463 0 : panic("softdep_write_inodeblock: Unknown state 0x%x",
3464 0 : adp->ad_state);
3465 : }
3466 : #endif /* DIAGNOSTIC */
3467 0 : adp->ad_state &= ~ATTACHED;
3468 0 : adp->ad_state |= UNDONE;
3469 : }
3470 : /*
3471 : * The on-disk inode cannot claim to be any larger than the last
3472 : * fragment that has been written. Otherwise, the on-disk inode
3473 : * might have fragments that were not the last block in the file
3474 : * which would corrupt the filesystem.
3475 : */
3476 0 : for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3477 0 : lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3478 0 : if (adp->ad_lbn >= NDADDR)
3479 : break;
3480 0 : dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3481 : /* keep going until hitting a rollback to a frag */
3482 0 : if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3483 : continue;
3484 0 : dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3485 0 : for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3486 : #ifdef DIAGNOSTIC
3487 0 : if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3488 0 : FREE_LOCK(&lk);
3489 0 : panic("softdep_write_inodeblock: lost dep1");
3490 : }
3491 : #endif /* DIAGNOSTIC */
3492 0 : dp->di_db[i] = 0;
3493 : }
3494 0 : for (i = 0; i < NIADDR; i++) {
3495 : #ifdef DIAGNOSTIC
3496 0 : if (dp->di_ib[i] != 0 &&
3497 0 : (deplist & ((1 << NDADDR) << i)) == 0) {
3498 0 : FREE_LOCK(&lk);
3499 0 : panic("softdep_write_inodeblock: lost dep2");
3500 : }
3501 : #endif /* DIAGNOSTIC */
3502 0 : dp->di_ib[i] = 0;
3503 : }
3504 0 : return;
3505 : }
3506 : /*
3507 : * If we have zero'ed out the last allocated block of the file,
3508 : * roll back the size to the last currently allocated block.
3509 : * We know that this last allocated block is a full-sized as
3510 : * we already checked for fragments in the loop above.
3511 : */
3512 0 : if (lastadp != NULL &&
3513 0 : dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3514 0 : for (i = lastadp->ad_lbn; i >= 0; i--)
3515 0 : if (dp->di_db[i] != 0)
3516 : break;
3517 0 : dp->di_size = (i + 1) * fs->fs_bsize;
3518 0 : }
3519 : /*
3520 : * The only dependencies are for indirect blocks.
3521 : *
3522 : * The file size for indirect block additions is not guaranteed.
3523 : * Such a guarantee would be non-trivial to achieve. The conventional
3524 : * synchronous write implementation also does not make this guarantee.
3525 : * Fsck should catch and fix discrepancies. Arguably, the file size
3526 : * can be over-estimated without destroying integrity when the file
3527 : * moves into the indirect blocks (i.e., is large). If we want to
3528 : * postpone fsck, we are stuck with this argument.
3529 : */
3530 0 : for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3531 0 : dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3532 0 : }
3533 :
3534 : #ifdef FFS2
3535 : /*
3536 : * Version of initiate_write_inodeblock that handles FFS2 dinodes.
3537 : */
3538 : /* The inode block */
3539 : STATIC void
3540 0 : initiate_write_inodeblock_ufs2(struct inodedep *inodedep, struct buf *bp)
3541 : {
3542 : struct allocdirect *adp, *lastadp;
3543 : struct ufs2_dinode *dp;
3544 0 : struct fs *fs = inodedep->id_fs;
3545 : #ifdef DIAGNOSTIC
3546 : daddr_t prevlbn = -1, d1, d2;
3547 : #endif
3548 : int deplist, i;
3549 :
3550 0 : if (inodedep->id_state & IOSTARTED)
3551 0 : panic("initiate_write_inodeblock_ufs2: already started");
3552 0 : inodedep->id_state |= IOSTARTED;
3553 0 : fs = inodedep->id_fs;
3554 0 : dp = (struct ufs2_dinode *)bp->b_data +
3555 0 : ino_to_fsbo(fs, inodedep->id_ino);
3556 : /*
3557 : * If the bitmap is not yet written, then the allocated
3558 : * inode cannot be written to disk.
3559 : */
3560 0 : if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3561 0 : if (inodedep->id_savedino2 != NULL)
3562 0 : panic("initiate_write_inodeblock_ufs2: I/O underway");
3563 0 : inodedep->id_savedino2 = malloc(sizeof(struct ufs2_dinode),
3564 : M_INODEDEP, M_WAITOK);
3565 0 : inodedep->id_unsize = sizeof(struct ufs2_dinode);
3566 0 : *inodedep->id_savedino2 = *dp;
3567 0 : memset(dp, 0, sizeof(struct ufs2_dinode));
3568 0 : return;
3569 : }
3570 : /*
3571 : * If no dependencies, then there is nothing to roll back.
3572 : */
3573 0 : inodedep->id_savedsize = dp->di_size;
3574 0 : if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3575 0 : return;
3576 :
3577 : #ifdef notyet
3578 : inodedep->id_savedextsize = dp->di_extsize;
3579 : if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3580 : TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3581 : return;
3582 : /*
3583 : * Set the ext data dependencies to busy.
3584 : */
3585 : for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3586 : adp = TAILQ_NEXT(adp, ad_next)) {
3587 : #ifdef DIAGNOSTIC
3588 : if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3589 : FREE_LOCK(&lk);
3590 : panic("softdep_write_inodeblock: lbn order");
3591 : }
3592 : prevlbn = adp->ad_lbn;
3593 : if ((d1 = dp->di_extb[adp->ad_lbn]) !=
3594 : (d2 = adp->ad_newblkno)) {
3595 : FREE_LOCK(&lk);
3596 : panic("%s: direct pointer #%lld mismatch %lld != %lld",
3597 : "softdep_write_inodeblock", (long long)adp->ad_lbn,
3598 : d1, d2);
3599 : }
3600 : deplist |= 1 << adp->ad_lbn;
3601 : if ((adp->ad_state & ATTACHED) == 0) {
3602 : FREE_LOCK(&lk);
3603 : panic("softdep_write_inodeblock: Unknown state 0x%x",
3604 : adp->ad_state);
3605 : }
3606 : #endif /* DIAGNOSTIC */
3607 : adp->ad_state &= ~ATTACHED;
3608 : adp->ad_state |= UNDONE;
3609 : }
3610 : /*
3611 : * The on-disk inode cannot claim to be any larger than the last
3612 : * fragment that has been written. Otherwise, the on-disk inode
3613 : * might have fragments that were not the last block in the ext
3614 : * data which would corrupt the filesystem.
3615 : */
3616 : for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3617 : lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3618 : dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3619 : /* keep going until hitting a rollback to a frag */
3620 : if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3621 : continue;
3622 : dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3623 : for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3624 : #ifdef DIAGNOSTIC
3625 : if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
3626 : FREE_LOCK(&lk);
3627 : panic("softdep_write_inodeblock: lost dep1");
3628 : }
3629 : #endif /* DIAGNOSTIC */
3630 : dp->di_extb[i] = 0;
3631 : }
3632 : lastadp = NULL;
3633 : break;
3634 : }
3635 : /*
3636 : * If we have zero'ed out the last allocated block of the ext
3637 : * data, roll back the size to the last currently allocated block.
3638 : * We know that this last allocated block is a full-sized as
3639 : * we already checked for fragments in the loop above.
3640 : */
3641 : if (lastadp != NULL &&
3642 : dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3643 : for (i = lastadp->ad_lbn; i >= 0; i--)
3644 : if (dp->di_extb[i] != 0)
3645 : break;
3646 : dp->di_extsize = (i + 1) * fs->fs_bsize;
3647 : }
3648 : #endif /* notyet */
3649 :
3650 : /*
3651 : * Set the file data dependencies to busy.
3652 : */
3653 0 : for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3654 0 : adp = TAILQ_NEXT(adp, ad_next)) {
3655 : #ifdef DIAGNOSTIC
3656 0 : if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3657 0 : FREE_LOCK(&lk);
3658 0 : panic("softdep_write_inodeblock: lbn order");
3659 : }
3660 0 : prevlbn = adp->ad_lbn;
3661 0 : if (adp->ad_lbn < NDADDR &&
3662 0 : (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3663 0 : FREE_LOCK(&lk);
3664 0 : panic("%s: direct pointer #%lld mismatch %lld != %lld",
3665 0 : "softdep_write_inodeblock", (long long)adp->ad_lbn,
3666 : d1, d2);
3667 : }
3668 0 : if (adp->ad_lbn >= NDADDR &&
3669 0 : (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3670 0 : (d2 = adp->ad_newblkno)) {
3671 0 : FREE_LOCK(&lk);
3672 0 : panic("%s: indirect pointer #%lld mismatch %lld != %lld",
3673 0 : "softdep_write_inodeblock", (long long)(adp->ad_lbn -
3674 : NDADDR), d1, d2);
3675 : }
3676 0 : deplist |= 1 << adp->ad_lbn;
3677 0 : if ((adp->ad_state & ATTACHED) == 0) {
3678 0 : FREE_LOCK(&lk);
3679 0 : panic("softdep_write_inodeblock: Unknown state 0x%x",
3680 0 : adp->ad_state);
3681 : }
3682 : #endif /* DIAGNOSTIC */
3683 0 : adp->ad_state &= ~ATTACHED;
3684 0 : adp->ad_state |= UNDONE;
3685 : }
3686 : /*
3687 : * The on-disk inode cannot claim to be any larger than the last
3688 : * fragment that has been written. Otherwise, the on-disk inode
3689 : * might have fragments that were not the last block in the file
3690 : * which would corrupt the filesystem.
3691 : */
3692 0 : for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3693 0 : lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3694 0 : if (adp->ad_lbn >= NDADDR)
3695 : break;
3696 0 : dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3697 : /* keep going until hitting a rollback to a frag */
3698 0 : if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3699 : continue;
3700 0 : dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3701 0 : for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3702 : #ifdef DIAGNOSTIC
3703 0 : if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3704 0 : FREE_LOCK(&lk);
3705 0 : panic("softdep_write_inodeblock: lost dep2");
3706 : }
3707 : #endif /* DIAGNOSTIC */
3708 0 : dp->di_db[i] = 0;
3709 : }
3710 0 : for (i = 0; i < NIADDR; i++) {
3711 : #ifdef DIAGNOSTIC
3712 0 : if (dp->di_ib[i] != 0 &&
3713 0 : (deplist & ((1 << NDADDR) << i)) == 0) {
3714 0 : FREE_LOCK(&lk);
3715 0 : panic("softdep_write_inodeblock: lost dep3");
3716 : }
3717 : #endif /* DIAGNOSTIC */
3718 0 : dp->di_ib[i] = 0;
3719 : }
3720 0 : return;
3721 : }
3722 : /*
3723 : * If we have zero'ed out the last allocated block of the file,
3724 : * roll back the size to the last currently allocated block.
3725 : * We know that this last allocated block is a full-sized as
3726 : * we already checked for fragments in the loop above.
3727 : */
3728 0 : if (lastadp != NULL &&
3729 0 : dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3730 0 : for (i = lastadp->ad_lbn; i >= 0; i--)
3731 0 : if (dp->di_db[i] != 0)
3732 : break;
3733 0 : dp->di_size = (i + 1) * fs->fs_bsize;
3734 0 : }
3735 : /*
3736 : * The only dependencies are for indirect blocks.
3737 : *
3738 : * The file size for indirect block additions is not guaranteed.
3739 : * Such a guarantee would be non-trivial to achieve. The conventional
3740 : * synchronous write implementation also does not make this guarantee.
3741 : * Fsck should catch and fix discrepancies. Arguably, the file size
3742 : * can be over-estimated without destroying integrity when the file
3743 : * moves into the indirect blocks (i.e., is large). If we want to
3744 : * postpone fsck, we are stuck with this argument.
3745 : */
3746 0 : for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3747 0 : dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3748 0 : }
3749 : #endif /* FFS2 */
3750 :
3751 : /*
3752 : * This routine is called during the completion interrupt
3753 : * service routine for a disk write (from the procedure called
3754 : * by the device driver to inform the file system caches of
3755 : * a request completion). It should be called early in this
3756 : * procedure, before the block is made available to other
3757 : * processes or other routines are called.
3758 : */
3759 : /* describes the completed disk write */
3760 : void
3761 0 : softdep_disk_write_complete(struct buf *bp)
3762 : {
3763 : struct worklist *wk;
3764 0 : struct workhead reattach;
3765 : struct newblk *newblk;
3766 : struct allocindir *aip;
3767 : struct allocdirect *adp;
3768 : struct indirdep *indirdep;
3769 : struct inodedep *inodedep;
3770 : struct bmsafemap *bmsafemap;
3771 :
3772 : /*
3773 : * If an error occurred while doing the write, then the data
3774 : * has not hit the disk and the dependencies cannot be unrolled.
3775 : */
3776 0 : if ((bp->b_flags & B_ERROR) && !(bp->b_flags & B_INVAL))
3777 0 : return;
3778 :
3779 : #ifdef DEBUG
3780 : if (lk.lkt_held != -1)
3781 : panic("softdep_disk_write_complete: lock is held");
3782 : lk.lkt_held = -2;
3783 : #endif
3784 0 : LIST_INIT(&reattach);
3785 0 : while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3786 0 : WORKLIST_REMOVE(wk);
3787 0 : switch (wk->wk_type) {
3788 :
3789 : case D_PAGEDEP:
3790 0 : if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3791 0 : WORKLIST_INSERT(&reattach, wk);
3792 0 : continue;
3793 :
3794 : case D_INODEDEP:
3795 0 : if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3796 0 : WORKLIST_INSERT(&reattach, wk);
3797 0 : continue;
3798 :
3799 : case D_BMSAFEMAP:
3800 0 : bmsafemap = WK_BMSAFEMAP(wk);
3801 0 : while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3802 0 : newblk->nb_state |= DEPCOMPLETE;
3803 0 : newblk->nb_bmsafemap = NULL;
3804 0 : LIST_REMOVE(newblk, nb_deps);
3805 : }
3806 0 : while ((adp =
3807 0 : LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3808 0 : adp->ad_state |= DEPCOMPLETE;
3809 0 : adp->ad_buf = NULL;
3810 0 : LIST_REMOVE(adp, ad_deps);
3811 0 : handle_allocdirect_partdone(adp);
3812 : }
3813 0 : while ((aip =
3814 0 : LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3815 0 : aip->ai_state |= DEPCOMPLETE;
3816 0 : aip->ai_buf = NULL;
3817 0 : LIST_REMOVE(aip, ai_deps);
3818 0 : handle_allocindir_partdone(aip);
3819 : }
3820 0 : while ((inodedep =
3821 0 : LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3822 0 : inodedep->id_state |= DEPCOMPLETE;
3823 0 : LIST_REMOVE(inodedep, id_deps);
3824 0 : inodedep->id_buf = NULL;
3825 : }
3826 0 : WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3827 0 : continue;
3828 :
3829 : case D_MKDIR:
3830 0 : handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3831 0 : continue;
3832 :
3833 : case D_ALLOCDIRECT:
3834 0 : adp = WK_ALLOCDIRECT(wk);
3835 0 : adp->ad_state |= COMPLETE;
3836 0 : handle_allocdirect_partdone(adp);
3837 0 : continue;
3838 :
3839 : case D_ALLOCINDIR:
3840 0 : aip = WK_ALLOCINDIR(wk);
3841 0 : aip->ai_state |= COMPLETE;
3842 0 : handle_allocindir_partdone(aip);
3843 0 : continue;
3844 :
3845 : case D_INDIRDEP:
3846 0 : indirdep = WK_INDIRDEP(wk);
3847 0 : if (indirdep->ir_state & GOINGAWAY)
3848 0 : panic("disk_write_complete: indirdep gone");
3849 0 : memcpy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3850 0 : free(indirdep->ir_saveddata, M_INDIRDEP, bp->b_bcount);
3851 0 : indirdep->ir_saveddata = NULL;
3852 0 : indirdep->ir_state &= ~UNDONE;
3853 0 : indirdep->ir_state |= ATTACHED;
3854 0 : while ((aip = LIST_FIRST(&indirdep->ir_donehd))) {
3855 0 : handle_allocindir_partdone(aip);
3856 0 : if (aip == LIST_FIRST(&indirdep->ir_donehd))
3857 0 : panic("disk_write_complete: not gone");
3858 : }
3859 0 : WORKLIST_INSERT(&reattach, wk);
3860 0 : if ((bp->b_flags & B_DELWRI) == 0)
3861 0 : stat_indir_blk_ptrs++;
3862 0 : buf_dirty(bp);
3863 0 : continue;
3864 :
3865 : default:
3866 0 : panic("handle_disk_write_complete: Unknown type %s",
3867 0 : TYPENAME(wk->wk_type));
3868 : /* NOTREACHED */
3869 : }
3870 : }
3871 : /*
3872 : * Reattach any requests that must be redone.
3873 : */
3874 0 : while ((wk = LIST_FIRST(&reattach)) != NULL) {
3875 0 : WORKLIST_REMOVE(wk);
3876 0 : WORKLIST_INSERT(&bp->b_dep, wk);
3877 : }
3878 : #ifdef DEBUG
3879 : if (lk.lkt_held != -2)
3880 : panic("softdep_disk_write_complete: lock lost");
3881 : lk.lkt_held = -1;
3882 : #endif
3883 0 : }
3884 :
3885 : /*
3886 : * Called from within softdep_disk_write_complete above. Note that
3887 : * this routine is always called from interrupt level with further
3888 : * splbio interrupts blocked.
3889 : */
3890 : /* the completed allocdirect */
3891 : STATIC void
3892 0 : handle_allocdirect_partdone(struct allocdirect *adp)
3893 : {
3894 : struct allocdirect *listadp;
3895 : struct inodedep *inodedep;
3896 : long bsize, delay;
3897 :
3898 0 : splassert(IPL_BIO);
3899 :
3900 0 : if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3901 0 : return;
3902 0 : if (adp->ad_buf != NULL)
3903 0 : panic("handle_allocdirect_partdone: dangling dep");
3904 :
3905 : /*
3906 : * The on-disk inode cannot claim to be any larger than the last
3907 : * fragment that has been written. Otherwise, the on-disk inode
3908 : * might have fragments that were not the last block in the file
3909 : * which would corrupt the filesystem. Thus, we cannot free any
3910 : * allocdirects after one whose ad_oldblkno claims a fragment as
3911 : * these blocks must be rolled back to zero before writing the inode.
3912 : * We check the currently active set of allocdirects in id_inoupdt.
3913 : */
3914 0 : inodedep = adp->ad_inodedep;
3915 0 : bsize = inodedep->id_fs->fs_bsize;
3916 0 : TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3917 : /* found our block */
3918 0 : if (listadp == adp)
3919 : break;
3920 : /* continue if ad_oldlbn is not a fragment */
3921 0 : if (listadp->ad_oldsize == 0 ||
3922 0 : listadp->ad_oldsize == bsize)
3923 : continue;
3924 : /* hit a fragment */
3925 0 : return;
3926 : }
3927 : /*
3928 : * If we have reached the end of the current list without
3929 : * finding the just finished dependency, then it must be
3930 : * on the future dependency list. Future dependencies cannot
3931 : * be freed until they are moved to the current list.
3932 : */
3933 0 : if (listadp == NULL) {
3934 : #ifdef DEBUG
3935 : TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3936 : /* found our block */
3937 : if (listadp == adp)
3938 : break;
3939 : if (listadp == NULL)
3940 : panic("handle_allocdirect_partdone: lost dep");
3941 : #endif /* DEBUG */
3942 0 : return;
3943 : }
3944 : /*
3945 : * If we have found the just finished dependency, then free
3946 : * it along with anything that follows it that is complete.
3947 : * If the inode still has a bitmap dependency, then it has
3948 : * never been written to disk, hence the on-disk inode cannot
3949 : * reference the old fragment so we can free it without delay.
3950 : */
3951 0 : delay = (inodedep->id_state & DEPCOMPLETE);
3952 0 : for (; adp; adp = listadp) {
3953 0 : listadp = TAILQ_NEXT(adp, ad_next);
3954 0 : if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3955 0 : return;
3956 0 : free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3957 : }
3958 0 : }
3959 :
3960 : /*
3961 : * Called from within softdep_disk_write_complete above. Note that
3962 : * this routine is always called from interrupt level with further
3963 : * splbio interrupts blocked.
3964 : */
3965 : /* the completed allocindir */
3966 : STATIC void
3967 0 : handle_allocindir_partdone(struct allocindir *aip)
3968 : {
3969 : struct indirdep *indirdep;
3970 :
3971 0 : splassert(IPL_BIO);
3972 :
3973 0 : if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3974 0 : return;
3975 0 : if (aip->ai_buf != NULL)
3976 0 : panic("handle_allocindir_partdone: dangling dependency");
3977 0 : indirdep = aip->ai_indirdep;
3978 0 : if (indirdep->ir_state & UNDONE) {
3979 0 : LIST_REMOVE(aip, ai_next);
3980 0 : LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3981 0 : return;
3982 : }
3983 0 : if (indirdep->ir_state & UFS1FMT)
3984 0 : ((int32_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3985 0 : aip->ai_newblkno;
3986 : else
3987 0 : ((int64_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3988 : aip->ai_newblkno;
3989 0 : LIST_REMOVE(aip, ai_next);
3990 0 : if (aip->ai_freefrag != NULL)
3991 0 : add_to_worklist(&aip->ai_freefrag->ff_list);
3992 0 : WORKITEM_FREE(aip, D_ALLOCINDIR);
3993 0 : }
3994 :
3995 : /*
3996 : * Called from within softdep_disk_write_complete above to restore
3997 : * in-memory inode block contents to their most up-to-date state. Note
3998 : * that this routine is always called from interrupt level with further
3999 : * splbio interrupts blocked.
4000 : */
4001 : /* buffer containing the inode block */
4002 : STATIC int
4003 0 : handle_written_inodeblock(struct inodedep *inodedep, struct buf *bp)
4004 : {
4005 : struct worklist *wk, *filefree;
4006 : struct allocdirect *adp, *nextadp;
4007 : struct ufs1_dinode *dp1 = NULL;
4008 : struct ufs2_dinode *dp2 = NULL;
4009 : int hadchanges, fstype;
4010 :
4011 0 : splassert(IPL_BIO);
4012 :
4013 0 : if ((inodedep->id_state & IOSTARTED) == 0)
4014 0 : panic("handle_written_inodeblock: not started");
4015 0 : inodedep->id_state &= ~IOSTARTED;
4016 :
4017 0 : if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4018 : fstype = UM_UFS1;
4019 0 : dp1 = (struct ufs1_dinode *) bp->b_data +
4020 0 : ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4021 0 : } else {
4022 : fstype = UM_UFS2;
4023 0 : dp2 = (struct ufs2_dinode *) bp->b_data +
4024 0 : ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4025 : }
4026 :
4027 : /*
4028 : * If we had to rollback the inode allocation because of
4029 : * bitmaps being incomplete, then simply restore it.
4030 : * Keep the block dirty so that it will not be reclaimed until
4031 : * all associated dependencies have been cleared and the
4032 : * corresponding updates written to disk.
4033 : */
4034 0 : if (inodedep->id_savedino1 != NULL) {
4035 0 : if (fstype == UM_UFS1)
4036 0 : *dp1 = *inodedep->id_savedino1;
4037 : else
4038 0 : *dp2 = *inodedep->id_savedino2;
4039 0 : free(inodedep->id_savedino1, M_INODEDEP, inodedep->id_unsize);
4040 0 : inodedep->id_savedino1 = NULL;
4041 0 : if ((bp->b_flags & B_DELWRI) == 0)
4042 0 : stat_inode_bitmap++;
4043 0 : buf_dirty(bp);
4044 0 : return (1);
4045 : }
4046 0 : inodedep->id_state |= COMPLETE;
4047 : /*
4048 : * Roll forward anything that had to be rolled back before
4049 : * the inode could be updated.
4050 : */
4051 : hadchanges = 0;
4052 0 : for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4053 0 : nextadp = TAILQ_NEXT(adp, ad_next);
4054 0 : if (adp->ad_state & ATTACHED)
4055 0 : panic("handle_written_inodeblock: new entry");
4056 0 : if (fstype == UM_UFS1) {
4057 0 : if (adp->ad_lbn < NDADDR) {
4058 0 : if (dp1->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4059 0 : panic("%s: %s #%lld mismatch %d != "
4060 : "%lld",
4061 : "handle_written_inodeblock",
4062 : "direct pointer",
4063 : (long long)adp->ad_lbn,
4064 : dp1->di_db[adp->ad_lbn],
4065 : (long long)adp->ad_oldblkno);
4066 0 : dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4067 0 : } else {
4068 0 : if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4069 0 : panic("%s: %s #%lld allocated as %d",
4070 : "handle_written_inodeblock",
4071 : "indirect pointer",
4072 : (long long)(adp->ad_lbn - NDADDR),
4073 : dp1->di_ib[adp->ad_lbn - NDADDR]);
4074 0 : dp1->di_ib[adp->ad_lbn - NDADDR] =
4075 0 : adp->ad_newblkno;
4076 : }
4077 : } else {
4078 0 : if (adp->ad_lbn < NDADDR) {
4079 0 : if (dp2->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4080 0 : panic("%s: %s #%lld mismatch %lld != "
4081 : "%lld", "handle_written_inodeblock",
4082 : "direct pointer",
4083 : (long long)adp->ad_lbn,
4084 : dp2->di_db[adp->ad_lbn],
4085 : (long long)adp->ad_oldblkno);
4086 0 : dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4087 0 : } else {
4088 0 : if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4089 0 : panic("%s: %s #%lld allocated as %lld",
4090 : "handle_written_inodeblock",
4091 : "indirect pointer",
4092 : (long long)(adp->ad_lbn - NDADDR),
4093 : dp2->di_ib[adp->ad_lbn - NDADDR]);
4094 0 : dp2->di_ib[adp->ad_lbn - NDADDR] =
4095 0 : adp->ad_newblkno;
4096 : }
4097 : }
4098 0 : adp->ad_state &= ~UNDONE;
4099 0 : adp->ad_state |= ATTACHED;
4100 : hadchanges = 1;
4101 : }
4102 0 : if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4103 0 : stat_direct_blk_ptrs++;
4104 : /*
4105 : * Reset the file size to its most up-to-date value.
4106 : */
4107 0 : if (inodedep->id_savedsize == -1)
4108 0 : panic("handle_written_inodeblock: bad size");
4109 :
4110 0 : if (fstype == UM_UFS1) {
4111 0 : if (dp1->di_size != inodedep->id_savedsize) {
4112 0 : dp1->di_size = inodedep->id_savedsize;
4113 : hadchanges = 1;
4114 0 : }
4115 : } else {
4116 0 : if (dp2->di_size != inodedep->id_savedsize) {
4117 0 : dp2->di_size = inodedep->id_savedsize;
4118 : hadchanges = 1;
4119 0 : }
4120 : }
4121 0 : inodedep->id_savedsize = -1;
4122 : /*
4123 : * If there were any rollbacks in the inode block, then it must be
4124 : * marked dirty so that its will eventually get written back in
4125 : * its correct form.
4126 : */
4127 0 : if (hadchanges)
4128 0 : buf_dirty(bp);
4129 : /*
4130 : * Process any allocdirects that completed during the update.
4131 : */
4132 0 : if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4133 0 : handle_allocdirect_partdone(adp);
4134 : /*
4135 : * Process deallocations that were held pending until the
4136 : * inode had been written to disk. Freeing of the inode
4137 : * is delayed until after all blocks have been freed to
4138 : * avoid creation of new <vfsid, inum, lbn> triples
4139 : * before the old ones have been deleted.
4140 : */
4141 : filefree = NULL;
4142 0 : while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4143 0 : WORKLIST_REMOVE(wk);
4144 0 : switch (wk->wk_type) {
4145 :
4146 : case D_FREEFILE:
4147 : /*
4148 : * We defer adding filefree to the worklist until
4149 : * all other additions have been made to ensure
4150 : * that it will be done after all the old blocks
4151 : * have been freed.
4152 : */
4153 0 : if (filefree != NULL)
4154 0 : panic("handle_written_inodeblock: filefree");
4155 : filefree = wk;
4156 0 : continue;
4157 :
4158 : case D_MKDIR:
4159 0 : handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4160 0 : continue;
4161 :
4162 : case D_DIRADD:
4163 0 : diradd_inode_written(WK_DIRADD(wk), inodedep);
4164 0 : continue;
4165 :
4166 : case D_FREEBLKS:
4167 0 : wk->wk_state |= COMPLETE;
4168 0 : if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4169 0 : continue;
4170 : /* FALLTHROUGH */
4171 : case D_FREEFRAG:
4172 : case D_DIRREM:
4173 0 : add_to_worklist(wk);
4174 0 : continue;
4175 :
4176 : case D_NEWDIRBLK:
4177 0 : free_newdirblk(WK_NEWDIRBLK(wk));
4178 0 : continue;
4179 :
4180 : default:
4181 0 : panic("handle_written_inodeblock: Unknown type %s",
4182 0 : TYPENAME(wk->wk_type));
4183 : /* NOTREACHED */
4184 : }
4185 : }
4186 0 : if (filefree != NULL) {
4187 0 : if (free_inodedep(inodedep) == 0)
4188 0 : panic("handle_written_inodeblock: live inodedep");
4189 0 : add_to_worklist(filefree);
4190 0 : return (0);
4191 : }
4192 :
4193 : /*
4194 : * If no outstanding dependencies, free it.
4195 : */
4196 0 : if (free_inodedep(inodedep) ||
4197 0 : TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
4198 0 : return (0);
4199 0 : return (hadchanges);
4200 0 : }
4201 :
4202 : /*
4203 : * Process a diradd entry after its dependent inode has been written.
4204 : * This routine must be called with splbio interrupts blocked.
4205 : */
4206 : STATIC void
4207 0 : diradd_inode_written(struct diradd *dap, struct inodedep *inodedep)
4208 : {
4209 : struct pagedep *pagedep;
4210 :
4211 0 : splassert(IPL_BIO);
4212 :
4213 0 : dap->da_state |= COMPLETE;
4214 0 : if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4215 0 : if (dap->da_state & DIRCHG)
4216 0 : pagedep = dap->da_previous->dm_pagedep;
4217 : else
4218 0 : pagedep = dap->da_pagedep;
4219 0 : LIST_REMOVE(dap, da_pdlist);
4220 0 : LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4221 0 : }
4222 0 : WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4223 0 : }
4224 :
4225 : /*
4226 : * Handle the completion of a mkdir dependency.
4227 : */
4228 : STATIC void
4229 0 : handle_written_mkdir(struct mkdir *mkdir, int type)
4230 : {
4231 : struct diradd *dap;
4232 : struct pagedep *pagedep;
4233 :
4234 0 : splassert(IPL_BIO);
4235 :
4236 0 : if (mkdir->md_state != type)
4237 0 : panic("handle_written_mkdir: bad type");
4238 0 : dap = mkdir->md_diradd;
4239 0 : dap->da_state &= ~type;
4240 0 : if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4241 0 : dap->da_state |= DEPCOMPLETE;
4242 0 : if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4243 0 : if (dap->da_state & DIRCHG)
4244 0 : pagedep = dap->da_previous->dm_pagedep;
4245 : else
4246 0 : pagedep = dap->da_pagedep;
4247 0 : LIST_REMOVE(dap, da_pdlist);
4248 0 : LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4249 0 : }
4250 0 : LIST_REMOVE(mkdir, md_mkdirs);
4251 0 : WORKITEM_FREE(mkdir, D_MKDIR);
4252 0 : }
4253 :
4254 : /*
4255 : * Called from within softdep_disk_write_complete above.
4256 : * A write operation was just completed. Removed inodes can
4257 : * now be freed and associated block pointers may be committed.
4258 : * Note that this routine is always called from interrupt level
4259 : * with further splbio interrupts blocked.
4260 : */
4261 : /* buffer containing the written page */
4262 : STATIC int
4263 0 : handle_written_filepage(struct pagedep *pagedep, struct buf *bp)
4264 : {
4265 : struct dirrem *dirrem;
4266 : struct diradd *dap, *nextdap;
4267 : struct direct *ep;
4268 : int i, chgs;
4269 :
4270 0 : splassert(IPL_BIO);
4271 :
4272 0 : if ((pagedep->pd_state & IOSTARTED) == 0)
4273 0 : panic("handle_written_filepage: not started");
4274 0 : pagedep->pd_state &= ~IOSTARTED;
4275 : /*
4276 : * Process any directory removals that have been committed.
4277 : */
4278 0 : while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4279 0 : LIST_REMOVE(dirrem, dm_next);
4280 0 : dirrem->dm_dirinum = pagedep->pd_ino;
4281 0 : add_to_worklist(&dirrem->dm_list);
4282 : }
4283 : /*
4284 : * Free any directory additions that have been committed.
4285 : * If it is a newly allocated block, we have to wait until
4286 : * the on-disk directory inode claims the new block.
4287 : */
4288 0 : if ((pagedep->pd_state & NEWBLOCK) == 0)
4289 0 : while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4290 0 : free_diradd(dap);
4291 : /*
4292 : * Uncommitted directory entries must be restored.
4293 : */
4294 0 : for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4295 0 : for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4296 : dap = nextdap) {
4297 0 : nextdap = LIST_NEXT(dap, da_pdlist);
4298 0 : if (dap->da_state & ATTACHED)
4299 0 : panic("handle_written_filepage: attached");
4300 0 : ep = (struct direct *)
4301 0 : ((char *)bp->b_data + dap->da_offset);
4302 0 : ep->d_ino = dap->da_newinum;
4303 0 : dap->da_state &= ~UNDONE;
4304 0 : dap->da_state |= ATTACHED;
4305 : chgs = 1;
4306 : /*
4307 : * If the inode referenced by the directory has
4308 : * been written out, then the dependency can be
4309 : * moved to the pending list.
4310 : */
4311 0 : if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4312 0 : LIST_REMOVE(dap, da_pdlist);
4313 0 : LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4314 : da_pdlist);
4315 0 : }
4316 : }
4317 : }
4318 : /*
4319 : * If there were any rollbacks in the directory, then it must be
4320 : * marked dirty so that its will eventually get written back in
4321 : * its correct form.
4322 : */
4323 0 : if (chgs) {
4324 0 : if ((bp->b_flags & B_DELWRI) == 0)
4325 0 : stat_dir_entry++;
4326 0 : buf_dirty(bp);
4327 0 : return (1);
4328 : }
4329 : /*
4330 : * If we are not waiting for a new directory block to be
4331 : * claimed by its inode, then the pagedep will be freed.
4332 : * Otherwise it will remain to track any new entries on
4333 : * the page in case they are fsync'ed.
4334 : */
4335 0 : if ((pagedep->pd_state & NEWBLOCK) == 0) {
4336 0 : LIST_REMOVE(pagedep, pd_hash);
4337 0 : WORKITEM_FREE(pagedep, D_PAGEDEP);
4338 0 : }
4339 0 : return (0);
4340 0 : }
4341 :
4342 : /*
4343 : * Writing back in-core inode structures.
4344 : *
4345 : * The file system only accesses an inode's contents when it occupies an
4346 : * "in-core" inode structure. These "in-core" structures are separate from
4347 : * the page frames used to cache inode blocks. Only the latter are
4348 : * transferred to/from the disk. So, when the updated contents of the
4349 : * "in-core" inode structure are copied to the corresponding in-memory inode
4350 : * block, the dependencies are also transferred. The following procedure is
4351 : * called when copying a dirty "in-core" inode to a cached inode block.
4352 : */
4353 :
4354 : /*
4355 : * Called when an inode is loaded from disk. If the effective link count
4356 : * differed from the actual link count when it was last flushed, then we
4357 : * need to ensure that the correct effective link count is put back.
4358 : */
4359 : /* the "in_core" copy of the inode */
4360 : void
4361 0 : softdep_load_inodeblock(struct inode *ip)
4362 : {
4363 0 : struct inodedep *inodedep;
4364 :
4365 : /*
4366 : * Check for alternate nlink count.
4367 : */
4368 0 : ip->i_effnlink = DIP(ip, nlink);
4369 0 : ACQUIRE_LOCK(&lk);
4370 0 : if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4371 0 : FREE_LOCK(&lk);
4372 0 : return;
4373 : }
4374 0 : ip->i_effnlink -= inodedep->id_nlinkdelta;
4375 0 : FREE_LOCK(&lk);
4376 0 : }
4377 :
4378 : /*
4379 : * This routine is called just before the "in-core" inode
4380 : * information is to be copied to the in-memory inode block.
4381 : * Recall that an inode block contains several inodes. If
4382 : * the force flag is set, then the dependencies will be
4383 : * cleared so that the update can always be made. Note that
4384 : * the buffer is locked when this routine is called, so we
4385 : * will never be in the middle of writing the inode block
4386 : * to disk.
4387 : */
4388 : /* the "in_core" copy of the inode */
4389 : /* the buffer containing the inode block */
4390 : /* nonzero => update must be allowed */
4391 : void
4392 0 : softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
4393 : {
4394 0 : struct inodedep *inodedep;
4395 : struct worklist *wk;
4396 : int error, gotit;
4397 :
4398 : /*
4399 : * If the effective link count is not equal to the actual link
4400 : * count, then we must track the difference in an inodedep while
4401 : * the inode is (potentially) tossed out of the cache. Otherwise,
4402 : * if there is no existing inodedep, then there are no dependencies
4403 : * to track.
4404 : */
4405 0 : ACQUIRE_LOCK(&lk);
4406 0 : if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4407 0 : FREE_LOCK(&lk);
4408 0 : if (ip->i_effnlink != DIP(ip, nlink))
4409 0 : panic("softdep_update_inodeblock: bad link count");
4410 0 : return;
4411 : }
4412 0 : if (inodedep->id_nlinkdelta != DIP(ip, nlink) - ip->i_effnlink) {
4413 0 : FREE_LOCK(&lk);
4414 0 : panic("softdep_update_inodeblock: bad delta");
4415 : }
4416 : /*
4417 : * Changes have been initiated. Anything depending on these
4418 : * changes cannot occur until this inode has been written.
4419 : */
4420 0 : inodedep->id_state &= ~COMPLETE;
4421 0 : if ((inodedep->id_state & ONWORKLIST) == 0)
4422 0 : WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4423 : /*
4424 : * Any new dependencies associated with the incore inode must
4425 : * now be moved to the list associated with the buffer holding
4426 : * the in-memory copy of the inode. Once merged process any
4427 : * allocdirects that are completed by the merger.
4428 : */
4429 0 : merge_inode_lists(inodedep);
4430 0 : if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4431 0 : handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4432 : /*
4433 : * Now that the inode has been pushed into the buffer, the
4434 : * operations dependent on the inode being written to disk
4435 : * can be moved to the id_bufwait so that they will be
4436 : * processed when the buffer I/O completes.
4437 : */
4438 0 : while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4439 0 : WORKLIST_REMOVE(wk);
4440 0 : WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4441 : }
4442 : /*
4443 : * Newly allocated inodes cannot be written until the bitmap
4444 : * that allocates them have been written (indicated by
4445 : * DEPCOMPLETE being set in id_state). If we are doing a
4446 : * forced sync (e.g., an fsync on a file), we force the bitmap
4447 : * to be written so that the update can be done.
4448 : */
4449 0 : do {
4450 0 : if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4451 0 : FREE_LOCK(&lk);
4452 0 : return;
4453 : }
4454 0 : bp = inodedep->id_buf;
4455 0 : gotit = getdirtybuf(bp, MNT_WAIT);
4456 0 : } while (gotit == -1);
4457 0 : FREE_LOCK(&lk);
4458 0 : if (gotit && (error = bwrite(bp)) != 0)
4459 0 : softdep_error("softdep_update_inodeblock: bwrite", error);
4460 0 : if ((inodedep->id_state & DEPCOMPLETE) == 0)
4461 0 : panic("softdep_update_inodeblock: update failed");
4462 0 : }
4463 :
4464 : /*
4465 : * Merge the new inode dependency list (id_newinoupdt) into the old
4466 : * inode dependency list (id_inoupdt). This routine must be called
4467 : * with splbio interrupts blocked.
4468 : */
4469 : STATIC void
4470 0 : merge_inode_lists(struct inodedep *inodedep)
4471 : {
4472 : struct allocdirect *listadp, *newadp;
4473 :
4474 0 : splassert(IPL_BIO);
4475 :
4476 0 : newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4477 0 : for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
4478 0 : if (listadp->ad_lbn < newadp->ad_lbn) {
4479 0 : listadp = TAILQ_NEXT(listadp, ad_next);
4480 0 : continue;
4481 : }
4482 0 : TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4483 0 : TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4484 0 : if (listadp->ad_lbn == newadp->ad_lbn) {
4485 0 : allocdirect_merge(&inodedep->id_inoupdt, newadp,
4486 : listadp);
4487 : listadp = newadp;
4488 0 : }
4489 0 : newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4490 : }
4491 0 : while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
4492 0 : TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4493 0 : TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
4494 : }
4495 0 : }
4496 :
4497 : /*
4498 : * If we are doing an fsync, then we must ensure that any directory
4499 : * entries for the inode have been written after the inode gets to disk.
4500 : */
4501 : /* the "in_core" copy of the inode */
4502 : int
4503 0 : softdep_fsync(struct vnode *vp)
4504 : {
4505 0 : struct inodedep *inodedep;
4506 : struct pagedep *pagedep;
4507 : struct worklist *wk;
4508 : struct diradd *dap;
4509 : struct mount *mnt;
4510 0 : struct vnode *pvp;
4511 : struct inode *ip;
4512 : struct inode *pip;
4513 0 : struct buf *bp;
4514 : struct fs *fs;
4515 0 : struct proc *p = CURPROC; /* XXX */
4516 : int error, flushparent;
4517 : ufsino_t parentino;
4518 : daddr_t lbn;
4519 :
4520 0 : ip = VTOI(vp);
4521 0 : fs = ip->i_fs;
4522 0 : ACQUIRE_LOCK(&lk);
4523 0 : if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4524 0 : FREE_LOCK(&lk);
4525 0 : return (0);
4526 : }
4527 0 : if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4528 0 : LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4529 0 : TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4530 0 : TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4531 0 : FREE_LOCK(&lk);
4532 0 : panic("softdep_fsync: pending ops");
4533 : }
4534 0 : for (error = 0, flushparent = 0; ; ) {
4535 0 : if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4536 : break;
4537 0 : if (wk->wk_type != D_DIRADD) {
4538 0 : FREE_LOCK(&lk);
4539 0 : panic("softdep_fsync: Unexpected type %s",
4540 0 : TYPENAME(wk->wk_type));
4541 : }
4542 0 : dap = WK_DIRADD(wk);
4543 : /*
4544 : * Flush our parent if this directory entry has a MKDIR_PARENT
4545 : * dependency or is contained in a newly allocated block.
4546 : */
4547 0 : if (dap->da_state & DIRCHG)
4548 0 : pagedep = dap->da_previous->dm_pagedep;
4549 : else
4550 0 : pagedep = dap->da_pagedep;
4551 0 : mnt = pagedep->pd_mnt;
4552 0 : parentino = pagedep->pd_ino;
4553 0 : lbn = pagedep->pd_lbn;
4554 0 : if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4555 0 : FREE_LOCK(&lk);
4556 0 : panic("softdep_fsync: dirty");
4557 : }
4558 0 : if ((dap->da_state & MKDIR_PARENT) ||
4559 0 : (pagedep->pd_state & NEWBLOCK))
4560 0 : flushparent = 1;
4561 : else
4562 : flushparent = 0;
4563 : /*
4564 : * If we are being fsync'ed as part of vgone'ing this vnode,
4565 : * then we will not be able to release and recover the
4566 : * vnode below, so we just have to give up on writing its
4567 : * directory entry out. It will eventually be written, just
4568 : * not now, but then the user was not asking to have it
4569 : * written, so we are not breaking any promises.
4570 : */
4571 0 : if (vp->v_flag & VXLOCK)
4572 : break;
4573 : /*
4574 : * We prevent deadlock by always fetching inodes from the
4575 : * root, moving down the directory tree. Thus, when fetching
4576 : * our parent directory, we must unlock ourselves before
4577 : * requesting the lock on our parent. See the comment in
4578 : * ufs_lookup for details on possible races.
4579 : */
4580 0 : FREE_LOCK(&lk);
4581 0 : VOP_UNLOCK(vp);
4582 0 : error = VFS_VGET(mnt, parentino, &pvp);
4583 0 : vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4584 0 : if (error != 0)
4585 0 : return (error);
4586 : /*
4587 : * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4588 : * that are contained in direct blocks will be resolved by
4589 : * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4590 : * may require a complete sync'ing of the directory. So, we
4591 : * try the cheap and fast UFS_UPDATE first, and if that fails,
4592 : * then we do the slower VOP_FSYNC of the directory.
4593 : */
4594 0 : pip = VTOI(pvp);
4595 0 : if (flushparent) {
4596 0 : error = UFS_UPDATE(pip, 1);
4597 0 : if (error) {
4598 0 : vput(pvp);
4599 0 : return (error);
4600 : }
4601 0 : if (pagedep->pd_state & NEWBLOCK) {
4602 0 : error = VOP_FSYNC(pvp, p->p_ucred, MNT_WAIT, p);
4603 0 : if (error) {
4604 0 : vput(pvp);
4605 0 : return (error);
4606 : }
4607 : }
4608 : }
4609 : /*
4610 : * Flush directory page containing the inode's name.
4611 : */
4612 0 : error = bread(pvp, lbn, fs->fs_bsize, &bp);
4613 0 : if (error == 0) {
4614 0 : bp->b_bcount = blksize(fs, pip, lbn);
4615 0 : error = bwrite(bp);
4616 0 : } else
4617 0 : brelse(bp);
4618 0 : vput(pvp);
4619 0 : if (error != 0)
4620 0 : return (error);
4621 0 : ACQUIRE_LOCK(&lk);
4622 0 : if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4623 : break;
4624 : }
4625 0 : FREE_LOCK(&lk);
4626 0 : return (0);
4627 0 : }
4628 :
4629 : /*
4630 : * Flush all the dirty bitmaps associated with the block device
4631 : * before flushing the rest of the dirty blocks so as to reduce
4632 : * the number of dependencies that will have to be rolled back.
4633 : */
4634 : void
4635 0 : softdep_fsync_mountdev(struct vnode *vp, int waitfor)
4636 : {
4637 : struct buf *bp, *nbp;
4638 : struct worklist *wk;
4639 :
4640 0 : if (!vn_isdisk(vp, NULL))
4641 0 : panic("softdep_fsync_mountdev: vnode not a disk");
4642 0 : ACQUIRE_LOCK(&lk);
4643 0 : LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
4644 : /*
4645 : * If it is already scheduled, skip to the next buffer.
4646 : */
4647 0 : splassert(IPL_BIO);
4648 0 : if (bp->b_flags & B_BUSY)
4649 : continue;
4650 :
4651 0 : if ((bp->b_flags & B_DELWRI) == 0) {
4652 0 : FREE_LOCK(&lk);
4653 0 : panic("softdep_fsync_mountdev: not dirty");
4654 : }
4655 : /*
4656 : * We are only interested in bitmaps with outstanding
4657 : * dependencies.
4658 : */
4659 0 : if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4660 0 : wk->wk_type != D_BMSAFEMAP) {
4661 : continue;
4662 : }
4663 0 : bremfree(bp);
4664 0 : buf_acquire(bp);
4665 0 : FREE_LOCK(&lk);
4666 0 : (void) bawrite(bp);
4667 0 : ACQUIRE_LOCK(&lk);
4668 : /*
4669 : * Since we may have slept during the I/O, we need
4670 : * to start from a known point.
4671 : */
4672 0 : nbp = LIST_FIRST(&vp->v_dirtyblkhd);
4673 0 : }
4674 0 : if (waitfor == MNT_WAIT)
4675 0 : drain_output(vp, 1);
4676 0 : FREE_LOCK(&lk);
4677 0 : }
4678 :
4679 : /*
4680 : * This routine is called when we are trying to synchronously flush a
4681 : * file. This routine must eliminate any filesystem metadata dependencies
4682 : * so that the syncing routine can succeed by pushing the dirty blocks
4683 : * associated with the file. If any I/O errors occur, they are returned.
4684 : */
4685 : int
4686 0 : softdep_sync_metadata(struct vop_fsync_args *ap)
4687 : {
4688 0 : struct vnode *vp = ap->a_vp;
4689 : struct pagedep *pagedep;
4690 : struct allocdirect *adp;
4691 : struct allocindir *aip;
4692 : struct buf *bp, *nbp;
4693 : struct worklist *wk;
4694 : int i, gotit, error, waitfor;
4695 :
4696 : /*
4697 : * Check whether this vnode is involved in a filesystem
4698 : * that is doing soft dependency processing.
4699 : */
4700 0 : if (!vn_isdisk(vp, NULL)) {
4701 0 : if (!DOINGSOFTDEP(vp))
4702 0 : return (0);
4703 : } else
4704 0 : if (vp->v_specmountpoint == NULL ||
4705 0 : (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4706 0 : return (0);
4707 : /*
4708 : * Ensure that any direct block dependencies have been cleared.
4709 : */
4710 0 : ACQUIRE_LOCK(&lk);
4711 0 : if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4712 0 : FREE_LOCK(&lk);
4713 0 : return (error);
4714 : }
4715 : /*
4716 : * For most files, the only metadata dependencies are the
4717 : * cylinder group maps that allocate their inode or blocks.
4718 : * The block allocation dependencies can be found by traversing
4719 : * the dependency lists for any buffers that remain on their
4720 : * dirty buffer list. The inode allocation dependency will
4721 : * be resolved when the inode is updated with MNT_WAIT.
4722 : * This work is done in two passes. The first pass grabs most
4723 : * of the buffers and begins asynchronously writing them. The
4724 : * only way to wait for these asynchronous writes is to sleep
4725 : * on the filesystem vnode which may stay busy for a long time
4726 : * if the filesystem is active. So, instead, we make a second
4727 : * pass over the dependencies blocking on each write. In the
4728 : * usual case we will be blocking against a write that we
4729 : * initiated, so when it is done the dependency will have been
4730 : * resolved. Thus the second pass is expected to end quickly.
4731 : */
4732 0 : waitfor = MNT_NOWAIT;
4733 : top:
4734 : /*
4735 : * We must wait for any I/O in progress to finish so that
4736 : * all potential buffers on the dirty list will be visible.
4737 : */
4738 0 : drain_output(vp, 1);
4739 0 : bp = LIST_FIRST(&vp->v_dirtyblkhd);
4740 0 : gotit = getdirtybuf(bp, MNT_WAIT);
4741 0 : if (gotit == 0) {
4742 0 : FREE_LOCK(&lk);
4743 0 : return (0);
4744 0 : } else if (gotit == -1)
4745 0 : goto top;
4746 : loop:
4747 : /*
4748 : * As we hold the buffer locked, none of its dependencies
4749 : * will disappear.
4750 : */
4751 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4752 0 : switch (wk->wk_type) {
4753 :
4754 : case D_ALLOCDIRECT:
4755 0 : adp = WK_ALLOCDIRECT(wk);
4756 0 : if (adp->ad_state & DEPCOMPLETE)
4757 : break;
4758 0 : nbp = adp->ad_buf;
4759 0 : gotit = getdirtybuf(nbp, waitfor);
4760 0 : if (gotit == 0)
4761 : break;
4762 0 : else if (gotit == -1)
4763 0 : goto loop;
4764 0 : FREE_LOCK(&lk);
4765 0 : if (waitfor == MNT_NOWAIT) {
4766 0 : bawrite(nbp);
4767 0 : } else if ((error = VOP_BWRITE(nbp)) != 0) {
4768 0 : bawrite(bp);
4769 0 : return (error);
4770 : }
4771 0 : ACQUIRE_LOCK(&lk);
4772 0 : break;
4773 :
4774 : case D_ALLOCINDIR:
4775 0 : aip = WK_ALLOCINDIR(wk);
4776 0 : if (aip->ai_state & DEPCOMPLETE)
4777 : break;
4778 0 : nbp = aip->ai_buf;
4779 0 : gotit = getdirtybuf(nbp, waitfor);
4780 0 : if (gotit == 0)
4781 : break;
4782 0 : else if (gotit == -1)
4783 0 : goto loop;
4784 0 : FREE_LOCK(&lk);
4785 0 : if (waitfor == MNT_NOWAIT) {
4786 0 : bawrite(nbp);
4787 0 : } else if ((error = VOP_BWRITE(nbp)) != 0) {
4788 0 : bawrite(bp);
4789 0 : return (error);
4790 : }
4791 0 : ACQUIRE_LOCK(&lk);
4792 0 : break;
4793 :
4794 : case D_INDIRDEP:
4795 : restart:
4796 :
4797 0 : LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4798 0 : if (aip->ai_state & DEPCOMPLETE)
4799 : continue;
4800 0 : nbp = aip->ai_buf;
4801 0 : if (getdirtybuf(nbp, MNT_WAIT) <= 0)
4802 0 : goto restart;
4803 0 : FREE_LOCK(&lk);
4804 0 : if ((error = VOP_BWRITE(nbp)) != 0) {
4805 0 : bawrite(bp);
4806 0 : return (error);
4807 : }
4808 0 : ACQUIRE_LOCK(&lk);
4809 0 : goto restart;
4810 : }
4811 : break;
4812 :
4813 : case D_INODEDEP:
4814 0 : if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4815 0 : WK_INODEDEP(wk)->id_ino)) != 0) {
4816 0 : FREE_LOCK(&lk);
4817 0 : bawrite(bp);
4818 0 : return (error);
4819 : }
4820 : break;
4821 :
4822 : case D_PAGEDEP:
4823 : /*
4824 : * We are trying to sync a directory that may
4825 : * have dependencies on both its own metadata
4826 : * and/or dependencies on the inodes of any
4827 : * recently allocated files. We walk its diradd
4828 : * lists pushing out the associated inode.
4829 : */
4830 0 : pagedep = WK_PAGEDEP(wk);
4831 0 : for (i = 0; i < DAHASHSZ; i++) {
4832 0 : if (LIST_FIRST(&pagedep->pd_diraddhd[i]) ==
4833 : NULL)
4834 : continue;
4835 0 : if ((error =
4836 0 : flush_pagedep_deps(vp, pagedep->pd_mnt,
4837 : &pagedep->pd_diraddhd[i]))) {
4838 0 : FREE_LOCK(&lk);
4839 0 : bawrite(bp);
4840 0 : return (error);
4841 : }
4842 : }
4843 : break;
4844 :
4845 : case D_MKDIR:
4846 : /*
4847 : * This case should never happen if the vnode has
4848 : * been properly sync'ed. However, if this function
4849 : * is used at a place where the vnode has not yet
4850 : * been sync'ed, this dependency can show up. So,
4851 : * rather than panic, just flush it.
4852 : */
4853 0 : nbp = WK_MKDIR(wk)->md_buf;
4854 0 : KASSERT(bp != nbp);
4855 0 : gotit = getdirtybuf(nbp, waitfor);
4856 0 : if (gotit == 0)
4857 : break;
4858 0 : else if (gotit == -1)
4859 0 : goto loop;
4860 0 : FREE_LOCK(&lk);
4861 0 : if (waitfor == MNT_NOWAIT) {
4862 0 : bawrite(nbp);
4863 0 : } else if ((error = VOP_BWRITE(nbp)) != 0) {
4864 0 : bawrite(bp);
4865 0 : return (error);
4866 : }
4867 0 : ACQUIRE_LOCK(&lk);
4868 0 : break;
4869 :
4870 : case D_BMSAFEMAP:
4871 : /*
4872 : * This case should never happen if the vnode has
4873 : * been properly sync'ed. However, if this function
4874 : * is used at a place where the vnode has not yet
4875 : * been sync'ed, this dependency can show up. So,
4876 : * rather than panic, just flush it.
4877 : */
4878 0 : nbp = WK_BMSAFEMAP(wk)->sm_buf;
4879 0 : if (bp == nbp)
4880 : break;
4881 0 : gotit = getdirtybuf(nbp, waitfor);
4882 0 : if (gotit == 0)
4883 : break;
4884 0 : else if (gotit == -1)
4885 0 : goto loop;
4886 0 : FREE_LOCK(&lk);
4887 0 : if (waitfor == MNT_NOWAIT) {
4888 0 : bawrite(nbp);
4889 0 : } else if ((error = VOP_BWRITE(nbp)) != 0) {
4890 0 : bawrite(bp);
4891 0 : return (error);
4892 : }
4893 0 : ACQUIRE_LOCK(&lk);
4894 0 : break;
4895 :
4896 : default:
4897 0 : FREE_LOCK(&lk);
4898 0 : panic("softdep_sync_metadata: Unknown type %s",
4899 0 : TYPENAME(wk->wk_type));
4900 : /* NOTREACHED */
4901 : }
4902 : }
4903 0 : do {
4904 0 : nbp = LIST_NEXT(bp, b_vnbufs);
4905 0 : gotit = getdirtybuf(nbp, MNT_WAIT);
4906 0 : } while (gotit == -1);
4907 0 : FREE_LOCK(&lk);
4908 0 : bawrite(bp);
4909 0 : ACQUIRE_LOCK(&lk);
4910 0 : if (nbp != NULL) {
4911 : bp = nbp;
4912 0 : goto loop;
4913 : }
4914 : /*
4915 : * The brief unlock is to allow any pent up dependency
4916 : * processing to be done. Then proceed with the second pass.
4917 : */
4918 0 : if (waitfor == MNT_NOWAIT) {
4919 : waitfor = MNT_WAIT;
4920 0 : FREE_LOCK(&lk);
4921 0 : ACQUIRE_LOCK(&lk);
4922 0 : goto top;
4923 : }
4924 :
4925 : /*
4926 : * If we have managed to get rid of all the dirty buffers,
4927 : * then we are done. For certain directories and block
4928 : * devices, we may need to do further work.
4929 : *
4930 : * We must wait for any I/O in progress to finish so that
4931 : * all potential buffers on the dirty list will be visible.
4932 : */
4933 0 : drain_output(vp, 1);
4934 0 : if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
4935 : FREE_LOCK(&lk);
4936 0 : return (0);
4937 : }
4938 :
4939 : FREE_LOCK(&lk);
4940 : /*
4941 : * If we are trying to sync a block device, some of its buffers may
4942 : * contain metadata that cannot be written until the contents of some
4943 : * partially written files have been written to disk. The only easy
4944 : * way to accomplish this is to sync the entire filesystem (luckily
4945 : * this happens rarely).
4946 : */
4947 0 : if (vn_isdisk(vp, NULL) &&
4948 0 : vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
4949 0 : (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, 0, ap->a_cred,
4950 0 : ap->a_p)) != 0)
4951 0 : return (error);
4952 0 : return (0);
4953 0 : }
4954 :
4955 : /*
4956 : * Flush the dependencies associated with an inodedep.
4957 : * Called with splbio blocked.
4958 : */
4959 : STATIC int
4960 0 : flush_inodedep_deps(struct fs *fs, ufsino_t ino)
4961 : {
4962 0 : struct inodedep *inodedep;
4963 : struct allocdirect *adp;
4964 : int gotit, error, waitfor;
4965 : struct buf *bp;
4966 :
4967 0 : splassert(IPL_BIO);
4968 :
4969 : /*
4970 : * This work is done in two passes. The first pass grabs most
4971 : * of the buffers and begins asynchronously writing them. The
4972 : * only way to wait for these asynchronous writes is to sleep
4973 : * on the filesystem vnode which may stay busy for a long time
4974 : * if the filesystem is active. So, instead, we make a second
4975 : * pass over the dependencies blocking on each write. In the
4976 : * usual case we will be blocking against a write that we
4977 : * initiated, so when it is done the dependency will have been
4978 : * resolved. Thus the second pass is expected to end quickly.
4979 : * We give a brief window at the top of the loop to allow
4980 : * any pending I/O to complete.
4981 : */
4982 0 : for (waitfor = MNT_NOWAIT; ; ) {
4983 : retry_ino:
4984 0 : FREE_LOCK(&lk);
4985 0 : ACQUIRE_LOCK(&lk);
4986 0 : if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4987 0 : return (0);
4988 0 : TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4989 0 : if (adp->ad_state & DEPCOMPLETE)
4990 : continue;
4991 0 : bp = adp->ad_buf;
4992 0 : gotit = getdirtybuf(bp, waitfor);
4993 0 : if (gotit == 0) {
4994 0 : if (waitfor == MNT_NOWAIT)
4995 : continue;
4996 : break;
4997 0 : } else if (gotit == -1)
4998 0 : goto retry_ino;
4999 0 : FREE_LOCK(&lk);
5000 0 : if (waitfor == MNT_NOWAIT) {
5001 0 : bawrite(bp);
5002 0 : } else if ((error = VOP_BWRITE(bp)) != 0) {
5003 0 : ACQUIRE_LOCK(&lk);
5004 0 : return (error);
5005 : }
5006 0 : ACQUIRE_LOCK(&lk);
5007 0 : break;
5008 : }
5009 0 : if (adp != NULL)
5010 0 : continue;
5011 : retry_newino:
5012 0 : TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
5013 0 : if (adp->ad_state & DEPCOMPLETE)
5014 : continue;
5015 0 : bp = adp->ad_buf;
5016 0 : gotit = getdirtybuf(bp, waitfor);
5017 0 : if (gotit == 0) {
5018 0 : if (waitfor == MNT_NOWAIT)
5019 : continue;
5020 : break;
5021 0 : } else if (gotit == -1)
5022 0 : goto retry_newino;
5023 0 : FREE_LOCK(&lk);
5024 0 : if (waitfor == MNT_NOWAIT) {
5025 0 : bawrite(bp);
5026 0 : } else if ((error = VOP_BWRITE(bp)) != 0) {
5027 0 : ACQUIRE_LOCK(&lk);
5028 0 : return (error);
5029 : }
5030 0 : ACQUIRE_LOCK(&lk);
5031 0 : break;
5032 : }
5033 0 : if (adp != NULL)
5034 0 : continue;
5035 : /*
5036 : * If pass2, we are done, otherwise do pass 2.
5037 : */
5038 0 : if (waitfor == MNT_WAIT)
5039 : break;
5040 : waitfor = MNT_WAIT;
5041 : }
5042 : /*
5043 : * Try freeing inodedep in case all dependencies have been removed.
5044 : */
5045 0 : if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5046 0 : (void) free_inodedep(inodedep);
5047 0 : return (0);
5048 0 : }
5049 :
5050 : /*
5051 : * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5052 : * Called with splbio blocked.
5053 : */
5054 : STATIC int
5055 0 : flush_pagedep_deps(struct vnode *pvp, struct mount *mp,
5056 : struct diraddhd *diraddhdp)
5057 : {
5058 0 : struct proc *p = CURPROC; /* XXX */
5059 : struct worklist *wk;
5060 0 : struct inodedep *inodedep;
5061 : struct ufsmount *ump;
5062 : struct diradd *dap;
5063 0 : struct vnode *vp;
5064 : int gotit, error = 0;
5065 0 : struct buf *bp;
5066 : ufsino_t inum;
5067 :
5068 0 : splassert(IPL_BIO);
5069 :
5070 0 : ump = VFSTOUFS(mp);
5071 0 : while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5072 : /*
5073 : * Flush ourselves if this directory entry
5074 : * has a MKDIR_PARENT dependency.
5075 : */
5076 0 : if (dap->da_state & MKDIR_PARENT) {
5077 0 : FREE_LOCK(&lk);
5078 0 : if ((error = UFS_UPDATE(VTOI(pvp), 1)))
5079 : break;
5080 0 : ACQUIRE_LOCK(&lk);
5081 : /*
5082 : * If that cleared dependencies, go on to next.
5083 : */
5084 0 : if (dap != LIST_FIRST(diraddhdp))
5085 0 : continue;
5086 0 : if (dap->da_state & MKDIR_PARENT) {
5087 0 : FREE_LOCK(&lk);
5088 0 : panic("flush_pagedep_deps: MKDIR_PARENT");
5089 : }
5090 : }
5091 : /*
5092 : * A newly allocated directory must have its "." and
5093 : * ".." entries written out before its name can be
5094 : * committed in its parent. We do not want or need
5095 : * the full semantics of a synchronous VOP_FSYNC as
5096 : * that may end up here again, once for each directory
5097 : * level in the filesystem. Instead, we push the blocks
5098 : * and wait for them to clear. We have to fsync twice
5099 : * because the first call may choose to defer blocks
5100 : * that still have dependencies, but deferral will
5101 : * happen at most once.
5102 : */
5103 0 : inum = dap->da_newinum;
5104 0 : if (dap->da_state & MKDIR_BODY) {
5105 0 : FREE_LOCK(&lk);
5106 0 : if ((error = VFS_VGET(mp, inum, &vp)) != 0)
5107 : break;
5108 0 : if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
5109 0 : (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
5110 0 : vput(vp);
5111 0 : break;
5112 : }
5113 0 : drain_output(vp, 0);
5114 : /*
5115 : * If first block is still dirty with a D_MKDIR
5116 : * dependency then it needs to be written now.
5117 : */
5118 0 : for (;;) {
5119 : error = 0;
5120 0 : ACQUIRE_LOCK(&lk);
5121 0 : bp = incore(vp, 0);
5122 0 : if (bp == NULL) {
5123 0 : FREE_LOCK(&lk);
5124 0 : break;
5125 : }
5126 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list)
5127 0 : if (wk->wk_type == D_MKDIR)
5128 : break;
5129 0 : if (wk) {
5130 0 : gotit = getdirtybuf(bp, MNT_WAIT);
5131 0 : FREE_LOCK(&lk);
5132 0 : if (gotit == -1)
5133 0 : continue;
5134 0 : if (gotit && (error = bwrite(bp)) != 0)
5135 : break;
5136 : } else
5137 0 : FREE_LOCK(&lk);
5138 : break;
5139 : }
5140 0 : vput(vp);
5141 : /* Flushing of first block failed */
5142 0 : if (error)
5143 : break;
5144 0 : ACQUIRE_LOCK(&lk);
5145 : /*
5146 : * If that cleared dependencies, go on to next.
5147 : */
5148 0 : if (dap != LIST_FIRST(diraddhdp))
5149 0 : continue;
5150 0 : if (dap->da_state & MKDIR_BODY) {
5151 0 : FREE_LOCK(&lk);
5152 0 : panic("flush_pagedep_deps: MKDIR_BODY");
5153 : }
5154 : }
5155 : /*
5156 : * Flush the inode on which the directory entry depends.
5157 : * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5158 : * the only remaining dependency is that the updated inode
5159 : * count must get pushed to disk. The inode has already
5160 : * been pushed into its inode buffer (via VOP_UPDATE) at
5161 : * the time of the reference count change. So we need only
5162 : * locate that buffer, ensure that there will be no rollback
5163 : * caused by a bitmap dependency, then write the inode buffer.
5164 : */
5165 0 : if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5166 0 : FREE_LOCK(&lk);
5167 0 : panic("flush_pagedep_deps: lost inode");
5168 : }
5169 : /*
5170 : * If the inode still has bitmap dependencies,
5171 : * push them to disk.
5172 : */
5173 : retry:
5174 0 : if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5175 0 : bp = inodedep->id_buf;
5176 0 : gotit = getdirtybuf(bp, MNT_WAIT);
5177 0 : if (gotit == -1)
5178 0 : goto retry;
5179 0 : FREE_LOCK(&lk);
5180 0 : if (gotit && (error = bwrite(bp)) != 0)
5181 : break;
5182 0 : ACQUIRE_LOCK(&lk);
5183 0 : if (dap != LIST_FIRST(diraddhdp))
5184 0 : continue;
5185 : }
5186 : /*
5187 : * If the inode is still sitting in a buffer waiting
5188 : * to be written, push it to disk.
5189 : */
5190 0 : FREE_LOCK(&lk);
5191 0 : if ((error = bread(ump->um_devvp,
5192 0 : fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5193 0 : (int)ump->um_fs->fs_bsize, &bp)) != 0) {
5194 0 : brelse(bp);
5195 0 : break;
5196 : }
5197 0 : if ((error = bwrite(bp)) != 0)
5198 : break;
5199 0 : ACQUIRE_LOCK(&lk);
5200 : /*
5201 : * If we have failed to get rid of all the dependencies
5202 : * then something is seriously wrong.
5203 : */
5204 0 : if (dap == LIST_FIRST(diraddhdp)) {
5205 0 : FREE_LOCK(&lk);
5206 0 : panic("flush_pagedep_deps: flush failed");
5207 : }
5208 : }
5209 0 : if (error)
5210 0 : ACQUIRE_LOCK(&lk);
5211 0 : return (error);
5212 0 : }
5213 :
5214 : /*
5215 : * A large burst of file addition or deletion activity can drive the
5216 : * memory load excessively high. First attempt to slow things down
5217 : * using the techniques below. If that fails, this routine requests
5218 : * the offending operations to fall back to running synchronously
5219 : * until the memory load returns to a reasonable level.
5220 : */
5221 : int
5222 0 : softdep_slowdown(struct vnode *vp)
5223 : {
5224 : int max_softdeps_hard;
5225 :
5226 0 : max_softdeps_hard = max_softdeps * 11 / 10;
5227 0 : if (num_dirrem < max_softdeps_hard / 2 &&
5228 0 : num_inodedep < max_softdeps_hard)
5229 0 : return (0);
5230 0 : stat_sync_limit_hit += 1;
5231 0 : return (1);
5232 0 : }
5233 :
5234 : /*
5235 : * If memory utilization has gotten too high, deliberately slow things
5236 : * down and speed up the I/O processing.
5237 : */
5238 : STATIC int
5239 0 : request_cleanup(int resource, int islocked)
5240 : {
5241 0 : struct proc *p = CURPROC;
5242 : int s;
5243 :
5244 : /*
5245 : * We never hold up the filesystem syncer process.
5246 : */
5247 0 : if (p == filesys_syncer || (p->p_flag & P_SOFTDEP))
5248 0 : return (0);
5249 : /*
5250 : * First check to see if the work list has gotten backlogged.
5251 : * If it has, co-opt this process to help clean up two entries.
5252 : * Because this process may hold inodes locked, we cannot
5253 : * handle any remove requests that might block on a locked
5254 : * inode as that could lead to deadlock. We set P_SOFTDEP
5255 : * to avoid recursively processing the worklist.
5256 : */
5257 0 : if (num_on_worklist > max_softdeps / 10) {
5258 0 : atomic_setbits_int(&p->p_flag, P_SOFTDEP);
5259 0 : if (islocked)
5260 0 : FREE_LOCK(&lk);
5261 0 : process_worklist_item(NULL, LK_NOWAIT);
5262 0 : process_worklist_item(NULL, LK_NOWAIT);
5263 0 : atomic_clearbits_int(&p->p_flag, P_SOFTDEP);
5264 0 : stat_worklist_push += 2;
5265 0 : if (islocked)
5266 0 : ACQUIRE_LOCK(&lk);
5267 0 : return(1);
5268 : }
5269 : /*
5270 : * Next, we attempt to speed up the syncer process. If that
5271 : * is successful, then we allow the process to continue.
5272 : */
5273 0 : if (speedup_syncer())
5274 0 : return(0);
5275 : /*
5276 : * If we are resource constrained on inode dependencies, try
5277 : * flushing some dirty inodes. Otherwise, we are constrained
5278 : * by file deletions, so try accelerating flushes of directories
5279 : * with removal dependencies. We would like to do the cleanup
5280 : * here, but we probably hold an inode locked at this point and
5281 : * that might deadlock against one that we try to clean. So,
5282 : * the best that we can do is request the syncer daemon to do
5283 : * the cleanup for us.
5284 : */
5285 0 : switch (resource) {
5286 :
5287 : case FLUSH_INODES:
5288 0 : stat_ino_limit_push += 1;
5289 0 : req_clear_inodedeps += 1;
5290 0 : stat_countp = &stat_ino_limit_hit;
5291 0 : break;
5292 :
5293 : case FLUSH_REMOVE:
5294 0 : stat_blk_limit_push += 1;
5295 0 : req_clear_remove += 1;
5296 0 : stat_countp = &stat_blk_limit_hit;
5297 0 : break;
5298 :
5299 : default:
5300 0 : if (islocked)
5301 0 : FREE_LOCK(&lk);
5302 0 : panic("request_cleanup: unknown type");
5303 : }
5304 : /*
5305 : * Hopefully the syncer daemon will catch up and awaken us.
5306 : * We wait at most tickdelay before proceeding in any case.
5307 : */
5308 0 : if (islocked == 0)
5309 0 : ACQUIRE_LOCK(&lk);
5310 0 : proc_waiting += 1;
5311 0 : if (!timeout_pending(&proc_waiting_timeout))
5312 0 : timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5313 :
5314 0 : s = FREE_LOCK_INTERLOCKED(&lk);
5315 0 : (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
5316 0 : ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5317 0 : proc_waiting -= 1;
5318 0 : if (islocked == 0)
5319 0 : FREE_LOCK(&lk);
5320 0 : return (1);
5321 0 : }
5322 :
5323 : /*
5324 : * Awaken processes pausing in request_cleanup and clear proc_waiting
5325 : * to indicate that there is no longer a timer running.
5326 : */
5327 : void
5328 0 : pause_timer(void *arg)
5329 : {
5330 :
5331 0 : *stat_countp += 1;
5332 0 : wakeup_one(&proc_waiting);
5333 0 : if (proc_waiting > 0)
5334 0 : timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5335 0 : }
5336 :
5337 : /*
5338 : * Flush out a directory with at least one removal dependency in an effort to
5339 : * reduce the number of dirrem, freefile, and freeblks dependency structures.
5340 : */
5341 : STATIC void
5342 0 : clear_remove(struct proc *p)
5343 : {
5344 : struct pagedep_hashhead *pagedephd;
5345 : struct pagedep *pagedep;
5346 : static int next = 0;
5347 : struct mount *mp;
5348 0 : struct vnode *vp;
5349 : int error, cnt;
5350 : ufsino_t ino;
5351 :
5352 0 : ACQUIRE_LOCK(&lk);
5353 0 : for (cnt = 0; cnt <= pagedep_hash; cnt++) {
5354 0 : pagedephd = &pagedep_hashtbl[next++];
5355 0 : if (next > pagedep_hash)
5356 0 : next = 0;
5357 0 : LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5358 0 : if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5359 : continue;
5360 0 : mp = pagedep->pd_mnt;
5361 0 : ino = pagedep->pd_ino;
5362 : #if 0
5363 : if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5364 : continue;
5365 : #endif
5366 0 : FREE_LOCK(&lk);
5367 0 : if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5368 0 : softdep_error("clear_remove: vget", error);
5369 : #if 0
5370 : vn_finished_write(mp);
5371 : #endif
5372 0 : return;
5373 : }
5374 0 : if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5375 0 : softdep_error("clear_remove: fsync", error);
5376 0 : drain_output(vp, 0);
5377 0 : vput(vp);
5378 : #if 0
5379 : vn_finished_write(mp);
5380 : #endif
5381 0 : return;
5382 : }
5383 : }
5384 0 : FREE_LOCK(&lk);
5385 0 : }
5386 :
5387 : /*
5388 : * Clear out a block of dirty inodes in an effort to reduce
5389 : * the number of inodedep dependency structures.
5390 : */
5391 : STATIC void
5392 0 : clear_inodedeps(struct proc *p)
5393 : {
5394 : struct inodedep_hashhead *inodedephd;
5395 0 : struct inodedep *inodedep = NULL;
5396 : static int next = 0;
5397 : struct mount *mp;
5398 0 : struct vnode *vp;
5399 : struct fs *fs;
5400 : int error, cnt;
5401 : ufsino_t firstino, lastino, ino;
5402 :
5403 0 : ACQUIRE_LOCK(&lk);
5404 : /*
5405 : * Pick a random inode dependency to be cleared.
5406 : * We will then gather up all the inodes in its block
5407 : * that have dependencies and flush them out.
5408 : */
5409 0 : for (cnt = 0; cnt <= inodedep_hash; cnt++) {
5410 0 : inodedephd = &inodedep_hashtbl[next++];
5411 0 : if (next > inodedep_hash)
5412 0 : next = 0;
5413 0 : if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5414 : break;
5415 : }
5416 0 : if (inodedep == NULL) {
5417 0 : FREE_LOCK(&lk);
5418 0 : return;
5419 : }
5420 : /*
5421 : * Ugly code to find mount point given pointer to superblock.
5422 : */
5423 0 : fs = inodedep->id_fs;
5424 0 : TAILQ_FOREACH(mp, &mountlist, mnt_list)
5425 0 : if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5426 : break;
5427 : /*
5428 : * Find the last inode in the block with dependencies.
5429 : */
5430 0 : firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5431 0 : for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5432 0 : if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5433 : break;
5434 : /*
5435 : * Asynchronously push all but the last inode with dependencies.
5436 : * Synchronously push the last inode with dependencies to ensure
5437 : * that the inode block gets written to free up the inodedeps.
5438 : */
5439 0 : for (ino = firstino; ino <= lastino; ino++) {
5440 0 : if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5441 : continue;
5442 0 : FREE_LOCK(&lk);
5443 : #if 0
5444 : if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5445 : continue;
5446 : #endif
5447 0 : if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5448 0 : softdep_error("clear_inodedeps: vget", error);
5449 : #if 0
5450 : vn_finished_write(mp);
5451 : #endif
5452 0 : return;
5453 : }
5454 0 : if (ino == lastino) {
5455 0 : if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
5456 0 : softdep_error("clear_inodedeps: fsync1", error);
5457 : } else {
5458 0 : if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5459 0 : softdep_error("clear_inodedeps: fsync2", error);
5460 0 : drain_output(vp, 0);
5461 : }
5462 0 : vput(vp);
5463 : #if 0
5464 : vn_finished_write(mp);
5465 : #endif
5466 0 : ACQUIRE_LOCK(&lk);
5467 0 : }
5468 0 : FREE_LOCK(&lk);
5469 0 : }
5470 :
5471 : /*
5472 : * Function to determine if the buffer has outstanding dependencies
5473 : * that will cause a roll-back if the buffer is written. If wantcount
5474 : * is set, return number of dependencies, otherwise just yes or no.
5475 : */
5476 : int
5477 0 : softdep_count_dependencies(struct buf *bp, int wantcount, int islocked)
5478 : {
5479 : struct worklist *wk;
5480 : struct inodedep *inodedep;
5481 : struct indirdep *indirdep;
5482 : struct allocindir *aip;
5483 : struct pagedep *pagedep;
5484 : struct diradd *dap;
5485 : int i, retval;
5486 :
5487 : retval = 0;
5488 0 : if (!islocked)
5489 0 : ACQUIRE_LOCK(&lk);
5490 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5491 0 : switch (wk->wk_type) {
5492 :
5493 : case D_INODEDEP:
5494 0 : inodedep = WK_INODEDEP(wk);
5495 0 : if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5496 : /* bitmap allocation dependency */
5497 0 : retval += 1;
5498 0 : if (!wantcount)
5499 : goto out;
5500 : }
5501 0 : if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5502 : /* direct block pointer dependency */
5503 0 : retval += 1;
5504 0 : if (!wantcount)
5505 : goto out;
5506 : }
5507 : continue;
5508 :
5509 : case D_INDIRDEP:
5510 0 : indirdep = WK_INDIRDEP(wk);
5511 :
5512 0 : LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5513 : /* indirect block pointer dependency */
5514 0 : retval += 1;
5515 0 : if (!wantcount)
5516 : goto out;
5517 : }
5518 : continue;
5519 :
5520 : case D_PAGEDEP:
5521 0 : pagedep = WK_PAGEDEP(wk);
5522 0 : for (i = 0; i < DAHASHSZ; i++) {
5523 :
5524 0 : LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5525 : /* directory entry dependency */
5526 0 : retval += 1;
5527 0 : if (!wantcount)
5528 : goto out;
5529 : }
5530 : }
5531 : continue;
5532 :
5533 : case D_BMSAFEMAP:
5534 : case D_ALLOCDIRECT:
5535 : case D_ALLOCINDIR:
5536 : case D_MKDIR:
5537 : /* never a dependency on these blocks */
5538 : continue;
5539 :
5540 : default:
5541 0 : if (!islocked)
5542 0 : FREE_LOCK(&lk);
5543 0 : panic("softdep_check_for_rollback: Unexpected type %s",
5544 0 : TYPENAME(wk->wk_type));
5545 : /* NOTREACHED */
5546 : }
5547 : }
5548 : out:
5549 0 : if (!islocked)
5550 0 : FREE_LOCK(&lk);
5551 0 : return retval;
5552 : }
5553 :
5554 : /*
5555 : * Acquire exclusive access to a buffer.
5556 : * Must be called with splbio blocked.
5557 : * Returns:
5558 : * 1 if the buffer was acquired and is dirty;
5559 : * 0 if the buffer was clean, or we would have slept but had MN_NOWAIT;
5560 : * -1 if we slept and may try again (but not with this bp).
5561 : */
5562 : STATIC int
5563 0 : getdirtybuf(struct buf *bp, int waitfor)
5564 : {
5565 : int s;
5566 :
5567 0 : if (bp == NULL)
5568 0 : return (0);
5569 :
5570 0 : splassert(IPL_BIO);
5571 :
5572 0 : if (bp->b_flags & B_BUSY) {
5573 0 : if (waitfor != MNT_WAIT)
5574 0 : return (0);
5575 0 : bp->b_flags |= B_WANTED;
5576 0 : s = FREE_LOCK_INTERLOCKED(&lk);
5577 0 : tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
5578 0 : ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5579 0 : return (-1);
5580 : }
5581 0 : if ((bp->b_flags & B_DELWRI) == 0)
5582 0 : return (0);
5583 0 : bremfree(bp);
5584 0 : buf_acquire(bp);
5585 0 : return (1);
5586 0 : }
5587 :
5588 : /*
5589 : * Wait for pending output on a vnode to complete.
5590 : * Must be called with vnode locked.
5591 : */
5592 : STATIC void
5593 0 : drain_output(struct vnode *vp, int islocked)
5594 : {
5595 : int s;
5596 :
5597 0 : if (!islocked)
5598 0 : ACQUIRE_LOCK(&lk);
5599 :
5600 0 : splassert(IPL_BIO);
5601 :
5602 0 : while (vp->v_numoutput) {
5603 0 : vp->v_bioflag |= VBIOWAIT;
5604 0 : s = FREE_LOCK_INTERLOCKED(&lk);
5605 0 : tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drain_output", 0);
5606 0 : ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5607 : }
5608 0 : if (!islocked)
5609 0 : FREE_LOCK(&lk);
5610 0 : }
5611 :
5612 : /*
5613 : * Called whenever a buffer that is being invalidated or reallocated
5614 : * contains dependencies. This should only happen if an I/O error has
5615 : * occurred. The routine is called with the buffer locked.
5616 : */
5617 : void
5618 0 : softdep_deallocate_dependencies(struct buf *bp)
5619 : {
5620 :
5621 0 : if ((bp->b_flags & B_ERROR) == 0)
5622 0 : panic("softdep_deallocate_dependencies: dangling deps");
5623 0 : softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5624 0 : panic("softdep_deallocate_dependencies: unrecovered I/O error");
5625 : }
5626 :
5627 : /*
5628 : * Function to handle asynchronous write errors in the filesystem.
5629 : */
5630 : void
5631 0 : softdep_error(char *func, int error)
5632 : {
5633 :
5634 : /* XXX should do something better! */
5635 0 : printf("%s: got error %d while accessing filesystem\n", func, error);
5636 0 : }
5637 :
5638 : #ifdef DDB
5639 : #include <machine/db_machdep.h>
5640 : #include <ddb/db_interface.h>
5641 : #include <ddb/db_output.h>
5642 :
5643 : void
5644 0 : softdep_print(struct buf *bp, int full,
5645 : int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
5646 : {
5647 : struct worklist *wk;
5648 :
5649 0 : (*pr)(" deps:\n");
5650 0 : LIST_FOREACH(wk, &bp->b_dep, wk_list)
5651 0 : worklist_print(wk, full, pr);
5652 0 : }
5653 :
5654 : void
5655 0 : worklist_print(struct worklist *wk, int full,
5656 : int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
5657 : {
5658 : struct pagedep *pagedep;
5659 : struct inodedep *inodedep;
5660 : struct newblk *newblk;
5661 : struct bmsafemap *bmsafemap;
5662 : struct allocdirect *adp;
5663 : struct indirdep *indirdep;
5664 : struct allocindir *aip;
5665 : struct freefrag *freefrag;
5666 : struct freeblks *freeblks;
5667 : struct freefile *freefile;
5668 : struct diradd *dap;
5669 : struct mkdir *mkdir;
5670 : struct dirrem *dirrem;
5671 : struct newdirblk *newdirblk;
5672 0 : char prefix[33];
5673 : int i;
5674 :
5675 0 : for (prefix[i = 2 * MIN(16, full)] = '\0'; i--; prefix[i] = ' ')
5676 : ;
5677 :
5678 0 : (*pr)("%s%s(%p) state %b\n%s", prefix, TYPENAME(wk->wk_type), wk,
5679 0 : wk->wk_state, DEP_BITS, prefix);
5680 0 : switch (wk->wk_type) {
5681 : case D_PAGEDEP:
5682 0 : pagedep = WK_PAGEDEP(wk);
5683 0 : (*pr)("mount %p ino %u lbn %lld\n", pagedep->pd_mnt,
5684 0 : pagedep->pd_ino, (long long)pagedep->pd_lbn);
5685 0 : break;
5686 : case D_INODEDEP:
5687 0 : inodedep = WK_INODEDEP(wk);
5688 0 : (*pr)("fs %p ino %u nlinkdelta %u dino %p\n"
5689 0 : "%s bp %p savsz %lld\n", inodedep->id_fs,
5690 0 : inodedep->id_ino, inodedep->id_nlinkdelta,
5691 0 : inodedep->id_un.idu_savedino1,
5692 0 : prefix, inodedep->id_buf, inodedep->id_savedsize);
5693 0 : break;
5694 : case D_NEWBLK:
5695 0 : newblk = WK_NEWBLK(wk);
5696 0 : (*pr)("fs %p newblk %lld state %d bmsafemap %p\n",
5697 0 : newblk->nb_fs, (long long)newblk->nb_newblkno,
5698 0 : newblk->nb_state, newblk->nb_bmsafemap);
5699 0 : break;
5700 : case D_BMSAFEMAP:
5701 0 : bmsafemap = WK_BMSAFEMAP(wk);
5702 0 : (*pr)("buf %p\n", bmsafemap->sm_buf);
5703 0 : break;
5704 : case D_ALLOCDIRECT:
5705 0 : adp = WK_ALLOCDIRECT(wk);
5706 0 : (*pr)("lbn %lld newlbk %lld oldblk %lld newsize %ld olsize "
5707 : "%ld\n%s bp %p inodedep %p freefrag %p\n",
5708 0 : (long long)adp->ad_lbn, (long long)adp->ad_newblkno,
5709 0 : (long long)adp->ad_oldblkno, adp->ad_newsize,
5710 0 : adp->ad_oldsize,
5711 0 : prefix, adp->ad_buf, adp->ad_inodedep, adp->ad_freefrag);
5712 0 : break;
5713 : case D_INDIRDEP:
5714 0 : indirdep = WK_INDIRDEP(wk);
5715 0 : (*pr)("savedata %p savebp %p\n", indirdep->ir_saveddata,
5716 0 : indirdep->ir_savebp);
5717 0 : break;
5718 : case D_ALLOCINDIR:
5719 0 : aip = WK_ALLOCINDIR(wk);
5720 0 : (*pr)("off %d newblk %lld oldblk %lld freefrag %p\n"
5721 0 : "%s indirdep %p buf %p\n", aip->ai_offset,
5722 0 : (long long)aip->ai_newblkno, (long long)aip->ai_oldblkno,
5723 0 : aip->ai_freefrag, prefix, aip->ai_indirdep, aip->ai_buf);
5724 0 : break;
5725 : case D_FREEFRAG:
5726 0 : freefrag = WK_FREEFRAG(wk);
5727 0 : (*pr)("vnode %p mp %p blkno %lld fsize %ld ino %u\n",
5728 0 : freefrag->ff_devvp, freefrag->ff_mnt,
5729 0 : (long long)freefrag->ff_blkno, freefrag->ff_fragsize,
5730 0 : freefrag->ff_inum);
5731 0 : break;
5732 : case D_FREEBLKS:
5733 0 : freeblks = WK_FREEBLKS(wk);
5734 0 : (*pr)("previno %u devvp %p mp %p oldsz %lld newsz %lld\n"
5735 0 : "%s chkcnt %d uid %d\n", freeblks->fb_previousinum,
5736 0 : freeblks->fb_devvp, freeblks->fb_mnt, freeblks->fb_oldsize,
5737 0 : freeblks->fb_newsize,
5738 0 : prefix, freeblks->fb_chkcnt, freeblks->fb_uid);
5739 0 : break;
5740 : case D_FREEFILE:
5741 0 : freefile = WK_FREEFILE(wk);
5742 0 : (*pr)("mode %x oldino %u vnode %p mp %p\n", freefile->fx_mode,
5743 0 : freefile->fx_oldinum, freefile->fx_devvp, freefile->fx_mnt);
5744 0 : break;
5745 : case D_DIRADD:
5746 0 : dap = WK_DIRADD(wk);
5747 0 : (*pr)("off %d ino %u da_un %p\n", dap->da_offset,
5748 0 : dap->da_newinum, dap->da_un.dau_previous);
5749 0 : break;
5750 : case D_MKDIR:
5751 0 : mkdir = WK_MKDIR(wk);
5752 0 : (*pr)("diradd %p bp %p\n", mkdir->md_diradd, mkdir->md_buf);
5753 0 : break;
5754 : case D_DIRREM:
5755 0 : dirrem = WK_DIRREM(wk);
5756 0 : (*pr)("mp %p ino %u dm_un %p\n", dirrem->dm_mnt,
5757 0 : dirrem->dm_oldinum, dirrem->dm_un.dmu_pagedep);
5758 0 : break;
5759 : case D_NEWDIRBLK:
5760 0 : newdirblk = WK_NEWDIRBLK(wk);
5761 0 : (*pr)("pagedep %p\n", newdirblk->db_pagedep);
5762 0 : break;
5763 : }
5764 0 : }
5765 : #endif
|