Line data Source code
1 : /* $OpenBSD: vfs_bio.c,v 1.186 2018/08/13 15:26:17 visa Exp $ */
2 : /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
3 :
4 : /*
5 : * Copyright (c) 1994 Christopher G. Demetriou
6 : * Copyright (c) 1982, 1986, 1989, 1993
7 : * The Regents of the University of California. All rights reserved.
8 : * (c) UNIX System Laboratories, Inc.
9 : * All or some portions of this file are derived from material licensed
10 : * to the University of California by American Telephone and Telegraph
11 : * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12 : * the permission of UNIX System Laboratories, Inc.
13 : *
14 : * Redistribution and use in source and binary forms, with or without
15 : * modification, are permitted provided that the following conditions
16 : * are met:
17 : * 1. Redistributions of source code must retain the above copyright
18 : * notice, this list of conditions and the following disclaimer.
19 : * 2. Redistributions in binary form must reproduce the above copyright
20 : * notice, this list of conditions and the following disclaimer in the
21 : * documentation and/or other materials provided with the distribution.
22 : * 3. Neither the name of the University nor the names of its contributors
23 : * may be used to endorse or promote products derived from this software
24 : * without specific prior written permission.
25 : *
26 : * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 : * SUCH DAMAGE.
37 : *
38 : * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
39 : */
40 :
41 : /*
42 : * Some references:
43 : * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
44 : * Leffler, et al.: The Design and Implementation of the 4.3BSD
45 : * UNIX Operating System (Addison Wesley, 1989)
46 : */
47 :
48 : #include <sys/param.h>
49 : #include <sys/systm.h>
50 : #include <sys/proc.h>
51 : #include <sys/buf.h>
52 : #include <sys/vnode.h>
53 : #include <sys/mount.h>
54 : #include <sys/malloc.h>
55 : #include <sys/pool.h>
56 : #include <sys/resourcevar.h>
57 : #include <sys/conf.h>
58 : #include <sys/kernel.h>
59 : #include <sys/specdev.h>
60 : #include <uvm/uvm_extern.h>
61 :
62 : /* XXX Should really be in buf.h, but for uvm_constraint_range.. */
63 : int buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
64 :
65 : struct uvm_constraint_range high_constraint;
66 : int fliphigh;
67 :
68 : int nobuffers;
69 : int needbuffer;
70 : struct bio_ops bioops;
71 :
72 : /* private bufcache functions */
73 : void bufcache_init(void);
74 : void bufcache_adjust(void);
75 : struct buf *bufcache_gethighcleanbuf(void);
76 : struct buf *bufcache_getdmacleanbuf(void);
77 :
78 : /*
79 : * Buffer pool for I/O buffers.
80 : */
81 : struct pool bufpool;
82 : struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
83 : void buf_put(struct buf *);
84 :
85 : struct buf *bio_doread(struct vnode *, daddr_t, int, int);
86 : struct buf *buf_get(struct vnode *, daddr_t, size_t);
87 : void bread_cluster_callback(struct buf *);
88 :
89 : struct bcachestats bcstats; /* counters */
90 : long lodirtypages; /* dirty page count low water mark */
91 : long hidirtypages; /* dirty page count high water mark */
92 : long targetpages; /* target number of pages for cache size */
93 : long buflowpages; /* smallest size cache allowed */
94 : long bufhighpages; /* largest size cache allowed */
95 : long bufbackpages; /* minimum number of pages we shrink when asked to */
96 :
97 : vsize_t bufkvm;
98 :
99 : struct proc *cleanerproc;
100 : int bd_req; /* Sleep point for cleaner daemon. */
101 :
102 : #define NUM_CACHES 2
103 : #define DMA_CACHE 0
104 : struct bufcache cleancache[NUM_CACHES];
105 : struct bufqueue dirtyqueue;
106 :
107 : void
108 0 : buf_put(struct buf *bp)
109 : {
110 0 : splassert(IPL_BIO);
111 :
112 : #ifdef DIAGNOSTIC
113 0 : if (bp->b_pobj != NULL)
114 0 : KASSERT(bp->b_bufsize > 0);
115 0 : if (ISSET(bp->b_flags, B_DELWRI))
116 0 : panic("buf_put: releasing dirty buffer");
117 0 : if (bp->b_freelist.tqe_next != NOLIST &&
118 0 : bp->b_freelist.tqe_next != (void *)-1)
119 0 : panic("buf_put: still on the free list");
120 0 : if (bp->b_vnbufs.le_next != NOLIST &&
121 0 : bp->b_vnbufs.le_next != (void *)-1)
122 0 : panic("buf_put: still on the vnode list");
123 0 : if (!LIST_EMPTY(&bp->b_dep))
124 0 : panic("buf_put: b_dep is not empty");
125 : #endif
126 :
127 0 : LIST_REMOVE(bp, b_list);
128 0 : bcstats.numbufs--;
129 :
130 0 : if (buf_dealloc_mem(bp) != 0)
131 : return;
132 0 : pool_put(&bufpool, bp);
133 0 : }
134 :
135 : /*
136 : * Initialize the buffer pool and the buffer cache.
137 : */
138 : void
139 0 : bufinit(void)
140 : {
141 : u_int64_t dmapages;
142 : u_int64_t highpages;
143 :
144 0 : dmapages = uvm_pagecount(&dma_constraint);
145 : /* take away a guess at how much of this the kernel will consume */
146 0 : dmapages -= (atop(physmem) - atop(uvmexp.free));
147 :
148 : /* See if we have memory above the dma accessible region. */
149 0 : high_constraint.ucr_low = dma_constraint.ucr_high;
150 0 : high_constraint.ucr_high = no_constraint.ucr_high;
151 0 : if (high_constraint.ucr_low != high_constraint.ucr_high)
152 0 : high_constraint.ucr_low++;
153 0 : highpages = uvm_pagecount(&high_constraint);
154 :
155 : /*
156 : * Do we have any significant amount of high memory above
157 : * the DMA region? If so, enable moving buffers there; if not,
158 : * don't bother.
159 : */
160 0 : if (highpages > dmapages / 4)
161 0 : fliphigh = 1;
162 : else
163 0 : fliphigh = 0;
164 :
165 : /*
166 : * If MD code doesn't say otherwise, use up to 10% of DMA'able
167 : * memory for buffers.
168 : */
169 0 : if (bufcachepercent == 0)
170 0 : bufcachepercent = 10;
171 :
172 : /*
173 : * XXX these values and their same use in kern_sysctl
174 : * need to move into buf.h
175 : */
176 0 : KASSERT(bufcachepercent <= 90);
177 0 : KASSERT(bufcachepercent >= 5);
178 0 : if (bufpages == 0)
179 0 : bufpages = dmapages * bufcachepercent / 100;
180 0 : if (bufpages < BCACHE_MIN)
181 0 : bufpages = BCACHE_MIN;
182 0 : KASSERT(bufpages < dmapages);
183 :
184 0 : bufhighpages = bufpages;
185 :
186 : /*
187 : * Set the base backoff level for the buffer cache. We will
188 : * not allow uvm to steal back more than this number of pages.
189 : */
190 0 : buflowpages = dmapages * 5 / 100;
191 0 : if (buflowpages < BCACHE_MIN)
192 : buflowpages = BCACHE_MIN;
193 :
194 : /*
195 : * Set bufbackpages to 100 pages, or to 10 percent of the low water mark
196 : * if we don't have that many pages.
197 : */
198 :
199 0 : bufbackpages = buflowpages * 10 / 100;
200 0 : if (bufbackpages > 100)
201 : bufbackpages = 100;
202 :
203 : /*
204 : * If the MD code does not say otherwise, reserve 10% of kva
205 : * space for mapping buffers.
206 : */
207 0 : if (bufkvm == 0)
208 0 : bufkvm = VM_KERNEL_SPACE_SIZE / 10;
209 :
210 : /*
211 : * Don't use more than twice the amount of bufpages for mappings.
212 : * It's twice since we map things sparsely.
213 : */
214 0 : if (bufkvm > bufpages * PAGE_SIZE)
215 0 : bufkvm = bufpages * PAGE_SIZE;
216 : /*
217 : * Round bufkvm down to a multiple of MAXPHYS, because we allocate
218 : * va space in MAXPHYS-sized chunks.
219 : */
220 0 : bufkvm &= ~(MAXPHYS - 1);
221 :
222 0 : pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO, 0, "bufpl", NULL);
223 :
224 0 : bufcache_init();
225 :
226 : /*
227 : * hmm - bufkvm is an argument because it's static, while
228 : * bufpages is global because it can change while running.
229 : */
230 0 : buf_mem_init(bufkvm);
231 :
232 : /*
233 : * Set the dirty page high water mark to be less than the low
234 : * water mark for pages in the buffer cache. This ensures we
235 : * can always back off by throwing away clean pages, and give
236 : * ourselves a chance to write out the dirty pages eventually.
237 : */
238 0 : hidirtypages = (buflowpages / 4) * 3;
239 0 : lodirtypages = buflowpages / 2;
240 :
241 : /*
242 : * We are allowed to use up to the reserve.
243 : */
244 0 : targetpages = bufpages - RESERVE_PAGES;
245 0 : }
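
To make the sizing arithmetic in bufinit() concrete, here is a minimal standalone sketch (editor's illustration, not kernel code) that recomputes the derived limits for a hypothetical machine with 1,000,000 DMA-reachable pages (about 4 GB with 4 KB pages) and the default bufcachepercent of 10. BCACHE_MIN and RESERVE_PAGES are kernel constants whose real values are defined elsewhere, so placeholder guesses are used here.

#include <stdio.h>

/* Placeholder values; the real BCACHE_MIN/RESERVE_PAGES live in kernel headers. */
#define BCACHE_MIN_GUESS    1024
#define RESERVE_PAGES_GUESS 8

int
main(void)
{
    long dmapages = 1000000;    /* hypothetical: ~4 GB of 4 KB pages */
    long bufcachepercent = 10;  /* the default chosen in bufinit() */
    long bufpages, buflowpages, bufbackpages;
    long hidirtypages, lodirtypages, targetpages;

    /* Same arithmetic as bufinit(), using the placeholder constants. */
    bufpages = dmapages * bufcachepercent / 100;
    if (bufpages < BCACHE_MIN_GUESS)
        bufpages = BCACHE_MIN_GUESS;

    buflowpages = dmapages * 5 / 100;
    if (buflowpages < BCACHE_MIN_GUESS)
        buflowpages = BCACHE_MIN_GUESS;

    bufbackpages = buflowpages * 10 / 100;
    if (bufbackpages > 100)
        bufbackpages = 100;

    hidirtypages = (buflowpages / 4) * 3;
    lodirtypages = buflowpages / 2;
    targetpages = bufpages - RESERVE_PAGES_GUESS;

    printf("bufpages %ld buflowpages %ld bufbackpages %ld\n",
        bufpages, buflowpages, bufbackpages);
    printf("hidirtypages %ld lodirtypages %ld targetpages %ld\n",
        hidirtypages, lodirtypages, targetpages);
    return 0;
}
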
246 :
247 : /*
248 : * Resize the buffer cache to newbufpages pages, shrinking it now if needed.
249 : */
250 : void
251 0 : bufadjust(int newbufpages)
252 : {
253 : struct buf *bp;
254 : int s;
255 :
256 0 : if (newbufpages < buflowpages)
257 0 : newbufpages = buflowpages;
258 :
259 0 : s = splbio();
260 0 : bufpages = newbufpages;
261 :
262 : /*
263 : * We are allowed to use up to the reserve
264 : */
265 0 : targetpages = bufpages - RESERVE_PAGES;
266 :
267 : /*
268 : * Shrinking the cache happens here only if someone has manually
269 : * adjusted bufcachepercent - or the pagedaemon has told us
270 : * to give back memory *now* - so we give it all back.
271 : */
272 0 : while ((bp = bufcache_getdmacleanbuf()) &&
273 0 : (bcstats.dmapages > targetpages)) {
274 0 : bufcache_take(bp);
275 0 : if (bp->b_vp) {
276 0 : RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
277 0 : brelvp(bp);
278 0 : }
279 0 : buf_put(bp);
280 : }
281 0 : bufcache_adjust();
282 :
283 : /*
284 : * Wake up the cleaner if we have lots of dirty pages,
285 : * or if we are getting low on buffer cache kva.
286 : */
287 0 : if ((UNCLEAN_PAGES >= hidirtypages) ||
288 0 : bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
289 0 : wakeup(&bd_req);
290 :
291 0 : splx(s);
292 0 : }
293 :
294 : /*
295 : * Make the buffer cache back off (give pages back) at the page daemon's request.
296 : */
297 : int
298 0 : bufbackoff(struct uvm_constraint_range *range, long size)
299 : {
300 : /*
301 : * Back off "size" buffer cache pages. Called by the page
302 : * daemon to consume buffer cache pages rather than scanning.
303 : *
304 : * It returns 0 to the pagedaemon to indicate that it has
305 : * succeeded in freeing enough pages. It returns -1 to
306 : * indicate that it could not and the pagedaemon should take
307 : * other measures.
308 : *
309 : */
310 : long pdelta, oldbufpages;
311 :
312 : /*
313 : * If we will accept high memory for this backoff
314 : * try to steal it from the high memory buffer cache.
315 : */
316 0 : if (range->ucr_high > dma_constraint.ucr_high) {
317 : struct buf *bp;
318 0 : int64_t start = bcstats.numbufpages, recovered = 0;
319 0 : int s = splbio();
320 :
321 0 : while ((recovered < size) &&
322 0 : (bp = bufcache_gethighcleanbuf())) {
323 0 : bufcache_take(bp);
324 0 : if (bp->b_vp) {
325 0 : RBT_REMOVE(buf_rb_bufs,
326 : &bp->b_vp->v_bufs_tree, bp);
327 0 : brelvp(bp);
328 0 : }
329 0 : buf_put(bp);
330 0 : recovered = start - bcstats.numbufpages;
331 : }
332 0 : bufcache_adjust();
333 0 : splx(s);
334 :
335 : /* If we got enough, return success */
336 0 : if (recovered >= size)
337 0 : return 0;
338 :
339 : /*
340 : * If we needed only memory above DMA,
341 : * return failure
342 : */
343 0 : if (range->ucr_low > dma_constraint.ucr_high)
344 0 : return -1;
345 :
346 : /* Otherwise get the rest from DMA */
347 0 : size -= recovered;
348 0 : }
349 :
350 : /*
351 : * XXX Otherwise do the DMA memory cache dance. This needs
352 : * refactoring later to get rid of 'bufpages'.
353 : */
354 :
355 : /*
356 : * Back off by at least bufbackpages. If the page daemon gave us
357 : * a larger size, back off by that much.
358 : */
359 0 : pdelta = (size > bufbackpages) ? size : bufbackpages;
360 :
361 0 : if (bufpages <= buflowpages)
362 0 : return(-1);
363 0 : if (bufpages - pdelta < buflowpages)
364 0 : pdelta = bufpages - buflowpages;
365 0 : oldbufpages = bufpages;
366 0 : bufadjust(bufpages - pdelta);
367 0 : if (oldbufpages - bufpages < size)
368 0 : return (-1); /* we did not free what we were asked */
369 : else
370 0 : return(0);
371 0 : }
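
For context, the caller of bufbackoff() is the uvm page daemon (its code is not shown in this file), and it cares only about the 0 / -1 convention described above. The fragment below is a hedged sketch of that call pattern; want_pages is an invented variable name for illustration.

    /* Illustrative only; the real call site is in the uvm page daemon. */
    if (bufbackoff(&dma_constraint, want_pages) == 0) {
        /* The buffer cache gave back at least want_pages pages. */
    } else {
        /* The cache would not shrink further; reclaim memory elsewhere. */
    }
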
372 :
373 :
374 : /*
375 : * Opportunistically flip a buffer into high memory. Will move the buffer
376 : * if memory is available without sleeping, and return 0, otherwise will
377 : * fail and return -1 with the buffer unchanged.
378 : */
379 :
380 : int
381 0 : buf_flip_high(struct buf *bp)
382 : {
383 : int s;
384 : int ret = -1;
385 :
386 0 : KASSERT(ISSET(bp->b_flags, B_BC));
387 0 : KASSERT(ISSET(bp->b_flags, B_DMA));
388 0 : KASSERT(bp->cache == DMA_CACHE);
389 0 : KASSERT(fliphigh);
390 :
391 : /* Attempt to move the buffer to high memory if we can */
392 0 : s = splbio();
393 0 : if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT) == 0) {
394 0 : KASSERT(!ISSET(bp->b_flags, B_DMA));
395 0 : bcstats.highflips++;
396 : ret = 0;
397 0 : } else
398 0 : bcstats.highflops++;
399 0 : splx(s);
400 :
401 0 : return ret;
402 : }
403 :
404 : /*
405 : * Flip a buffer to DMA reachable memory when we need it there for
406 : * I/O. This can sleep, since it will wait for memory allocation in the
407 : * DMA reachable area; we have to have the buffer there to proceed.
408 : */
409 : void
410 0 : buf_flip_dma(struct buf *bp)
411 : {
412 0 : KASSERT(ISSET(bp->b_flags, B_BC));
413 0 : KASSERT(ISSET(bp->b_flags, B_BUSY));
414 0 : KASSERT(bp->cache < NUM_CACHES);
415 :
416 0 : if (!ISSET(bp->b_flags, B_DMA)) {
417 0 : int s = splbio();
418 :
419 : /* move buf to dma reachable memory */
420 0 : (void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK);
421 0 : KASSERT(ISSET(bp->b_flags, B_DMA));
422 0 : bcstats.dmaflips++;
423 0 : splx(s);
424 0 : }
425 :
426 0 : if (bp->cache > DMA_CACHE) {
427 0 : CLR(bp->b_flags, B_COLD);
428 0 : CLR(bp->b_flags, B_WARM);
429 0 : bp->cache = DMA_CACHE;
430 0 : }
431 0 : }
432 :
433 : struct buf *
434 0 : bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
435 : {
436 : struct buf *bp;
437 : struct mount *mp;
438 :
439 0 : bp = getblk(vp, blkno, size, 0, 0);
440 :
441 : /*
442 : * If buffer does not have valid data, start a read.
443 : * Note that if buffer is B_INVAL, getblk() won't return it.
444 : * Therefore, it's valid if its I/O has completed or been delayed.
445 : */
446 0 : if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
447 0 : SET(bp->b_flags, B_READ | async);
448 0 : bcstats.pendingreads++;
449 0 : bcstats.numreads++;
450 0 : VOP_STRATEGY(bp);
451 : /* Pay for the read. */
452 0 : curproc->p_ru.ru_inblock++; /* XXX */
453 0 : } else if (async) {
454 0 : brelse(bp);
455 0 : }
456 :
457 0 : mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
458 :
459 : /*
460 : * Collect statistics on synchronous and asynchronous reads.
461 : * Reads from block devices are charged to their associated
462 : * filesystem (if any).
463 : */
464 0 : if (mp != NULL) {
465 0 : if (async == 0)
466 0 : mp->mnt_stat.f_syncreads++;
467 : else
468 0 : mp->mnt_stat.f_asyncreads++;
469 : }
470 :
471 0 : return (bp);
472 : }
473 :
474 : /*
475 : * Read a disk block.
476 : * This algorithm described in Bach (p.54).
477 : */
478 : int
479 0 : bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
480 : {
481 : struct buf *bp;
482 :
483 : /* Get buffer for block. */
484 0 : bp = *bpp = bio_doread(vp, blkno, size, 0);
485 :
486 : /* Wait for the read to complete, and return result. */
487 0 : return (biowait(bp));
488 : }
489 :
490 : /*
491 : * Read-ahead multiple disk blocks. The first is sync, the rest async.
492 : * Trivial modification to the breada algorithm presented in Bach (p.55).
493 : */
494 : int
495 0 : breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
496 : int rasizes[], int nrablks, struct buf **bpp)
497 : {
498 : struct buf *bp;
499 : int i;
500 :
501 0 : bp = *bpp = bio_doread(vp, blkno, size, 0);
502 :
503 : /*
504 : * For each of the read-ahead blocks, start a read, if necessary.
505 : */
506 0 : for (i = 0; i < nrablks; i++) {
507 : /* If it's in the cache, just go on to next one. */
508 0 : if (incore(vp, rablks[i]))
509 : continue;
510 :
511 : /* Get a buffer for the read-ahead block */
512 0 : (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
513 0 : }
514 :
515 : /* Otherwise, we had to start a read for it; wait until it's valid. */
516 0 : return (biowait(bp));
517 : }
518 :
519 : /*
520 : * Called from interrupt context.
521 : */
522 : void
523 0 : bread_cluster_callback(struct buf *bp)
524 : {
525 0 : struct buf **xbpp = bp->b_saveaddr;
526 : int i;
527 :
528 0 : if (xbpp[1] != NULL) {
529 0 : size_t newsize = xbpp[1]->b_bufsize;
530 :
531 : /*
532 : * Shrink this buffer's mapping to only cover its part of
533 : * the total I/O.
534 : */
535 0 : buf_fix_mapping(bp, newsize);
536 0 : bp->b_bcount = newsize;
537 0 : }
538 :
539 : /* Invalidate read-ahead buffers if read short */
540 0 : if (bp->b_resid > 0) {
541 0 : for (i = 0; xbpp[i] != NULL; i++)
542 : continue;
543 0 : for (i = i - 1; i != 0; i--) {
544 0 : if (xbpp[i]->b_bufsize <= bp->b_resid) {
545 0 : bp->b_resid -= xbpp[i]->b_bufsize;
546 0 : SET(xbpp[i]->b_flags, B_INVAL);
547 0 : } else if (bp->b_resid > 0) {
548 0 : bp->b_resid = 0;
549 0 : SET(xbpp[i]->b_flags, B_INVAL);
550 : } else
551 : break;
552 : }
553 : }
554 :
555 0 : for (i = 1; xbpp[i] != NULL; i++) {
556 0 : if (ISSET(bp->b_flags, B_ERROR))
557 0 : SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
558 0 : biodone(xbpp[i]);
559 : }
560 :
561 0 : free(xbpp, M_TEMP, 0);
562 :
563 0 : if (ISSET(bp->b_flags, B_ASYNC)) {
564 0 : brelse(bp);
565 0 : } else {
566 0 : CLR(bp->b_flags, B_WANTED);
567 0 : wakeup(bp);
568 : }
569 0 : }
570 :
571 : /*
572 : * Read-ahead multiple disk blocks, but make sure only one (big) I/O
573 : * request is sent to the disk.
574 : * XXX This should probably be dropped and breadn should instead be optimized
575 : * XXX to do fewer I/O requests.
576 : */
577 : int
578 0 : bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
579 : {
580 : struct buf *bp, **xbpp;
581 0 : int howmany, maxra, i, inc;
582 0 : daddr_t sblkno;
583 :
584 0 : *rbpp = bio_doread(vp, blkno, size, 0);
585 :
586 : /*
587 : * If the buffer is in the cache skip any I/O operation.
588 : */
589 0 : if (ISSET((*rbpp)->b_flags, B_CACHE))
590 : goto out;
591 :
592 0 : if (size != round_page(size))
593 : goto out;
594 :
595 0 : if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
596 : goto out;
597 :
598 0 : maxra++;
599 0 : if (sblkno == -1 || maxra < 2)
600 : goto out;
601 :
602 0 : howmany = MAXPHYS / size;
603 0 : if (howmany > maxra)
604 0 : howmany = maxra;
605 :
606 0 : xbpp = mallocarray(howmany + 1, sizeof(struct buf *), M_TEMP, M_NOWAIT);
607 0 : if (xbpp == NULL)
608 : goto out;
609 :
610 0 : for (i = howmany - 1; i >= 0; i--) {
611 : size_t sz;
612 :
613 : /*
614 : * First buffer allocates big enough size to cover what
615 : * all the other buffers need.
616 : */
617 0 : sz = i == 0 ? howmany * size : 0;
618 :
619 0 : xbpp[i] = buf_get(vp, blkno + i + 1, sz);
620 0 : if (xbpp[i] == NULL) {
621 0 : for (++i; i < howmany; i++) {
622 0 : SET(xbpp[i]->b_flags, B_INVAL);
623 0 : brelse(xbpp[i]);
624 : }
625 0 : free(xbpp, M_TEMP, 0);
626 0 : goto out;
627 : }
628 0 : }
629 :
630 0 : bp = xbpp[0];
631 :
632 0 : xbpp[howmany] = NULL;
633 :
634 0 : inc = btodb(size);
635 :
636 0 : for (i = 1; i < howmany; i++) {
637 0 : bcstats.pendingreads++;
638 0 : bcstats.numreads++;
639 : /*
640 : * We set B_DMA here because bp above will be B_DMA,
641 : * and we are playing buffer slice-n-dice games from
642 : * the memory allocated in bp.
643 : */
644 0 : SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
645 0 : xbpp[i]->b_blkno = sblkno + (i * inc);
646 0 : xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
647 0 : xbpp[i]->b_data = NULL;
648 0 : xbpp[i]->b_pobj = bp->b_pobj;
649 0 : xbpp[i]->b_poffs = bp->b_poffs + (i * size);
650 : }
651 :
652 0 : KASSERT(bp->b_lblkno == blkno + 1);
653 0 : KASSERT(bp->b_vp == vp);
654 :
655 0 : bp->b_blkno = sblkno;
656 0 : SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
657 :
658 0 : bp->b_saveaddr = (void *)xbpp;
659 0 : bp->b_iodone = bread_cluster_callback;
660 :
661 0 : bcstats.pendingreads++;
662 0 : bcstats.numreads++;
663 0 : VOP_STRATEGY(bp);
664 0 : curproc->p_ru.ru_inblock++;
665 :
666 : out:
667 0 : return (biowait(*rbpp));
668 0 : }
669 :
670 : /*
671 : * Block write. Described in Bach (p.56)
672 : */
673 : int
674 0 : bwrite(struct buf *bp)
675 : {
676 : int rv, async, wasdelayed, s;
677 : struct vnode *vp;
678 : struct mount *mp;
679 :
680 0 : vp = bp->b_vp;
681 0 : if (vp != NULL)
682 0 : mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
683 : else
684 : mp = NULL;
685 :
686 : /*
687 : * Remember buffer type, to switch on it later. If the write was
688 : * synchronous, but the file system was mounted with MNT_ASYNC,
689 : * convert it to a delayed write.
690 : * XXX note that this relies on delayed tape writes being converted
691 : * to async, not sync writes (which is safe, but ugly).
692 : */
693 0 : async = ISSET(bp->b_flags, B_ASYNC);
694 0 : if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
695 0 : bdwrite(bp);
696 0 : return (0);
697 : }
698 :
699 : /*
700 : * Collect statistics on synchronous and asynchronous writes.
701 : * Writes to block devices are charged to their associated
702 : * filesystem (if any).
703 : */
704 0 : if (mp != NULL) {
705 0 : if (async)
706 0 : mp->mnt_stat.f_asyncwrites++;
707 : else
708 0 : mp->mnt_stat.f_syncwrites++;
709 : }
710 0 : bcstats.pendingwrites++;
711 0 : bcstats.numwrites++;
712 :
713 0 : wasdelayed = ISSET(bp->b_flags, B_DELWRI);
714 0 : CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
715 :
716 0 : s = splbio();
717 :
718 : /*
719 : * If not synchronous, pay for the I/O operation and make
720 : * sure the buf is on the correct vnode queue. We have
721 : * to do this now, because if we don't, the vnode may not
722 : * be properly notified that its I/O has completed.
723 : */
724 0 : if (wasdelayed) {
725 0 : reassignbuf(bp);
726 0 : } else
727 0 : curproc->p_ru.ru_oublock++;
728 :
729 :
730 : /* Initiate disk write. Make sure the appropriate party is charged. */
731 0 : bp->b_vp->v_numoutput++;
732 0 : splx(s);
733 0 : buf_flip_dma(bp);
734 0 : SET(bp->b_flags, B_WRITEINPROG);
735 0 : VOP_STRATEGY(bp);
736 :
737 : /*
738 : * If the queue is above the high water mark, wait till
739 : * the number of outstanding write bufs drops below the low
740 : * water mark.
741 : */
742 0 : if (bp->b_bq)
743 0 : bufq_wait(bp->b_bq);
744 :
745 0 : if (async)
746 0 : return (0);
747 :
748 : /*
749 : * If I/O was synchronous, wait for it to complete.
750 : */
751 0 : rv = biowait(bp);
752 :
753 : /* Release the buffer. */
754 0 : brelse(bp);
755 :
756 0 : return (rv);
757 0 : }
758 :
759 :
760 : /*
761 : * Delayed write.
762 : *
763 : * The buffer is marked dirty, but is not queued for I/O.
764 : * This routine should be used when the buffer is expected
765 : * to be modified again soon, typically a small write that
766 : * partially fills a buffer.
767 : *
768 : * NB: magnetic tapes cannot be delayed; they must be
769 : * written in the order that the writes are requested.
770 : *
771 : * Described in Leffler, et al. (pp. 208-213).
772 : */
773 : void
774 0 : bdwrite(struct buf *bp)
775 : {
776 : int s;
777 :
778 : /*
779 : * If the block hasn't been seen before:
780 : * (1) Mark it as having been seen,
781 : * (2) Charge for the write,
782 : * (3) Make sure it's on its vnode's correct block list,
783 : * (4) If a buffer is rewritten, move it to the end of the dirty list.
784 : */
785 0 : if (!ISSET(bp->b_flags, B_DELWRI)) {
786 0 : SET(bp->b_flags, B_DELWRI);
787 0 : s = splbio();
788 0 : buf_flip_dma(bp);
789 0 : reassignbuf(bp);
790 0 : splx(s);
791 0 : curproc->p_ru.ru_oublock++; /* XXX */
792 0 : }
793 :
794 : /* The "write" is done, so mark and release the buffer. */
795 0 : CLR(bp->b_flags, B_NEEDCOMMIT);
796 0 : SET(bp->b_flags, B_DONE);
797 0 : brelse(bp);
798 0 : }
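
As a sketch of how the delayed-write path is typically used: read the block with bread(), modify it in place, then either bwrite() it synchronously or hand it back with bdwrite() so the cleaner or syncer pushes it out later. The helper name update_one_block is invented for illustration and is not part of this file.

/* Illustrative only; not part of this file. */
int
update_one_block(struct vnode *vp, daddr_t lbn, int bsize, int sync)
{
    struct buf *bp;
    int error;

    error = bread(vp, lbn, bsize, &bp);
    if (error) {
        brelse(bp);
        return (error);
    }

    /* ... modify bp->b_data in place ... */

    if (sync)
        return (bwrite(bp));    /* waits for the I/O and releases bp */
    bdwrite(bp);                /* marks B_DELWRI and releases bp */
    return (0);
}
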
799 :
800 : /*
801 : * Asynchronous block write; just an asynchronous bwrite().
802 : */
803 : void
804 0 : bawrite(struct buf *bp)
805 : {
806 :
807 0 : SET(bp->b_flags, B_ASYNC);
808 0 : VOP_BWRITE(bp);
809 0 : }
810 :
811 : /*
812 : * Must be called at splbio()
813 : */
814 : void
815 0 : buf_dirty(struct buf *bp)
816 : {
817 0 : splassert(IPL_BIO);
818 :
819 : #ifdef DIAGNOSTIC
820 0 : if (!ISSET(bp->b_flags, B_BUSY))
821 0 : panic("Trying to dirty buffer on freelist!");
822 : #endif
823 :
824 0 : if (ISSET(bp->b_flags, B_DELWRI) == 0) {
825 0 : SET(bp->b_flags, B_DELWRI);
826 0 : buf_flip_dma(bp);
827 0 : reassignbuf(bp);
828 0 : }
829 0 : }
830 :
831 : /*
832 : * Must be called at splbio()
833 : */
834 : void
835 0 : buf_undirty(struct buf *bp)
836 : {
837 0 : splassert(IPL_BIO);
838 :
839 : #ifdef DIAGNOSTIC
840 0 : if (!ISSET(bp->b_flags, B_BUSY))
841 0 : panic("Trying to undirty buffer on freelist!");
842 : #endif
843 0 : if (ISSET(bp->b_flags, B_DELWRI)) {
844 0 : CLR(bp->b_flags, B_DELWRI);
845 0 : reassignbuf(bp);
846 0 : }
847 0 : }
848 :
849 : /*
850 : * Release a buffer on to the free lists.
851 : * Described in Bach (p. 46).
852 : */
853 : void
854 0 : brelse(struct buf *bp)
855 : {
856 : int s;
857 :
858 0 : s = splbio();
859 :
860 0 : if (bp->b_data != NULL)
861 0 : KASSERT(bp->b_bufsize > 0);
862 :
863 : /*
864 : * Determine which queue the buffer should be on, then put it there.
865 : */
866 :
867 : /* If it's not cacheable, or an error, mark it invalid. */
868 0 : if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
869 0 : SET(bp->b_flags, B_INVAL);
870 :
871 0 : if (ISSET(bp->b_flags, B_INVAL)) {
872 : /*
873 : * If the buffer is invalid, free it now rather than leaving
874 : * it in a queue and wasting memory.
875 : */
876 0 : if (LIST_FIRST(&bp->b_dep) != NULL)
877 0 : buf_deallocate(bp);
878 :
879 0 : if (ISSET(bp->b_flags, B_DELWRI)) {
880 0 : CLR(bp->b_flags, B_DELWRI);
881 0 : }
882 :
883 0 : if (bp->b_vp) {
884 0 : RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
885 0 : brelvp(bp);
886 0 : }
887 0 : bp->b_vp = NULL;
888 :
889 : /*
890 : * Wake up any processes waiting for _this_ buffer to
891 : * become free. They are not allowed to grab it
892 : * since it will be freed. But the only sleeper is
893 : * getblk and it will restart the operation after
894 : * sleep.
895 : */
896 0 : if (ISSET(bp->b_flags, B_WANTED)) {
897 0 : CLR(bp->b_flags, B_WANTED);
898 0 : wakeup(bp);
899 0 : }
900 0 : buf_put(bp);
901 0 : } else {
902 : /*
903 : * It has valid data. Put it on the end of the appropriate
904 : * queue, so that it'll stick around for as long as possible.
905 : */
906 0 : bufcache_release(bp);
907 :
908 : /* Unlock the buffer. */
909 0 : CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
910 0 : buf_release(bp);
911 :
912 : /* Wake up any processes waiting for _this_ buffer to
913 : * become free. */
914 0 : if (ISSET(bp->b_flags, B_WANTED)) {
915 0 : CLR(bp->b_flags, B_WANTED);
916 0 : wakeup(bp);
917 0 : }
918 : }
919 :
920 : /* Wake up syncer and cleaner processes waiting for buffers. */
921 0 : if (nobuffers) {
922 0 : nobuffers = 0;
923 0 : wakeup(&nobuffers);
924 0 : }
925 :
926 : /* Wake up any processes waiting for any buffer to become free. */
927 0 : if (needbuffer && bcstats.dmapages < targetpages &&
928 0 : bcstats.kvaslots_avail > RESERVE_SLOTS) {
929 0 : needbuffer = 0;
930 0 : wakeup(&needbuffer);
931 0 : }
932 :
933 0 : splx(s);
934 0 : }
935 :
936 : /*
937 : * Determine if a block is in the cache. Look it up in the vnode's buffer
938 : * tree. If it's there, return a pointer to it, unless it's marked invalid.
939 : */
940 : struct buf *
941 0 : incore(struct vnode *vp, daddr_t blkno)
942 : {
943 : struct buf *bp;
944 0 : struct buf b;
945 : int s;
946 :
947 0 : s = splbio();
948 :
949 : /* Search buf lookup tree */
950 0 : b.b_lblkno = blkno;
951 0 : bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
952 0 : if (bp != NULL && ISSET(bp->b_flags, B_INVAL))
953 0 : bp = NULL;
954 :
955 0 : splx(s);
956 0 : return (bp);
957 0 : }
958 :
959 : /*
960 : * Get a block of requested size that is associated with
961 : * a given vnode and block offset. If it is found in the
962 : * block cache, mark it as having been found, make it busy
963 : * and return it. Otherwise, return an empty block of the
964 : * correct size. It is up to the caller to ensure that the
965 : * cached blocks are of the correct size.
966 : */
967 : struct buf *
968 0 : getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
969 : {
970 : struct buf *bp;
971 0 : struct buf b;
972 0 : int s, error;
973 :
974 : /*
975 : * XXX
976 : * The following is an inlined version of 'incore()', but with
977 : * the 'invalid' test moved to after the 'busy' test. It's
978 : * necessary because there are some cases in which the NFS
979 : * code sets B_INVAL prior to writing data to the server, but
980 : * in which the buffers actually contain valid data. In this
981 : * case, we can't allow the system to allocate a new buffer for
982 : * the block until the write is finished.
983 : */
984 : start:
985 0 : s = splbio();
986 0 : b.b_lblkno = blkno;
987 0 : bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
988 0 : if (bp != NULL) {
989 0 : if (ISSET(bp->b_flags, B_BUSY)) {
990 0 : SET(bp->b_flags, B_WANTED);
991 0 : error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
992 : slptimeo);
993 0 : splx(s);
994 0 : if (error)
995 0 : return (NULL);
996 0 : goto start;
997 : }
998 :
999 0 : if (!ISSET(bp->b_flags, B_INVAL)) {
1000 0 : bcstats.cachehits++;
1001 0 : SET(bp->b_flags, B_CACHE);
1002 0 : bufcache_take(bp);
1003 0 : buf_acquire(bp);
1004 0 : splx(s);
1005 0 : return (bp);
1006 : }
1007 : }
1008 0 : splx(s);
1009 :
1010 0 : if ((bp = buf_get(vp, blkno, size)) == NULL)
1011 0 : goto start;
1012 :
1013 0 : return (bp);
1014 0 : }
1015 :
1016 : /*
1017 : * Get an empty, disassociated buffer of given size.
1018 : */
1019 : struct buf *
1020 0 : geteblk(size_t size)
1021 : {
1022 : struct buf *bp;
1023 :
1024 0 : while ((bp = buf_get(NULL, 0, size)) == NULL)
1025 0 : continue;
1026 :
1027 0 : return (bp);
1028 : }
1029 :
1030 : /*
1031 : * Allocate a buffer.
1032 : * If vp is given, put it into the buffer cache for that vnode.
1033 : * If size != 0, allocate memory and call buf_map().
1034 : * If there is already a buffer for the given vnode/blkno, return NULL.
1035 : */
1036 : struct buf *
1037 0 : buf_get(struct vnode *vp, daddr_t blkno, size_t size)
1038 : {
1039 : struct buf *bp;
1040 0 : int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
1041 : int npages;
1042 : int s;
1043 :
1044 0 : s = splbio();
1045 0 : if (size) {
1046 : /*
1047 : * Wake up the cleaner if we have lots of dirty pages,
1048 : * or if we are getting low on buffer cache kva.
1049 : */
1050 0 : if (UNCLEAN_PAGES >= hidirtypages ||
1051 0 : bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
1052 0 : wakeup(&bd_req);
1053 :
1054 0 : npages = atop(round_page(size));
1055 :
1056 : /*
1057 : * if our cache has been previously shrunk,
1058 : * allow it to grow again with use up to
1059 : * bufhighpages (cachepercent)
1060 : */
1061 0 : if (bufpages < bufhighpages)
1062 0 : bufadjust(bufhighpages);
1063 :
1064 : /*
1065 : * If we would go over the page target with our
1066 : * new allocation, free enough buffers first
1067 : * to stay at the target with our new allocation.
1068 : */
1069 0 : while ((bcstats.dmapages + npages > targetpages) &&
1070 0 : (bp = bufcache_getdmacleanbuf())) {
1071 0 : bufcache_take(bp);
1072 0 : if (bp->b_vp) {
1073 0 : RBT_REMOVE(buf_rb_bufs,
1074 : &bp->b_vp->v_bufs_tree, bp);
1075 0 : brelvp(bp);
1076 0 : }
1077 0 : buf_put(bp);
1078 : }
1079 :
1080 : /*
1081 : * If we get here, we tried to free the world down
1082 : * above, and couldn't get down - Wake the cleaner
1083 : * and wait for it to push some buffers out.
1084 : */
1085 0 : if ((bcstats.dmapages + npages > targetpages ||
1086 0 : bcstats.kvaslots_avail <= RESERVE_SLOTS) &&
1087 0 : curproc != syncerproc && curproc != cleanerproc) {
1088 0 : wakeup(&bd_req);
1089 0 : needbuffer++;
1090 0 : tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
1091 0 : splx(s);
1092 0 : return (NULL);
1093 : }
1094 0 : if (bcstats.dmapages + npages > bufpages) {
1095 : /* cleaner or syncer */
1096 0 : nobuffers = 1;
1097 0 : tsleep(&nobuffers, PRIBIO, "nobuffers", 0);
1098 0 : splx(s);
1099 0 : return (NULL);
1100 : }
1101 : }
1102 :
1103 0 : bp = pool_get(&bufpool, poolwait|PR_ZERO);
1104 :
1105 0 : if (bp == NULL) {
1106 0 : splx(s);
1107 0 : return (NULL);
1108 : }
1109 :
1110 0 : bp->b_freelist.tqe_next = NOLIST;
1111 0 : bp->b_dev = NODEV;
1112 0 : LIST_INIT(&bp->b_dep);
1113 0 : bp->b_bcount = size;
1114 :
1115 0 : buf_acquire_nomap(bp);
1116 :
1117 0 : if (vp != NULL) {
1118 : /*
1119 : * We insert the buffer into the hash with B_BUSY set
1120 : * while we allocate pages for it. This way any getblk
1121 : * that happens while we allocate pages will wait for
1122 : * this buffer instead of starting its own buf_get.
1123 : *
1124 : * But first, we check if someone beat us to it.
1125 : */
1126 0 : if (incore(vp, blkno)) {
1127 0 : pool_put(&bufpool, bp);
1128 0 : splx(s);
1129 0 : return (NULL);
1130 : }
1131 :
1132 0 : bp->b_blkno = bp->b_lblkno = blkno;
1133 0 : bgetvp(vp, bp);
1134 0 : if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
1135 0 : panic("buf_get: dup lblk vp %p bp %p", vp, bp);
1136 : } else {
1137 0 : bp->b_vnbufs.le_next = NOLIST;
1138 0 : SET(bp->b_flags, B_INVAL);
1139 0 : bp->b_vp = NULL;
1140 : }
1141 :
1142 0 : LIST_INSERT_HEAD(&bufhead, bp, b_list);
1143 0 : bcstats.numbufs++;
1144 :
1145 0 : if (size) {
1146 0 : buf_alloc_pages(bp, round_page(size));
1147 0 : KASSERT(ISSET(bp->b_flags, B_DMA));
1148 0 : buf_map(bp);
1149 0 : }
1150 :
1151 0 : SET(bp->b_flags, B_BC);
1152 0 : splx(s);
1153 :
1154 0 : return (bp);
1155 0 : }
1156 :
1157 : /*
1158 : * Buffer cleaning daemon.
1159 : */
1160 : void
1161 0 : buf_daemon(void *arg)
1162 : {
1163 : struct buf *bp = NULL;
1164 : int s, pushed = 0;
1165 :
1166 0 : s = splbio();
1167 0 : for (;;) {
1168 0 : if (bp == NULL || (pushed >= 16 &&
1169 0 : UNCLEAN_PAGES < hidirtypages &&
1170 0 : bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){
1171 : pushed = 0;
1172 : /*
1173 : * Wake up anyone who was waiting for buffers
1174 : * to be released.
1175 : */
1176 0 : if (needbuffer) {
1177 0 : needbuffer = 0;
1178 0 : wakeup(&needbuffer);
1179 0 : }
1180 0 : tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
1181 0 : }
1182 :
1183 0 : while ((bp = bufcache_getdirtybuf())) {
1184 :
1185 0 : if (UNCLEAN_PAGES < lodirtypages &&
1186 0 : bcstats.kvaslots_avail > 2 * RESERVE_SLOTS &&
1187 0 : pushed >= 16)
1188 : break;
1189 :
1190 0 : bufcache_take(bp);
1191 0 : buf_acquire(bp);
1192 0 : splx(s);
1193 :
1194 0 : if (ISSET(bp->b_flags, B_INVAL)) {
1195 0 : brelse(bp);
1196 0 : s = splbio();
1197 0 : continue;
1198 : }
1199 : #ifdef DIAGNOSTIC
1200 0 : if (!ISSET(bp->b_flags, B_DELWRI))
1201 0 : panic("Clean buffer on dirty queue");
1202 : #endif
1203 0 : if (LIST_FIRST(&bp->b_dep) != NULL &&
1204 0 : !ISSET(bp->b_flags, B_DEFERRED) &&
1205 0 : buf_countdeps(bp, 0, 0)) {
1206 0 : SET(bp->b_flags, B_DEFERRED);
1207 0 : s = splbio();
1208 0 : bufcache_release(bp);
1209 0 : buf_release(bp);
1210 0 : continue;
1211 : }
1212 :
1213 0 : bawrite(bp);
1214 0 : pushed++;
1215 :
1216 0 : sched_pause(yield);
1217 :
1218 0 : s = splbio();
1219 : }
1220 : }
1221 : }
1222 :
1223 : /*
1224 : * Wait for operations on the buffer to complete.
1225 : * When they do, extract and return the I/O's error value.
1226 : */
1227 : int
1228 0 : biowait(struct buf *bp)
1229 : {
1230 : int s;
1231 :
1232 0 : KASSERT(!(bp->b_flags & B_ASYNC));
1233 :
1234 0 : s = splbio();
1235 0 : while (!ISSET(bp->b_flags, B_DONE))
1236 0 : tsleep(bp, PRIBIO + 1, "biowait", 0);
1237 0 : splx(s);
1238 :
1239 : /* check for interruption of I/O (e.g. via NFS), then errors. */
1240 0 : if (ISSET(bp->b_flags, B_EINTR)) {
1241 0 : CLR(bp->b_flags, B_EINTR);
1242 0 : return (EINTR);
1243 : }
1244 :
1245 0 : if (ISSET(bp->b_flags, B_ERROR))
1246 0 : return (bp->b_error ? bp->b_error : EIO);
1247 : else
1248 0 : return (0);
1249 0 : }
1250 :
1251 : /*
1252 : * Mark I/O complete on a buffer.
1253 : *
1254 : * If a callback has been requested, e.g. the pageout
1255 : * daemon, do so. Otherwise, awaken waiting processes.
1256 : *
1257 : * [ Leffler, et al., says on p.247:
1258 : * "This routine wakes up the blocked process, frees the buffer
1259 : * for an asynchronous write, or, for a request by the pagedaemon
1260 : * process, invokes a procedure specified in the buffer structure" ]
1261 : *
1262 : * In real life, the pagedaemon (or other system processes) wants
1263 : * to do async stuff too, and doesn't want the buffer brelse()'d.
1264 : * (for swap pager, that puts swap buffers on the free lists (!!!),
1265 : * for the vn device, that puts malloc'd buffers on the free lists!)
1266 : *
1267 : * Must be called at splbio().
1268 : */
1269 : void
1270 0 : biodone(struct buf *bp)
1271 : {
1272 0 : splassert(IPL_BIO);
1273 :
1274 0 : if (ISSET(bp->b_flags, B_DONE))
1275 0 : panic("biodone already");
1276 0 : SET(bp->b_flags, B_DONE); /* note that it's done */
1277 :
1278 0 : if (bp->b_bq)
1279 0 : bufq_done(bp->b_bq, bp);
1280 :
1281 0 : if (LIST_FIRST(&bp->b_dep) != NULL)
1282 0 : buf_complete(bp);
1283 :
1284 0 : if (!ISSET(bp->b_flags, B_READ)) {
1285 0 : CLR(bp->b_flags, B_WRITEINPROG);
1286 0 : vwakeup(bp->b_vp);
1287 0 : }
1288 0 : if (bcstats.numbufs &&
1289 0 : (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) {
1290 0 : if (!ISSET(bp->b_flags, B_READ)) {
1291 0 : bcstats.pendingwrites--;
1292 0 : } else
1293 0 : bcstats.pendingreads--;
1294 : }
1295 0 : if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1296 0 : CLR(bp->b_flags, B_CALL); /* but note callout done */
1297 0 : (*bp->b_iodone)(bp);
1298 0 : } else {
1299 0 : if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
1300 0 : brelse(bp);
1301 0 : } else { /* or just wakeup the buffer */
1302 0 : CLR(bp->b_flags, B_WANTED);
1303 0 : wakeup(bp);
1304 : }
1305 : }
1306 0 : }
1307 :
1308 : #ifdef DDB
1309 : void bcstats_print(int (*)(const char *, ...)
1310 : __attribute__((__format__(__kprintf__,1,2))));
1311 : /*
1312 : * bcstats_print: ddb hook to print interesting buffer cache counters
1313 : */
1314 : void
1315 0 : bcstats_print(
1316 : int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1317 : {
1318 0 : (*pr)("Current Buffer Cache status:\n");
1319 0 : (*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
1320 0 : bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
1321 0 : (*pr)("kvaslots %lld avail kva slots %lld\n",
1322 0 : bcstats.kvaslots, bcstats.kvaslots_avail);
1323 0 : (*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n",
1324 0 : bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
1325 0 : (*pr)("pendingreads %lld, pendingwrites %lld\n",
1326 0 : bcstats.pendingreads, bcstats.pendingwrites);
1327 0 : (*pr)("highflips %lld, highflops %lld, dmaflips %lld\n",
1328 0 : bcstats.highflips, bcstats.highflops, bcstats.dmaflips);
1329 0 : }
1330 : #endif
1331 :
1332 : void
1333 0 : buf_adjcnt(struct buf *bp, long ncount)
1334 : {
1335 0 : KASSERT(ncount <= bp->b_bufsize);
1336 0 : bp->b_bcount = ncount;
1337 0 : }
1338 :
1339 : /* bufcache freelist code below */
1340 : /*
1341 : * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
1342 : *
1343 : * Permission to use, copy, modify, and distribute this software for any
1344 : * purpose with or without fee is hereby granted, provided that the above
1345 : * copyright notice and this permission notice appear in all copies.
1346 : *
1347 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1348 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1349 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1350 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1351 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1352 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1353 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1354 : */
1355 :
1356 : /*
1357 : * The code below implements a variant of the 2Q buffer cache algorithm by
1358 : * Johnson and Shasha.
1359 : *
1360 : * General Outline
1361 : * We divide the buffer cache into three working sets: current, previous,
1362 : * and long term. Each list is itself LRU and buffers get promoted and moved
1363 : * around between them. A buffer starts its life in the current working set.
1364 : * As time passes and newer buffers push it out, it will turn into the previous
1365 : * working set and is subject to recycling. But if it's accessed again from
1366 : * the previous working set, that's an indication that it's actually in the
1367 : * long term working set, so we promote it there. The separation of current
1368 : * and previous working sets prevents us from promoting a buffer that's only
1369 : * temporarily hot to the long term cache.
1370 : *
1371 : * The objective is to provide scan resistance by making the long term
1372 : * working set ineligible for immediate recycling, even as the current
1373 : * working set is rapidly turned over.
1374 : *
1375 : * Implementation
1376 : * The code below identifies the current, previous, and long term sets as
1377 : * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
1378 : * 1/3 of the total clean pages, after which point they start pushing their
1379 : * oldest buffers into coldqueue.
1380 : * A buf always starts out with neither the B_WARM nor B_COLD flag set (implying HOT).
1381 : * When released, it will be returned to the tail of the hotqueue list.
1382 : * When the hotqueue gets too large, the oldest hot buf will be moved to the
1383 : * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
1384 : * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
1385 : * directly returned to the end of the warmqueue. As with the hotqueue, when
1386 : * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
1387 : *
1388 : * Note that this design does still support large working sets, greater
1389 : * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
1390 : * cached and has no maximum length. The hot and warm queues form a Y feeding
1391 : * into the coldqueue. Moving bufs between queues is constant time, so this
1392 : * design decays to one long warm->cold queue.
1393 : *
1394 : * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
1395 : * is Am. We always cache pages, as opposed to pointers to pages for A1.
1396 : *
1397 : * This implementation adds support for multiple 2q caches.
1398 : *
1399 : * If we have more than one 2q cache, as bufs fall off the cold queue
1400 : * for recycling, bufs that have been warm before (which retain the
1401 : * B_WARM flag in addition to B_COLD) can be put into the hot queue of
1402 : * a second level 2Q cache. Buffers that are only B_COLD are
1403 : * recycled. Bufs falling off the last cache's cold queue are always
1404 : * recycled.
1405 : *
1406 : */
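
To make the hot/warm/cold life cycle described above concrete, here is a minimal, self-contained userspace sketch of the single-cache case. It models only the flag transitions and queue movements from the comment (new buffers enter the hot queue, overflow chills the oldest buffer onto the cold queue, and a re-released cold buffer becomes warm). The names mini_buf and release_buf and the tiny fixed queue limit are invented for this illustration; nothing here is kernel code.

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

#define F_WARM  0x1
#define F_COLD  0x2

struct mini_buf {
    TAILQ_ENTRY(mini_buf) link;
    int id;
    int flags;
};
TAILQ_HEAD(bq, mini_buf);

static struct bq hotq = TAILQ_HEAD_INITIALIZER(hotq);
static struct bq warmq = TAILQ_HEAD_INITIALIZER(warmq);
static struct bq coldq = TAILQ_HEAD_INITIALIZER(coldq);
static int hotcnt, warmcnt;
static const int qlimit = 3;    /* stand-in for the page-based caps */

/* An overflowing hot or warm queue pushes its oldest buffer onto the cold queue. */
static void
chill(struct bq *q, int *cnt)
{
    struct mini_buf *bp;

    while (*cnt > qlimit) {
        bp = TAILQ_FIRST(q);
        TAILQ_REMOVE(q, bp, link);
        (*cnt)--;
        bp->flags |= F_COLD;    /* the warm flag, if set, is kept */
        TAILQ_INSERT_TAIL(&coldq, bp, link);
    }
}

/* Model of release: hot if the buffer has never aged, warm once it has been cold. */
static void
release_buf(struct mini_buf *bp)
{
    if (bp->flags & (F_WARM | F_COLD)) {
        bp->flags = (bp->flags | F_WARM) & ~F_COLD;
        TAILQ_INSERT_TAIL(&warmq, bp, link);
        warmcnt++;
        chill(&warmq, &warmcnt);
    } else {
        TAILQ_INSERT_TAIL(&hotq, bp, link);
        hotcnt++;
        chill(&hotq, &hotcnt);
    }
}

int
main(void)
{
    struct mini_buf *bufs = calloc(8, sizeof(*bufs));
    int i;

    for (i = 0; i < 8; i++) {
        bufs[i].id = i;
        release_buf(&bufs[i]);  /* new buffers enter the hot queue */
    }
    /* Buffers 0..4 have been chilled; touching one again makes it warm. */
    TAILQ_REMOVE(&coldq, &bufs[0], link);   /* "reuse" buffer 0 */
    release_buf(&bufs[0]);

    printf("buffer 0 is now: %s\n",
        (bufs[0].flags & F_WARM) ? "warm" : "hot/cold");
    free(bufs);
    return 0;
}
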
1407 :
1408 : /*
1409 : * This function is called when a hot or warm queue may have exceeded its
1410 : * size limit. It will move a buf to the coldqueue.
1411 : */
1412 : int chillbufs(struct
1413 : bufcache *cache, struct bufqueue *queue, int64_t *queuepages);
1414 :
1415 : void
1416 0 : bufcache_init(void)
1417 : {
1418 : int i;
1419 0 : for (i=0; i < NUM_CACHES; i++) {
1420 0 : TAILQ_INIT(&cleancache[i].hotqueue);
1421 0 : TAILQ_INIT(&cleancache[i].coldqueue);
1422 0 : TAILQ_INIT(&cleancache[i].warmqueue);
1423 : }
1424 0 : TAILQ_INIT(&dirtyqueue);
1425 0 : }
1426 :
1427 : /*
1428 : * if the buffer caches have shrunk, we may need to rebalance our queues.
1429 : */
1430 : void
1431 0 : bufcache_adjust(void)
1432 : {
1433 : int i;
1434 0 : for (i=0; i < NUM_CACHES; i++) {
1435 0 : while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
1436 0 : &cleancache[i].warmbufpages) ||
1437 0 : chillbufs(&cleancache[i], &cleancache[i].hotqueue,
1438 0 : &cleancache[i].hotbufpages))
1439 0 : continue;
1440 : }
1441 0 : }
1442 :
1443 : /*
1444 : * Get a clean buffer from the cache. If "discard" is set, do not promote
1445 : * previously warm buffers as we normally would, because we are tossing
1446 : * everything away, such as during hibernation.
1447 : */
1448 : struct buf *
1449 0 : bufcache_getcleanbuf(int cachenum, int discard)
1450 : {
1451 : struct buf *bp = NULL;
1452 0 : struct bufcache *cache = &cleancache[cachenum];
1453 :
1454 0 : splassert(IPL_BIO);
1455 :
1456 : /* try cold queue */
1457 0 : while ((bp = TAILQ_FIRST(&cache->coldqueue))) {
1458 0 : if ((!discard) &&
1459 0 : cachenum < NUM_CACHES - 1 && ISSET(bp->b_flags, B_WARM)) {
1460 0 : int64_t pages = atop(bp->b_bufsize);
1461 : struct bufcache *newcache;
1462 :
1463 0 : KASSERT(bp->cache == cachenum);
1464 :
1465 : /*
1466 : * If this buffer was warm before, move it to
1467 : * the hot queue in the next cache
1468 : */
1469 :
1470 0 : if (fliphigh) {
1471 : /*
1472 : * If we are in the DMA cache, try to flip the
1473 : * buffer up high to move it on to the other
1474 : * caches. If we can't move the buffer to high
1475 : * memory without sleeping, we give up and
1476 : * return it rather than fight for more memory
1477 : * against non-buffer-cache competitors.
1478 : */
1479 0 : SET(bp->b_flags, B_BUSY);
1480 0 : if (bp->cache == 0 && buf_flip_high(bp) == -1) {
1481 0 : CLR(bp->b_flags, B_BUSY);
1482 0 : return bp;
1483 : }
1484 0 : CLR(bp->b_flags, B_BUSY);
1485 0 : }
1486 :
1487 : /* Move the buffer to the hot queue in the next cache */
1488 0 : TAILQ_REMOVE(&cache->coldqueue, bp, b_freelist);
1489 0 : CLR(bp->b_flags, B_WARM);
1490 0 : CLR(bp->b_flags, B_COLD);
1491 0 : bp->cache++;
1492 0 : newcache= &cleancache[bp->cache];
1493 0 : newcache->cachepages += pages;
1494 0 : newcache->hotbufpages += pages;
1495 0 : chillbufs(newcache, &newcache->hotqueue,
1496 : &newcache->hotbufpages);
1497 0 : TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
1498 0 : }
1499 : else
1500 : /* buffer is cold - give it up */
1501 0 : return bp;
1502 : }
1503 0 : if ((bp = TAILQ_FIRST(&cache->warmqueue)))
1504 0 : return bp;
1505 0 : if ((bp = TAILQ_FIRST(&cache->hotqueue)))
1506 0 : return bp;
1507 : return bp;
1508 0 : }
1509 :
1510 : struct buf *
1511 0 : bufcache_getcleanbuf_range(int start, int end, int discard)
1512 : {
1513 : int i, j = start, q = end;
1514 : struct buf *bp = NULL;
1515 :
1516 : /*
1517 : * XXX In theory we could promote warm buffers into a previous queue,
1518 : * so in the pathological case where we go through all the caches
1519 : * without getting a buffer we have to start at the beginning again.
1520 : */
1521 0 : while (j <= q) {
1522 0 : for (i = q; i >= j; i--)
1523 0 : if ((bp = bufcache_getcleanbuf(i, discard)))
1524 0 : return(bp);
1525 0 : j++;
1526 : }
1527 0 : return bp;
1528 0 : }
1529 :
1530 : struct buf *
1531 0 : bufcache_gethighcleanbuf(void)
1532 : {
1533 0 : if (!fliphigh)
1534 0 : return NULL;
1535 0 : return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0);
1536 0 : }
1537 :
1538 : struct buf *
1539 0 : bufcache_getdmacleanbuf(void)
1540 : {
1541 0 : if (fliphigh)
1542 0 : return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0);
1543 0 : return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
1544 0 : }
1545 :
1546 : struct buf *
1547 0 : bufcache_getdirtybuf(void)
1548 : {
1549 0 : return TAILQ_FIRST(&dirtyqueue);
1550 : }
1551 :
1552 : void
1553 0 : bufcache_take(struct buf *bp)
1554 : {
1555 : struct bufqueue *queue;
1556 : int64_t pages;
1557 :
1558 0 : splassert(IPL_BIO);
1559 0 : KASSERT(ISSET(bp->b_flags, B_BC));
1560 0 : KASSERT(bp->cache >= DMA_CACHE);
1561 0 : KASSERT((bp->cache < NUM_CACHES));
1562 :
1563 0 : pages = atop(bp->b_bufsize);
1564 0 : struct bufcache *cache = &cleancache[bp->cache];
1565 0 : if (!ISSET(bp->b_flags, B_DELWRI)) {
1566 0 : if (ISSET(bp->b_flags, B_COLD)) {
1567 0 : queue = &cache->coldqueue;
1568 0 : } else if (ISSET(bp->b_flags, B_WARM)) {
1569 0 : queue = &cache->warmqueue;
1570 0 : cache->warmbufpages -= pages;
1571 0 : } else {
1572 0 : queue = &cache->hotqueue;
1573 0 : cache->hotbufpages -= pages;
1574 : }
1575 0 : bcstats.numcleanpages -= pages;
1576 0 : cache->cachepages -= pages;
1577 0 : } else {
1578 : queue = &dirtyqueue;
1579 0 : bcstats.numdirtypages -= pages;
1580 0 : bcstats.delwribufs--;
1581 : }
1582 0 : TAILQ_REMOVE(queue, bp, b_freelist);
1583 0 : }
1584 :
1585 : /* Move a buffer from a hot or warm queue to the cold queue in a cache */
1586 : int
1587 0 : chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
1588 : {
1589 : struct buf *bp;
1590 : int64_t limit, pages;
1591 :
1592 : /*
1593 : * We limit the hot queue to be small, with a max of 4096 pages.
1594 : * We limit the warm queue to half the cache size.
1595 : *
1596 : * We impose a minimum size of 96 to prevent too much "wobbling".
1597 : */
1598 0 : if (queue == &cache->hotqueue)
1599 0 : limit = min(cache->cachepages / 20, 4096);
1600 0 : else if (queue == &cache->warmqueue)
1601 0 : limit = (cache->cachepages / 2);
1602 : else
1603 0 : panic("chillbufs: invalid queue");
1604 :
1605 0 : if (*queuepages > 96 && *queuepages > limit) {
1606 0 : bp = TAILQ_FIRST(queue);
1607 0 : if (!bp)
1608 0 : panic("inconsistent bufpage counts");
1609 0 : pages = atop(bp->b_bufsize);
1610 0 : *queuepages -= pages;
1611 0 : TAILQ_REMOVE(queue, bp, b_freelist);
1612 : /* we do not clear B_WARM */
1613 0 : SET(bp->b_flags, B_COLD);
1614 0 : TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
1615 0 : return 1;
1616 : }
1617 0 : return 0;
1618 0 : }
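
As a worked example of the limits above, using hypothetical numbers: for a cache holding 100,000 clean pages, the hot queue is capped at min(100000 / 20, 4096) = 4096 pages and the warm queue at 100000 / 2 = 50,000 pages. The 96-page floor only matters for very small caches, where it keeps the queues from wobbling around the limit.
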
1619 :
1620 : void
1621 0 : bufcache_release(struct buf *bp)
1622 : {
1623 : struct bufqueue *queue;
1624 : int64_t pages;
1625 0 : struct bufcache *cache = &cleancache[bp->cache];
1626 :
1627 0 : pages = atop(bp->b_bufsize);
1628 0 : KASSERT(ISSET(bp->b_flags, B_BC));
1629 0 : if (fliphigh) {
1630 0 : if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0)
1631 0 : panic("B_DMA buffer release from cache %d",
1632 : bp->cache);
1633 0 : else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0)
1634 0 : panic("Non B_DMA buffer release from cache %d",
1635 : bp->cache);
1636 : }
1637 :
1638 0 : if (!ISSET(bp->b_flags, B_DELWRI)) {
1639 : int64_t *queuepages;
1640 0 : if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
1641 0 : SET(bp->b_flags, B_WARM);
1642 0 : CLR(bp->b_flags, B_COLD);
1643 0 : queue = &cache->warmqueue;
1644 0 : queuepages = &cache->warmbufpages;
1645 0 : } else {
1646 0 : queue = &cache->hotqueue;
1647 0 : queuepages = &cache->hotbufpages;
1648 : }
1649 0 : *queuepages += pages;
1650 0 : bcstats.numcleanpages += pages;
1651 0 : cache->cachepages += pages;
1652 0 : chillbufs(cache, queue, queuepages);
1653 0 : } else {
1654 : queue = &dirtyqueue;
1655 0 : bcstats.numdirtypages += pages;
1656 0 : bcstats.delwribufs++;
1657 : }
1658 0 : TAILQ_INSERT_TAIL(queue, bp, b_freelist);
1659 0 : }
1660 :
1661 : #ifdef HIBERNATE
1662 : /*
1663 : * Nuke the buffer cache from orbit when hibernating. We do not want to save
1664 : * any clean cache pages to swap and read them back; the original disk files
1665 : * are just as good.
1666 : */
1667 : void
1668 0 : hibernate_suspend_bufcache(void)
1669 : {
1670 : struct buf *bp;
1671 : int s;
1672 :
1673 0 : s = splbio();
1674 : /* Chuck away all the cache pages.. discard bufs, do not promote */
1675 0 : while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
1676 0 : bufcache_take(bp);
1677 0 : if (bp->b_vp) {
1678 0 : RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
1679 0 : brelvp(bp);
1680 0 : }
1681 0 : buf_put(bp);
1682 : }
1683 0 : splx(s);
1684 0 : }
1685 :
1686 : void
1687 0 : hibernate_resume_bufcache(void)
1688 : {
1689 : /* XXX Nothing needed here for now */
1690 0 : }
1691 : #endif /* HIBERNATE */