Line data Source code
1 : /* $OpenBSD: uvm_pmemrange.c,v 1.53 2016/09/16 02:52:24 dlg Exp $ */
2 :
3 : /*
4 : * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5 : *
6 : * Permission to use, copy, modify, and distribute this software for any
7 : * purpose with or without fee is hereby granted, provided that the above
8 : * copyright notice and this permission notice appear in all copies.
9 : *
10 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 : */
18 :
19 : #include <sys/param.h>
20 : #include <sys/systm.h>
21 : #include <uvm/uvm.h>
22 : #include <sys/malloc.h>
23 : #include <sys/kernel.h>
24 : #include <sys/kthread.h>
25 : #include <sys/mount.h>
26 :
27 : /*
28 : * 2 trees: addr tree and size tree.
29 : *
30 : * The allocator keeps chunks of free pages (called a range).
31 : * Two pages are part of the same range if:
32 : * - all pages in between are part of that range,
33 : * - they are of the same memory type (zeroed or non-zeroed),
34 : * - they are part of the same pmemrange.
35 : * A pmemrange is a range of memory which is part of the same vm_physseg
36 : * and has a use-count.
37 : *
38 : * addr tree is vm_page[0].objt
39 : * size tree is vm_page[1].objt
40 : *
41 : * The size tree is not used for memory ranges of 1 page; instead, such
42 : * single-page ranges are kept on the single queue, linked through vm_page[0].pageq.
43 : *
44 : * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45 : * are joined, unless:
46 : * - they have pages in between them which are not free
47 : * - they belong to different memtypes (zeroed vs dirty memory)
48 : * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49 : * - they are not a continuation of the same array
50 : * The latter issue is caused by vm_physseg ordering and splitting from the
51 : * MD initialization machinery. The MD code is dependent on freelists and
52 : * happens to split ISA memory from non-ISA memory.
53 : * (Note: freelists die die die!)
54 : *
55 : * uvm_page_init guarantees that every vm_physseg contains an array of
56 : * struct vm_page. Also, uvm_page_physload allocates an array of struct
57 : * vm_page. This code depends on that array. The array may break across
58 : * vm_physseg boundaries.
59 : */
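/*
 * Editor's illustration (not part of the original source): a free range of
 * three pages starting at pg is represented as
 *	pg[0].fpgsz == 3	(the length lives in the first page only;
 *	pg[1].fpgsz == pg[2].fpgsz == 0)
 *	pg[0].objt		links the range into the addr tree
 *	pg[1].objt		links the range into the size tree
 * A single free page skips the size tree and sits on the single[] queue
 * via vm_page[0].pageq instead.
 */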
60 :
61 : /*
62 : * Validate the flags of the page. (Used in asserts.)
63 : * Any free page must have the PQ_FREE flag set.
64 : * Free pages may be zeroed.
65 : * Pmap flags are left untouched.
66 : *
67 : * The PQ_FREE flag is not checked here: by not checking, we can easily use
68 : * this check on pages which are being freed.
69 : */
70 : #define VALID_FLAGS(pg_flags) \
71 : (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
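/*
 * Editor's note: for example, VALID_FLAGS(PQ_FREE | PG_ZERO) holds, while a
 * page that still carries a flag outside this mask (PG_BUSY, for instance)
 * fails the check and trips the KASSERTs in uvm_pmr_freepages() and
 * uvm_pmr_freepageq() below.
 */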
72 :
73 : /* Tree comparators. */
74 : int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
75 : const struct uvm_pmemrange *);
76 : int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
77 : int uvm_pmr_pg_to_memtype(struct vm_page *);
78 :
79 : #ifdef DDB
80 : void uvm_pmr_print(void);
81 : #endif
82 :
83 : /*
84 : * Memory types. The page flags are used to derive what the current memory
85 : * type of a page is.
86 : */
87 : int
88 0 : uvm_pmr_pg_to_memtype(struct vm_page *pg)
89 : {
90 0 : if (pg->pg_flags & PG_ZERO)
91 0 : return UVM_PMR_MEMTYPE_ZERO;
92 : /* Default: dirty memory. */
93 0 : return UVM_PMR_MEMTYPE_DIRTY;
94 0 : }
95 :
96 : /* Trees. */
97 0 : RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
98 0 : RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
99 0 : RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
100 : uvm_pmemrange_addr_cmp);
101 :
102 : /* Validation. */
103 : #ifdef DEBUG
104 : void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
105 : #else
106 : #define uvm_pmr_assertvalid(pmr) do {} while (0)
107 : #endif
108 :
109 : psize_t uvm_pmr_get1page(psize_t, int, struct pglist *,
110 : paddr_t, paddr_t, int);
111 :
112 : struct uvm_pmemrange *uvm_pmr_allocpmr(void);
113 : struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
114 : struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
115 : struct vm_page *, int);
116 : void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
117 : struct vm_page *pg, struct vm_page **pg_prev,
118 : struct vm_page **pg_next);
119 : struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
120 : struct vm_page *, paddr_t);
121 : psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
122 : struct vm_page **, int);
123 : void uvm_pmr_split(paddr_t);
124 : struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
125 : struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
126 : struct uvm_pmemrange *);
127 : psize_t pow2divide(psize_t, psize_t);
128 : struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
129 : struct vm_page *, paddr_t, paddr_t, int);
130 :
131 : /*
132 : * Computes num/denom and rounds it up to the next power-of-2.
133 : *
134 : * This is a division function which calculates an approximation of
135 : * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
136 : * have to be accurate.
137 : *
138 : * Returning too large a value makes the allocator slightly faster, at the
139 : * risk of hitting the failure case more often. Returning too small a value
140 : * makes the allocator a bit slower, but less likely to hit a failure case.
141 : */
142 : psize_t
143 0 : pow2divide(psize_t num, psize_t denom)
144 : {
145 : int rshift;
146 :
147 0 : for (rshift = 0; num > denom; rshift++, denom <<= 1)
148 : ;
149 0 : return (paddr_t)1 << rshift;
150 : }
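/*
 * Editor's worked example: pow2divide(100, 4) doubles denom through
 * 8, 16, 32, 64, 128 (five shifts) and returns 1 << 5 = 32, i.e. the exact
 * quotient 25 rounded up to the next power of two; pow2divide(4, 4) returns 1.
 */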
151 :
152 : /*
153 : * Predicate: lhs is a subrange of rhs.
154 : *
155 : * If rhs_low == 0: don't care about lower bound.
156 : * If rhs_high == 0: don't care about upper bound.
157 : */
158 : #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
159 : (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
160 : ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
161 :
162 : /*
163 : * Predicate: lhs intersects with rhs.
164 : *
165 : * If rhs_low == 0: don't care about lower bound.
166 : * If rhs_high == 0: don't care about upper bound.
167 : * Ranges don't intersect if they don't have any page in common; the
168 : * half-open array semantics mean that < instead of <= should be used here.
169 : */
170 : #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \
171 : (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \
172 : ((rhs_high) == 0 || (lhs_low) < (rhs_high)))
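/*
 * Editor's example: with half-open page ranges, [2, 5) and [4, 9) intersect
 * (4 < 5 and 2 < 9), while [2, 5) and [5, 9) do not (5 < 5 fails).
 */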
173 :
174 : /*
175 : * Align to power-of-2 alignment.
176 : */
177 : #define PMR_ALIGN(pgno, align) \
178 : (((pgno) + ((align) - 1)) & ~((align) - 1))
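/*
 * Editor's example: with align = 4, PMR_ALIGN(5, 4) == 8 and
 * PMR_ALIGN(8, 4) == 8; align must be a power of two for the mask to work.
 */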
179 :
180 :
181 : /*
182 : * Comparator: sort by address ascending.
183 : */
184 : int
185 0 : uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
186 : const struct uvm_pmemrange *rhs)
187 : {
188 0 : return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
189 : }
190 :
191 : /*
192 : * Comparator: sort by use ascending.
193 : *
194 : * The higher the use value of a range, the more devices need memory in
195 : * this range. Therefore allocate from the range with the lowest use first.
196 : */
197 : int
198 0 : uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
199 : {
200 : int result;
201 :
202 0 : result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
203 0 : if (result == 0)
204 0 : result = uvm_pmemrange_addr_cmp(lhs, rhs);
205 0 : return result;
206 : }
207 :
208 : int
209 0 : uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
210 : {
211 : paddr_t lhs_addr, rhs_addr;
212 :
213 0 : lhs_addr = VM_PAGE_TO_PHYS(lhs);
214 0 : rhs_addr = VM_PAGE_TO_PHYS(rhs);
215 :
216 0 : return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
217 : }
218 :
219 : int
220 0 : uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
221 : {
222 : psize_t lhs_size, rhs_size;
223 : int cmp;
224 :
225 : /* Using second tree, so we receive pg[1] instead of pg[0]. */
226 0 : lhs_size = (lhs - 1)->fpgsz;
227 0 : rhs_size = (rhs - 1)->fpgsz;
228 :
229 0 : cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
230 0 : if (cmp == 0)
231 0 : cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
232 0 : return cmp;
233 : }
234 :
235 : /*
236 : * Find the first range of free pages that is at least sz pages long.
237 : */
238 : struct vm_page *
239 0 : uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
240 : {
241 : struct vm_page *node, *best;
242 :
243 0 : KASSERT(sz >= 1);
244 :
245 0 : if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
246 0 : return TAILQ_FIRST(&pmr->single[mti]);
247 :
248 0 : node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
249 : best = NULL;
250 0 : while (node != NULL) {
251 0 : if ((node - 1)->fpgsz >= sz) {
252 : best = (node - 1);
253 0 : node = RBT_LEFT(uvm_objtree, node);
254 0 : } else
255 0 : node = RBT_RIGHT(uvm_objtree, node);
256 : }
257 0 : return best;
258 0 : }
259 :
260 : /*
261 : * Finds the next range. The next range has a size >= pg->fpgsz.
262 : * Returns NULL if no more ranges are available.
263 : */
264 : struct vm_page *
265 0 : uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
266 : {
267 : struct vm_page *npg;
268 :
269 0 : KASSERT(pmr != NULL && pg != NULL);
270 0 : if (pg->fpgsz == 1) {
271 0 : if (TAILQ_NEXT(pg, pageq) != NULL)
272 0 : return TAILQ_NEXT(pg, pageq);
273 : else
274 0 : npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
275 0 : } else
276 0 : npg = RBT_NEXT(uvm_pmr_size, pg + 1);
277 :
278 0 : return npg == NULL ? NULL : npg - 1;
279 0 : }
280 :
281 : /*
282 : * Finds the previous and next ranges relative to the (uninserted) pg range.
283 : *
284 : * *pg_prev == NULL if no previous range that can join with pg is
285 : * available.
286 : * *pg_next == NULL if no next range that can join with pg is
287 : * available.
288 : */
289 : void
290 0 : uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
291 : struct vm_page **pg_prev, struct vm_page **pg_next)
292 : {
293 0 : KASSERT(pg_prev != NULL && pg_next != NULL);
294 :
295 0 : *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
296 0 : if (*pg_next == NULL)
297 0 : *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
298 : else
299 0 : *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
300 :
301 : KDASSERT(*pg_next == NULL ||
302 : VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
303 : KDASSERT(*pg_prev == NULL ||
304 : VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
305 :
306 : /* Reset if not contig. */
307 0 : if (*pg_prev != NULL &&
308 0 : (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
309 0 : != atop(VM_PAGE_TO_PHYS(pg)) ||
310 0 : *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
311 0 : uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
312 0 : *pg_prev = NULL;
313 0 : if (*pg_next != NULL &&
314 0 : (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
315 0 : != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
316 0 : pg + pg->fpgsz != *pg_next || /* Array broke. */
317 0 : uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
318 0 : *pg_next = NULL;
319 0 : return;
320 : }
321 :
322 : /*
323 : * Remove a range from the address tree.
324 : * Address tree maintains pmr counters.
325 : */
326 : void
327 0 : uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
328 : {
329 : KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
330 : KDASSERT(pg->pg_flags & PQ_FREE);
331 0 : RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
332 :
333 0 : pmr->nsegs--;
334 0 : }
335 : /*
336 : * Remove a range from the size tree.
337 : */
338 : void
339 0 : uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
340 : {
341 : int memtype;
342 : #ifdef DEBUG
343 : struct vm_page *i;
344 : #endif
345 :
346 : KDASSERT(pg->fpgsz >= 1);
347 : KDASSERT(pg->pg_flags & PQ_FREE);
348 0 : memtype = uvm_pmr_pg_to_memtype(pg);
349 :
350 0 : if (pg->fpgsz == 1) {
351 : #ifdef DEBUG
352 : TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
353 : if (i == pg)
354 : break;
355 : }
356 : KDASSERT(i == pg);
357 : #endif
358 0 : TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
359 0 : } else {
360 : KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
361 : pg + 1) == pg + 1);
362 0 : RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
363 : }
364 0 : }
365 : /* Remove from both trees. */
366 : void
367 0 : uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
368 : {
369 : uvm_pmr_assertvalid(pmr);
370 0 : uvm_pmr_remove_size(pmr, pg);
371 0 : uvm_pmr_remove_addr(pmr, pg);
372 : uvm_pmr_assertvalid(pmr);
373 0 : }
374 :
375 : /*
376 : * Insert the range described in pg.
377 : * Returns the range thus created (which may be joined with the previous and
378 : * next ranges).
379 : * If no_join, the caller guarantees that the range cannot possibly join
380 : * with adjacent ranges.
381 : */
382 : struct vm_page *
383 0 : uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
384 : {
385 0 : struct vm_page *prev, *next;
386 :
387 : #ifdef DEBUG
388 : struct vm_page *i;
389 : int mt;
390 : #endif
391 :
392 : KDASSERT(pg->pg_flags & PQ_FREE);
393 : KDASSERT(pg->fpgsz >= 1);
394 :
395 : #ifdef DEBUG
396 : for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
397 : TAILQ_FOREACH(i, &pmr->single[mt], pageq)
398 : KDASSERT(i != pg);
399 : if (pg->fpgsz > 1) {
400 : KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
401 : pg + 1) == NULL);
402 : }
403 : KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
404 : }
405 : #endif
406 :
407 0 : if (!no_join) {
408 0 : uvm_pmr_pnaddr(pmr, pg, &prev, &next);
409 0 : if (next != NULL) {
410 0 : uvm_pmr_remove_size(pmr, next);
411 0 : uvm_pmr_remove_addr(pmr, next);
412 0 : pg->fpgsz += next->fpgsz;
413 0 : next->fpgsz = 0;
414 0 : }
415 0 : if (prev != NULL) {
416 0 : uvm_pmr_remove_size(pmr, prev);
417 0 : prev->fpgsz += pg->fpgsz;
418 0 : pg->fpgsz = 0;
419 0 : return prev;
420 : }
421 : }
422 :
423 0 : RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
424 :
425 0 : pmr->nsegs++;
426 :
427 0 : return pg;
428 0 : }
429 : /*
430 : * Insert the range described in pg into the size tree, or onto the
431 : * single-page queue when the range is only one page long.
432 : * Unlike uvm_pmr_insert_addr(), this function returns nothing.
433 : * The page must already be in the address tree.
434 : */
435 : void
436 0 : uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
437 : {
438 : int memtype;
439 : #ifdef DEBUG
440 : struct vm_page *i;
441 : int mti;
442 : #endif
443 :
444 : KDASSERT(pg->fpgsz >= 1);
445 : KDASSERT(pg->pg_flags & PQ_FREE);
446 :
447 0 : memtype = uvm_pmr_pg_to_memtype(pg);
448 : #ifdef DEBUG
449 : for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
450 : TAILQ_FOREACH(i, &pmr->single[mti], pageq)
451 : KDASSERT(i != pg);
452 : if (pg->fpgsz > 1) {
453 : KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
454 : pg + 1) == NULL);
455 : }
456 : KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
457 : }
458 : for (i = pg; i < pg + pg->fpgsz; i++)
459 : KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
460 : #endif
461 :
462 0 : if (pg->fpgsz == 1)
463 0 : TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
464 : else
465 0 : RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
466 0 : }
467 : /* Insert in both trees. */
468 : struct vm_page *
469 0 : uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
470 : {
471 : uvm_pmr_assertvalid(pmr);
472 0 : pg = uvm_pmr_insert_addr(pmr, pg, no_join);
473 0 : uvm_pmr_insert_size(pmr, pg);
474 : uvm_pmr_assertvalid(pmr);
475 0 : return pg;
476 : }
477 :
478 : /*
479 : * Find the last page that is part of this segment.
480 : * => pg: the range at which to start the search.
481 : * => boundary: the page number boundary specification (0 = no boundary).
482 : * => pmr: the pmemrange of the page.
483 : *
484 : * This function returns the last page of the segment (one entry before the
485 : * next range); to reach the next range, run TAILQ_NEXT(result, pageq).
486 : * The reason is that this way, the length of the segment is easily
487 : * calculated using: atop(result) - atop(pg) + 1.
488 : * Hence this function also never returns NULL.
489 : */
490 : struct vm_page *
491 0 : uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
492 : struct vm_page *pg, paddr_t boundary)
493 : {
494 : paddr_t first_boundary;
495 : struct vm_page *next;
496 : struct vm_page *prev;
497 :
498 : KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
499 : pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
500 0 : if (boundary != 0) {
501 : first_boundary =
502 0 : PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
503 0 : } else
504 : first_boundary = 0;
505 :
506 : /*
507 : * Increase next until it hits the first page of the next segment.
508 : *
509 : * While loop checks the following:
510 : * - next != NULL we have not reached the end of pgl
511 : * - boundary == 0 || next < first_boundary
512 : * we do not cross a boundary
513 : * - atop(prev) + 1 == atop(next)
514 : * still in the same segment
515 : * - low <= last
516 : * - high > last still in the same memory range
517 : * - memtype is equal allocator is unable to view different memtypes
518 : * as part of the same segment
519 : * - prev + 1 == next no array breakage occurs
520 : */
521 : prev = pg;
522 0 : next = TAILQ_NEXT(prev, pageq);
523 0 : while (next != NULL &&
524 0 : (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
525 0 : atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
526 0 : pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
527 0 : pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
528 0 : uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
529 0 : prev + 1 == next) {
530 : prev = next;
531 0 : next = TAILQ_NEXT(prev, pageq);
532 : }
533 :
534 : /*
535 : * End of this segment.
536 : */
537 0 : return prev;
538 : }
539 :
540 : /*
541 : * Remove the first segment of contiguous pages from pgl.
542 : * A segment ends if it crosses boundary (unless boundary = 0) or
543 : * if it would enter a different uvm_pmemrange.
544 : *
545 : * Work: the page range that the caller is currently working with.
546 : * May be null.
547 : *
548 : * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
549 : * the first segment is erased (which, if called by uvm_pmr_getpages(),
550 : * probably is the smallest or very close to it).
551 : */
552 : psize_t
553 0 : uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
554 : struct vm_page **work, int is_desperate)
555 : {
556 : struct vm_page *start, *end, *iter, *iter_end, *inserted;
557 : psize_t count;
558 : struct uvm_pmemrange *pmr, *pmr_iter;
559 :
560 0 : KASSERT(!TAILQ_EMPTY(pgl));
561 :
562 : /*
563 : * Initialize to first page.
564 : * Unless desperate scan finds a better candidate, this is what'll be
565 : * erased.
566 : */
567 : start = TAILQ_FIRST(pgl);
568 0 : pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
569 0 : end = uvm_pmr_findnextsegment(pmr, start, boundary);
570 :
571 : /*
572 : * If we are desperate, we _really_ want to get rid of the smallest
573 : * element (rather than a close match to the smallest element).
574 : */
575 0 : if (is_desperate) {
576 : /* Linear search for smallest segment. */
577 : pmr_iter = pmr;
578 0 : for (iter = TAILQ_NEXT(end, pageq);
579 0 : iter != NULL && start != end;
580 0 : iter = TAILQ_NEXT(iter_end, pageq)) {
581 : /*
582 : * Only update pmr if it doesn't match current
583 : * iteration.
584 : */
585 0 : if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
586 0 : pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
587 0 : pmr_iter = uvm_pmemrange_find(atop(
588 : VM_PAGE_TO_PHYS(iter)));
589 0 : }
590 :
591 0 : iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
592 : boundary);
593 :
594 : /*
595 : * Current iteration is smaller than best match so
596 : * far; update.
597 : */
598 0 : if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
599 0 : VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
600 : start = iter;
601 : end = iter_end;
602 : pmr = pmr_iter;
603 0 : }
604 : }
605 : }
606 :
607 : /*
608 : * Calculate count and end of the list.
609 : */
610 0 : count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
611 0 : end = TAILQ_NEXT(end, pageq);
612 :
613 : /*
614 : * Actually remove the range of pages.
615 : *
616 : * Sadly, this cannot be done using pointer iteration:
617 : * vm_physseg is not guaranteed to be sorted on address, hence
618 : * uvm_page_init() may not have initialized its array sorted by
619 : * page number.
620 : */
621 0 : for (iter = start; iter != end; iter = iter_end) {
622 0 : iter_end = TAILQ_NEXT(iter, pageq);
623 0 : TAILQ_REMOVE(pgl, iter, pageq);
624 : }
625 :
626 0 : start->fpgsz = count;
627 0 : inserted = uvm_pmr_insert(pmr, start, 0);
628 :
629 : /*
630 : * If the caller was working on a range and this function modified
631 : * that range, update the pointer.
632 : */
633 0 : if (work != NULL && *work != NULL &&
634 0 : atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
635 0 : atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
636 : atop(VM_PAGE_TO_PHYS(*work)))
637 0 : *work = inserted;
638 0 : return count;
639 : }
640 :
641 : /*
642 : * Extract a number of pages from a segment of free pages.
643 : * Called by uvm_pmr_getpages.
644 : *
645 : * Returns the segment that was created from pages left over at the tail
646 : * of the removed set of pages, or NULL if no pages were left at the tail.
647 : */
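/*
 * Editor's worked example: if pg describes free pages [100, 110) and the
 * caller extracts [103, 106), then before_sz == 3 and after_sz == 4; pg is
 * shrunk to [100, 103), pages 103-105 are appended to result, and a new
 * "after" range covering [106, 110) is inserted with no_join set.
 */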
648 : struct vm_page *
649 0 : uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
650 : paddr_t start, paddr_t end, struct pglist *result)
651 : {
652 : struct vm_page *after, *pg_i;
653 : psize_t before_sz, after_sz;
654 : #ifdef DEBUG
655 : psize_t i;
656 : #endif
657 :
658 : KDASSERT(end > start);
659 : KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
660 : KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
661 : KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
662 : KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
663 :
664 0 : before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
665 0 : after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
666 : KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
667 : uvm_pmr_assertvalid(pmr);
668 :
669 0 : uvm_pmr_remove_size(pmr, pg);
670 0 : if (before_sz == 0)
671 0 : uvm_pmr_remove_addr(pmr, pg);
672 0 : after = pg + before_sz + (end - start);
673 :
674 : /* Add selected pages to result. */
675 0 : for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
676 0 : KASSERT(pg_i->pg_flags & PQ_FREE);
677 0 : pg_i->fpgsz = 0;
678 0 : TAILQ_INSERT_TAIL(result, pg_i, pageq);
679 : }
680 :
681 : /* Before handling. */
682 0 : if (before_sz > 0) {
683 0 : pg->fpgsz = before_sz;
684 0 : uvm_pmr_insert_size(pmr, pg);
685 0 : }
686 :
687 : /* After handling. */
688 0 : if (after_sz > 0) {
689 : #ifdef DEBUG
690 : for (i = 0; i < after_sz; i++) {
691 : KASSERT(!uvm_pmr_isfree(after + i));
692 : }
693 : #endif
694 : KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
695 0 : after->fpgsz = after_sz;
696 0 : after = uvm_pmr_insert_addr(pmr, after, 1);
697 0 : uvm_pmr_insert_size(pmr, after);
698 0 : }
699 :
700 : uvm_pmr_assertvalid(pmr);
701 0 : return (after_sz > 0 ? after : NULL);
702 : }
703 :
704 : /*
705 : * Acquire a number of pages.
706 : *
707 : * count: the number of pages returned
708 : * start: lowest page number
709 : * end: highest page number +1
710 : * (start = end = 0: no limitation)
711 : * align: power-of-2 alignment constraint (align = 1: no alignment)
712 : * boundary: power-of-2 boundary (boundary = 0: no boundary)
713 : * maxseg: maximum number of segments to return
714 : * flags: UVM_PLA_* flags
715 : * result: returned pages storage (uses pageq)
716 : */
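/*
 * Editor's hypothetical call sketch (not taken from the tree): grab 16 pages
 * anywhere in physical memory as one physically contiguous segment, with no
 * alignment or boundary constraint, sleeping until memory is available:
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(16, 0, 0, 1, 0, 1, UVM_PLA_WAITOK, &pgl);
 */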
717 : int
718 0 : uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
719 : paddr_t boundary, int maxseg, int flags, struct pglist *result)
720 : {
721 : struct uvm_pmemrange *pmr; /* Iterate memory ranges. */
722 0 : struct vm_page *found, *f_next; /* Iterate chunks. */
723 : psize_t fcount; /* Current found pages. */
724 : int fnsegs; /* Current segment counter. */
725 : int try, start_try;
726 0 : psize_t search[3];
727 : paddr_t fstart, fend; /* Pages to be taken from found. */
728 : int memtype; /* Requested memtype. */
729 : int memtype_init; /* Best memtype. */
730 : int desperate; /* True if allocation failed. */
731 : #ifdef DIAGNOSTIC
732 : struct vm_page *diag_prev; /* Used during validation. */
733 : #endif /* DIAGNOSTIC */
734 :
735 : /*
736 : * Validate arguments.
737 : */
738 0 : KASSERT(count > 0);
739 0 : KASSERT(start == 0 || end == 0 || start < end);
740 0 : KASSERT(align >= 1);
741 0 : KASSERT(powerof2(align));
742 0 : KASSERT(maxseg > 0);
743 0 : KASSERT(boundary == 0 || powerof2(boundary));
744 0 : KASSERT(boundary == 0 || maxseg * boundary >= count);
745 0 : KASSERT(TAILQ_EMPTY(result));
746 :
747 : /*
748 : * TRYCONTIG is a noop if you only want a single segment.
749 : * Remove it if that's the case: otherwise it'll deny the fast
750 : * allocation.
751 : */
752 0 : if (maxseg == 1 || count == 1)
753 0 : flags &= ~UVM_PLA_TRYCONTIG;
754 :
755 : /*
756 : * Configure search.
757 : *
758 : * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
759 : * search[1] is multiple segments, chosen to fulfill the search in
760 : * approximately even-sized segments.
761 : * This is a good trade-off between slightly reduced allocation speed
762 : * and less fragmentation.
763 : * search[2] is the worst case, in which all segments are evaluated.
764 : * This provides the least fragmentation, but makes the search
765 : * possibly longer (although by the time it is selected, speed no
766 : * longer matters much).
767 : *
768 : * The exception is when maxseg == 1: since we can only fulfill that
769 : * with one segment of size pages, only a single search type has to
770 : * be attempted.
771 : */
772 0 : if (maxseg == 1 || count == 1) {
773 : start_try = 2;
774 0 : search[2] = count;
775 0 : } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
776 : start_try = 2;
777 0 : search[2] = 1;
778 0 : } else {
779 : start_try = 0;
780 0 : search[0] = count;
781 0 : search[1] = pow2divide(count, maxseg);
782 0 : search[2] = 1;
783 0 : if ((flags & UVM_PLA_TRYCONTIG) == 0)
784 0 : start_try = 1;
785 0 : if (search[1] >= search[0]) {
786 0 : search[1] = search[0];
787 : start_try = 1;
788 0 : }
789 0 : if (search[2] >= search[start_try]) {
790 : start_try = 2;
791 0 : }
792 : }
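/*
 * Editor's worked example: for count = 100, maxseg = 4 and UVM_PLA_TRYCONTIG
 * set, the code above yields search[0] = 100, search[1] = pow2divide(100, 4)
 * = 32, search[2] = 1 and start_try = 0: first try a single 100-page range,
 * then roughly even 32-page chunks, then any size.
 */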
793 :
794 : /*
795 : * Memory type: if zeroed memory is requested, traverse the zero set.
796 : * Otherwise, traverse the dirty set.
797 : *
798 : * The memtype iterator is reinitialized to memtype_init on entrance
799 : * of a pmemrange.
800 : */
801 0 : if (flags & UVM_PLA_ZERO)
802 0 : memtype_init = UVM_PMR_MEMTYPE_ZERO;
803 : else
804 : memtype_init = UVM_PMR_MEMTYPE_DIRTY;
805 :
806 : /*
807 : * Initially, we're not desperate.
808 : *
809 : * Note that if we return from a sleep, we are still desperate.
810 : * Chances are that memory pressure is still high, so resetting
811 : * seems over-optimistic to me.
812 : */
813 : desperate = 0;
814 :
815 0 : uvm_lock_fpageq();
816 :
817 : retry: /* Return point after sleeping. */
818 : fcount = 0;
819 0 : fnsegs = 0;
820 :
821 : retry_desperate:
822 : /*
823 : * If we just want any page(s), go for the really fast option.
824 : */
825 0 : if (count <= maxseg && align == 1 && boundary == 0 &&
826 0 : (flags & UVM_PLA_TRYCONTIG) == 0) {
827 0 : fcount += uvm_pmr_get1page(count - fcount, memtype_init,
828 : result, start, end, 0);
829 :
830 : /*
831 : * If we found sufficient pages, go to the success exit code.
832 : *
833 : * Otherwise, go immediately to fail, since we collected
834 : * all we could anyway.
835 : */
836 0 : if (fcount == count)
837 : goto out;
838 : else
839 : goto fail;
840 : }
841 :
842 : /*
843 : * The heart of the contig case.
844 : *
845 : * The code actually looks like this:
846 : *
847 : * foreach (struct pmemrange) {
848 : * foreach (memtype) {
849 : * foreach(try) {
850 : * foreach (free range of memtype in pmemrange,
851 : * starting at search[try]) {
852 : * while (range has space left)
853 : * take from range
854 : * }
855 : * }
856 : * }
857 : *
858 : * if next pmemrange has higher usecount than current:
859 : * enter desperate case (which will drain the pmemranges
860 : * until empty prior to moving to the next one)
861 : * }
862 : *
863 : * When desperate is activated, try always starts at the highest
864 : * value. The memtype loop is using a goto rescan_memtype.
865 : * The try loop is using a goto rescan.
866 : * The 'range has space left' loop uses the label drain_found.
867 : *
868 : * Writing them all as loops would take up a lot of screen space in
869 : * the form of indentation and some parts are easier to express
870 : * using the labels.
871 : */
872 :
873 0 : TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
874 : /* Empty range. */
875 0 : if (pmr->nsegs == 0)
876 : continue;
877 :
878 : /* Outside requested range. */
879 0 : if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
880 : continue;
881 :
882 0 : memtype = memtype_init;
883 :
884 : rescan_memtype: /* Return point at memtype++. */
885 0 : try = start_try;
886 :
887 : rescan: /* Return point at try++. */
888 0 : for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
889 0 : found != NULL;
890 0 : found = f_next) {
891 0 : f_next = uvm_pmr_nextsz(pmr, found, memtype);
892 :
893 0 : fstart = atop(VM_PAGE_TO_PHYS(found));
894 0 : if (start != 0)
895 0 : fstart = MAX(start, fstart);
896 : drain_found:
897 : /*
898 : * Throw away the first segment if fnsegs == maxseg
899 : *
900 : * Note that f_next is still valid after this call,
901 : * since we only allocated from entries before f_next.
902 : * We don't revisit the entries we already extracted
903 : * from unless we entered the desperate case.
904 : */
905 0 : if (fnsegs == maxseg) {
906 0 : fnsegs--;
907 0 : fcount -=
908 0 : uvm_pmr_remove_1strange(result, boundary,
909 : &found, desperate);
910 0 : }
911 :
912 0 : fstart = PMR_ALIGN(fstart, align);
913 0 : fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
914 0 : if (end != 0)
915 0 : fend = MIN(end, fend);
916 0 : if (boundary != 0) {
917 : fend =
918 0 : MIN(fend, PMR_ALIGN(fstart + 1, boundary));
919 0 : }
920 0 : if (fstart >= fend)
921 : continue;
922 0 : if (fend - fstart > count - fcount)
923 0 : fend = fstart + (count - fcount);
924 :
925 0 : fcount += fend - fstart;
926 0 : fnsegs++;
927 0 : found = uvm_pmr_extract_range(pmr, found,
928 : fstart, fend, result);
929 :
930 0 : if (fcount == count)
931 : goto out;
932 :
933 : /*
934 : * If there's still space left in found, try to
935 : * fully drain it prior to continuing.
936 : */
937 0 : if (found != NULL) {
938 : fstart = fend;
939 0 : goto drain_found;
940 : }
941 : }
942 :
943 : /* Try a smaller search now. */
944 0 : if (++try < nitems(search))
945 0 : goto rescan;
946 :
947 : /*
948 : * Exhaust all memory types prior to going to the next memory
949 : * segment.
950 : * This means that zero-vs-dirty are eaten prior to moving
951 : * to a pmemrange with a higher use-count.
952 : *
953 : * Code is basically a difficult way of writing:
954 : * memtype = memtype_init;
955 : * do {
956 : * ...;
957 : * memtype += 1;
958 : * memtype %= MEMTYPE_MAX;
959 : * } while (memtype != memtype_init);
960 : */
961 0 : memtype += 1;
962 0 : if (memtype == UVM_PMR_MEMTYPE_MAX)
963 : memtype = 0;
964 0 : if (memtype != memtype_init)
965 0 : goto rescan_memtype;
966 :
967 : /*
968 : * If not desperate, enter desperate case prior to eating all
969 : * the good stuff in the next range.
970 : */
971 0 : if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
972 0 : TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
973 : break;
974 : }
975 :
976 : /*
977 : * Not enough memory of the requested type available. Fall back to
978 : * less good memory that we'll clean up better later.
979 : *
980 : * This algorithm is not very smart, though: it just starts scanning
981 : * a differently typed range, but the nicer ranges of the previous
982 : * iteration may fall out. Hence there is a small chance of a false
983 : * negative.
984 : *
985 : * When desperate: scan all sizes starting at the smallest
986 : * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
987 : * allow us to hit the fast path now).
988 : *
989 : * Also, because we will revisit entries we scanned before, we need
990 : * to reset the page queue, or we may end up releasing entries in
991 : * such a way as to invalidate f_next.
992 : */
993 0 : if (!desperate) {
994 : desperate = 1;
995 : start_try = nitems(search) - 1;
996 0 : flags &= ~UVM_PLA_TRYCONTIG;
997 :
998 0 : while (!TAILQ_EMPTY(result))
999 0 : uvm_pmr_remove_1strange(result, 0, NULL, 0);
1000 : fnsegs = 0;
1001 : fcount = 0;
1002 0 : goto retry_desperate;
1003 : }
1004 :
1005 : fail:
1006 : /* Allocation failed. */
1007 : /* XXX: claim from memory reserve here */
1008 :
1009 0 : while (!TAILQ_EMPTY(result))
1010 0 : uvm_pmr_remove_1strange(result, 0, NULL, 0);
1011 :
1012 0 : if (flags & UVM_PLA_WAITOK) {
1013 0 : if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1014 0 : flags & UVM_PLA_FAILOK) == 0)
1015 0 : goto retry;
1016 0 : KASSERT(flags & UVM_PLA_FAILOK);
1017 : } else
1018 0 : wakeup(&uvm.pagedaemon);
1019 0 : uvm_unlock_fpageq();
1020 :
1021 0 : return ENOMEM;
1022 :
1023 : out:
1024 : /* Allocation successful. */
1025 0 : uvmexp.free -= fcount;
1026 :
1027 0 : uvm_unlock_fpageq();
1028 :
1029 : /* Update statistics and zero pages if UVM_PLA_ZERO. */
1030 : #ifdef DIAGNOSTIC
1031 : fnsegs = 0;
1032 : fcount = 0;
1033 : diag_prev = NULL;
1034 : #endif /* DIAGNOSTIC */
1035 0 : TAILQ_FOREACH(found, result, pageq) {
1036 0 : atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1037 :
1038 0 : if (found->pg_flags & PG_ZERO) {
1039 0 : uvm_lock_fpageq();
1040 0 : uvmexp.zeropages--;
1041 0 : if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1042 0 : wakeup(&uvmexp.zeropages);
1043 0 : uvm_unlock_fpageq();
1044 0 : }
1045 0 : if (flags & UVM_PLA_ZERO) {
1046 0 : if (found->pg_flags & PG_ZERO)
1047 0 : uvmexp.pga_zerohit++;
1048 : else {
1049 0 : uvmexp.pga_zeromiss++;
1050 0 : uvm_pagezero(found);
1051 : }
1052 : }
1053 0 : atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1054 :
1055 0 : found->uobject = NULL;
1056 0 : found->uanon = NULL;
1057 0 : found->pg_version++;
1058 :
1059 : /*
1060 : * Validate that the page matches the range criteria.
1061 : */
1062 : KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1063 : KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1064 :
1065 : #ifdef DIAGNOSTIC
1066 : /*
1067 : * Update fcount (# found pages) and
1068 : * fnsegs (# found segments) counters.
1069 : */
1070 0 : if (diag_prev == NULL ||
1071 : /* new segment if it contains a hole */
1072 0 : atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1073 0 : atop(VM_PAGE_TO_PHYS(found)) ||
1074 : /* new segment if it crosses boundary */
1075 0 : (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1076 0 : (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1077 0 : fnsegs++;
1078 0 : fcount++;
1079 :
1080 0 : diag_prev = found;
1081 : #endif /* DIAGNOSTIC */
1082 : }
1083 :
1084 : #ifdef DIAGNOSTIC
1085 : /*
1086 : * Panic on algorithm failure.
1087 : */
1088 0 : if (fcount != count || fnsegs > maxseg) {
1089 0 : panic("pmemrange allocation error: "
1090 : "allocated %ld pages in %d segments, "
1091 : "but request was %ld pages in %d segments",
1092 : fcount, fnsegs, count, maxseg);
1093 : }
1094 : #endif /* DIAGNOSTIC */
1095 :
1096 0 : return 0;
1097 0 : }
1098 :
1099 : /*
1100 : * Free a number of contig pages (invoked by uvm_page_init).
1101 : */
1102 : void
1103 0 : uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1104 : {
1105 : struct uvm_pmemrange *pmr;
1106 : psize_t i, pmr_count;
1107 : struct vm_page *firstpg = pg;
1108 :
1109 0 : for (i = 0; i < count; i++) {
1110 0 : KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1111 : atop(VM_PAGE_TO_PHYS(pg)) + i);
1112 :
1113 0 : if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1114 0 : VALID_FLAGS(pg[i].pg_flags))) {
1115 0 : printf("Flags: 0x%x, will panic now.\n",
1116 0 : pg[i].pg_flags);
1117 0 : }
1118 0 : KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1119 : VALID_FLAGS(pg[i].pg_flags));
1120 0 : atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1121 0 : atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1122 : }
1123 :
1124 0 : uvm_lock_fpageq();
1125 :
1126 0 : for (i = count; i > 0; i -= pmr_count) {
1127 0 : pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1128 0 : KASSERT(pmr != NULL);
1129 :
1130 0 : pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1131 0 : pg->fpgsz = pmr_count;
1132 0 : uvm_pmr_insert(pmr, pg, 0);
1133 :
1134 0 : uvmexp.free += pmr_count;
1135 0 : pg += pmr_count;
1136 : }
1137 0 : wakeup(&uvmexp.free);
1138 0 : if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1139 0 : wakeup(&uvmexp.zeropages);
1140 :
1141 0 : uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1142 :
1143 0 : uvm_unlock_fpageq();
1144 0 : }
1145 :
1146 : /*
1147 : * Free all pages in the queue.
1148 : */
1149 : void
1150 0 : uvm_pmr_freepageq(struct pglist *pgl)
1151 : {
1152 : struct vm_page *pg;
1153 : paddr_t pstart;
1154 : psize_t plen;
1155 :
1156 0 : TAILQ_FOREACH(pg, pgl, pageq) {
1157 0 : if (!((pg->pg_flags & PQ_FREE) == 0 &&
1158 0 : VALID_FLAGS(pg->pg_flags))) {
1159 0 : printf("Flags: 0x%x, will panic now.\n",
1160 0 : pg->pg_flags);
1161 0 : }
1162 0 : KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1163 : VALID_FLAGS(pg->pg_flags));
1164 0 : atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1165 0 : atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1166 : }
1167 :
1168 0 : uvm_lock_fpageq();
1169 0 : while (!TAILQ_EMPTY(pgl)) {
1170 0 : pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1171 0 : plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1172 0 : uvmexp.free += plen;
1173 :
1174 0 : uvm_wakeup_pla(pstart, ptoa(plen));
1175 : }
1176 0 : wakeup(&uvmexp.free);
1177 0 : if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1178 0 : wakeup(&uvmexp.zeropages);
1179 0 : uvm_unlock_fpageq();
1180 :
1181 : return;
1182 0 : }
1183 :
1184 : /*
1185 : * Store a pmemrange in the list.
1186 : *
1187 : * The list is sorted by use.
1188 : */
1189 : struct uvm_pmemrange *
1190 0 : uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1191 : struct uvm_pmemrange *pmr)
1192 : {
1193 : struct uvm_pmemrange *iter;
1194 : int cmp = 1;
1195 :
1196 0 : TAILQ_FOREACH(iter, useq, pmr_use) {
1197 0 : cmp = uvm_pmemrange_use_cmp(pmr, iter);
1198 0 : if (cmp == 0)
1199 0 : return iter;
1200 0 : if (cmp == -1)
1201 : break;
1202 : }
1203 :
1204 0 : if (iter == NULL)
1205 0 : TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1206 : else
1207 0 : TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1208 0 : return NULL;
1209 0 : }
1210 :
1211 : #ifdef DEBUG
1212 : /*
1213 : * Validation of the whole pmemrange.
1214 : * Called with fpageq locked.
1215 : */
1216 : void
1217 : uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1218 : {
1219 : struct vm_page *prev, *next, *i, *xref;
1220 : int lcv, mti;
1221 :
1222 : /* Empty range */
1223 : if (pmr->nsegs == 0)
1224 : return;
1225 :
1226 : /* Validate address tree. */
1227 : RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1228 : /* Validate the range. */
1229 : KASSERT(i->fpgsz > 0);
1230 : KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1231 : KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1232 : <= pmr->high);
1233 :
1234 : /* Validate each page in this range. */
1235 : for (lcv = 0; lcv < i->fpgsz; lcv++) {
1236 : /*
1237 : * Only the first page has a size specification.
1238 : * Rest is size 0.
1239 : */
1240 : KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1241 : /*
1242 : * Flag check.
1243 : */
1244 : KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1245 : (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1246 : /*
1247 : * Free pages are:
1248 : * - not wired
1249 : * - have no vm_anon
1250 : * - have no uvm_object
1251 : */
1252 : KASSERT(i[lcv].wire_count == 0);
1253 : KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1254 : i[lcv].uanon == NULL);
1255 : KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1256 : i[lcv].uobject == NULL);
1257 : /*
1258 : * Pages in a single range always have the same
1259 : * memtype.
1260 : */
1261 : KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1262 : uvm_pmr_pg_to_memtype(&i[lcv]));
1263 : }
1264 :
1265 : /* Check that it shouldn't be joined with its predecessor. */
1266 : prev = RBT_PREV(uvm_pmr_addr, i);
1267 : if (prev != NULL) {
1268 : KASSERT(uvm_pmr_pg_to_memtype(i) !=
1269 : uvm_pmr_pg_to_memtype(prev) ||
1270 : atop(VM_PAGE_TO_PHYS(i)) >
1271 : atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1272 : prev + prev->fpgsz != i);
1273 : }
1274 :
1275 : /* Assert i is in the size tree as well. */
1276 : if (i->fpgsz == 1) {
1277 : TAILQ_FOREACH(xref,
1278 : &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1279 : if (xref == i)
1280 : break;
1281 : }
1282 : KASSERT(xref == i);
1283 : } else {
1284 : KASSERT(RBT_FIND(uvm_pmr_size,
1285 : &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1286 : i + 1);
1287 : }
1288 : }
1289 :
1290 : /* Validate size tree. */
1291 : for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1292 : for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1293 : next = uvm_pmr_nextsz(pmr, i, mti);
1294 : if (next != NULL) {
1295 : KASSERT(i->fpgsz <=
1296 : next->fpgsz);
1297 : }
1298 :
1299 : /* Assert i is in the addr tree as well. */
1300 : KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1301 :
1302 : /* Assert i is of the correct memory type. */
1303 : KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1304 : }
1305 : }
1306 :
1307 : /* Validate nsegs statistic. */
1308 : lcv = 0;
1309 : RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1310 : lcv++;
1311 : KASSERT(pmr->nsegs == lcv);
1312 : }
1313 : #endif /* DEBUG */
1314 :
1315 : /*
1316 : * Split pmr at split point pageno.
1317 : * Called with fpageq unlocked.
1318 : *
1319 : * Split is only applied if a pmemrange spans pageno.
1320 : */
1321 : void
1322 0 : uvm_pmr_split(paddr_t pageno)
1323 : {
1324 : struct uvm_pmemrange *pmr, *drain;
1325 : struct vm_page *rebuild, *prev, *next;
1326 : psize_t prev_sz;
1327 :
1328 0 : uvm_lock_fpageq();
1329 0 : pmr = uvm_pmemrange_find(pageno);
1330 0 : if (pmr == NULL || !(pmr->low < pageno)) {
1331 : /* No split required. */
1332 0 : uvm_unlock_fpageq();
1333 0 : return;
1334 : }
1335 :
1336 0 : KASSERT(pmr->low < pageno);
1337 0 : KASSERT(pmr->high > pageno);
1338 :
1339 : /*
1340 : * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1341 : * uvm_kmemalloc which calls into pmemrange, making the locking
1342 : * a bit hard, so we just race!
1343 : */
1344 0 : uvm_unlock_fpageq();
1345 0 : drain = uvm_pmr_allocpmr();
1346 0 : uvm_lock_fpageq();
1347 0 : pmr = uvm_pmemrange_find(pageno);
1348 0 : if (pmr == NULL || !(pmr->low < pageno)) {
1349 : /*
1350 : * We lost the race since someone else ran this or a related
1351 : * function; however, this should trigger very rarely, so
1352 : * we just leak the pmr.
1353 : */
1354 0 : printf("uvm_pmr_split: lost one pmr\n");
1355 0 : uvm_unlock_fpageq();
1356 0 : return;
1357 : }
1358 :
1359 0 : drain->low = pageno;
1360 0 : drain->high = pmr->high;
1361 0 : drain->use = pmr->use;
1362 :
1363 : uvm_pmr_assertvalid(pmr);
1364 : uvm_pmr_assertvalid(drain);
1365 0 : KASSERT(drain->nsegs == 0);
1366 :
1367 0 : RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1368 0 : if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1369 : break;
1370 : }
1371 0 : if (rebuild == NULL)
1372 0 : prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1373 : else
1374 0 : prev = RBT_PREV(uvm_pmr_addr, rebuild);
1375 0 : KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1376 :
1377 : /*
1378 : * Handle free chunk that spans the split point.
1379 : */
1380 0 : if (prev != NULL &&
1381 0 : atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1382 : psize_t before, after;
1383 :
1384 0 : KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1385 :
1386 0 : uvm_pmr_remove(pmr, prev);
1387 0 : prev_sz = prev->fpgsz;
1388 0 : before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1389 0 : after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1390 :
1391 0 : KASSERT(before > 0);
1392 0 : KASSERT(after > 0);
1393 :
1394 0 : prev->fpgsz = before;
1395 0 : uvm_pmr_insert(pmr, prev, 1);
1396 0 : (prev + before)->fpgsz = after;
1397 0 : uvm_pmr_insert(drain, prev + before, 1);
1398 0 : }
1399 :
1400 : /* Move free chunks that no longer fall in the range. */
1401 0 : for (; rebuild != NULL; rebuild = next) {
1402 0 : next = RBT_NEXT(uvm_pmr_addr, rebuild);
1403 :
1404 0 : uvm_pmr_remove(pmr, rebuild);
1405 0 : uvm_pmr_insert(drain, rebuild, 1);
1406 : }
1407 :
1408 0 : pmr->high = pageno;
1409 : uvm_pmr_assertvalid(pmr);
1410 : uvm_pmr_assertvalid(drain);
1411 :
1412 0 : RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1413 0 : uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1414 0 : uvm_unlock_fpageq();
1415 0 : }
1416 :
1417 : /*
1418 : * Increase the usage counter for the given range of memory.
1419 : *
1420 : * The more usage counters a given range of memory has, the harder the
1421 : * allocator will try not to allocate from it.
1422 : *
1423 : * Addresses here are in paddr_t, not page-numbers.
1424 : * The lowest and highest allowed address are specified.
1425 : */
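/*
 * Editor's worked example (assuming 4 KB pages): uvm_pmr_use_inc(0x0, 0xffffff)
 * is translated below into page numbers [0, 0x1000), i.e. the first 16 MB,
 * and every pmemrange that falls inside that window has its use count bumped.
 */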
1426 : void
1427 0 : uvm_pmr_use_inc(paddr_t low, paddr_t high)
1428 : {
1429 : struct uvm_pmemrange *pmr;
1430 : paddr_t sz;
1431 :
1432 : /* pmr uses page numbers, translate low and high. */
1433 0 : high++;
1434 0 : high = atop(trunc_page(high));
1435 0 : low = atop(round_page(low));
1436 0 : uvm_pmr_split(low);
1437 0 : uvm_pmr_split(high);
1438 :
1439 : sz = 0;
1440 0 : uvm_lock_fpageq();
1441 : /* Increase use count on segments in range. */
1442 0 : RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1443 0 : if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1444 0 : TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1445 0 : pmr->use++;
1446 0 : sz += pmr->high - pmr->low;
1447 0 : uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1448 0 : }
1449 : uvm_pmr_assertvalid(pmr);
1450 : }
1451 0 : uvm_unlock_fpageq();
1452 :
1453 0 : KASSERT(sz >= high - low);
1454 0 : }
1455 :
1456 : /*
1457 : * Allocate a pmemrange.
1458 : *
1459 : * If called from uvm_page_init, uvm_pageboot_alloc is used.
1460 : * If called after uvm_init, malloc is used.
1461 : * (And if called in between, you're dead.)
1462 : */
1463 : struct uvm_pmemrange *
1464 0 : uvm_pmr_allocpmr(void)
1465 : {
1466 : struct uvm_pmemrange *nw;
1467 : int i;
1468 :
1469 : /* We're only ever hitting the !uvm.page_init_done case for now. */
1470 0 : if (!uvm.page_init_done) {
1471 0 : nw = (struct uvm_pmemrange *)
1472 0 : uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1473 0 : } else {
1474 0 : nw = malloc(sizeof(struct uvm_pmemrange),
1475 : M_VMMAP, M_NOWAIT);
1476 : }
1477 0 : KASSERT(nw != NULL);
1478 0 : memset(nw, 0, sizeof(struct uvm_pmemrange));
1479 0 : RBT_INIT(uvm_pmr_addr, &nw->addr);
1480 0 : for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1481 0 : RBT_INIT(uvm_pmr_size, &nw->size[i]);
1482 0 : TAILQ_INIT(&nw->single[i]);
1483 : }
1484 0 : return nw;
1485 : }
1486 :
1487 : /*
1488 : * Initialization of pmr.
1489 : * Called by uvm_page_init.
1490 : *
1491 : * Sets up pmemranges.
1492 : */
1493 : void
1494 0 : uvm_pmr_init(void)
1495 : {
1496 : struct uvm_pmemrange *new_pmr;
1497 : int i;
1498 :
1499 0 : TAILQ_INIT(&uvm.pmr_control.use);
1500 0 : RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1501 0 : TAILQ_INIT(&uvm.pmr_control.allocs);
1502 :
1503 : /* By default, one range for the entire address space. */
1504 0 : new_pmr = uvm_pmr_allocpmr();
1505 0 : new_pmr->low = 0;
1506 0 : new_pmr->high = atop((paddr_t)-1) + 1;
1507 :
1508 0 : RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1509 0 : uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1510 :
1511 0 : for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1512 0 : uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1513 0 : uvm_md_constraints[i]->ucr_high);
1514 : }
1515 0 : }
1516 :
1517 : /*
1518 : * Find the pmemrange that contains the given page number.
1519 : *
1520 : * (Manually traverses the binary tree, because that is cheaper on stack
1521 : * usage.)
1522 : */
1523 : struct uvm_pmemrange *
1524 0 : uvm_pmemrange_find(paddr_t pageno)
1525 : {
1526 : struct uvm_pmemrange *pmr;
1527 :
1528 0 : pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1529 0 : while (pmr != NULL) {
1530 0 : if (pmr->low > pageno)
1531 0 : pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1532 0 : else if (pmr->high <= pageno)
1533 0 : pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1534 : else
1535 : break;
1536 : }
1537 :
1538 0 : return pmr;
1539 : }
1540 :
1541 : #if defined(DDB) || defined(DEBUG)
1542 : /*
1543 : * Return true if the given page is in any of the free lists.
1544 : * Used by uvm_page_printit.
1545 : * This function is safe, even if the page is not on the freeq.
1546 : * Note: does not apply locking, only called from ddb.
1547 : */
1548 : int
1549 0 : uvm_pmr_isfree(struct vm_page *pg)
1550 : {
1551 : struct vm_page *r;
1552 : struct uvm_pmemrange *pmr;
1553 :
1554 0 : pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1555 0 : if (pmr == NULL)
1556 0 : return 0;
1557 0 : r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1558 0 : if (r == NULL)
1559 0 : r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1560 0 : else if (r != pg)
1561 0 : r = RBT_PREV(uvm_pmr_addr, r);
1562 0 : if (r == NULL)
1563 0 : return 0; /* Empty tree. */
1564 :
1565 : KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1566 0 : return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1567 0 : atop(VM_PAGE_TO_PHYS(pg));
1568 0 : }
1569 : #endif /* DEBUG */
1570 :
1571 : /*
1572 : * Given a root of a tree, find a range which intersects start, end and
1573 : * is of the same memtype.
1574 : *
1575 : * Page must be in the address tree.
1576 : */
1577 : struct vm_page*
1578 0 : uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1579 : paddr_t start, paddr_t end, int memtype)
1580 : {
1581 : int direction;
1582 : struct vm_page *root;
1583 : struct vm_page *high, *high_next;
1584 : struct vm_page *low, *low_next;
1585 :
1586 : KDASSERT(pmr != NULL && init_root != NULL);
1587 : root = init_root;
1588 :
1589 : /* Which direction to use for searching. */
1590 0 : if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1591 0 : direction = 1;
1592 0 : else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1593 : direction = -1;
1594 : else /* nothing to do */
1595 0 : return root;
1596 :
1597 : /* First, update root to fall within the chosen range. */
1598 0 : while (root && !PMR_INTERSECTS_WITH(
1599 : atop(VM_PAGE_TO_PHYS(root)),
1600 : atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1601 : start, end)) {
1602 0 : if (direction == 1)
1603 0 : root = RBT_RIGHT(uvm_objtree, root);
1604 : else
1605 0 : root = RBT_LEFT(uvm_objtree, root);
1606 : }
1607 0 : if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1608 0 : return root;
1609 :
1610 : /*
1611 : * Root is valid, but of the wrong memtype.
1612 : *
1613 : * Try to find a range that has the given memtype in the subtree
1614 : * (memtype mismatches are costly, either because the conversion
1615 : * is expensive, or a later allocation will need to do the opposite
1616 : * conversion, which will be expensive).
1617 : *
1618 : *
1620 : * Cache the upper page, so we can page-walk later.
1621 : */
1622 : high = root;
1623 0 : high_next = RBT_RIGHT(uvm_objtree, high);
1624 0 : while (high_next != NULL && PMR_INTERSECTS_WITH(
1625 : atop(VM_PAGE_TO_PHYS(high_next)),
1626 : atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1627 : start, end)) {
1628 : high = high_next;
1629 0 : if (uvm_pmr_pg_to_memtype(high) == memtype)
1630 0 : return high;
1631 0 : high_next = RBT_RIGHT(uvm_objtree, high);
1632 : }
1633 :
1634 : /*
1635 : * Second, decrease the address until we hit something we can use.
1636 : * Cache the lower page, so we can page-walk later.
1637 : */
1638 : low = root;
1639 0 : low_next = RBT_LEFT(uvm_objtree, low);
1640 0 : while (low_next != NULL && PMR_INTERSECTS_WITH(
1641 : atop(VM_PAGE_TO_PHYS(low_next)),
1642 : atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1643 : start, end)) {
1644 : low = low_next;
1645 0 : if (uvm_pmr_pg_to_memtype(low) == memtype)
1646 0 : return low;
1647 0 : low_next = RBT_LEFT(uvm_objtree, low);
1648 : }
1649 :
1650 0 : if (low == high)
1651 0 : return NULL;
1652 :
1653 : /* No hits. Walk the address tree until we find something usable. */
1654 0 : for (low = RBT_NEXT(uvm_pmr_addr, low);
1655 0 : low != high;
1656 0 : low = RBT_NEXT(uvm_pmr_addr, low)) {
1657 : KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1658 : atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1659 : start, end));
1660 0 : if (uvm_pmr_pg_to_memtype(low) == memtype)
1661 0 : return low;
1662 : }
1663 :
1664 : /* Nothing found. */
1665 0 : return NULL;
1666 0 : }
1667 :
1668 : /*
1669 : * Allocate any page, the fastest way. Page number constraints only.
1670 : */
1671 : psize_t
1672 0 : uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1673 : paddr_t start, paddr_t end, int memtype_only)
1674 : {
1675 : struct uvm_pmemrange *pmr;
1676 : struct vm_page *found, *splitpg;
1677 : psize_t fcount;
1678 : int memtype;
1679 :
1680 : fcount = 0;
1681 0 : TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1682 : /* We're done. */
1683 0 : if (fcount == count)
1684 : break;
1685 :
1686 : /* Outside requested range. */
1687 0 : if (!(start == 0 && end == 0) &&
1688 0 : !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1689 : continue;
1690 :
1691 : /* Range is empty. */
1692 0 : if (pmr->nsegs == 0)
1693 : continue;
1694 :
1695 : /* Loop over all memtypes, starting at memtype_init. */
1696 : memtype = memtype_init;
1697 0 : while (fcount != count) {
1698 0 : found = TAILQ_FIRST(&pmr->single[memtype]);
1699 : /*
1700 : * If found is outside the range, walk the list
1701 : * until we find something that intersects with
1702 : * boundaries.
1703 : */
1704 0 : while (found && !PMR_INTERSECTS_WITH(
1705 : atop(VM_PAGE_TO_PHYS(found)),
1706 : atop(VM_PAGE_TO_PHYS(found)) + 1,
1707 : start, end))
1708 0 : found = TAILQ_NEXT(found, pageq);
1709 :
1710 0 : if (found == NULL) {
1711 : /*
1712 : * Check if the size tree contains a range
1713 : * that intersects with the boundaries. As the
1714 : * allocation is for any page, try the smallest
1715 : * range so that large ranges are preserved for
1716 : * more constrained cases. Only one entry is
1717 : * checked here, to avoid a brute-force search.
1718 : *
1719 : * Note that a size tree gives pg[1] instead of
1720 : * pg[0].
1721 : */
1722 0 : found = RBT_MIN(uvm_pmr_size,
1723 : &pmr->size[memtype]);
1724 0 : if (found != NULL) {
1725 0 : found--;
1726 0 : if (!PMR_INTERSECTS_WITH(
1727 : atop(VM_PAGE_TO_PHYS(found)),
1728 : atop(VM_PAGE_TO_PHYS(found)) +
1729 : found->fpgsz, start, end))
1730 0 : found = NULL;
1731 : }
1732 : }
1733 0 : if (found == NULL) {
1734 : /*
1735 : * Try address-guided search to meet the page
1736 : * number constraints.
1737 : */
1738 0 : found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1739 0 : if (found != NULL) {
1740 0 : found = uvm_pmr_rootupdate(pmr, found,
1741 : start, end, memtype);
1742 0 : }
1743 : }
1744 0 : if (found != NULL) {
1745 : uvm_pmr_assertvalid(pmr);
1746 0 : uvm_pmr_remove_size(pmr, found);
1747 :
1748 : /*
1749 : * If the page intersects the end, then it'll
1750 : * need splitting.
1751 : *
1752 : * Note that we don't need to split if the page
1753 : * intersects start: the drain function will
1754 : * simply stop on hitting start.
1755 : */
1756 0 : if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1757 0 : found->fpgsz > end) {
1758 : psize_t splitsz =
1759 : atop(VM_PAGE_TO_PHYS(found)) +
1760 0 : found->fpgsz - end;
1761 :
1762 0 : uvm_pmr_remove_addr(pmr, found);
1763 : uvm_pmr_assertvalid(pmr);
1764 0 : found->fpgsz -= splitsz;
1765 0 : splitpg = found + found->fpgsz;
1766 0 : splitpg->fpgsz = splitsz;
1767 0 : uvm_pmr_insert(pmr, splitpg, 1);
1768 :
1769 : /*
1770 : * At this point, splitpg and found
1771 : * actually should be joined.
1772 : * But we explicitly disable that,
1773 : * because we will start subtracting
1774 : * from found.
1775 : */
1776 0 : KASSERT(start == 0 ||
1777 : atop(VM_PAGE_TO_PHYS(found)) +
1778 : found->fpgsz > start);
1779 0 : uvm_pmr_insert_addr(pmr, found, 1);
1780 0 : }
1781 :
1782 : /*
1783 : * Fetch pages from the end.
1784 : * If the range is larger than the requested
1785 : * number of pages, this saves us an addr-tree
1786 : * update.
1787 : *
1788 : * Since we take from the end and insert at
1789 : * the head, any ranges stay preserved.
1790 : */
1791 0 : while (found->fpgsz > 0 && fcount < count &&
1792 0 : (start == 0 ||
1793 0 : atop(VM_PAGE_TO_PHYS(found)) +
1794 0 : found->fpgsz > start)) {
1795 0 : found->fpgsz--;
1796 0 : fcount++;
1797 0 : TAILQ_INSERT_HEAD(result,
1798 : &found[found->fpgsz], pageq);
1799 : }
1800 0 : if (found->fpgsz > 0) {
1801 0 : uvm_pmr_insert_size(pmr, found);
1802 : KDASSERT(fcount == count);
1803 : uvm_pmr_assertvalid(pmr);
1804 0 : return fcount;
1805 : }
1806 :
1807 : /*
1808 : * Delayed addr-tree removal.
1809 : */
1810 0 : uvm_pmr_remove_addr(pmr, found);
1811 : uvm_pmr_assertvalid(pmr);
1812 0 : } else {
1813 0 : if (memtype_only)
1814 : break;
1815 : /*
1816 : * Skip to the next memtype.
1817 : */
1818 0 : memtype += 1;
1819 0 : if (memtype == UVM_PMR_MEMTYPE_MAX)
1820 : memtype = 0;
1821 0 : if (memtype == memtype_init)
1822 : break;
1823 : }
1824 : }
1825 : }
1826 :
1827 : /*
1828 : * Search finished.
1829 : *
1830 : * Ran out of ranges before enough pages were gathered, or we hit the
1831 : * case where found->fpgsz == count - fcount, in which case the
1832 : * above exit condition didn't trigger.
1833 : *
1834 : * On failure, caller will free the pages.
1835 : */
1836 0 : return fcount;
1837 0 : }
1838 :
1839 : #ifdef DDB
1840 : /*
1841 : * Print information about pmemrange.
1842 : * Does not do locking (so either call it from DDB or acquire fpageq lock
1843 : * before invoking).
1844 : */
1845 : void
1846 0 : uvm_pmr_print(void)
1847 : {
1848 : struct uvm_pmemrange *pmr;
1849 : struct vm_page *pg;
1850 0 : psize_t size[UVM_PMR_MEMTYPE_MAX];
1851 : psize_t free;
1852 : int useq_len;
1853 : int mt;
1854 :
1855 0 : printf("Ranges, use queue:\n");
1856 : useq_len = 0;
1857 0 : TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1858 0 : useq_len++;
1859 : free = 0;
1860 0 : for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1861 0 : pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
1862 0 : if (pg != NULL)
1863 0 : pg--;
1864 : else
1865 0 : pg = TAILQ_FIRST(&pmr->single[mt]);
1866 0 : size[mt] = (pg == NULL ? 0 : pg->fpgsz);
1867 :
1868 0 : RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
1869 0 : free += pg->fpgsz;
1870 : }
1871 :
1872 0 : printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
1873 0 : (unsigned long)pmr->low, (unsigned long)pmr->high,
1874 0 : pmr->use, (unsigned long)pmr->nsegs);
1875 0 : for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1876 0 : printf(" maxsegsz[%d]=0x%lx", mt,
1877 0 : (unsigned long)size[mt]);
1878 : }
1879 0 : printf(" free=0x%lx\n", (unsigned long)free);
1880 : }
1881 0 : printf("#ranges = %d\n", useq_len);
1882 0 : }
1883 : #endif
1884 :
1885 : /*
1886 : * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
1887 : * in a specific physmem area.
1888 : *
1889 : * Returns ENOMEM if the pagedaemon failed to free any pages.
1890 : * If not failok, failure will lead to panic.
1891 : *
1892 : * Must be called with fpageq locked.
1893 : */
1894 : int
1895 0 : uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
1896 : {
1897 0 : struct uvm_pmalloc pma;
1898 : const char *wmsg = "pmrwait";
1899 :
1900 0 : if (curproc == uvm.pagedaemon_proc) {
1901 : /*
1902 : * This is not that uncommon when the pagedaemon is trying
1903 : * to flush out a large mmapped file. VOP_WRITE will circle
1904 : * back through the buffer cache and try to get more memory.
1905 : * The pagedaemon starts by calling bufbackoff, but we can
1906 : * easily use up that reserve in a single scan iteration.
1907 : */
1908 0 : uvm_unlock_fpageq();
1909 0 : if (bufbackoff(NULL, atop(size)) == 0) {
1910 : uvm_lock_fpageq();
1911 0 : return 0;
1912 : }
1913 : uvm_lock_fpageq();
1914 :
1915 : /*
1916 : * XXX detect pagedaemon deadlock - see comment in
1917 : * uvm_wait(), as this is exactly the same issue.
1918 : */
1919 0 : printf("pagedaemon: wait_pla deadlock detected!\n");
1920 0 : msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
1921 : #if defined(DEBUG)
1922 : /* DEBUG: panic so we can debug it */
1923 : panic("wait_pla pagedaemon deadlock");
1924 : #endif
1925 0 : return 0;
1926 : }
1927 :
1928 0 : for (;;) {
1929 0 : pma.pm_constraint.ucr_low = low;
1930 0 : pma.pm_constraint.ucr_high = high;
1931 0 : pma.pm_size = size;
1932 0 : pma.pm_flags = UVM_PMA_LINKED;
1933 0 : TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
1934 :
1935 0 : wakeup(&uvm.pagedaemon); /* wake the daemon! */
1936 0 : while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
1937 0 : msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
1938 :
1939 0 : if (!(pma.pm_flags & UVM_PMA_FREED) &&
1940 0 : pma.pm_flags & UVM_PMA_FAIL) {
1941 0 : if (failok)
1942 0 : return ENOMEM;
1943 0 : printf("uvm_wait: failed to free %ld pages between "
1944 0 : "0x%lx-0x%lx\n", atop(size), low, high);
1945 : } else
1946 0 : return 0;
1947 : }
1948 : /* UNREACHABLE */
1949 0 : }
1950 :
1951 : /*
1952 : * Wake up uvm_pmalloc sleepers.
1953 : */
1954 : void
1955 0 : uvm_wakeup_pla(paddr_t low, psize_t len)
1956 : {
1957 : struct uvm_pmalloc *pma, *pma_next;
1958 : paddr_t high;
1959 :
1960 0 : high = low + len;
1961 :
1962 : /* Wake specific allocations waiting for this memory. */
1963 0 : for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
1964 : pma = pma_next) {
1965 0 : pma_next = TAILQ_NEXT(pma, pmq);
1966 :
1967 0 : if (low < pma->pm_constraint.ucr_high &&
1968 0 : high > pma->pm_constraint.ucr_low) {
1969 0 : pma->pm_flags |= UVM_PMA_FREED;
1970 0 : if (!(pma->pm_flags & UVM_PMA_BUSY)) {
1971 0 : pma->pm_flags &= ~UVM_PMA_LINKED;
1972 0 : TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
1973 : pmq);
1974 0 : wakeup(pma);
1975 0 : }
1976 : }
1977 : }
1978 0 : }
1979 :
1980 : void
1981 0 : uvm_pagezero_thread(void *arg)
1982 : {
1983 0 : struct pglist pgl;
1984 : struct vm_page *pg;
1985 : int count;
1986 :
1987 : /* Run at the lowest possible priority. */
1988 0 : curproc->p_p->ps_nice = NZERO + PRIO_MAX;
1989 :
1990 0 : KERNEL_UNLOCK();
1991 :
1992 0 : TAILQ_INIT(&pgl);
1993 0 : for (;;) {
1994 0 : uvm_lock_fpageq();
1995 0 : while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
1996 0 : (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
1997 0 : &pgl, 0, 0, 1)) == 0) {
1998 0 : msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI,
1999 : "pgzero", 0);
2000 : }
2001 0 : uvm_unlock_fpageq();
2002 :
2003 0 : TAILQ_FOREACH(pg, &pgl, pageq) {
2004 0 : uvm_pagezero(pg);
2005 0 : atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2006 : }
2007 :
2008 0 : uvm_lock_fpageq();
2009 0 : while (!TAILQ_EMPTY(&pgl))
2010 0 : uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2011 0 : uvmexp.zeropages += count;
2012 0 : uvm_unlock_fpageq();
2013 :
2014 0 : yield();
2015 : }
2016 : }
|