LCOV - code coverage report
Current view: top level - uvm - uvm_pmemrange.c (source / functions)
Test: 6.4                            Date: 2018-10-19 03:25:38
Coverage:   Lines: 0 / 637 (0.0 %)   Functions: 0 / 37 (0.0 %)
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*      $OpenBSD: uvm_pmemrange.c,v 1.53 2016/09/16 02:52:24 dlg Exp $  */
       2             : 
       3             : /*
       4             :  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
       5             :  *
       6             :  * Permission to use, copy, modify, and distribute this software for any
       7             :  * purpose with or without fee is hereby granted, provided that the above
       8             :  * copyright notice and this permission notice appear in all copies.
       9             :  *
      10             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      11             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      12             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      13             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      14             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      15             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      16             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      17             :  */
      18             : 
      19             : #include <sys/param.h>
      20             : #include <sys/systm.h>
      21             : #include <uvm/uvm.h>
      22             : #include <sys/malloc.h>
      23             : #include <sys/kernel.h>
      24             : #include <sys/kthread.h>
      25             : #include <sys/mount.h>
      26             : 
      27             : /*
      28             :  * 2 trees: addr tree and size tree.
      29             :  *
      30             :  * The allocator keeps chunks of free pages (called a range).
      31             :  * Two pages are part of the same range if:
      32             :  * - all pages in between are part of that range,
      33             :  * - they are of the same memory type (zeroed or non-zeroed),
      34             :  * - they are part of the same pmemrange.
      35             :  * A pmemrange is a range of memory which is part of the same vm_physseg
      36             :  * and has a use-count.
      37             :  *
      38             :  * addr tree is vm_page[0].objt
      39             :  * size tree is vm_page[1].objt
      40             :  *
       41             :  * The size tree is not used for memory ranges of 1 page; instead, the
       42             :  * single-page queue is vm_page[0].pageq
      43             :  *
       44             :  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
      45             :  * are joined, unless:
      46             :  * - they have pages in between them which are not free
      47             :  * - they belong to different memtypes (zeroed vs dirty memory)
      48             :  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
      49             :  * - they are not a continuation of the same array
      50             :  * The latter issue is caused by vm_physseg ordering and splitting from the
       51             :  * MD initialization machinery. The MD code is dependent on freelists and
      52             :  * happens to split ISA memory from non-ISA memory.
      53             :  * (Note: freelists die die die!)
      54             :  *
      55             :  * uvm_page_init guarantees that every vm_physseg contains an array of
      56             :  * struct vm_page. Also, uvm_page_physload allocates an array of struct
      57             :  * vm_page. This code depends on that array. The array may break across
       58             :  * vm_physseg boundaries.
      59             :  */
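                      : /*
                      :  * Illustration of the tree overlay (see uvm_pmr_insert_addr() and
                      :  * uvm_pmr_insert_size() below): a free range headed by pg uses the
                      :  * objt entry of its first page for the address tree and the objt
                      :  * entry of its second page for the size tree, so one vm_page never
                      :  * needs two tree entries at once:
                      :  *
                      :  *      RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);           pg[0].objt
                      :  *      RBT_INSERT(uvm_pmr_size, &pmr->size[mt], pg + 1);   pg[1].objt
                      :  *
                      :  * This is also why uvm_pmr_size_cmp() reads (lhs - 1)->fpgsz.
                      :  */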
      60             : 
      61             : /*
      62             :  * Validate the flags of the page. (Used in asserts.)
      63             :  * Any free page must have the PQ_FREE flag set.
      64             :  * Free pages may be zeroed.
      65             :  * Pmap flags are left untouched.
      66             :  *
      67             :  * The PQ_FREE flag is not checked here: by not checking, we can easily use
       68             :  * this check on pages which are being freed.
      69             :  */
      70             : #define VALID_FLAGS(pg_flags)                                           \
      71             :         (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
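                      : /*
                      :  * Illustration (values per the mask above): a page with
                      :  * PQ_FREE|PG_ZERO satisfies VALID_FLAGS(), while one still carrying
                      :  * e.g. PG_BUSY does not, since PG_BUSY falls outside the
                      :  * PQ_FREE|PG_ZERO|PG_PMAPMASK mask.
                      :  */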
      72             : 
      73             : /* Tree comparators. */
      74             : int     uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
      75             :             const struct uvm_pmemrange *);
      76             : int     uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
      77             : int     uvm_pmr_pg_to_memtype(struct vm_page *);
      78             : 
      79             : #ifdef DDB
      80             : void    uvm_pmr_print(void);
      81             : #endif
      82             : 
      83             : /*
      84             :  * Memory types. The page flags are used to derive what the current memory
      85             :  * type of a page is.
      86             :  */
      87             : int
      88           0 : uvm_pmr_pg_to_memtype(struct vm_page *pg)
      89             : {
      90           0 :         if (pg->pg_flags & PG_ZERO)
      91           0 :                 return UVM_PMR_MEMTYPE_ZERO;
      92             :         /* Default: dirty memory. */
      93           0 :         return UVM_PMR_MEMTYPE_DIRTY;
      94           0 : }
      95             : 
      96             : /* Trees. */
      97           0 : RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
      98           0 : RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
      99           0 : RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
     100             :     uvm_pmemrange_addr_cmp);
     101             : 
     102             : /* Validation. */
     103             : #ifdef DEBUG
     104             : void    uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
     105             : #else
     106             : #define uvm_pmr_assertvalid(pmr)        do {} while (0)
     107             : #endif
     108             : 
     109             : psize_t                  uvm_pmr_get1page(psize_t, int, struct pglist *,
     110             :                             paddr_t, paddr_t, int);
     111             : 
     112             : struct uvm_pmemrange    *uvm_pmr_allocpmr(void);
     113             : struct vm_page          *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
     114             : struct vm_page          *uvm_pmr_nextsz(struct uvm_pmemrange *,
     115             :                             struct vm_page *, int);
     116             : void                     uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
     117             :                             struct vm_page *pg, struct vm_page **pg_prev,
     118             :                             struct vm_page **pg_next);
     119             : struct vm_page          *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
     120             :                             struct vm_page *, paddr_t);
     121             : psize_t                  uvm_pmr_remove_1strange(struct pglist *, paddr_t,
     122             :                             struct vm_page **, int);
     123             : void                     uvm_pmr_split(paddr_t);
     124             : struct uvm_pmemrange    *uvm_pmemrange_find(paddr_t);
     125             : struct uvm_pmemrange    *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
     126             :                             struct uvm_pmemrange *);
     127             : psize_t                  pow2divide(psize_t, psize_t);
     128             : struct vm_page          *uvm_pmr_rootupdate(struct uvm_pmemrange *,
     129             :                             struct vm_page *, paddr_t, paddr_t, int);
     130             : 
     131             : /*
     132             :  * Computes num/denom and rounds it up to the next power-of-2.
     133             :  *
     134             :  * This is a division function which calculates an approximation of
     135             :  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
     136             :  * have to be accurate.
     137             :  *
     138             :  * Providing too large a value makes the allocator slightly faster, at the
     139             :  * risk of hitting the failure case more often. Providing too small a value
     140             :  * makes the allocator a bit slower, but less likely to hit a failure case.
     141             :  */
     142             : psize_t
     143           0 : pow2divide(psize_t num, psize_t denom)
     144             : {
     145             :         int rshift;
     146             : 
     147           0 :         for (rshift = 0; num > denom; rshift++, denom <<= 1)
     148             :                 ;
     149           0 :         return (paddr_t)1 << rshift;
     150             : }
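                      : /*
                      :  * Worked example (hypothetical values): pow2divide(10, 3) iterates
                      :  * rshift=0 (denom 3), rshift=1 (denom 6), rshift=2 (denom 12, loop
                      :  * ends since 10 <= 12) and returns 1 << 2 = 4: the exact quotient
                      :  * 10/3 =~ 3.3 rounded up to the next power-of-2.
                      :  */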
     151             : 
     152             : /*
      153             :  * Predicate: lhs is a subrange of rhs.
     154             :  *
     155             :  * If rhs_low == 0: don't care about lower bound.
     156             :  * If rhs_high == 0: don't care about upper bound.
     157             :  */
     158             : #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)        \
     159             :         (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&                       \
     160             :         ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
     161             : 
     162             : /*
     163             :  * Predicate: lhs intersects with rhs.
     164             :  *
     165             :  * If rhs_low == 0: don't care about lower bound.
     166             :  * If rhs_high == 0: don't care about upper bound.
      167             :  * Ranges don't intersect if they don't have any page in common; array
     168             :  * semantics mean that < instead of <= should be used here.
     169             :  */
     170             : #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)       \
     171             :         (((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&                       \
     172             :         ((rhs_high) == 0 || (lhs_low) < (rhs_high)))
     173             : 
     174             : /*
     175             :  * Align to power-of-2 alignment.
     176             :  */
     177             : #define PMR_ALIGN(pgno, align)                                          \
     178             :         (((pgno) + ((align) - 1)) & ~((align) - 1))
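                      : /*
                      :  * Worked example: PMR_ALIGN(5, 4) = (5 + 3) & ~3 = 8, while
                      :  * PMR_ALIGN(8, 4) stays 8. align must be a power-of-2 for the mask
                      :  * trick to hold.
                      :  */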
     179             : 
     180             : 
     181             : /*
     182             :  * Comparator: sort by address ascending.
     183             :  */
     184             : int
     185           0 : uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
     186             :     const struct uvm_pmemrange *rhs)
     187             : {
     188           0 :         return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
     189             : }
     190             : 
     191             : /*
     192             :  * Comparator: sort by use ascending.
     193             :  *
     194             :  * The higher the use value of a range, the more devices need memory in
     195             :  * this range. Therefore allocate from the range with the lowest use first.
     196             :  */
     197             : int
     198           0 : uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
     199             : {
     200             :         int result;
     201             : 
     202           0 :         result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
     203           0 :         if (result == 0)
     204           0 :                 result = uvm_pmemrange_addr_cmp(lhs, rhs);
     205           0 :         return result;
     206             : }
     207             : 
     208             : int
     209           0 : uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
     210             : {
     211             :         paddr_t lhs_addr, rhs_addr;
     212             : 
     213           0 :         lhs_addr = VM_PAGE_TO_PHYS(lhs);
     214           0 :         rhs_addr = VM_PAGE_TO_PHYS(rhs);
     215             : 
     216           0 :         return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
     217             : }
     218             : 
     219             : int
     220           0 : uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
     221             : {
     222             :         psize_t lhs_size, rhs_size;
     223             :         int cmp;
     224             : 
     225             :         /* Using second tree, so we receive pg[1] instead of pg[0]. */
     226           0 :         lhs_size = (lhs - 1)->fpgsz;
     227           0 :         rhs_size = (rhs - 1)->fpgsz;
     228             : 
     229           0 :         cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
     230           0 :         if (cmp == 0)
     231           0 :                 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
     232           0 :         return cmp;
     233             : }
     234             : 
     235             : /*
     236             :  * Find the first range of free pages that is at least sz pages long.
     237             :  */
     238             : struct vm_page *
     239           0 : uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
     240             : {
     241             :         struct  vm_page *node, *best;
     242             : 
     243           0 :         KASSERT(sz >= 1);
     244             : 
     245           0 :         if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
     246           0 :                 return TAILQ_FIRST(&pmr->single[mti]);
     247             : 
     248           0 :         node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
     249             :         best = NULL;
     250           0 :         while (node != NULL) {
     251           0 :                 if ((node - 1)->fpgsz >= sz) {
     252             :                         best = (node - 1);
     253           0 :                         node = RBT_LEFT(uvm_objtree, node);
     254           0 :                 } else
     255           0 :                         node = RBT_RIGHT(uvm_objtree, node);
     256             :         }
     257           0 :         return best;
     258           0 : }
     259             : 
     260             : /*
     261             :  * Finds the next range. The next range has a size >= pg->fpgsz.
     262             :  * Returns NULL if no more ranges are available.
     263             :  */
     264             : struct vm_page *
     265           0 : uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
     266             : {
     267             :         struct vm_page *npg;
     268             : 
     269           0 :         KASSERT(pmr != NULL && pg != NULL);
     270           0 :         if (pg->fpgsz == 1) {
     271           0 :                 if (TAILQ_NEXT(pg, pageq) != NULL)
     272           0 :                         return TAILQ_NEXT(pg, pageq);
     273             :                 else
     274           0 :                         npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
     275           0 :         } else
     276           0 :                 npg = RBT_NEXT(uvm_pmr_size, pg + 1);
     277             : 
     278           0 :         return npg == NULL ? NULL : npg - 1;
     279           0 : }
     280             : 
     281             : /*
     282             :  * Finds the previous and next ranges relative to the (uninserted) pg range.
     283             :  *
      284             :  * *pg_prev == NULL if no previous range that can join with pg is
      285             :  *      available.
      286             :  * *pg_next == NULL if no next range that can join with pg is
      287             :  *      available.
     288             :  */
     289             : void
     290           0 : uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
     291             :     struct vm_page **pg_prev, struct vm_page **pg_next)
     292             : {
     293           0 :         KASSERT(pg_prev != NULL && pg_next != NULL);
     294             : 
     295           0 :         *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
     296           0 :         if (*pg_next == NULL)
     297           0 :                 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
     298             :         else
     299           0 :                 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
     300             : 
     301             :         KDASSERT(*pg_next == NULL ||
     302             :             VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
     303             :         KDASSERT(*pg_prev == NULL ||
     304             :             VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
     305             : 
     306             :         /* Reset if not contig. */
     307           0 :         if (*pg_prev != NULL &&
     308           0 :             (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
     309           0 :             != atop(VM_PAGE_TO_PHYS(pg)) ||
     310           0 :             *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
     311           0 :             uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
     312           0 :                 *pg_prev = NULL;
     313           0 :         if (*pg_next != NULL &&
     314           0 :             (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
     315           0 :             != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
     316           0 :             pg + pg->fpgsz != *pg_next || /* Array broke. */
     317           0 :             uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
     318           0 :                 *pg_next = NULL;
     319           0 :         return;
     320             : }
     321             : 
     322             : /*
     323             :  * Remove a range from the address tree.
     324             :  * Address tree maintains pmr counters.
     325             :  */
     326             : void
     327           0 : uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
     328             : {
     329             :         KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
     330             :         KDASSERT(pg->pg_flags & PQ_FREE);
     331           0 :         RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
     332             : 
     333           0 :         pmr->nsegs--;
     334           0 : }
     335             : /*
     336             :  * Remove a range from the size tree.
     337             :  */
     338             : void
     339           0 : uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
     340             : {
     341             :         int memtype;
     342             : #ifdef DEBUG
     343             :         struct vm_page *i;
     344             : #endif
     345             : 
     346             :         KDASSERT(pg->fpgsz >= 1);
     347             :         KDASSERT(pg->pg_flags & PQ_FREE);
     348           0 :         memtype = uvm_pmr_pg_to_memtype(pg);
     349             : 
     350           0 :         if (pg->fpgsz == 1) {
     351             : #ifdef DEBUG
     352             :                 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
     353             :                         if (i == pg)
     354             :                                 break;
     355             :                 }
     356             :                 KDASSERT(i == pg);
     357             : #endif
     358           0 :                 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
     359           0 :         } else {
     360             :                 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
     361             :                     pg + 1) == pg + 1);
     362           0 :                 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
     363             :         }
     364           0 : }
     365             : /* Remove from both trees. */
     366             : void
     367           0 : uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
     368             : {
     369             :         uvm_pmr_assertvalid(pmr);
     370           0 :         uvm_pmr_remove_size(pmr, pg);
     371           0 :         uvm_pmr_remove_addr(pmr, pg);
     372             :         uvm_pmr_assertvalid(pmr);
     373           0 : }
     374             : 
     375             : /*
     376             :  * Insert the range described in pg.
     377             :  * Returns the range thus created (which may be joined with the previous and
     378             :  * next ranges).
     379             :  * If no_join, the caller guarantees that the range cannot possibly join
      380             :  * with adjacent ranges.
     381             :  */
     382             : struct vm_page *
     383           0 : uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
     384             : {
     385           0 :         struct vm_page *prev, *next;
     386             : 
     387             : #ifdef DEBUG
     388             :         struct vm_page *i;
     389             :         int mt;
     390             : #endif
     391             : 
     392             :         KDASSERT(pg->pg_flags & PQ_FREE);
     393             :         KDASSERT(pg->fpgsz >= 1);
     394             : 
     395             : #ifdef DEBUG
     396             :         for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
     397             :                 TAILQ_FOREACH(i, &pmr->single[mt], pageq)
     398             :                         KDASSERT(i != pg);
     399             :                 if (pg->fpgsz > 1) {
     400             :                         KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
     401             :                             pg + 1) == NULL);
     402             :                 }
     403             :                 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
     404             :         }
     405             : #endif
     406             : 
     407           0 :         if (!no_join) {
     408           0 :                 uvm_pmr_pnaddr(pmr, pg, &prev, &next);
     409           0 :                 if (next != NULL) {
     410           0 :                         uvm_pmr_remove_size(pmr, next);
     411           0 :                         uvm_pmr_remove_addr(pmr, next);
     412           0 :                         pg->fpgsz += next->fpgsz;
     413           0 :                         next->fpgsz = 0;
     414           0 :                 }
     415           0 :                 if (prev != NULL) {
     416           0 :                         uvm_pmr_remove_size(pmr, prev);
     417           0 :                         prev->fpgsz += pg->fpgsz;
     418           0 :                         pg->fpgsz = 0;
     419           0 :                         return prev;
     420             :                 }
     421             :         }
     422             : 
     423           0 :         RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
     424             : 
     425           0 :         pmr->nsegs++;
     426             : 
     427           0 :         return pg;
     428           0 : }
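                      : /*
                      :  * Join illustration (hypothetical page numbers): inserting a free
                      :  * range covering pages 8-9 between free ranges 5-7 and 10-12 (same
                      :  * memtype, same vm_page array) merges all three: the next range is
                      :  * absorbed into pg (fpgsz 2+3), then prev absorbs pg (fpgsz 3+5) and
                      :  * is returned as one range headed at page 5 with fpgsz == 8, still
                      :  * present in the address tree.
                      :  */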
     429             : /*
      430             :  * Insert the range described in pg into the size tree (or, for a
      431             :  * single page, into the single-page queue) of its memtype. Note that
      432             :  * unlike uvm_pmr_insert_addr(), this function returns nothing.
      433             :  * The page must already be in the address tree.
     434             :  */
     435             : void
     436           0 : uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
     437             : {
     438             :         int memtype;
     439             : #ifdef DEBUG
     440             :         struct vm_page *i;
     441             :         int mti;
     442             : #endif
     443             : 
     444             :         KDASSERT(pg->fpgsz >= 1);
     445             :         KDASSERT(pg->pg_flags & PQ_FREE);
     446             : 
     447           0 :         memtype = uvm_pmr_pg_to_memtype(pg);
     448             : #ifdef DEBUG
     449             :         for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
     450             :                 TAILQ_FOREACH(i, &pmr->single[mti], pageq)
     451             :                         KDASSERT(i != pg);
     452             :                 if (pg->fpgsz > 1) {
     453             :                         KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
     454             :                             pg + 1) == NULL);
     455             :                 }
     456             :                 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
     457             :         }
     458             :         for (i = pg; i < pg + pg->fpgsz; i++)
     459             :                 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
     460             : #endif
     461             : 
     462           0 :         if (pg->fpgsz == 1)
     463           0 :                 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
     464             :         else
     465           0 :                 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
     466           0 : }
     467             : /* Insert in both trees. */
     468             : struct vm_page *
     469           0 : uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
     470             : {
     471             :         uvm_pmr_assertvalid(pmr);
     472           0 :         pg = uvm_pmr_insert_addr(pmr, pg, no_join);
     473           0 :         uvm_pmr_insert_size(pmr, pg);
     474             :         uvm_pmr_assertvalid(pmr);
     475           0 :         return pg;
     476             : }
     477             : 
     478             : /*
     479             :  * Find the last page that is part of this segment.
     480             :  * => pg: the range at which to start the search.
     481             :  * => boundary: the page number boundary specification (0 = no boundary).
     482             :  * => pmr: the pmemrange of the page.
     483             :  * 
      484             :  * This function returns the page one before the next range; to get
      485             :  * the next range, run TAILQ_NEXT(result, pageq) after calling.
      486             :  * The reason is that this way, the length of the segment is easily
      487             :  * calculated using: atop(result) - atop(pg) + 1.
      488             :  * Hence this function also never returns NULL.
     489             :  */
     490             : struct vm_page *
     491           0 : uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
     492             :     struct vm_page *pg, paddr_t boundary)
     493             : {
     494             :         paddr_t first_boundary;
     495             :         struct  vm_page *next;
     496             :         struct  vm_page *prev;
     497             : 
     498             :         KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
     499             :             pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
     500           0 :         if (boundary != 0) {
     501             :                 first_boundary =
     502           0 :                     PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
     503           0 :         } else
     504             :                 first_boundary = 0;
     505             : 
     506             :         /*
     507             :          * Increase next until it hits the first page of the next segment.
     508             :          *
     509             :          * While loop checks the following:
     510             :          * - next != NULL       we have not reached the end of pgl
     511             :          * - boundary == 0 || next < first_boundary
     512             :          *                      we do not cross a boundary
     513             :          * - atop(prev) + 1 == atop(next)
     514             :          *                      still in the same segment
     515             :          * - low <= last
     516             :          * - high > last     still in the same memory range
     517             :          * - memtype is equal   allocator is unable to view different memtypes
     518             :          *                      as part of the same segment
     519             :          * - prev + 1 == next   no array breakage occurs
     520             :          */
     521             :         prev = pg;
     522           0 :         next = TAILQ_NEXT(prev, pageq);
     523           0 :         while (next != NULL &&
     524           0 :             (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
     525           0 :             atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
     526           0 :             pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
     527           0 :             pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
     528           0 :             uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
     529           0 :             prev + 1 == next) {
     530             :                 prev = next;
     531           0 :                 next = TAILQ_NEXT(prev, pageq);
     532             :         }
     533             : 
     534             :         /*
     535             :          * End of this segment.
     536             :          */
     537           0 :         return prev;
     538             : }
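                      : /*
                      :  * Usage sketch (hypothetical values): if pg is at page number 10 and
                      :  * pages 10-13 form one contiguous free segment on pgl, then
                      :  * end = uvm_pmr_findnextsegment(pmr, pg, 0) lands on page 13, the
                      :  * length is atop(VM_PAGE_TO_PHYS(end)) - atop(VM_PAGE_TO_PHYS(pg)) + 1
                      :  * == 4, and TAILQ_NEXT(end, pageq) is the start of the next segment.
                      :  */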
     539             : 
     540             : /*
     541             :  * Remove the first segment of contiguous pages from pgl.
     542             :  * A segment ends if it crosses boundary (unless boundary = 0) or
     543             :  * if it would enter a different uvm_pmemrange.
     544             :  *
     545             :  * Work: the page range that the caller is currently working with.
      546             :  * May be NULL.
     547             :  *
     548             :  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
     549             :  * the first segment is erased (which, if called by uvm_pmr_getpages(),
     550             :  * probably is the smallest or very close to it).
     551             :  */
     552             : psize_t
     553           0 : uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
     554             :     struct vm_page **work, int is_desperate)
     555             : {
     556             :         struct vm_page *start, *end, *iter, *iter_end, *inserted;
     557             :         psize_t count;
     558             :         struct uvm_pmemrange *pmr, *pmr_iter;
     559             : 
     560           0 :         KASSERT(!TAILQ_EMPTY(pgl));
     561             : 
     562             :         /*
     563             :          * Initialize to first page.
     564             :          * Unless desperate scan finds a better candidate, this is what'll be
     565             :          * erased.
     566             :          */
     567             :         start = TAILQ_FIRST(pgl);
     568           0 :         pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
     569           0 :         end = uvm_pmr_findnextsegment(pmr, start, boundary);
     570             : 
     571             :         /*
     572             :          * If we are desperate, we _really_ want to get rid of the smallest
     573             :          * element (rather than a close match to the smallest element).
     574             :          */
     575           0 :         if (is_desperate) {
     576             :                 /* Linear search for smallest segment. */
     577             :                 pmr_iter = pmr;
     578           0 :                 for (iter = TAILQ_NEXT(end, pageq);
     579           0 :                     iter != NULL && start != end;
     580           0 :                     iter = TAILQ_NEXT(iter_end, pageq)) {
     581             :                         /*
     582             :                          * Only update pmr if it doesn't match current
     583             :                          * iteration.
     584             :                          */
     585           0 :                         if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
     586           0 :                             pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
     587           0 :                                 pmr_iter = uvm_pmemrange_find(atop(
     588             :                                     VM_PAGE_TO_PHYS(iter)));
     589           0 :                         }
     590             : 
     591           0 :                         iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
     592             :                             boundary);
     593             : 
     594             :                         /*
     595             :                          * Current iteration is smaller than best match so
     596             :                          * far; update.
     597             :                          */
     598           0 :                         if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
     599           0 :                             VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
     600             :                                 start = iter;
     601             :                                 end = iter_end;
     602             :                                 pmr = pmr_iter;
     603           0 :                         }
     604             :                 }
     605             :         }
     606             : 
     607             :         /*
     608             :          * Calculate count and end of the list.
     609             :          */
     610           0 :         count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
     611           0 :         end = TAILQ_NEXT(end, pageq);
     612             : 
     613             :         /*
     614             :          * Actually remove the range of pages.
     615             :          *
     616             :          * Sadly, this cannot be done using pointer iteration:
     617             :          * vm_physseg is not guaranteed to be sorted on address, hence
     618             :          * uvm_page_init() may not have initialized its array sorted by
     619             :          * page number.
     620             :          */
     621           0 :         for (iter = start; iter != end; iter = iter_end) {
     622           0 :                 iter_end = TAILQ_NEXT(iter, pageq);
     623           0 :                 TAILQ_REMOVE(pgl, iter, pageq);
     624             :         }
     625             : 
     626           0 :         start->fpgsz = count;
     627           0 :         inserted = uvm_pmr_insert(pmr, start, 0);
     628             : 
     629             :         /*
     630             :          * If the caller was working on a range and this function modified
     631             :          * that range, update the pointer.
     632             :          */
     633           0 :         if (work != NULL && *work != NULL &&
     634           0 :             atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
     635           0 :             atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
     636             :             atop(VM_PAGE_TO_PHYS(*work)))
     637           0 :                 *work = inserted;
     638           0 :         return count;
     639             : }
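                      : /*
                      :  * Usage note: the failure paths in uvm_pmr_getpages() below call
                      :  * uvm_pmr_remove_1strange(result, 0, NULL, 0) in a loop, peeling one
                      :  * contiguous segment off result per call and returning it to its
                      :  * pmemrange until the list is empty.
                      :  */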
     640             : 
     641             : /*
     642             :  * Extract a number of pages from a segment of free pages.
     643             :  * Called by uvm_pmr_getpages.
     644             :  *
     645             :  * Returns the segment that was created from pages left over at the tail
      646             :  * of the removed set of pages, or NULL if no pages were left at the tail.
     647             :  */
     648             : struct vm_page *
     649           0 : uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
     650             :     paddr_t start, paddr_t end, struct pglist *result)
     651             : {
     652             :         struct vm_page *after, *pg_i;
     653             :         psize_t before_sz, after_sz;
     654             : #ifdef DEBUG
     655             :         psize_t i;
     656             : #endif
     657             : 
     658             :         KDASSERT(end > start);
     659             :         KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
     660             :         KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
     661             :         KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
     662             :         KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
     663             : 
     664           0 :         before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
     665           0 :         after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
     666             :         KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
     667             :         uvm_pmr_assertvalid(pmr);
     668             : 
     669           0 :         uvm_pmr_remove_size(pmr, pg);
     670           0 :         if (before_sz == 0)
     671           0 :                 uvm_pmr_remove_addr(pmr, pg);
     672           0 :         after = pg + before_sz + (end - start);
     673             : 
     674             :         /* Add selected pages to result. */
     675           0 :         for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
     676           0 :                 KASSERT(pg_i->pg_flags & PQ_FREE);
     677           0 :                 pg_i->fpgsz = 0;
     678           0 :                 TAILQ_INSERT_TAIL(result, pg_i, pageq);
     679             :         }
     680             : 
     681             :         /* Before handling. */
     682           0 :         if (before_sz > 0) {
     683           0 :                 pg->fpgsz = before_sz;
     684           0 :                 uvm_pmr_insert_size(pmr, pg);
     685           0 :         }
     686             : 
     687             :         /* After handling. */
     688           0 :         if (after_sz > 0) {
     689             : #ifdef DEBUG
     690             :                 for (i = 0; i < after_sz; i++) {
     691             :                         KASSERT(!uvm_pmr_isfree(after + i));
     692             :                 }
     693             : #endif
     694             :                 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
     695           0 :                 after->fpgsz = after_sz;
     696           0 :                 after = uvm_pmr_insert_addr(pmr, after, 1);
     697           0 :                 uvm_pmr_insert_size(pmr, after);
     698           0 :         }
     699             : 
     700             :         uvm_pmr_assertvalid(pmr);
     701           0 :         return (after_sz > 0 ? after : NULL);
     702             : }
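                      : /*
                      :  * Split illustration: the extraction carves the middle out of one
                      :  * free range:
                      :  *
                      :  *      pg                  start         end
                      :  *      |<--- before_sz --->|<-- taken -->|<--- after_sz --->|
                      :  *
                      :  * The taken pages move to result; the before part keeps pg as its
                      :  * head and the after part is reinserted as a fresh range (no_join,
                      :  * since by construction it cannot merge with its neighbours).
                      :  */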
     703             : 
     704             : /*
     705             :  * Acquire a number of pages.
     706             :  *
     707             :  * count:       the number of pages returned
     708             :  * start:       lowest page number
     709             :  * end:         highest page number +1
     710             :  *              (start = end = 0: no limitation)
     711             :  * align:       power-of-2 alignment constraint (align = 1: no alignment)
     712             :  * boundary:    power-of-2 boundary (boundary = 0: no boundary)
     713             :  * maxseg:      maximum number of segments to return
     714             :  * flags:       UVM_PLA_* flags
     715             :  * result:      returned pages storage (uses pageq)
     716             :  */
     717             : int
     718           0 : uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
     719             :     paddr_t boundary, int maxseg, int flags, struct pglist *result)
     720             : {
     721             :         struct  uvm_pmemrange *pmr;     /* Iterate memory ranges. */
     722           0 :         struct  vm_page *found, *f_next; /* Iterate chunks. */
     723             :         psize_t fcount;                 /* Current found pages. */
     724             :         int     fnsegs;                 /* Current segment counter. */
     725             :         int     try, start_try;
     726           0 :         psize_t search[3];
     727             :         paddr_t fstart, fend;           /* Pages to be taken from found. */
     728             :         int     memtype;                /* Requested memtype. */
     729             :         int     memtype_init;           /* Best memtype. */
     730             :         int     desperate;              /* True if allocation failed. */
     731             : #ifdef DIAGNOSTIC
     732             :         struct  vm_page *diag_prev;     /* Used during validation. */
     733             : #endif /* DIAGNOSTIC */
     734             : 
     735             :         /*
     736             :          * Validate arguments.
     737             :          */
     738           0 :         KASSERT(count > 0);
     739           0 :         KASSERT(start == 0 || end == 0 || start < end);
     740           0 :         KASSERT(align >= 1);
     741           0 :         KASSERT(powerof2(align));
     742           0 :         KASSERT(maxseg > 0);
     743           0 :         KASSERT(boundary == 0 || powerof2(boundary));
     744           0 :         KASSERT(boundary == 0 || maxseg * boundary >= count);
     745           0 :         KASSERT(TAILQ_EMPTY(result));
     746             : 
     747             :         /*
     748             :          * TRYCONTIG is a noop if you only want a single segment.
     749             :          * Remove it if that's the case: otherwise it'll deny the fast
     750             :          * allocation.
     751             :          */
     752           0 :         if (maxseg == 1 || count == 1)
     753           0 :                 flags &= ~UVM_PLA_TRYCONTIG;
     754             : 
     755             :         /*
     756             :          * Configure search.
     757             :          *
     758             :          * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
     759             :          * search[1] is multiple segments, chosen to fulfill the search in
     760             :          *   approximately even-sized segments.
     761             :          *   This is a good trade-off between slightly reduced allocation speed
     762             :          *   and less fragmentation.
     763             :          * search[2] is the worst case, in which all segments are evaluated.
     764             :          *   This provides the least fragmentation, but makes the search
      765             :  *   possibly longer (although by the time it is selected, speed no
      766             :  *   longer matters much).
     767             :          *
     768             :          * The exception is when maxseg == 1: since we can only fulfill that
      769             :  * with one segment of count pages, only a single search type has to
     770             :          * be attempted.
     771             :          */
     772           0 :         if (maxseg == 1 || count == 1) {
     773             :                 start_try = 2;
     774           0 :                 search[2] = count;
     775           0 :         } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
     776             :                 start_try = 2;
     777           0 :                 search[2] = 1;
     778           0 :         } else {
     779             :                 start_try = 0;
     780           0 :                 search[0] = count;
     781           0 :                 search[1] = pow2divide(count, maxseg);
     782           0 :                 search[2] = 1;
     783           0 :                 if ((flags & UVM_PLA_TRYCONTIG) == 0)
     784           0 :                         start_try = 1;
     785           0 :                 if (search[1] >= search[0]) {
     786           0 :                         search[1] = search[0];
     787             :                         start_try = 1;
     788           0 :                 }
     789           0 :                 if (search[2] >= search[start_try]) {
     790             :                         start_try = 2;
     791           0 :                 }
     792             :         }
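                      : /*
                      :  * Worked example (hypothetical request): count = 10, maxseg = 3 and
                      :  * UVM_PLA_TRYCONTIG set yield
                      :  *      search[0] = 10                      one contiguous segment
                      :  *      search[1] = pow2divide(10, 3) = 4   roughly even segments
                      :  *      search[2] = 1                       any free range at all
                      :  * with start_try = 0, so the fully contiguous attempt runs first.
                      :  */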
     793             : 
     794             :         /*
     795             :          * Memory type: if zeroed memory is requested, traverse the zero set.
     796             :          * Otherwise, traverse the dirty set.
     797             :          *
     798             :          * The memtype iterator is reinitialized to memtype_init on entrance
     799             :          * of a pmemrange.
     800             :          */
     801           0 :         if (flags & UVM_PLA_ZERO)
     802           0 :                 memtype_init = UVM_PMR_MEMTYPE_ZERO;
     803             :         else
     804             :                 memtype_init = UVM_PMR_MEMTYPE_DIRTY;
     805             : 
     806             :         /*
     807             :          * Initially, we're not desperate.
     808             :          *
     809             :          * Note that if we return from a sleep, we are still desperate.
     810             :          * Chances are that memory pressure is still high, so resetting
     811             :          * seems over-optimistic to me.
     812             :          */
     813             :         desperate = 0;
     814             : 
     815           0 :         uvm_lock_fpageq();
     816             : 
     817             : retry:          /* Return point after sleeping. */
     818             :         fcount = 0;
     819           0 :         fnsegs = 0;
     820             : 
     821             : retry_desperate:
     822             :         /*
     823             :          * If we just want any page(s), go for the really fast option.
     824             :          */
     825           0 :         if (count <= maxseg && align == 1 && boundary == 0 &&
     826           0 :             (flags & UVM_PLA_TRYCONTIG) == 0) {
     827           0 :                 fcount += uvm_pmr_get1page(count - fcount, memtype_init,
     828             :                     result, start, end, 0);
     829             : 
     830             :                 /*
      831             :          * If we found sufficient pages, go to the success exit code.
     832             :                  *
     833             :                  * Otherwise, go immediately to fail, since we collected
     834             :                  * all we could anyway.
     835             :                  */
     836           0 :                 if (fcount == count)
     837             :                         goto out;
     838             :                 else
     839             :                         goto fail;
     840             :         }
     841             : 
     842             :         /*
     843             :          * The heart of the contig case.
     844             :          *
     845             :          * The code actually looks like this:
     846             :          *
     847             :          * foreach (struct pmemrange) {
     848             :          *      foreach (memtype) {
     849             :          *              foreach(try) {
     850             :          *                      foreach (free range of memtype in pmemrange,
     851             :          *                          starting at search[try]) {
     852             :          *                              while (range has space left)
     853             :          *                                      take from range
     854             :          *                      }
     855             :          *              }
     856             :          *      }
     857             :          *
     858             :          *      if next pmemrange has higher usecount than current:
     859             :          *              enter desperate case (which will drain the pmemranges
     860             :          *              until empty prior to moving to the next one)
     861             :          * }
     862             :          *
     863             :          * When desperate is activated, try always starts at the highest
     864             :          * value. The memtype loop is using a goto ReScanMemtype.
     865             :          * The try loop is using a goto ReScan.
     866             :          * The 'range has space left' loop uses label DrainFound.
     867             :          *
     868             :          * Writing them all as loops would take up a lot of screen space in
     869             :          * the form of indentation and some parts are easier to express
     870             :          * using the labels.
     871             :          */
     872             : 
     873           0 :         TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
     874             :                 /* Empty range. */
     875           0 :                 if (pmr->nsegs == 0)
     876             :                         continue;
     877             : 
     878             :                 /* Outside requested range. */
     879           0 :                 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
     880             :                         continue;
     881             : 
     882           0 :                 memtype = memtype_init;
     883             : 
     884             : rescan_memtype: /* Return point at memtype++. */
     885           0 :                 try = start_try;
     886             : 
     887             : rescan:         /* Return point at try++. */
     888           0 :                 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
     889           0 :                     found != NULL;
     890           0 :                     found = f_next) {
     891           0 :                         f_next = uvm_pmr_nextsz(pmr, found, memtype);
     892             : 
     893           0 :                         fstart = atop(VM_PAGE_TO_PHYS(found));
     894           0 :                         if (start != 0)
     895           0 :                                 fstart = MAX(start, fstart);
     896             : drain_found:
     897             :                         /*
     898             :                          * Throw away the first segment if fnsegs == maxseg
     899             :                          *
     900             :                          * Note that f_next is still valid after this call,
     901             :                          * since we only allocated from entries before f_next.
     902             :                          * We don't revisit the entries we already extracted
     903             :                          * from unless we entered the desperate case.
     904             :                          */
     905           0 :                         if (fnsegs == maxseg) {
     906           0 :                                 fnsegs--;
     907           0 :                                 fcount -=
     908           0 :                                     uvm_pmr_remove_1strange(result, boundary,
     909             :                                     &found, desperate);
     910           0 :                         }
     911             : 
     912           0 :                         fstart = PMR_ALIGN(fstart, align);
     913           0 :                         fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
     914           0 :                         if (end != 0)
     915           0 :                                 fend = MIN(end, fend);
     916           0 :                         if (boundary != 0) {
     917             :                                 fend =
     918           0 :                                     MIN(fend, PMR_ALIGN(fstart + 1, boundary));
     919           0 :                         }
     920           0 :                         if (fstart >= fend)
     921             :                                 continue;
     922           0 :                         if (fend - fstart > count - fcount)
     923           0 :                                 fend = fstart + (count - fcount);
     924             : 
     925           0 :                         fcount += fend - fstart;
     926           0 :                         fnsegs++;
     927           0 :                         found = uvm_pmr_extract_range(pmr, found,
     928             :                             fstart, fend, result);
     929             : 
     930           0 :                         if (fcount == count)
     931             :                                 goto out;
     932             : 
     933             :                         /*
     934             :                          * If there's still space left in found, try to
      935             :          * fully drain it prior to continuing.
     936             :                          */
     937           0 :                         if (found != NULL) {
     938             :                                 fstart = fend;
     939           0 :                                 goto drain_found;
     940             :                         }
     941             :                 }
     942             : 
     943             :                 /* Try a smaller search now. */
     944           0 :                 if (++try < nitems(search))
     945           0 :                         goto rescan;
     946             : 
     947             :                 /*
     948             :                  * Exhaust all memory types prior to going to the next memory
     949             :                  * segment.
     950             :                  * This means that zero-vs-dirty are eaten prior to moving
     951             :                  * to a pmemrange with a higher use-count.
     952             :                  *
     953             :                  * Code is basically a difficult way of writing:
     954             :                  * memtype = memtype_init;
     955             :                  * do {
     956             :                  *      ...;
     957             :                  *      memtype += 1;
     958             :                  *      memtype %= MEMTYPE_MAX;
     959             :                  * } while (memtype != memtype_init);
     960             :                  */
     961           0 :                 memtype += 1;
     962           0 :                 if (memtype == UVM_PMR_MEMTYPE_MAX)
     963             :                         memtype = 0;
     964           0 :                 if (memtype != memtype_init)
     965           0 :                         goto rescan_memtype;
     966             : 
     967             :                 /*
     968             :                  * If not desperate, enter desperate case prior to eating all
     969             :                  * the good stuff in the next range.
     970             :                  */
     971           0 :                 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
     972           0 :                     TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
     973             :                         break;
     974             :         }
     975             : 
     976             :         /*
     977             :          * Not enough memory of the requested type available. Fall back to
     978             :          * less good memory that we'll clean up better later.
     979             :          *
      980             :          * This algorithm is not very smart, though: it just starts scanning
     981             :          * a different typed range, but the nicer ranges of the previous
     982             :          * iteration may fall out. Hence there is a small chance of a false
     983             :          * negative.
     984             :          *
      985             :          * When desperate: scan all sizes starting at the smallest
      986             :          * (start_try = nitems(search) - 1) and do not consider
                      :          * UVM_PLA_TRYCONTIG (which may
     987             :          * allow us to hit the fast path now).
     988             :          *
     989             :          * Also, because we will revisit entries we scanned before, we need
     990             :          * to reset the page queue, or we may end up releasing entries in
     991             :          * such a way as to invalidate f_next.
     992             :          */
     993           0 :         if (!desperate) {
     994             :                 desperate = 1;
     995             :                 start_try = nitems(search) - 1;
     996           0 :                 flags &= ~UVM_PLA_TRYCONTIG;
     997             : 
     998           0 :                 while (!TAILQ_EMPTY(result))
     999           0 :                         uvm_pmr_remove_1strange(result, 0, NULL, 0);
    1000             :                 fnsegs = 0;
    1001             :                 fcount = 0;
    1002           0 :                 goto retry_desperate;
    1003             :         }
    1004             : 
    1005             : fail:
    1006             :         /* Allocation failed. */
    1007             :         /* XXX: claim from memory reserve here */
    1008             : 
    1009           0 :         while (!TAILQ_EMPTY(result))
    1010           0 :                 uvm_pmr_remove_1strange(result, 0, NULL, 0);
    1011             : 
    1012           0 :         if (flags & UVM_PLA_WAITOK) {
    1013           0 :                 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
    1014           0 :                     flags & UVM_PLA_FAILOK) == 0)
    1015           0 :                         goto retry;
    1016           0 :                 KASSERT(flags & UVM_PLA_FAILOK);
    1017             :         } else
    1018           0 :                 wakeup(&uvm.pagedaemon);
    1019           0 :         uvm_unlock_fpageq();
    1020             : 
    1021           0 :         return ENOMEM;
    1022             : 
    1023             : out:
     1024             :         /* Allocation successful. */
    1025           0 :         uvmexp.free -= fcount;
    1026             : 
    1027           0 :         uvm_unlock_fpageq();
    1028             : 
    1029             :         /* Update statistics and zero pages if UVM_PLA_ZERO. */
    1030             : #ifdef DIAGNOSTIC
    1031             :         fnsegs = 0;
    1032             :         fcount = 0;
    1033             :         diag_prev = NULL;
    1034             : #endif /* DIAGNOSTIC */
    1035           0 :         TAILQ_FOREACH(found, result, pageq) {
    1036           0 :                 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
    1037             : 
    1038           0 :                 if (found->pg_flags & PG_ZERO) {
    1039           0 :                         uvm_lock_fpageq();
    1040           0 :                         uvmexp.zeropages--;
    1041           0 :                         if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
    1042           0 :                                 wakeup(&uvmexp.zeropages);
    1043           0 :                         uvm_unlock_fpageq();
    1044           0 :                 }
    1045           0 :                 if (flags & UVM_PLA_ZERO) {
    1046           0 :                         if (found->pg_flags & PG_ZERO)
    1047           0 :                                 uvmexp.pga_zerohit++;
    1048             :                         else {
    1049           0 :                                 uvmexp.pga_zeromiss++;
    1050           0 :                                 uvm_pagezero(found);
    1051             :                         }
    1052             :                 }
    1053           0 :                 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
    1054             : 
    1055           0 :                 found->uobject = NULL;
    1056           0 :                 found->uanon = NULL;
    1057           0 :                 found->pg_version++;
    1058             : 
    1059             :                 /*
     1060             :                  * Validate that the page matches the range criteria.
    1061             :                  */
    1062             :                 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
    1063             :                 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
    1064             : 
    1065             : #ifdef DIAGNOSTIC
    1066             :                 /*
    1067             :                  * Update fcount (# found pages) and
    1068             :                  * fnsegs (# found segments) counters.
    1069             :                  */
    1070           0 :                 if (diag_prev == NULL ||
    1071             :                     /* new segment if it contains a hole */
    1072           0 :                     atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
    1073           0 :                     atop(VM_PAGE_TO_PHYS(found)) ||
    1074             :                     /* new segment if it crosses boundary */
    1075           0 :                     (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
    1076           0 :                     (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
    1077           0 :                         fnsegs++;
    1078           0 :                 fcount++;
    1079             : 
    1080           0 :                 diag_prev = found;
    1081             : #endif /* DIAGNOSTIC */
    1082             :         }
    1083             : 
    1084             : #ifdef DIAGNOSTIC
    1085             :         /*
    1086             :          * Panic on algorithm failure.
    1087             :          */
    1088           0 :         if (fcount != count || fnsegs > maxseg) {
    1089           0 :                 panic("pmemrange allocation error: "
    1090             :                     "allocated %ld pages in %d segments, "
    1091             :                     "but request was %ld pages in %d segments",
    1092             :                     fcount, fnsegs, count, maxseg);
    1093             :         }
    1094             : #endif /* DIAGNOSTIC */
    1095             : 
    1096           0 :         return 0;
    1097           0 : }
    1098             : 
    1099             : /*
     1100             :  * Free a number of contiguous pages (invoked by uvm_page_init).
    1101             :  */
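                      : /*
                      :  * Usage sketch (hypothetical values): during boot the caller hands a
                      :  * physically contiguous run of vm_page structures to the allocator:
                      :  *
                      :  *	uvm_pmr_freepages(&pgs[first], npages);
                      :  *
                      :  * where pgs, first and npages are placeholders for the caller's page
                      :  * array, start index and run length.
                      :  */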
    1102             : void
    1103           0 : uvm_pmr_freepages(struct vm_page *pg, psize_t count)
    1104             : {
    1105             :         struct uvm_pmemrange *pmr;
    1106             :         psize_t i, pmr_count;
    1107             :         struct vm_page *firstpg = pg;
    1108             : 
    1109           0 :         for (i = 0; i < count; i++) {
    1110           0 :                 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
    1111             :                     atop(VM_PAGE_TO_PHYS(pg)) + i);
    1112             : 
    1113           0 :                 if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
    1114           0 :                     VALID_FLAGS(pg[i].pg_flags))) {
    1115           0 :                         printf("Flags: 0x%x, will panic now.\n",
    1116           0 :                             pg[i].pg_flags);
    1117           0 :                 }
    1118           0 :                 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
    1119             :                     VALID_FLAGS(pg[i].pg_flags));
    1120           0 :                 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
    1121           0 :                 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
    1122             :         }
    1123             : 
    1124           0 :         uvm_lock_fpageq();
    1125             : 
    1126           0 :         for (i = count; i > 0; i -= pmr_count) {
    1127           0 :                 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
    1128           0 :                 KASSERT(pmr != NULL);
    1129             : 
    1130           0 :                 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
    1131           0 :                 pg->fpgsz = pmr_count;
    1132           0 :                 uvm_pmr_insert(pmr, pg, 0);
    1133             : 
    1134           0 :                 uvmexp.free += pmr_count;
    1135           0 :                 pg += pmr_count;
    1136             :         }
    1137           0 :         wakeup(&uvmexp.free);
    1138           0 :         if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
    1139           0 :                 wakeup(&uvmexp.zeropages);
    1140             : 
    1141           0 :         uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
    1142             : 
    1143           0 :         uvm_unlock_fpageq();
    1144           0 : }
    1145             : 
    1146             : /*
    1147             :  * Free all pages in the queue.
    1148             :  */
    1149             : void
    1150           0 : uvm_pmr_freepageq(struct pglist *pgl)
    1151             : {
    1152             :         struct vm_page *pg;
    1153             :         paddr_t pstart;
    1154             :         psize_t plen;
    1155             : 
    1156           0 :         TAILQ_FOREACH(pg, pgl, pageq) {
    1157           0 :                 if (!((pg->pg_flags & PQ_FREE) == 0 &&
    1158           0 :                     VALID_FLAGS(pg->pg_flags))) {
    1159           0 :                         printf("Flags: 0x%x, will panic now.\n",
    1160           0 :                             pg->pg_flags);
    1161           0 :                 }
    1162           0 :                 KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
    1163             :                     VALID_FLAGS(pg->pg_flags));
    1164           0 :                 atomic_setbits_int(&pg->pg_flags, PQ_FREE);
    1165           0 :                 atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
    1166             :         }
    1167             : 
    1168           0 :         uvm_lock_fpageq();
    1169           0 :         while (!TAILQ_EMPTY(pgl)) {
    1170           0 :                 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
    1171           0 :                 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
    1172           0 :                 uvmexp.free += plen;
    1173             : 
    1174           0 :                 uvm_wakeup_pla(pstart, ptoa(plen));
    1175             :         }
    1176           0 :         wakeup(&uvmexp.free);
    1177           0 :         if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
    1178           0 :                 wakeup(&uvmexp.zeropages);
    1179           0 :         uvm_unlock_fpageq();
    1180             : 
    1181             :         return;
    1182           0 : }
    1183             : 
    1184             : /*
    1185             :  * Store a pmemrange in the list.
    1186             :  *
     1187             :  * The list is sorted by use.
                      :  * Returns NULL on successful insert; if an equal entry is already
                      :  * present, that entry is returned and the list is left unchanged.
    1188             :  */
    1189             : struct uvm_pmemrange *
    1190           0 : uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
    1191             :     struct uvm_pmemrange *pmr)
    1192             : {
    1193             :         struct uvm_pmemrange *iter;
    1194             :         int cmp = 1;
    1195             : 
    1196           0 :         TAILQ_FOREACH(iter, useq, pmr_use) {
    1197           0 :                 cmp = uvm_pmemrange_use_cmp(pmr, iter);
    1198           0 :                 if (cmp == 0)
    1199           0 :                         return iter;
    1200           0 :                 if (cmp == -1)
    1201             :                         break;
    1202             :         }
    1203             : 
    1204           0 :         if (iter == NULL)
    1205           0 :                 TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
    1206             :         else
    1207           0 :                 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
    1208           0 :         return NULL;
    1209           0 : }
    1210             : 
    1211             : #ifdef DEBUG
    1212             : /*
    1213             :  * Validation of the whole pmemrange.
    1214             :  * Called with fpageq locked.
    1215             :  */
    1216             : void
    1217             : uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
    1218             : {
    1219             :         struct vm_page *prev, *next, *i, *xref;
    1220             :         int lcv, mti;
    1221             : 
    1222             :         /* Empty range */
    1223             :         if (pmr->nsegs == 0)
    1224             :                 return;
    1225             : 
    1226             :         /* Validate address tree. */
    1227             :         RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
    1228             :                 /* Validate the range. */
    1229             :                 KASSERT(i->fpgsz > 0);
    1230             :                 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
    1231             :                 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
    1232             :                     <= pmr->high);
    1233             : 
    1234             :                 /* Validate each page in this range. */
    1235             :                 for (lcv = 0; lcv < i->fpgsz; lcv++) {
    1236             :                         /*
    1237             :                          * Only the first page has a size specification.
    1238             :                          * Rest is size 0.
    1239             :                          */
    1240             :                         KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
    1241             :                         /*
    1242             :                          * Flag check.
    1243             :                          */
    1244             :                         KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
    1245             :                             (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
    1246             :                         /*
    1247             :                          * Free pages are:
    1248             :                          * - not wired
    1249             :                          * - have no vm_anon
    1250             :                          * - have no uvm_object
    1251             :                          */
    1252             :                         KASSERT(i[lcv].wire_count == 0);
    1253             :                         KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
    1254             :                             i[lcv].uanon == NULL);
    1255             :                         KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
    1256             :                             i[lcv].uobject == NULL);
    1257             :                         /*
    1258             :                          * Pages in a single range always have the same
    1259             :                          * memtype.
    1260             :                          */
    1261             :                         KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
    1262             :                             uvm_pmr_pg_to_memtype(&i[lcv]));
    1263             :                 }
    1264             : 
    1265             :                 /* Check that it shouldn't be joined with its predecessor. */
    1266             :                 prev = RBT_PREV(uvm_pmr_addr, i);
    1267             :                 if (prev != NULL) {
    1268             :                         KASSERT(uvm_pmr_pg_to_memtype(i) !=
    1269             :                             uvm_pmr_pg_to_memtype(prev) ||
    1270             :                             atop(VM_PAGE_TO_PHYS(i)) >
    1271             :                             atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
    1272             :                             prev + prev->fpgsz != i);
    1273             :                 }
    1274             : 
    1275             :                 /* Assert i is in the size tree as well. */
    1276             :                 if (i->fpgsz == 1) {
    1277             :                         TAILQ_FOREACH(xref,
    1278             :                             &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
    1279             :                                 if (xref == i)
    1280             :                                         break;
    1281             :                         }
    1282             :                         KASSERT(xref == i);
    1283             :                 } else {
    1284             :                         KASSERT(RBT_FIND(uvm_pmr_size,
    1285             :                             &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
    1286             :                             i + 1);
    1287             :                 }
    1288             :         }
    1289             : 
    1290             :         /* Validate size tree. */
    1291             :         for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
    1292             :                 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
    1293             :                         next = uvm_pmr_nextsz(pmr, i, mti);
    1294             :                         if (next != NULL) {
    1295             :                                 KASSERT(i->fpgsz <=
    1296             :                                     next->fpgsz);
    1297             :                         }
    1298             : 
    1299             :                         /* Assert i is in the addr tree as well. */
    1300             :                         KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
    1301             : 
    1302             :                         /* Assert i is of the correct memory type. */
    1303             :                         KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
    1304             :                 }
    1305             :         }
    1306             : 
    1307             :         /* Validate nsegs statistic. */
    1308             :         lcv = 0;
    1309             :         RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
    1310             :                 lcv++;
    1311             :         KASSERT(pmr->nsegs == lcv);
    1312             : }
    1313             : #endif /* DEBUG */
    1314             : 
    1315             : /*
    1316             :  * Split pmr at split point pageno.
    1317             :  * Called with fpageq unlocked.
    1318             :  *
    1319             :  * Split is only applied if a pmemrange spans pageno.
    1320             :  */
    1321             : void
    1322           0 : uvm_pmr_split(paddr_t pageno)
    1323             : {
    1324             :         struct uvm_pmemrange *pmr, *drain;
    1325             :         struct vm_page *rebuild, *prev, *next;
    1326             :         psize_t prev_sz;
    1327             : 
    1328           0 :         uvm_lock_fpageq();
    1329           0 :         pmr = uvm_pmemrange_find(pageno);
    1330           0 :         if (pmr == NULL || !(pmr->low < pageno)) {
    1331             :                 /* No split required. */
    1332           0 :                 uvm_unlock_fpageq();
    1333           0 :                 return;
    1334             :         }
    1335             : 
    1336           0 :         KASSERT(pmr->low < pageno);
    1337           0 :         KASSERT(pmr->high > pageno);
    1338             : 
    1339             :         /*
    1340             :          * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
    1341             :          * uvm_kmemalloc which calls into pmemrange, making the locking
    1342             :          * a bit hard, so we just race!
    1343             :          */
    1344           0 :         uvm_unlock_fpageq();
    1345           0 :         drain = uvm_pmr_allocpmr();
    1346           0 :         uvm_lock_fpageq();
    1347           0 :         pmr = uvm_pmemrange_find(pageno);
    1348           0 :         if (pmr == NULL || !(pmr->low < pageno)) {
    1349             :                 /*
    1350             :                  * We lost the race since someone else ran this or a related
     1351             :                  * function; however, this should trigger very rarely, so
    1352             :                  * we just leak the pmr.
    1353             :                  */
    1354           0 :                 printf("uvm_pmr_split: lost one pmr\n");
    1355           0 :                 uvm_unlock_fpageq();
    1356           0 :                 return;
    1357             :         }
    1358             : 
    1359           0 :         drain->low = pageno;
    1360           0 :         drain->high = pmr->high;
    1361           0 :         drain->use = pmr->use;
    1362             : 
    1363             :         uvm_pmr_assertvalid(pmr);
    1364             :         uvm_pmr_assertvalid(drain);
    1365           0 :         KASSERT(drain->nsegs == 0);
    1366             : 
    1367           0 :         RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
    1368           0 :                 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
    1369             :                         break;
    1370             :         }
    1371           0 :         if (rebuild == NULL)
    1372           0 :                 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
    1373             :         else
    1374           0 :                 prev = RBT_PREV(uvm_pmr_addr, rebuild);
    1375           0 :         KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
    1376             : 
    1377             :         /*
    1378             :          * Handle free chunk that spans the split point.
    1379             :          */
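                      :         /*
                      :          * Worked example: if prev starts at page 100 with fpgsz 8 and
                      :          * pageno is 103, then before = 3 and after = 5; pages
                      :          * [100, 103) stay in pmr and pages [103, 108) move to drain.
                      :          */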
    1380           0 :         if (prev != NULL &&
    1381           0 :             atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
    1382             :                 psize_t before, after;
    1383             : 
    1384           0 :                 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
    1385             : 
    1386           0 :                 uvm_pmr_remove(pmr, prev);
    1387           0 :                 prev_sz = prev->fpgsz;
    1388           0 :                 before = pageno - atop(VM_PAGE_TO_PHYS(prev));
    1389           0 :                 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
    1390             : 
    1391           0 :                 KASSERT(before > 0);
    1392           0 :                 KASSERT(after > 0);
    1393             : 
    1394           0 :                 prev->fpgsz = before;
    1395           0 :                 uvm_pmr_insert(pmr, prev, 1);
    1396           0 :                 (prev + before)->fpgsz = after;
    1397           0 :                 uvm_pmr_insert(drain, prev + before, 1);
    1398           0 :         }
    1399             : 
    1400             :         /* Move free chunks that no longer fall in the range. */
    1401           0 :         for (; rebuild != NULL; rebuild = next) {
    1402           0 :                 next = RBT_NEXT(uvm_pmr_addr, rebuild);
    1403             : 
    1404           0 :                 uvm_pmr_remove(pmr, rebuild);
    1405           0 :                 uvm_pmr_insert(drain, rebuild, 1);
    1406             :         }
    1407             : 
    1408           0 :         pmr->high = pageno;
    1409             :         uvm_pmr_assertvalid(pmr);
    1410             :         uvm_pmr_assertvalid(drain);
    1411             : 
    1412           0 :         RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
    1413           0 :         uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
    1414           0 :         uvm_unlock_fpageq();
    1415           0 : }
    1416             : 
    1417             : /*
    1418             :  * Increase the usage counter for the given range of memory.
    1419             :  *
     1420             :  * The more usage counters a given range of memory has, the harder the
     1421             :  * allocator will try to avoid allocating from it.
    1422             :  *
    1423             :  * Addresses here are in paddr_t, not page-numbers.
     1424             :  * The lowest and highest allowed addresses are specified.
    1425             :  */
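                      : /*
                      :  * Worked example of the translation below (assuming 4K pages):
                      :  * low = 0x1800, high = 0x3fff gives a low pageno of
                      :  * atop(round_page(0x1800)) = 2 and an exclusive high pageno of
                      :  * atop(trunc_page(0x4000)) = 4, so full pages 2 and 3 are affected.
                      :  */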
    1426             : void
    1427           0 : uvm_pmr_use_inc(paddr_t low, paddr_t high)
    1428             : {
    1429             :         struct uvm_pmemrange *pmr;
    1430             :         paddr_t sz;
    1431             : 
    1432             :         /* pmr uses page numbers, translate low and high. */
    1433           0 :         high++;
    1434           0 :         high = atop(trunc_page(high));
    1435           0 :         low = atop(round_page(low));
    1436           0 :         uvm_pmr_split(low);
    1437           0 :         uvm_pmr_split(high);
    1438             : 
    1439             :         sz = 0;
    1440           0 :         uvm_lock_fpageq();
    1441             :         /* Increase use count on segments in range. */
    1442           0 :         RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
    1443           0 :                 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
    1444           0 :                         TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
    1445           0 :                         pmr->use++;
    1446           0 :                         sz += pmr->high - pmr->low;
    1447           0 :                         uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
    1448           0 :                 }
    1449             :                 uvm_pmr_assertvalid(pmr);
    1450             :         }
    1451           0 :         uvm_unlock_fpageq();
    1452             : 
    1453           0 :         KASSERT(sz >= high - low);
    1454           0 : }
    1455             : 
    1456             : /*
    1457             :  * Allocate a pmemrange.
    1458             :  *
     1459             :  * If called from uvm_page_init, uvm_pageboot_alloc is used.
    1460             :  * If called after uvm_init, malloc is used.
    1461             :  * (And if called in between, you're dead.)
    1462             :  */
    1463             : struct uvm_pmemrange *
    1464           0 : uvm_pmr_allocpmr(void)
    1465             : {
    1466             :         struct uvm_pmemrange *nw;
    1467             :         int i;
    1468             : 
    1469             :         /* We're only ever hitting the !uvm.page_init_done case for now. */
    1470           0 :         if (!uvm.page_init_done) {
    1471           0 :                 nw = (struct uvm_pmemrange *)
    1472           0 :                     uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
    1473           0 :         } else {
    1474           0 :                 nw = malloc(sizeof(struct uvm_pmemrange),
    1475             :                     M_VMMAP, M_NOWAIT);
    1476             :         }
    1477           0 :         KASSERT(nw != NULL);
    1478           0 :         memset(nw, 0, sizeof(struct uvm_pmemrange));
    1479           0 :         RBT_INIT(uvm_pmr_addr, &nw->addr);
    1480           0 :         for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
    1481           0 :                 RBT_INIT(uvm_pmr_size, &nw->size[i]);
    1482           0 :                 TAILQ_INIT(&nw->single[i]);
    1483             :         }
    1484           0 :         return nw;
    1485             : }
    1486             : 
    1487             : /*
    1488             :  * Initialization of pmr.
    1489             :  * Called by uvm_page_init.
    1490             :  *
    1491             :  * Sets up pmemranges.
    1492             :  */
    1493             : void
    1494           0 : uvm_pmr_init(void)
    1495             : {
    1496             :         struct uvm_pmemrange *new_pmr;
    1497             :         int i;
    1498             : 
    1499           0 :         TAILQ_INIT(&uvm.pmr_control.use);
    1500           0 :         RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
    1501           0 :         TAILQ_INIT(&uvm.pmr_control.allocs);
    1502             : 
    1503             :         /* By default, one range for the entire address space. */
    1504           0 :         new_pmr = uvm_pmr_allocpmr();
    1505           0 :         new_pmr->low = 0;
    1506           0 :         new_pmr->high = atop((paddr_t)-1) + 1; 
    1507             : 
    1508           0 :         RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
    1509           0 :         uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
    1510             : 
    1511           0 :         for (i = 0; uvm_md_constraints[i] != NULL; i++) {
    1512           0 :                 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
    1513           0 :                     uvm_md_constraints[i]->ucr_high);
    1514             :         }
    1515           0 : }
    1516             : 
    1517             : /*
    1518             :  * Find the pmemrange that contains the given page number.
    1519             :  *
    1520             :  * (Manually traverses the binary tree, because that is cheaper on stack
    1521             :  * usage.)
    1522             :  */
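                      : /*
                      :  * The loop below is the iterative form of (sketch):
                      :  *
                      :  *	find(pmr, pageno):
                      :  *		if (pmr == NULL ||
                      :  *		    (pmr->low <= pageno && pageno < pmr->high))
                      :  *			return pmr;
                      :  *		return find(pageno < pmr->low ?
                      :  *		    left(pmr) : right(pmr), pageno);
                      :  */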
    1523             : struct uvm_pmemrange *
    1524           0 : uvm_pmemrange_find(paddr_t pageno)
    1525             : {
    1526             :         struct uvm_pmemrange *pmr;
    1527             : 
    1528           0 :         pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
    1529           0 :         while (pmr != NULL) {
    1530           0 :                 if (pmr->low > pageno)
    1531           0 :                         pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
    1532           0 :                 else if (pmr->high <= pageno)
    1533           0 :                         pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
    1534             :                 else
    1535             :                         break;
    1536             :         }
    1537             : 
    1538           0 :         return pmr;
    1539             : }
    1540             : 
    1541             : #if defined(DDB) || defined(DEBUG)
    1542             : /*
    1543             :  * Return true if the given page is in any of the free lists.
    1544             :  * Used by uvm_page_printit.
    1545             :  * This function is safe, even if the page is not on the freeq.
    1546             :  * Note: does not apply locking, only called from ddb.
    1547             :  */
    1548             : int
    1549           0 : uvm_pmr_isfree(struct vm_page *pg)
    1550             : {
    1551             :         struct vm_page *r;
    1552             :         struct uvm_pmemrange *pmr;
    1553             : 
    1554           0 :         pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
    1555           0 :         if (pmr == NULL)
    1556           0 :                 return 0;
    1557           0 :         r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
    1558           0 :         if (r == NULL)
    1559           0 :                 r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
    1560           0 :         else if (r != pg)
    1561           0 :                 r = RBT_PREV(uvm_pmr_addr, r);
    1562           0 :         if (r == NULL)
    1563           0 :                 return 0; /* Empty tree. */
    1564             : 
    1565             :         KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
    1566           0 :         return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
    1567           0 :             atop(VM_PAGE_TO_PHYS(pg));
    1568           0 : }
     1569             : #endif /* DDB || DEBUG */
    1570             : 
    1571             : /*
    1572             :  * Given a root of a tree, find a range which intersects start, end and
    1573             :  * is of the same memtype.
    1574             :  *
    1575             :  * Page must be in the address tree.
    1576             :  */
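                      : /*
                      :  * Search order, in sketch form: (1) descend from init_root until the
                      :  * range intersects [start, end); (2) walk toward higher addresses
                      :  * while still intersecting; (3) walk toward lower addresses; (4) as
                      :  * a last resort, walk the address tree between the cached low and
                      :  * high bounds.  The first range of the requested memtype wins.
                      :  */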
    1577             : struct vm_page*
    1578           0 : uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
    1579             :     paddr_t start, paddr_t end, int memtype)
    1580             : {
    1581             :         int     direction;
    1582             :         struct  vm_page *root;
    1583             :         struct  vm_page *high, *high_next;
    1584             :         struct  vm_page *low, *low_next;
    1585             : 
    1586             :         KDASSERT(pmr != NULL && init_root != NULL);
    1587             :         root = init_root;
    1588             : 
    1589             :         /* Which direction to use for searching. */
    1590           0 :         if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
    1591           0 :                 direction =  1;
    1592           0 :         else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
    1593             :                 direction = -1;
    1594             :         else /* nothing to do */
    1595           0 :                 return root;
    1596             : 
    1597             :         /* First, update root to fall within the chosen range. */
    1598           0 :         while (root && !PMR_INTERSECTS_WITH(
    1599             :             atop(VM_PAGE_TO_PHYS(root)),
    1600             :             atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
    1601             :             start, end)) {
    1602           0 :                 if (direction == 1)
    1603           0 :                         root = RBT_RIGHT(uvm_objtree, root);
    1604             :                 else
    1605           0 :                         root = RBT_LEFT(uvm_objtree, root);
    1606             :         }
    1607           0 :         if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
    1608           0 :                 return root;
    1609             : 
    1610             :         /*
    1611             :          * Root is valid, but of the wrong memtype.
    1612             :          *
    1613             :          * Try to find a range that has the given memtype in the subtree
    1614             :          * (memtype mismatches are costly, either because the conversion
    1615             :          * is expensive, or a later allocation will need to do the opposite
    1616             :          * conversion, which will be expensive).
    1617             :          *
    1618             :          *
    1619             :          * First, simply increase address until we hit something we can use.
    1620             :          * Cache the upper page, so we can page-walk later.
    1621             :          */
    1622             :         high = root;
    1623           0 :         high_next = RBT_RIGHT(uvm_objtree, high);
    1624           0 :         while (high_next != NULL && PMR_INTERSECTS_WITH(
    1625             :             atop(VM_PAGE_TO_PHYS(high_next)),
    1626             :             atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
    1627             :             start, end)) {
    1628             :                 high = high_next;
    1629           0 :                 if (uvm_pmr_pg_to_memtype(high) == memtype)
    1630           0 :                         return high;
    1631           0 :                 high_next = RBT_RIGHT(uvm_objtree, high);
    1632             :         }
    1633             : 
    1634             :         /*
    1635             :          * Second, decrease the address until we hit something we can use.
    1636             :          * Cache the lower page, so we can page-walk later.
    1637             :          */
    1638             :         low = root;
    1639           0 :         low_next = RBT_LEFT(uvm_objtree, low);
    1640           0 :         while (low_next != NULL && PMR_INTERSECTS_WITH(
    1641             :             atop(VM_PAGE_TO_PHYS(low_next)),
    1642             :             atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
    1643             :             start, end)) {
    1644             :                 low = low_next;
    1645           0 :                 if (uvm_pmr_pg_to_memtype(low) == memtype)
    1646           0 :                         return low;
    1647           0 :                 low_next = RBT_LEFT(uvm_objtree, low);
    1648             :         }
    1649             : 
    1650           0 :         if (low == high)
    1651           0 :                 return NULL;
    1652             : 
    1653             :         /* No hits. Walk the address tree until we find something usable. */
    1654           0 :         for (low = RBT_NEXT(uvm_pmr_addr, low);
    1655           0 :             low != high;
    1656           0 :             low = RBT_NEXT(uvm_pmr_addr, low)) {
    1657             :                 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
    1658             :                     atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
    1659             :                     start, end));
    1660           0 :                 if (uvm_pmr_pg_to_memtype(low) == memtype)
    1661           0 :                         return low;
    1662             :         }
    1663             : 
    1664             :         /* Nothing found. */
    1665           0 :         return NULL;
    1666           0 : }
    1667             : 
    1668             : /*
    1669             :  * Allocate any page, the fastest way. Page number constraints only.
    1670             :  */
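                      : /*
                      :  * Returns the number of pages moved onto result, which may be less
                      :  * than count.  Called with the fpageq lock held; for example, the
                      :  * page-zeroing thread below pulls batches of 16 dirty pages with
                      :  * this function and zeroes whatever it receives.
                      :  */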
    1671             : psize_t
    1672           0 : uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
    1673             :     paddr_t start, paddr_t end, int memtype_only)
    1674             : {
    1675             :         struct  uvm_pmemrange *pmr;
    1676             :         struct  vm_page *found, *splitpg;
    1677             :         psize_t fcount;
    1678             :         int     memtype;
    1679             : 
    1680             :         fcount = 0;
    1681           0 :         TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
    1682             :                 /* We're done. */
    1683           0 :                 if (fcount == count)
    1684             :                         break;
    1685             : 
    1686             :                 /* Outside requested range. */
    1687           0 :                 if (!(start == 0 && end == 0) &&
    1688           0 :                     !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
    1689             :                         continue;
    1690             : 
    1691             :                 /* Range is empty. */
    1692           0 :                 if (pmr->nsegs == 0)
    1693             :                         continue;
    1694             : 
    1695             :                 /* Loop over all memtypes, starting at memtype_init. */
    1696             :                 memtype = memtype_init;
    1697           0 :                 while (fcount != count) {
    1698           0 :                         found = TAILQ_FIRST(&pmr->single[memtype]);
    1699             :                         /*
    1700             :                          * If found is outside the range, walk the list
    1701             :                          * until we find something that intersects with
    1702             :                          * boundaries.
    1703             :                          */
    1704           0 :                         while (found && !PMR_INTERSECTS_WITH(
    1705             :                             atop(VM_PAGE_TO_PHYS(found)),
    1706             :                             atop(VM_PAGE_TO_PHYS(found)) + 1,
    1707             :                             start, end))
    1708           0 :                                 found = TAILQ_NEXT(found, pageq);
    1709             : 
    1710           0 :                         if (found == NULL) {
    1711             :                                 /*
    1712             :                                  * Check if the size tree contains a range
    1713             :                                  * that intersects with the boundaries. As the
    1714             :                                  * allocation is for any page, try the smallest
    1715             :                                  * range so that large ranges are preserved for
    1716             :                                  * more constrained cases. Only one entry is
    1717             :                                  * checked here, to avoid a brute-force search.
    1718             :                                  *
    1719             :                                  * Note that a size tree gives pg[1] instead of
    1720             :                                  * pg[0].
    1721             :                                  */
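                      :                                 /*
                      :                                  * That is, RBT_MIN() yields &pg[1]
                      :                                  * of the smallest range, so the
                      :                                  * decrement below recovers &pg[0],
                      :                                  * the first page of that range.
                      :                                  */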
    1722           0 :                                 found = RBT_MIN(uvm_pmr_size,
    1723             :                                     &pmr->size[memtype]);
    1724           0 :                                 if (found != NULL) {
    1725           0 :                                         found--;
    1726           0 :                                         if (!PMR_INTERSECTS_WITH(
    1727             :                                             atop(VM_PAGE_TO_PHYS(found)),
    1728             :                                             atop(VM_PAGE_TO_PHYS(found)) +
    1729             :                                             found->fpgsz, start, end))
    1730           0 :                                                 found = NULL;
    1731             :                                 }
    1732             :                         }
    1733           0 :                         if (found == NULL) {
    1734             :                                 /*
    1735             :                                  * Try address-guided search to meet the page
    1736             :                                  * number constraints.
    1737             :                                  */
    1738           0 :                                 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
    1739           0 :                                 if (found != NULL) {
    1740           0 :                                         found = uvm_pmr_rootupdate(pmr, found,
    1741             :                                             start, end, memtype);
    1742           0 :                                 }
    1743             :                         }
    1744           0 :                         if (found != NULL) {
    1745             :                                 uvm_pmr_assertvalid(pmr);
    1746           0 :                                 uvm_pmr_remove_size(pmr, found);
    1747             : 
    1748             :                                 /*
    1749             :                                  * If the page intersects the end, then it'll
    1750             :                                  * need splitting.
    1751             :                                  *
    1752             :                                  * Note that we don't need to split if the page
    1753             :                                  * intersects start: the drain function will
    1754             :                                  * simply stop on hitting start.
    1755             :                                  */
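                      :                                 /*
                      :                                  * Worked example: found at page 10
                      :                                  * with fpgsz 8 and end = 14 gives
                      :                                  * splitsz = 10 + 8 - 14 = 4; found
                      :                                  * shrinks to [10, 14) and splitpg
                      :                                  * covers [14, 18).
                      :                                  */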
    1756           0 :                                 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
    1757           0 :                                     found->fpgsz > end) {
    1758             :                                         psize_t splitsz =
    1759             :                                             atop(VM_PAGE_TO_PHYS(found)) +
    1760           0 :                                             found->fpgsz - end;
    1761             : 
    1762           0 :                                         uvm_pmr_remove_addr(pmr, found);
    1763             :                                         uvm_pmr_assertvalid(pmr);
    1764           0 :                                         found->fpgsz -= splitsz;
    1765           0 :                                         splitpg = found + found->fpgsz;
    1766           0 :                                         splitpg->fpgsz = splitsz;
    1767           0 :                                         uvm_pmr_insert(pmr, splitpg, 1);
    1768             : 
    1769             :                                         /*
    1770             :                                          * At this point, splitpg and found
    1771             :                                          * actually should be joined.
    1772             :                                          * But we explicitly disable that,
    1773             :                                          * because we will start subtracting
    1774             :                                          * from found.
    1775             :                                          */
    1776           0 :                                         KASSERT(start == 0 ||
    1777             :                                             atop(VM_PAGE_TO_PHYS(found)) +
    1778             :                                             found->fpgsz > start);
    1779           0 :                                         uvm_pmr_insert_addr(pmr, found, 1);
    1780           0 :                                 }
    1781             : 
    1782             :                                 /*
    1783             :                                  * Fetch pages from the end.
    1784             :                                  * If the range is larger than the requested
    1785             :                                  * number of pages, this saves us an addr-tree
    1786             :                                  * update.
    1787             :                                  *
    1788             :                                  * Since we take from the end and insert at
     1789             :                                  * the head, the ranges are kept intact.
    1790             :                                  */
    1791           0 :                                 while (found->fpgsz > 0 && fcount < count &&
    1792           0 :                                     (start == 0 ||
    1793           0 :                                     atop(VM_PAGE_TO_PHYS(found)) +
    1794           0 :                                     found->fpgsz > start)) {
    1795           0 :                                         found->fpgsz--;
    1796           0 :                                         fcount++;
    1797           0 :                                         TAILQ_INSERT_HEAD(result,
    1798             :                                             &found[found->fpgsz], pageq);
    1799             :                                 }
    1800           0 :                                 if (found->fpgsz > 0) {
    1801           0 :                                         uvm_pmr_insert_size(pmr, found);
    1802             :                                         KDASSERT(fcount == count);
    1803             :                                         uvm_pmr_assertvalid(pmr);
    1804           0 :                                         return fcount;
    1805             :                                 }
    1806             : 
    1807             :                                 /*
    1808             :                                  * Delayed addr-tree removal.
    1809             :                                  */
    1810           0 :                                 uvm_pmr_remove_addr(pmr, found);
    1811             :                                 uvm_pmr_assertvalid(pmr);
    1812           0 :                         } else {
    1813           0 :                                 if (memtype_only)
    1814             :                                         break;
    1815             :                                 /*
    1816             :                                  * Skip to the next memtype.
    1817             :                                  */
    1818           0 :                                 memtype += 1;
    1819           0 :                                 if (memtype == UVM_PMR_MEMTYPE_MAX)
    1820             :                                         memtype = 0;
    1821           0 :                                 if (memtype == memtype_init)
    1822             :                                         break;
    1823             :                         }
    1824             :                 }
    1825             :         }
    1826             : 
    1827             :         /*
    1828             :          * Search finished.
    1829             :          *
    1830             :          * Ran out of ranges before enough pages were gathered, or we hit the
    1831             :          * case where found->fpgsz == count - fcount, in which case the
    1832             :          * above exit condition didn't trigger.
    1833             :          *
    1834             :          * On failure, caller will free the pages.
    1835             :          */
    1836           0 :         return fcount;
    1837           0 : }
    1838             : 
    1839             : #ifdef DDB
    1840             : /*
    1841             :  * Print information about pmemrange.
    1842             :  * Does not do locking (so either call it from DDB or acquire fpageq lock
     1843             :  * before invoking).
    1844             :  */
    1845             : void
    1846           0 : uvm_pmr_print(void)
    1847             : {
    1848             :         struct  uvm_pmemrange *pmr;
    1849             :         struct  vm_page *pg;
    1850           0 :         psize_t size[UVM_PMR_MEMTYPE_MAX];
    1851             :         psize_t free;
    1852             :         int     useq_len;
    1853             :         int     mt;
    1854             : 
    1855           0 :         printf("Ranges, use queue:\n");
    1856             :         useq_len = 0;
    1857           0 :         TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
    1858           0 :                 useq_len++;
    1859             :                 free = 0;
    1860           0 :                 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
    1861           0 :                         pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
    1862           0 :                         if (pg != NULL)
    1863           0 :                                 pg--;
    1864             :                         else
    1865           0 :                                 pg = TAILQ_FIRST(&pmr->single[mt]);
    1866           0 :                         size[mt] = (pg == NULL ? 0 : pg->fpgsz);
    1867             : 
    1868           0 :                         RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
    1869           0 :                                 free += pg->fpgsz;
    1870             :                 }
    1871             : 
    1872           0 :                 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
    1873           0 :                     (unsigned long)pmr->low, (unsigned long)pmr->high,
    1874           0 :                     pmr->use, (unsigned long)pmr->nsegs);
    1875           0 :                 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
    1876           0 :                         printf(" maxsegsz[%d]=0x%lx", mt,
    1877           0 :                             (unsigned long)size[mt]);
    1878             :                 }
    1879           0 :                 printf(" free=0x%lx\n", (unsigned long)free);
    1880             :         }
    1881           0 :         printf("#ranges = %d\n", useq_len);
    1882           0 : }
    1883             : #endif
    1884             : 
    1885             : /*
    1886             :  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
    1887             :  * in a specific physmem area.
    1888             :  *
    1889             :  * Returns ENOMEM if the pagedaemon failed to free any pages.
    1890             :  * If not failok, failure will lead to panic.
    1891             :  *
    1892             :  * Must be called with fpageq locked.
    1893             :  */
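                      : /*
                      :  * Typical caller pattern (as in the allocation fail path above):
                      :  *
                      :  *	if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
                      :  *	    flags & UVM_PLA_FAILOK) == 0)
                      :  *		goto retry;
                      :  */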
    1894             : int
    1895           0 : uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
    1896             : {
    1897           0 :         struct uvm_pmalloc pma;
    1898             :         const char *wmsg = "pmrwait";
    1899             : 
    1900           0 :         if (curproc == uvm.pagedaemon_proc) {
    1901             :                 /*
    1902             :                  * This is not that uncommon when the pagedaemon is trying
    1903             :                  * to flush out a large mmapped file. VOP_WRITE will circle
    1904             :                  * back through the buffer cache and try to get more memory.
    1905             :                  * The pagedaemon starts by calling bufbackoff, but we can
    1906             :                  * easily use up that reserve in a single scan iteration.
    1907             :                  */
    1908           0 :                 uvm_unlock_fpageq();
    1909           0 :                 if (bufbackoff(NULL, atop(size)) == 0) {
    1910             :                         uvm_lock_fpageq();
    1911           0 :                         return 0;
    1912             :                 }
    1913             :                 uvm_lock_fpageq();
    1914             : 
    1915             :                 /*
    1916             :                  * XXX detect pagedaemon deadlock - see comment in
    1917             :                  * uvm_wait(), as this is exactly the same issue.
    1918             :                  */
    1919           0 :                 printf("pagedaemon: wait_pla deadlock detected!\n");
    1920           0 :                 msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
    1921             : #if defined(DEBUG)
    1922             :                 /* DEBUG: panic so we can debug it */
    1923             :                 panic("wait_pla pagedaemon deadlock");
    1924             : #endif
    1925           0 :                 return 0;
    1926             :         }
    1927             : 
    1928           0 :         for (;;) {
    1929           0 :                 pma.pm_constraint.ucr_low = low;
    1930           0 :                 pma.pm_constraint.ucr_high = high;
    1931           0 :                 pma.pm_size = size;
    1932           0 :                 pma.pm_flags = UVM_PMA_LINKED;
    1933           0 :                 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
    1934             : 
    1935           0 :                 wakeup(&uvm.pagedaemon);            /* wake the daemon! */
    1936           0 :                 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
    1937           0 :                         msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
    1938             : 
    1939           0 :                 if (!(pma.pm_flags & UVM_PMA_FREED) &&
    1940           0 :                     pma.pm_flags & UVM_PMA_FAIL) {
    1941           0 :                         if (failok)
    1942           0 :                                 return ENOMEM;
    1943           0 :                         printf("uvm_wait_pla: failed to free %lu pages "
    1944           0 :                             "between 0x%lx-0x%lx\n", atop(size), low, high);
    1945             :                 } else
    1946           0 :                         return 0;
    1947             :         }
    1948             :         /* UNREACHABLE */
    1949           0 : }
    1950             : 
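The wait protocol above (queue a uvm_pmalloc on uvm.pmr_control.allocs,
wake the pagedaemon, then sleep until both UVM_PMA_LINKED and UVM_PMA_BUSY
are clear) is the fallback when a physical allocation cannot be satisfied
immediately.  A minimal caller sketch, assuming the uvm_pmr_getpages()
signature from earlier in this file (count and start/end in page frames);
the wrapper name my_getpages_wait() is invented for illustration:

    int
    my_getpages_wait(psize_t count, paddr_t low, paddr_t high,
        struct pglist *pgl)
    {
            int error;

            for (;;) {
                    /* One attempt, no sleeping; 1 segment, no alignment. */
                    if (uvm_pmr_getpages(count, atop(low), atop(high),
                        1, 0, 1, UVM_PLA_NOWAIT, pgl) == 0)
                            return 0;

                    /*
                     * Ask the pagedaemon for memory in our range and
                     * sleep; uvm_wait_pla() takes a byte range with an
                     * inclusive end.  failok=1 turns exhaustion into
                     * ENOMEM instead of retrying forever.
                     */
                    uvm_lock_fpageq();
                    error = uvm_wait_pla(low, high - 1, ptoa(count), 1);
                    uvm_unlock_fpageq();
                    if (error)
                            return error;
            }
    }

Callers rarely need to spell this loop out: passing UVM_PLA_WAITOK to
uvm_pmr_getpages() makes its failure path call uvm_wait_pla() and retry
internally.
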
    1951             : /*
    1952             :  * Wake up uvm_pmalloc sleepers.
    1953             :  */
    1954             : void
    1955           0 : uvm_wakeup_pla(paddr_t low, psize_t len)
    1956             : {
    1957             :         struct uvm_pmalloc *pma, *pma_next;
    1958             :         paddr_t high;
    1959             : 
    1960           0 :         high = low + len;
    1961             : 
    1962             :         /* Wake specific allocations waiting for this memory. */
    1963           0 :         for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
    1964             :             pma = pma_next) {
    1965           0 :                 pma_next = TAILQ_NEXT(pma, pmq);
    1966             : 
    1967           0 :                 if (low < pma->pm_constraint.ucr_high &&
    1968           0 :                     high > pma->pm_constraint.ucr_low) {
    1969           0 :                         pma->pm_flags |= UVM_PMA_FREED;
    1970           0 :                         if (!(pma->pm_flags & UVM_PMA_BUSY)) {
    1971           0 :                                 pma->pm_flags &= ~UVM_PMA_LINKED;
    1972           0 :                                 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
    1973             :                                     pmq);
    1974           0 :                                 wakeup(pma);
    1975           0 :                         }
    1976             :                 }
    1977             :         }
    1978           0 : }
    1979             : 
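The wakeup filter above is the standard half-open interval overlap test:
the freed range [low, high) intersects a sleeper's constraint
[ucr_low, ucr_high) iff low < ucr_high && high > ucr_low.  A
self-contained userland illustration (names invented):

    #include <assert.h>
    #include <stdbool.h>

    static bool
    ranges_overlap(unsigned long lo1, unsigned long hi1,
        unsigned long lo2, unsigned long hi2)
    {
            /* Half-open [lo, hi): empty ranges overlap nothing. */
            return lo1 < hi2 && hi1 > lo2;
    }

    int
    main(void)
    {
            /* Pages freed at [2M, 3M) wake a sleeper bound to [0, 16M)... */
            assert(ranges_overlap(0x200000, 0x300000, 0x0, 0x1000000));
            /* ...but not one bound to ISA DMA memory [0, 1M). */
            assert(!ranges_overlap(0x200000, 0x300000, 0x0, 0x100000));
            return 0;
    }
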
    1980             : void
    1981           0 : uvm_pagezero_thread(void *arg)
    1982             : {
    1983           0 :         struct pglist pgl;
    1984             :         struct vm_page *pg;
    1985             :         int count;
    1986             : 
    1987             :         /* Run at the lowest possible priority. */
    1988           0 :         curproc->p_p->ps_nice = NZERO + PRIO_MAX;
    1989             : 
    1990           0 :         KERNEL_UNLOCK();
    1991             : 
    1992           0 :         TAILQ_INIT(&pgl);
    1993           0 :         for (;;) {
    1994           0 :                 uvm_lock_fpageq();
    1995           0 :                 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
    1996           0 :                     (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
    1997           0 :                      &pgl, 0, 0, 1)) == 0) {
    1998           0 :                         msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI,
    1999             :                             "pgzero", 0);
    2000             :                 }
    2001           0 :                 uvm_unlock_fpageq();
    2002             : 
    2003           0 :                 TAILQ_FOREACH(pg, &pgl, pageq) {
    2004           0 :                         uvm_pagezero(pg);
    2005           0 :                         atomic_setbits_int(&pg->pg_flags, PG_ZERO);
    2006             :                 }
    2007             : 
    2008           0 :                 uvm_lock_fpageq();
    2009           0 :                 while (!TAILQ_EMPTY(&pgl))
    2010           0 :                         uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
    2011           0 :                 uvmexp.zeropages += count;
    2012           0 :                 uvm_unlock_fpageq();
    2013             : 
    2014           0 :                 yield();
    2015             :         }
    2016             : }
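
The zeroing loop above follows a classic batching shape: take a batch of
work from the shared structure under the lock, do the expensive part
(zeroing) with the lock dropped, then merge the results back under the
lock and yield.  A generic userland analogue of the same shape (pthreads;
all names invented):

    #include <pthread.h>
    #include <stddef.h>
    #include <string.h>

    struct page { struct page *next; unsigned char buf[4096]; };

    static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;
    static struct page *dirty, *zeroed;

    static void *
    zero_thread(void *arg)
    {
            struct page *batch, *p;

            (void)arg;
            for (;;) {
                    /* Take the whole dirty list under the lock. */
                    pthread_mutex_lock(&qlock);
                    while ((batch = dirty) == NULL)
                            pthread_cond_wait(&qcv, &qlock);
                    dirty = NULL;
                    pthread_mutex_unlock(&qlock);

                    /* Expensive work runs with the lock dropped. */
                    for (p = batch; p != NULL; p = p->next)
                            memset(p->buf, 0, sizeof(p->buf));

                    /* Merge the now-zeroed pages back under the lock. */
                    pthread_mutex_lock(&qlock);
                    while (batch != NULL) {
                            p = batch;
                            batch = p->next;
                            p->next = zeroed;
                            zeroed = p;
                    }
                    pthread_mutex_unlock(&qlock);
            }
            /* NOTREACHED */
            return NULL;
    }

Unlike this analogue, the kernel loop bounds each batch (at most 16 pages
per uvm_pmr_get1page() call) and stops zeroing once uvmexp.zeropages
reaches UVM_PAGEZERO_TARGET, so the lowest-priority thread never competes
with real work for long.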

Generated by: LCOV version 1.13