/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Superpage reservation management module
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/ktr.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>

/*
 * The reservation system supports the speculative allocation of large physical
 * pages ("superpages").  Speculative allocation enables the fully automatic
 * utilization of superpages by the virtual memory system.  In other words, no
 * programmatic directives are required to use superpages.
 */

#if VM_NRESERVLEVEL > 0

#ifndef VM_LEVEL_0_ORDER_MAX
#define	VM_LEVEL_0_ORDER_MAX	VM_LEVEL_0_ORDER
#endif

/*
 * The number of small pages that are contained in a level 0 reservation
 */
#define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
#define	VM_LEVEL_0_NPAGES_MAX	(1 << VM_LEVEL_0_ORDER_MAX)

/*
 * The number of bits by which a physical address is shifted to obtain the
 * reservation number
 */
#define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)

/*
 * The size of a level 0 reservation in bytes
 */
#define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)
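
/*
 * Illustrative example (added comment, assumes amd64-like parameters):
 * where VM_LEVEL_0_ORDER is 9 and PAGE_SHIFT is 12, a level 0 reservation
 * spans 1 << 9 == 512 small pages, VM_LEVEL_0_SHIFT is 21, and
 * VM_LEVEL_0_SIZE is 2 MB, matching that platform's superpage size.
 */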

/*
 * Computes the index of the small page underlying the given (object, pindex)
 * within the reservation's array of small pages.
 */
#define	VM_RESERV_INDEX(object, pindex)	\
    (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
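
/*
 * Illustrative example (added comment): with 512-page reservations and an
 * object whose pg_color is 0, VM_RESERV_INDEX(object, 515) is
 * (0 + 515) & 511 == 3, i.e., offset 515 would be backed by the fourth
 * small page of its reservation.
 */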

/*
 * The size of a population map entry
 */
typedef	u_long		popmap_t;

/*
 * The number of bits in a population map entry
 */
#define	NBPOPMAP	(NBBY * sizeof(popmap_t))

/*
 * The number of population map entries in a reservation
 */
#define	NPOPMAP		howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
#define	NPOPMAP_MAX	howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP)

/*
 * Number of elapsed ticks before we update the LRU queue position.  Used
 * to reduce contention and churn on the list.
 */
#define	PARTPOPSLOP	1

/*
 * Clear a bit in the population map.
 */
static __inline void
popmap_clear(popmap_t popmap[], int i)
{

	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
}

/*
 * Set a bit in the population map.
 */
static __inline void
popmap_set(popmap_t popmap[], int i)
{

	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
}

/*
 * Is a bit in the population map clear?
 */
static __inline boolean_t
popmap_is_clear(popmap_t popmap[], int i)
{

	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
}

/*
 * Is a bit in the population map set?
 */
static __inline boolean_t
popmap_is_set(popmap_t popmap[], int i)
{

	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
}
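
/*
 * Illustrative example (added comment, assumes an LP64 platform): popmap_t
 * is then 64 bits wide, so NBPOPMAP is 64 and a 512-page reservation needs
 * NPOPMAP == 8 words.  For page index 70, the helpers above operate on bit
 * 70 % 64 == 6 of word 70 / 64 == 1.
 */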

/*
 * The reservation structure
 *
 * A reservation structure is constructed whenever a large physical page is
 * speculatively allocated to an object.  The reservation provides the small
 * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
 * within that object.  The reservation's "popcnt" tracks the number of these
 * small physical pages that are in use at any given time.  When and if the
 * reservation is not fully utilized, it appears in the queue of partially
 * populated reservations.  The reservation always appears on the containing
 * object's list of reservations.
 *
 * A partially populated reservation can be broken and reclaimed at any time.
 *
 * c - constant after boot
 * d - vm_reserv_domain_lock
 * o - vm_reserv_object_lock
 * r - vm_reserv_lock
 * s - vm_reserv_domain_scan_lock
 */
struct vm_reserv {
	struct mtx	lock;			/* reservation lock. */
	TAILQ_ENTRY(vm_reserv) partpopq;	/* (d, r) per-domain queue. */
	LIST_ENTRY(vm_reserv) objq;		/* (o, r) object queue */
	vm_object_t	object;			/* (o, r) containing object */
	vm_pindex_t	pindex;			/* (o, r) offset in object */
	vm_page_t	pages;			/* (c) first page  */
	uint16_t	popcnt;			/* (r) # of pages in use */
	uint8_t		domain;			/* (c) NUMA domain. */
	char		inpartpopq;		/* (d, r) */
	int		lasttick;		/* (r) last pop update tick. */
	popmap_t	popmap[NPOPMAP_MAX];	/* (r) bit vector, used pages */
};

TAILQ_HEAD(vm_reserv_queue, vm_reserv);

#define	vm_reserv_lockptr(rv)		(&(rv)->lock)
#define	vm_reserv_assert_locked(rv)					\
	    mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
#define	vm_reserv_lock(rv)		mtx_lock(vm_reserv_lockptr(rv))
#define	vm_reserv_trylock(rv)		mtx_trylock(vm_reserv_lockptr(rv))
#define	vm_reserv_unlock(rv)		mtx_unlock(vm_reserv_lockptr(rv))

/*
 * The reservation array
 *
 * This array is analogous in function to vm_page_array.  It differs in the
 * respect that it may contain a greater number of reservation
 * structures than there are (physical) superpages.  These "invalid"
 * reservation structures exist to trade-off space for time in the
 * implementation of vm_reserv_from_page().  Invalid reservation structures are
 * distinguishable from "valid" reservation structures by inspecting the
 * reservation's "pages" field.  Invalid reservation structures have a NULL
 * "pages" field.
 *
 * vm_reserv_from_page() maps a small (physical) page to an element of this
 * array by computing a physical reservation number from the page's physical
 * address.  The physical reservation number is used as the array index.
 *
 * An "active" reservation is a valid reservation structure that has a non-NULL
 * "object" field and a non-zero "popcnt" field.  In other words, every active
 * reservation belongs to a particular object.  Moreover, every active
 * reservation has an entry in the containing object's list of reservations.  
 */
static vm_reserv_t vm_reserv_array;

/*
 * The per-domain partially populated reservation queues
 *
 * These queues enable the fast recovery of an unused free small page from a
 * partially populated reservation.  The reservation at the head of a queue
 * is the least recently changed, partially populated reservation.
 *
 * Access to this queue is synchronized by the per-domain reservation lock.
 * Threads reclaiming free pages from the queue must hold the per-domain scan
 * lock.
 */
struct vm_reserv_domain {
	struct mtx 		lock;
	struct vm_reserv_queue	partpop;	/* (d) */
	struct vm_reserv	marker;		/* (d, s) scan marker/lock */
} __aligned(CACHE_LINE_SIZE);

static struct vm_reserv_domain vm_rvd[MAXMEMDOM];

#define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
#define	vm_reserv_domain_assert_locked(d)	\
	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
#define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
#define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))

#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)

static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Reservation Info");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
    &vm_reserv_broken, "Cumulative number of broken reservations");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
    &vm_reserv_freed, "Cumulative number of freed reservations");

static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
    NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");

static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
    sysctl_vm_reserv_partpopq, "A",
    "Partially populated reservation queues");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
    &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");

/*
 * The object lock pool is used to synchronize the rvq.  We cannot use a
 * pool mutex because it is required before malloc works.
 *
 * The "hash" function could be made faster without divide and modulo.
 */
#define	VM_RESERV_OBJ_LOCK_COUNT	MAXCPU

struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];

#define	vm_reserv_object_lock_idx(object)			\
	    (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
#define	vm_reserv_object_lock_ptr(object)			\
	    &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
#define	vm_reserv_object_lock(object)				\
	    mtx_lock(vm_reserv_object_lock_ptr((object)))
#define	vm_reserv_object_unlock(object)				\
	    mtx_unlock(vm_reserv_object_lock_ptr((object)))

static void		vm_reserv_break(vm_reserv_t rv);
static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
			    vm_pindex_t pindex);
static void		vm_reserv_populate(vm_reserv_t rv, int index);
static void		vm_reserv_reclaim(vm_reserv_t rv);

/*
 * Returns the current number of full reservations.
 *
 * Since the number of full reservations is computed without acquiring any
 * locks, the returned value is inexact.
 */
static int
sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
{
	vm_paddr_t paddr;
	struct vm_phys_seg *seg;
	vm_reserv_t rv;
	int fullpop, segind;

	fullpop = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
#ifdef VM_PHYSSEG_SPARSE
		rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
		    (seg->start >> VM_LEVEL_0_SHIFT);
#else
		rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
#endif
		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
		    VM_LEVEL_0_SIZE <= seg->end) {
			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
			paddr += VM_LEVEL_0_SIZE;
			rv++;
		}
	}
	return (sysctl_handle_int(oidp, &fullpop, 0, req));
}

/*
 * Describes the current state of the partially populated reservation queue.
 */
static int
sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	vm_reserv_t rv;
	int counter, error, domain, level, unused_pages;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
	for (domain = 0; domain < vm_ndomains; domain++) {
		for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
			counter = 0;
			unused_pages = 0;
			vm_reserv_domain_lock(domain);
			TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
				if (rv == &vm_rvd[domain].marker)
					continue;
				counter++;
				unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
			}
			vm_reserv_domain_unlock(domain);
			sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
			    domain, level,
			    unused_pages * ((int)PAGE_SIZE / 1024), counter);
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Remove a reservation from the object's objq.
 */
static void
vm_reserv_remove(vm_reserv_t rv)
{
	vm_object_t object;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_remove: reserv %p is free", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
	object = rv->object;
	vm_reserv_object_lock(object);
	LIST_REMOVE(rv, objq);
	rv->object = NULL;
	vm_reserv_object_unlock(object);
}

/*
 * Insert a new reservation into the object's objq.
 */
static void
vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
{
	int i;

	vm_reserv_assert_locked(rv);
	CTR6(KTR_VM,
	    "%s: rv %p(%p) object %p new %p popcnt %d",
	    __FUNCTION__, rv, rv->pages, rv->object, object,
	    rv->popcnt);
	KASSERT(rv->object == NULL,
	    ("vm_reserv_insert: reserv %p isn't free", rv));
	KASSERT(rv->popcnt == 0,
	    ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
	for (i = 0; i < NPOPMAP; i++)
		KASSERT(rv->popmap[i] == 0,
		    ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
	vm_reserv_object_lock(object);
	rv->pindex = pindex;
	rv->object = object;
	rv->lasttick = ticks;
	LIST_INSERT_HEAD(&object->rvq, rv, objq);
	vm_reserv_object_unlock(object);
}

/*
 * Reduces the given reservation's population count.  If the population count
 * becomes zero, the reservation is destroyed.  Additionally, moves the
 * reservation to the tail of the partially populated reservation queue if the
 * population count is non-zero.
 */
static void
vm_reserv_depopulate(vm_reserv_t rv, int index)
{
	struct vm_domain *vmd;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_depopulate: reserv %p is free", rv));
	KASSERT(popmap_is_set(rv->popmap, index),
	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
	    index));
	KASSERT(rv->popcnt > 0,
	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	if (rv->popcnt == VM_LEVEL_0_NPAGES) {
		KASSERT(rv->pages->psind == 1,
		    ("vm_reserv_depopulate: reserv %p is already demoted",
		    rv));
		rv->pages->psind = 0;
	}
	popmap_clear(rv->popmap, index);
	rv->popcnt--;
	if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
	    rv->popcnt == 0) {
		vm_reserv_domain_lock(rv->domain);
		if (rv->inpartpopq) {
			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
			rv->inpartpopq = FALSE;
		}
		if (rv->popcnt != 0) {
			rv->inpartpopq = TRUE;
			TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
			    partpopq);
		}
		vm_reserv_domain_unlock(rv->domain);
		rv->lasttick = ticks;
	}
	vmd = VM_DOMAIN(rv->domain);
	if (rv->popcnt == 0) {
		vm_reserv_remove(rv);
		vm_domain_free_lock(vmd);
		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		counter_u64_add(vm_reserv_freed, 1);
	}
	vm_domain_freecnt_inc(vmd, 1);
}

/*
 * Returns the reservation to which the given page might belong.
 */
static __inline vm_reserv_t
vm_reserv_from_page(vm_page_t m)
{
#ifdef VM_PHYSSEG_SPARSE
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return (seg->first_reserv + (VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT) -
	    (seg->start >> VM_LEVEL_0_SHIFT));
#else
	return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
#endif
}
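
/*
 * Illustrative example (added comment, assumes 2 MB reservations, i.e.
 * VM_LEVEL_0_SHIFT == 21, and a dense vm_reserv_array): a page at physical
 * address 0x12345000 maps to vm_reserv_array[0x12345000 >> 21], i.e. entry
 * 0x91, the reservation covering physical addresses [0x12200000, 0x12400000).
 */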

/*
 * Returns an existing reservation covering the given page index within the
 * specified object, or NULL, and initializes the successor pointer *msuccp.
 */
static vm_reserv_t
vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred, vm_page_t *msuccp)
{
	vm_reserv_t rv;
	vm_page_t msucc;

	msucc = NULL;
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_reserv_from_object: object doesn't contain mpred"));
		KASSERT(mpred->pindex < pindex,
		    ("vm_reserv_from_object: mpred doesn't precede pindex"));
		rv = vm_reserv_from_page(mpred);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL) {
		KASSERT(msucc->pindex > pindex,
		    ("vm_reserv_from_object: msucc doesn't succeed pindex"));
		rv = vm_reserv_from_page(msucc);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
	}
	rv = NULL;

found:
	*msuccp = msucc;

	return (rv);
}

/*
 * Returns TRUE if the given reservation contains the given page index and
 * FALSE otherwise.
 */
static __inline boolean_t
vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
{

	return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
}
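
/*
 * Illustrative example (added comment): the test above is an unsigned range
 * check.  With 512-page reservations and rv->pindex == 1024, any pindex in
 * [1024, 1536) yields a difference below 512, so the masked value is zero
 * and TRUE is returned; pindex 1023 or 1536 leaves high bits set and FALSE
 * is returned.
 */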

/*
 * Increases the given reservation's population count.  Moves the reservation
 * to the tail of the partially populated reservation queue.
 */
static void
vm_reserv_populate(vm_reserv_t rv, int index)
{

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_populate: reserv %p is free", rv));
	KASSERT(popmap_is_clear(rv->popmap, index),
	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
	    index));
	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
	    ("vm_reserv_populate: reserv %p is already full", rv));
	KASSERT(rv->pages->psind == 0,
	    ("vm_reserv_populate: reserv %p is already promoted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_populate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	popmap_set(rv->popmap, index);
	rv->popcnt++;
	if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
	    rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
		return;
	rv->lasttick = ticks;
	vm_reserv_domain_lock(rv->domain);
	if (rv->inpartpopq) {
		TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
		rv->inpartpopq = FALSE;
	}
	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
		rv->inpartpopq = TRUE;
		TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
	} else {
		KASSERT(rv->pages->psind == 0,
		    ("vm_reserv_populate: reserv %p is already promoted",
		    rv));
		rv->pages->psind = 1;
	}
	vm_reserv_domain_unlock(rv->domain);
}

/*
 * Allocates a contiguous set of physical pages of the given size "npages"
 * from existing or newly created reservations.  All of the physical pages
 * must be at or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross any
 * physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object must be locked.
 */
vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_domain *vmd;
	vm_paddr_t pa, size;
	vm_page_t m, m_ret, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	u_long allocpages, maxpages, minpages;
	int i, index, n;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex + npages > object->size)
		return (NULL);

	/*
	 * All reservations of a particular size have the same alignment.
	 * Assuming that the first page is allocated from a reservation, the
	 * least significant bits of its physical address can be determined
	 * from its offset from the beginning of the reservation and the size
	 * of the reservation.
	 *
	 * Could the specified index within a reservation of the smallest
	 * possible size satisfy the alignment and boundary requirements?
	 */
	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
	if ((pa & (alignment - 1)) != 0)
		return (NULL);
	size = npages << PAGE_SHIFT;
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_contig: domain mismatch"));
		index = VM_RESERV_INDEX(object, pindex);
		/* Does the allocation fit within the reservation? */
		if (index + npages > VM_LEVEL_0_NPAGES)
			return (NULL);
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object)
			goto out;
		m = &rv->pages[index];
		pa = VM_PAGE_TO_PHYS(m);
		if (pa < low || pa + size > high ||
		    (pa & (alignment - 1)) != 0 ||
		    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
			goto out;
		/* Handle vm_page_rename(m, new_object, ...). */
		for (i = 0; i < npages; i++)
			if (popmap_is_set(rv->popmap, index + i))
				goto out;
		if (!vm_domain_allocate(vmd, req, npages))
			goto out;
		for (i = 0; i < npages; i++)
			vm_reserv_populate(rv, index + i);
		vm_reserv_unlock(rv);
		return (m);
out:
		vm_reserv_unlock(rv);
		return (NULL);
	}

	/*
	 * Could at least one reservation fit between the first index to the
	 * left that can be used ("leftcap") and the first index to the right
	 * that cannot be used ("rightcap")?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	minpages = VM_RESERV_INDEX(object, pindex) + npages;
	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
	allocpages = maxpages;
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + maxpages > rightcap) {
			if (maxpages == VM_LEVEL_0_NPAGES) {
				vm_reserv_object_unlock(object);
				return (NULL);
			}

			/*
			 * At least one reservation will fit between "leftcap"
			 * and "rightcap".  However, a reservation for the
			 * last of the requested pages will not fit.  Reduce
			 * the size of the upcoming allocation accordingly.
			 */
			allocpages = minpages;
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + maxpages > object->size) {
		if (maxpages == VM_LEVEL_0_NPAGES)
			return (NULL);
		allocpages = minpages;
	}

	/*
	 * Allocate the physical pages.  The alignment and boundary specified
	 * for this allocation may be different from the alignment and
	 * boundary specified for the requested pages.  For instance, the
	 * specified index may not be the first page within the first new
	 * reservation.
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, npages)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_contig(domain, allocpages, low, high,
		    ulmax(alignment, VM_LEVEL_0_SIZE),
		    boundary > VM_LEVEL_0_SIZE ? boundary : 0);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, npages);
			return (NULL);
		}
	} else
		return (NULL);
	KASSERT(vm_phys_domain(m) == domain,
	    ("vm_reserv_alloc_contig: Page domain does not match requested."));

	/*
	 * The allocated physical pages always begin at a reservation
	 * boundary, but they do not always end at a reservation boundary.
	 * Initialize every reservation that is completely covered by the
	 * allocated physical pages.
	 */
	m_ret = NULL;
	index = VM_RESERV_INDEX(object, pindex);
	do {
		rv = vm_reserv_from_page(m);
		KASSERT(rv->pages == m,
		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
		    rv));
		vm_reserv_lock(rv);
		vm_reserv_insert(rv, object, first);
		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
		for (i = 0; i < n; i++)
			vm_reserv_populate(rv, index + i);
		npages -= n;
		if (m_ret == NULL) {
			m_ret = &rv->pages[index];
			index = 0;
		}
		vm_reserv_unlock(rv);
		m += VM_LEVEL_0_NPAGES;
		first += VM_LEVEL_0_NPAGES;
		allocpages -= VM_LEVEL_0_NPAGES;
	} while (allocpages >= VM_LEVEL_0_NPAGES);
	return (m_ret);
}

/*
 * Allocate a physical page from an existing or newly created reservation.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object must be locked.
 */
vm_page_t
vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred)
{
	struct vm_domain *vmd;
	vm_page_t m, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	int index;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex >= object->size)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_page: domain mismatch"));
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		index = VM_RESERV_INDEX(object, pindex);
		m = &rv->pages[index];
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object ||
		    /* Handle vm_page_rename(m, new_object, ...). */
		    popmap_is_set(rv->popmap, index)) {
			m = NULL;
			goto out;
		}
		if (vm_domain_allocate(vmd, req, 1) == 0)
			m = NULL;
		else
			vm_reserv_populate(rv, index);
out:
		vm_reserv_unlock(rv);
		return (m);
	}

	/*
	 * Could a reservation fit between the first index to the left that
	 * can be used and the first index to the right that cannot be used?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + VM_LEVEL_0_NPAGES > rightcap) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + VM_LEVEL_0_NPAGES > object->size)
		return (NULL);

	/*
	 * Allocate and populate the new reservation.
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, 1)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
		    VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, 1);
			return (NULL);
		}
	} else
		return (NULL);
	rv = vm_reserv_from_page(m);
	vm_reserv_lock(rv);
	KASSERT(rv->pages == m,
	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
	vm_reserv_insert(rv, object, first);
	index = VM_RESERV_INDEX(object, pindex);
	vm_reserv_populate(rv, index);
	vm_reserv_unlock(rv);

	return (&rv->pages[index]);
}

/*
 * Breaks the given reservation.  All free pages in the reservation
 * are returned to the physical memory allocator.  The reservation's
 * population count and map are reset to their initial state.
 *
 * The given reservation must not be in the partially populated reservation
 * queue.
 */
static void
vm_reserv_break(vm_reserv_t rv)
{
	u_long changes;
	int bitpos, hi, i, lo;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	vm_reserv_remove(rv);
	rv->pages->psind = 0;
	hi = lo = -1;
	for (i = 0; i <= NPOPMAP; i++) {
		/*
		 * "changes" is a bitmask that marks where a new sequence of
		 * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
		 * considered to be 1 if and only if lo == hi.  The bits of
		 * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
		 */
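		/*
		 * Illustrative example (added comment): suppose lo == hi and
		 * bits 2 through 6 are the only clear bits in popmap[i].
		 * Then "changes" has exactly bits 2 and 7 set: bit 2 begins
		 * the run of 0s (lo is set) and bit 7 ends it (hi is set),
		 * so the five free pages at indices NBPOPMAP * i + 2 through
		 * + 6 are returned to the physical memory allocator below.
		 */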
		if (i == NPOPMAP)
			changes = lo != hi;
		else {
			changes = rv->popmap[i];
			changes ^= (changes << 1) | (lo == hi);
			rv->popmap[i] = 0;
		}
		while (changes != 0) {
			/*
			 * If the next change marked begins a run of 0s, set
			 * lo to mark that position.  Otherwise set hi and
			 * free pages from lo up to hi.
			 */
			bitpos = ffsl(changes) - 1;
			changes ^= 1UL << bitpos;
			if (lo == hi)
				lo = NBPOPMAP * i + bitpos;
			else {
				hi = NBPOPMAP * i + bitpos;
				vm_domain_free_lock(VM_DOMAIN(rv->domain));
				vm_phys_enqueue_contig(&rv->pages[lo], hi - lo);
				vm_domain_free_unlock(VM_DOMAIN(rv->domain));
				lo = hi;
			}
		}
	}
	rv->popcnt = 0;
	counter_u64_add(vm_reserv_broken, 1);
}

/*
 * Breaks all reservations belonging to the given object.