/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
 * Copyright 2020 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by BAE Systems, the University of
 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
 * Computing (TC) research program.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2), shm_rename(2), and shm_unlink(2).
 * While most of the implementation is here, vm_mmap.c contains
 * mapping logic changes.
 *
 * posixshmcontrol(1) allows users to inspect the state of the memory
 * objects.  The per-uid swap resource limit controls the total amount
 * of memory that a user can consume for anonymous objects, including
 * shared ones.
 */
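
/*
 * Illustrative userspace usage (not part of this file): a minimal,
 * hypothetical sketch of the consumer-facing API that this code backs,
 * assuming <sys/mman.h>, <fcntl.h>, <string.h> and <unistd.h>; the
 * object name "/example" is arbitrary and error handling is omitted
 * for brevity.
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, getpagesize());
 *	char *p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	strlcpy(p, "hello", getpagesize());
 *	shm_unlink("/example");
 *
 * The store through "p" lands in the swap-backed VM object managed by
 * this file; shm_unlink(2) only removes the name, and the object stays
 * alive until the last mapping and descriptor referencing it are gone.
 */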

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr64 shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate_cookie(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_copyin_path(struct thread *td, const char *userpath_in,
    char **path_out);
static int	shm_deallocate(struct shmfd *shmfd, off_t *offset,
    off_t *length, int flags);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;
static fo_get_seals_t	shm_get_seals;
static fo_add_seals_t	shm_add_seals;
static fo_fallocate_t	shm_fallocate;
static fo_fspacectl_t	shm_fspacectl;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = shm_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_get_seals = shm_get_seals,
	.fo_add_seals = shm_add_seals,
	.fo_fallocate = shm_fallocate,
	.fo_fspacectl = shm_fspacectl,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
};

FEATURE(posix_shm, "POSIX shared memory");

static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "");

static int largepage_reclaim_tries = 1;
SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
    CTLFLAG_RWTUN, &largepage_reclaim_tries, 0,
    "Number of contig reclaims before giving up for default alloc policy");

static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
	    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
	if (rv == VM_PAGER_OK)
		goto found;

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	VM_OBJECT_WLOCK(obj);
	m = vm_page_lookup(obj, idx);
	if (uio->uio_rw == UIO_READ && m == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out tobj's pages because tobj is an OBJT_SWAP
	 * type object.
	 */
	rv = vm_page_grab_valid(&m, obj, idx,
	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
	if (rv != VM_PAGER_OK) {
		VM_OBJECT_WUNLOCK(obj);
		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
		    obj, idx, rv);
		return (EIO);
	}
	VM_OBJECT_WUNLOCK(obj);

found:
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0)
		vm_page_set_dirty(m);
	vm_page_activate(m);
	vm_page_sunbusy(m);

	return (error);
}

int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static u_long count_largepages[MAXPAGESIZES];

static int
shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
    int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	vm_page_t m __diagused;
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind == 0 || pidx >= object->size)
		return (VM_PAGER_FAIL);
	*first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE);

	/*
	 * We only busy the first page in the superpage run.  It is
	 * useless to busy the whole run since we only remove full
	 * superpages, and it takes too long to busy e.g. 512 * 512 ==
	 * 262144 pages constituting a 1G amd64 superpage.
	 */
	m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
	MPASS(m != NULL);

	*last = *first + atop(pagesizes[psind]) - 1;
	return (VM_PAGER_OK);
}

static boolean_t
shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
    int *before, int *after)
{
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind == 0 || pindex >= object->size)
		return (FALSE);
	if (before != NULL) {
		*before = pindex - rounddown2(pindex, pagesizes[psind] /
		    PAGE_SIZE);
	}
	if (after != NULL) {
		*after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) -
		    pindex;
	}
	return (TRUE);
}

static void
shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred)
{
}

static void
shm_largepage_phys_dtor(vm_object_t object)
{
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind != 0) {
		atomic_subtract_long(&count_largepages[psind],
		    object->size / (pagesizes[psind] / PAGE_SIZE));
		vm_wire_sub(object->size);
	} else {
		KASSERT(object->size == 0,
		    ("largepage phys obj %p not initialized bit size %#jx > 0",
		    object, (uintmax_t)object->size));
	}
}

static const struct phys_pager_ops shm_largepage_phys_ops = {
	.phys_pg_populate =	shm_largepage_phys_populate,
	.phys_pg_haspage =	shm_largepage_phys_haspage,
	.phys_pg_ctor =		shm_largepage_phys_ctor,
	.phys_pg_dtor =		shm_largepage_phys_dtor,
};

bool
shm_largepage(struct shmfd *shmfd)
{
	return (shmfd->shm_object->type == OBJT_PHYS);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;
	off_t size;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
		return (EINVAL);
	foffset_lock_uio(fp, uio, flags);
	if (uio->uio_resid > OFF_MAX - uio->uio_offset) {
		/*
		 * Overflow is only an error if we're supposed to expand on
		 * write.  Otherwise, we'll just truncate the write to the
		 * size of the file, which can only grow up to OFF_MAX.
		 */
		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) {
			foffset_unlock_uio(fp, uio, flags);
			return (EFBIG);
		}

		size = shmfd->shm_size;
	} else {
		size = uio->uio_offset + uio->uio_resid;
	}
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    size, &shmfd->shm_mtx);
	}
	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) {
		error = EPERM;
	} else {
		error = 0;
		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 &&
		    size > shmfd->shm_size) {
			error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
		}
		if (error == 0)
			error = uiomove_object(shmfd->shm_object,
			    shmfd->shm_size, uio);
	}
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	struct shm_largepage_conf *conf;
	void *rl_cookie;

	shmfd = fp->f_data;
	switch (com) {
	case FIONBIO:
	case FIOASYNC:
		/*
		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
		 * just like it would on an unlinked regular file
		 */
		return (0);
	case FIOSSHMLPGCNF:
		if (!shm_largepage(shmfd))
			return (ENOTTY);
		conf = data;
		if (shmfd->shm_lp_psind != 0 &&
		    conf->psind != shmfd->shm_lp_psind)
			return (EINVAL);
		if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
		    pagesizes[conf->psind] == 0)
			return (EINVAL);
		if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
			return (EINVAL);

		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
		shmfd->shm_lp_psind = conf->psind;
		shmfd->shm_lp_alloc_policy = conf->alloc_policy;
		shmfd->shm_object->un_pager.phys.data_val = conf->psind;
		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
		return (0);
	case FIOGSHMLPGCNF:
		if (!shm_largepage(shmfd))
			return (ENOTTY);
		conf = data;
		rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
		conf->psind = shmfd->shm_lp_psind;
		conf->alloc_policy = shmfd->shm_lp_alloc_policy;
		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
		return (0);
	default:
		return (ENOTTY);
	}
}
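
/*
 * Illustrative userspace usage of the two largepage ioctls handled
 * above (not part of this file): a hypothetical sketch that assumes
 * "fd" names a shared memory object created in largepage mode (e.g.
 * via shm_create_largepage() where available) and that the necessary
 * headers, <sys/mman.h> among them, are included; error handling is
 * omitted.
 *
 *	struct shm_largepage_conf conf;
 *
 *	memset(&conf, 0, sizeof(conf));
 *	conf.psind = 1;
 *	conf.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
 *	ioctl(fd, FIOSSHMLPGCNF, &conf);
 *	ioctl(fd, FIOGSHMLPGCNF, &conf);
 *
 * psind 1 selects the first non-base entry of pagesizes[], matching
 * the validation in shm_ioctl() above; the second ioctl reads the
 * configuration back.
 */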

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sane-ish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;
	sb->st_nlink = shmfd->shm_object->ref_count;
	if (shm_largepage(shmfd)) {
		sb->st_blocks = shmfd->shm_object->size /
		    (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
	}

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

static int
shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out)
{
	int error;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;

	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;

	/* Construct a full pathname for jailed callers. */
	pr_pathlen = strcmp(pr_path, "/") ==
	    0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(userpath_in, path + pr_pathlen,
	    MAXPATHLEN - pr_pathlen, NULL);
	if (error != 0)
		goto out;

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif

	/* Require paths to start with a '/' character. */
	if (path[pr_pathlen] != '/') {
		error = EINVAL;
		goto out;
	}

	*path_out = path;

out:
	if (error != 0)
		free(path, M_SHMFD);

	return (error);
}

static int
shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base,
    int end)
{
	vm_page_t m;
	int rv;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(base >= 0, ("%s: base %d", __func__, base));
	KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base,
	    end));

retry:
	m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
	if (m != NULL) {
		MPASS(vm_page_all_valid(m));
	} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
		m = vm_page_alloc(object, idx,
		    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
		if (m == NULL)
			goto retry;
		vm_object_pip_add(object, 1);
		VM_OBJECT_WUNLOCK(object);
		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
		VM_OBJECT_WLOCK(object);
		vm_object_pip_wakeup(object);
		if (rv == VM_PAGER_OK) {
			/*
			 * Since the page was not resident, and therefore not
			 * recently accessed, immediately enqueue it for
			 * asynchronous laundering.  The current operation is
			 * not regarded as an access.
			 */
			vm_page_launder(m);
		} else {
			vm_page_free(m);
			VM_OBJECT_WUNLOCK(object);
			return (EIO);
		}
	}
	if (m != NULL) {
		pmap_zero_page_area(m, base, end - base);
		KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid",
		    __func__, m));
		vm_page_set_dirty(m);
		vm_page_xunbusy(m);
	}

	return (0);
}

static int
shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_pindex_t nobjsize;
	vm_ooffset_t delta;
	int base, error;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
	if (length == shmfd->shm_size)
		return (0);
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);

		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			error = shm_partial_page_invalidate(object,
			    OFF_TO_IDX(length), base, PAGE_SIZE);
			if (error)
				return (error);
		}
		delta = IDX_TO_OFF(object->size - nobjsize);

		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Free the swap accounted for shm */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
			return (EPERM);

		/* Try to reserve additional swap space. */
		delta = IDX_TO_OFF(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred))
			return (ENOMEM);
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	return (0);
}

static int
shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t newobjsz;
	vm_pindex_t oldobjsz __unused;
	int aflags, error, i, psind, try;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);

	oldobjsz = object->size;
	newobjsz = OFF_TO_IDX(length);
	if (length == shmfd->shm_size)
		return (0);
	psind = shmfd->shm_lp_psind;
	if (psind == 0 && length != 0)
		return (EINVAL);
	if ((length & (pagesizes[psind] - 1)) != 0)
		return (EINVAL);

	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);
		return (ENOTSUP);	/* Pages are unmanaged. */
#if 0
		vm_object_page_remove(object, newobjsz, oldobjsz, 0);
		object->size = newobjsz;
		shmfd->shm_size = length;
		return (0);
#endif
	}

	if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
		return (EPERM);

	aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
	if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
		aflags |= VM_ALLOC_WAITFAIL;
	try = 0;

	/*
	 * Extend shmfd and object, keeping all already fully
	 * allocated large pages intact even on error, because the
	 * dropped object lock might have allowed mapping of them.
	 */
	while (object->size < newobjsz) {
		m = vm_page_alloc_contig(object, object->size, aflags,
		    pagesizes[psind] / PAGE_SIZE, 0, ~0,
		    pagesizes[psind], 0,
		    VM_MEMATTR_DEFAULT);
		if (m == NULL) {
			VM_OBJECT_WUNLOCK(object);
			if (shmfd->shm_lp_alloc_policy ==
			    SHM_LARGEPAGE_ALLOC_NOWAIT ||
			    (shmfd->shm_lp_alloc_policy ==
			    SHM_LARGEPAGE_ALLOC_DEFAULT &&
			    try >= largepage_reclaim_tries)) {
				VM_OBJECT_WLOCK(object);
				return (ENOMEM);
			}
			error = vm_page_reclaim_contig(aflags,
			    pagesizes[psind] / PAGE_SIZE, 0, ~0,
			    pagesizes[psind], 0) ? 0 :
			    vm_wait_intr(object);
			if (error != 0) {
				VM_OBJECT_WLOCK(object);
				return (error);
			}
			try++;
			VM_OBJECT_WLOCK(object);
			continue;
		}
		try = 0;
		for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) {
			if ((m[i].flags & PG_ZERO) == 0)
				pmap_zero_page(&m[i]);
			vm_page_valid(&m[i]);
			vm_page_xunbusy(&m[i]);
		}
		object->size += OFF_TO_IDX(pagesizes[psind]);
		shmfd->shm_size += pagesizes[psind];
		atomic_add_long(&count_largepages[psind], 1);
		vm_wire_add(atop(pagesizes[psind]));
	}
	return (0);
}

static int
shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	int error;

	VM_OBJECT_WLOCK(shmfd->shm_object);
	error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd,
	    length, rl_cookie) : shm_dotruncate_locked(shmfd, length,
	    rl_cookie);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	return (error);
}

int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	void *rl_cookie;
	int error;

	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
	    &shmfd->shm_mtx);
	error = shm_dotruncate_cookie(shmfd, length, rl_cookie);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	return (error);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	if (largepage) {
		shmfd->shm_object = phys_pager_allocate(NULL,
		    &shm_largepage_phys_ops, NULL, shmfd->shm_size,
		    VM_PROT_DEFAULT, 0, ucred);
		shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
	} else {
		shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
		    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	}
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

static void
shm_init(void *arg)
{
	char name[32];
	int i;

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	new_unrhdr64(&shm_ino_unr, 1);
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));

	for (i = 1; i < MAXPAGESIZES; i++) {
		if (pagesizes[i] == 0)
			break;
#define	M	(1024 * 1024)
#define	G	(1024 * M)
		if (pagesizes[i] >= G)
			snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
		else if (pagesizes[i] >= M)
			snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
		else
			snprintf(name, sizeof(name), "%lu", pagesizes[i]);
#undef G
#undef M
		SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
		    OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
		    "number of non-transient largepages allocated");
	}
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}