/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
 * Copyright 2020 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by BAE Systems, the University of
 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
 * Computing (TC) research program.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2), shm_rename(2), and shm_unlink(2).
 * While most of the implementation is here, vm_mmap.c contains
 * mapping logic changes.
 *
 * posixshmcontrol(1) allows users to inspect the state of the memory
 * objects.  The per-uid swap resource limit controls the total amount of
 * memory that a user can consume for anonymous objects, including
 * shared ones.
 */
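
/*
 * Illustrative userspace usage (editorial sketch, not part of this file or
 * the kernel build; the function name, the "/example" path, and the 4096
 * byte size are placeholders):
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	shm_example(void)
 *	{
 *		char *p;
 *		int fd;
 *
 *		fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *		if (fd == -1)
 *			return (-1);
 *		if (ftruncate(fd, 4096) == -1)
 *			return (-1);
 *		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    fd, 0);
 *		if (p == MAP_FAILED)
 *			return (-1);
 *		strlcpy(p, "hello", 4096);
 *		shm_unlink("/example");
 *		return (0);
 *	}
 *
 * After shm_unlink(2) the object stays alive as long as the descriptor or
 * mapping references it, which is what the reference counting below
 * implements.
 */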

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr64 shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate_cookie(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_copyin_path(struct thread *td, const char *userpath_in,
    char **path_out);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;
static fo_get_seals_t	shm_get_seals;
static fo_add_seals_t	shm_add_seals;
static fo_fallocate_t	shm_fallocate;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = shm_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_get_seals = shm_get_seals,
	.fo_add_seals = shm_add_seals,
	.fo_fallocate = shm_fallocate,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
};

FEATURE(posix_shm, "POSIX shared memory");

static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "");

static int largepage_reclaim_tries = 1;
SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
    CTLFLAG_RWTUN, &largepage_reclaim_tries, 0,
    "Number of contig reclaims before giving up for default alloc policy");

static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
	    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
	if (rv == VM_PAGER_OK)
		goto found;

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	VM_OBJECT_WLOCK(obj);
	m = vm_page_lookup(obj, idx);
	if (uio->uio_rw == UIO_READ && m == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out tobj's pages because tobj is an OBJT_SWAP
	 * type object.
	 */
	rv = vm_page_grab_valid(&m, obj, idx,
	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
	if (rv != VM_PAGER_OK) {
		VM_OBJECT_WUNLOCK(obj);
		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
		    obj, idx, rv);
		return (EIO);
	}
	VM_OBJECT_WUNLOCK(obj);

found:
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0)
		vm_page_set_dirty(m);
	vm_page_activate(m);
	vm_page_sunbusy(m);

	return (error);
}

int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static u_long count_largepages[MAXPAGESIZES];

static int
shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
    int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	vm_page_t m __diagused;
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind == 0 || pidx >= object->size)
		return (VM_PAGER_FAIL);
	*first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE);

	/*
	 * We only busy the first page in the superpage run.  It is
	 * useless to busy the whole run since we only remove full
	 * superpages, and it takes too long to busy e.g. 512 * 512 ==
	 * 262144 pages constituting a 1G amd64 superpage.
	 */
	m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
	MPASS(m != NULL);

	*last = *first + atop(pagesizes[psind]) - 1;
	return (VM_PAGER_OK);
}

static boolean_t
shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
    int *before, int *after)
{
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind == 0 || pindex >= object->size)
		return (FALSE);
	if (before != NULL) {
		*before = pindex - rounddown2(pindex, pagesizes[psind] /
		    PAGE_SIZE);
	}
	if (after != NULL) {
		*after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) -
		    pindex;
	}
	return (TRUE);
}

static void
shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred)
{
}

static void
shm_largepage_phys_dtor(vm_object_t object)
{
	int psind;

	psind = object->un_pager.phys.data_val;
	if (psind != 0) {
		atomic_subtract_long(&count_largepages[psind],
		    object->size / (pagesizes[psind] / PAGE_SIZE));
		vm_wire_sub(object->size);
	} else {
		KASSERT(object->size == 0,
		    ("largepage phys obj %p not initialized bit size %#jx > 0",
		    object, (uintmax_t)object->size));
	}
}

static const struct phys_pager_ops shm_largepage_phys_ops = {
	.phys_pg_populate =	shm_largepage_phys_populate,
	.phys_pg_haspage =	shm_largepage_phys_haspage,
	.phys_pg_ctor =		shm_largepage_phys_ctor,
	.phys_pg_dtor =		shm_largepage_phys_dtor,
};

bool
shm_largepage(struct shmfd *shmfd)
{
	return (shmfd->shm_object->type == OBJT_PHYS);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;
	off_t size;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
		return (EINVAL);
	foffset_lock_uio(fp, uio, flags);
	if (uio->uio_resid > OFF_MAX - uio->uio_offset) {
		/*
		 * Overflow is only an error if we're supposed to expand on
		 * write.  Otherwise, we'll just truncate the write to the
		 * size of the file, which can only grow up to OFF_MAX.
		 */
		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) {
			foffset_unlock_uio(fp, uio, flags);
			return (EFBIG);
		}

		size = shmfd->shm_size;
	} else {
		size = uio->uio_offset + uio->uio_resid;
	}
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    size, &shmfd->shm_mtx);
	}
	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) {
		error = EPERM;
	} else {
		error = 0;
		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 &&
		    size > shmfd->shm_size) {
			error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
		}
		if (error == 0)
			error = uiomove_object(shmfd->shm_object,
			    shmfd->shm_size, uio);
	}
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	struct shm_largepage_conf *conf;
	void *rl_cookie;

	shmfd = fp->f_data;
	switch (com) {
	case FIONBIO:
	case FIOASYNC:
		/*
		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
		 * just like it would on an unlinked regular file
		 */
		return (0);
	case FIOSSHMLPGCNF:
		if (!shm_largepage(shmfd))
			return (ENOTTY);
		conf = data;
		if (shmfd->shm_lp_psind != 0 &&
		    conf->psind != shmfd->shm_lp_psind)
			return (EINVAL);
		if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
		    pagesizes[conf->psind] == 0)
			return (EINVAL);
		if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
			return (EINVAL);

		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
		shmfd->shm_lp_psind = conf->psind;
		shmfd->shm_lp_alloc_policy = conf->alloc_policy;
		shmfd->shm_object->un_pager.phys.data_val = conf->psind;
		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
		return (0);
	case FIOGSHMLPGCNF:
		if (!shm_largepage(shmfd))
			return (ENOTTY);
		conf = data;
		rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
		conf->psind = shmfd->shm_lp_psind;
		conf->alloc_policy = shmfd->shm_lp_alloc_policy;
		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
		return (0);
	default:
		return (ENOTTY);
	}
}
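
/*
 * Illustrative userspace sketch of the largepage configuration ioctls
 * handled above (editorial example, not compiled here).  It assumes "fd"
 * refers to a largepage shared memory object (one created with the
 * SHM_LARGEPAGE flag) and the usual <sys/ioctl.h>, <string.h>, and <err.h>
 * userspace headers; the psind value is a placeholder and commonly selects
 * 2M pages on amd64 when set to 1:
 *
 *	struct shm_largepage_conf conf;
 *
 *	memset(&conf, 0, sizeof(conf));
 *	conf.psind = 1;
 *	conf.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
 *	if (ioctl(fd, FIOSSHMLPGCNF, &conf) == -1)
 *		err(1, "FIOSSHMLPGCNF");
 *	if (ioctl(fd, FIOGSHMLPGCNF, &conf) == -1)
 *		err(1, "FIOGSHMLPGCNF");
 */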

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sane-ish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;
	sb->st_nlink = shmfd->shm_object->ref_count;
	if (shm_largepage(shmfd)) {
		sb->st_blocks = shmfd->shm_object->size /
		    (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
	}

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

static int
shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out)
{
	int error;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;

	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;

	/* Construct a full pathname for jailed callers. */
	pr_pathlen = strcmp(pr_path, "/") ==
	    0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(userpath_in, path + pr_pathlen,
	    MAXPATHLEN - pr_pathlen, NULL);
	if (error != 0)
		goto out;

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif

	/* Require paths to start with a '/' character. */
	if (path[pr_pathlen] != '/') {
		error = EINVAL;
		goto out;
	}

	*path_out = path;

out:
	if (error != 0)
		free(path, M_SHMFD);

	return (error);
}

static int
shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
	if (length == shmfd->shm_size)
		return (0);
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);

		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
			if (m != NULL) {
				MPASS(vm_page_all_valid(m));
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx,
				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
				if (m == NULL)
					goto retry;
				vm_object_pip_add(object, 1);
				VM_OBJECT_WUNLOCK(object);
				rv = vm_pager_get_pages(object, &m, 1, NULL,
				    NULL);
				VM_OBJECT_WLOCK(object);
				vm_object_pip_wakeup(object);
				if (rv == VM_PAGER_OK) {
					/*
					 * Since the page was not resident,
					 * and therefore not recently
					 * accessed, immediately enqueue it
					 * for asynchronous laundering.  The
					 * current operation is not regarded
					 * as an access.
					 */
					vm_page_launder(m);
				} else {
					vm_page_free(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(vm_page_all_valid(m),
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_set_dirty(m);
				vm_page_xunbusy(m);
			}
		}
		delta = IDX_TO_OFF(object->size - nobjsize);

		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Free the swap accounted for shm */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
			return (EPERM);

		/* Try to reserve additional swap space. */
		delta = IDX_TO_OFF(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred))
			return (ENOMEM);
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	return (0);
}

static int
shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t newobjsz;
	vm_pindex_t oldobjsz __unused;
	int aflags, error, i, psind, try;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);

	oldobjsz = object->size;
	newobjsz = OFF_TO_IDX(length);
	if (length == shmfd->shm_size)
		return (0);
	psind = shmfd->shm_lp_psind;
	if (psind == 0 && length != 0)
		return (EINVAL);
	if ((length & (pagesizes[psind] - 1)) != 0)
		return (EINVAL);

	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);
		return (ENOTSUP);	/* Pages are unmanaged. */
#if 0
		vm_object_page_remove(object, newobjsz, oldobjsz, 0);
		object->size = newobjsz;
		shmfd->shm_size = length;
		return (0);
#endif
	}

	if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
		return (EPERM);

	aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
	if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
		aflags |= VM_ALLOC_WAITFAIL;
	try = 0;

	/*
	 * Extend shmfd and object, keeping all already fully
	 * allocated large pages intact even on error, because the dropped
	 * object lock might have allowed mapping of them.
	 */
	while (object->size < newobjsz) {
		m = vm_page_alloc_contig(object, object->size, aflags,
		    pagesizes[psind] / PAGE_SIZE, 0, ~0,
		    pagesizes[psind], 0,
		    VM_MEMATTR_DEFAULT);
		if (m == NULL) {
			VM_OBJECT_WUNLOCK(object);
			if (shmfd->shm_lp_alloc_policy ==
			    SHM_LARGEPAGE_ALLOC_NOWAIT ||
			    (shmfd->shm_lp_alloc_policy ==
			    SHM_LARGEPAGE_ALLOC_DEFAULT &&
			    try >= largepage_reclaim_tries)) {
				VM_OBJECT_WLOCK(object);
				return (ENOMEM);
			}
			error = vm_page_reclaim_contig(aflags,
			    pagesizes[psind] / PAGE_SIZE, 0, ~0,
			    pagesizes[psind], 0) ? 0 :
			    vm_wait_intr(object);
			if (error != 0) {
				VM_OBJECT_WLOCK(object);
				return (error);
			}
			try++;
			VM_OBJECT_WLOCK(object);
			continue;
		}
		try = 0;
		for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) {
			if ((m[i].flags & PG_ZERO) == 0)
				pmap_zero_page(&m[i]);
			vm_page_valid(&m[i]);
			vm_page_xunbusy(&m[i]);
		}
		object->size += OFF_TO_IDX(pagesizes[psind]);
		shmfd->shm_size += pagesizes[psind];
		atomic_add_long(&count_largepages[psind], 1);
		vm_wire_add(atop(pagesizes[psind]));
	}
	return (0);
}

static int
shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	int error;

	VM_OBJECT_WLOCK(shmfd->shm_object);
	error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd,
	    length, rl_cookie) : shm_dotruncate_locked(shmfd, length,
	    rl_cookie);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	return (error);
}

int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	void *rl_cookie;
	int error;

	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
	    &shmfd->shm_mtx);
	error = shm_dotruncate_cookie(shmfd, length, rl_cookie);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	return (error);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	if (largepage) {
		shmfd->shm_object = phys_pager_allocate(NULL,
		    &shm_largepage_phys_ops, NULL, shmfd->shm_size,
		    VM_PROT_DEFAULT, 0, ucred);
		shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
	} else {
		shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
		    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	}
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

static void
shm_init(void *arg)
{
	char name[32];
	int i;

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	new_unrhdr64(&shm_ino_unr, 1);
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));

	for (i = 1; i < MAXPAGESIZES; i++) {
		if (pagesizes[i] == 0)
			break;
#define	M	(1024 * 1024)
#define	G	(1024 * M)
		if (pagesizes[i] >= G)
			snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
		else if (pagesizes[i] >= M)
			snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
		else
			snprintf(name, sizeof(name), "%lu", pagesizes[i]);
#undef G
#undef M
		SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
		    OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
		    "number of non-transient largepages allocated");
	}
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
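
/*
 * Editorial sketch of how a caller is expected to use the helpers below
 * (the actual callers, e.g. the shm_open(2) implementation, fall outside
 * this excerpt; "newfd" stands in for a freshly allocated shmfd and "path"
 * for a kernel copy of the user-supplied name):
 *
 *	fnv = fnv_32_str(path, FNV1_32_INIT);
 *	sx_xlock(&shm_dict_lock);
 *	shmfd = shm_lookup(path, fnv);
 *	if (shmfd == NULL)
 *		shm_insert(path, fnv, newfd);
 *	sx_xunlock(&shm_dict_lock);
 */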
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int