/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
 * All rights reserved.
 *
 * Portions of this software were developed by BAE Systems, the University of
 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
 * Computing (TC) research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2), shm_rename(2), and shm_unlink(2).
 * While most of the implementation is here, vm_mmap.c contains
 * mapping logic changes.
 *
 * posixshmcontrol(1) allows users to inspect the state of the memory
 * objects.  The per-uid swap resource limit controls the total amount of
 * memory that a user can consume for anonymous objects, including shared
 * ones.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr64 shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_copyin_path(struct thread *td, const char *userpath_in,
    char **path_out);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;
static fo_get_seals_t	shm_get_seals;
static fo_add_seals_t	shm_add_seals;
static fo_fallocate_t	shm_fallocate;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = shm_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_get_seals = shm_get_seals,
	.fo_add_seals = shm_add_seals,
	.fo_fallocate = shm_fallocate,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");

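/*
 * Transfer at most one page between the object backing a shm segment and
 * the given uio.  The page is soft-busied around uiomove_fromphys(); reads
 * from ranges with neither a resident page nor a swap block are served
 * from zero_region so that sparse objects are not instantiated.
 */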
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
	    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
	if (rv == VM_PAGER_OK)
		goto found;

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	VM_OBJECT_WLOCK(obj);
	m = vm_page_lookup(obj, idx);
	if (uio->uio_rw == UIO_READ && m == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out tobj's pages because tobj is an OBJT_SWAP
	 * type object.
	 */
	rv = vm_page_grab_valid(&m, obj, idx,
	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
	if (rv != VM_PAGER_OK) {
		VM_OBJECT_WUNLOCK(obj);
		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
		    obj, idx, rv);
		return (EIO);
	}
	VM_OBJECT_WUNLOCK(obj);

found:
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0)
		vm_page_set_dirty(m);
	vm_page_activate(m);
	vm_page_sunbusy(m);

	return (error);
}

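/*
 * Perform a uiomove against the pages of a shm backing object, iterating
 * uiomove_object_page() until the request is satisfied, the end of the
 * object is reached, or no further progress is made.
 */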
int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

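/*
 * lseek(2) on a shm descriptor: compute the new offset for L_SET, L_INCR,
 * or L_XTND, guarding against overflow, and reject offsets outside the
 * range [0, shm_size].
 */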
static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

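/*
 * read(2) on a shm descriptor: MAC check, then copy data out of the
 * backing object under a read range lock covering the affected bytes.
 */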
static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

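/*
 * write(2) on a shm descriptor: MAC check, take a write range lock (the
 * whole object when writing at the file's own offset), honor F_SEAL_WRITE,
 * and copy data into the backing object.
 */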
static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}
	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0)
		error = EPERM;
	else
		error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

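/*
 * ftruncate(2) on a shm descriptor: MAC check, then defer to
 * shm_dotruncate().
 */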
static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

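/*
 * ioctl(2) on a shm descriptor: only FIONBIO and FIOASYNC are accepted,
 * and both are no-ops.
 */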
static int
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	switch (com) {
	case FIONBIO:
	case FIOASYNC:
		/*
		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
		 * just like it would on an unlinked regular file
		 */
		return (0);
	default:
		return (ENOTTY);
	}
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	
	/*
	 * Attempt to return sane-ish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;
	sb->st_nlink = shmfd->shm_object->ref_count;

	return (0);
}

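/*
 * Last close of a shm descriptor: detach the shmfd from the file and drop
 * the file's reference on it.
 */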
static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

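/*
 * Copy a shm path in from userspace, prefixing it with the caller's jail
 * root and requiring the result to begin with '/'.  On success the caller
 * owns the returned M_SHMFD buffer and must free it.
 */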
static int
shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out)
{
	int error;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;

	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;

	/* Construct a full pathname for jailed callers. */
	pr_pathlen = strcmp(pr_path, "/") ==
	    0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(userpath_in, path + pr_pathlen,
	    MAXPATHLEN - pr_pathlen, NULL);
	if (error != 0)
		goto out;

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif

	/* Require paths to start with a '/' character. */
	if (path[pr_pathlen] != '/') {
		error = EINVAL;
		goto out;
	}

	*path_out = path;

out:
	if (error != 0)
		free(path, M_SHMFD);

	return (error);
}

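/*
 * Resize the backing object to the requested length.  The caller must hold
 * the object write lock and a write range lock over the whole object.
 * Shrinking fails if the shmfd is sealed against it or mapped into the
 * kernel; otherwise the partial last page is zeroed, truncated pages and
 * swap space are freed, and the swap accounting is released.  Growing
 * reserves the additional swap charge up front.
 */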
static int
shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
	if (length == shmfd->shm_size)
		return (0);
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);

		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
			if (m != NULL) {
				MPASS(vm_page_all_valid(m));
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx,
				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
				if (m == NULL)
					goto retry;
				vm_object_pip_add(object, 1);
				VM_OBJECT_WUNLOCK(object);
				rv = vm_pager_get_pages(object, &m, 1, NULL,
				    NULL);
				VM_OBJECT_WLOCK(object);
				vm_object_pip_wakeup(object);
				if (rv == VM_PAGER_OK) {
					/*
					 * Since the page was not resident,
					 * and therefore not recently
					 * accessed, immediately enqueue it
					 * for asynchronous laundering.  The
					 * current operation is not regarded
					 * as an access.
					 */
					vm_page_launder(m);
				} else {
					vm_page_free(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(vm_page_all_valid(m),
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_set_dirty(m);
				vm_page_xunbusy(m);
			}
		}
		delta = IDX_TO_OFF(object->size - nobjsize);

		/* Toss in memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Free the swap accounted for shm */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
			return (EPERM);

		/* Try to reserve additional swap space. */
		delta = IDX_TO_OFF(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred))
			return (ENOMEM);
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	return (0);
}

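/*
 * Wrapper around shm_dotruncate_locked() that acquires the range lock and
 * the object lock.
 */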
int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	void *rl_cookie;
	int error;

	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
	    &shmfd->shm_mtx);
	VM_OBJECT_WLOCK(shmfd->shm_object);
	error = shm_dotruncate_locked(shmfd, length, rl_cookie);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	return (error);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
static void
shm_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	new_unrhdr64(&shm_ino_unr, 1);
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

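/* Look up a path in the dictionary by its FNV hash and exact name. */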
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

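/*
 * Insert a path-to-shmfd mapping, taking a new reference on the shmfd and
 * ownership of the path buffer.
 */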
static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

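/*
 * Remove a path from the dictionary after MAC and access checks, dropping
 * the dictionary's reference on the shmfd.
 */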
static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}

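/*
 * Common code for the shm_open(2) family of calls: allocate a descriptor
 * and either create an anonymous object (SHM_ANON), create a named object
 * (O_CREAT), or open an existing one, applying O_TRUNC and any initial
 * seals as requested.
 */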
int
kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
    int shmflags, struct filecaps *fcaps, const char *name __unused)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	void *rl_cookie;
	Fnv32_t fnv;
	mode_t cmode;
	int error, fd, initial_seals;

	if ((shmflags & ~SHM_ALLOW_SEALING) != 0)
		return (EINVAL);

	initial_seals = F_SEAL_SEAL;
	if ((shmflags & SHM_ALLOW_SEALING) != 0)
		initial_seals &= ~F_SEAL_SEAL;

#ifdef CAPABILITY_MODE
	/*
	 * shm_open(2) is only allowed for anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
		return (ECAPMODE);
#endif

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);

	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
		return (EINVAL);

	/*
	 * Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
	 * If the decision is made later to allow additional seals, care must be
	 * taken below to ensure that the seals are properly set if the shmfd
	 * already existed -- this currently assumes that only F_SEAL_SEAL can
	 * be set and doesn't take further precautions to ensure the validity of
	 * the seals being added with respect to current mappings.
	 */
	if ((initial_seals & ~F_SEAL_SEAL) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;

	/*
	 * shm_open(2) created shm should always have O_CLOEXEC set, as mandated
	 * by POSIX.  We allow it to be unset here so that an in-kernel
	 * interface may be written as a thin layer around shm, optionally not
	 * setting CLOEXEC.  For shm_open(2), O_CLOEXEC is set unconditionally
	 * in sys_shm_open() to keep this implementation compliant.
	 */
	error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (userpath == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((flags & O_ACCMODE) == O_RDONLY) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
		shmfd->shm_seals = initial_seals;
	} else {
		error = shm_copyin_path(td, userpath, &path);
		if (error != 0) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}

		AUDIT_ARG_UPATH1_CANON(path);
		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shmfd->shm_seals = initial_seals;
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
			    &shmfd->shm_mtx);

			/*
			 * kern_shm_open() likely shouldn't ever error out on
			 * trying to set a seal that already exists, unlike
			 * F_ADD_SEALS.  This would break terribly as
			 * shm_open(2) actually sets F_SEAL_SEAL to maintain
			 * historical behavior where the underlying file could
			 * not be sealed.
			 */
			initial_seals &= ~shmfd->shm_seals;

			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);

			/*
			 * initial_seals can't set additional seals if
			 * F_SEAL_SEAL has already been set.  If F_SEAL_SEAL is
			 * set, then we've already removed that one from
			 * initial_seals.  This is currently redundant as we
			 * only allow setting F_SEAL_SEAL at creation time, but
			 * it's cheap to check and decreases the effort required
			 * to allow additional seals.
			 */
			if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 &&
			    initial_seals != 0)
				error = EPERM;
			else if ((flags & (O_CREAT | O_EXCL)) ==
			    (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
				VM_OBJECT_WLOCK(shmfd->shm_object);
#ifdef MAC
				error = mac_posixshm_check_truncate(
					td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					error = shm_dotruncate_locked(shmfd, 0,
					    rl_cookie);
				VM_OBJECT_WUNLOCK(shmfd->shm_object);
			}
			if (error == 0) {
				/*
				 * Currently we only allow F_SEAL_SEAL to be
				 * set initially.  As noted above, this would
				 * need to be reworked should that change.
				 */
				shmfd->shm_seals |= initial_seals;
				shm_hold(shmfd);
			}
			rangelock_unlock(&shmfd->shm_rl, rl_cookie,
			    &shmfd->shm_mtx);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

/* System calls. */
#ifdef COMPAT_FREEBSD12
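/*
 * FreeBSD 12 shm_open(2) compatibility entry point; O_CLOEXEC is forced
 * here as required by POSIX.
 */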
int
freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
{

	return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC,
	    uap->mode, NULL));
}
#endif

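/* shm_unlink(2): remove a named object from the dictionary. */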
int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	Fnv32_t fnv;
	int error;

	error = shm_copyin_path(td, uap->path, &path);
	if (error != 0)
		return (error);

	AUDIT_ARG_UPATH1_CANON(path);
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_SHMFD);

	return (error);
}

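/*
 * shm_rename(2): rename one shm path to another, honoring
 * SHM_RENAME_NOREPLACE and SHM_RENAME_EXCHANGE.
 */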
int
sys_shm_rename(struct thread *td, struct shm_rename_args *uap)
{
	char *path_from = NULL, *path_to = NULL;
	Fnv32_t fnv_from, fnv_to;
	struct shmfd *fd_from;
	struct shmfd *fd_to;
	int error;
	int flags;

	flags = uap->flags;
	AUDIT_ARG_FFLAGS(flags);

	/*
	 * Make sure the user passed only valid flags.
	 * If you add a new flag, please add a new term here.
	 */
	if ((flags & ~(
	    SHM_RENAME_NOREPLACE |
	    SHM_RENAME_EXCHANGE
	    )) != 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * EXCHANGE and NOREPLACE don't quite make sense together. Let's
	 * force the user to choose one or the other.
	 */
	if ((flags & SHM_RENAME_NOREPLACE) != 0 &&
	    (flags & SHM_RENAME_EXCHANGE) != 0) {
		error = EINVAL;