/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
 * All rights reserved.
 *
 * Portions of this software were developed by BAE Systems, the University of
 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
 * Computing (TC) research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2), shm_rename(2), and shm_unlink(2).  While most of the
 * implementation is here, vm_mmap.c contains mapping logic changes.
 *
 * posixshmcontrol(1) allows users to inspect the state of the memory
 * objects.  The per-uid swap resource limit controls the total amount
 * of memory that a user can consume for anonymous objects, including
 * shared ones.
 */
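
/*
 * Illustrative userspace usage of this facility (a sketch only; the
 * object name "/myshm" and the one-page size are arbitrary, and error
 * handling is abbreviated):
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 *	p[0] = 1;
 *	shm_unlink("/myshm");
 *
 * Stores through p are visible to every process that maps "/myshm".
 */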

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr64 shm_ino_unr;
static dev_t shm_dev_ino;

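/* Select the dictionary bucket for a path's FNV hash. */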
#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
    void *rl_cookie);
static int	shm_copyin_path(struct thread *td, const char *userpath_in,
    char **path_out);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;
static fo_get_seals_t	shm_get_seals;
static fo_add_seals_t	shm_add_seals;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = shm_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_get_seals = shm_get_seals,
	.fo_add_seals = shm_add_seals,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");

static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	VM_OBJECT_WLOCK(obj);

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Parallel reads of the page content from disk are prevented
	 * by exclusive busy.
	 *
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out tobj's pages because tobj is an OBJT_SWAP
	 * type object.
	 */
	rv = vm_page_grab_valid(&m, obj, idx,
	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
	if (rv != VM_PAGER_OK) {
		VM_OBJECT_WUNLOCK(obj);
		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
		    obj, idx, rv);
		return (EIO);
	}
	VM_OBJECT_WUNLOCK(obj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0)
		vm_page_set_dirty(m);
	vm_page_lock(m);
	vm_page_activate(m);
	vm_page_unlock(m);
	vm_page_sunbusy(m);

	return (error);
}

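/*
 * Transfer data between a uio and a swap-backed VM object, one page
 * at a time, stopping at obj_size or when a page transfer makes no
 * progress.
 */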
int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}
	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0)
		error = EPERM;
	else
		error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
static int
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	switch (com) {
	case FIONBIO:
	case FIOASYNC:
		/*
		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
		 * just like it would on an unlinked regular file
		 */
		return (0);
	default:
		return (ENOTTY);
	}
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sanish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;
	sb->st_nlink = shmfd->shm_object->ref_count;

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

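/*
 * Copy a path in from userspace, prepending the caller's jail root
 * (if any) so that lookups are jail-relative.  On success the caller
 * owns the returned M_SHMFD buffer and must free it.
 */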
static int
shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out)
{
	int error;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;

	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;

	/* Construct a full pathname for jailed callers. */
	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 :
	    strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(userpath_in, path + pr_pathlen,
	    MAXPATHLEN - pr_pathlen, NULL);
	if (error != 0)
		goto out;

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif

	/* Require paths to start with a '/' character. */
	if (path[pr_pathlen] != '/') {
		error = EINVAL;
		goto out;
	}

	*path_out = path;

out:
	if (error != 0)
		free(path, M_SHMFD);

	return (error);
}

static int
shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
	object = shmfd->shm_object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
	if (length == shmfd->shm_size)
		return (0);
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
			return (EPERM);

		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0)
			return (EBUSY);

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
			if (m != NULL) {
				MPASS(vm_page_all_valid(m));
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx,
				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
				if (m == NULL)
					goto retry;
				rv = vm_pager_get_pages(object, &m, 1, NULL,
				    NULL);
				if (rv == VM_PAGER_OK) {
					/*
					 * Since the page was not resident,
					 * and therefore not recently
					 * accessed, immediately enqueue it
					 * for asynchronous laundering.  The
					 * current operation is not regarded
					 * as an access.
					 */
					vm_page_launder(m);
				} else {
					vm_page_free(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(vm_page_all_valid(m),
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_set_dirty(m);
				vm_page_xunbusy(m);
			}
		}
		delta = IDX_TO_OFF(object->size - nobjsize);

		/* Toss in memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Free the swap accounted for shm. */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
			return (EPERM);

		/* Try to reserve additional swap space. */
		delta = IDX_TO_OFF(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred))
			return (ENOMEM);
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	return (0);
}

int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	void *rl_cookie;
	int error;

	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
	    &shmfd->shm_mtx);
	VM_OBJECT_WLOCK(shmfd->shm_object);
	error = shm_dotruncate_locked(shmfd, length, rl_cookie);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	return (error);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
static void
shm_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	new_unrhdr64(&shm_ino_unr, 1);
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

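/*
 * Insert a mapping into the path dictionary.  The dictionary takes
 * ownership of the caller-allocated path and a new reference on the
 * shmfd.
 */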
static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

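/*
 * Remove the dictionary entry for a path after checking MAC and
 * access permissions; drops the dictionary's reference on the shmfd.
 */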
static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}

int
kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode,
    struct filecaps *fcaps, int initial_seals)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	void *rl_cookie;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

#ifdef CAPABILITY_MODE
	/*
	 * shm_open(2) is only allowed for anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
		return (ECAPMODE);
#endif

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);

	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
		return (EINVAL);

	/*
	 * Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
	 * If the decision is made later to allow additional seals, care must be
	 * taken below to ensure that the seals are properly set if the shmfd
	 * already existed -- this currently assumes that only F_SEAL_SEAL can
	 * be set and doesn't take further precautions to ensure the validity of
	 * the seals being added with respect to current mappings.
	 */
	if ((initial_seals & ~F_SEAL_SEAL) != 0)
		return (EINVAL);
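
	/*
	 * Illustrative userspace flow (a sketch, not part of this
	 * implementation): a creator can seal an anonymous object
	 * against all future sealing with
	 *
	 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
	 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SEAL);
	 */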

	fdp = td->td_proc->p_fd;
	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;

	/*
	 * shm_open(2) created shm should always have O_CLOEXEC set, as mandated
	 * by POSIX.  We allow it to be unset here so that an in-kernel
	 * interface may be written as a thin layer around shm, optionally not
	 * setting CLOEXEC.  For shm_open(2), O_CLOEXEC is set unconditionally
	 * in sys_shm_open() to keep this implementation compliant.
	 */
	error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (userpath == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((flags & O_ACCMODE) == O_RDONLY) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
		shmfd->shm_seals = initial_seals;
	} else {
		error = shm_copyin_path(td, userpath, &path);
		if (error != 0) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}

		AUDIT_ARG_UPATH1_CANON(path);
		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shmfd->shm_seals = initial_seals;
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
			    &shmfd->shm_mtx);

			/*
			 * kern_shm_open() likely shouldn't ever error out on
			 * trying to set a seal that already exists, unlike
			 * F_ADD_SEALS.  This would break terribly as
			 * shm_open(2) actually sets F_SEAL_SEAL to maintain
			 * historical behavior where the underlying file could
			 * not be sealed.
			 */
			initial_seals &= ~shmfd->shm_seals;

			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);

			/*
			 * initial_seals can't set additional seals if we've
			 * already been set F_SEAL_SEAL.  If F_SEAL_SEAL is set,
			 * then we've already removed that one from
			 * initial_seals.  This is currently redundant as we
			 * only allow setting F_SEAL_SEAL at creation time, but
			 * it's cheap to check and decreases the effort required
			 * to allow additional seals.
			 */
			if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 &&
			    initial_seals != 0)
				error = EPERM;
			else if ((flags & (O_CREAT | O_EXCL)) ==
			    (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
				VM_OBJECT_WLOCK(shmfd->shm_object);
#ifdef MAC
				error = mac_posixshm_check_truncate(
					td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					error = shm_dotruncate_locked(shmfd, 0,
					    rl_cookie);
				VM_OBJECT_WUNLOCK(shmfd->shm_object);
			}
			if (error == 0) {
				/*
				 * Currently we only allow F_SEAL_SEAL to be
				 * set initially.  As noted above, this would
				 * need to be reworked should that change.
				 */
				shmfd->shm_seals |= initial_seals;
				shm_hold(shmfd);
			}
			rangelock_unlock(&shmfd->shm_rl, rl_cookie,
			    &shmfd->shm_mtx);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

/* System calls. */
#ifdef COMPAT_FREEBSD12
int
freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
{

	return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode,
	    NULL, F_SEAL_SEAL));
}
#endif

int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	Fnv32_t fnv;
	int error;

	error = shm_copyin_path(td, uap->path, &path);
	if (error != 0)
		return (error);

	AUDIT_ARG_UPATH1_CANON(path);
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_SHMFD);

	return (error);
}

int
sys_shm_rename(struct thread *td, struct shm_rename_args *uap)
{
	char *path_from = NULL, *path_to = NULL;
	Fnv32_t fnv_from, fnv_to;
	struct shmfd *fd_from;
	struct shmfd *fd_to;
	int error;
	int flags;

	flags = uap->flags;
	AUDIT_ARG_FFLAGS(flags);

	/*
	 * Make sure the user passed only valid flags.
	 * If you add a new flag, please add a new term here.
	 */
	if ((flags & ~(
	    SHM_RENAME_NOREPLACE |
	    SHM_RENAME_EXCHANGE
	    )) != 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * EXCHANGE and NOREPLACE don't quite make sense together. Let's
	 * force the user to choose one or the other.
	 */
	if ((flags & SHM_RENAME_NOREPLACE) != 0 &&
	    (flags & SHM_RENAME_EXCHANGE) != 0) {
		error = EINVAL;
		goto out;
	}

	/* Renaming to or from anonymous makes no sense */
	if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) {
		error = EINVAL;
		goto out;
	}

	error = shm_copyin_path(td, uap->path_from, &path_from);
	if (error != 0)
		goto out;

	error = shm_copyin_path(td, uap->path_to, &path_to);
	if (error != 0)
		goto out;