/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1991, 1993, 1995
 *	The Regents of the University of California.
 * Copyright (c) 2007-2009 Robert N. M. Watson
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
 */

40
41
42
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

43
#include "opt_ddb.h"
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
44
#include "opt_ipsec.h"
45
#include "opt_inet.h"
46
#include "opt_inet6.h"
47
#include "opt_ratelimit.h"
48
#include "opt_route.h"
49
#include "opt_rss.h"
50

Rodney W. Grimes's avatar
Rodney W. Grimes committed
51
52
#include <sys/param.h>
#include <sys/systm.h>
53
#include <sys/lock.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
54
55
#include <sys/malloc.h>
#include <sys/mbuf.h>
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
56
#include <sys/callout.h>
57
#include <sys/eventhandler.h>
58
#include <sys/domain.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
59
#include <sys/protosw.h>
60
#include <sys/smp.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
61
62
#include <sys/socket.h>
#include <sys/socketvar.h>
63
#include <sys/sockio.h>
64
#include <sys/priv.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
65
#include <sys/proc.h>
66
#include <sys/refcount.h>
67
#include <sys/jail.h>
68
69
#include <sys/kernel.h>
#include <sys/sysctl.h>
70

71
72
73
74
#ifdef DDB
#include <ddb/ddb.h>
#endif

75
#include <vm/uma.h>
76
#include <vm/vm.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
77
78

#include <net/if.h>
79
#include <net/if_var.h>
80
#include <net/if_types.h>
81
#include <net/if_llatbl.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
82
#include <net/route.h>
83
#include <net/rss_config.h>
84
#include <net/vnet.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
85

86
#if defined(INET) || defined(INET6)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
87
88
#include <netinet/in.h>
#include <netinet/in_pcb.h>
89
#include <netinet/in_pcb_var.h>
90
91
#ifdef INET
#include <netinet/in_var.h>
92
#include <netinet/in_fib.h>
93
#endif
Rodney W. Grimes's avatar
Rodney W. Grimes committed
94
#include <netinet/ip_var.h>
95
#include <netinet/tcp_var.h>
96
97
98
#ifdef TCPHPTS
#include <netinet/tcp_hpts.h>
#endif
99
100
#include <netinet/udp.h>
#include <netinet/udp_var.h>
101
102
#ifdef INET6
#include <netinet/ip6.h>
103
#include <netinet6/in6_pcb.h>
104
105
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
106
#endif /* INET6 */
107
#include <net/route/nhop.h>
108
#endif
109

110
#include <netipsec/ipsec_support.h>
111

112
113
#include <security/mac/mac_framework.h>

114
115
116
#define	INPCBLBGROUP_SIZMIN	8
#define	INPCBLBGROUP_SIZMAX	256

Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
117
118
static struct callout	ipport_tick_callout;

119
120
121
122
/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
123
124
125
126
127
128
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
129

130
131
132
133
134
/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
135
136
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);
137

138
/* Variables dealing with random ephemeral port allocation. */
139
140
141
142
143
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);
144
VNET_DEFINE_STATIC(int, ipport_tcplastcount);
145

146
#define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
147

148
149
static void	in_pcbremlists(struct inpcb *inp);
#ifdef INET
150
151
152
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
153
154
			    int lookupflags, struct ifnet *ifp,
			    uint8_t numa_domain);
155

156
157
158
159
160
#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }

static int
161
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
162
{
163
164
	int error;

165
	error = sysctl_handle_int(oidp, arg1, arg2, req);
166
	if (error == 0) {
167
168
169
170
171
172
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
173
	}
174
	return (error);
175
}
176

177
#undef RANGECHK
178

179
180
/*
 * net.inet.ip.portrange: run-time knobs for local port selection.
 *
 * Every variable exported here is defined with VNET_DEFINE(), so every OID
 * carries CTLFLAG_VNET to have the sysctl code operate on the copy that
 * belongs to the calling vnet.
 */
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

/* Automatic port ranges; values are clamped by sysctl_net_ipport_check(). */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");

/* Reserved (privileged) port range. */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedhigh), 0, "");
/*
 * CTLFLAG_VNET was missing here although ipport_reservedlow is a
 * VNET_DEFINE()d variable just like ipport_reservedhigh.
 */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedlow), 0, "");

/* Random port allocation. */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
    "allocations before switching to a sequential one");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomtime), 0,
    "Minimum time to keep sequential port "
    "allocation before switching to a random one");
224
225

#ifdef RATELIMIT
226
227
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
228
229
230
231
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

232
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
233
234
235
236
237
238
239
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
   &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
   &rate_limit_set_ok, "Rate limited setting succeeded");
240
241
242
243
244
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
   &rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
   &rate_limit_chg, "Total Rate limited change attempts");

245
246
#endif /* RATELIMIT */

247
#endif /* INET */
248

249
250
251
/*
 * in_pcb.c: manage the Protocol Control Blocks.
 *
 * NOTE: It is assumed that most of these functions will be called with
 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
 * functions often modify hash chains or addresses in pcbs.
 */

257
258
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
259
260
    uint16_t port, const union in_dependaddr *addr, int size,
    uint8_t numa_domain)
261
262
263
264
265
266
267
268
269
270
{
	struct inpcblbgroup *grp;
	size_t bytes;

	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (!grp)
		return (NULL);
	grp->il_vflag = vflag;
	grp->il_lport = port;
271
	grp->il_numa_domain = numa_domain;
272
273
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
274
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
275
276
277
278
	return (grp);
}

static void
279
in_pcblbgroup_free_deferred(epoch_context_t ctx)
280
{
281
	struct inpcblbgroup *grp;
282

283
	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
284
	free(grp, M_PCB);
285
286
}

287
288
289
290
291
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
292
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
293
294
}

295
296
297
298
299
300
301
302
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
303
304
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain);
305
	if (grp == NULL)
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	in_pcblbgroup_free(old_grp);
	return (grp);
}

/*
 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
 * and shrink group if possible.
 */
static void
in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
    int i)
{
327
	struct inpcblbgroup *grp, *new_grp;
328

329
	grp = *grpp;
330
331
332
333
334
	for (; i + 1 < grp->il_inpcnt; ++i)
		grp->il_inp[i] = grp->il_inp[i + 1];
	grp->il_inpcnt--;

	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
335
	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
336
		/* Shrink this group. */
337
338
		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
		if (new_grp != NULL)
339
340
341
342
343
344
345
346
			*grpp = new_grp;
	}
}

/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 */
static int
347
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
348
{
349
350
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
351
352
353
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
354
	uint32_t idx;
355
356
357
358
359
360
361
362
363

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/*
	 * Don't allow jailed socket to join local group.
	 */
364
	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
365
366
367
368
369
370
371
372
373
374
375
376
377
		return (0);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

378
	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
379
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
380
	CK_LIST_FOREACH(grp, hdr, il_list) {
381
382
		if (grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
383
		    grp->il_numa_domain == numa_domain &&
384
		    memcmp(&grp->il_dependladdr,
385
386
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0)
387
388
389
390
391
392
			break;
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
393
		    INPCBLBGROUP_SIZMIN, numa_domain);
394
		if (grp == NULL)
395
396
397
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
398
			if (ratecheck(&lastprint, &interval))
399
400
401
402
403
404
405
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
406
		if (grp == NULL)
407
408
409
410
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
411
412
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435

	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	return (0);
}

/*
 * Remove PCB from load balance group.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
436
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
437
	CK_LIST_FOREACH(grp, hdr, il_list) {
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Pull up inpcbs, shrink group if possible. */
				in_pcblbgroup_reorder(hdr, &grp, i);
			}
			return;
		}
	}
}

454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
/*
 * Move a pcb that is a member of a load balance group to the group of the
 * requested NUMA domain.
 *
 * 'arg' is either TCP_REUSPORT_LB_NUMA_NODOM (no domain affinity, M_NODOM),
 * TCP_REUSPORT_LB_NUMA_CURDOM (domain of the current CPU) or an explicit
 * domain index in [0, vm_ndomains).
 *
 * The caller holds the inpcb write lock; the pcbinfo hash write lock is
 * taken and dropped here.
 *
 * Returns 0 on success (including when the pcb already is in the requested
 * domain), EINVAL for an out of range domain, ENOENT when the pcb is not a
 * member of any group on its port's hash chain, or the error reported by
 * in_pcbinslbgrouphash() when joining the new group failed.
 */
int
in_pcblbgroup_numa(struct inpcb *inp, int arg)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int err, i;
	uint8_t numa_domain;

	switch (arg) {
	case TCP_REUSPORT_LB_NUMA_NODOM:
		numa_domain = M_NODOM;
		break;
	case TCP_REUSPORT_LB_NUMA_CURDOM:
		numa_domain = PCPU_GET(domain);
		break;
	default:
		if (arg < 0 || arg >= vm_ndomains)
			return (EINVAL);
		numa_domain = arg;
		break;
	}

	err = 0;
	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(pcbinfo);
	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			/* Already where it should be: nothing to do. */
			if (grp->il_numa_domain == numa_domain)
				goto abort_with_hash_wlock;

			/*
			 * Remove it from the old group.  'grp' may have been
			 * freed or replaced by this call and must not be
			 * used any more.
			 */
			in_pcbremlbgrouphash(inp);

			/*
			 * Add it to the new group based on numa domain.
			 * The result used to be discarded; report it, as a
			 * failure here leaves the pcb outside of any group.
			 */
			err = in_pcbinslbgrouphash(inp, numa_domain);
			goto abort_with_hash_wlock;
		}
	}
	err = ENOENT;
abort_with_hash_wlock:
	INP_HASH_WUNLOCK(pcbinfo);
	return (err);
}

505
506
507
508
509
510
511
512
513
514
515
516
/*
 * Zone 'fini' routine installed by in_pcbinfo_init() for every inpcb zone.
 * Protocols initialize their inpcbs differently (giving the lock different
 * names), but tearing them down is the same for all: destroy the lock.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_DESTROY(pcb);
}

517
518
519
520
521
522
523
/*
 * Initialize an inpcbinfo: its locks, the global pcb list, the connection,
 * port and load balance group hash tables, and the UMA zone that inpcbs
 * are allocated from.  We should be able to reduce the number of arguments
 * in time.
 *
 * The port hash size is capped at IPPORT_MAX + 1 and also used for the
 * load balance group hash.  The zone is limited to 'maxsockets' items.
 * 'hashfields' is currently not used by this function.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
    struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
    char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
{

	/* More port hash buckets than ports would be pointless. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);

	/* Locks. */
	INP_INFO_LOCK_INIT(pcbinfo, name);
	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif

	/* Global list of pcbs, initially empty. */
	pcbinfo->ipi_listhead = listhead;
	CK_LIST_INIT(pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;

	/* Hash tables. */
	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);

	/* Backing zone for the inpcbs themselves. */
	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
	uma_zone_set_warning(pcbinfo->ipi_zone,
	    "kern.ipc.maxsockets limit reached");
}

/*
 * Tear down an inpcbinfo set up by in_pcbinfo_init(): release the three
 * hash tables and the inpcb zone, then destroy the locks.  No pcbs may be
 * left on the inpcbinfo at this point.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Hash tables. */
	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);

	/* Zone backing the inpcbs. */
	uma_zdestroy(pcbinfo->ipi_zone);

	/* Locks, in reverse order of their initialization. */
	INP_LIST_LOCK_DESTROY(pcbinfo);
	INP_HASH_LOCK_DESTROY(pcbinfo);
	INP_INFO_LOCK_DESTROY(pcbinfo);
}

572
573
/*
 * Allocate a PCB and associate it with the socket.
574
 * On success return with the PCB locked.
575
 */
Rodney W. Grimes's avatar
Rodney W. Grimes committed
576
int
577
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
578
{
579
	struct inpcb *inp;
580
	int error;
581
582

	error = 0;
583
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
Rodney W. Grimes's avatar
Rodney W. Grimes committed
584
585
	if (inp == NULL)
		return (ENOBUFS);
586
	bzero(&inp->inp_start_zero, inp_zero_size);
587
588
589
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
590
	inp->inp_pcbinfo = pcbinfo;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
591
	inp->inp_socket = so;
592
	inp->inp_cred = crhold(so->so_cred);
593
	inp->inp_inc.inc_fibnum = so->so_fibnum;
594
#ifdef MAC
595
	error = mac_inpcb_init(inp, M_NOWAIT);
596
597
	if (error != 0)
		goto out;
598
	mac_inpcb_create(so, inp);
599
#endif
600
601
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
602
603
604
605
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
606
		goto out;
607
	}
608
#endif /*IPSEC*/
609
#ifdef INET6
610
611
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
612
		if (V_ip6_v6only)
613
614
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
615
#endif
616
617
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
618
	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
619
	pcbinfo->ipi_count++;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
620
	so->so_pcb = (caddr_t)inp;
Hajimu UMEMOTO's avatar
Hajimu UMEMOTO committed
621
#ifdef INET6
622
	if (V_ip6_auto_flowlabel)
Hajimu UMEMOTO's avatar
Hajimu UMEMOTO committed
623
624
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
625
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
626
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
627
628
629
630
631
632

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
633
	INP_LIST_WUNLOCK(pcbinfo);
634
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
635
out:
636
637
	if (error != 0) {
		crfree(inp->inp_cred);
638
		uma_zfree(pcbinfo->ipi_zone, inp);
639
	}
640
641
#endif
	return (error);
Rodney W. Grimes's avatar
Rodney W. Grimes committed
642
643
}

644
#ifdef INET
Rodney W. Grimes's avatar
Rodney W. Grimes committed
645
int
646
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
647
{
648
649
	int anonport, error;

650
651
652
653
	KASSERT(nam == NULL || nam->sa_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, nam));
	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, nam));
654
	INP_WLOCK_ASSERT(inp);
655
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
Sam Leffler's avatar
Sam Leffler committed
656

657
658
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
659
	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
660
	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
661
	    &inp->inp_lport, cred);
662
663
664
665
666
667
668
669
670
671
672
	if (error)
		return (error);
	if (in_pcbinshash(inp) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		return (EAGAIN);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
673
#endif
674

675
#if defined(INET) || defined(INET6)
676
/*
677
678
679
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
680
 * lsa can be NULL for IPv6.
681
 */
682
int
683
684
in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
    struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
685
686
687
688
689
690
691
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, dorandom, error;
	u_short aux, first, last, lport;
#ifdef INET
692
693
694
695
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
696
697
698
699
700
701
702
703
704
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
705
	INP_HASH_LOCK_ASSERT(pcbinfo);
706
707
708
709
710
711

	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last  = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
712
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
713
714
715
716
717
718
719
720
721
722
723
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last  = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last  = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}
	/*
724
	 * For UDP(-Lite), use random port allocation as long as the user
725
726
727
728
729
	 * allows it.  For TCP (and as of yet unknown) connections,
	 * use random port allocation only if the user allows it AND
	 * ipport_tick() allows it.
	 */
	if (V_ipport_randomized &&
730
731
		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
		pcbinfo == &V_ulitecbinfo))
732
733
734
735
736
737
738
739
740
		dorandom = 1;
	else
		dorandom = 0;
	/*
	 * It makes no sense to do random port allocation if
	 * we have the only port available.
	 */
	if (first == last)
		dorandom = 0;
741
742
	/* Make sure to not include UDP(-Lite) packets in the count. */
	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
743
744
745
746
747
748
749
750
751
752
753
754
755
		V_ipport_tcpallocs++;
	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
756
	laddr.s_addr = INADDR_ANY;
757
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
758
759
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
760
761
762
763
764
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
765
766
767
768
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
769
770
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
771
772
	}
#endif
773
774

	tmpinp = NULL;
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
	lport = *lportp;

	if (dorandom)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

790
791
792
793
794
		if (fsa != NULL) {
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
795
				    NULL, M_NODOM);
796
797
798
799
800
801
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
802
				    NULL, M_NODOM);
803
804
805
			}
#endif
		} else {
806
#ifdef INET6
807
808
809
			if ((inp->inp_vflag & INP_IPV6) != 0)
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, lookupflags, cred);
810
811
#endif
#if defined(INET) && defined(INET6)
812
			else
813
814
#endif
#ifdef INET
815
816
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, lookupflags, cred);
817
#endif
818
		}
819
820
821
822
823
824
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}
825

826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
/*
 * Select a local port (number) to use.
 *
 * Thin wrapper around in_pcb_lport_dest() without a foreign address: an
 * optional IPv4 local address is wrapped into a sockaddr_in and the result
 * of in_pcb_lport_dest() is returned as is.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int lookupflags)
{
	struct sockaddr_in sin;
	struct sockaddr *lsa;

	lsa = NULL;
	if (laddrp != NULL) {
		bzero(&sin, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_addr = *laddrp;
		lsa = (struct sockaddr *)&sin;
	}
	return (in_pcb_lport_dest(inp, lsa, lportp, NULL, 0, cred,
	    lookupflags));
}

844
845
846
/*
 * Return cached socket options.
 */
847
int
848
849
inp_so_options(const struct inpcb *inp)
{
850
	int so_options;
851

852
	so_options = 0;
853

854
855
856
857
858
859
860
	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
		so_options |= SO_REUSEPORT_LB;
	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
		so_options |= SO_REUSEPORT;
	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
		so_options |= SO_REUSEADDR;
	return (so_options);
861
}
862
863
#endif /* INET || INET6 */

864
865
866
867
868
869
870
871
872
/*
 * Check if a new BINDMULTI socket is allowed to be created.
 *
 * ni points to the new inp.
 * oi points to the exisitng inp.
 *
 * This checks whether the existing inp also has BINDMULTI and
 * whether the credentials match.
 */
873
int
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{
	/* Check permissions match */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    (ni->inp_cred->cr_uid !=
	    oi->inp_cred->cr_uid))
		return (0);

	/* Check the existing inp has BINDMULTI set */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
		return (0);

	/*
	 * We're okay - either INP_BINDMULTI isn't set on ni, or
	 * it is and it matches the checks.
	 */
	return (1);
}

894
#ifdef INET
895
896
897
898
899
900
901
902
903
904
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
905
906
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
907
908
{
	struct socket *so = inp->inp_socket;
909
	struct sockaddr_in *sin;
910
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
911
	struct in_addr laddr;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
912
	u_short lport = 0;
913
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
Bjoern A. Zeeb's avatar
MFp4:    
Bjoern A. Zeeb committed
914
	int error;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
915

916
917
918
919
920
921
	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

922
	/*
923
	 * No state changes, so read locks are sufficient here.
924
	 */
Sam Leffler's avatar
Sam Leffler committed
925
	INP_LOCK_ASSERT(inp);
926
	INP_HASH_LOCK_ASSERT(pcbinfo);
Sam Leffler's avatar
Sam Leffler committed
927

928
929
	laddr.s_addr = *laddrp;
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
930
		return (EINVAL);
931
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
932
		lookupflags = INPLOOKUP_WILDCARD;
Ermal Luçi's avatar
Ermal Luçi committed
933
	if (nam == NULL) {
934
935
936
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
937
		sin = (struct sockaddr_in *)nam;
938
939
940
941
942
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

943
944
945
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
946
947
948
949
950
951
952
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
Rodney W. Grimes's avatar
Rodney W. Grimes committed
953
954
955
956
957
958
959
960
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
961
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
962
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
963
964
965
966
967
968
969
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
970
971
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
972
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
973
			/*
974
			 * Is the address a local IP address?
975
			 * If INP_BINDANY is set, then the socket may be bound
Adrian Chadd's avatar
Adrian Chadd committed
976
			 * to any endpoint address, local or not.
977
			 */
978
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
979
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
Rodney W. Grimes's avatar
Rodney W. Grimes committed
980
981
				return (EADDRNOTAVAIL);
		}
982
		laddr = sin->sin_addr;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
983
984
		if (lport) {
			struct inpcb *t;
985
986
			struct tcptw *tw;

Rodney W. Grimes's avatar
Rodney W. Grimes committed
987
			/* GROSS */
988
989
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
990
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
991
				return (EACCES);
992
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
993
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
994
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
Bjoern A. Zeeb's avatar
MFp4:    
Bjoern A. Zeeb committed
995
				    lport, INPLOOKUP_WILDCARD, cred);
996
997
998
999
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
1000
				if (t &&
For faster browsing, not all history is shown. View entire blame