in_pcb.h 32 KB
Newer Older
1
/*-
2
3
 * SPDX-License-Identifier: BSD-3-Clause
 *
Rodney W. Grimes's avatar
Rodney W. Grimes committed
4
 * Copyright (c) 1982, 1986, 1990, 1993
5
 *	The Regents of the University of California.
6
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
7
 * All rights reserved.
Rodney W. Grimes's avatar
Rodney W. Grimes committed
8
 *
9
10
11
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
Rodney W. Grimes's avatar
Rodney W. Grimes committed
12
13
14
15
16
17
18
19
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
Warner Losh's avatar
Warner Losh committed
20
 * 3. Neither the name of the University nor the names of its contributors
Rodney W. Grimes's avatar
Rodney W. Grimes committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
Peter Wemm's avatar
Peter Wemm committed
37
 * $FreeBSD$
Rodney W. Grimes's avatar
Rodney W. Grimes committed
38
39
 */

Paul Richards's avatar
Paul Richards committed
40
41
42
#ifndef _NETINET_IN_PCB_H_
#define _NETINET_IN_PCB_H_

43
#include <sys/queue.h>
44
#include <sys/epoch.h>
45
46
#include <sys/_lock.h>
#include <sys/_mutex.h>
47
#include <sys/_rwlock.h>
48
#include <net/route.h>
49

50
#ifdef _KERNEL
51
#include <sys/lock.h>
52
#include <sys/rwlock.h>
53
#include <net/vnet.h>
54
#include <vm/uma.h>
55
#endif
56
#include <sys/ck.h>
57

Rodney W. Grimes's avatar
Rodney W. Grimes committed
58
/*
Robert Watson's avatar
Robert Watson committed
59
60
 * struct inpcb is the common protocol control block structure used in most
 * IP transport protocols.
61
62
63
64
 *
 * Pointers to local and foreign host table entries, local and foreign socket
 * numbers, and pointers up (to a socket structure) and down (to a
 * protocol-specific control block) are stored here.
Rodney W. Grimes's avatar
Rodney W. Grimes committed
65
 */
66
67
CK_LIST_HEAD(inpcbhead, inpcb);
CK_LIST_HEAD(inpcbporthead, inpcbport);
68
CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
69
typedef	uint64_t	inp_gen_t;
70

71
72
/*
 * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
73
74
 * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
 * the following structure.
75
76
77
78
79
80
 */
struct in_addr_4in6 {
	/* 12 bytes of padding placed ahead of the IPv4 address. */
	u_int32_t	ia46_pad32[3];
	/* The IPv4 address; overlaid with an in6_addr via union in_dependaddr. */
	struct	in_addr	ia46_addr4;
};

81
82
83
84
85
/*
 * Address storage shared by both address families: the same bytes are viewed
 * either as a padded IPv4 address or as an IPv6 address.
 */
union in_dependaddr {
	struct in_addr_4in6 id46_addr;	/* IPv4 view: padding + in_addr */
	struct in6_addr	id6_addr;	/* IPv6 view */
};

86
/*
87
88
 * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
 * some extra padding to accomplish this.
89
90
 * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
 * lport, faddr to generate hash, so these fields shouldn't be moved.
91
92
93
94
95
 */
struct in_endpoints {
	u_int16_t	ie_fport;		/* foreign port */
	u_int16_t	ie_lport;		/* local port */
	/* protocol dependent part, local and foreign addr */
96
97
98
99
100
101
	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
	union in_dependaddr ie_dependladdr;	/* local host table entry */
#define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
#define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
#define	ie6_faddr	ie_dependfaddr.id6_addr
#define	ie6_laddr	ie_dependladdr.id6_addr
102
	u_int32_t	ie6_zoneid;		/* scope zone id */
103
};
104
105

/*
106
107
 * XXX The defines for inc_* are hacks and should be changed to direct
 * references.
108
109
110
111
 */
struct in_conninfo {
	u_int8_t	inc_flags;
	u_int8_t	inc_len;
112
	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
113
	/* protocol dependent part */
114
115
	struct	in_endpoints inc_ie;
};
116
117
118
119
120

/*
 * Flags for inc_flags.
 */
#define	INC_ISIPV6		0x01
#define	INC_IPV6MINMTU		0x02

/* Reach through inc_ie to the endpoint fields of a struct in_conninfo. */
#define	inc_fport		inc_ie.ie_fport
#define	inc_lport		inc_ie.ie_lport
#define	inc_faddr		inc_ie.ie_faddr
#define	inc_laddr		inc_ie.ie_laddr
#define	inc6_faddr		inc_ie.ie6_faddr
#define	inc6_laddr		inc_ie.ie6_laddr
#define	inc6_zoneid		inc_ie.ie6_zoneid
130

131
132
#if defined(_KERNEL) || defined(_WANT_INPCB)
/*
133
134
 * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
 * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
135
136
 * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
 * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
 * few fields are protected by multiple locks as indicated in the locking notes
 * below.  For these fields, all of the listed locks must be write-locked for
 * any modifications.  However, these fields can be safely read while any one of
 * the listed locks are read-locked.  This model can permit greater concurrency
 * for read operations.  For example, connections can be looked up while only
 * holding a read lock on the global pcblist lock.  This is important for
 * performance when attempting to find the connection for a packet given its IP
 * and port tuple.
 *
 * One noteworthy exception is that the global pcbinfo lock follows a different
 * set of rules in relation to the inp_list field.  Rather than being
 * write-locked for modifications and read-locked for list iterations, it must
 * be read-locked during modifications and write-locked during list iterations.
 * This ensures that the relatively rare global list iterations safely walk a
 * stable snapshot of connections while allowing more common list modifications
 * to safely grab the pcblist lock just while adding or removing a connection
 * from the global list.
154
155
 *
 * Key:
156
 * (b) - Protected by the hpts lock.
157
 * (c) - Constant after initialization
158
 * (e) - Protected by the net_epoch_preempt epoch
159
160
 * (i) - Protected by the inpcb lock
 * (p) - Protected by the pcbinfo lock for the inpcb
161
162
 * (l) - Protected by the pcblist lock for the inpcb
 * (h) - Protected by the pcbhash lock for the inpcb
163
164
 * (s) - Protected by another subsystem's locks
 * (x) - Undefined locking
165
 *
166
 * Notes on the tcp_hpts:
167
 *
168
169
 * First Hpts lock order is
 * 1) INP_WLOCK()
170
 * 2) HPTS_LOCK() i.e. hpts->pmtx
171
 *
172
173
174
 * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
 * You may check the inp->inp_in_hpts flag without the hpts lock.
 * The hpts is the only one that will clear this flag holding
175
 * only the hpts lock. This means that in your tcp_output()
176
177
178
 * routine when you test for the inp_in_hpts flag to be 1
 * it may be transitioning to 0 (by the hpts).
 * That's ok since that will just mean an extra call to tcp_output
179
 * that most likely will find the call you executed
180
 * (when the mismatch occurred) will have put the TCB back
181
182
183
184
185
186
187
188
189
190
 * on the hpts and it will return. If your
 * call did not add the inp back to the hpts then you will either
 * over-send or the cwnd will block you from sending more.
 *
 * Note you should also be holding the INP_WLOCK() when you
 * call the remove from the hpts as well. Though usually
 * you are either doing this from a timer, where you need and have
 * the INP_WLOCK() or from destroying your TCB where again
 * you should already have the INP_WLOCK().
 *
191
 * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
 * inp_input_cpu_set fields are controlled completely by
 * the hpts. Do not ever set these. The inp_hpts_cpu_set
 * and inp_input_cpu_set fields indicate if the hpts has
 * setup the respective cpu field. It is advised if this
 * field is 0, to enqueue the packet with the appropriate
 * hpts_immediate() call. If the _set field is 1, then
 * you may compare the inp_*_cpu field to the curcpu and
 * may want to again insert onto the hpts if these fields
 * are not equal (i.e. you are not on the expected CPU).
 *
 * A note on inp_hpts_calls and inp_input_calls, these
 * flags are set when the hpts calls either the output
 * or do_segment routines respectively. If the routine
 * being called wants to use this, then it needs to
 * clear the flag before returning. The hpts will not
 * clear the flag. The flags can be used to tell if
 * the hpts is the function calling the respective
 * routine.
210
211
212
213
214
215
216
217
218
219
220
221
 *
 * A few other notes:
 *
 * When a read lock is held, stability of the field is guaranteed; to write
 * to a field, a write lock must generally be held.
 *
 * netinet/netinet6-layer code should not assume that the inp_socket pointer
 * is safe to dereference without inp_lock being held, even for protocols
 * other than TCP (where the inpcb persists during TIMEWAIT even after the
 * socket has been freed), or there may be close(2)-related races.
 *
 * The inp_vflag field is overloaded, and would otherwise ideally be (c).
222
223
224
225
 *
 * TODO:  Currently only the TCP stack is leveraging the global pcbinfo lock
 * read-lock usage during modification, this model can be applied to other
 * protocols (especially SCTP).
226
 */
227
228
struct icmp6_filter;
struct inpcbpolicy;
229
struct m_snd_tag;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
230
struct inpcb {
231
	/* Cache line #1 (amd64) */
232
	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i)  hash list */
233
234
	struct rwlock	inp_lock;
	/* Cache line #2 (amd64) */
235
#define	inp_start_zero	inp_hpts
236
237
#define	inp_zero_size	(sizeof(struct inpcb) - \
			    offsetof(struct inpcb, inp_start_zero))
238
239
240
241
242
243
	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */

	uint32_t inp_hpts_request;	/* Current hpts request, zero if
					 * fits in the pacing window (i&b). */
	/*
	 * Note the next fields are protected by a
244
	 * different lock (hpts-lock). This means that
245
246
247
248
249
250
	 * they must correspond in size to the smallest
	 * protectable bit field (uint8_t on x86, and
	 * other platforms potentially uint32_t?). Also
	 * since CPU switches can occur at different times the two
	 * fields can *not* be collapsed into a single bit field.
	 */
251
#if defined(__amd64__) || defined(__i386__)
252
253
254
255
256
257
258
	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
#else
	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
259
	volatile uint16_t  inp_irq_cpu;	/* Set by LRO in behalf of or the driver */
260
261
262
	u_int	inp_refcount;		/* (i) refcount */
	int	inp_flags;		/* (i) generic IP/datagram flags */
	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
263
264
265
266
267
	volatile uint16_t  inp_input_cpu; /* Lock (i) */
	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
			 inp_input_cpu_set : 1,	/* on input hpts (i) */
			 inp_hpts_calls :1,	/* (i) from output hpts */
			 inp_input_calls :1,	/* (i) from input hpts */
268
269
			 inp_irq_cpu_set :1,	/* (i) from LRO/Driver */
			 inp_spare_bits2 : 3;
270
	uint8_t inp_numa_domain;	/* numa domain */
271
	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
272
	struct	socket *inp_socket;	/* (i) back pointer to socket */
273
274
275
	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
276
	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
277
	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
278
	u_int32_t inp_flow;		/* (i) IPv6 flow information */
279
280
281
282
	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
	u_char	inp_ip_ttl;		/* (i) time to live proto */
	u_char	inp_ip_p;		/* (c) protocol proto */
	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
283
	uint32_t inp_flowid;		/* (x) flow id / queue id */
284
	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
Adrian Chadd's avatar
Adrian Chadd committed
285
	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
286
	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
287
288

	/* Local and foreign ports, local and foreign addr. */
289
	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
290

291
	/* MAC and IPSEC policy information. */
292
293
	struct	label *inp_label;	/* (i) MAC label */
	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
294

295
	/* Protocol-dependent part; options. */
296
	struct {
297
298
299
300
		u_char	inp_ip_tos;		/* (i) type of service proto */
		struct mbuf		*inp_options;	/* (i) IP options */
		struct ip_moptions	*inp_moptions;	/* (i) mcast options */
	};
301
	struct {
302
		/* (i) IP options */
303
		struct mbuf		*in6p_options;
304
		/* (i) IP6 options for outgoing packets */
305
		struct ip6_pktopts	*in6p_outputopts;
306
		/* (i) IP multicast options */
307
		struct ip6_moptions	*in6p_moptions;
308
		/* (i) ICMPv6 code type filter */
309
		struct icmp6_filter	*in6p_icmp6filt;
310
		/* (i) IPV6_CHECKSUM setsockopt */
311
312
313
		int	in6p_cksum;
		short	in6p_hops;
	};
314
	CK_LIST_ENTRY(inpcb) inp_portlist;	/* (i/h) */
315
	struct	inpcbport *inp_phd;	/* (i/h) head of this list */
316
	inp_gen_t	inp_gencnt;	/* (c) generation count */
317
	void		*spare_ptr;	/* Spare pointer. */
318
319
	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
	union {				/* cached L3 information */
320
321
322
		struct route inp_route;
		struct route_in6 inp_route6;
	};
323
	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
324
325
	                                /* (e[r]) for list iteration */
	                                /* (p[w]/l) for addition/removal */
326
	struct epoch_context inp_epoch_ctx;
327
};
328
329
#endif	/* _KERNEL || _WANT_INPCB */

330
331
332
333
/* Ports and IPv4 addresses, reached through the embedded in_conninfo. */
#define	inp_fport		inp_inc.inc_fport
#define	inp_lport		inp_inc.inc_lport
#define	inp_faddr		inp_inc.inc_faddr
#define	inp_laddr		inp_inc.inc_laddr

/* IPv6 addresses and scope zone, same storage. */
#define	in6p_faddr		inp_inc.inc6_faddr
#define	in6p_laddr		inp_inc.inc6_laddr
#define	in6p_zoneid		inp_inc.inc6_zoneid

/* The vnet is taken from the pcbinfo the inpcb points at. */
#define	inp_vnet		inp_pcbinfo->ipi_vnet

341
/*
342
343
344
345
 * The range of the generation count, as used in this implementation, is
 * about 1.8e19 (inp_gen_t is an unsigned 64-bit counter).  We would have to
 * create nearly 600 billion connections per second for this number to roll
 * over in a year.  This seems sufficiently unlikely that we simply don't
 * concern ourselves with that possibility.
346
 */
347

348
/*
349
350
 * Interface exported to userland by various protocols which use inpcbs.  Hack
 * alert -- only define if struct xsocket is in scope.
351
352
353
354
355
356
357
358
 * Fields prefixed with "xi_" are unique to this structure, and the rest
 * match fields in the struct inpcb, to ease coding and porting.
 *
 * Legend:
 * (s) - used by userland utilities in src
 * (p) - used by utilities in ports
 * (3) - is known to be used by third party software not in ports
 * (n) - no known usage
359
360
 */
#ifdef _SYS_SOCKETVAR_H_
/*
 * Per-PCB record exported to userland.  The field order and widths are part
 * of the exported layout and must not be rearranged.
 */
struct xinpcb {
	ksize_t			xi_len;		/* length of this structure */
	struct xsocket		xi_socket;	/* (s,p) */
	struct in_conninfo	inp_inc;	/* (s,p) */
	uint64_t		inp_gencnt;	/* (s,p) */
	kvaddr_t		inp_ppcb;	/* (s) netstat(1) */
	int64_t			inp_spare64[4];

	/* 32-bit members */
	uint32_t		inp_flow;	/* (s) */
	uint32_t		inp_flowid;	/* (s) */
	uint32_t		inp_flowtype;	/* (s) */
	int32_t			inp_flags;	/* (s,p) */
	int32_t			inp_flags2;	/* (s) */
	int32_t			inp_rss_listen_bucket;	/* (n) */
	int32_t			in6p_cksum;	/* (n) */
	int32_t			inp_spare32[4];

	/* 16-bit and 8-bit members */
	uint16_t		in6p_hops;	/* (n) */
	uint8_t			inp_ip_tos;	/* (n) */
	int8_t			pad8;
	uint8_t			inp_vflag;	/* (s,p) */
	uint8_t			inp_ip_ttl;	/* (n) */
	uint8_t			inp_ip_p;	/* (n) */
	uint8_t			inp_ip_minttl;	/* (n) */
	int8_t			inp_spare8[4];
} __aligned(8);

/* Summary record carrying the PCB count and generation counters. */
struct xinpgen {
	ksize_t			xig_len;	/* length of this structure */
	u_int			xig_count;	/* number of PCBs at this time */
	uint32_t		_xig_spare32;
	inp_gen_t		xig_gen;	/* generation count at this time */
	so_gen_t		xig_sogen;	/* socket generation count this time */
	uint64_t		_xig_spare64[4];
} __aligned(8);

#ifdef _KERNEL
void	in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi);
#endif
#endif /* _SYS_SOCKETVAR_H_ */

399
struct inpcbport {
400
	struct epoch_context phd_epoch_ctx;
401
	CK_LIST_ENTRY(inpcbport) phd_hash;
402
403
	struct inpcbhead phd_pcblist;
	u_short phd_port;
Rodney W. Grimes's avatar
Rodney W. Grimes committed
404
405
};

406
/*-
407
408
 * Global data structure for each high-level protocol (UDP, TCP, ...) in both
 * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
409
 *
410
411
412
413
414
415
416
417
 * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
 * ipi_list_lock:
 *  - ipi_lock covering the global pcb list stability during loop iteration,
 *  - ipi_hash_lock covering the hashed lookup tables,
 *  - ipi_list_lock covering mutable global fields (such as the global
 *    pcb list)
 *
 * The lock order is:
418
 *
419
420
421
 *    ipi_lock (before)
 *        inpcb locks (before)
 *            ipi_list locks (before)
422
423
424
425
 *
 * Locking key:
 *
 * (c) Constant or nearly constant after initialisation
426
 * (e) - Protected by the net_epoch_preempt epoch
427
 * (g) Locked by ipi_lock
428
 * (l) Locked by ipi_list_lock
429
 * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
430
 * (x) Synchronisation properties poorly defined
431
432
433
 */
struct inpcbinfo {
	/*
434
	 * Global lock protecting inpcb list modification
435
	 */
436
	struct mtx		 ipi_lock;
437
438

	/*
439
	 * Global list of inpcbs on the protocol.
440
	 */
441
	struct inpcbhead	*ipi_listhead;		/* [r](e) [w](g/l) */
442
	u_int			 ipi_count;		/* (l) */
443
444

	/*
445
446
	 * Generation count -- incremented each time a connection is allocated
	 * or freed.
447
	 */
448
	u_quad_t		 ipi_gencnt;		/* (l) */
449
450
451
452

	/*
	 * Fields associated with port lookup and allocation.
	 */
453
454
455
	u_short			 ipi_lastport;		/* (x) */
	u_short			 ipi_lastlow;		/* (x) */
	u_short			 ipi_lasthi;		/* (x) */
456
457
458
459

	/*
	 * UMA zone from which inpcbs are allocated for this protocol.
	 */
460
	struct	uma_zone	*ipi_zone;		/* (c) */
461

462
	/*
Gleb Smirnoff's avatar
Gleb Smirnoff committed
463
	 * Global lock protecting modification hash lookup tables.
464
	 */
465
	struct mtx		 ipi_hash_lock;
466

467
	/*
468
469
	 * Global hash of inpcbs, hashed by local and foreign addresses and
	 * port numbers.
470
	 */
471
472
	struct inpcbhead	*ipi_hashbase;		/* (h) */
	u_long			 ipi_hashmask;		/* (h) */
473
474
475
476

	/*
	 * Global hash of inpcbs, hashed by only local port number.
	 */
477
478
	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
	u_long			 ipi_porthashmask;	/* (h) */
479

480
481
482
483
484
485
486
	/*
	 * Load balance groups used for the SO_REUSEPORT_LB option,
	 * hashed by local port.
	 */
	struct	inpcblbgrouphead *ipi_lbgrouphashbase;	/* (h) */
	u_long			 ipi_lbgrouphashmask;	/* (h) */

487
	/*
488
489
	 * Pointer to network stack instance
	 */
490
	struct vnet		*ipi_vnet;		/* (c) */
491
492
493

	/*
	 * general use 2
494
	 */
Robert Watson's avatar
Robert Watson committed
495
	void 			*ipi_pspare[2];
496
497
498
499
500

	/*
	 * Global lock protecting global inpcb list, inpcb count, etc.
	 */
	struct rwlock		 ipi_list_lock;
501
502
};

503
#ifdef _KERNEL
504
505
506
507
508
509
510
/*
 * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
 * (or unique address:port combination) can be re-used at most
 * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
 * is dynamically resized as processes bind/unbind to that specific group.
 */
struct inpcblbgroup {
511
512
	CK_LIST_ENTRY(inpcblbgroup) il_list;
	struct epoch_context il_epoch_ctx;
513
514
	uint16_t	il_lport;			/* (c) */
	u_char		il_vflag;			/* (c) */
515
	u_int8_t		il_numa_domain;
516
517
518
519
520
521
522
523
524
	uint32_t	il_pad2;
	union in_dependaddr il_dependladdr;		/* (c) */
#define	il_laddr	il_dependladdr.id46_addr.ia46_addr4
#define	il6_laddr	il_dependladdr.id6_addr
	uint32_t	il_inpsiz; /* max count in il_inp[] (h) */
	uint32_t	il_inpcnt; /* cur count in il_inp[] (h) */
	struct inpcb	*il_inp[];			/* (h) */
};

525
/*
 * Per-inpcb rwlock wrappers.  Every macro operates on (inp)->inp_lock.
 * The 'd' argument of INP_LOCK_INIT is accepted but not used.
 */

/* Lifetime. */
#define	INP_LOCK_INIT(inp, d, t)					\
	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
#define	INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)

/* Acquire. */
#define	INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
#define	INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
#define	INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
#define	INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)

/* Release. */
#define	INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
#define	INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
#define	INP_UNLOCK(inp)		rw_unlock(&(inp)->inp_lock)

/* Convert between read and write ownership; query write ownership. */
#define	INP_TRY_UPGRADE(inp)	rw_try_upgrade(&(inp)->inp_lock)
#define	INP_DOWNGRADE(inp)	rw_downgrade(&(inp)->inp_lock)
#define	INP_WLOCKED(inp)	rw_wowned(&(inp)->inp_lock)

/* Assertions. */
#define	INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
#define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
#define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
#define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
Jeffrey Hsu's avatar
Jeffrey Hsu committed
542

543
/*
Robert Watson's avatar
Robert Watson committed
544
 * These locking functions are for inpcb consumers outside of sys/netinet,
545
546
547
548
549
550
551
552
 * more specifically, they were added for the benefit of TOE drivers. The
 * macros are reserved for use by the stack.
 */
/* Function forms of the inpcb lock operations. */
void	inp_wlock(struct inpcb *inp);
void	inp_wunlock(struct inpcb *inp);
void	inp_rlock(struct inpcb *inp);
void	inp_runlock(struct inpcb *inp);

/* Lock assertions: real functions with INVARIANT_SUPPORT, no-ops without. */
#ifdef INVARIANT_SUPPORT
void	inp_lock_assert(struct inpcb *inp);
void	inp_unlock_assert(struct inpcb *inp);
#else
#define	inp_lock_assert(inp)	do {} while (0)
#define	inp_unlock_assert(inp)	do {} while (0)
#endif

/* Iteration over inpcbs and accessors for selected inpcb state. */
void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
int	inp_ip_tos_get(const struct inpcb *inp);
void	inp_ip_tos_set(struct inpcb *inp, int val);
struct socket	*inp_inpcbtosocket(struct inpcb *inp);
struct tcpcb	*inp_inpcbtotcpcb(struct inpcb *inp);
void	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
	    uint32_t *faddr, uint16_t *fp);
int	inp_so_options(const struct inpcb *inp);
571

572
573
#endif /* _KERNEL */

Jeffrey Hsu's avatar
Jeffrey Hsu committed
574
/* Wrappers for the pcbinfo mutex, (ipi)->ipi_lock. */
#define	INP_INFO_LOCK_INIT(ipi, d)					\
	mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
#define	INP_INFO_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_lock)

#define	INP_INFO_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_lock)
#define	INP_INFO_TRY_WLOCK(ipi)		mtx_trylock(&(ipi)->ipi_lock)
#define	INP_INFO_WLOCKED(ipi)		mtx_owned(&(ipi)->ipi_lock)
#define	INP_INFO_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_lock)

/* Satisfied either inside the net_epoch_preempt epoch or with the mutex held. */
#define	INP_INFO_LOCK_ASSERT(ipi)					\
	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
#define	INP_INFO_WLOCK_ASSERT(ipi)					\
	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
#define	INP_INFO_WUNLOCK_ASSERT(ipi)					\
	mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
Jeffrey Hsu's avatar
Jeffrey Hsu committed
585

586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
/* Wrappers for the pcbinfo list rwlock, (ipi)->ipi_list_lock. */
#define	INP_LIST_LOCK_INIT(ipi, d)					\
	rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
#define	INP_LIST_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_list_lock)

#define	INP_LIST_RLOCK(ipi)		rw_rlock(&(ipi)->ipi_list_lock)
#define	INP_LIST_WLOCK(ipi)		rw_wlock(&(ipi)->ipi_list_lock)
#define	INP_LIST_TRY_RLOCK(ipi)		rw_try_rlock(&(ipi)->ipi_list_lock)
#define	INP_LIST_TRY_WLOCK(ipi)		rw_try_wlock(&(ipi)->ipi_list_lock)
#define	INP_LIST_TRY_UPGRADE(ipi)	rw_try_upgrade(&(ipi)->ipi_list_lock)
#define	INP_LIST_RUNLOCK(ipi)		rw_runlock(&(ipi)->ipi_list_lock)
#define	INP_LIST_WUNLOCK(ipi)		rw_wunlock(&(ipi)->ipi_list_lock)

#define	INP_LIST_LOCK_ASSERT(ipi)					\
	rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
#define	INP_LIST_RLOCK_ASSERT(ipi)					\
	rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
#define	INP_LIST_WLOCK_ASSERT(ipi)					\
	rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
#define	INP_LIST_UNLOCK_ASSERT(ipi)					\
	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)

605
606
607
608
/*
 * Wrappers for the pcbinfo hash mutex, (ipi)->ipi_hash_lock.
 *
 * None of these expansions ends in a semicolon; the caller supplies it.  That
 * keeps every macro usable as a single statement, e.g. as the body of an
 * unbraced if/else.
 */
#define	INP_HASH_LOCK_INIT(ipi, d)					\
	mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
#define	INP_HASH_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_hash_lock)
#define	INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
#define	INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
/* Satisfied either inside the net_epoch_preempt epoch or with the mutex held. */
#define	INP_HASH_LOCK_ASSERT(ipi)					\
	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
#define	INP_HASH_WLOCK_ASSERT(ipi)					\
	mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED)
611

612
613
614
615
616
617
618
619
/* Wrappers for a group mutex, (ipg)->ipg_lock. */
#define	INP_GROUP_LOCK_INIT(ipg, d)					\
	mtx_init(&(ipg)->ipg_lock, (d), NULL, MTX_DEF | MTX_DUPOK)
#define	INP_GROUP_LOCK_DESTROY(ipg)					\
	mtx_destroy(&(ipg)->ipg_lock)

#define	INP_GROUP_LOCK(ipg)						\
	mtx_lock(&(ipg)->ipg_lock)
#define	INP_GROUP_LOCK_ASSERT(ipg)					\
	mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
#define	INP_GROUP_UNLOCK(ipg)						\
	mtx_unlock(&(ipg)->ipg_lock)

620
/*
 * Unmasked hash of a foreign address and a port pair: the address folded with
 * its upper 16 bits, XORed with the byte-swapped XOR of the two ports.
 */
#define	INP_PCBLBGROUP_PKTHASH(faddr, lport, fport)			\
	((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))

/* The same hash reduced to a bucket index by 'mask'. */
#define	INP_PCBHASH(faddr, lport, fport, mask)				\
	(INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) & (mask))

/* Bucket index derived from the local port alone. */
#define	INP_PCBPORTHASH(lport, mask)	(ntohs(lport) & (mask))

/* Last 32-bit word of an IPv6 address, used as the address hash key. */
#define	INP6_PCBHASHKEY(faddr)		((faddr)->s6_addr32[3])
627

628
/*
629
 * Flags for inp_vflag -- historically version flags only
630
631
632
633
634
635
 */
#define	INP_IPV4		0x1
#define	INP_IPV6		0x2
#define	INP_IPV6PROTO		0x4	/* opened under IPv6 protocol */

/*
636
 * Flags for inp_flags.
637
 */
638
639
640
641
642
643
644
645
646
/* Generic and IPv4 options. */
#define	INP_RECVOPTS		0x00000001 /* receive incoming IP options */
#define	INP_RECVRETOPTS		0x00000002 /* receive IP options for reply */
#define	INP_RECVDSTADDR		0x00000004 /* receive IP dst address */
#define	INP_HDRINCL		0x00000008 /* user supplies entire IP header */
#define	INP_HIGHPORT		0x00000010 /* user wants "high" port binding */
#define	INP_LOWPORT		0x00000020 /* user wants "low" port binding */
#define	INP_ANONPORT		0x00000040 /* port chosen for user */
#define	INP_RECVIF		0x00000080 /* receive incoming interface */
#define	INP_MTUDISC		0x00000100 /* user can do MTU discovery */
					   /* 0x000200 unused: was INP_FAITH */
#define	INP_RECVTTL		0x00000400 /* receive incoming IP TTL */
#define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
#define	INP_BINDANY		0x00001000 /* allow bind to any address */
#define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
#define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */

/* IPv6 options. */
#define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
#define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
#define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
#define	IN6P_HOPOPTS		0x00040000 /* receive hop-by-hop options */
#define	IN6P_DSTOPTS		0x00080000 /* receive dst options after rthdr */
#define	IN6P_RTHDR		0x00100000 /* receive routing header */
#define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
#define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
#define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */

/* PCB state. */
#define	INP_TIMEWAIT		0x01000000 /* in TIMEWAIT, ppcb is tcptw */
#define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
#define	INP_DROPPED		0x04000000 /* protocol drop flag */
#define	INP_SOCKREF		0x08000000 /* strong socket reference */
#define	INP_RESERVED_0		0x10000000 /* reserved field */
#define	INP_RESERVED_1		0x20000000 /* reserved field */

/* More IPv6 options. */
#define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
#define	IN6P_MTU		0x80000000 /* receive path MTU */

/* Union of the flags listed here; see the individual definitions above. */
#define	INP_CONTROLOPTS							\
	(INP_RECVOPTS | INP_RECVRETOPTS | INP_RECVDSTADDR |		\
	 INP_RECVIF | INP_RECVTTL | INP_RECVTOS |			\
	 IN6P_PKTINFO | IN6P_HOPLIMIT | IN6P_HOPOPTS |			\
	 IN6P_DSTOPTS | IN6P_RTHDR | IN6P_RTHDRDSTOPTS |		\
	 IN6P_TCLASS | IN6P_AUTOFLOWLABEL | IN6P_RFC2292 |		\
	 IN6P_MTU)
677

678
679
680
/*
 * Flags for inp_flags2.
 */
681
682
#define	INP_MBUF_L_ACKS		0x00000001 /* We need large mbufs for ack compression */
#define	INP_MBUF_ACKCMP		0x00000002 /* TCP mbuf ack compression ok */
/*				0x00000004 */
#define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
#define	INP_FREED		0x00000010 /* inp itself is not valid */
#define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
#define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
#define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
#define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
#define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
#define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
#define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
#define	INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
#define	INP_REUSEPORT_LB	0x00002000 /* SO_REUSEPORT_LB option is set */
#define	INP_SUPPORTS_MBUFQ	0x00004000 /* Supports the mbuf queue method of LRO */
#define	INP_MBUF_QUEUE_READY	0x00008000 /* The transport is pacing, inputs can be queued */
#define	INP_DONT_SACK_QUEUE	0x00010000 /* If a sack arrives do not wake me */

/* Ethernet PCP: an enable bit followed by a three-bit field at bits 18..20. */
#define	INP_2PCP_SET		0x00020000 /* If the Eth PCP should be set explicitly */
#define	INP_2PCP_BIT0		0x00040000 /* Eth PCP Bit 0 */
#define	INP_2PCP_BIT1		0x00080000 /* Eth PCP Bit 1 */
#define	INP_2PCP_BIT2		0x00100000 /* Eth PCP Bit 2 */
#define	INP_2PCP_BASE		INP_2PCP_BIT0
#define	INP_2PCP_MASK		(INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
#define	INP_2PCP_SHIFT		18         /* shift PCP field in/out of inp_flags2 */
705
706
707
708
709
710
711
712
/*
 * Flags passed to in_pcblookup*() functions.
 */
#define	INPLOOKUP_WILDCARD	0x00000001	/* Allow wildcard sockets. */
#define	INPLOOKUP_RLOCKPCB	0x00000002	/* Return inpcb read-locked. */
#define	INPLOOKUP_WLOCKPCB	0x00000004	/* Return inpcb write-locked. */

/* Union of all three lookup flags defined above. */
#define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
	    INPLOOKUP_WLOCKPCB)
714

Rodney W. Grimes's avatar
Rodney W. Grimes committed
715
/* Yield the so_pcb member of socket `so`, cast to struct inpcb *. */
#define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
716
717
718

/*
 * Read dom_family from the socket's protocol domain
 * (so->so_proto->pr_domain).  The argument and the whole expansion are
 * parenthesized so that expression arguments such as `p + 1` bind correctly.
 */
#define	INP_SOCKAF(so)	((so)->so_proto->pr_domain->dom_family)

719
/*
 * Non-zero when INP_SOCKAF(so) equals af.  af is parenthesized so that a
 * conditional or other low-precedence expression is compared as a whole.
 */
#define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == (af))
Rodney W. Grimes's avatar
Rodney W. Grimes committed
720

721
722
723
724
725
726
727
/*
 * Constants for pcbinfo.ipi_hashfields.
 *
 * NOTE(review): the names suggest these select hashing on no fields, on a
 * 2-tuple, or on a 4-tuple; confirm against the users of ipi_hashfields.
 */
#define	IPI_HASHFIELDS_NONE	0
#define	IPI_HASHFIELDS_2TUPLE	1
#define	IPI_HASHFIELDS_4TUPLE	2

728
#ifdef _KERNEL
729
730
731
732
733
734
735
736
737
738
739
740
741
742
/*
 * Declarations, via VNET_DECLARE, of the int-typed ipport_* variables.
 * NOTE(review): presumably the port-range and port-randomization settings;
 * their definitions are not visible in this part of the header.
 */
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
VNET_DECLARE(int, ipport_lowfirstauto);
VNET_DECLARE(int, ipport_lowlastauto);
VNET_DECLARE(int, ipport_firstauto);
VNET_DECLARE(int, ipport_lastauto);
VNET_DECLARE(int, ipport_hifirstauto);
VNET_DECLARE(int, ipport_hilastauto);
VNET_DECLARE(int, ipport_randomized);
VNET_DECLARE(int, ipport_randomcps);
VNET_DECLARE(int, ipport_randomtime);
VNET_DECLARE(int, ipport_stoprandom);
VNET_DECLARE(int, ipport_tcpallocs);

743
744
745
746
747
748
749
750
751
752
753
754
755
/*
 * Shorthand names: each V_ipport_* macro expands to VNET() applied to the
 * ipport_* variable of the same name.
 */
#define	V_ipport_reservedhigh	VNET(ipport_reservedhigh)
#define	V_ipport_reservedlow	VNET(ipport_reservedlow)
#define	V_ipport_lowfirstauto	VNET(ipport_lowfirstauto)
#define	V_ipport_lowlastauto	VNET(ipport_lowlastauto)
#define	V_ipport_firstauto	VNET(ipport_firstauto)
#define	V_ipport_lastauto	VNET(ipport_lastauto)
#define	V_ipport_hifirstauto	VNET(ipport_hifirstauto)
#define	V_ipport_hilastauto	VNET(ipport_hilastauto)
#define	V_ipport_randomized	VNET(ipport_randomized)
#define	V_ipport_randomcps	VNET(ipport_randomcps)
#define	V_ipport_randomtime	VNET(ipport_randomtime)
#define	V_ipport_stoprandom	VNET(ipport_stoprandom)
#define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
756

757
758
/* Prototypes for the struct inpcbinfo destroy/init routines. */
void	in_pcbinfo_destroy(struct inpcbinfo *);
void	in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
	    int, int, char *, uma_init, u_int);
760

761
762
763
/*
 * NOTE(review): presumably compares two inpcbs (ni, oi) for bind-multi
 * compatibility; the implementation is not visible here, so confirm the
 * meaning of the int result against it.
 */
int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
	    const struct inpcb *oi);

Jeffrey Hsu's avatar
Jeffrey Hsu committed
764
void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
765
int	in_pcballoc(struct socket *, struct inpcbinfo *);
766
int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
767
int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
768
769
	    u_short *, struct ucred *);
int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
770
int	in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *,
771
	    struct mbuf *, bool);
772
773
int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
	    u_short *, in_addr_t *, u_short *, struct inpcb **,
774
	    struct ucred *);
Alfred Perlstein's avatar
Alfred Perlstein committed
775
776
void	in_pcbdetach(struct inpcb *);
void	in_pcbdisconnect(struct inpcb *);
777
void	in_pcbdrop(struct inpcb *);
778
void	in_pcbfree(struct inpcb *);
Alfred Perlstein's avatar
Alfred Perlstein committed
779
int	in_pcbinshash(struct inpcb *);
780
int	in_pcbinshash_mbuf(struct inpcb *, struct mbuf *);
781
782
int	in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
	    struct ucred *);
783
int	in_pcblbgroup_numa(struct inpcb *, int arg);
784
struct inpcb *
785
	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
786
	    struct in_addr, u_int, int, struct ifnet *);
787
788
789
struct inpcb *
	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
Jeffrey Hsu's avatar
Jeffrey Hsu committed
790
void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
791
	    int, struct inpcb *(*)(struct inpcb *, int));
792
void	in_pcbref(struct inpcb *);
Alfred Perlstein's avatar
Alfred Perlstein committed
793
void	in_pcbrehash(struct inpcb *);
794
void	in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
795
796
int	in_pcbrele_rlocked(struct inpcb *);
int	in_pcbrele_wlocked(struct inpcb *);
797
void	in_losing(struct inpcb *);
798
void	in_pcbsetsolabel(struct socket *so);
799
800
int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
801
802
struct sockaddr *
	in_sockaddr(in_port_t port, struct in_addr *addr);
803
void	in_pcbsosetlabel(struct socket *so);
804
#ifdef RATELIMIT
/* Transmit rate-limit (txrtlmt) routines; declared only when RATELIMIT is defined. */
int	in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
	    struct mbuf *, uint32_t);
int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
	    uint32_t, struct m_snd_tag **);
void	in_pcbdetach_txrtlmt(struct inpcb *);
void	in_pcbdetach_tag(struct m_snd_tag *);
int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
void	in_pcboutput_eagain(struct inpcb *);
#endif /* RATELIMIT */
818
#endif /* _KERNEL */
819

820
#endif /* !_NETINET_IN_PCB_H_ */