tcp_subr.c 112 KB
Newer Older
1
/*-
2
3
 * SPDX-License-Identifier: BSD-3-Clause
 *
4
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
Rodney W. Grimes's avatar
Rodney W. Grimes committed
5
6
7
8
9
10
11
12
13
14
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
Warner Losh's avatar
Warner Losh committed
15
 * 3. Neither the name of the University nor the names of its contributors
Rodney W. Grimes's avatar
Rodney W. Grimes committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
31
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
Rodney W. Grimes's avatar
Rodney W. Grimes committed
32
33
 */

34
35
36
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

37
#include "opt_inet.h"
38
#include "opt_inet6.h"
39
#include "opt_ipsec.h"
40
#include "opt_kern_tls.h"
41
42
#include "opt_tcpdebug.h"

Rodney W. Grimes's avatar
Rodney W. Grimes committed
43
44
#include <sys/param.h>
#include <sys/systm.h>
45
#include <sys/arb.h>
46
#include <sys/callout.h>
47
#include <sys/eventhandler.h>
48
#ifdef TCP_HHOOK
49
#include <sys/hhook.h>
50
#endif
51
#include <sys/kernel.h>
52
#ifdef TCP_HHOOK
53
#include <sys/khelp.h>
54
#endif
55
56
57
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
58
59
#include <sys/qmath.h>
#include <sys/stats.h>
60
#include <sys/sysctl.h>
61
#include <sys/jail.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
62
#include <sys/malloc.h>
63
#include <sys/refcount.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
64
#include <sys/mbuf.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
65
66
67
#ifdef INET6
#include <sys/domain.h>
#endif
68
#include <sys/priv.h>
69
#include <sys/proc.h>
70
#include <sys/sdt.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
71
72
73
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
74
#include <sys/random.h>
75

76
#include <vm/uma.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
77
78

#include <net/route.h>
79
#include <net/route/nhop.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
80
#include <net/if.h>
81
#include <net/if_var.h>
82
#include <net/vnet.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
83
84

#include <netinet/in.h>
85
#include <netinet/in_fib.h>
86
#include <netinet/in_kdtrace.h>
87
#include <netinet/in_pcb.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
88
#include <netinet/in_systm.h>
89
#include <netinet/in_var.h>
Rodney W. Grimes's avatar
Rodney W. Grimes committed
90
#include <netinet/ip.h>
91
92
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
93
#ifdef INET6
94
#include <netinet/icmp6.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
95
#include <netinet/ip6.h>
96
#include <netinet6/in6_fib.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
97
98
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
99
#include <netinet6/scope6_var.h>
100
#include <netinet6/nd6.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
101
#endif
102

103
#include <netinet/tcp.h>
104
105
106
#ifdef INVARIANTS
#define TCPSTATES
#endif
Rodney W. Grimes's avatar
Rodney W. Grimes committed
107
108
109
110
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
111
#include <netinet/tcp_log_buf.h>
112
#include <netinet/tcp_syncache.h>
113
#include <netinet/tcp_hpts.h>
114
#include <netinet/cc/cc.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
115
116
117
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
Rodney W. Grimes's avatar
Rodney W. Grimes committed
118
#include <netinet/tcpip.h>
119
#include <netinet/tcp_fastopen.h>
120
121
122
#ifdef TCPPCAP
#include <netinet/tcp_pcap.h>
#endif
123
124
125
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
126
#ifdef INET6
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
127
#include <netinet6/ip6protosw.h>
128
#endif
129
130
131
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
132
133
#include <netinet/udp.h>
#include <netinet/udp_var.h>
Yoshinobu Inoue's avatar
Yoshinobu Inoue committed
134

135
#include <netipsec/ipsec_support.h>
136

137
#include <machine/in_cksum.h>
138
#include <crypto/siphash/siphash.h>
139

140
141
#include <security/mac/mac_framework.h>

142
VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
143
#ifdef INET6
144
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
145
146
#endif

147
148
#ifdef NETFLIX_EXP_DETECTION
/*  Sack attack detection thresholds and such */
149
150
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
    "Sack Attack detection thresholds");
int32_t tcp_force_detection = 0;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
    CTLFLAG_RW,
    &tcp_force_detection, 0,
    "Do we force detection even if the INP has it off?");
int32_t tcp_sack_to_ack_thresh = 700;	/* 70 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
    CTLFLAG_RW,
    &tcp_sack_to_ack_thresh, 700,
    "Percentage of sacks to acks we must see above (10.1 percent is 101)?");
int32_t tcp_sack_to_move_thresh = 600;	/* 60 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh,
    CTLFLAG_RW,
    &tcp_sack_to_move_thresh, 600,
    "Percentage of sack moves we must see above (10.1 percent is 101)");
int32_t tcp_restoral_thresh = 650;	/* 65 % (sack:2:ack -5%) */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh,
    CTLFLAG_RW,
    &tcp_restoral_thresh, 550,
    "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)");
int32_t tcp_sad_decay_val = 800;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per,
    CTLFLAG_RW,
    &tcp_sad_decay_val, 800,
    "The decay percentage (10.1 percent equals 101 )");
int32_t tcp_map_minimum = 500;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps,
    CTLFLAG_RW,
    &tcp_map_minimum, 500,
    "Number of Map enteries before we start detection");
int32_t tcp_attack_on_turns_on_logging = 0;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged,
    CTLFLAG_RW,
    &tcp_attack_on_turns_on_logging, 0,
   "When we have a positive hit on attack, do we turn on logging?");
int32_t tcp_sad_pacing_interval = 2000;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int,
    CTLFLAG_RW,
    &tcp_sad_pacing_interval, 2000,
    "What is the minimum pacing interval for a classified attacker?");

int32_t tcp_sad_low_pps = 100;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
    CTLFLAG_RW,
    &tcp_sad_low_pps, 100,
    "What is the input pps that below which we do not decay?");
#endif
199
200
201
202
203
204
205
206
207
208
uint32_t tcp_ack_war_time_window = 1000;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
    CTLFLAG_RW,
    &tcp_ack_war_time_window, 1000,
   "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
uint32_t tcp_ack_war_cnt = 5;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
    CTLFLAG_RW,
    &tcp_ack_war_cnt, 5,
   "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
209

210
211
struct rwlock tcp_function_lock;

212
213
214
215
216
static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

217
	new = V_tcp_mssdflt;
218
219
220
221
222
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < TCP_MINMSS)
			error = EINVAL;
		else
223
			V_tcp_mssdflt = new;
224
225
226
227
	}
	return (error);
}

228
SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
229
230
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I",
231
    "Default TCP Maximum Segment Size");
232
233
234
235
236
237
238

#ifdef INET6
static int
sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

239
	new = V_tcp_v6mssdflt;
240
241
242
243
244
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < TCP_MINMSS)
			error = EINVAL;
		else
245
			V_tcp_v6mssdflt = new;
246
247
248
249
	}
	return (error);
}

250
SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
251
252
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
253
   "Default TCP Maximum Segment Size for IPv6");
254
#endif /* INET6 */
255

256
257
258
259
260
261
262
263
/*
 * Minimum MSS we accept and use. This prevents DoS attacks where
 * we are forced to a ridiculous low MSS like 20 and send hundreds
 * of packets instead of one. The effect scales with the available
 * bandwidth and quickly saturates the CPU and network interface
 * with packet generation and sending. Set to zero to disable MINMSS
 * checking. This setting prevents us from sending too small packets.
 */
264
VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
265
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
266
     &VNET_NAME(tcp_minmss), 0,
267
    "Minimum TCP Maximum Segment Size");
268

269
VNET_DEFINE(int, tcp_do_rfc1323) = 1;
270
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
271
    &VNET_NAME(tcp_do_rfc1323), 0,
272
    "Enable rfc1323 (high performance TCP) extensions");
273

274
275
276
277
278
279
280
281
/*
 * As of June 2021, several TCP stacks violate RFC 7323 from September 2014.
 * Some stacks negotiate TS, but never send them after connection setup. Some
 * stacks negotiate TS, but don't send them when sending keep-alive segments.
 * These include modern widely deployed TCP stacks.
 * Therefore tolerating violations for now...
 */
VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1;
282
283
284
285
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_tolerate_missing_ts), 0,
    "Tolerate missing TCP timestamps");

286
287
288
289
290
VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_ts_offset_per_conn), 0,
    "Initialize TCP timestamps per connection instead of per host pair");

291
292
293
294
295
296
297
298
299
300
301
302
/* How many connections are pacing */
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;

static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
    &tcp_pacing_limit, 1000,
    "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
    &shadow_num_connections, 0, "Number of TCP connections being paced");

303
static int	tcp_log_debug = 0;
304
305
306
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
    &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");

307
308
static int	tcp_tcbhashsize;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
309
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
310

311
static int	do_tcpdrain = 1;
312
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
313
    "Enable tcp_drain routine for extra help when low on mbufs");
314

315
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
316
    &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
317

318
VNET_DEFINE_STATIC(int, icmp_may_rst) = 1;
319
#define	V_icmp_may_rst			VNET(icmp_may_rst)
320
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
321
    &VNET_NAME(icmp_may_rst), 0,
322
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");
323

324
VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0;
325
#define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
326
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
327
    &VNET_NAME(tcp_isn_reseed_interval), 0,
328
    "Seconds between reseeding of ISN secret");
329

330
static int	tcp_soreceive_stream;
331
332
333
SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
    &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");

334
VNET_DEFINE(uma_zone_t, sack_hole_zone);
335
#define	V_sack_hole_zone		VNET(sack_hole_zone)
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0;	/* unlimited */
static int
sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = V_tcp_map_entries_limit;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		/* only allow "0" and value > minimum */
		if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT)
			error = EINVAL;
		else
			V_tcp_map_entries_limit = new;
	}
	return (error);
}
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit,
355
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
356
357
358
359
360
361
362
363
    &VNET_NAME(tcp_map_entries_limit), 0,
    &sysctl_net_inet_tcp_map_limit_check, "IU",
    "Total sendmap entries limit");

VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0;	/* unlimited */
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_map_split_limit), 0,
    "Total sendmap split entries limit");
364

365
#ifdef TCP_HHOOK
366
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
367
#endif
368

369
#define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH
370
VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
371
372
#define	V_ts_offset_secret	VNET(ts_offset_secret)

373
374
375
static int	tcp_default_fb_init(struct tcpcb *tp);
static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int	tcp_default_handoff_ok(struct tcpcb *tp);
376
static struct inpcb *tcp_notify(struct inpcb *, int);
377
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
378
static void tcp_mtudisc(struct inpcb *, int);
379
380
static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
		    void *ip4hdr, const void *ip6hdr);
Rodney W. Grimes's avatar
Rodney W. Grimes committed
381

382
static struct tcp_function_block tcp_def_funcblk = {
383
384
385
386
387
388
389
	.tfb_tcp_block_name = "freebsd",
	.tfb_tcp_output = tcp_output,
	.tfb_tcp_do_segment = tcp_do_segment,
	.tfb_tcp_ctloutput = tcp_default_ctloutput,
	.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
	.tfb_tcp_fb_init = tcp_default_fb_init,
	.tfb_tcp_fb_fini = tcp_default_fb_fini,
390
391
};

392
static int tcp_fb_cnt = 0;
393
394
395
struct tcp_funchead t_functions;
static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/*
 * Account one DSACK block reported by the peer, updating both the
 * per-connection counters on 'tp' and the global TCP statistics.
 *
 * NOTE(review): the per-connection byte counter is chosen by the
 * sequence order of the block (t_dsack_bytes when end > start,
 * t_dsack_tlp_bytes otherwise), while the global statistic is chosen
 * by the caller-supplied 'tlp' flag.  The two attributions can differ
 * for the same call — this pattern is consistent across all four
 * branches and so appears intentional, but confirm against the callers.
 */
void
tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp)
{
	TCPSTAT_INC(tcps_dsack_count);
	/* One more DSACK-carrying segment seen on this connection. */
	tp->t_dsack_pack++;
	if (tlp == 0) {
		if (SEQ_GT(end, start)) {
			tp->t_dsack_bytes += (end - start);
			TCPSTAT_ADD(tcps_dsack_bytes, (end - start));
		} else {
			/* Reversed block (start > end); count its width. */
			tp->t_dsack_tlp_bytes += (start - end);
			TCPSTAT_ADD(tcps_dsack_bytes, (start - end));
		}
	} else {
		if (SEQ_GT(end, start)) {
			tp->t_dsack_bytes += (end - start);
			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start));
		} else {
			/* Reversed block (start > end); count its width. */
			tp->t_dsack_tlp_bytes += (start - end);
			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end));
		}
	}
}

420
421
422
423
424
425
426
/*
 * Look up a registered TCP function block by name.  The caller must
 * hold tcp_function_lock (read or write).  Returns NULL when no stack
 * with the requested name is registered.
 */
static struct tcp_function_block *
find_tcp_functions_locked(struct tcp_function_set *fs)
{
	struct tcp_function *f;

	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if (strcmp(f->tf_name, fs->function_set_name) == 0)
			return (f->tf_fb);
	}
	return (NULL);
}

/*
 * Verify that a function block is still registered.  The caller must
 * hold tcp_function_lock.  Returns 'blk' if found (optionally storing
 * the owning tcp_function entry via 's'), otherwise NULL.
 */
static struct tcp_function_block *
find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
{
	struct tcp_function *f;

	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if (f->tf_fb != blk)
			continue;
		if (s != NULL)
			*s = f;
		return (blk);
	}
	return (NULL);
}

struct tcp_function_block *
find_and_ref_tcp_functions(struct tcp_function_set *fs)
{
	struct tcp_function_block *blk;
457
458

	rw_rlock(&tcp_function_lock);
459
460
	blk = find_tcp_functions_locked(fs);
	if (blk)
461
		refcount_acquire(&blk->tfb_refcnt);
462
463
464
465
466
467
468
469
	rw_runlock(&tcp_function_lock);
	return(blk);
}

struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *blk)
{
	struct tcp_function_block *rblk;
470
471

	rw_rlock(&tcp_function_lock);
472
	rblk = find_tcp_fb_locked(blk, NULL);
473
	if (rblk)
474
475
476
477
478
		refcount_acquire(&rblk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return(rblk);
}

479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
/* Find a matching alias for the given tcp_function_block. */
int
find_tcp_function_alias(struct tcp_function_block *blk,
    struct tcp_function_set *fs)
{
	struct tcp_function *f;
	int found;

	found = 0;
	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if ((f->tf_fb == blk) &&
		    (strncmp(f->tf_name, blk->tfb_tcp_block_name,
		        TCP_FUNCTION_NAME_LEN_MAX) != 0)) {
			/* Matching function block with different name. */
			strncpy(fs->function_set_name, f->tf_name,
			    TCP_FUNCTION_NAME_LEN_MAX);
			found = 1;
			break;
		}
	}
	/* Null terminate the string appropriately. */
	if (found) {
		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
	} else {
		fs->function_set_name[0] = '\0';
	}
	rw_runlock(&tcp_function_lock);
	return (found);
}

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
/*
 * Return the currently selected default function block with a fresh
 * reference taken on it.  Never returns NULL: tcp_func_set_ptr always
 * points at a registered stack.
 */
static struct tcp_function_block *
find_and_ref_tcp_default_fb(void)
{
	struct tcp_function_block *blk;

	rw_rlock(&tcp_function_lock);
	blk = tcp_func_set_ptr;
	refcount_acquire(&blk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return (blk);
}

/*
 * Move a connection off its current (non-default) stack: release the
 * old stack, then hand the connection first to the user-selected
 * default stack and, failing that, to the built-in "freebsd" default,
 * which is required to accept any connection.  Panics only if the
 * built-in default rejects the session or fails to initialize.
 */
void
tcp_switch_back_to_default(struct tcpcb *tp)
{
	struct tcp_function_block *tfb;

	KASSERT(tp->t_fb != &tcp_def_funcblk,
	    ("%s: called by the built-in default stack", __func__));

	/*
	 * Release the old stack. This function will either find a new one
	 * or panic.
	 */
	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
	refcount_release(&tp->t_fb->tfb_refcnt);

	/*
	 * Now, we'll find a new function block to use.
	 * Start by trying the current user-selected
	 * default, unless this stack is the user-selected
	 * default.
	 */
	tfb = find_and_ref_tcp_default_fb();
	if (tfb == tp->t_fb) {
		/* Same stack we just left; drop the extra ref and fall through. */
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Does the stack accept this connection? */
	if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
	    (*tfb->tfb_tcp_handoff_ok)(tp)) {
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Try to use that stack. */
	if (tfb != NULL) {
		/* Initialize the new stack. If it succeeds, we are done. */
		tp->t_fb = tfb;
		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
			return;

		/*
		 * Initialization failed. Release the reference count on
		 * the stack.
		 */
		refcount_release(&tfb->tfb_refcnt);
	}

	/*
	 * If that wasn't feasible, use the built-in default
	 * stack which is not allowed to reject anyone.
	 */
	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
	if (tfb == NULL) {
		/* there always should be a default */
		panic("Can't refer to tcp_def_funcblk");
	}
	if (tfb->tfb_tcp_handoff_ok != NULL) {
		if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
			/* The default stack cannot say no */
			panic("Default stack rejects a new session?");
		}
	}
	tp->t_fb = tfb;
	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
		/* The default stack cannot fail */
		panic("Default stack initialization failed");
	}
}
592

593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
/*
 * UDP tunneling input hook: strip the UDP encapsulation header from a
 * tunneled TCP segment in place and re-inject the packet into the TCP
 * input path, recording the source UDP port in the packet header.
 * Consumes 'm' on every path (either handed to tcp input or freed).
 */
static void
tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
    const struct sockaddr *sa, void *ctx)
{
	struct ip *iph;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct udphdr *uh;
	struct tcphdr *th;
	int thlen;
	uint16_t port;

	TCPSTAT_INC(tcps_tunneled_pkts);
	if ((m->m_flags & M_PKTHDR) == 0) {
		/* Can't handle one that is not a pkt hdr */
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	/* First make sure the fixed-size TCP header is contiguous. */
	/* NOTE(review): m_len (int) vs. sizeof-based sum (size_t) mixes
	 * signedness; off is kernel-provided and small, so presumably
	 * safe — confirm. */
	thlen = sizeof(struct tcphdr);
	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	iph = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)iph + off);
	th = (struct tcphdr *)(uh + 1);
	/* Now the real TCP header length (with options) is known. */
	thlen = th->th_off << 2;
	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
		if (m == NULL) {
			TCPSTAT_INC(tcps_tunneled_errs);
			goto out;
		} else {
			/* m_pullup may have re-laid-out the mbuf; re-derive. */
			iph = mtod(m, struct ip *);
			uh = (struct udphdr *)((caddr_t)iph + off);
			th = (struct tcphdr *)(uh + 1);
		}
	}
	/* Remember the tunnel source port (network order). */
	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
	/*
	 * Slide the TCP header down over the UDP header.
	 * NOTE(review): the copy length (m_len - off) reads
	 * sizeof(struct udphdr) bytes past the TCP payload end within
	 * the first mbuf — appears to rely on mbuf slack; confirm.
	 */
	bcopy(th, uh, m->m_len - off);
	m->m_len -= sizeof(struct udphdr);
	m->m_pkthdr.len -= sizeof(struct udphdr);
	/*
	 * We use the same algorithm for
	 * both UDP and TCP for c-sum. So
	 * the code in tcp_input will skip
	 * the checksum. So we do nothing
	 * with the flag (m->m_pkthdr.csum_flags).
	 */
	switch (iph->ip_v) {
#ifdef INET
	case IPVERSION:
		/* Shrink the IP total length by the removed UDP header. */
		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
		break;
#endif
#ifdef INET6
	case IPV6_VERSION >> 4:
		ip6 = mtod(m, struct ip6_hdr *);
		/* Shrink the IPv6 payload length by the removed UDP header. */
		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
		break;
#endif
	default:
		goto out;
		break;
	}
	return;
out:
	m_freem(m);
}

667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
{
	int error=ENOENT;
	struct tcp_function_set fs;
	struct tcp_function_block *blk;

	memset(&fs, 0, sizeof(fs));
	rw_rlock(&tcp_function_lock);
	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
	if (blk) {
		/* Found him */
		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
		fs.pcbcnt = blk->tfb_refcnt;
	}
682
	rw_runlock(&tcp_function_lock);
683
684
685
686
687
688
689
690
691
692
	error = sysctl_handle_string(oidp, fs.function_set_name,
				     sizeof(fs.function_set_name), req);

	/* Check for error or no change */
	if (error != 0 || req->newptr == NULL)
		return(error);

	rw_wlock(&tcp_function_lock);
	blk = find_tcp_functions_locked(&fs);
	if ((blk == NULL) ||
693
694
	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
		error = ENOENT;
695
696
697
698
699
700
701
702
703
		goto done;
	}
	tcp_func_set_ptr = blk;
done:
	rw_wunlock(&tcp_function_lock);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
704
705
706
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
    "Set/get the default TCP functions");
707
708
709
710
711
712
713
714

/*
 * Sysctl handler for net.inet.tcp.functions_available: build a
 * human-readable table of all registered TCP stacks (name, default
 * marker, alias, PCB count) and return it as a string.  Read-only.
 */
static int
sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
{
	int error, cnt, linesz;
	struct tcp_function *f;
	char *buffer, *cp;
	size_t bufsz, outsz;
	bool alias;

	/* Count the entries so we can size the output buffer. */
	cnt = 0;
	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		cnt++;
	}
	rw_runlock(&tcp_function_lock);

	/*
	 * One line per entry plus header; each line holds two names and
	 * fixed-width separators.  +2/+13/+1 give headroom for the header
	 * line, per-line punctuation, and the trailing NUL.
	 */
	bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1;
	buffer = malloc(bufsz, M_TEMP, M_WAITOK);

	error = 0;
	cp = buffer;

	linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D',
	    "Alias", "PCB count");
	cp += linesz;
	bufsz -= linesz;
	outsz = linesz;

	/*
	 * The list may have grown since the sizing pass above, so the
	 * truncation check inside the loop is still required.
	 */
	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
		linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
		    f->tf_fb->tfb_tcp_block_name,
		    (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
		    alias ? f->tf_name : "-",
		    f->tf_fb->tfb_refcnt);
		if (linesz >= bufsz) {
			/* snprintf truncated: buffer too small after all. */
			error = EOVERFLOW;
			break;
		}
		cp += linesz;
		bufsz -= linesz;
		outsz += linesz;
	}
	rw_runlock(&tcp_function_lock);
	if (error == 0)
		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
	free(buffer, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
760
761
762
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_net_inet_list_available, "A",
    "list available TCP Function sets");
763

764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;

#ifdef INET
VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
#define	V_udp4_tun_socket	VNET(udp4_tun_socket)
#endif
#ifdef INET6
VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
#define	V_udp6_tun_socket	VNET(udp6_tun_socket)
#endif

/*
 * Tear down the UDP tunneling listen sockets (IPv4 and IPv6), if open.
 */
static void
tcp_over_udp_stop(void)
{
	/*
	 * This function assumes sysctl caller holds inp_rinfo_lock()
	 * for writing!
	 */
	/* NOTE(review): tcp_over_udp_start()'s comment names the lock
	 * "inp_info_rlock()" — one of the two comments is presumably
	 * stale; confirm which lock is actually required. */
#ifdef INET
	if (V_udp4_tun_socket != NULL) {
		soclose(V_udp4_tun_socket);
		V_udp4_tun_socket = NULL;
	}
#endif
#ifdef INET6
	if (V_udp6_tun_socket != NULL) {
		soclose(V_udp6_tun_socket);
		V_udp6_tun_socket = NULL;
	}
#endif
}

/*
 * Create and bind the UDP tunneling listen sockets (IPv4 and IPv6) on
 * V_tcp_udp_tunneling_port and attach the TCP-over-UDP input hooks.
 * Returns 0 on success or an errno; on any failure all partially
 * created sockets are torn down via tcp_over_udp_stop().
 */
static int
tcp_over_udp_start(void)
{
	uint16_t port;
	int ret;
#ifdef INET
	struct sockaddr_in sin;
#endif
#ifdef INET6
	struct sockaddr_in6 sin6;
#endif
	/*
	 * This function assumes sysctl caller holds inp_info_rlock()
	 * for writing!
	 */
	port = V_tcp_udp_tunneling_port;
	/* NOTE(review): 'port' is host order here (it is htons()'d at
	 * bind time below), so ntohs() looks spurious — harmless for a
	 * zero test, but confirm. */
	if (ntohs(port) == 0) {
		/* Must have a port set */
		return (EINVAL);
	}
#ifdef INET
	if (V_udp4_tun_socket != NULL) {
		/* Already running -- must stop first */
		return (EALREADY);
	}
#endif
#ifdef INET6
	if (V_udp6_tun_socket != NULL) {
		/* Already running -- must stop first */
		return (EALREADY);
	}
#endif
#ifdef INET
	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
	    SOCK_DGRAM, IPPROTO_UDP,
	    curthread->td_ucred, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Call the special UDP hook. */
	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
	    tcp_recv_udp_tunneled_packet,
	    tcp_ctlinput_viaudp,
	    NULL))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Ok, we have a socket, bind it to the port. */
	memset(&sin, 0, sizeof(struct sockaddr_in));
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	if ((ret = sobind(V_udp4_tun_socket,
	    (struct sockaddr *)&sin, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
#endif
#ifdef INET6
	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
	    SOCK_DGRAM, IPPROTO_UDP,
	    curthread->td_ucred, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Call the special UDP hook. */
	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
	    tcp_recv_udp_tunneled_packet,
	    tcp6_ctlinput_viaudp,
	    NULL))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Ok, we have a socket, bind it to the port. */
	memset(&sin6, 0, sizeof(struct sockaddr_in6));
	sin6.sin6_len = sizeof(struct sockaddr_in6);
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(port);
	if ((ret = sobind(V_udp6_tun_socket,
	    (struct sockaddr *)&sin6, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
#endif
	return (0);
}

static int
sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t old, new;

	old = V_tcp_udp_tunneling_port;
	new = old;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if ((error == 0) &&
	    (req->newptr != NULL)) {
		if ((new < TCP_TUNNELING_PORT_MIN) ||
		    (new > TCP_TUNNELING_PORT_MAX)) {
			error = EINVAL;
		} else {
			V_tcp_udp_tunneling_port = new;
			if (old != 0) {
				tcp_over_udp_stop();
			}
			if (new != 0) {
				error = tcp_over_udp_start();
			}
		}
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(tcp_udp_tunneling_port),
    0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
    "Tunneling port for tcp over udp");

VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;

static int
sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

	new = V_tcp_udp_tunneling_overhead;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
		    (new > TCP_TUNNELING_OVERHEAD_MAX))
			error = EINVAL;
		else
			V_tcp_udp_tunneling_overhead = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(tcp_udp_tunneling_overhead),
    0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
    "MSS reduction when using tcp over udp");

941
/*
942
 * Exports one (struct tcp_function_info) for each alias/name.
943
944
 */
static int
945
sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
946
{
947
	int cnt, error;
948
	struct tcp_function *f;
949
	struct tcp_function_info tfi;
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967

	/*
	 * We don't allow writes.
	 */
	if (req->newptr != NULL)
		return (EINVAL);

	/*
	 * Wire the old buffer so we can directly copy the functions to
	 * user space without dropping the lock.
	 */
	if (req->oldptr != NULL) {
		error = sysctl_wire_old_buffer(req, 0);
		if (error)
			return (error);
	}

	/*
968
969
970
	 * Walk the list and copy out matching entries. If INVARIANTS
	 * is compiled in, also walk the list to verify the length of
	 * the list matches what we have recorded.
971
972
	 */
	rw_rlock(&tcp_function_lock);
973

974
	cnt = 0;
975
#ifndef INVARIANTS
976
977
978
979
980
	if (req->oldptr == NULL) {
		cnt = tcp_fb_cnt;
		goto skip_loop;
	}
#endif
981
	TAILQ_FOREACH(f, &t_functions, tf_next) {
982
983
984
#ifdef INVARIANTS
		cnt++;
#endif
985
		if (req->oldptr != NULL) {
986
			bzero(&tfi, sizeof(tfi));
987
			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
988
			tfi.tfi_id = f->tf_fb->tfb_id;
989
990
991
992
			(void)strlcpy(tfi.tfi_alias, f->tf_name,
			    sizeof(tfi.tfi_alias));
			(void)strlcpy(tfi.tfi_name,
			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
993
994
995
996
997
998
			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
			/*
			 * Don't stop on error, as that is the
			 * mechanism we use to accumulate length
			 * information if the buffer was too short.
			 */
999
		}
1000
	}
1001
1002
1003
1004
1005
	KASSERT(cnt == tcp_fb_cnt,
	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
#ifndef INVARIANTS
skip_loop:
#endif
1006
1007
1008
	rw_runlock(&tcp_function_lock);
	if (req->oldptr == NULL)
		error = SYSCTL_OUT(req, NULL,
1009
		    (cnt + 1) * sizeof(struct tcp_function_info));
1010
1011
1012
1013

	return (error);
}

1014
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
1015
	    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
1016
	    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
1017
1018
	    "List TCP function block name-to-ID mappings");

1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
/*
 * tfb_tcp_handoff_ok() function for the default stack.
 * Note that we'll basically try to take all comers.
 */
static int
tcp_default_handoff_ok(struct tcpcb *tp)
{
	/* The default stack never refuses a connection. */
	return (0);
}

/*
 * tfb_tcp_fb_init() function for the default stack.
 *
 * This handles making sure we have appropriate timers set if you are
 * transitioning a socket that has some amount of setup done.
 *
 * The init() fuction from the default can *never* return non-zero i.e.
 * it is required to always succeed since it is the stack of last resort!
 */
static int
tcp_default_fb_init(struct tcpcb *tp)
{
	struct socket *so;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
	    ("%s: connection %p in unexpected state %d", __func__, tp,
	    tp->t_state));

	/*
	 * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
	 * know what to do for unexpected states (which includes TIME_WAIT).
	 */
	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
		return (0);

	/*
	 * Make sure some kind of transmission timer is set if there is
	 * outstanding data.
	 */
	so = tp->t_inpcb->inp_socket;
	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
	    tcp_timer_active(tp, TT_PERSIST))) {
		/*
		 * If the session has established and it looks like it should
		 * be in the persist state, set the persist timer. Otherwise,
		 * set the retransmit timer.
		 */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
		    (int32_t)(tp->snd_nxt - tp->snd_una) <
		    (int32_t)sbavail(&so->so_snd))
			tcp_setpersist(tp);
		else
			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
	}

	/* All non-embryonic sessions get a keepalive timer. */
	if (!tcp_timer_active(tp, TT_KEEP))
		tcp_timer_activate(tp, TT_KEEP,
		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
		    TP_KEEPINIT(tp));

	/*
	 * Make sure critical variables are initialized
	 * if transitioning while in Recovery.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs = max(1,
			    tp->snd_nxt - tp->snd_una);
	}

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
 */
static void
tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
{
	/* Nothing to tear down for the default stack. */
	INP_WLOCK_ASSERT(tp->t_inpcb);
}

1112
/*
1113
 * Target size of TCP PCB hash tables. Must be a power of two.
1114
1115
1116
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
1117
1118
 */
#ifndef TCBHASHSIZE
1119
#define TCBHASHSIZE	0
1120
#endif
Rodney W. Grimes's avatar
Rodney W. Grimes committed
1121

1122
/*
1123
1124
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
1125
 * separate because the tcpcb structure is exported to userland for sysctl
1126
 * parsing purposes, which do not know about callouts.
1127
 */
1128
1129
struct tcpcb_mem {
	struct	tcpcb		tcb;
Mike Silbersack's avatar
Mike Silbersack committed
1130
	struct	tcp_timer	tt;
1131
	struct	cc_var		ccv;
1132
#ifdef TCP_HHOOK
1133
	struct	osd		osd;
1134
#endif
1135
1136
};

1137
VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone);
1138
#define	V_tcpcb_zone			VNET(tcpcb_zone)
1139

1140
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
1141
1142
MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");

1143
static struct mtx isn_mtx;
1144

1145
1146
1147
1148
#define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
#define	ISN_LOCK()	mtx_lock(&isn_mtx)
#define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)

Rodney W. Grimes's avatar
Rodney W. Grimes committed
1149
/*
1150
 * TCP initialization.
Rodney W. Grimes's avatar
Rodney W. Grimes committed
1151
 */
1152
1153
1154
1155
static void
tcp_zone_change(void *tag)
{

1156
	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
1157
	uma_zone_set_max(V_tcpcb_zone, maxsockets);
1158
	tcp_tw_zone_change();
1159
1160
}

1161
1162
1163
/*
 * UMA zone item constructor for TCP inpcbs: initialize the per-inpcb
 * lock.  Always succeeds.
 */
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "tcpinp");
	return (0);
}

1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
/*
 * Take a value and get the next power of 2 that doesn't overflow.
 * Used to size the tcp_inpcb hash buckets.
 */
static int
maketcp_hashsize(int size)
{
	int hashsize;

	/*
	 * Auto tune: round up to the power of 2 just above "size".
	 * If that shift overflowed, drop back one power of 2.
	 */
	hashsize = 1 << fls(size);
	if (hashsize < size)
		hashsize = 1 << (fls(size) - 1);
	return (hashsize);
}

1191
1192
static volatile int next_tcp_stack_id = 1;

1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
/*
 * Register a TCP function block with the name provided in the names
 * array.  (Note that this function does NOT automatically register
 * blk->tfb_tcp_block_name as a stack name.  Therefore, you should
 * explicitly include blk->tfb_tcp_block_name in the list of names if
 * you wish to register the stack with that name.)
 *
 * Either all name registrations will succeed or all will fail.  If
 * a name registration fails, the function will update the num_names
 * argument to point to the array index of the name that encountered
 * the failure.
 *
 * Returns 0 on success, or an error code on failure.
 */
1207
int
1208
1209
register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
    const char *names[], int *num_names)
1210
1211
1212
{
	struct tcp_function *n;
	struct tcp_function_set fs;
1213
1214
1215
1216
1217
	int error, i;

	KASSERT(names != NULL && *num_names > 0,
	    ("%s: Called with 0-length name list", __func__));
	KASSERT(names != NULL, ("%s: Called with NULL name list", __func__));
1218
1219
	KASSERT(rw_initialized(&tcp_function_lock),
	    ("%s: called too early", __func__));
1220
1221
1222
1223
1224

	if ((blk->tfb_tcp_output == NULL) ||
	    (blk->tfb_tcp_do_segment == NULL) ||
	    (blk->tfb_tcp_ctloutput == NULL) ||
	    (strlen(blk->tfb_tcp_block_name) == 0)) {
1225
		/*
1226
1227
1228
		 * These functions are required and you
		 * need a name.
		 */
1229
		*num_names = 0;
1230
1231
1232
1233
1234
1235
1236
		return (EINVAL);
	}
	if (blk->tfb_tcp_timer_stop_all ||
	    blk->tfb_tcp_timer_activate ||
	    blk->tfb_tcp_timer_active ||
	    blk->tfb_tcp_timer_stop) {
		/*
1237
		 * If you define one timer function you
1238
1239
1240
1241
1242
1243
		 * must have them all.
		 */
		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
		    (blk->tfb_tcp_timer_activate == NULL) ||
		    (blk->tfb_tcp_timer_active == NULL) ||
		    (blk->tfb_tcp_timer_stop == NULL)) {
1244
1245
			*num_names = 0;
			return (EINVAL);
1246
1247
		}
	}
1248

1249
1250
1251
1252
1253
	if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
		*num_names = 0;
		return (EINVAL);
	}

1254
	refcount_init(&blk->tfb_refcnt, 0);
1255
	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
1256
1257
1258
1259
1260
1261
1262
1263
	for (i = 0; i < *num_names; i++) {
		n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
		if (n == NULL) {
			error = ENOMEM;
			goto cleanup;
		}
		n->tf_fb = blk;

1264
1265
		(void)strlcpy(fs.function_set_name, names[i],
		    sizeof(fs.function_set_name));
1266
1267
1268
1269
1270
1271
1272
1273
		rw_wlock(&tcp_function_lock);
		if (find_tcp_functions_locked(&fs) != NULL) {
			/* Duplicate name space not allowed */
			rw_wunlock(&tcp_function_lock);
			free(n, M_TCPFUNCTIONS);
			error = EALREADY;
			goto cleanup;
		}
1274
		(void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name));
1275
		TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
1276
		tcp_fb_cnt++;
1277
1278
		rw_wunlock(&tcp_function_lock);
	}
1279
	return(0);
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292

cleanup:
	/*
	 * Deregister the names we just added. Because registration failed
	 * for names[i], we don't need to deregister that name.
	 */
	*num_names = i;
	rw_wlock(&tcp_function_lock);
	while (--i >= 0) {
		TAILQ_FOREACH(n, &t_functions, tf_next) {
			if (!strncmp(n->tf_name, names[i],
			    TCP_FUNCTION_NAME_LEN_MAX)) {
				TAILQ_REMOVE(&t_functions, n, tf_next);
1293
				tcp_fb_cnt--;
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
				n->tf_fb = NULL;
				free(n, M_TCPFUNCTIONS);
				break;
			}
		}
	}
	rw_wunlock(&tcp_function_lock);
	return (error);
}

/*
 * Register a TCP function block using the name provided in the name
 * argument.
 *
 * Returns 0 on success, or an error code on failure.
 */
int
register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
    int wait)
{
	const char *single_name[1];
	int count;

	/* A NULL name means "register under the block's own name". */
	single_name[0] = (name != NULL) ? name : blk->tfb_tcp_block_name;
	count = 1;
	return (register_tcp_functions_as_names(blk, wait, single_name,
	    &count));
}

/*
 * Register a TCP function block using the name defined in
 * blk->tfb_tcp_block_name.
 *
 * Returns 0 on success, or an error code on failure.
 */
int
register_tcp_functions(struct tcp_function_block *blk, int wait)
{
	/* Passing a NULL name selects blk->tfb_tcp_block_name. */
	return (register_tcp_functions_as_name(blk, NULL, wait));
}
1338

1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
/*
 * Deregister all names associated with a function block. This
 * functionally removes the function block from use within the system.
 *
 * When called with a true quiesce argument, mark the function block
 * as being removed so no more stacks will use it and determine
 * whether the removal would succeed.
 *
 * When called with a false quiesce argument, actually attempt the
 * removal.
 *
 * When called with a force argument, attempt to switch all TCBs to
 * use the default stack instead of returning EBUSY.
 *
 * Returns 0 on success (or if the removal would succeed, or an error
 * code on failure.
 */
1356
int
1357
1358
deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
    bool force)
1359
1360
{
	struct tcp_function *f;
1361
1362

	if (blk == &tcp_def_funcblk) {
1363
1364
1365
1366
1367
1368
1369
1370
1371
		/* You can't un-register the default */
		return (EPERM);
	}
	rw_wlock(&tcp_function_lock);
	if (blk == tcp_func_set_ptr) {
		/* You can't free the current default */
		rw_wunlock(&tcp_function_lock);
		return (EBUSY);
	}
1372
1373
1374
1375
1376
1377
1378
	/* Mark the block so no more stacks can use it. */
	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
	/*
	 * If TCBs are still attached to the stack, attempt to switch them
	 * to the default stack.
	 */
	if (force && blk->tfb_refcnt) {
1379
1380
		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
		    INPLOOKUP_WLOCKPCB);
1381
1382
1383
1384
1385
1386
1387
1388
1389
		struct inpcb *inp;
		struct tcpcb *tp;
		VNET_ITERATOR_DECL(vnet_iter);

		rw_wunlock(&tcp_function_lock);

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
1390
1391
			while ((inp = inp_next(&inpi)) != NULL) {
				if (inp->inp_flags & INP_TIMEWAIT)
1392
1393
					continue;
				tp = intotcpcb(inp);
1394
				if (tp == NULL || tp->t_fb != blk)
1395
1396
1397
1398
1399
1400
1401
1402
1403
					continue;
				tcp_switch_back_to_default(tp);
			}
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		rw_wlock(&tcp_function_lock);
	}
1404
	if (blk->tfb_refcnt) {