rack.c 636 KB
Newer Older
1
/*-
2
 * Copyright (c) 2016-2020 Netflix, Inc.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
34
#include "opt_ratelimit.h"
35
#include "opt_kern_tls.h"
36
#include <sys/param.h>
37
#include <sys/arb.h>
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
53
#ifdef STATS
54
55
#include <sys/qmath.h>
#include <sys/tree.h>
56
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
57
58
#else
#include <sys/tree.h>
59
60
61
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
62
#include <sys/tim_filter.h>
63
64
65
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
66
#include <sys/protosw.h>
67
68
69
70
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
71
72
73
#include <vm/uma.h>

#include <net/route.h>
74
#include <net/route/nhop.h>
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
89
#include <netinet/tcp.h>
90
#define	TCPOUTFLAGS
91
92
93
94
95
96
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
97
#include <netinet/tcp_ratelimit.h>
98
#include <netinet/tcp_accounting.h>
99
100
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
101
#include <netinet/cc/cc_newreno.h>
102
#include <netinet/tcp_fastopen.h>
103
#include <netinet/tcp_lro.h>
104
105
106
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

142
143
144
145
146
147
148
149
150
VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)


MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *    the congestion window so that the ack clock can
 *    be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it's named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et.al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_tlp_thresh = 1;
190
191
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
192
static int32_t rack_reorder_thresh = 2;
193
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
194
						 * - 60 seconds */
195
static uint8_t rack_req_measurements = 1;
196
197
198
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
199
200
201
202
203
204
205
206
207
208
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 1; /* 1; */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;

static int32_t rack_pkt_delay = 1000;
209
static int32_t rack_send_a_lot_in_prr = 1;
210
static int32_t rack_min_to = 1000;	/* Number of microsecond  min timeout */
211
212
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
213
214
215
216
217
218
219
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
220
static int32_t rack_dsack_std_based = 0x3;	/* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
221
222
223
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
224
225
226
227
228
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
229
230
231
232
static int32_t rack_persist_min = 250000;	/* 250 ms expressed in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
233
static int32_t rack_limit_time_with_srtt = 0;
234
235
236
237
238
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
239
240
241
242
243
244
/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
245
246
247
248
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
249
250
251
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usec's */
252
253
254
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
255
static int32_t rack_pace_every_seg = 0;
256
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
257
static int32_t rack_slot_reduction = 4;
258
259
260
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
261
static int32_t rack_def_profile = 0;
262

263
264
265
266
267
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path. NOTE(review): comment used to say 100% - confirm intended value */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
285
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
286
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
287
288
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;		/* Equal to delayed ack time */
289
290
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
291
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds)  to count as a lowering */
292
293
294
295
296
297
298
299
300
301
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decided a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
302
/* ie the up/down provide the upper and lower bound */
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* Rounds of pushing. NOTE(review): comment used to say one round but the value is 3 - confirm */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
324
static uint64_t rack_bw_rate_cap = 0;
325
326
327

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
328
329
330
331
332
333
334
335
336
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
337
338
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
339
counter_u64_t rack_saw_enobuf;
340
counter_u64_t rack_saw_enobuf_hw;
341
counter_u64_t rack_saw_enetunreach;
342
counter_u64_t rack_per_timer_hole;
343
344
345
346
347
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
348
349
350
351
352
353
354
355
356
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
357
counter_u64_t rack_hot_alloc;
358
359
360
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
361
counter_u64_t rack_to_alloc_limited;
362
363
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
364

365
366
367
368
369
370
371
372
373
374
375
#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

376
377
378
counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
379
380
381
382
383
384
385
386
387
388
389
390
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

391
392
393
394
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
395
counter_u64_t rack_collapsed_win;
396
counter_u64_t rack_tlp_does_nada;
397
counter_u64_t rack_try_scwnd;
398
399
400
401
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;
402
403
404
405
406
407
408
409

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

410
411
412

/*
 * Retransmit value for a tcpcb: t_srtt plus four times t_rttvar,
 * but never less than rack_rto_min.
 */
#define	RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

413
414
/*
 * Store (value) + (slop) into tv and then clamp tv to the inclusive
 * range [tvmin, tvmax].  Both bounds are compared as u_long.
 *
 * Every argument is parenthesized in the expansion so that an expression
 * argument (for example a conditional expression passed as slop) is
 * evaluated as a unit.  tv is evaluated more than once, so it must be a
 * side-effect free lvalue.
 */
#define	RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + (slop);	 \
	if ((u_long)(tv) < (u_long)(tvmin)) \
		(tv) = (tvmin); \
	if ((u_long)(tv) > (u_long)(tvmax)) \
		(tv) = (tvmax); \
} while (0)

421
422
423
424
425
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
426
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
427
428
429
430
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
431
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
432
433
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
434
   uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
435
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
436
437
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
438
439
440
441
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
442
443
rack_cong_signal(struct tcpcb *tp,
		 uint32_t type, uint32_t ack);
444
445
446
447
448
449
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
450
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
451
static void
452
453
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
454
    uint8_t iptos);
455
456
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
457
458
459
460
461
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
462

463
464
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
465
466
   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
   struct rack_sendmap *rsm, uint8_t quality);
467
468
469
470
471
472
473
474
475
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
476
477
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
478
			    tcp_seq th_ack, int line, uint8_t quality);
479
480
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
481
482
483
484
485
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
486
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
487
488
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
489
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
490
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
491

492
493
494
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
495
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
496
497
498
499
500
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
501
    uint32_t cts, int *moved_two);
502
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
503
504
505
506
507
508
509
510
511
512
513
514
515
516
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
517
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
518
519
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
520
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
521
522
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
523
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
524
525
526
527
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
528
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
529
530
531
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
532
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
533
534
535
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
536
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
537
538
539
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
540
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
541
542
543
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
544
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
545
546
547
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
548
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
549
550
551
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
552
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
553
554
555
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
556
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
557
558
559
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
560
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
561
562
563
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
564
565
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
566
static void
567
568
569
570
571
     tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);
572
573
574

int32_t rack_clear_counter=0;

575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct newreno old, *ptr;
	struct tcpcb *tp;
	int error;

	if (rack->rc_pacing_cc_set)
		return;

	tp = rack->rc_tp;
	if (tp->cc_algo == NULL) {
		/* Tcb is leaving */
		printf("No cc algorithm?\n");
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno we can't play games with beta! */
		goto out;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (CC_ALGO(tp)->ctl_output == NULL)  {
		/* Huh, why does new_reno no longer have a set function? */
		printf("no ctl_output for algo:%s\n", tp->cc_algo->name);
		goto out;
	}
	if (ptr == NULL) {
		/* Just the default values */
		old.beta = V_newreno_beta_ecn;
		old.beta_ecn = V_newreno_beta_ecn;
		old.newreno_flags = 0;
	} else {
		old.beta = ptr->beta;
		old.beta_ecn = ptr->beta_ecn;
		old.newreno_flags = ptr->newreno_flags;
	}
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta.beta;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error)  {
		printf("Error returned by ctl_output %d\n", error);
		goto out;
	}
	/*
	 * Hack alert we need to set in our newreno_flags
	 * so that Abe behavior is also applied.
	 */
	((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		printf("Error returned by ctl_output %d\n", error);
		goto out;
	}
	/* Save off the original values for restoral */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		if (ptr) {
			log.u_bbr.flex1 = ptr->beta;
			log.u_bbr.flex2 = ptr->beta_ecn;
			log.u_bbr.flex3 = ptr->newreno_flags;
		}
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 3;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
			       0, &log, false, NULL, NULL, 0, &tv);
	}
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	struct newreno old, *ptr;
	struct tcpcb *tp;

	if (rack->rc_pacing_cc_set == 0)
		return;
	tp = rack->rc_tp;
	rack->rc_pacing_cc_set = 0;
	if (tp->cc_algo == NULL)
		/* Tcb is leaving */
		return;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno nothing to do! */
		return;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (ptr == NULL) {
		/*
		 * This happens at rack_fini() if the
		 * cc module gets freed on us. In that
		 * case we loose our "new" settings but
		 * thats ok, since the tcb is going away anyway.
		 */
		return;
	}
	/* Grab out our set values */
	memcpy(&old, ptr, sizeof(struct newreno));
	/* Copy back in the original values */
	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
	/* Now save back the values we had set in (for when pacing is restored) */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 4;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
			       0, &log, false, NULL, NULL, 0, &tv);
	}
}

#ifdef NETFLIX_PEAKRATE
/*
 * Recompute tp->t_peakrate_thr as t_maxpeakrate * t_srtt divided by
 * HPTS_USEC_IN_SEC, with a floor of two t_maxseg and a ceiling of
 * UINT32_MAX.  Keep in mind that t_maxpeakrate is in B/s.
 */
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
	uint64_t two_segs, rate_window, thr;

	two_segs = (tp->t_maxseg * 2);
	rate_window = ((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) /
	    (uint64_t)HPTS_USEC_IN_SEC;
	thr = uqmax(two_segs, rate_window);
	/* Clamp before narrowing so the stored 32-bit value cannot wrap. */
	tp->t_peakrate_thr = (uint32_t)uqmin(thr, UINT32_MAX);
}
#endif

735
736
737
738
739
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;
740
	int i;
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
768
769
		counter_u64_zero(rack_calc_zero);
		counter_u64_zero(rack_calc_nonzero);
770
771
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
772
		counter_u64_zero(rack_saw_enobuf_hw);
773
		counter_u64_zero(rack_saw_enetunreach);
774
		counter_u64_zero(rack_per_timer_hole);
775
776
777
778
779
		counter_u64_zero(rack_large_ackcmp);
		counter_u64_zero(rack_small_ackcmp);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
780
781
782
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
783
784
785
786
787
788
789
790
791
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_sbsndptr_wrong);
		counter_u64_zero(rack_sbsndptr_right);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
792
793
794
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
795
		counter_u64_zero(rack_to_alloc_limited);
796
797
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
798
799
800
801
802
		for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
			counter_u64_zero(rack_proc_comp_ack[i]);
		}
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
803
		counter_u64_zero(rack_find_high);
804
805
806
807
808
809
810
811
812
813
814
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
815
816
817
818
819
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
820
		counter_u64_zero(rack_try_scwnd);
821
		counter_u64_zero(rack_collapsed_win);
822
823
824
825
826
827
	}
	rack_clear_counter = 0;
	return (0);
}

static void
828
rack_init_sysctls(void)
829
{
830
	int i;
831
832
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
833
834
835
836
837
838
839
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
840
	struct sysctl_oid *rack_hw_pacing;
841

842
843
844
845
846
847
848
849
850
851
852
853
	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
854
855
856
857
858
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method , USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low ");
859
860
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
861
	    SYSCTL_CHILDREN(rack_sysctl_root),
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    & rack_time_between_probertt, 96000000,
	    "How many useconds between the lowest rtt falling must past before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 200000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filters life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
961
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
962
963
964
965
966
967
968
969
970
971
972
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
	/* Pacing related sysctls */
	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
973
	    SYSCTL_CHILDREN(rack_sysctl_root),
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
	    &rack_max_per_above, 30,
	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_to_one", CTLFLAG_RW,
	    &rack_pace_one_seg, 0,
	    "Do we allow low b/w pacing of 1MSS instead of two");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
	    &rack_limit_time_with_srtt, 0,
	    "Do we limit pacing time based on srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "init_win", CTLFLAG_RW,
	    &rack_default_init_window, 0,
	    "Do we have a rack initial window 0 = system default");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
For faster browsing, not all history is shown. View entire blame