/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)


MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving, which slowly drops
 *    the congestion window so that the ack clock can
 *    be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named), which
 *    stops us using the number of dup acks and instead
 *    uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC 4737 and the Tail-Loss Probe draft
 *    of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then to assure that the RACK state matches the
 * connection state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
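/*
 * Tunable defaults; most of these are exported as sysctls
 * by rack_init_sysctls() further down in this file.
 */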
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS worth of time_between slots */
static int32_t rack_hw_rate_caps = 1; /* 1; */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Minimum timeout in microseconds */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;	/* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250ms in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
/*
 * Currently regular tcp has a rto_min of 30ms; the
 * backoff doubles 12 times, so the retransmissions sum to
 * 30ms * (2^12 - 1) = 122.850 seconds before a
 * connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;		/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds)  to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to b/w */
/* i.e. the up/down provide the upper and lower bound */
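/* e.g. with the defaults below (up = 2, down = 4), a goodput move inside (-4%, +2%) counts as no change */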
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];


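/* The RTO: smoothed rtt plus four times the rtt variance, floored at rack_rto_min. */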
#define	RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

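/* Clamp (value + slop) into the range [tvmin, tvmax]. */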
#define	RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;	 \
	if ((u_long)(tv) < (u_long)(tvmin)) \
		(tv) = (tvmin); \
	if ((u_long)(tv) > (u_long)(tvmax)) \
		(tv) = (tvmax); \
} while (0)

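/* Forward declarations for the handlers and helpers that make up the RACK stack. */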
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
   uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
		 uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);

static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
   struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
			    tcp_seq th_ack, int line, uint8_t quality);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
     tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter = 0;

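/*
 * Swap RACK's saved newreno beta/beta_ecn values into the newreno
 * CC module (via its ctl_output handler) while pacing, stashing the
 * stack's current values in rc_saved_beta so rack_undo_cc_pacing()
 * can restore them.  A no-op for any CC algorithm other than newreno.
 */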
static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct newreno old, *ptr;
	struct tcpcb *tp;
	int error;

	if (rack->rc_pacing_cc_set)
		return;

	tp = rack->rc_tp;
	if (tp->cc_algo == NULL) {
		/* Tcb is leaving */
		printf("No cc algorithm?\n");
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno we can't play games with beta! */
		goto out;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (CC_ALGO(tp)->ctl_output == NULL)  {
		/* Huh, why does new_reno no longer have a set function? */
		goto out;
	}
	if (ptr == NULL) {
		/* Just the default values */
		old.beta = V_newreno_beta;
		old.beta_ecn = V_newreno_beta_ecn;
		old.newreno_flags = 0;
	} else {
		old.beta = ptr->beta;
		old.beta_ecn = ptr->beta_ecn;
		old.newreno_flags = ptr->newreno_flags;
	}
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta.beta;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error)  {
		goto out;
	}
	/*
	 * Hack alert: we need to set the flag in our newreno_flags
	 * so that ABE behavior is also applied.
	 */
	((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		goto out;
	}
	/* Save off the original values for restoral */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		if (ptr) {
			log.u_bbr.flex1 = ptr->beta;
			log.u_bbr.flex2 = ptr->beta_ecn;
			log.u_bbr.flex3 = ptr->newreno_flags;
		}
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 3;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
			       0, &log, false, NULL, NULL, 0, &tv);
	}
}

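/*
 * Undo rack_set_cc_pacing(): copy the displaced newreno beta/beta_ecn
 * values back into the CC module and keep our own values in
 * rc_saved_beta for the next time pacing is enabled.
 */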
static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	struct newreno old, *ptr;
	struct tcpcb *tp;

	if (rack->rc_pacing_cc_set == 0)
		return;
	tp = rack->rc_tp;
	rack->rc_pacing_cc_set = 0;
	if (tp->cc_algo == NULL)
		/* Tcb is leaving */
		return;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno nothing to do! */
		return;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (ptr == NULL) {
		/*
		 * This happens at rack_fini() if the
		 * cc module gets freed on us. In that
		 * case we lose our "new" settings but
		 * that's ok, since the tcb is going away anyway.
		 */
		return;
	}
	/* Copy out the values we had set */
	memcpy(&old, ptr, sizeof(struct newreno));
	/* Copy back in the original values */
	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
	/* Now save back the values we had set in (for when pacing is restored) */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 4;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
			       0, &log, false, NULL, NULL, 0, &tv);
	}
}

#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
	/* Keep in mind that t_maxpeakrate is in B/s. */
	uint64_t peak;
	peak = uqmax((tp->t_maxseg * 2),
		     (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
	tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
}
#endif

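/*
 * Sysctl handler: writing a 1 zeroes all of the RACK
 * counters defined above.
 */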
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;
	int i;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return (error);

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_calc_zero);
		counter_u64_zero(rack_calc_nonzero);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_per_timer_hole);
		counter_u64_zero(rack_large_ackcmp);
		counter_u64_zero(rack_small_ackcmp);
		counter_u64_zero(rack_persists_sends);
		counter_u64_zero(rack_persists_acks);
		counter_u64_zero(rack_persists_loss);
		counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_sbsndptr_wrong);
		counter_u64_zero(rack_sbsndptr_right);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
			counter_u64_zero(rack_proc_comp_ack[i]);
		}
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);
	}
	rack_clear_counter = 0;
	return (0);
}

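/*
 * Build the sysctl tree for the stack: sub-nodes (probertt, pacing,
 * timely, timers, tlp, measure, features, hw_pacing, misc) holding
 * the tunables above, plus the counter and sack_attack statistics
 * nodes.
 */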
static void
rack_init_sysctls(void)
{
	int i;
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_features;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
	struct sysctl_oid *rack_hw_pacing;

	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low");
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    & rack_time_between_probertt, 96000000,
	    "How many useconds between the lowest rtt falling must past before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 200000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filters life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
	/* Pacing related sysctls */
	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
	    &rack_max_per_above, 30,
	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_to_one", CTLFLAG_RW,
	    &rack_pace_one_seg, 0,
	    "Do we allow low b/w pacing of 1MSS instead of two");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
	    &rack_limit_time_with_srtt, 0,
	    "Do we limit pacing time based on srtt");