/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"
#include "opt_tcpdebug.h"

/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp. Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way it is used by Rack and BBR, it can
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot. This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add near
 * its top a check to prevent unwanted calls (from user land or the
 * arrival of incoming ACKs). So it would add something like:
 *
 * if (tcp_in_hpts(inp))
 *    return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare bones example and the stack will probably
 * have more considerations than just the above.
 *
 * Now the second function (actually two functions, I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this? To keep processor locality
 * and/or to avoid having to untangle any recursive
 * locks. The input function now is hooked to the new LRO
 * system as well.
 *
 * In order to use the input redirection function the
 * tcp stack must define an input function for
 * tfb_do_queued_segments(). This function understands
 * how to dequeue an array of packets that were input and
 * knows how to call the correct processing routine.
 *
 * Locking in this is important as well, so most likely the
 * stack will need to define the tfb_do_segment_nounlock()
 * function, splitting tfb_do_segment() into two parts: a main
 * processing part that does not unlock the INP and returns a value
 * of 1 or 0. It returns 0 if all is well and the lock was not
 * released. It returns 1 if we had to destroy the TCB (a reset
 * received etc). The remains of tfb_do_segment() then become just
 * a simple call into the tfb_do_segment_nounlock() function,
 * checking the return code and possibly unlocking.
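 *
 * A minimal sketch of that split, using hypothetical stack function
 * names (the argument list is whatever the stack's tfb_do_segment()
 * already takes); on a return of 1 the TCB is gone and the lock was
 * already dropped, so the wrapper has nothing left to do:
 *
 *    static void
 *    my_stack_do_segment(struct mbuf *m, struct tcphdr *th,
 *        struct socket *so, struct tcpcb *tp, int drop_hdrlen,
 *        int tlen, uint8_t iptos)
 *    {
 *        if (my_stack_do_segment_nounlock(m, th, so, tp,
 *            drop_hdrlen, tlen, iptos) == 0)
 *            INP_WUNLOCK(tp->t_inpcb);
 *    }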
 *
 * The stack must also set the flag on the INP that says it supports
 * this feature, i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well, INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input. The second tells the LRO code that it is okay
 * to defer input even if a SACK arrives, and to let
 * the current hpts timer run (this is usually set when
 * a rack timer is up so we know SACKs are happening
 * on the connection already and don't want to wake up yet).
 *
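 * As an example, a stack might advertise mbuf-queue support when a
 * connection is handed to it (a sketch; when to set the other two
 * flags is per-stack policy):
 *
 *    inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 *
 * and, around arming a pacing timer for output:
 *
 *    inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
 *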
 * There is a common function within the rack_bbr_common code,
 * i.e. ctf_do_queued_segments(). This function
 * knows how to take the input queue of packets from
 * tp->t_in_pkts and process them digging out
 * all the arguments, calling any bpf tap and
 * calling into tfb_do_segment_nounlock(). The common
 * function (ctf_do_queued_segments()) requires that
 * you have defined the tfb_do_segment_nounlock() as
 * described above.
 *
 * The second feature of the input side of hpts is the
 * dropping of a connection. This is due to the way that
 * locking may have occurred on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *    tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/route.h>
#include <net/vnet.h>

#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

#ifdef tcpdebug
#include <netinet/tcp_debug.h>
#endif				/* tcpdebug */
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif

/*
 * The hpts uses a 102400-slot wheel. The wheel
 * defines the time in 10 usec increments (102400 x 10).
 * This gives a range of 10 usec - 1.024 seconds to place
 * an entry within. If the user requests more than
 * 1.024 seconds, a remainder is attached and the hpts,
 * when seeing the remainder, will re-insert the
 * inpcb forward in time from where it is until
 * the remainder is zero.
 */
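
/*
 * A worked example of the remainder mechanism above: a request of
 * 2,000,000 usec is 200,000 slots, but only NUM_OF_HPTSI_SLOTS (102400)
 * fit on the wheel. The entry is placed as far out as the wheel allows,
 * the leftover ~97,600 slots are kept in inp_hpts_request, and each
 * time the wheel reaches the entry it is pushed forward again until
 * that remainder drains to zero.
 */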

#define NUM_OF_HPTSI_SLOTS 102400

/* Each hpts has its own p_mtx which is used for locking */
#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
TAILQ_HEAD(hptsh, inpcb);
struct tcp_hpts_entry {
	/* Cache line 0x00 */
	struct mtx p_mtx;	/* Mutex for hpts */
	struct timeval p_mysleep;	/* Our min sleep time */
	uint64_t syscall_cnt;
	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
	uint32_t p_runningslot; /* Current tick we are at if we are running */
	uint32_t p_prev_slot;	/* Previous slot we were on */
	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
	uint32_t p_nxt_slot;	/* The next slot outside the current range of
				 * slots that the hpts is running on. */
	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
	uint32_t p_lasttick;	/* Last tick before the current one */
	uint8_t p_direct_wake :1, /* boolean */
		p_on_min_sleep:1, /* boolean */
		p_hpts_wake_scheduled:1, /* boolean */
		p_avail:5;
	uint8_t p_fill[3];	  /* Fill to 32 bits */
	/* Cache line 0x40 */
	void *p_inp;
	struct hptsh p_input;	/* For the tcp-input runner */
	/* Hptsi wheel */
	struct hptsh *p_hptss;
	int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
					 * of 255ms */
	uint32_t overidden_sleep;	/* what was overridden by min-sleep, for logging */
	uint32_t saved_lasttick;	/* for logging */
	uint32_t saved_curtick;		/* for logging */
	uint32_t saved_curslot;		/* for logging */
	uint32_t saved_prev_slot;       /* for logging */
	uint32_t p_delayed_by;	/* How much were we delayed by */
	/* Cache line 0x80 */
	struct sysctl_ctx_list hpts_ctx;
	struct sysctl_oid *hpts_root;
	struct intr_event *ie;
	void *ie_cookie;
	uint16_t p_num;		/* The hpts number one per cpu */
	uint16_t p_cpu;		/* The hpts CPU */
	/* There is extra space in here */
	/* Cache line 0x100 */
	struct callout co __aligned(CACHE_LINE_SIZE);
}               __aligned(CACHE_LINE_SIZE);

struct tcp_hptsi {
	struct proc *rp_proc;	/* Process structure for hpts */
	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
	uint32_t *cts_last_ran;
	uint32_t rp_num_hptss;	/* Number of hpts threads */
};

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static struct tcp_hptsi tcp_pace;
static uint32_t *cts_last_ran;
static int hpts_does_tp_logging = 0;
static int hpts_use_assigned_cpu = 1;
static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;

static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;


SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP Hpts statistics");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)
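
/*
 * For example, timersub({2 s, 100 us}, {1 s, 200000 us}, &res) borrows
 * from the seconds field and leaves res = {0 s, 800100 us}.
 */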

static int32_t tcp_hpts_precision = 120;

static struct hpts_domain_info {
	int count;
	int cpu[MAXCPU];
} hpts_domains[MAXMEMDOM];

counter_u64_t hpts_hopelessly_behind;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

counter_u64_t combined_wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t hpts_direct_call;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
    &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");

counter_u64_t hpts_wake_timeout;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
    &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");

counter_u64_t hpts_direct_awakening;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
    &hpts_direct_awakening, "Number of times hpts threads woke up via a direct awakening");

counter_u64_t hpts_back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
    &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep having found no work");

counter_u64_t cpu_uses_flowid;
counter_u64_t cpu_uses_random;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
    &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
    &cpu_uses_random, "Number of times when setting cpuid we used a random value");

TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
    &tcp_bind_threads, 2,
    "Thread Binding tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
    &tcp_use_irq_cpu, 0,
    "Use of irq CPU  tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
    &conn_cnt_thresh, 0,
    "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
    &hpts_use_assigned_cpu, 0,
    "Do we start any hpts timer on the assigned cpu?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
    &hpts_uses_oldest, OLDEST_THRESHOLD,
    "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
    &dynamic_min_sleep, 250,
    "What is the dynamic minsleep value?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
    &dynamic_max_sleep, 5000,
    "What is the dynamic maxsleep value?");


static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < dynamic_min_sleep) ||
		    (new > HPTS_MAX_SLEEP_ALLOWED))
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}

static int
sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = tcp_min_hptsi_time;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < LOWEST_SLEEP_ALLOWED)
			error = EINVAL;
		else
			tcp_min_hptsi_time = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &hpts_sleep_max, 0,
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep");

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_min_hptsi_time, 0,
    &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
    "The minimum time the hpts must sleep before processing more slots");

static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
static int tcp_hpts_no_wake_over_thresh = 1;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
    &ticks_indicate_more_sleep, 0,
    "If we only process this many or less on a timeout, we need longer sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
    &ticks_indicate_less_sleep, 0,
    "If we process this many or more on a timeout, we need less sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
    &tcp_hpts_no_wake_over_thresh, 0,
    "When we are over the threshold on the pacer do we prohibit wakeups?");

static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
	     int slots_to_run, int idx, int from_callout)
{
	union tcp_log_stackspecific log;
	/*
	 * Unused logs are
	 * 64 bit - delRate, rttProp, bw_inuse
	 * 16 bit - cwnd_gain
	 *  8 bit - bbr_state, bbr_substate, inhpts, ininput;
	 */
	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
	log.u_bbr.flex7 = hpts->p_cpu;
	log.u_bbr.flex8 = (uint8_t)from_callout;
	log.u_bbr.inflight = slots_to_run;
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
	log.u_bbr.pacing_gain = hpts->p_cpu;
	log.u_bbr.pkt_epoch = hpts->p_runningslot;
	log.u_bbr.use_lt_bw = 1;
	TCP_LOG_EVENTP(tp, NULL,
		       &tp->t_inpcb->inp_socket->so_rcv,
		       &tp->t_inpcb->inp_socket->so_snd,
		       BBR_LOG_HPTSDIAG, 0,
		       0, &log, false, tv);
}

static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);

	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
		hpts->p_direct_wake = 0;
		return;
	}
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	KASSERT(hpts->p_on_queue_cnt >= 0,
		("Hpts goes negative inp:%p hpts:%p",
		 inp, hpts));
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
		((noref == 0) && (inp->inp_in_hpts == 0)),
		("%s: hpts:%p inp:%p already on the hpts?",
		 __FUNCTION__, hpts, inp));
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_input != 0,
		("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	KASSERT(hpts->p_on_inqueue_cnt >= 0,
		("Hpts in goes negative inp:%p hpts:%p",
		 inp, hpts));
	KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
		 ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
		("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
		 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_input == 0,
		("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

static struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		 hpts, __LINE__));
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		hpts, __LINE__));
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t ret;

	ret = in_pcbrele_wlocked(inp);
	KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP_LOCKED but it
 * does not matter, the hpts lock is the key
 * but the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can use one or both values together
 * and get two actions.
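 *
 * For example, a caller holding the INP lock could pull a connection
 * off both wheels at once (a sketch, assuming the tcp_hpts_remove()
 * wrapper macro that supplies __LINE__ to this function):
 *
 *     tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT | HPTS_REMOVE_INPUT);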
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
	/*
	 * Given a slot on the wheel, what slot
	 * is it plus 'plus' slots out?
	 */
	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Given a timestamp in ticks (by default a
	 * tick is 10 usec, i.e. the width of one slot,
	 * so to get real time one would multiply by 10),
	 * map it onto our limited-space wheel.
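	 * For example (a worked instance of the modulo
	 * below), 102401 wticks maps to wheel slot 1.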
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
hpts_slots_diff(int prev_slot, int slot_now)
{
	/*
	 * Given two slots that are somewhere
	 * on our wheel, how far apart are they?
	 */
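	/*
	 * A worked instance of the wrap case below: prev_slot == 102399
	 * and slot_now == 5 gives (102400 - 102399) + 5, i.e. the two
	 * are 6 slots apart.
	 */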
	if (slot_now > prev_slot)
		return (slot_now - prev_slot);
	else if (slot_now == prev_slot)
		/*
		 * Special case, same means we can go all of our
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

/*
 * Given a slot on the wheel that is the current time
 * mapped to the wheel (wheel_slot), what is the maximum
 * distance forward that can be obtained without
 * wrapping past either prev_slot or running_slot
 * depending on the hpts state? Also, if passed
 * a uint32_t *, fill it with the slot location.
 *
 * Note that if you do not give this function the current
 * time (that you think it is) mapped to the wheel slot,
 * then the results will not be what you expect and
 * could lead to invalid inserts.
 */
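/*
 * A worked instance of the sleeping case (a sketch of the arithmetic):
 * with the pacer inactive, p_prev_slot == 100 and a caller whose
 * current time maps to wheel_slot == 110, dis_to_travel is 10 and an
 * entry may be placed up to 102400 - 10 == 102390 slots forward.
 */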
static inline int32_t
max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
	uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
		end_slot = hpts->p_runningslot;
		/* Back up one tick */
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it. This puts us
		 * as far out as possible on the wheel.
		 */
		end_slot = hpts->p_prev_slot;
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
		/*
		 * Now we have close to the full wheel left minus the
		 * time it has been since the pacer went to sleep. Note
		 * that wheel_slot, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
		if (hpts->p_prev_slot != wheel_slot)
			dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
		else
			dis_to_travel = 1;
		/*
		 * dis_to_travel in this case is the space from when the
		 * pacer stopped (p_prev_slot) and where our wheel_slot
		 * is now. To know how many slots we can put it in we
		 * subtract from the wheel size. We would not want
		 * to place something after p_prev_slot or it will
		 * get run too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
	/*
	 * So how many slots are open between p_runningslot and
	 * p_cur_slot? That span is what is currently unavailable for
	 * insertion. Special case: when we are at the last slot,
	 * this gets 1, so that the answer to how many slots are
	 * available is all but 1.
	 */
	if (hpts->p_runningslot == hpts->p_cur_slot)
		dis_to_travel = 1;
	else
		dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
	/*
	 * How long has the pacer been running?
	 */
	if (hpts->p_cur_slot != wheel_slot) {
		/* The pacer is a bit late */
		pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
	} else {
		/* The pacer is right on time, now == pacer's start time */
		pacer_to_now = 0;
	}
	/*
	 * To get the number left that we can insert into, we simply
	 * subtract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
	 * real time (wheel_slot)?
	 */
	if (avail_on_wheel <= pacer_to_now) {
		/*
		 * Wheel wrap; we can't fit on the wheel. That
		 * is unusual, the system must be way overloaded!
		 * Insert into the assured slot, and return special
		 * "0".
		 */
		counter_u64_add(combined_wheel_wrap, 1);
		*target_slot = hpts->p_nxt_slot;
		return (0);
	} else {
		/*
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run). Take away the time
		 * from when the pacer started to now (wheel_slot)
		 * and that tells you how many slots are
		 * open that can be inserted into and won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
}

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	uint32_t need_wake = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		inp->inp_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) ||
		    (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts; we want it to run in the
			 * next slot. Note that in this state
			 * p_prev_slot == p_cur_slot.
			 */
			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
				need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * The hpts system is running and the caller
			 * was awoken by the hpts system.
			 * We can't allow you to go into the same slot we
			 * are in (we don't want a loop :-D).
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_runningslot;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
	/*
	 * Sanity checks for the pacer with invariants
	 * on insert.
	 */
	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
		("hpts:%p inp:%p slot:%d > max",
		 hpts, inp, inp_hptsslot));
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing an arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
		if (hpts->p_runningslot != hpts->p_cur_slot)
			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		KASSERT(yet_to_run <= distance,
			("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
			 hpts, inp, inp_hptsslot,
			 distance, yet_to_run,
			 hpts->p_runningslot, hpts->p_cur_slot));
	}
}
#endif

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
		       struct hpts_diag *diag, struct timeval *tv)
{
	uint32_t need_new_to = 0;
	uint32_t wheel_cts;
	int32_t wheel_slot, maxslots, last_slot;
	int cpu;
	int8_t need_wakeup = 0;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_prev_slot = hpts->p_prev_slot;
		diag->p_runningslot = hpts->p_runningslot;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
		diag->slot_req = slot;
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
	}
	KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
	if (slot == 0) {
		/* Immediate */
		tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
		return;
	}
	/* Get the current time relative to the wheel */
	wheel_cts = tcp_tv_to_hptstick(tv);
	/* Map it onto the wheel */
	wheel_slot = tick_to_wheel(wheel_cts);
	/* Now what's the max we can place it at? */
	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
	if (diag) {
		diag->wheel_slot = wheel_slot;
		diag->maxslots = maxslots;
		diag->wheel_cts = wheel_cts;
	}
	if (maxslots == 0) {
		/* The pacer is in a wheel wrap behind, yikes! */
		if (slot > 1) {
			/*
			 * Reduce by 1 to prevent a forever loop in
			 * case something else is wrong. Note this
			 * probably does not hurt, because if the pacer
			 * truly is that far behind we will be
			 * > 1 second late calling anyway.
			 */
			slot--;
		}
		inp->inp_hptsslot = last_slot;
		inp->inp_hpts_request = slot;
	} else if (maxslots >= slot) {
		/* It all fits on the wheel */
		inp->inp_hpts_request = 0;
		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
	} else {
		/* It does not fit */
		inp->inp_hpts_request = slot - maxslots;
		inp->inp_hptsslot = last_slot;
	}
	if (diag) {
		diag->slot_remaining = inp->inp_hpts_request;
		diag->inp_hptsslot = inp->inp_hptsslot;
	}
#ifdef INVARIANTS