tcp_hpts.c 67.2 KB
Newer Older
1
/*-
Warner Losh's avatar
Warner Losh committed
2
 * Copyright (c) 2016-2018 Netflix, Inc.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
31
#include "opt_rss.h"
32
#include "opt_tcpdebug.h"
33

34
35
36
37
/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
38
 * system for tcp. Its main purpose is to provide a mechanism for
39
40
41
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
42
 * First, and probably the main thing its used by Rack and BBR, it can
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot. This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_ouput() to be called in 550 useconds.
 * Note that if using this mechanism the stack will want to add near
 * its top a check to prevent unwanted calls (from user land or the
 * arrival of incoming ack's). So it would add something like:
 *
58
 * if (tcp_in_hpts(inp))
59
60
61
62
63
 *    return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare bones example and the stack will probably
 * have more consideration than just the above.
64
 *
65
 * In order to run input queued segments from the HPTS context the
66
 * tcp stack must define an input function for
67
68
 * tfb_do_queued_segments(). This function understands
 * how to dequeue a array of packets that were input and
69
 * knows how to call the correct processing routine.
70
 *
71
 * Locking in this is important as well so most likely the
72
73
74
75
76
77
78
79
 * stack will need to define the tfb_do_segment_nounlock()
 * splitting tfb_do_segment() into two parts. The main processing
 * part that does not unlock the INP and returns a value of 1 or 0.
 * It returns 0 if all is well and the lock was not released. It
 * returns 1 if we had to destroy the TCB (a reset received etc).
 * The remains of tfb_do_segment() then become just a simple call
 * to the tfb_do_segment_nounlock() function and check the return
 * code and possibly unlock.
80
 *
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
 * The stack must also set the flag on the INP that it supports this
 * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input. The second tells the LRO code that its okay
 * if a SACK arrives you can still defer input and let
 * the current hpts timer run (this is usually set when
 * a rack timer is up so we know SACK's are happening
 * on the connection already and don't want to wakeup yet).
 *
 * There is a common functions within the rack_bbr_common code
 * version i.e. ctf_do_queued_segments(). This function
96
97
98
 * knows how to take the input queue of packets from
 * tp->t_in_pkts and process them digging out
 * all the arguments, calling any bpf tap and
99
 * calling into tfb_do_segment_nounlock(). The common
100
 * function (ctf_do_queued_segments())  requires that
101
102
103
 * you have defined the tfb_do_segment_nounlock() as
 * described above.
 *
104
105
106
107
 * Now the second function the tcp_hpts system provides is the ability
 * to abort a connection later. Why would you want to do this?
 * To not have to worry about untangling any recursive locks.
 *
108
109
110
 * The second feature of the input side of hpts is the
 * dropping of a connection. This is due to the way that
 * locking may have occured on the INP_WLOCK. So if
111
112
113
 * a stack wants to drop a connection it calls:
 *
 *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
114
115
116
 *
 * To schedule the tcp_hpts system to call
 *
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
 *    tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
147
#include <vm/vm.h>
148
149
150
151

#include <net/route.h>
#include <net/vnet.h>

152
153
154
155
156
#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
177
#include <netinet/tcp_log_buf.h>
178
179
180
181
182
183
184
185

#ifdef tcpdebug
#include <netinet/tcp_debug.h>
#endif				/* tcpdebug */
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif

186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/*
 * The hpts uses a 102400 wheel. The wheel
 * defines the time in 10 usec increments (102400 x 10).
 * This gives a range of 10usec - 1024ms to place
 * an entry within. If the user requests more than
 * 1.024 second, a remainder is attached and the hpts
 * when seeing the remainder will re-insert the
 * inpcb forward in time from where it is until
 * the remainder is zero.
 */

#define NUM_OF_HPTSI_SLOTS 102400

/* Each hpts has its own p_mtx which is used for locking */
#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
201
202
#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
TAILQ_HEAD(hptsh, inpcb);
struct tcp_hpts_entry {
	/* Cache line 0x00 */
	struct mtx p_mtx;	/* Mutex for hpts */
	struct timeval p_mysleep;	/* Our min sleep time */
	uint64_t syscall_cnt;
	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
	uint32_t p_runningslot; /* Current tick we are at if we are running */
	uint32_t p_prev_slot;	/* Previous slot we were on */
	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
	uint32_t p_nxt_slot;	/* The next slot outside the current range of
				 * slots that the hpts is running on. */
	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
	uint32_t p_lasttick;	/* Last tick before the current one */
	uint8_t p_direct_wake :1, /* boolean */
		p_on_min_sleep:1, /* boolean */
		p_hpts_wake_scheduled:1, /* boolean */
		p_avail:5;
	uint8_t p_fill[3];	  /* Fill to 32 bits */
	/* Cache line 0x40 */
	void *p_inp;
227
	TAILQ_HEAD(, inpcb) p_dropq;	/* Delayed drop queue */
228
229
	/* Hptsi wheel */
	struct hptsh *p_hptss;
230
231
	uint32_t p_dropq_cnt;		/* Count on drop queue */
	uint32_t p_dropq_gencnt;
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
					 * of 255ms */
	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */
	uint32_t saved_lasttick;	/* for logging */
	uint32_t saved_curtick;		/* for logging */
	uint32_t saved_curslot;		/* for logging */
	uint32_t saved_prev_slot;       /* for logging */
	uint32_t p_delayed_by;	/* How much were we delayed by */
	/* Cache line 0x80 */
	struct sysctl_ctx_list hpts_ctx;
	struct sysctl_oid *hpts_root;
	struct intr_event *ie;
	void *ie_cookie;
	uint16_t p_num;		/* The hpts number one per cpu */
	uint16_t p_cpu;		/* The hpts CPU */
	/* There is extra space in here */
	/* Cache line 0x100 */
	struct callout co __aligned(CACHE_LINE_SIZE);
}               __aligned(CACHE_LINE_SIZE);

struct tcp_hptsi {
	struct proc *rp_proc;	/* Process structure for hpts */
	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
	uint32_t *cts_last_ran;
	uint32_t rp_num_hptss;	/* Number of hpts threads */
};

259
260
261
262
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
263
static int tcp_bind_threads = 2;
264
#endif
265
static int tcp_use_irq_cpu = 0;
266
static struct tcp_hptsi tcp_pace;
267
static uint32_t *cts_last_ran;
268
static int hpts_does_tp_logging = 0;
269
270
static int hpts_use_assigned_cpu = 1;
static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
271

272
static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
273
274
275
276
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
277
278
279
280
281
static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;


282

283
284
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");
285
286
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP Hpts statistics");
287
288
289
290
291
292
293
294
295
296
297
298
299

/*
 * Local copy of the classic timeval subtraction macro:
 * *vvp = *tvp - *uvp, normalizing tv_usec into [0, 1000000).
 */
#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t tcp_hpts_precision = 120;

300
static struct hpts_domain_info {
301
302
	int count;
	int cpu[MAXCPU];
303
} hpts_domains[MAXMEMDOM];
304

305
306
counter_u64_t hpts_hopelessly_behind;

307
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
308
309
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");
310
311
312

counter_u64_t hpts_loops;

313
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
314
315
316
317
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

318
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
319
320
    &back_tosleep, "Number of times hpts found no tcbs");

321
322
counter_u64_t combined_wheel_wrap;

323
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
324
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
325

326
counter_u64_t wheel_wrap;
327

328
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
329
    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
330

331
332
333
334
335
336
337
338
339
340
341
342
343
counter_u64_t hpts_direct_call;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
    &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");

counter_u64_t hpts_wake_timeout;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
    &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");

counter_u64_t hpts_direct_awakening;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
    &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring");
344

345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
counter_u64_t hpts_back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
    &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");

counter_u64_t cpu_uses_flowid;
counter_u64_t cpu_uses_random;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
    &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
    &cpu_uses_random, "Number of times when setting cpuid we used the a random value");

TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
    &tcp_bind_threads, 2,
    "Thread Binding tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
    &tcp_use_irq_cpu, 0,
    "Use of irq CPU  tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
    &conn_cnt_thresh, 0,
    "How many connections (below) make us use the callout based mechanism");
372
373
374
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
    &hpts_use_assigned_cpu, 0,
    "Do we start any hpts timer on the assigned cpu?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
    &hpts_uses_oldest, OLDEST_THRESHOLD,
    "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
    &dynamic_min_sleep, 250,
    "What is the dynamic minsleep value?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
    &dynamic_max_sleep, 5000,
    "What is the dynamic maxsleep value?");




391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409

static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
410
		if ((new < dynamic_min_sleep) ||
411
		    (new > HPTS_MAX_SLEEP_ALLOWED))
412
413
414
415
416
417
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}
418

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
static int
sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = tcp_min_hptsi_time;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < LOWEST_SLEEP_ALLOWED)
			error = EINVAL;
		else
			tcp_min_hptsi_time = new;
	}
	return (error);
}

436
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
437
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
438
    &hpts_sleep_max, 0,
439
440
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep");
441

442
443
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
444
    &tcp_min_hptsi_time, 0,
445
    &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
446
447
    "The minimum time the hpts must sleep before processing more slots");

448
449
450
451
452
453
454
455
456
457
458
459
460
static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
static int tcp_hpts_no_wake_over_thresh = 1;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
    &ticks_indicate_more_sleep, 0,
    "If we only process this many or less on a timeout, we need longer sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
    &ticks_indicate_less_sleep, 0,
    "If we process this many or more on a timeout, we need less sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
    &tcp_hpts_no_wake_over_thresh, 0,
    "When we are over the threshold on the pacer do we prohibit wakeups?");
461
462

static void
463
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
464
	     int slots_to_run, int idx, int from_callout)
465
{
466
	union tcp_log_stackspecific log;
467
468
469
470
471
472
	/*
	 * Unused logs are
	 * 64 bit - delRate, rttProp, bw_inuse
	 * 16 bit - cwnd_gain
	 *  8 bit - bbr_state, bbr_substate, inhpts, ininput;
	 */
473
474
475
476
477
478
479
	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
480
481
482
	log.u_bbr.flex7 = hpts->p_cpu;
	log.u_bbr.flex8 = (uint8_t)from_callout;
	log.u_bbr.inflight = slots_to_run;
483
484
485
486
487
488
489
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
490
491
492
	log.u_bbr.pacing_gain = hpts->p_cpu;
	log.u_bbr.pkt_epoch = hpts->p_runningslot;
	log.u_bbr.use_lt_bw = 1;
493
494
495
496
497
	TCP_LOG_EVENTP(tp, NULL,
		       &tp->t_inpcb->inp_socket->so_rcv,
		       &tp->t_inpcb->inp_socket->so_snd,
		       BBR_LOG_HPTSDIAG, 0,
		       0, &log, false, tv);
498
499
500
}

static void
501
tcp_wakehpts(struct tcp_hpts_entry *hpts)
502
{
503
	HPTS_MTX_ASSERT(hpts);
504

505
506
507
508
509
510
511
512
	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
		hpts->p_direct_wake = 0;
		return;
	}
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
513
514
515
}

static void
516
hpts_timeout_swi(void *arg)
517
{
518
519
520
521
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
522
523
524
525
526
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
527
528
529
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
530
531
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
532
533
534
	KASSERT(hpts->p_on_queue_cnt >= 0,
		("Hpts goes negative inp:%p hpts:%p",
		 inp, hpts));
535
536
537
538
539
540
541
542
543
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
544
545
546
547
548
549
550
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
		((noref == 0) && (inp->inp_in_hpts == 0)),
		("%s: hpts:%p inp:%p already on the hpts?",
		 __FUNCTION__, hpts, inp));
551
552
553
554
555
556
557
558
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

559
static struct tcp_hpts_entry *
560
561
562
563
564
565
566
567
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
568
569
570
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		 hpts, __LINE__));
571
572
573
574
575
576
577
578
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

579
static struct tcp_hpts_entry *
580
tcp_dropq_lock(struct inpcb *inp)
581
582
583
584
585
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
586
	hpts_num = inp->inp_dropq_cpu;
587
	hpts = tcp_pace.rp_ent[hpts_num];
588
589
590
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		hpts, __LINE__));
591
	mtx_lock(&hpts->p_mtx);
592
	if (hpts_num != inp->inp_dropq_cpu) {
593
594
595
596
597
598
599
600
601
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
602
	int32_t ret;
603

604
605
	ret = in_pcbrele_wlocked(inp);
	KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
606
607
608
609
610
611
612
613
614
615
616
617
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	/*
	 * If the inp is on the pacing wheel, unlink it from its slot
	 * (clearing its hpts state) and drop the wheel's reference.
	 * Caller holds the hpts mutex.
	 */
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
618
tcp_dropq_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp)
619
{
620
621
	bool released __diagused;

622
	HPTS_MTX_ASSERT(hpts);
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_in_dropq != IHPTS_ONQUEUE)
		return;

	MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
	if (__predict_true(inp->inp_dropq_gencnt == hpts->p_dropq_gencnt)) {
		TAILQ_REMOVE(&hpts->p_dropq, inp, inp_dropq);
		MPASS(hpts->p_dropq_cnt > 0);
		hpts->p_dropq_cnt--;
		inp->inp_in_dropq = IHPTS_NONE;
		released = in_pcbrele_wlocked(inp);
		MPASS(released == false);
	} else {
		/*
		 * tcp_delayed_drop() now owns the TAILQ head of this inp.
		 * Can't TAILQ_REMOVE, just mark it.
		 */
#ifdef INVARIANTS
		struct inpcb *tmp;

		TAILQ_FOREACH(tmp, &hpts->p_dropq, inp_dropq)
			MPASS(tmp != inp);
#endif
		inp->inp_in_dropq = IHPTS_MOVING;
648
	}
649

650
651
652
653
654
655
656
657
658
659
}

/*
 * Remove a connection from the hpts. Called normally with the
 * INP_WLOCK held, but it does not matter: the hpts lock is the key.
 * The lock order allows holding the INP lock and then taking the
 * hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_DROPQ - remove from the drop queue of the hpts.
 * Note that you can use one or both values together
 * and get two actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		HPTS_UNLOCK(hpts);
	}
	if (flags & HPTS_REMOVE_DROPQ) {
		hpts = tcp_dropq_lock(inp);
		tcp_dropq_remove(hpts, inp);
		HPTS_UNLOCK(hpts);
	}
}

static inline int
683
hpts_slot(uint32_t wheel_slot, uint32_t plus)
684
{
685
686
687
688
	/*
	 * Given a slot on the wheel, what slot
	 * is that plus ticks out?
	 */
689
690
	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
691
692
693
694
695
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Map a timestamp expressed in wheel ticks (10 usec units, so a
	 * real time would be this value times 10) onto our limited-size
	 * wheel.
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
707
hpts_slots_diff(int prev_slot, int slot_now)
708
709
{
	/*
710
	 * Given two slots that are someplace
711
712
	 * on our wheel. How far are they apart?
	 */
713
714
715
	if (slot_now > prev_slot)
		return (slot_now - prev_slot);
	else if (slot_now == prev_slot)
716
717
		/*
		 * Special case, same means we can go all of our
718
719
720
721
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
722
		return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
723
724
725
}

/*
726
727
 * Given a slot on the wheel that is the current time
 * mapped to the wheel (wheel_slot), what is the maximum
728
 * distance forward that can be obtained without
729
 * wrapping past either prev_slot or running_slot
730
 * depending on the htps state? Also if passed
731
 * a uint32_t *, fill it with the slot location.
732
733
 *
 * Note if you do not give this function the current
734
 * time (that you think it is) mapped to the wheel slot
735
736
737
738
 * then the results will not be what you expect and
 * could lead to invalid inserts.
 */
static inline int32_t
739
max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
740
{
741
	uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
742
743
744

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
745
		end_slot = hpts->p_runningslot;
746
		/* Back up one tick */
747
748
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
749
		else
750
751
752
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
753
754
755
756
757
758
759
760
761
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it. This puts us
		 * as far out as possible on the wheel.
		 */
762
763
764
		end_slot = hpts->p_prev_slot;
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
765
		else
766
767
768
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
769
770
		/*
		 * Now we have close to the full wheel left minus the
771
772
773
774
		 * time it has been since the pacer went to sleep. Note
		 * that wheel_tick, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
775
776
		if (hpts->p_prev_slot != wheel_slot)
			dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
777
778
		else
			dis_to_travel = 1;
779
780
		/*
		 * dis_to_travel in this case is the space from when the
781
		 * pacer stopped (p_prev_slot) and where our wheel_slot
782
		 * is now. To know how many slots we can put it in we
783
784
785
786
787
788
		 * subtract from the wheel size. We would not want
		 * to place something after p_prev_slot or it will
		 * get ran too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
789
	/*
790
	 * So how many slots are open between p_runningslot -> p_cur_slot
791
792
793
794
	 * that is what is currently un-available for insertion. Special
	 * case when we are at the last slot, this gets 1, so that
	 * the answer to how many slots are available is all but 1.
	 */
795
	if (hpts->p_runningslot == hpts->p_cur_slot)
796
797
		dis_to_travel = 1;
	else
798
		dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
799
	/*
800
801
	 * How long has the pacer been running?
	 */
802
	if (hpts->p_cur_slot != wheel_slot) {
803
		/* The pacer is a bit late */
804
		pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
805
806
807
808
	} else {
		/* The pacer is right on time, now == pacers start time */
		pacer_to_now = 0;
	}
809
	/*
810
811
812
813
814
	 * To get the number left we can insert into we simply
	 * subract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
815
816
817
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
818
	 * real time (wheel_slot)?
819
820
	 */
	if (avail_on_wheel <= pacer_to_now) {
821
		/*
822
823
		 * Wheel wrap, we can't fit on the wheel, that
		 * is unusual the system must be way overloaded!
824
		 * Insert into the assured slot, and return special
825
826
827
		 * "0".
		 */
		counter_u64_add(combined_wheel_wrap, 1);
828
		*target_slot = hpts->p_nxt_slot;
829
830
		return (0);
	} else {
831
		/*
832
833
834
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run. Take away the time
835
		 * the pacer started to now (wheel_slot)
836
837
838
839
840
841
		 * and that tells you how many slots are
		 * open that can be inserted into that won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
842
843
844
845
846
}

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	/*
	 * Queue the inp onto the hpts wheel for "immediate" processing,
	 * i.e. the next slot the pacer will run. Caller must hold the
	 * hpts mutex. Returns 1 if the (sleeping) hpts thread was woken
	 * to service the insert, 0 otherwise. If the inp is already on
	 * the hpts this is a no-op.
	 *
	 * 'line' is the caller's source line for tracing/diagnostics and
	 * 'noref' is passed through to the insert routine to control
	 * reference accounting.
	 */
	uint32_t need_wake = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		inp->inp_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) ||
		    (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts we want in next slot to run
			 * note that in this state p_prev_slot == p_cur_slot
			 */
			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
				need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * The hpts system is running and the caller
			 * was awoken by the hpts system.
			 * We can't allow you to go into the same slot we
			 * are in (we don't want a loop :-D).
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			/* Pacer is running: use the slot it is working on. */
			inp->inp_hptsslot = hpts->p_runningslot;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

#ifdef INVARIANTS
/*
 * Sanity checks for the pacer with invariants on insert: verify that
 * the requested slot is within the wheel and, when the pacer is mid-arc,
 * that the insert does not land inside the arc the pacer has yet to
 * traverse (which would make the entry run too soon or be skipped).
 * Compiled only under INVARIANTS; panics via KASSERT on violation.
 */
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
	/* The slot must always be inside the wheel. */
	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
		("hpts:%p inp:%p slot:%d > max",
		 hpts, inp, inp_hptsslot));
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing a arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		/* How far from the running slot is the requested slot? */
		distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
		if (hpts->p_runningslot != hpts->p_cur_slot)
			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		/* The insert must be at or beyond the end of the active arc. */
		KASSERT(yet_to_run <= distance,
			("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
			 hpts, inp, inp_hptsslot,
			 distance, yet_to_run,
			 hpts->p_runningslot, hpts->p_cur_slot));
	}
}
#endif

920
static void
921
922
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
		       struct hpts_diag *diag, struct timeval *tv)
923
{
924
	uint32_t need_new_to = 0;
925
926
927
	uint32_t wheel_cts; 
	int32_t wheel_slot, maxslots, last_slot;
	int cpu;
928
	int8_t need_wakeup = 0;
929
930
931
932
933

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
934
		diag->p_prev_slot = hpts->p_prev_slot;
935
		diag->p_runningslot = hpts->p_runningslot;
936
937
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
938
939
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
940
		diag->slot_req = slot;
941
942
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
943
	}
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
	KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
	if (slot == 0) {
		/* Immediate */
		tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
		return;
	}
	/* Get the current time relative to the wheel */
	wheel_cts = tcp_tv_to_hptstick(tv);
	/* Map it onto the wheel */
	wheel_slot = tick_to_wheel(wheel_cts);
	/* Now what's the max we can place it at? */
	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
	if (diag) {
		diag->wheel_slot = wheel_slot;
		diag->maxslots = maxslots;
		diag->wheel_cts = wheel_cts;
	}
	if (maxslots == 0) {
		/* The pacer is in a wheel wrap behind, yikes! */
		if (slot > 1) {
			/*
			 * Reduce by 1 to prevent a forever loop in
			 * case something else is wrong. Note this
			 * probably does not hurt because the pacer
			 * if its true is so far behind we will be
			 * > 1second late calling anyway.
			 */
			slot--;
972
		}
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
		inp->inp_hptsslot = last_slot;
		inp->inp_hpts_request = slot;
	} else 	if (maxslots >= slot) {
		/* It all fits on the wheel */
		inp->inp_hpts_request = 0;
		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
	} else {
		/* It does not fit */
		inp->inp_hpts_request = slot - maxslots;
		inp->inp_hptsslot = last_slot;
	}
	if (diag) {
		diag->slot_remaining = inp->inp_hpts_request;
		diag->inp_hptsslot = inp->inp_hptsslot;
	}
#ifdef INVARIANTS
	check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
	hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
	if ((hpts->p_hpts_active == 0) &&
	    (inp->inp_hpts_request == 0) &&
	    (hpts->p_on_min_sleep == 0)) {
		/*
		 * The hpts is sleeping and NOT on a minimum
		 * sleep time, we need to figure out where
		 * it will wake up at and if we need to reschedule
		 * its time-out.
		 */
For faster browsing, not all history is shown. View entire blame