/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"
#include "opt_tcpdebug.h"

/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp. Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way it is used by Rack and BBR, it can
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot. This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add, near
 * the top of tcp_output(), a check to prevent unwanted calls (from user
 * land or the arrival of incoming ACKs). So it would add something like:
 *
 * if (tcp_in_hpts(inp))
 *    return;
 *
 * to prevent output processing until the allotted time has gone by.
 * Of course this is a bare-bones example and the stack will probably
 * have more considerations than just the above.
 *
 * Now the second function (actually two functions I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this? To keep processor locality
 * and/or not have to worry about untangling any recursive
 * locks. The input function now is hooked to the new LRO
 * system as well.
 *
 * In order to use the input redirection function the
 * tcp stack must define an input function for
 * tfb_do_queued_segments(). This function understands
 * how to dequeue an array of packets that were input and
 * knows how to call the correct processing routine.
 *
 * Locking in this is important as well, so most likely the
 * stack will need to define the tfb_do_segment_nounlock()
 * function, splitting tfb_do_segment() into two parts. The main
 * processing part does not unlock the INP and returns a value of
 * 1 or 0. It returns 0 if all is well and the lock was not
 * released. It returns 1 if we had to destroy the TCB (a reset
 * was received, etc.). The remains of tfb_do_segment() then
 * become just a simple call to tfb_do_segment_nounlock(),
 * a check of the return code, and possibly an unlock.
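 *
 * A minimal sketch of what the tfb_do_segment() wrapper ends up
 * looking like (the argument list here is illustrative, not the
 * exact prototype):
 *
 *    if (tfb_do_segment_nounlock(m, th, so, tp, drop_hdrlen,
 *                                tlen, iptos, 0, NULL) == 0)
 *        INP_WUNLOCK(tp->t_inpcb);
 *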
 * The stack must also set the flag on the INP that it supports this
 * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well: INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input. The second tells the LRO code that it's okay
 * if a SACK arrives to still defer input and let
 * the current hpts timer run (this is usually set when
 * a rack timer is up so we know SACKs are happening
 * on the connection already and don't want to wake up yet).
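 *
 * A sketch of how a stack might advertise support (which of the
 * additional flags get set depends on the stack's timer state):
 *
 *    inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;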
 *
 * There is a common function within the rack_bbr_common code,
 * i.e. ctf_do_queued_segments(). This function
 * knows how to take the input queue of packets from
 * tp->t_in_pkts and process them, digging out
 * all the arguments, calling any bpf tap and
 * calling into tfb_do_segment_nounlock(). The common
 * function (ctf_do_queued_segments()) requires that
 * you have defined the tfb_do_segment_nounlock() as
 * described above.
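 *
 * A stack wires this up through its tcp_function_block; a sketch of
 * the relevant fields (assuming the stack uses the common code):
 *
 *    .tfb_do_queued_segments = ctf_do_queued_segments,
 *    .tfb_do_segment_nounlock = <the stack's nounlock routine>,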
 *
 * The second feature of the input side of hpts is the
 * dropping of a connection. This is due to the way that
 * locking may have occurred on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * To schedule the tcp_hpts system to call
 *
 *    tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/route.h>
#include <net/vnet.h>

#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static struct tcp_hptsi tcp_pace;
static uint32_t *cts_last_ran;
static int hpts_does_tp_logging = 0;
static int hpts_use_assigned_cpu = 1;
static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;

static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;


SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP Hpts statistics");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t tcp_hpts_precision = 120;

struct hpts_domain_info {
	int count;
	int cpu[MAXCPU];
};

struct hpts_domain_info hpts_domains[MAXMEMDOM];

counter_u64_t hpts_hopelessly_behind;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

counter_u64_t combined_wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t hpts_direct_call;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
    &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");

counter_u64_t hpts_wake_timeout;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
    &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");

counter_u64_t hpts_direct_awakening;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
    &hpts_direct_awakening, "Number of times hpts threads woke up without the callout expiring");

counter_u64_t hpts_back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
    &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");

counter_u64_t cpu_uses_flowid;
counter_u64_t cpu_uses_random;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
    &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
    &cpu_uses_random, "Number of times when setting cpuid we used a random value");

TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
    &tcp_bind_threads, 2,
    "Thread Binding tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
    &tcp_use_irq_cpu, 0,
    "Use of irq CPU  tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
    &conn_cnt_thresh, 0,
    "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
    &hpts_use_assigned_cpu, 0,
    "Do we start any hpts timer on the assigned cpu?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
    &hpts_uses_oldest, OLDEST_THRESHOLD,
    "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
    &dynamic_min_sleep, 250,
    "What is the dynamic minsleep value?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
    &dynamic_max_sleep, 5000,
    "What is the dynamic maxsleep value?");





static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < dynamic_min_sleep) ||
		    (new > HPTS_MAX_SLEEP_ALLOWED))
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}

static int
sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = tcp_min_hptsi_time;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < LOWEST_SLEEP_ALLOWED)
			error = EINVAL;
		else
			tcp_min_hptsi_time = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &hpts_sleep_max, 0,
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep");

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_min_hptsi_time, 0,
    &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
    "The minimum time the hpts must sleep before processing more slots");

static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
static int tcp_hpts_no_wake_over_thresh = 1;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
    &ticks_indicate_more_sleep, 0,
    "If we only process this many or less on a timeout, we need longer sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
    &ticks_indicate_less_sleep, 0,
    "If we process this many or more on a timeout, we need less sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
    &tcp_hpts_no_wake_over_thresh, 0,
    "When we are over the threshold on the pacer do we prohibit wakeups?");

static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
	     int slots_to_run, int idx, int from_callout)
{
	union tcp_log_stackspecific log;

	/*
	 * Unused logs are
	 * 64 bit - delRate, rttProp, bw_inuse
	 * 16 bit - cwnd_gain
	 *  8 bit - bbr_state, bbr_substate, inhpts, ininput;
	 */
	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
	log.u_bbr.flex7 = hpts->p_cpu;
	log.u_bbr.flex8 = (uint8_t)from_callout;
	log.u_bbr.inflight = slots_to_run;
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
	log.u_bbr.pacing_gain = hpts->p_cpu;
	log.u_bbr.pkt_epoch = hpts->p_runningslot;
	log.u_bbr.use_lt_bw = 1;
	TCP_LOG_EVENTP(tp, NULL,
		       &tp->t_inpcb->inp_socket->so_rcv,
		       &tp->t_inpcb->inp_socket->so_snd,
		       BBR_LOG_HPTSDIAG, 0,
		       0, &log, false, tv);
}

static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);

	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
		hpts->p_direct_wake = 0;
		return;
	}
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	KASSERT(hpts->p_on_queue_cnt >= 0,
		("Hpts goes negative inp:%p hpts:%p",
		 inp, hpts));
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
		((noref == 0) && (inp->inp_in_hpts == 0)),
		("%s: hpts:%p inp:%p already on the hpts?",
		 __FUNCTION__, hpts, inp));
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_input != 0,
		("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	KASSERT(hpts->p_on_inqueue_cnt >= 0,
		("Hpts in goes negative inp:%p hpts:%p",
		 inp, hpts));
	KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
		 ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
		("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
		 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
	HPTS_MTX_ASSERT(hpts);
	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
	KASSERT(inp->inp_in_input == 0,
		("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
	int32_t hpts_num;
	struct tcp_hpts_entry *hpts;

	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	return (hpts);
}

struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		 hpts, __LINE__));
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
		("Hpts:%p owns mtx prior-to lock line:%d",
		hpts, __LINE__));
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t ret;

	ret = in_pcbrele_wlocked(inp);
	KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP_LOCKED but it
 * does not matter, the hpts lock is the key
 * but the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can use one or both values together
 * and get two actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
	/*
	 * Given a slot on the wheel, what slot
	 * is that plus ticks out?
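	 * E.g. hpts_slot(NUM_OF_HPTSI_SLOTS - 2, 5) wraps around
	 * to slot 3.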
	 */
	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Given a timestamp in ticks (so by
	 * default to get it to a real-time one
	 * would multiply by 10, i.e. the number
	 * of ticks in a slot), map it to our limited
	 * space wheel.
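	 * E.g. tick_to_wheel(NUM_OF_HPTSI_SLOTS + 7) maps to slot 7
	 * (one full revolution plus seven slots).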
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
hpts_slots_diff(int prev_slot, int slot_now)
{
	/*
	 * Given two slots somewhere on our
	 * wheel, how far apart are they?
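	 * E.g. a prev_slot of NUM_OF_HPTSI_SLOTS - 4 and a slot_now
	 * of 4 are (NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now = 8
	 * slots apart.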
	 */
	if (slot_now > prev_slot)
		return (slot_now - prev_slot);
	else if (slot_now == prev_slot)
		/*
		 * Special case, same means we can go all of our
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

/*
 * Given a slot on the wheel that is the current time
 * mapped to the wheel (wheel_slot), what is the maximum
 * distance forward that can be obtained without
 * wrapping past either prev_slot or running_slot
 * depending on the hpts state? Also if passed
 * a uint32_t *, fill it with the slot location.
 *
 * Note if you do not give this function the current
 * time (that you think it is) mapped to the wheel slot
 * then the results will not be what you expect and
 * could lead to invalid inserts.
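 *
 * A worked example of the sleeping-pacer branch below: with
 * p_prev_slot == 100 and the caller's wheel_slot == 110, the
 * function sets *target_slot to 99 (one slot before p_prev_slot)
 * and returns NUM_OF_HPTSI_SLOTS - 10.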
 */
static inline int32_t
max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
	uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
		end_slot = hpts->p_runningslot;
		/* Back up one tick */
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it. This puts us
		 * as far out as possible on the wheel.
		 */
		end_slot = hpts->p_prev_slot;
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
		/*
		 * Now we have close to the full wheel left minus the
		 * time it has been since the pacer went to sleep. Note
		 * that wheel_slot, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
		if (hpts->p_prev_slot != wheel_slot)
			dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
		else
			dis_to_travel = 1;
		/*
		 * dis_to_travel in this case is the space from when the
		 * pacer stopped (p_prev_slot) and where our wheel_slot
		 * is now. To know how many slots we can put it in we
		 * subtract from the wheel size. We would not want
		 * to place something after p_prev_slot or it will
		 * get run too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
	/*
	 * So how many slots are open between p_runningslot -> p_cur_slot?
	 * That is what is currently un-available for insertion. Special
	 * case: when we are at the last slot, this gets 1, so that
	 * the answer to how many slots are available is all but 1.
	 */
	if (hpts->p_runningslot == hpts->p_cur_slot)
		dis_to_travel = 1;
	else
		dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
	/*
	 * How long has the pacer been running?
	 */
	if (hpts->p_cur_slot != wheel_slot) {
		/* The pacer is a bit late */
		pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
	} else {
		/* The pacer is right on time, now == pacers start time */
		pacer_to_now = 0;
	}
	/*
	 * To get the number left we can insert into we simply
	 * subtract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
	 * real time (wheel_slot)?
	 */
	if (avail_on_wheel <= pacer_to_now) {
		/*
		 * Wheel wrap, we can't fit on the wheel; that
		 * is unusual, the system must be way overloaded!
		 * Insert into the assured slot, and return special
		 * "0".
		 */
		counter_u64_add(combined_wheel_wrap, 1);
		*target_slot = hpts->p_nxt_slot;
		return (0);
	} else {
		/*
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run). Take away the time
		 * the pacer started to now (wheel_slot)
		 * and that tells you how many slots are
		 * open that can be inserted into that won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
}

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	uint32_t need_wake = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		inp->inp_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) ||
		    (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts; we want it in the next slot to run.
			 * Note that in this state p_prev_slot == p_cur_slot.
			 */
			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
				need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * The hpts system is running and the caller
			 * was awoken by the hpts system.
			 * We can't allow you to go into the same slot we
			 * are in (we don't want a loop :-D).
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_runningslot;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

int
__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
{
	int32_t ret;
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
	/*
	 * Sanity checks for the pacer with invariants
	 * on insert.
	 */
	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
		("hpts:%p inp:%p slot:%d > max",
		 hpts, inp, inp_hptsslot));
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing an arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
		if (hpts->p_runningslot != hpts->p_cur_slot)
			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		KASSERT(yet_to_run <= distance,
			("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
			 hpts, inp, inp_hptsslot,
			 distance, yet_to_run,
			 hpts->p_runningslot, hpts->p_cur_slot));
	}
}
#endif

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
		       struct hpts_diag *diag, struct timeval *tv)
{
	uint32_t need_new_to = 0;
	uint32_t wheel_cts;
	int32_t wheel_slot, maxslots, last_slot;
	int cpu;
	int8_t need_wakeup = 0;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_prev_slot = hpts->p_prev_slot;
		diag->p_runningslot = hpts->p_runningslot;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
		diag->slot_req = slot;
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
	}
	KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
	if (slot == 0) {
		/* Immediate */
		tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
		return;
	}
	/* Get the current time relative to the wheel */
	wheel_cts = tcp_tv_to_hptstick(tv);
	/* Map it onto the wheel */
	wheel_slot = tick_to_wheel(wheel_cts);
	/* Now what's the max we can place it at? */
	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
	if (diag) {
		diag->wheel_slot = wheel_slot;
		diag->maxslots = maxslots;
		diag->wheel_cts = wheel_cts;
	}
	if (maxslots == 0) {
		/* The pacer is in a wheel wrap behind, yikes! */
		if (slot > 1) {
			/*
			 * Reduce by 1 to prevent a forever loop in
			 * case something else is wrong. Note this
			 * probably does not hurt because if it's true
			 * the pacer is so far behind that we will be
			 * > 1 second late calling anyway.
			 */
			slot--;
		}
		inp->inp_hptsslot = last_slot;
		inp->inp_hpts_request = slot;
	} else 	if (maxslots >= slot) {
		/* It all fits on the wheel */
		inp->inp_hpts_request = 0;
		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
	} else {
		/* It does not fit */
		inp->inp_hpts_request = slot - maxslots;
		inp->inp_hptsslot = last_slot;
	}
	if (diag) {
		diag->slot_remaining = inp->inp_hpts_request;
		diag->inp_hptsslot = inp->inp_hptsslot;
	}
#ifdef INVARIANTS
	check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
	hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
	if ((hpts->p_hpts_active == 0) &&
	    (inp->inp_hpts_request == 0) &&
	    (hpts->p_on_min_sleep == 0)) {
		/*
		 * The hpts is sleeping and NOT on a minimum
		 * sleep time, we need to figure out where
		 * it will wake up at and if we need to reschedule
		 * its time-out.
		 */
		uint32_t have_slept, yet_to_sleep;

		/* Now do we need to restart the hpts's timer? */
		have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
		if (have_slept < hpts->p_hpts_sleep_time)
			yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
		else {
			/* We are over-due */
			yet_to_sleep = 0;
			need_wakeup = 1;
		}
		if (diag) {
			diag->have_slept = have_slept;
			diag->yet_to_sleep = yet_to_sleep;
		}
		if (yet_to_sleep &&
		    (yet_to_sleep > slot)) {
			/*
			 * We need to reschedule the hpts's time-out.
			 */
			hpts->p_hpts_sleep_time = slot;
			need_new_to = slot * HPTS_TICKS_PER_SLOT;
		}
	}
	/*
	 * Now how far is the hpts sleeping to? If active is 1, it's
	 * up and ticking and we do nothing; otherwise we may need to
	 * reschedule its callout if need_new_to is set from above.
	 */
	if (need_wakeup) {
		hpts->p_direct_wake = 1;
		tcp_wakehpts(hpts);