/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2007, Myricom Inc.
 * Copyright (c) 2008, Intel Corporation.
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2016-2021 Mellanox Technologies.
 * All rights reserved.
 *
 * Portions of this software were developed by Bjoern Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
34

Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
35
36
37
38
39
40
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

41
42
43
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
44
45
#include <sys/malloc.h>
#include <sys/mbuf.h>
46
#include <sys/socket.h>
47
48
#include <sys/socketvar.h>
#include <sys/sockbuf.h>
49
#include <sys/sysctl.h>
50
51

#include <net/if.h>
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
52
#include <net/if_var.h>
53
#include <net/ethernet.h>
54
#include <net/bpf.h>
55
#include <net/vnet.h>
56
57
58

#include <netinet/in_systm.h>
#include <netinet/in.h>
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
59
#include <netinet/ip6.h>
60
#include <netinet/ip.h>
61
#include <netinet/ip_var.h>
62
63
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
64
#include <netinet/tcp.h>
65
#include <netinet/tcp_seq.h>
66
#include <netinet/tcp_lro.h>
67
#include <netinet/tcp_var.h>
68
#include <netinet/tcpip.h>
69
70
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
71
#include <netinet/udp.h>
72
73
#include <netinet6/ip6_var.h>

74
75
#include <machine/in_cksum.h>

76
static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
77

78
79
80
#define	TCP_LRO_TS_OPTION \
    ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
81

82
static void	tcp_lro_rx_done(struct lro_ctrl *lc);
83
84
85
86
87
88
89
90
static int	tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m,
		    uint32_t csum, bool use_hash);

#ifdef TCPHPTS
static bool	do_bpf_strip_and_compress(struct inpcb *, struct lro_ctrl *,
		struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **, bool *, bool);

#endif
91

92
93
94
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP LRO");

95
static long tcplro_stacks_wanting_mbufq;
96
97
98
99
counter_u64_t tcp_inp_lro_direct_queue;
counter_u64_t tcp_inp_lro_wokeup_queue;
counter_u64_t tcp_inp_lro_compressed;
counter_u64_t tcp_inp_lro_locks_taken;
100
101
102
103
counter_u64_t tcp_extra_mbuf;
counter_u64_t tcp_would_have_but;
counter_u64_t tcp_comp_total;
counter_u64_t tcp_uncomp_total;
104

105
106
107
108
static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
    "default number of LRO entries");
109

110
111
112
113
114
115
116
117
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
    &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
    &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
    &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
    &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
118
119
120
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
    &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
121
    &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed");
122
123
124
125
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
    &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
    &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
126
127
128
129
130
131
132
133
134
135
136
137

/*
 * Atomically increment tcplro_stacks_wanting_mbufq by one.
 */
void
tcp_lro_reg_mbufq(void)
{

	/* The previous value returned by the fetch-add is not needed. */
	(void)atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
}

/*
 * Atomically decrement tcplro_stacks_wanting_mbufq by one; the
 * counterpart of tcp_lro_reg_mbufq().
 */
void
tcp_lro_dereg_mbufq(void)
{

	/* The previous value returned by the fetch-add is not needed. */
	(void)atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
}
138

139
static __inline void
140
141
tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
    struct lro_entry *le)
142
143
144
{

	LIST_INSERT_HEAD(&lc->lro_active, le, next);
145
	LIST_INSERT_HEAD(bucket, le, hash_next);
146
147
148
149
150
151
}

static __inline void
tcp_lro_active_remove(struct lro_entry *le)
{

152
153
	LIST_REMOVE(le, next);		/* active list */
	LIST_REMOVE(le, hash_next);	/* hash bucket */
154
155
}

156
int
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
157
tcp_lro_init(struct lro_ctrl *lc)
158
{
159
	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
160
161
162
163
164
}

int
tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
    unsigned lro_entries, unsigned lro_mbufs)
165
{
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
166
	struct lro_entry *le;
167
	size_t size;
168
	unsigned i, elements;
169

Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
170
171
172
	lc->lro_bad_csum = 0;
	lc->lro_queued = 0;
	lc->lro_flushed = 0;
173
174
175
	lc->lro_mbuf_count = 0;
	lc->lro_mbuf_max = lro_mbufs;
	lc->lro_cnt = lro_entries;
176
177
	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
178
	lc->ifp = ifp;
179
180
	LIST_INIT(&lc->lro_free);
	LIST_INIT(&lc->lro_active);
181

182
183
184
185
186
187
188
189
190
191
192
193
	/* create hash table to accelerate entry lookup */
	if (lro_entries > lro_mbufs)
		elements = lro_entries;
	else
		elements = lro_mbufs;
	lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
	    HASH_NOWAIT);
	if (lc->lro_hash == NULL) {
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}

194
	/* compute size to allocate */
195
	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
196
	    (lro_entries * sizeof(*le));
197
	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
198
199
200
201
	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);

	/* check for out of memory */
	if (lc->lro_mbuf_data == NULL) {
202
		free(lc->lro_hash, M_LRO);
203
204
205
206
207
208
209
210
211
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}
	/* compute offset for LRO entries */
	le = (struct lro_entry *)
	    (lc->lro_mbuf_data + lro_mbufs);

	/* setup linked list */
	for (i = 0; i != lro_entries; i++)
212
		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
213
214

	return (0);
215
216
}

217
218
219
220
221
222
223
/*
 * Eight-byte header that tcp_lro_low_level_parser() skips in front of
 * the inner Ethernet header when it is asked to parse a VXLAN payload.
 */
struct vxlan_header {
	uint32_t	vxlh_flags;	/* not read by the parser */
	uint32_t	vxlh_vni;	/* parser keeps this masked with htonl(0xffffff00) */
};

/*
 * Parse one layer of headers: an optional VXLAN header, Ethernet (with
 * an optional in-band VLAN tag), IPv4 or IPv6, then UDP or TCP.
 *
 * ptr         - first byte to parse: the VXLAN header when is_vxlan is
 *               true, otherwise the Ethernet header.
 * parser      - receives the ip4/ip6, udp/tcp header pointers and
 *               total_hdr_len (bytes from ptr to the end of the L4
 *               header).
 * update_data - when true, *parser is zeroed first and parser->data
 *               (addresses, ports, VLAN id, VNI, lro_type) is filled in.
 *               When false, parser->data is left untouched and only
 *               asserted (MPASS) to carry the matching lro_type.
 * is_vxlan    - skip a struct vxlan_header before the Ethernet header.
 *
 * Returns a pointer to the first byte after the L4 header, or NULL when
 * the packet is not one of the supported shapes: unknown ethertype,
 * IPv4 with options or fragmented, or an L4 protocol other than
 * UDP/TCP directly behind the IP header.
 *
 * NOTE: no length is passed in, so every header is read without a
 * bounds check here.  tcp_lro_parser() compares total_hdr_len against
 * m_len only after this function has returned.
 */
static inline void *
tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan)
{
	const struct ether_vlan_header *eh;
	void *old;
	uint16_t eth_type;

	if (update_data)
		memset(parser, 0, sizeof(*parser));

	/* Remember the start so total_hdr_len can be computed later. */
	old = ptr;

	if (is_vxlan) {
		const struct vxlan_header *vxh;
		vxh = ptr;
		ptr = (uint8_t *)ptr + sizeof(*vxh);
		if (update_data) {
			parser->data.vxlan_vni =
			    vxh->vxlh_vni & htonl(0xffffff00);
		}
	}

	eh = ptr;
	if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) {
		eth_type = eh->evl_proto;
		if (update_data) {
			/* strip priority and keep VLAN ID only */
			parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK);
		}
		/* advance to next header */
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		eth_type = eh->evl_encap_proto;
		/* advance to next header */
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN;
	}

	switch (eth_type) {
#ifdef INET
	case htons(ETHERTYPE_IP):
		parser->ip4 = ptr;
		/* Ensure there are no IPv4 options. */
		if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4))
			break;
		/* .. and the packet is not fragmented. */
		if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK))
			break;
		ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2);
		if (update_data) {
			parser->data.s_addr.v4 = parser->ip4->ip_src;
			parser->data.d_addr.v4 = parser->ip4->ip_dst;
		}
		switch (parser->ip4->ip_p) {
		case IPPROTO_UDP:
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP);
			}
			ptr = ((uint8_t *)ptr + sizeof(*parser->udp));
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			parser->tcp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP);
			}
			/* TCP header length includes the options. */
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
#ifdef INET6
	case htons(ETHERTYPE_IPV6):
		parser->ip6 = ptr;
		ptr = (uint8_t *)ptr + sizeof(*parser->ip6);
		if (update_data) {
			parser->data.s_addr.v6 = parser->ip6->ip6_src;
			parser->data.d_addr.v6 = parser->ip6->ip6_dst;
		}
		/* Only UDP/TCP directly after the fixed IPv6 header. */
		switch (parser->ip6->ip6_nxt) {
		case IPPROTO_UDP:
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP);
			}
			ptr = (uint8_t *)ptr + sizeof(*parser->udp);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			parser->tcp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP);
			}
			/* TCP header length includes the options. */
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
	default:
		break;
	}
	/* Invalid packet - cannot parse */
	return (NULL);
}

/*
 * Checksum flags that must all be set in m_pkthdr.csum_flags before
 * tcp_lro_parser() attempts to parse inner (VXLAN) headers behind an
 * outer UDP header.
 */
static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID;

/*
 * Parse the headers at the start of mbuf "m" and decide which layer
 * carries the TCP header that LRO should work on.
 *
 * m           - packet; only the first mbuf (m_data / m_len) is examined.
 * po          - parser state for the outer headers.
 * pi          - parser state for the inner (VXLAN) headers.
 * update_data - forwarded to tcp_lro_low_level_parser(); when true the
 *               lookup key in po->data / pi->data is (re)built, and the
 *               VLAN id from the mbuf header overrides any in-band one.
 *
 * Returns po for plain TCP (pi is zeroed when update_data is true), pi
 * for TCP inside VXLAN, or NULL when the packet cannot be handled.
 */
static inline struct lro_parser *
tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data)
{
	void *data_ptr;

	/* Try to parse outer headers first. */
	data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false);
	if (data_ptr == NULL || po->total_hdr_len > m->m_len)
		return (NULL);

	if (update_data) {
		/* Store VLAN ID, if any. */
		if (__predict_false(m->m_flags & M_VLANTAG)) {
			po->data.vlan_id =
			    htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK);
		}
	}

	switch (po->data.lro_type) {
	case LRO_TYPE_IPV4_UDP:
	case LRO_TYPE_IPV6_UDP:
		/* Check for VXLAN headers. */
		if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum)
			break;

		/* Try to parse inner headers. */
		data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true);
		/*
		 * The inner headers start right after the outer ones, so
		 * both lengths together must fit in the first mbuf.
		 */
		if (data_ptr == NULL ||
		    (po->total_hdr_len + pi->total_hdr_len) > m->m_len)
			break;

		/* Verify supported header types. */
		switch (pi->data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
		case LRO_TYPE_IPV6_TCP:
			return (pi);
		default:
			break;
		}
		break;
	case LRO_TYPE_IPV4_TCP:
	case LRO_TYPE_IPV6_TCP:
		/* No encapsulation: make sure the inner state is empty. */
		if (update_data)
			memset(pi, 0, sizeof(*pi));
		return (po);
	default:
		break;
	}
	return (NULL);
}

static inline int
tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
{
	int len;

	switch (po->data.lro_type) {
408
#ifdef INET
409
410
411
	case LRO_TYPE_IPV4_TCP:
		len = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) +
		    ntohs(po->ip4->ip_len);
412
413
		break;
#endif
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		len = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) +
		    ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6);
		break;
#endif
	default:
		return (TCP_LRO_CANNOT);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet,
	 * then trim the extra bytes off:
	 */
	if (__predict_true(m->m_pkthdr.len == len)) {
		return (0);
	} else if (m->m_pkthdr.len > len) {
		m_adj(m, len - m->m_pkthdr.len);
		return (0);
433
	}
434
435
436
437
438
439
440
	return (TCP_LRO_CANNOT);
}

/*
 * Return a pointer to the TCP header of "m", located at the byte offset
 * m_pkthdr.lro_tcp_h_off from the start of the mbuf data.
 */
static struct tcphdr *
tcp_lro_get_th(struct mbuf *m)
{
	return ((struct tcphdr *)((uint8_t *)m->m_data + m->m_pkthdr.lro_tcp_h_off));
}

443
444
445
446
447
448
449
450
451
452
453
454
455
/*
 * Free every packet on a list linked through m_nextpkt.  Each packet is
 * detached from the list before it is handed to m_freem().
 */
static void
lro_free_mbuf_chain(struct mbuf *m)
{
	struct mbuf *next_pkt;

	for (; m != NULL; m = next_pkt) {
		/* Fetch the successor before the current packet is freed. */
		next_pkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
	}
}

456
void
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
457
tcp_lro_free(struct lro_ctrl *lc)
458
{
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
459
	struct lro_entry *le;
460
461
462
	unsigned x;

	/* reset LRO free list */
463
	LIST_INIT(&lc->lro_free);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
464

465
	/* free active mbufs, if any */
466
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
467
		tcp_lro_active_remove(le);
468
		lro_free_mbuf_chain(le->m_head);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
469
	}
470

471
	/* free hash table */
472
473
	free(lc->lro_hash, M_LRO);
	lc->lro_hash = NULL;
474
475
	lc->lro_hashsz = 0;

476
477
	/* free mbuf array, if any */
	for (x = 0; x != lc->lro_mbuf_count; x++)
478
		m_freem(lc->lro_mbuf_data[x].mb);
479
	lc->lro_mbuf_count = 0;
480

481
482
483
	/* free allocated memory, if any */
	free(lc->lro_mbuf_data, M_LRO);
	lc->lro_mbuf_data = NULL;
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
484
}
485

Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
486
static uint16_t
487
tcp_lro_rx_csum_tcphdr(const struct tcphdr *th)
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
488
{
489
490
491
492
493
494
495
496
497
498
499
500
	const uint16_t *ptr;
	uint32_t csum;
	uint16_t len;

	csum = -th->th_sum;	/* exclude checksum field */
	len = th->th_off;
	ptr = (const uint16_t *)th;
	while (len--) {
		csum += *ptr;
		ptr++;
		csum += *ptr;
		ptr++;
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
501
	}
502
503
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
504

505
	return (csum);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
506
507
508
}

static uint16_t
509
tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum)
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
510
511
512
513
{
	uint32_t c;
	uint16_t cs;

514
	c = tcp_csum;
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
515

516
	switch (pa->data.lro_type) {
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
517
#ifdef INET6
518
519
520
	case LRO_TYPE_IPV6_TCP:
		/* Compute full pseudo IPv6 header checksum. */
		cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
521
522
523
		break;
#endif
#ifdef INET
524
525
526
527
	case LRO_TYPE_IPV4_TCP:
		/* Compute full pseudo IPv4 header checsum. */
		cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP);
		cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs));
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
528
529
530
531
		break;
#endif
	default:
		cs = 0;		/* Keep compiler happy. */
532
		break;
533
	}
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
534

535
	/* Complement checksum. */
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
536
537
538
	cs = ~cs;
	c += cs;

539
540
	/* Remove TCP header checksum. */
	cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
541
	c += cs;
542
543

	/* Compute checksum remainder. */
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
544
545
546
	while (c > 0xffff)
		c = (c >> 16) + (c & 0xffff);

547
	return (c);
548
549
}

550
551
552
553
554
static void
tcp_lro_rx_done(struct lro_ctrl *lc)
{
	struct lro_entry *le;

555
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
556
		tcp_lro_active_remove(le);
557
558
559
560
		tcp_lro_flush(lc, le);
	}
}

561
562
563
564
void
tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
{
	struct lro_entry *le, *le_tmp;
565
	sbintime_t sbt;
566

567
	if (LIST_EMPTY(&lc->lro_active))
568
569
		return;

570
571
572
	/* get timeout time */
	sbt = getsbinuptime() - tvtosbt(*timeout);

573
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
574
		if (sbt >= le->alloc_time) {
575
			tcp_lro_active_remove(le);
576
577
578
579
580
			tcp_lro_flush(lc, le);
		}
	}
}

581
582
#ifdef INET
static int
583
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4)
584
{
585
586
587
	uint16_t csum;

	/* Legacy IP has a header checksum that needs to be correct. */
588
589
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		if (__predict_false((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0)) {
590
591
592
593
594
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	} else {
		csum = in_cksum_hdr(ip4);
595
		if (__predict_false(csum != 0)) {
596
597
598
599
600
601
602
603
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	}
	return (0);
}
#endif

604
#ifdef TCPHPTS
605
static void
606
607
608
609
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
610
611
612
613
614
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint32_t cts;
615

616
617
618
619
620
621
622
623
		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
624
625
		log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
		log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
626
627
628
629
630
		if (le->m_head) {
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
631
632
633
634
635
636
637
		log.u_bbr.inflight = th_seq;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
Warner Losh's avatar
Warner Losh committed
638
639
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
640
641
642
		log.u_bbr.flex6 = sbttous(lc->lro_last_queue_time);
		log.u_bbr.flex7 = le->compressed;
		log.u_bbr.pacing_gain = le->uncompressed;
643
644
645
646
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
647
648
649
650
651
652
653
		TCP_LOG_EVENTP(tp, NULL,
			       &tp->t_inpcb->inp_socket->so_rcv,
			       &tp->t_inpcb->inp_socket->so_snd,
			       TCP_LOG_LRO, 0,
			       0, &log, false, &tv);
	}
}
654
#endif
655

656
657
/*
 * Store "value" into the 16-bit field at "ptr" and report how that
 * store changes a ones' complement checksum covering the field.
 *
 * ptr   - field to overwrite; its old value is read first.
 * value - new contents of the field.
 * psum  - receives the folded sum of (~old + new), i.e. the delta that
 *         the store adds to a sum over the field.
 */
static inline void
tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
{
	uint32_t csum;

	/* 0xffff - old is the 16-bit ones' complement of the old value. */
	csum = 0xffff - *ptr + value;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);
	*ptr = value;
	*psum = csum;
}

/*
 * Rewrite the length fields (and, for TCP, the ACK, window and
 * timestamp fields) of one header layer after segments were merged,
 * and patch the affected checksums incrementally.
 *
 * pa          - parsed headers of the layer to update; the headers are
 *               modified in place through the pointers it holds.
 * le          - LRO entry supplying ack_seq, window and timestamp
 *               values.  Only dereferenced for the TCP types, so the
 *               caller may pass NULL for a UDP layer.
 * payload_len - number of bytes that follow this layer's L4 header.
 * delta_sum   - checksum delta handed over from the previous stage.
 *
 * Returns the folded sum of all modifications made at this layer plus
 * delta_sum, for use as the next stage's delta_sum.  Returns 0 for an
 * unknown LRO type.
 *
 * temp[] holds per-field checksum deltas:
 *   TCP: [0] IP length, [1] IPv4 header checksum, [2] old TCP header
 *        sum, [3] new TCP header sum, [4] TCP checksum field.
 *   UDP: [0] IP length, [1] IPv4 header checksum, [2] UDP length,
 *        [3] UDP checksum field.
 */
static uint16_t
tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le,
    uint16_t payload_len, uint16_t delta_sum)
{
	uint32_t csum;
	uint16_t tlen;
	uint16_t temp[5] = { 0 };

	switch (pa->data.lro_type) {
	case LRO_TYPE_IPV4_TCP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_tcp_header;

	case LRO_TYPE_IPV6_TCP:
		/* Compute new IPv6 length. */
		tlen = (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_tcp_header;

	case LRO_TYPE_IPV4_UDP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_udp_header;

	case LRO_TYPE_IPV6_UDP:
		/* Compute new IPv6 length. */
		tlen = sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_udp_header;

	default:
		return (0);
	}

update_tcp_header:
	/* Compute current TCP header checksum. */
	temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Incorporate the latest ACK into the TCP header. */
	pa->tcp->th_ack = le->ack_seq;
	pa->tcp->th_win = le->window;

	/* Incorporate latest timestamp into the TCP header. */
	if (le->timestamp != 0) {
		uint32_t *ts_ptr;

		/* Word 0 is the option header; 1 and 2 hold the two values. */
		ts_ptr = (uint32_t *)(pa->tcp + 1);
		ts_ptr[1] = htonl(le->tsval);
		ts_ptr[2] = le->tsecr;
	}

	/* Compute new TCP header checksum. */
	temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Compute new TCP checksum. */
	csum = pa->tcp->th_sum + 0xffff - delta_sum +
	    0xffff - temp[0] + 0xffff - temp[3] + temp[2];
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Assign new TCP checksum. */
	tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]);

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + 0xffff - temp[2] +
	    temp[3] + temp[4] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);

update_udp_header:
	tlen = sizeof(*pa->udp) + payload_len;
	/* Assign new UDP length and compute checksum delta. */
	tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]);

	/* Check if there is a UDP checksum. */
	if (__predict_false(pa->udp->uh_sum != 0)) {
		/* Compute new UDP checksum. */
		csum = pa->udp->uh_sum + 0xffff - delta_sum +
		    0xffff - temp[0] + 0xffff - temp[2];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		/* Assign new UDP checksum. */
		tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]);
	}

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);
}

static void
tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
{
	/* Check if we need to recompute any checksums. */
	if (le->m_head->m_pkthdr.lro_nsegs > 1) {
		uint16_t csum;

		switch (le->inner.data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
795
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
796
797
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			le->m_head->m_pkthdr.csum_data = 0xffff;
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
798
			break;
799
800
801
802
803
804
805
		case LRO_TYPE_IPV6_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
806
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
			    CSUM_PSEUDO_HDR;
			le->m_head->m_pkthdr.csum_data = 0xffff;
			break;
		case LRO_TYPE_NONE:
			switch (le->outer.data.lro_type) {
			case LRO_TYPE_IPV4_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				break;
			case LRO_TYPE_IPV6_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				break;
			default:
				break;
			}
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
831
832
			break;
		default:
833
			break;
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
834
		}
835
	}
836

837
838
839
	/*
	 * Break any chain, this is not set to NULL on the singleton
	 * case m_nextpkt points to m_head. Other case set them
840
841
842
	 * m_nextpkt to NULL in push_and_replace.
	 */
	le->m_head->m_nextpkt = NULL;
843
	lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs;
844
845
846
847
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
}

static void
848
849
tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
    struct mbuf *m, struct tcphdr *th)
850
851
852
{
	uint32_t *ts_ptr;
	uint16_t tcp_data_len;
853
	uint16_t tcp_opt_len;
854
855

	ts_ptr = (uint32_t *)(th + 1);
856
857
858
859
860
861
862
863
864
865
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);

	/* Check if there is a timestamp option. */
	if (tcp_opt_len == 0 ||
	    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
		/* We failed to find the timestamp option. */
		le->timestamp = 0;
	} else {
866
867
868
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
869
870
871
872
873
	}

	tcp_data_len = m->m_pkthdr.lro_tcp_d_len;

	/* Pull out TCP sequence numbers and window size. */
874
875
876
	le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
877
878

	/* Setup new data pointers. */
879
880
881
	le->m_head = m;
	le->m_tail = m_last(m);
}
Bjoern A. Zeeb's avatar
Bjoern A. Zeeb committed
882

883
static void
884
tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
885
{
886
887
	struct lro_parser *pa;

888
	/*
889
890
	 * Push up the stack of the current entry
	 * and replace it with "m".
891
892
893
894
895
896
	 */
	struct mbuf *msave;

	/* Grab off the next and save it */
	msave = le->m_head->m_nextpkt;
	le->m_head->m_nextpkt = NULL;
897
898
899
900
901
902
903
904
905

	/* Now push out the old entry */
	tcp_flush_out_entry(lc, le);

	/* Re-parse new header, should not fail. */
	pa = tcp_lro_parser(m, &le->outer, &le->inner, false);
	KASSERT(pa != NULL,
	    ("tcp_push_and_replace: LRO parser failed on m=%p\n", m));

906
	/*
907
908
	 * Now to replace the data properly in the entry
	 * we have to reset the TCP header and
909
910
	 * other fields.
	 */
911
912
	tcp_set_entry_to_mbuf(lc, le, m, pa->tcp);

913
914
915
916
917
	/* Restore the next list */
	m->m_nextpkt = msave;
}

static void
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
tcp_lro_mbuf_append_pkthdr(struct mbuf *m, const struct mbuf *p)
{
	uint32_t csum;

	if (m->m_pkthdr.lro_nsegs == 1) {
		/* Compute relative checksum. */
		csum = p->m_pkthdr.lro_tcp_d_csum;
	} else {
		/* Merge TCP data checksums. */
		csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum +
		    (uint32_t)p->m_pkthdr.lro_tcp_d_csum;
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
	}

	/* Update various counters. */
	m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_tcp_d_csum = csum;
	m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
}

static void
tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
942
{
943
944
945
	/*
	 * Walk through the mbuf chain we
	 * have on tap and compress/condense
946
947
948
949
950
	 * as required.
	 */
	uint32_t *ts_ptr;
	struct mbuf *m;
	struct tcphdr *th;
951
952
953
954
	uint32_t tcp_data_len_total;
	uint32_t tcp_data_seg_total;
	uint16_t tcp_data_len;
	uint16_t tcp_opt_len;
955

956
957
958
	/*
	 * First we must check the lead (m_head)
	 * we must make sure that it is *not*
959
960
961
962
963
964
	 * something that should be sent up
	 * right away (sack etc).
	 */
again:
	m = le->m_head->m_nextpkt;
	if (m == NULL) {
965
		/* Just one left. */
966
967
		return;
	}
968
969
970
971

	th = tcp_lro_get_th(m);
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);
972
	ts_ptr = (uint32_t *)(th + 1);
973
974
975

	if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
976
977
978
979
980
		/*
		 * Its not the timestamp. We can't
		 * use this guy as the head.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
981
		tcp_push_and_replace(lc, le, m);
982
983
984
985
986
987
988
989
		goto again;
	}
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
		/*
		 * Make sure that previously seen segements/ACKs are delivered
		 * before this segment, e.g. FIN.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
990
		tcp_push_and_replace(lc, le, m);
991
992
993
		goto again;
	}
	while((m = le->m_head->m_nextpkt) != NULL) {
994
		/*
995
996
997
998
999
1000
		 * condense m into le, first
		 * pull m out of the list.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		/* Setup my data */
For faster browsing, not all history is shown. View entire blame