Commit de2d4784 authored by Gleb Smirnoff's avatar Gleb Smirnoff
Browse files

SMR protection for inpcbs

With introduction of epoch(9) synchronization to network stack the
inpcb database became protected by the network epoch together with
static network data (interfaces, addresses, etc).  However, inpcb
aren't static in nature, they are created and destroyed all the
time, which creates some traffic on the epoch(9) garbage collector.

A fairly new feature of uma(9) - Safe Memory Reclamation - allows
memory to be safely freed in page-sized batches, with virtually zero
overhead compared to uma_zfree().  However, unlike epoch(9), it
puts a stricter requirement on access to the protected memory,
requiring a critical(9) section to access it.  Details:

- The database is already built on CK lists, thanks to epoch(9).
- For write access nothing is changed.
- For a lookup in the database SMR section is now required.
  Once the desired inpcb is found we need to transition from SMR
  section to r/w lock on the inpcb itself, with a check that inpcb
  isn't yet freed.  This requires some complexity, since an SMR section
  itself is a critical(9) section.  The complexity is hidden from
  KPI users in inp_smr_lock().
- For an inpcb list traversal (a pcblist sysctl, or broadcast
  notification) also a new KPI is provided, that hides internals of
  the database - inp_next(struct inp_iterator *).

Reviewed by:		rrs
Differential revision:	https://reviews.freebsd.org/D33022
parent 565655f4
......@@ -564,15 +564,15 @@ static struct witness_order_list_entry order_lists[] = {
/*
* UDP/IP
*/
{ "udp", &lock_class_mtx_sleep },
{ "udpinp", &lock_class_rw },
{ "udp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
{ "tcp", &lock_class_mtx_sleep },
{ "tcpinp", &lock_class_rw },
{ "tcp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
......
......@@ -854,10 +854,6 @@ ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
inp = so->so_pcb;
INP_WLOCK(inp);
if (inp->inp_flags2 & INP_FREED) {
INP_WUNLOCK(inp);
return (ECONNRESET);
}
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
return (ECONNRESET);
......@@ -909,10 +905,6 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
int error;
INP_RLOCK(inp);
if (inp->inp_flags2 & INP_FREED) {
INP_RUNLOCK(inp);
return (ECONNRESET);
}
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_RUNLOCK(inp);
return (ECONNRESET);
......@@ -2716,8 +2708,7 @@ ktls_disable_ifnet_help(void *context, int pending __unused)
INP_WLOCK(inp);
so = inp->inp_socket;
MPASS(so != NULL);
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
goto out;
}
......@@ -2729,7 +2720,6 @@ ktls_disable_ifnet_help(void *context, int pending __unused)
counter_u64_add(ktls_ifnet_disable_ok, 1);
/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
(inp->inp_flags2 & INP_FREED) == 0 &&
(tp = intotcpcb(inp)) != NULL &&
tp->t_fb->tfb_hwtls_change != NULL)
(*tp->t_fb->tfb_hwtls_change)(tp, 0);
......
This diff is collapsed.
......@@ -49,7 +49,9 @@
#ifdef _KERNEL
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/smr.h>
#include <net/vnet.h>
#include <vm/uma.h>
#endif
......@@ -133,32 +135,19 @@ struct in_conninfo {
* struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
* IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
* are static after creation or protected by a per-inpcb rwlock, inp_lock. A
* few fields are protected by multiple locks as indicated in the locking notes
* below. For these fields, all of the listed locks must be write-locked for
* any modifications. However, these fields can be safely read while any one of
* the listed locks are read-locked. This model can permit greater concurrency
* for read operations. For example, connections can be looked up while only
* holding a read lock on the global pcblist lock. This is important for
* performance when attempting to find the connection for a packet given its IP
* and port tuple.
* are static after creation or protected by a per-inpcb rwlock, inp_lock.
*
* One noteworthy exception is that the global pcbinfo lock follows a different
* set of rules in relation to the inp_list field. Rather than being
* write-locked for modifications and read-locked for list iterations, it must
* be read-locked during modifications and write-locked during list iterations.
* This ensures that the relatively rare global list iterations safely walk a
* stable snapshot of connections while allowing more common list modifications
* to safely grab the pcblist lock just while adding or removing a connection
* from the global list.
* An inpcb database is indexed by an addresses/ports hash as well as a list of
* all pcbs that belong to a certain proto. Database lookups or list traversals
* are performed inside an SMR section. Once the desired PCB is found, its own
* lock is obtained and the SMR section exited.
*
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (e) - Protected by the net_epoch_prempt epoch
* (e) - Protected by the SMR section
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
* (l) - Protected by the pcblist lock for the inpcb
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
......@@ -219,17 +208,13 @@ struct in_conninfo {
* socket has been freed), or there may be close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
*
* TODO: Currently only the TCP stack is leveraging the global pcbinfo lock
* read-lock usage during modification, this model can be applied to other
* protocols (especially SCTP).
*/
struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_hpts
......@@ -311,8 +296,8 @@ struct inpcb {
int in6p_cksum;
short in6p_hops;
};
CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
struct inpcbport *inp_phd; /* (i/h) head of this list */
CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */
struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */
inp_gen_t inp_gencnt; /* (c) generation count */
void *spare_ptr; /* Spare pointer. */
rt_gen_t inp_rt_cookie; /* generation for route entry */
......@@ -320,10 +305,7 @@ struct inpcb {
struct route inp_route;
struct route_in6 inp_route6;
};
CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
/* (e[r]) for list iteration */
/* (p[w]/l) for addition/removal */
struct epoch_context inp_epoch_ctx;
CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#endif /* _KERNEL */
......@@ -396,80 +378,58 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#endif
#endif /* _SYS_SOCKETVAR_H_ */
struct inpcbport {
struct epoch_context phd_epoch_ctx;
CK_LIST_ENTRY(inpcbport) phd_hash;
struct inpcbhead phd_pcblist;
u_short phd_port;
};
/*-
#ifdef _KERNEL
/*
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
* Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
* ipi_list_lock:
* - ipi_lock covering the global pcb list stability during loop iteration,
* - ipi_hash_lock covering the hashed lookup tables,
* - ipi_list_lock covering mutable global fields (such as the global
* pcb list)
*
* The lock order is:
*
* ipi_lock (before)
* inpcb locks (before)
* ipi_list locks (before)
* The pcbs are protected by an SMR section and thus all lists in inpcbinfo
* are CK-lists. Locking is required to insert a pcb into the database. Two
* locks are provided: one for the hash and one for the global list of pcbs,
* as well as the overall count and generation count.
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
* (e) - Protected by the net_epoch_prempt epoch
* (e) Protected by SMR section
* (g) Locked by ipi_lock
* (l) Locked by ipi_list_lock
* (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
* (x) Synchronisation properties poorly defined
* (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
/*
* Global lock protecting inpcb list modification
*/
struct mtx ipi_lock;
/*
* Global list of inpcbs on the protocol.
*/
struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */
u_int ipi_count; /* (l) */
struct inpcbhead ipi_listhead; /* (r:e/w:g) */
u_int ipi_count; /* (g) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
u_quad_t ipi_gencnt; /* (l) */
u_quad_t ipi_gencnt; /* (g) */
/*
* Fields associated with port lookup and allocation.
*/
u_short ipi_lastport; /* (x) */
u_short ipi_lastlow; /* (x) */
u_short ipi_lasthi; /* (x) */
u_short ipi_lastport; /* (h) */
u_short ipi_lastlow; /* (h) */
u_short ipi_lasthi; /* (h) */
/*
* UMA zone from which inpcbs are allocated for this protocol.
*/
struct uma_zone *ipi_zone; /* (c) */
/*
* Global lock protecting modification hash lookup tables.
*/
struct mtx ipi_hash_lock;
uma_zone_t ipi_zone; /* (c) */
uma_zone_t ipi_portzone; /* (c) */
smr_t ipi_smr; /* (c) */
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
* port numbers.
*/
struct inpcbhead *ipi_hashbase; /* (h) */
u_long ipi_hashmask; /* (h) */
struct mtx ipi_hash_lock;
struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */
u_long ipi_hashmask; /* (c) */
/*
* Global hash of inpcbs, hashed by only local port number.
......@@ -481,26 +441,15 @@ struct inpcbinfo {
* Load balance groups used for the SO_REUSEPORT_LB option,
* hashed by local port.
*/
struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */
struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */
u_long ipi_lbgrouphashmask; /* (h) */
/*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
/*
* general use 2
*/
void *ipi_pspare[2];
/*
* Global lock protecting global inpcb list, inpcb count, etc.
*/
struct rwlock ipi_list_lock;
};
#ifdef _KERNEL
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
......@@ -523,7 +472,7 @@ struct inpcblbgroup {
};
#define INP_LOCK_INIT(inp, d, t) \
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
......@@ -571,51 +520,21 @@ int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
#define INP_INFO_LOCK_INIT(ipi, d) \
mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock)
#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
mtx_owned(&(ipi)->ipi_lock))
#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
#define INP_INFO_WUNLOCK_ASSERT(ipi) \
mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
#define INP_LIST_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
#define INP_LIST_LOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
#define INP_LIST_RLOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
#define INP_LIST_WLOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
#define INP_LIST_UNLOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock)
mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
MTX_DEF | MTX_DUPOK)
#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock)
#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock)
#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock)
#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
mtx_owned(&(ipi)->ipi_hash_lock))
#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
MA_OWNED)
#define INP_PCBHASH(faddr, lport, fport, mask) \
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
......@@ -644,7 +563,7 @@ int inp_so_options(const struct inpcb *inp);
#define INP_ANONPORT 0x00000040 /* port chosen for user */
#define INP_RECVIF 0x00000080 /* receive incoming interface */
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
/* 0x000200 unused: was INP_FAITH */
/* INP_FREED 0x00000200 private to in_pcb.c */
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
#define INP_BINDANY 0x00001000 /* allow bind to any address */
......@@ -682,7 +601,7 @@ int inp_so_options(const struct inpcb *inp);
#define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */
/* 0x00000004 */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
#define INP_FREED 0x00000010 /* inp itself is not valid */
/* 0x00000010 */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
......@@ -702,15 +621,19 @@ int inp_so_options(const struct inpcb *inp);
#define INP_2PCP_BASE INP_2PCP_BIT0
#define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
#define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */
/*
* Flags passed to in_pcblookup*() functions.
* Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
*/
#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
/*
 * Lookup flags, shared by in_pcblookup*(), inp_smr_lock() and inp_next().
 * Exactly one of the two lock bits selects how the returned inpcb is locked.
 */
typedef enum {
INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */
INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */
INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */
} inp_lookup_t;
/* All valid bits accepted by the lookup functions. */
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
INPLOOKUP_WLOCKPCB)
/* Subset of bits that select the lock type of the returned inpcb. */
#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
......@@ -718,13 +641,6 @@ int inp_so_options(const struct inpcb *inp);
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
/*
* Constants for pcbinfo.ipi_hashfields.
*/
#define IPI_HASHFIELDS_NONE 0
#define IPI_HASHFIELDS_2TUPLE 1
#define IPI_HASHFIELDS_4TUPLE 2
#ifdef _KERNEL
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
......@@ -755,8 +671,8 @@ VNET_DECLARE(int, ipport_tcpallocs);
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
void in_pcbinfo_destroy(struct inpcbinfo *);
void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
int, int, char *, uma_init, u_int);
void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
uma_init);
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);
......@@ -788,8 +704,37 @@ void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
int, struct inpcb *(*)(struct inpcb *, int));
void in_pcbref(struct inpcb *);
void in_pcbrehash(struct inpcb *);
int in_pcbrele_rlocked(struct inpcb *);
int in_pcbrele_wlocked(struct inpcb *);
bool in_pcbrele_rlocked(struct inpcb *);
bool in_pcbrele_wlocked(struct inpcb *);
/*
 * Optional filter callback for inp_next(): return true if the pcb matches.
 * The second argument is the caller-supplied context ('ctx' below).
 */
typedef bool inp_match_t(const struct inpcb *, void *);
/*
 * State for iterating over a pcb database with inp_next().  Initialize
 * with INP_ITERATOR() (filtered by 'match'/'ctx') or INP_ALL_ITERATOR()
 * (every pcb), then call inp_next() until it returns NULL.  Each pcb is
 * returned locked according to 'lock' (INPLOOKUP_RLOCKPCB or
 * INPLOOKUP_WLOCKPCB); the internals of the database (SMR section,
 * hash vs. global list) are hidden behind this KPI.
 */
struct inpcb_iterator {
const struct inpcbinfo *ipi;	/* database being traversed */
struct inpcb *inp;		/* current position; NULL at start */
inp_match_t *match;		/* optional filter, may be NULL */
void *ctx;			/* opaque argument passed to 'match' */
int hash;			/* current hash bucket, or INP_ALL_LIST */
#define INP_ALL_LIST -1
const inp_lookup_t lock;	/* how to lock each returned pcb */
};
/* Note: sparse initializers guarantee .inp = NULL. */
#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \
{ \
.ipi = (_ipi), \
.lock = (_lock), \
.hash = INP_ALL_LIST, \
.match = (_match), \
.ctx = (_ctx), \
}
#define INP_ALL_ITERATOR(_ipi, _lock) \
{ \
.ipi = (_ipi), \
.lock = (_lock), \
.hash = INP_ALL_LIST, \
}
struct inpcb *inp_next(struct inpcb_iterator *);
void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
......
......@@ -44,6 +44,7 @@
* Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c
*/
bool inp_smr_lock(struct inpcb *, const inp_lookup_t);
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
struct ucred *, int);
int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa,
......@@ -52,4 +53,10 @@ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa,
struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
int, struct ucred *);
/*
 * Per-local-port head: all pcbs sharing one local port hang off
 * phd_pcblist, and the heads themselves are chained in the port hash
 * via phd_hash.  Now private to the in_pcb.c/in6_pcb.c implementation
 * (moved here from in_pcb.h; the epoch context member was dropped).
 */
struct inpcbport {
struct inpcbhead phd_pcblist;	/* pcbs bound to this port */
CK_LIST_ENTRY(inpcbport) phd_hash;	/* port hash chain linkage */
u_short phd_port;	/* the local port this head represents */
};
#endif /* !_NETINET_IN_PCB_VAR_H_ */
......@@ -111,10 +111,7 @@ __FBSDID("$FreeBSD$");
*/
/* Internal variables. */
VNET_DEFINE_STATIC(struct inpcbhead, divcb);
VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
#define V_divcb VNET(divcb)
#define V_divcbinfo VNET(divcbinfo)
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
......@@ -154,8 +151,7 @@ div_init(void)
* allocate one-entry hash lists than it is to check all over the
* place for hashbase == NULL.
*/
in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
div_inpcb_init, IPI_HASHFIELDS_NONE);
in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
}
static void
......@@ -181,6 +177,14 @@ div_input(struct mbuf **mp, int *offp, int proto)
return (IPPROTO_DONE);
}
/*
 * inp_match_t callback for inp_next(): select the divert pcb bound to
 * the given local port.  'v' points to a uint16_t port value supplied
 * as the iterator context by divert_packet(); it is compared directly
 * against inp_lport (presumably both in network byte order -- confirm
 * against the caller's htons() of the rule info).
 */
static bool
div_port_match(const struct inpcb *inp, void *v)
{
uint16_t nport = *(uint16_t *)v;
return (inp->inp_lport == nport);
}
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
*
......@@ -195,6 +199,8 @@ divert_packet(struct mbuf *m, bool incoming)
struct socket *sa;
u_int16_t nport;
struct sockaddr_in divsrc;
struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
INPLOOKUP_RLOCKPCB, div_port_match, &nport);
struct m_tag *mtag;
NET_EPOCH_ASSERT();
......@@ -288,27 +294,20 @@ divert_packet(struct mbuf *m, bool incoming)
/* Put packet on socket queue, if any */
sa = NULL;
/* nport is inp_next's context. */
nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
CK_LIST_FOREACH(inp, &V_divcb, inp_list) {
while ((inp = inp_next(&inpi)) != NULL) {
sa = inp->inp_socket;
SOCKBUF_LOCK(&sa->so_rcv);
if (sbappendaddr_locked(&sa->so_rcv,
(struct sockaddr *)&divsrc, m, NULL) == 0) {
soroverflow_locked(sa);
sa = NULL; /* force mbuf reclaim below */
} else
sorwakeup_locked(sa);
/* XXX why does only one socket match? */
if (inp->inp_lport == nport) {
INP_RLOCK(inp);
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
INP_RUNLOCK(inp);
continue;
}
sa = inp->inp_socket;
SOCKBUF_LOCK(&sa->so_rcv);
if (sbappendaddr_locked(&sa->so_rcv,
(struct sockaddr *)&divsrc, m,
(struct mbuf *)0) == 0) {
soroverflow_locked(sa);
sa = NULL; /* force mbuf reclaim below */
} else
sorwakeup_locked(sa);
INP_RUNLOCK(inp);
break;
}
INP_RUNLOCK(inp);
break;
}
if (sa == NULL) {
m_freem(m);
......@@ -603,14 +602,10 @@ div_attach(struct socket *so, int proto, struct thread *td)
error = soreserve(so, div_sendspace, div_recvspace);
if (error)
return error;
INP_INFO_WLOCK(&V_divcbinfo);
error = in_pcballoc(so, &V_divcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_divcbinfo);
if (error)
return error;
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_divcbinfo);
inp->inp_ip_p = proto;
inp->inp_vflag |= INP_IPV4;
inp->inp_flags |= INP_HDRINCL;
......@@ -625,11 +620,9 @@ div_detach(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_detach: inp == NULL"));
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
}
static int
......@@ -652,13 +645,11 @@ div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
if (nam->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
INP_HASH_WLOCK(&V_divcbinfo);
error = in_pcbbind(inp, nam, td->td_ucred);
INP_HASH_WUNLOCK(&V_divcbinfo);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
return error;
}
......@@ -697,8 +688,9 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
struct epoch_tracker et;
struct inpcb *inp;
int error;
......@@ -726,21 +718,18 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
if (error)
return error;
NET_EPOCH_ENTER(et);
for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead);
inp != NULL;
inp = CK_LIST_NEXT(inp, inp_list)) {
INP_RLOCK(inp);
while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
} else
INP_RUNLOCK(inp);
if (error) {
INP_RUNLOCK(inp);
break;
}
}
}
NET_EPOCH_EXIT(et);
if (!error) {
/*
......
......@@ -223,25 +223,11 @@ static void