ipvs: allow rescheduling of new connections when port reuse is detected
Currently, when TCP/SCTP port reuse happens, IPVS will find the old entry and use it for the new connection, behaving like a forced persistence. But if you consider a cluster with a heavy load of small connections, such reuse will happen often and may lead to suboptimal load balancing and might prevent a new node from getting a fair load. This patch introduces a new sysctl, conn_reuse_mode, that allows controlling how to proceed when port reuse is detected. The default value will allow rescheduling of new connections only if the old entry was in TIME_WAIT state for TCP or CLOSED for SCTP. Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com> Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
This commit is contained in:
Родитель
7f73b9f1ca
Коммит
d752c36457
|
@ -22,6 +22,27 @@ backup_only - BOOLEAN
|
||||||
If set, disable the director function while the server is
|
If set, disable the director function while the server is
|
||||||
in backup mode to avoid packet loops for DR/TUN methods.
|
in backup mode to avoid packet loops for DR/TUN methods.
|
||||||
|
|
||||||
|
conn_reuse_mode - INTEGER
|
||||||
|
1 - default
|
||||||
|
|
||||||
|
Controls how ipvs will deal with connections for which port reuse
|
||||||
|
is detected. It is a bitmap, with the values being:
|
||||||
|
|
||||||
|
0: disable any special handling on port reuse. The new
|
||||||
|
connection will be delivered to the same real server that was
|
||||||
|
servicing the previous connection. This will effectively
|
||||||
|
disable expire_nodest_conn.
|
||||||
|
|
||||||
|
bit 1: enable rescheduling of new connections when it is safe.
|
||||||
|
That is, whenever expire_nodest_conn applies and, for TCP sockets,
|
||||||
|
whenever the connection is in TIME_WAIT state (which is only
|
||||||
|
possible if you use NAT mode).
|
||||||
|
|
||||||
|
bit 2: it is bit 1 plus, for TCP connections, when connections
|
||||||
|
are in FIN_WAIT state, as this is the last state seen by the load
|
||||||
|
balancer in Direct Routing mode. This bit helps when adding new
|
||||||
|
real servers to a very busy cluster.
|
||||||
|
|
||||||
conntrack - BOOLEAN
|
conntrack - BOOLEAN
|
||||||
0 - disabled (default)
|
0 - disabled (default)
|
||||||
not 0 - enabled
|
not 0 - enabled
|
||||||
|
|
|
@ -941,6 +941,7 @@ struct netns_ipvs {
|
||||||
int sysctl_nat_icmp_send;
|
int sysctl_nat_icmp_send;
|
||||||
int sysctl_pmtu_disc;
|
int sysctl_pmtu_disc;
|
||||||
int sysctl_backup_only;
|
int sysctl_backup_only;
|
||||||
|
int sysctl_conn_reuse_mode;
|
||||||
|
|
||||||
/* ip_vs_lblc */
|
/* ip_vs_lblc */
|
||||||
int sysctl_lblc_expiration;
|
int sysctl_lblc_expiration;
|
||||||
|
@ -1059,6 +1060,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
|
||||||
ipvs->sysctl_backup_only;
|
ipvs->sysctl_backup_only;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return the conn_reuse_mode value currently stored in @ipvs
 * (the sysctl_conn_reuse_mode field of struct netns_ipvs).
 */
static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
|
||||||
|
{
|
||||||
|
return ipvs->sysctl_conn_reuse_mode;
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
|
static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
|
||||||
|
@ -1126,6 +1132,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fallback variant in the #else branch: @ipvs is not consulted and the
 * mode is always reported as 1, the same value the sysctl is initialised
 * to and that the documentation lists as the default.
 * NOTE(review): presumably this branch is built when sysctl support is
 * compiled out -- the guarding #if is not visible here, confirm.
 */
static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* IPVS core functions
|
/* IPVS core functions
|
||||||
|
|
|
@ -1042,6 +1042,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Tell whether the existing connection entry @cp is in a state where a
 * new connection reusing the same ports is expected, so the caller may
 * expire the old entry and schedule the new connection afresh.
 *
 * @cp:              connection entry found for the incoming packet
 * @conn_reuse_mode: value of the conn_reuse_mode sysctl (a bitmap)
 *
 * Returns false for controlled entries. For TCP, returns true when the
 * entry is in TIME_WAIT, or when value 2 is set in @conn_reuse_mode and
 * the entry is in FIN_WAIT with IP_VS_CONN_F_NOOUTPUT set. For SCTP,
 * returns true when the entry is CLOSED. Any other protocol yields false.
 */
static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
|
||||||
|
int conn_reuse_mode)
|
||||||
|
{
|
||||||
|
/* Controlled (FTP DATA or persistence)? Never reschedule those. */
|
||||||
|
if (cp->control)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
switch (cp->protocol) {
|
||||||
|
case IPPROTO_TCP:
|
||||||
|
return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
|
||||||
|
/* Value 2 in the mode additionally accepts FIN_WAIT entries that
 * carry the NOOUTPUT flag; per the sysctl documentation FIN_WAIT is
 * the last state the load balancer sees in Direct Routing mode.
 */
((conn_reuse_mode & 2) &&
|
||||||
|
(cp->state == IP_VS_TCP_S_FIN_WAIT) &&
|
||||||
|
(cp->flags & IP_VS_CONN_F_NOOUTPUT));
|
||||||
|
case IPPROTO_SCTP:
|
||||||
|
return cp->state == IP_VS_SCTP_S_CLOSED;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Handle response packets: rewrite addresses and send away...
|
/* Handle response packets: rewrite addresses and send away...
|
||||||
*/
|
*/
|
||||||
static unsigned int
|
static unsigned int
|
||||||
|
@ -1580,6 +1600,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
|
||||||
struct ip_vs_conn *cp;
|
struct ip_vs_conn *cp;
|
||||||
int ret, pkts;
|
int ret, pkts;
|
||||||
struct netns_ipvs *ipvs;
|
struct netns_ipvs *ipvs;
|
||||||
|
int conn_reuse_mode;
|
||||||
|
|
||||||
/* Already marked as IPVS request or reply? */
|
/* Already marked as IPVS request or reply? */
|
||||||
if (skb->ipvs_property)
|
if (skb->ipvs_property)
|
||||||
|
@ -1648,10 +1669,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
|
||||||
*/
|
*/
|
||||||
cp = pp->conn_in_get(af, skb, &iph, 0);
|
cp = pp->conn_in_get(af, skb, &iph, 0);
|
||||||
|
|
||||||
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
|
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
|
||||||
unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
|
if (conn_reuse_mode && !iph.fragoffs &&
|
||||||
is_new_conn(skb, &iph)) {
|
is_new_conn(skb, &iph) && cp &&
|
||||||
ip_vs_conn_expire_now(cp);
|
((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
|
||||||
|
unlikely(!atomic_read(&cp->dest->weight))) ||
|
||||||
|
unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
|
||||||
|
if (!atomic_read(&cp->n_control))
|
||||||
|
ip_vs_conn_expire_now(cp);
|
||||||
__ip_vs_conn_put(cp);
|
__ip_vs_conn_put(cp);
|
||||||
cp = NULL;
|
cp = NULL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1823,6 +1823,12 @@ static struct ctl_table vs_vars[] = {
|
||||||
.mode = 0644,
|
.mode = 0644,
|
||||||
.proc_handler = proc_dointvec,
|
.proc_handler = proc_dointvec,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "conn_reuse_mode",
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dointvec,
|
||||||
|
},
|
||||||
#ifdef CONFIG_IP_VS_DEBUG
|
#ifdef CONFIG_IP_VS_DEBUG
|
||||||
{
|
{
|
||||||
.procname = "debug_level",
|
.procname = "debug_level",
|
||||||
|
@ -3790,6 +3796,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
|
||||||
ipvs->sysctl_pmtu_disc = 1;
|
ipvs->sysctl_pmtu_disc = 1;
|
||||||
tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
|
tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
|
||||||
tbl[idx++].data = &ipvs->sysctl_backup_only;
|
tbl[idx++].data = &ipvs->sysctl_backup_only;
|
||||||
|
ipvs->sysctl_conn_reuse_mode = 1;
|
||||||
|
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
|
||||||
|
|
||||||
|
|
||||||
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
|
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
|
||||||
|
|
|
@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
|
||||||
struct ip_vs_conn *cp;
|
struct ip_vs_conn *cp;
|
||||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||||
|
|
||||||
if (!(flags & IP_VS_CONN_F_TEMPLATE))
|
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
|
||||||
cp = ip_vs_conn_in_get(param);
|
cp = ip_vs_conn_in_get(param);
|
||||||
else
|
if (cp && ((cp->dport != dport) ||
|
||||||
|
!ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
|
||||||
|
if (!(flags & IP_VS_CONN_F_INACTIVE)) {
|
||||||
|
ip_vs_conn_expire_now(cp);
|
||||||
|
__ip_vs_conn_put(cp);
|
||||||
|
cp = NULL;
|
||||||
|
} else {
|
||||||
|
/* This is the expiration message for the
|
||||||
|
* connection that was already replaced, so we
|
||||||
|
* just ignore it.
|
||||||
|
*/
|
||||||
|
__ip_vs_conn_put(cp);
|
||||||
|
kfree(param->pe_data);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
cp = ip_vs_ct_in_get(param);
|
cp = ip_vs_ct_in_get(param);
|
||||||
|
}
|
||||||
|
|
||||||
if (cp) {
|
if (cp) {
|
||||||
/* Free pe_data */
|
/* Free pe_data */
|
||||||
|
|
Загрузка…
Ссылка в новой задаче