Merge branch 'act_tc-offload-originating-device'

Paul Blakey says:

====================
net/sched: Pass originating device to drivers offloading ct connection

Currently, drivers register to a ct zone that can be shared by multiple
devices. This can be inefficient for the driver to offload, as it
needs to handle all the devices a tuple might arrive from, instead of
only the device it will most likely arrive from.

For example, consider the following tc rules:
tc filter add dev dev1 ... flower action ct commit zone 5 \
   action mirred egress redirect dev dev2

tc filter add dev dev2 ... flower action ct zone 5 \
   action goto chain 2
tc filter add dev dev2 ... flower ct_state +trk+est ... \
   action mirred egress redirect dev dev1

Both dev1 and dev2 register to the zone 5 flow table (created
by act_ct). A connection originating on dev1 and going to dev2 will
be offloaded to both devices, and each will need to offload
both directions, resulting in 4 rules in total. Yet traffic will
only hit the originating tuple on dev1 and the reply tuple on dev2,
so half of those rules are never matched.

By passing along the device from which the connection's tuple
originated, dev1 can choose to offload only the originating
tuple, and dev2 only the reply tuple, resulting in a more
efficient offload.

The first patch adds an act_ct nf conntrack extension which
temporarily stores the originating device taken from the skb, until
the connection is established and offloaded. When the connection is
sent to offload, the stored ifindex fills the tuple's originating device.
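Condensed from the act_ct diff below, the extension's life cycle is:

  /* per packet passing through act_ct (tcf_ct_act()) */
  nf_conn_act_ct_ext_fill(skb, ct, ctinfo); /* record skb->dev->ifindex
                                             * for CTINFO2DIR(ctinfo) */
  if (!nf_ct_is_confirmed(ct))
          nf_conn_act_ct_ext_add(ct);       /* allocate ext on new conns */

  /* once established, when adding the conn to the flow table */
  act_ct_ext = nf_conn_act_ct_ext_find(ct);
  if (act_ct_ext) {
          entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
                  act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL];
          entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
                  act_ct_ext->ifindex[IP_CT_DIR_REPLY];
  }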

The second patch fills the same information for tuples
that pass through openvswitch.

The third patch has the Mellanox driver's ct offload implementation
use this information to hint to firmware which port (LOCAL or UPLINK)
packets matching the offloaded tuple will arrive from, and thus
increases the insertion rate.
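In short, the mlx5 patch maps the tuple's originating device to a
firmware flow source hint (see mlx5_tc_ct_get_flow_source_match() in
the diff below):

  - VF representor on the same HW      -> ..._FLOW_SOURCE_LOCAL_VPORT
  - uplink representor, tunnel or LAG  -> ..._FLOW_SOURCE_UPLINK
  - vlan/macvlan device                -> resolved via its real device
  - anything else                      -> ..._FLOW_SOURCE_ANY_VPORT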
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller on 2022-01-04 12:12:56 +00:00
Parents: 9d2c27aad0 c9c079b4de
Commit: dfb55f9984
8 changed files with 141 additions and 6 deletions

View file

@@ -538,7 +538,7 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev)
 	return add_drivers(dev);
 }
 
-static bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev)
+bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev)
 {
 	u64 fsystem_guid, psystem_guid;

View file

@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/refcount.h>
 #include <linux/xarray.h>
+#include <linux/if_macvlan.h>
 
 #include "lib/fs_chains.h"
 #include "en/tc_ct.h"
@@ -326,7 +327,33 @@ mlx5_tc_ct_rule_to_tuple_nat(struct mlx5_ct_tuple *tuple,
 }
 
 static int
-mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec,
+mlx5_tc_ct_get_flow_source_match(struct mlx5_tc_ct_priv *ct_priv,
+				 struct net_device *ndev)
+{
+	struct mlx5e_priv *other_priv = netdev_priv(ndev);
+	struct mlx5_core_dev *mdev = ct_priv->dev;
+	bool vf_rep, uplink_rep;
+
+	vf_rep = mlx5e_eswitch_vf_rep(ndev) && mlx5_same_hw_devs(mdev, other_priv->mdev);
+	uplink_rep = mlx5e_eswitch_uplink_rep(ndev) && mlx5_same_hw_devs(mdev, other_priv->mdev);
+
+	if (vf_rep)
+		return MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
+	if (uplink_rep)
+		return MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK;
+	if (is_vlan_dev(ndev))
+		return mlx5_tc_ct_get_flow_source_match(ct_priv, vlan_dev_real_dev(ndev));
+	if (netif_is_macvlan(ndev))
+		return mlx5_tc_ct_get_flow_source_match(ct_priv, macvlan_dev_real_dev(ndev));
+	if (mlx5e_get_tc_tun(ndev) || netif_is_lag_master(ndev))
+		return MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK;
+
+	return MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT;
+}
+
+static int
+mlx5_tc_ct_set_tuple_match(struct mlx5_tc_ct_priv *ct_priv,
+			   struct mlx5_flow_spec *spec,
 			   struct flow_rule *rule)
 {
 	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
@@ -341,8 +368,7 @@ mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec,
 
 		flow_rule_match_basic(rule, &match);
-		mlx5e_tc_set_ethertype(priv->mdev, &match, true, headers_c,
-				       headers_v);
+		mlx5e_tc_set_ethertype(ct_priv->dev, &match, true, headers_c, headers_v);
 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
 			 match.mask->ip_proto);
 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@ -438,6 +464,23 @@ mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec,
 			 ntohs(match.key->flags));
 	}
 
+	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) {
+		struct flow_match_meta match;
+
+		flow_rule_match_meta(rule, &match);
+		if (match.key->ingress_ifindex & match.mask->ingress_ifindex) {
+			struct net_device *dev;
+
+			dev = dev_get_by_index(&init_net, match.key->ingress_ifindex);
+			if (dev && MLX5_CAP_ESW_FLOWTABLE(ct_priv->dev, flow_source))
+				spec->flow_context.flow_source =
+					mlx5_tc_ct_get_flow_source_match(ct_priv, dev);
+			dev_put(dev);
+		}
+	}
+
 	return 0;
 }
@@ -770,7 +813,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv,
 	if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB)
 		attr->esw_attr->in_mdev = priv->mdev;
 
-	mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule);
+	mlx5_tc_ct_set_tuple_match(ct_priv, spec, flow_rule);
 	mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->tuple.zone, MLX5_CT_ZONE_MASK);
 
 	zone_rule->rule = mlx5_tc_rule_insert(priv, spec, attr);

View file

@@ -305,5 +305,6 @@ static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev)
 bool mlx5_eth_supported(struct mlx5_core_dev *dev);
 bool mlx5_rdma_supported(struct mlx5_core_dev *dev);
 bool mlx5_vnet_supported(struct mlx5_core_dev *dev);
+bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev);
 
 #endif /* __MLX5_CORE_H__ */

View file

@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_CONNTRACK_ACT_CT_H
+#define _NF_CONNTRACK_ACT_CT_H
+
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+struct nf_conn_act_ct_ext {
+	int ifindex[IP_CT_DIR_MAX];
+};
+
+static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_find(const struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	return nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT);
+#else
+	return NULL;
+#endif
+}
+
+static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT);
+
+	if (act_ct)
+		return act_ct;
+
+	act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC);
+	return act_ct;
+#else
+	return NULL;
+#endif
+}
+
+static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo)
+{
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	struct nf_conn_act_ct_ext *act_ct_ext;
+
+	act_ct_ext = nf_conn_act_ct_ext_find(ct);
+	if (dev_net(skb->dev) == &init_net && act_ct_ext)
+		act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex;
+#endif
+}
+
+#endif /* _NF_CONNTRACK_ACT_CT_H */

View file

@@ -27,6 +27,9 @@ enum nf_ct_ext_id {
 #endif
 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
 	NF_CT_EXT_SYNPROXY,
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	NF_CT_EXT_ACT_CT,
 #endif
 	NF_CT_EXT_NUM,
 };
@@ -40,6 +43,7 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
 #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
 #define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
+#define NF_CT_EXT_ACT_CT_TYPE struct nf_conn_act_ct_ext
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {

View file

@@ -47,6 +47,7 @@
 #include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netns/hash.h>
@@ -2626,7 +2627,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
 static __always_inline unsigned int total_extension_size(void)
 {
 	/* remember to add new extensions below */
-	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
+	BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
 
 	return sizeof(struct nf_ct_ext) +
 	       sizeof(struct nf_conn_help)
@@ -2649,6 +2650,9 @@ static __always_inline unsigned int total_extension_size(void)
 #endif
 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
 	       + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	       + sizeof(struct nf_conn_act_ct_ext)
 #endif
 	;
 };

View file

@@ -25,6 +25,8 @@
 #include <net/netfilter/nf_nat.h>
 #endif
 
+#include <net/netfilter/nf_conntrack_act_ct.h>
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
@@ -1045,6 +1047,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			 */
 			nf_ct_set_tcp_be_liberal(ct);
 		}
+
+		nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
 	}
 
 	return 0;
@@ -1245,6 +1249,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 					 &info->labels.mask);
 		if (err)
 			return err;
+
+		nf_conn_act_ct_ext_add(ct);
 	} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
 		   labels_nonzero(&info->labels.mask)) {
 		err = ovs_ct_set_labels(ct, key, &info->labels.value,

View file

@@ -32,6 +32,7 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
 #include <uapi/linux/netfilter/nf_nat.h>
 
 static struct workqueue_struct *act_ct_wq;
@@ -56,6 +57,12 @@ static const struct rhashtable_params zones_params = {
 	.automatic_shrinking = true,
 };
 
+static struct nf_ct_ext_type act_ct_extend __read_mostly = {
+	.len = sizeof(struct nf_conn_act_ct_ext),
+	.align = __alignof__(struct nf_conn_act_ct_ext),
+	.id = NF_CT_EXT_ACT_CT,
+};
+
 static struct flow_action_entry *
 tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
 {
@@ -358,6 +365,7 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 				  struct nf_conn *ct,
 				  bool tcp)
 {
+	struct nf_conn_act_ct_ext *act_ct_ext;
 	struct flow_offload *entry;
 	int err;
 
@@ -375,6 +383,14 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 	}
 
+	act_ct_ext = nf_conn_act_ct_ext_find(ct);
+	if (act_ct_ext) {
+		entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
+			act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL];
+		entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
+			act_ct_ext->ifindex[IP_CT_DIR_REPLY];
+	}
+
 	err = flow_offload_add(&ct_ft->nf_ft, entry);
 	if (err)
 		goto err_add;
@@ -1027,6 +1043,7 @@ do_nat:
 		if (!ct)
 			goto out_push;
 		nf_ct_deliver_cached_events(ct);
+		nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
 
 		err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
 		if (err != NF_ACCEPT)
@@ -1036,6 +1053,9 @@ do_nat:
 		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
 		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
 
+		if (!nf_ct_is_confirmed(ct))
+			nf_conn_act_ct_ext_add(ct);
+
 		/* This will take care of sending queued events
 		 * even if the connection is already confirmed.
 		 */
@@ -1583,10 +1603,16 @@ static int __init ct_init_module(void)
 	if (err)
 		goto err_register;
 
+	err = nf_ct_extend_register(&act_ct_extend);
+	if (err)
+		goto err_register_extend;
+
 	static_branch_inc(&tcf_frag_xmit_count);
 
 	return 0;
 
+err_register_extend:
+	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
 err_register:
 	tcf_ct_flow_tables_uninit();
 err_tbl_init:
@@ -1597,6 +1623,7 @@ err_tbl_init:
 static void __exit ct_cleanup_module(void)
 {
 	static_branch_dec(&tcf_frag_xmit_count);
+	nf_ct_extend_unregister(&act_ct_extend);
 	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
 	tcf_ct_flow_tables_uninit();
 	destroy_workqueue(act_ct_wq);