mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag/mp.o lag/port_sel.o lib/geneve.o lib/port_tun.o \
en_rep.o en/rep/bond.o en/mod_hdr.o \
- en/mapping.o
+ en/mapping.o lag/mpesw.o
mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \
lib/fs_chains.o en/tc_tun.o \
esw/indir_table.o en/tc_tun_encap.o \
#include "en/tc_tun_encap.h"
#include "en/tc_priv.h"
#include "en_rep.h"
+#include "lag/lag.h"
static bool
same_vf_reps(struct mlx5e_priv *priv, struct net_device *out_dev)
struct net_device *uplink_dev;
struct mlx5e_priv *out_priv;
struct mlx5_eswitch *esw;
+ bool is_uplink_rep;
int *ifindexes;
int if_count;
int err;
parse_state->ifindexes[if_count] = out_dev->ifindex;
parse_state->if_count++;
+ is_uplink_rep = mlx5e_eswitch_uplink_rep(out_dev);
+ err = mlx5_lag_do_mirred(priv->mdev, out_dev);
+ if (err)
+ return err;
out_dev = get_fdb_out_dev(uplink_dev, out_dev);
if (!out_dev)
rpriv = out_priv->ppriv;
esw_attr->dests[esw_attr->out_count].rep = rpriv->rep;
esw_attr->dests[esw_attr->out_count].mdev = out_priv->mdev;
+
+ /* If output device is bond master then rules are not explicit
+ * so we don't attempt to count them.
+ */
+ if (is_uplink_rep && MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table) &&
+ MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up))
+ attr->lag.count = true;
+
esw_attr->out_count++;
return 0;
free_flow_post_acts(flow);
+ if (flow->attr->lag.count)
+ mlx5_lag_del_mpesw_rule(esw->dev);
+
kvfree(attr->esw_attr->rx_tun_attr);
kvfree(attr->parse_attr);
kfree(flow->attr);
same_hw_reps(priv, peer_netdev));
}
+static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev)
+{
+ if (mlx5e_eswitch_uplink_rep(out_dev) &&
+ MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table) &&
+ MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up))
+ return true;
+
+ return false;
+}
+
bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
struct net_device *out_dev)
{
if (is_merged_eswitch_vfs(priv, out_dev))
return true;
+ if (is_multiport_eligible(priv, out_dev))
+ return true;
+
if (is_lag_dev(priv, out_dev))
return true;
struct mlx5_core_dev *in_mdev)
{
struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct netlink_ext_ack *extack = f->common.extack;
struct mlx5e_tc_flow_parse_attr *parse_attr;
struct mlx5e_tc_flow *flow;
if (err)
goto err_free;
+ if (flow->attr->lag.count) {
+ err = mlx5_lag_add_mpesw_rule(esw->dev);
+ if (err)
+ goto err_free;
+ }
+
err = mlx5e_tc_add_fdb_flow(priv, flow, extack);
complete_all(&flow->init_done);
if (err) {
if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev)))
- goto err_free;
+ goto err_lag;
add_unready_flow(flow);
}
return flow;
+err_lag:
+ if (flow->attr->lag.count)
+ mlx5_lag_del_mpesw_rule(esw->dev);
err_free:
mlx5e_flow_put(priv, flow);
out:
u32 flags;
struct list_head list;
struct mlx5e_post_act_handle *post_act_handle;
+ struct {
+ /* Indicate whether the parsed flow should be counted for lag mode decision
+ * making
+ */
+ bool count;
+ } lag;
+ /* keep this union last */
union {
struct mlx5_esw_flow_attr esw_attr[0];
struct mlx5_nic_flow_attr nic_attr[0];
#include "en_tc.h"
#include "en/mapping.h"
#include "devlink.h"
+#include "lag/lag.h"
#define mlx5_esw_for_each_rep(esw, i, rep) \
xa_for_each(&((esw)->offloads.vport_reps), i, rep)
dest[dest_idx].vport.vhca_id =
MLX5_CAP_GEN(esw_attr->dests[attr_idx].mdev, vhca_id);
dest[dest_idx].vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID;
+ if (mlx5_lag_mpesw_is_activated(esw->dev))
+ dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_UPLINK;
}
if (esw_attr->dests[attr_idx].flags & MLX5_ESW_DEST_ENCAP) {
if (pkt_reformat) {
static char *get_str_mode_type(struct mlx5_lag *ldev)
{
- if (ldev->mode == MLX5_LAG_MODE_ROCE)
- return "roce";
- if (ldev->mode == MLX5_LAG_MODE_SRIOV)
- return "switchdev";
- if (ldev->mode == MLX5_LAG_MODE_MULTIPATH)
- return "multipath";
+ switch (ldev->mode) {
+ case MLX5_LAG_MODE_ROCE: return "roce";
+ case MLX5_LAG_MODE_SRIOV: return "switchdev";
+ case MLX5_LAG_MODE_MULTIPATH: return "multipath";
+ case MLX5_LAG_MODE_MPESW: return "multiport_eswitch";
+ default: return "invalid";
+ }
return NULL;
}
ldev = dev->priv.lag;
mutex_lock(&ldev->lock);
if (__mlx5_lag_is_active(ldev))
- mode = get_str_port_sel_mode(ldev->mode_flags);
+ mode = mlx5_get_str_port_sel_mode(ldev);
else
ret = -EINVAL;
mutex_unlock(&ldev->lock);
- if (ret || !mode)
+ if (ret)
return ret;
seq_printf(file, "%s\n", mode);
#include "esw/acl/ofld.h"
#include "lag.h"
#include "mp.h"
+#include "mpesw.h"
enum {
MLX5_LAG_EGRESS_PORT_1 = 1,
if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;
+ if (mode == MLX5_LAG_MODE_MPESW)
+ return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;
+
return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
}
if (ldev->nb.notifier_call)
unregister_netdevice_notifier_net(&init_net, &ldev->nb);
mlx5_lag_mp_cleanup(ldev);
- cancel_delayed_work_sync(&ldev->bond_work);
+ mlx5_lag_mpesw_cleanup(ldev);
+ cancel_work_sync(&ldev->mpesw_work);
destroy_workqueue(ldev->wq);
mutex_destroy(&ldev->lock);
kfree(ldev);
if (err)
mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
err);
+
+ mlx5_lag_mpesw_init(ldev);
ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
ldev->buckets = 1;
return 0;
}
-static int mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
- struct lag_tracker *tracker, unsigned long *flags)
+static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
+ struct lag_tracker *tracker,
+ enum mlx5_lag_mode mode,
+ unsigned long *flags)
{
struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
+ if (mode == MLX5_LAG_MODE_MPESW)
+ return;
+
if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
-
- return 0;
}
static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
if (roce_lag)
return mlx5_lag_set_port_sel_mode_roce(ldev, flags);
- return mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, flags);
+ mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
+ return 0;
}
-char *get_str_port_sel_mode(unsigned long flags)
+char *mlx5_get_str_port_sel_mode(struct mlx5_lag *ldev)
{
- if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
- return "hash";
- return "queue_affinity";
+ int port_sel_mode = get_port_sel_mode(ldev->mode, ldev->mode_flags);
+
+ switch (port_sel_mode) {
+ case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
+ case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
+ case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
+ default: return "invalid";
+ }
}
static int mlx5_create_lag(struct mlx5_lag *ldev,
u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
int err;
- mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
+ if (tracker)
+ mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
- shared_fdb, get_str_port_sel_mode(flags));
+ shared_fdb, mlx5_get_str_port_sel_mode(ldev));
err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
if (err) {
{
bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
- unsigned long flags;
+ unsigned long flags = 0;
int err;
err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
if (err)
return err;
- mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
- if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
- err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
- ldev->v2p_map);
- if (err) {
- mlx5_core_err(dev0,
- "Failed to create LAG port selection(%d)\n",
- err);
- return err;
+ if (mode != MLX5_LAG_MODE_MPESW) {
+ mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
+ if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
+ err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
+ ldev->v2p_map);
+ if (err) {
+ mlx5_core_err(dev0,
+ "Failed to create LAG port selection(%d)\n",
+ err);
+ return err;
+ }
}
}
return err;
}
- if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+ if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
!roce_lag)
mlx5_lag_drop_rule_setup(ldev, tracker);
}
}
-static void mlx5_disable_lag(struct mlx5_lag *ldev)
+void mlx5_disable_lag(struct mlx5_lag *ldev)
{
bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
}
}
-static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
+bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
{
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
return roce_lag;
}
+static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
+{
+ return do_bond && __mlx5_lag_is_active(ldev) &&
+ ldev->mode != MLX5_LAG_MODE_MPESW;
+}
+
+static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
+{
+ return !do_bond && __mlx5_lag_is_active(ldev) &&
+ ldev->mode != MLX5_LAG_MODE_MPESW;
+}
+
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
return;
}
}
- } else if (do_bond && __mlx5_lag_is_active(ldev)) {
+ } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
mlx5_modify_lag(ldev, &tracker);
- } else if (!do_bond && __mlx5_lag_is_active(ldev)) {
+ } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
mlx5_disable_lag(ldev);
}
}
return 1;
}
+/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
#include "mlx5_core.h"
#include "mp.h"
#include "port_sel.h"
+#include "mpesw.h"
enum {
MLX5_LAG_P1,
MLX5_LAG_MODE_ROCE,
MLX5_LAG_MODE_SRIOV,
MLX5_LAG_MODE_MULTIPATH,
+ MLX5_LAG_MODE_MPESW,
};
struct lag_func {
struct lag_tracker tracker;
struct workqueue_struct *wq;
struct delayed_work bond_work;
+ struct work_struct mpesw_work;
struct notifier_block nb;
struct lag_mp lag_mp;
struct mlx5_lag_port_sel port_sel;
/* Protect lag fields/state changes */
struct mutex lock;
+ struct lag_mpesw lag_mpesw;
};
static inline struct mlx5_lag *
bool shared_fdb);
int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
struct net_device *ndev);
+bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev);
+void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev);
+int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev);
-char *get_str_port_sel_mode(unsigned long flags);
+char *mlx5_get_str_port_sel_mode(struct mlx5_lag *ldev);
void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
u8 *ports, int *num_enabled);
void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev);
void mlx5_ldev_remove_debugfs(struct dentry *dbg);
+void mlx5_disable_lag(struct mlx5_lag *ldev);
#endif /* __MLX5_LAG_H__ */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include <linux/netdevice.h>
+#include <net/nexthop.h>
+#include "lag/lag.h"
+#include "eswitch.h"
+#include "lib/mlx5.h"
+
+void mlx5_mpesw_work(struct work_struct *work)
+{
+ struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, mpesw_work);
+
+ mutex_lock(&ldev->lock);
+ mlx5_disable_lag(ldev);
+ mutex_unlock(&ldev->lock);
+}
+
+static void mlx5_lag_disable_mpesw(struct mlx5_core_dev *dev)
+{
+ struct mlx5_lag *ldev = dev->priv.lag;
+
+ if (!queue_work(ldev->wq, &ldev->mpesw_work))
+ mlx5_core_warn(dev, "failed to queue work\n");
+}
+
+void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev)
+{
+ struct mlx5_lag *ldev = dev->priv.lag;
+
+ if (!ldev)
+ return;
+
+ mutex_lock(&ldev->lock);
+ if (!atomic_dec_return(&ldev->lag_mpesw.mpesw_rule_count) &&
+ ldev->mode == MLX5_LAG_MODE_MPESW)
+ mlx5_lag_disable_mpesw(dev);
+ mutex_unlock(&ldev->lock);
+}
+
+int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev)
+{
+ struct mlx5_lag *ldev = dev->priv.lag;
+ bool shared_fdb;
+ int err = 0;
+
+ if (!ldev)
+ return 0;
+
+ mutex_lock(&ldev->lock);
+ if (atomic_add_return(1, &ldev->lag_mpesw.mpesw_rule_count) != 1)
+ goto out;
+
+ if (ldev->mode != MLX5_LAG_MODE_NONE) {
+ err = -EINVAL;
+ goto out;
+ }
+ shared_fdb = mlx5_shared_fdb_supported(ldev);
+ err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, shared_fdb);
+ if (err)
+ mlx5_core_warn(dev, "Failed to create LAG in MPESW mode (%d)\n", err);
+
+out:
+ mutex_unlock(&ldev->lock);
+ return err;
+}
+
+int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev)
+{
+ struct mlx5_lag *ldev = mdev->priv.lag;
+
+ if (!netif_is_bond_master(out_dev) || !ldev)
+ return 0;
+
+ mutex_lock(&ldev->lock);
+ if (ldev->mode == MLX5_LAG_MODE_MPESW) {
+ mutex_unlock(&ldev->lock);
+ return -EOPNOTSUPP;
+ }
+ mutex_unlock(&ldev->lock);
+ return 0;
+}
+
+bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev)
+{
+ bool ret;
+
+ ret = dev->priv.lag && dev->priv.lag->mode == MLX5_LAG_MODE_MPESW;
+ return ret;
+}
+
+void mlx5_lag_mpesw_init(struct mlx5_lag *ldev)
+{
+ INIT_WORK(&ldev->mpesw_work, mlx5_mpesw_work);
+ atomic_set(&ldev->lag_mpesw.mpesw_rule_count, 0);
+}
+
+void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev)
+{
+ cancel_delayed_work_sync(&ldev->bond_work);
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#ifndef __MLX5_LAG_MPESW_H__
+#define __MLX5_LAG_MPESW_H__
+
+#include "lag.h"
+#include "mlx5_core.h"
+
+struct lag_mpesw {
+ struct work_struct mpesw_work;
+ atomic_t mpesw_rule_count;
+};
+
+void mlx5_mpesw_work(struct work_struct *work);
+int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev);
+bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev);
+#if IS_ENABLED(CONFIG_MLX5_ESWITCH)
+void mlx5_lag_mpesw_init(struct mlx5_lag *ldev);
+void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev);
+#else
+void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) {}
+void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) {}
+#endif
+
+#endif /* __MLX5_LAG_MPESW_H__ */
u8 vhca_resource_manager[0x1];
u8 hca_cap_2[0x1];
- u8 reserved_at_21[0x1];
+ u8 create_lag_when_not_master_up[0x1];
u8 dtor[0x1];
u8 event_on_vhca_state_teardown_request[0x1];
u8 event_on_vhca_state_in_use[0x1];
enum {
MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY = 0,
- MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT,
+ MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT = 1,
+ MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW = 2,
};
struct mlx5_ifc_lagc_bits {