ice: enable ndo_setup_tc support for mqprio_qdisc
author Kiran Patil <kiran.patil@intel.com>
Fri, 15 Oct 2021 23:35:16 +0000 (16:35 -0700)
committer Tony Nguyen <anthony.l.nguyen@intel.com>
Wed, 20 Oct 2021 22:58:11 +0000 (15:58 -0700)
Add support in the driver for TC_SETUP_QDISC_MQPRIO. This support
enables instantiation of channels in HW using the existing MQPRIO
infrastructure, which is extended to be offloadable. This
provides a mechanism to configure a dedicated set of queues for
each TC.

Configuring channels using "tc mqprio":
--------------------------------------
tc qdisc add dev <ethX> root mqprio num_tc 3 map 0 1 2 \
queues 4@0 4@4 4@8  hw 1 mode channel

The above command configures 3 TCs having 4 queues each. "hw 1 mode channel"
implies offload of the channel configuration to HW. When the driver processes
the configuration received via "ndo_setup_tc: TC_SETUP_QDISC_MQPRIO", each
TC maps to a HW VSI with the specified queues.

The user can optionally specify min and max bandwidth rate limits per TC
(see example below). If shaper params such as min and/or max bandwidth
rate limits are specified, the driver configures a VSI-specific rate limiter
in HW.

Configuring channels and bandwidth shaper parameters using "tc mqprio":
----------------------------------------------------------------
tc qdisc add dev <ethX> root mqprio \
num_tc 4 map 0 1 2 3 queues 4@0 4@4 4@8 4@12 hw 1 mode channel \
shaper bw_rlimit min_rate 1Gbit 2Gbit 3Gbit 4Gbit \
max_rate 4Gbit 5Gbit 6Gbit 7Gbit

Command to view configured TCs:
-----------------------------
tc qdisc show dev <ethX>

Deleting TCs:
------------
tc qdisc del dev <ethX> root mqprio

Signed-off-by: Kiran Patil <kiran.patil@intel.com>
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>
Tested-by: Bharathi Sreenivas <bharathi.sreenivas@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
drivers/net/ethernet/intel/ice/ice.h
drivers/net/ethernet/intel/ice/ice_ethtool.c
drivers/net/ethernet/intel/ice/ice_lib.c
drivers/net/ethernet/intel/ice/ice_lib.h
drivers/net/ethernet/intel/ice/ice_main.c

index c2814e3..4f94ecf 100644 (file)
@@ -56,6 +56,7 @@
 #include "ice_dcb.h"
 #include "ice_switch.h"
 #include "ice_common.h"
+#include "ice_flow.h"
 #include "ice_sched.h"
 #include "ice_idc_int.h"
 #include "ice_virtchnl_pf.h"
 #define ICE_TX_CTX_DESC(R, i) (&(((struct ice_tx_ctx_desc *)((R)->desc))[i]))
 #define ICE_TX_FDIRDESC(R, i) (&(((struct ice_fltr_desc *)((R)->desc))[i]))
 
+/* Minimum BW limit is 500 Kbps for any scheduler node */
+#define ICE_MIN_BW_LIMIT               500
+/* User can specify BW in Kbit/Mbit/Gbit and the OS converts it to bytes/s.
+ * Divide by this value to convert the user-specified BW limit into Kbps.
+ */
+#define ICE_BW_KBPS_DIVISOR            125
+
 /* Macro for each VSI in a PF */
 #define ice_for_each_vsi(pf, i) \
        for ((i) = 0; (i) < (pf)->num_alloc_vsi; (i)++)
@@ -440,6 +448,8 @@ struct ice_q_vector {
        cpumask_t affinity_mask;
        struct irq_affinity_notify affinity_notify;
 
+       struct ice_channel *ch;
+
        char name[ICE_INT_NAME_STR_LEN];
 
        u16 total_events;       /* net_dim(): number of interrupts processed */
@@ -596,6 +606,17 @@ struct ice_netdev_priv {
 };
 
 /**
+ * ice_vector_ch_enabled - check whether a channel is attached to a vector
+ * @qv: pointer to q_vector; must not be NULL, it is dereferenced unchecked
+ *
+ * Return: true if @qv->ch is set (vector is channel enabled), otherwise false
+ */
+static inline bool ice_vector_ch_enabled(struct ice_q_vector *qv)
+{
+       return !!qv->ch; /* Enable it to run with TC */
+}
+
+/**
  * ice_irq_dynamic_ena - Enable default interrupt generation settings
  * @hw: pointer to HW struct
  * @vsi: pointer to VSI struct, can be NULL
index 8b3eef6..cfe96a1 100644 (file)
@@ -3194,6 +3194,11 @@ ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
                return -EIO;
        }
 
+       if (ice_is_adq_active(pf)) {
+               netdev_err(netdev, "Cannot change RSS params with ADQ configured.\n");
+               return -EOPNOTSUPP;
+       }
+
        if (key) {
                if (!vsi->rss_hkey_user) {
                        vsi->rss_hkey_user =
@@ -3404,6 +3409,11 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch)
        if (ch->other_count != (test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 1U : 0U))
                return -EINVAL;
 
+       if (ice_is_adq_active(pf)) {
+               netdev_err(dev, "Cannot set channels with ADQ configured.\n");
+               return -EOPNOTSUPP;
+       }
+
        if (test_bit(ICE_FLAG_FD_ENA, pf->flags) && pf->hw.fdir_active_fltr) {
                netdev_err(dev, "Cannot set channels when Flow Director filters are active\n");
                return -EOPNOTSUPP;
index 62adbb3..4904ae0 100644 (file)
@@ -3954,7 +3954,7 @@ int ice_get_link_speed_mbps(struct ice_vsi *vsi)
  *
  * Return current VSI link speed and 0 if the speed is unknown.
  */
-static int ice_get_link_speed_kbps(struct ice_vsi *vsi)
+int ice_get_link_speed_kbps(struct ice_vsi *vsi)
 {
        int speed_mbps;
 
index b44ceff..e7f4ecb 100644 (file)
@@ -124,6 +124,7 @@ int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi);
 int ice_clear_dflt_vsi(struct ice_sw *sw);
 int ice_set_min_bw_limit(struct ice_vsi *vsi, u64 min_tx_rate);
 int ice_set_max_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate);
+int ice_get_link_speed_kbps(struct ice_vsi *vsi);
 int ice_get_link_speed_mbps(struct ice_vsi *vsi);
 int
 ice_vsi_update_security(struct ice_vsi *vsi, void (*fill)(struct ice_vsi_ctx *));
index 20c9128..ff2d000 100644 (file)
@@ -55,6 +55,9 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type);
 
 static void ice_vsi_release_all(struct ice_pf *pf);
 
+static int ice_rebuild_channels(struct ice_pf *pf);
+static void ice_remove_q_channels(struct ice_vsi *vsi, bool rem_adv_fltr);
+
 bool netif_is_ice(struct net_device *dev)
 {
        return dev && (dev->netdev_ops == &ice_netdev_ops);
@@ -106,7 +109,12 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf)
        ice_for_each_txq(vsi, i) {
                struct ice_tx_ring *tx_ring = vsi->tx_rings[i];
 
-               if (tx_ring && tx_ring->desc) {
+               if (!tx_ring)
+                       continue;
+               if (ice_ring_ch_enabled(tx_ring))
+                       continue;
+
+               if (tx_ring->desc) {
                        /* If packet counter has not changed the queue is
                         * likely stalled, so force an interrupt for this
                         * queue.
@@ -458,17 +466,21 @@ static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked)
 }
 
 /**
- * ice_prepare_for_reset - prep for the core to reset
+ * ice_prepare_for_reset - prep for reset
  * @pf: board private structure
+ * @reset_type: reset type requested
  *
  * Inform or close all dependent features in prep for reset.
  */
 static void
-ice_prepare_for_reset(struct ice_pf *pf)
+ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 {
        struct ice_hw *hw = &pf->hw;
+       struct ice_vsi *vsi;
        unsigned int i;
 
+       dev_dbg(ice_pf_to_dev(pf), "reset_type=%d\n", reset_type);
+
        /* already prepared for reset */
        if (test_bit(ICE_PREPARED_FOR_RESET, pf->state))
                return;
@@ -483,6 +495,38 @@ ice_prepare_for_reset(struct ice_pf *pf)
        ice_for_each_vf(pf, i)
                ice_set_vf_state_qs_dis(&pf->vf[i]);
 
+       /* release ADQ specific HW and SW resources */
+       vsi = ice_get_main_vsi(pf);
+       if (!vsi)
+               goto skip;
+
+       /* to be on safe side, reset orig_rss_size so that normal flow
+        * of deciding rss_size can take precedence
+        */
+       vsi->orig_rss_size = 0;
+
+       if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) {
+               if (reset_type == ICE_RESET_PFR) {
+                       vsi->old_ena_tc = vsi->all_enatc;
+                       vsi->old_numtc = vsi->all_numtc;
+               } else {
+                       ice_remove_q_channels(vsi, true);
+
+                       /* for other reset type, do not support channel rebuild
+                        * hence reset needed info
+                        */
+                       vsi->old_ena_tc = 0;
+                       vsi->all_enatc = 0;
+                       vsi->old_numtc = 0;
+                       vsi->all_numtc = 0;
+                       vsi->req_txq = 0;
+                       vsi->req_rxq = 0;
+                       clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags);
+                       memset(&vsi->mqprio_qopt, 0, sizeof(vsi->mqprio_qopt));
+               }
+       }
+skip:
+
        /* clear SW filtering DB */
        ice_clear_hw_tbls(hw);
        /* disable the VSIs and their queues that are not already DOWN */
@@ -502,8 +546,7 @@ ice_prepare_for_reset(struct ice_pf *pf)
 /**
  * ice_do_reset - Initiate one of many types of resets
  * @pf: board private structure
- * @reset_type: reset type requested
- * before this function was called.
+ * @reset_type: reset type requested before this function was called.
  */
 static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 {
@@ -512,7 +555,7 @@ static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 
        dev_dbg(dev, "reset_type 0x%x requested\n", reset_type);
 
-       ice_prepare_for_reset(pf);
+       ice_prepare_for_reset(pf, reset_type);
 
        /* trigger the reset */
        if (ice_reset(hw, reset_type)) {
@@ -570,7 +613,7 @@ static void ice_reset_subtask(struct ice_pf *pf)
                /* return if no valid reset type requested */
                if (reset_type == ICE_RESET_INVAL)
                        return;
-               ice_prepare_for_reset(pf);
+               ice_prepare_for_reset(pf, reset_type);
 
                /* make sure we are ready to rebuild */
                if (ice_check_reset(&pf->hw)) {
@@ -3243,6 +3286,13 @@ ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
        return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID, NULL);
 }
 
+static struct ice_vsi *
+ice_chnl_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
+                  struct ice_channel *ch)
+{
+       return ice_vsi_setup(pf, pi, ICE_VSI_CHNL, ICE_INVAL_VFID, ch);
+}
+
 /**
  * ice_ctrl_vsi_setup - Set up a control VSI
  * @pf: board private structure
@@ -3361,6 +3411,9 @@ static int ice_setup_pf_sw(struct ice_pf *pf)
        if (!vsi)
                return -ENOMEM;
 
+       /* init channel list */
+       INIT_LIST_HEAD(&vsi->ch_list);
+
        status = ice_cfg_netdev(vsi);
        if (status) {
                status = -ENODEV;
@@ -4956,7 +5009,7 @@ ice_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t err)
 
                if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) {
                        set_bit(ICE_PFR_REQ, pf->state);
-                       ice_prepare_for_reset(pf);
+                       ice_prepare_for_reset(pf, ICE_RESET_PFR);
                }
        }
 
@@ -5048,7 +5101,7 @@ static void ice_pci_err_reset_prepare(struct pci_dev *pdev)
 
                if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) {
                        set_bit(ICE_PFR_REQ, pf->state);
-                       ice_prepare_for_reset(pf);
+                       ice_prepare_for_reset(pf, ICE_RESET_PFR);
                }
        }
 }
@@ -5453,6 +5506,11 @@ ice_set_features(struct net_device *netdev, netdev_features_t features)
                ice_clear_arfs(vsi);
        }
 
+       /* don't turn off hw_tc_offload when ADQ is already enabled */
+       if (!(features & NETIF_F_HW_TC) && ice_is_adq_active(pf)) {
+               dev_err(ice_pf_to_dev(pf), "ADQ is active, can't turn hw_tc_offload off\n");
+               return -EACCES;
+       }
        return ret;
 }
 
@@ -6307,6 +6365,9 @@ static void ice_vsi_release_all(struct ice_pf *pf)
                if (!pf->vsi[i])
                        continue;
 
+               if (pf->vsi[i]->type == ICE_VSI_CHNL)
+                       continue;
+
                err = ice_vsi_release(pf->vsi[i]);
                if (err)
                        dev_dbg(ice_pf_to_dev(pf), "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n",
@@ -6517,6 +6578,15 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
                goto err_vsi_rebuild;
        }
 
+       if (reset_type == ICE_RESET_PFR) {
+               err = ice_rebuild_channels(pf);
+               if (err) {
+                       dev_err(dev, "failed to rebuild and replay ADQ VSIs, err %d\n",
+                               err);
+                       goto err_vsi_rebuild;
+               }
+       }
+
        /* If Flow Director is active */
        if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) {
                err = ice_vsi_rebuild_by_type(pf, ICE_VSI_CTRL);
@@ -7183,6 +7253,799 @@ ice_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
        }
 }
 
+/**
+ * ice_validate_mqprio_qopt - Validate TCF input parameters
+ * @vsi: Pointer to VSI
+ * @mqprio_qopt: input parameters for mqprio queue configuration
+ *
+ * Check per-TC qcount/offset (contiguous ranges, one distinct non power of 2
+ * qcount at most) and min/max BW rates, then cache the largest qcount in
+ * vsi->ch_rss_size. Return: 0 if the config is acceptable, -EINVAL otherwise.
+ */
+static int
+ice_validate_mqprio_qopt(struct ice_vsi *vsi,
+                        struct tc_mqprio_qopt_offload *mqprio_qopt)
+{
+       u64 sum_max_rate = 0, sum_min_rate = 0;
+       int non_power_of_2_qcount = 0;
+       struct ice_pf *pf = vsi->back;
+       int max_rss_q_cnt = 0;
+       struct device *dev;
+       int i, speed;
+       u8 num_tc;
+
+       if (vsi->type != ICE_VSI_PF)
+               return -EINVAL;
+
+       if (mqprio_qopt->qopt.offset[0] != 0 ||
+           mqprio_qopt->qopt.num_tc < 1 ||
+           mqprio_qopt->qopt.num_tc > ICE_CHNL_MAX_TC)
+               return -EINVAL;
+
+       dev = ice_pf_to_dev(pf);
+       vsi->ch_rss_size = 0;
+       num_tc = mqprio_qopt->qopt.num_tc;
+
+       for (i = 0; num_tc; i++) {
+               int qcount = mqprio_qopt->qopt.count[i];
+               u64 max_rate, min_rate, rem;
+
+               if (!qcount)
+                       return -EINVAL;
+
+               if (is_power_of_2(qcount)) {
+                       if (non_power_of_2_qcount &&
+                           qcount > non_power_of_2_qcount) {
+                               dev_err(dev, "qcount[%d] cannot be greater than non power of 2 qcount[%d]\n",
+                                       qcount, non_power_of_2_qcount);
+                               return -EINVAL;
+                       }
+                       if (qcount > max_rss_q_cnt)
+                               max_rss_q_cnt = qcount;
+               } else {
+                       if (non_power_of_2_qcount &&
+                           qcount != non_power_of_2_qcount) {
+                               dev_err(dev, "Only one non power of 2 qcount allowed[%d,%d]\n",
+                                       qcount, non_power_of_2_qcount);
+                               return -EINVAL;
+                       }
+                       if (qcount < max_rss_q_cnt) {
+                               dev_err(dev, "non power of 2 qcount[%d] cannot be less than other qcount[%d]\n",
+                                       qcount, max_rss_q_cnt);
+                               return -EINVAL;
+                       }
+                       max_rss_q_cnt = qcount;
+                       non_power_of_2_qcount = qcount;
+               }
+
+               /* TC command takes input in K/M/Gbps or K/M/Gbit etc but
+                * converts the bandwidth rate limit into Bytes/s when
+                * passing it down to the driver. So convert input bandwidth
+                * from Bytes/s to Kbps
+                */
+               max_rate = mqprio_qopt->max_rate[i];
+               max_rate = div_u64(max_rate, ICE_BW_KBPS_DIVISOR);
+               sum_max_rate += max_rate;
+
+               /* min_rate is the minimum guaranteed rate; zero passes the checks below */
+               min_rate = mqprio_qopt->min_rate[i];
+               min_rate = div_u64(min_rate, ICE_BW_KBPS_DIVISOR);
+               sum_min_rate += min_rate;
+
+               if (min_rate && min_rate < ICE_MIN_BW_LIMIT) {
+                       dev_err(dev, "TC%d: min_rate(%llu Kbps) < %u Kbps\n", i,
+                               min_rate, ICE_MIN_BW_LIMIT);
+                       return -EINVAL;
+               }
+
+               iter_div_u64_rem(min_rate, ICE_MIN_BW_LIMIT, &rem);
+               if (rem) {
+                       dev_err(dev, "TC%d: Min Rate not multiple of %u Kbps\n",
+                               i, ICE_MIN_BW_LIMIT);
+                       return -EINVAL;
+               }
+
+               iter_div_u64_rem(max_rate, ICE_MIN_BW_LIMIT, &rem);
+               if (rem) {
+                       dev_err(dev, "TC%d: Max Rate not multiple of %u Kbps\n",
+                               i, ICE_MIN_BW_LIMIT);
+                       return -EINVAL;
+               }
+
+               /* min_rate can't be more than max_rate, except when max_rate
+                * is zero (implies max_rate sought is max line rate). In such
+                * a case min_rate can be more than max.
+                */
+               if (max_rate && min_rate > max_rate) {
+                       dev_err(dev, "min_rate %llu Kbps can't be more than max_rate %llu Kbps\n",
+                               min_rate, max_rate);
+                       return -EINVAL;
+               }
+
+               if (i >= mqprio_qopt->qopt.num_tc - 1)
+                       break;
+               if (mqprio_qopt->qopt.offset[i + 1] !=
+                   (mqprio_qopt->qopt.offset[i] + qcount))
+                       return -EINVAL;
+       }
+       if (vsi->num_rxq <
+           (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i]))
+               return -EINVAL;
+       if (vsi->num_txq <
+           (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i]))
+               return -EINVAL;
+
+       speed = ice_get_link_speed_kbps(vsi);
+       if (sum_max_rate && sum_max_rate > (u64)speed) {
+               dev_err(dev, "Invalid max Tx rate(%llu) Kbps > speed(%d) Kbps specified\n",
+                       sum_max_rate, speed);
+               return -EINVAL;
+       }
+       if (sum_min_rate && sum_min_rate > (u64)speed) {
+               dev_err(dev, "Invalid min Tx rate(%llu) Kbps > speed (%d) Kbps specified\n",
+                       sum_min_rate, speed);
+               return -EINVAL;
+       }
+
+       /* make sure vsi->ch_rss_size is set correctly based on TC's qcount */
+       vsi->ch_rss_size = max_rss_q_cnt;
+
+       return 0;
+}
+
+/**
+ * ice_add_channel - add a channel by adding VSI
+ * @pf: ptr to PF device
+ * @sw_id: underlying HW switching element ID
+ * @ch: ptr to channel structure
+ *
+ * Create the channel VSI and link it to @ch. Return: 0 or -EINVAL on failure
+ */
+static int ice_add_channel(struct ice_pf *pf, u16 sw_id, struct ice_channel *ch)
+{
+       struct device *dev = ice_pf_to_dev(pf);
+       struct ice_vsi *vsi;
+
+       if (ch->type != ICE_VSI_CHNL) {
+               dev_err(dev, "add new VSI failed, ch->type %d\n", ch->type);
+               return -EINVAL;
+       }
+
+       vsi = ice_chnl_vsi_setup(pf, pf->hw.port_info, ch);
+       if (!vsi || vsi->type != ICE_VSI_CHNL) {
+               dev_err(dev, "create chnl VSI failure\n");
+               return -EINVAL;
+       }
+
+       ch->sw_id = sw_id;
+       ch->vsi_num = vsi->vsi_num;
+       ch->info.mapping_flags = vsi->info.mapping_flags;
+       ch->ch_vsi = vsi;
+       /* set the back pointer of channel for newly created VSI */
+       vsi->ch = ch;
+
+       memcpy(&ch->info.q_mapping, &vsi->info.q_mapping,
+              sizeof(vsi->info.q_mapping));
+       memcpy(&ch->info.tc_mapping, vsi->info.tc_mapping,
+              sizeof(vsi->info.tc_mapping));
+
+       return 0;
+}
+
+/**
+ * ice_chnl_cfg_res - bind a channel to its rings and vectors
+ * @vsi: the VSI being setup
+ * @ch: ptr to channel structure
+ *
+ * Point the Tx/Rx rings of @ch and their vectors at @ch; rewrite static ITR.
+ */
+static void ice_chnl_cfg_res(struct ice_vsi *vsi, struct ice_channel *ch)
+{
+       int i;
+
+       for (i = 0; i < ch->num_txq; i++) {
+               struct ice_q_vector *tx_q_vector, *rx_q_vector;
+               struct ice_ring_container *rc;
+               struct ice_tx_ring *tx_ring;
+               struct ice_rx_ring *rx_ring;
+
+               tx_ring = vsi->tx_rings[ch->base_q + i];
+               rx_ring = vsi->rx_rings[ch->base_q + i];
+               if (!tx_ring || !rx_ring)
+                       continue;
+
+               /* mark both rings as channel enabled */
+               tx_ring->ch = ch;
+               rx_ring->ch = ch;
+
+               /* following code block sets up vector specific attributes */
+               tx_q_vector = tx_ring->q_vector;
+               rx_q_vector = rx_ring->q_vector;
+               if (!tx_q_vector && !rx_q_vector)
+                       continue;
+
+               if (tx_q_vector) {
+                       tx_q_vector->ch = ch;
+                       /* write the Tx ITR setting if DIM is off */
+                       rc = &tx_q_vector->tx;
+                       if (!ITR_IS_DYNAMIC(rc))
+                               ice_write_itr(rc, rc->itr_setting);
+               }
+               if (rx_q_vector) {
+                       rx_q_vector->ch = ch;
+                       /* write the Rx ITR setting if DIM is off */
+                       rc = &rx_q_vector->rx;
+                       if (!ITR_IS_DYNAMIC(rc))
+                               ice_write_itr(rc, rc->itr_setting);
+               }
+       }
+
+       /* it is safe to assume that, if channel has non-zero num_t[r]xq, then
+        * GLINT_ITR register would have been written to perform in-context
+        * update, hence perform flush
+        */
+       if (ch->num_txq || ch->num_rxq)
+               ice_flush(&vsi->back->hw);
+}
+
+/**
+ * ice_cfg_chnl_all_res - configure channel resources
+ * @vsi: ptr to main_vsi
+ * @ch: ptr to channel structure
+ *
+ * This function configures channel specific resources; at present it only
+ * calls ice_chnl_cfg_res() (queues, vectors, ITR settings)
+ */
+static void
+ice_cfg_chnl_all_res(struct ice_vsi *vsi, struct ice_channel *ch)
+{
+       /* configure channel (aka ADQ) resources such as queues, vectors,
+        * ITR settings for channel specific vectors and anything else
+        */
+       ice_chnl_cfg_res(vsi, ch);
+}
+
+/**
+ * ice_setup_hw_channel - setup new channel
+ * @pf: ptr to PF device
+ * @vsi: the VSI being setup
+ * @ch: ptr to channel structure
+ * @sw_id: underlying HW switching element ID
+ * @type: channel type; ice_add_channel() rejects all but ICE_VSI_CHNL
+ *
+ * Create the channel VSI at vsi->next_base_q, bind its rings and vectors, then
+ * advance vsi->next_base_q. Return: 0 or the error from ice_add_channel()
+ */
+static int
+ice_setup_hw_channel(struct ice_pf *pf, struct ice_vsi *vsi,
+                    struct ice_channel *ch, u16 sw_id, u8 type)
+{
+       struct device *dev = ice_pf_to_dev(pf);
+       int ret;
+
+       ch->base_q = vsi->next_base_q;
+       ch->type = type;
+
+       ret = ice_add_channel(pf, sw_id, ch);
+       if (ret) {
+               dev_err(dev, "failed to add_channel using sw_id %u\n", sw_id);
+               return ret;
+       }
+
+       /* configure/setup ADQ specific resources */
+       ice_cfg_chnl_all_res(vsi, ch);
+
+       /* make sure to update the next_base_q so that subsequent channel's
+        * (aka ADQ) VSI queue map is correct
+        */
+       vsi->next_base_q = vsi->next_base_q + ch->num_rxq;
+       dev_dbg(dev, "added channel: vsi_num %u, num_rxq %u\n", ch->vsi_num,
+               ch->num_rxq);
+
+       return 0;
+}
+
+/**
+ * ice_setup_channel - setup new channel using uplink element
+ * @pf: ptr to PF device
+ * @vsi: the VSI being setup
+ * @ch: ptr to channel structure
+ *
+ * Create an ICE_VSI_CHNL channel on the first switch element; @vsi must be the
+ * PF VSI. Return: true if the channel VSI was created, false otherwise
+ */
+static bool
+ice_setup_channel(struct ice_pf *pf, struct ice_vsi *vsi,
+                 struct ice_channel *ch)
+{
+       struct device *dev = ice_pf_to_dev(pf);
+       u16 sw_id;
+       int ret;
+
+       if (vsi->type != ICE_VSI_PF) {
+               dev_err(dev, "unsupported parent VSI type(%d)\n", vsi->type);
+               return false;
+       }
+
+       sw_id = pf->first_sw->sw_id;
+
+       /* create channel (VSI) */
+       ret = ice_setup_hw_channel(pf, vsi, ch, sw_id, ICE_VSI_CHNL);
+       if (ret) {
+               dev_err(dev, "failed to setup hw_channel\n");
+               return false;
+       }
+       dev_dbg(dev, "successfully created channel()\n");
+
+       return ch->ch_vsi ? true : false;
+}
+
+/**
+ * ice_set_bw_limit - set min, then max Tx BW limit; returns 0 or first error
+ * @vsi: VSI to be configured
+ * @max_tx_rate: max Tx rate in Kbps to be configured as maximum BW limit
+ * @min_tx_rate: min Tx rate in Kbps to be configured as minimum BW limit
+ */
+static int
+ice_set_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate, u64 min_tx_rate)
+{
+       int err;
+
+       err = ice_set_min_bw_limit(vsi, min_tx_rate);
+       if (err)
+               return err;
+
+       return ice_set_max_bw_limit(vsi, max_tx_rate);
+}
+
+/**
+ * ice_create_q_channel - function to create channel
+ * @vsi: VSI to be configured
+ * @ch: ptr to channel (it contains channel specific params)
+ *
+ * Create the channel VSI for @ch and apply its BW limits; a BW failure is only
+ * logged. Return: 0 on success, -EINVAL on bad input or setup failure.
+ */
+static int ice_create_q_channel(struct ice_vsi *vsi, struct ice_channel *ch)
+{
+       struct ice_pf *pf = vsi->back;
+       struct device *dev;
+
+       if (!ch)
+               return -EINVAL;
+
+       dev = ice_pf_to_dev(pf);
+       if (!ch->num_txq || !ch->num_rxq) {
+               dev_err(dev, "Invalid num_queues requested: %d\n", ch->num_rxq);
+               return -EINVAL;
+       }
+
+       if (!vsi->cnt_q_avail || vsi->cnt_q_avail < ch->num_txq) {
+               dev_err(dev, "cnt_q_avail (%u) less than num_queues %d\n",
+                       vsi->cnt_q_avail, ch->num_txq);
+               return -EINVAL;
+       }
+
+       if (!ice_setup_channel(pf, vsi, ch)) {
+               dev_info(dev, "Failed to setup channel\n");
+               return -EINVAL;
+       }
+       /* configure BW rate limit; skipped when both rates are zero */
+       if (ch->ch_vsi && (ch->max_tx_rate || ch->min_tx_rate)) {
+               int ret;
+
+               ret = ice_set_bw_limit(ch->ch_vsi, ch->max_tx_rate,
+                                      ch->min_tx_rate);
+               if (ret)
+                       dev_err(dev, "failed to set Tx rate of %llu Kbps for VSI(%u)\n",
+                               ch->max_tx_rate, ch->ch_vsi->vsi_num);
+               else
+                       dev_dbg(dev, "set Tx rate of %llu Kbps for VSI(%u)\n",
+                               ch->max_tx_rate, ch->ch_vsi->vsi_num);
+       }
+
+       vsi->cnt_q_avail -= ch->num_txq;
+
+       return 0;
+}
+
+/**
+ * ice_remove_q_channels - Remove queue channels for the TCs
+ * @vsi: VSI to be configured
+ * @rem_fltr: delete advanced switch filter or not (unused in this function)
+ *
+ * Free every channel on vsi->ch_list with its VSI and reset main VSI TC info
+ */
+static void ice_remove_q_channels(struct ice_vsi *vsi, bool __maybe_unused rem_fltr)
+{
+       struct ice_channel *ch, *ch_tmp;
+       int i;
+
+       /* perform cleanup for channels if they exist */
+       list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) {
+               struct ice_vsi *ch_vsi;
+
+               list_del(&ch->list);
+               ch_vsi = ch->ch_vsi;
+               if (!ch_vsi) {
+                       kfree(ch);
+                       continue;
+               }
+
+               /* Detach the channel from the main VSI's rings and vectors */
+               for (i = 0; i < ch->num_rxq; i++) {
+                       struct ice_tx_ring *tx_ring;
+                       struct ice_rx_ring *rx_ring;
+
+                       tx_ring = vsi->tx_rings[ch->base_q + i];
+                       rx_ring = vsi->rx_rings[ch->base_q + i];
+                       if (tx_ring) {
+                               tx_ring->ch = NULL;
+                               if (tx_ring->q_vector)
+                                       tx_ring->q_vector->ch = NULL;
+                       }
+                       if (rx_ring) {
+                               rx_ring->ch = NULL;
+                               if (rx_ring->q_vector)
+                                       rx_ring->q_vector->ch = NULL;
+                       }
+               }
+
+               /* clear the VSI from scheduler tree */
+               ice_rm_vsi_lan_cfg(ch->ch_vsi->port_info, ch->ch_vsi->idx);
+
+               /* Delete VSI from FW */
+               ice_vsi_delete(ch->ch_vsi);
+
+               /* Delete VSI from PF and HW VSI arrays */
+               ice_vsi_clear(ch->ch_vsi);
+
+               /* free the channel */
+               kfree(ch);
+       }
+
+       /* clear the channel VSI map which is stored in main VSI */
+       ice_for_each_chnl_tc(i)
+               vsi->tc_map_vsi[i] = NULL;
+
+       /* reset main VSI's all TC information */
+       vsi->all_enatc = 0;
+       vsi->all_numtc = 0;
+}
+
+/**
+ * ice_rebuild_channels - rebuild channel
+ * @pf: ptr to PF
+ *
+ * Recreate channel VSIs and replay filters. Return: 0 on success, else error
+ */
+static int ice_rebuild_channels(struct ice_pf *pf)
+{
+       struct device *dev = ice_pf_to_dev(pf);
+       struct ice_vsi *main_vsi;
+       bool rem_adv_fltr = true;
+       struct ice_channel *ch;
+       struct ice_vsi *vsi;
+       int tc_idx = 1;
+       int i, err;
+
+       main_vsi = ice_get_main_vsi(pf);
+       if (!main_vsi)
+               return 0;
+
+       if (!test_bit(ICE_FLAG_TC_MQPRIO, pf->flags) ||
+           main_vsi->old_numtc == 1)
+               return 0; /* nothing to be done */
+
+       /* reconfigure main VSI based on old value of TC and cached values
+        * for MQPRIO opts
+        */
+       err = ice_vsi_cfg_tc(main_vsi, main_vsi->old_ena_tc);
+       if (err) {
+               dev_err(dev, "failed configuring TC(ena_tc:0x%02x) for HW VSI=%u\n",
+                       main_vsi->old_ena_tc, main_vsi->vsi_num);
+               return err;
+       }
+
+       /* rebuild ADQ VSIs */
+       ice_for_each_vsi(pf, i) {
+               enum ice_vsi_type type;
+
+               vsi = pf->vsi[i];
+               if (!vsi || vsi->type != ICE_VSI_CHNL)
+                       continue;
+
+               type = vsi->type;
+
+               /* rebuild ADQ VSI */
+               err = ice_vsi_rebuild(vsi, true);
+               if (err) {
+                       dev_err(dev, "VSI (type:%s) at index %d rebuild failed, err %d\n",
+                               ice_vsi_type_str(type), vsi->idx, err);
+                       goto cleanup;
+               }
+
+               /* Re-map HW VSI number from the VSI handle (vsi->idx) now that
+                * the VSI has been rebuilt; filters are replayed just below
+                */
+               vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx);
+
+               /* replay filters for the VSI */
+               err = ice_replay_vsi(&pf->hw, vsi->idx);
+               if (err) {
+                       dev_err(dev, "VSI (type:%s) replay failed, err %d, VSI index %d\n",
+                               ice_vsi_type_str(type), err, vsi->idx);
+                       rem_adv_fltr = false;
+                       goto cleanup;
+               }
+               dev_info(dev, "VSI (type:%s) at index %d rebuilt successfully\n",
+                        ice_vsi_type_str(type), vsi->idx);
+
+               /* store ADQ VSI at correct TC index in main VSI's
+                * map of TC to VSI
+                */
+               main_vsi->tc_map_vsi[tc_idx++] = vsi;
+       }
+
+       /* ADQ VSI(s) has been rebuilt successfully, so setup
+        * channel for main VSI's Tx and Rx rings
+        */
+       list_for_each_entry(ch, &main_vsi->ch_list, list) {
+               struct ice_vsi *ch_vsi;
+
+               ch_vsi = ch->ch_vsi;
+               if (!ch_vsi)
+                       continue;
+
+               /* reconfig channel resources */
+               ice_cfg_chnl_all_res(main_vsi, ch);
+
+               /* replay BW rate limit if it is non-zero */
+               if (!ch->max_tx_rate && !ch->min_tx_rate)
+                       continue;
+
+               err = ice_set_bw_limit(ch_vsi, ch->max_tx_rate,
+                                      ch->min_tx_rate);
+               if (err)
+                       dev_err(dev, "failed (err:%d) to rebuild BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n",
+                               err, ch->max_tx_rate, ch->min_tx_rate,
+                               ch_vsi->vsi_num);
+               else
+                       dev_dbg(dev, "successfully rebuild BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n",
+                               ch->max_tx_rate, ch->min_tx_rate,
+                               ch_vsi->vsi_num);
+       }
+
+       /* reconfig RSS for main VSI */
+       if (main_vsi->ch_rss_size)
+               ice_vsi_cfg_rss_lut_key(main_vsi);
+
+       return 0;
+
+cleanup:
+       ice_remove_q_channels(main_vsi, rem_adv_fltr);
+       return err;
+}
+
+/**
+ * ice_create_q_channels - create a queue channel for each enabled channel TC
+ * @vsi: VSI whose all_enatc bitmap and cached MQPRIO options drive the setup
+ *
+ * For every TC index visited by ice_for_each_chnl_tc() whose bit is set in
+ * @vsi->all_enatc, allocate a channel, fill in its queue count, base queue
+ * and Tx rates from the cached MQPRIO options, create it and link it into
+ * @vsi->ch_list.
+ *
+ * Return: 0 on success, -ENOMEM when a channel cannot be allocated, or the
+ * error reported by ice_create_q_channel(). On failure
+ * ice_remove_q_channels() is called before returning.
+ */
+static int ice_create_q_channels(struct ice_vsi *vsi)
+{
+       struct device *dev = ice_pf_to_dev(vsi->back);
+       int err, tc;
+
+       ice_for_each_chnl_tc(tc) {
+               struct ice_channel *chnl;
+
+               /* a TC that is not enabled gets no channel */
+               if (!(vsi->all_enatc & BIT(tc)))
+                       continue;
+
+               chnl = kzalloc(sizeof(*chnl), GFP_KERNEL);
+               if (!chnl) {
+                       err = -ENOMEM;
+                       goto teardown;
+               }
+               INIT_LIST_HEAD(&chnl->list);
+
+               /* queue window of this TC, as cached in the MQPRIO options */
+               chnl->base_q = vsi->mqprio_qopt.qopt.offset[tc];
+               chnl->num_txq = vsi->mqprio_qopt.qopt.count[tc];
+               chnl->num_rxq = vsi->mqprio_qopt.qopt.count[tc];
+
+               /* convert non-zero rates to Kbits/s */
+               chnl->min_tx_rate = vsi->mqprio_qopt.min_rate[tc];
+               if (chnl->min_tx_rate)
+                       chnl->min_tx_rate = div_u64(chnl->min_tx_rate,
+                                                   ICE_BW_KBPS_DIVISOR);
+               chnl->max_tx_rate = vsi->mqprio_qopt.max_rate[tc];
+               if (chnl->max_tx_rate)
+                       chnl->max_tx_rate = div_u64(chnl->max_tx_rate,
+                                                   ICE_BW_KBPS_DIVISOR);
+
+               err = ice_create_q_channel(vsi, chnl);
+               if (err) {
+                       dev_err(dev, "failed creating channel TC:%d\n", tc);
+                       /* never linked into ch_list, so free it directly */
+                       kfree(chnl);
+                       goto teardown;
+               }
+
+               vsi->tc_map_vsi[tc] = chnl->ch_vsi;
+               list_add_tail(&chnl->list, &vsi->ch_list);
+               dev_dbg(dev, "successfully created channel: VSI %pK\n",
+                       chnl->ch_vsi);
+       }
+
+       return 0;
+
+teardown:
+       ice_remove_q_channels(vsi, false);
+       return err;
+}
+
+/**
+ * ice_setup_tc_mqprio_qdisc - configure multiple traffic classes
+ * @netdev: net device to configure
+ * @type_data: TC offload data, read as a struct tc_mqprio_qopt_offload
+ *
+ * With qopt.hw == 0 the ICE_FLAG_TC_MQPRIO flag is cleared, ch_rss_size is
+ * zeroed and the options are cached in the VSI without validation. With
+ * qopt.hw != 0 only TC_MQPRIO_MODE_CHANNEL is accepted: the options are
+ * validated, cached and ICE_FLAG_TC_MQPRIO is set.
+ *
+ * The VSI is then paused, rebuilt with the requested number of queues
+ * (retrying once with the previous queue counts if the rebuild fails) and
+ * its TCs are configured. When ICE_FLAG_TC_MQPRIO is set the TC0 rate limit
+ * is applied, if one was given, and the queue channels are created. The VSI
+ * is resumed on the exit path.
+ *
+ * Return: 0 on success, -EINVAL for an offload mode other than
+ * TC_MQPRIO_MODE_CHANNEL, otherwise the error returned by the helper that
+ * failed.
+ */
+static int ice_setup_tc_mqprio_qdisc(struct net_device *netdev, void *type_data)
+{
+       struct tc_mqprio_qopt_offload *mqprio_qopt = type_data;
+       struct ice_netdev_priv *np = netdev_priv(netdev);
+       struct ice_vsi *vsi = np->vsi;
+       struct ice_pf *pf = vsi->back;
+       u16 mode, ena_tc_qdisc = 0;
+       int cur_txq, cur_rxq;
+       u8 hw = 0, num_tcf;
+       struct device *dev;
+       int ret, i;
+
+       dev = ice_pf_to_dev(pf);
+       num_tcf = mqprio_qopt->qopt.num_tc;
+       hw = mqprio_qopt->qopt.hw;
+       mode = mqprio_qopt->mode;
+       if (!hw) {
+               clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags);
+               vsi->ch_rss_size = 0;
+               memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt));
+               goto config_tcf;
+       }
+
+       /* Generate queue region map for number of TCF requested: one bit
+        * is set for every TC index below num_tcf
+        */
+       for (i = 0; i < num_tcf; i++)
+               ena_tc_qdisc |= BIT(i);
+
+       switch (mode) {
+       case TC_MQPRIO_MODE_CHANNEL:
+
+               ret = ice_validate_mqprio_qopt(vsi, mqprio_qopt);
+               if (ret) {
+                       netdev_err(netdev, "failed to validate_mqprio_qopt(), ret %d\n",
+                                  ret);
+                       return ret;
+               }
+               memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt));
+               set_bit(ICE_FLAG_TC_MQPRIO, pf->flags);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+config_tcf:
+
+       /* Requesting same TCF configuration as already enabled; a channel
+        * mode request is never short-circuited here. When this label is
+        * reached through the !hw path, ena_tc_qdisc is still 0 because the
+        * bitmap loop above was skipped.
+        */
+       if (ena_tc_qdisc == vsi->tc_cfg.ena_tc &&
+           mode != TC_MQPRIO_MODE_CHANNEL)
+               return 0;
+
+       /* Pause VSI queues; they are resumed at the exit label */
+       ice_dis_vsi(vsi, true);
+
+       if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
+               ice_remove_q_channels(vsi, true);
+
+       if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) {
+               vsi->req_txq = min_t(int, ice_get_avail_txq_count(pf),
+                                    num_online_cpus());
+               vsi->req_rxq = min_t(int, ice_get_avail_rxq_count(pf),
+                                    num_online_cpus());
+       } else {
+               /* logic to rebuild VSI, same like ethtool -L: the loop keeps
+                * only the offset and count of the last enabled TC, and their
+                * sum becomes the requested number of queues
+                */
+               u16 offset = 0, qcount_tx = 0, qcount_rx = 0;
+
+               for (i = 0; i < num_tcf; i++) {
+                       if (!(ena_tc_qdisc & BIT(i)))
+                               continue;
+
+                       offset = vsi->mqprio_qopt.qopt.offset[i];
+                       qcount_rx = vsi->mqprio_qopt.qopt.count[i];
+                       qcount_tx = vsi->mqprio_qopt.qopt.count[i];
+               }
+               vsi->req_txq = offset + qcount_tx;
+               vsi->req_rxq = offset + qcount_rx;
+
+               /* store away original rss_size info, so that it gets reused
+                * from ice_vsi_rebuild during tc-qdisc delete stage - to
+                * determine what the rss_size for the main VSI should be
+                */
+               vsi->orig_rss_size = vsi->rss_size;
+       }
+
+       /* save current values of Tx and Rx queues before calling VSI rebuild
+        * for fallback option
+        */
+       cur_txq = vsi->num_txq;
+       cur_rxq = vsi->num_rxq;
+
+       /* proceed with rebuild main VSI using correct number of queues */
+       ret = ice_vsi_rebuild(vsi, false);
+       if (ret) {
+               /* fallback to current number of queues. If the second
+                * rebuild also fails, the error of the first attempt is
+                * returned. If it succeeds, execution continues and ret is
+                * overwritten by ice_vsi_cfg_tc() below.
+                * NOTE(review): the failure return skips the exit label, so
+                * ice_ena_vsi() is not called on that path - confirm this is
+                * intended.
+                */
+               dev_info(dev, "Rebuild failed with new queues, try with current number of queues\n");
+               vsi->req_txq = cur_txq;
+               vsi->req_rxq = cur_rxq;
+               clear_bit(ICE_RESET_FAILED, pf->state);
+               if (ice_vsi_rebuild(vsi, false)) {
+                       dev_err(dev, "Rebuild of main VSI failed again\n");
+                       return ret;
+               }
+       }
+
+       vsi->all_numtc = num_tcf;
+       vsi->all_enatc = ena_tc_qdisc;
+       ret = ice_vsi_cfg_tc(vsi, ena_tc_qdisc);
+       if (ret) {
+               netdev_err(netdev, "failed configuring TC for VSI id=%d\n",
+                          vsi->vsi_num);
+               goto exit;
+       }
+
+       if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) {
+               u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0];
+               u64 min_tx_rate = vsi->mqprio_qopt.min_rate[0];
+
+               /* set TC0 rate limit if specified; it is applied to this
+                * VSI itself, using index 0 of the cached rate arrays
+                */
+               if (max_tx_rate || min_tx_rate) {
+                       /* convert to Kbits/s */
+                       if (max_tx_rate)
+                               max_tx_rate = div_u64(max_tx_rate, ICE_BW_KBPS_DIVISOR);
+                       if (min_tx_rate)
+                               min_tx_rate = div_u64(min_tx_rate, ICE_BW_KBPS_DIVISOR);
+
+                       ret = ice_set_bw_limit(vsi, max_tx_rate, min_tx_rate);
+                       if (!ret) {
+                               dev_dbg(dev, "set Tx rate max %llu min %llu for VSI(%u)\n",
+                                       max_tx_rate, min_tx_rate, vsi->vsi_num);
+                       } else {
+                               dev_err(dev, "failed to set Tx rate max %llu min %llu for VSI(%u)\n",
+                                       max_tx_rate, min_tx_rate, vsi->vsi_num);
+                               goto exit;
+                       }
+               }
+               ret = ice_create_q_channels(vsi);
+               if (ret) {
+                       netdev_err(netdev, "failed configuring queue channels\n");
+                       goto exit;
+               } else {
+                       netdev_dbg(netdev, "successfully configured channels\n");
+               }
+       }
+
+       if (vsi->ch_rss_size)
+               ice_vsi_cfg_rss_lut_key(vsi);
+
+exit:
+       /* if error, reset the all_numtc and all_enatc that were set before
+        * ice_vsi_cfg_tc() was called
+        */
+       if (ret) {
+               vsi->all_numtc = 0;
+               vsi->all_enatc = 0;
+       }
+       /* resume VSI, on success and on error alike */
+       ice_ena_vsi(vsi, true);
+
+       return ret;
+}
+
 static LIST_HEAD(ice_block_cb_list);
 
 static int
@@ -7190,6 +8053,8 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type,
             void *type_data)
 {
        struct ice_netdev_priv *np = netdev_priv(netdev);
+       struct ice_pf *pf = np->vsi->back;
+       int err;
 
        switch (type) {
        case TC_SETUP_BLOCK:
@@ -7197,6 +8062,12 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type,
                                                  &ice_block_cb_list,
                                                  ice_setup_tc_block_cb,
                                                  np, np, true);
+       case TC_SETUP_QDISC_MQPRIO:
+               /* setup traffic classifier for receive side */
+               mutex_lock(&pf->tc_mutex);
+               err = ice_setup_tc_mqprio_qdisc(netdev, type_data);
+               mutex_unlock(&pf->tc_mutex);
+               return err;
        default:
                return -EOPNOTSUPP;
        }