From 0153f36041b8e52019ebfa8629c13bf8f9b0a951 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 13 May 2025 12:55:27 +0200 Subject: [PATCH 1/5] ice: fix Tx scheduler error handling in XDP callback When the XDP program is loaded, the XDP callback adds new Tx queues. This means that the callback must update the Tx scheduler with the new queue number. In the event of a Tx scheduler failure, the XDP callback should also fail and roll back any changes previously made for XDP preparation. The previous implementation had a bug that not all changes made by the XDP callback were rolled back. This caused the crash with the following call trace: [ +9.549584] ice 0000:ca:00.0: Failed VSI LAN queue config for XDP, error: -5 [ +0.382335] Oops: general protection fault, probably for non-canonical address 0x50a2250a90495525: 0000 [#1] SMP NOPTI [ +0.010710] CPU: 103 UID: 0 PID: 0 Comm: swapper/103 Not tainted 6.14.0-net-next-mar-31+ #14 PREEMPT(voluntary) [ +0.010175] Hardware name: Intel Corporation M50CYP2SBSTD/M50CYP2SBSTD, BIOS SE5C620.86B.01.01.0005.2202160810 02/16/2022 [ +0.010946] RIP: 0010:__ice_update_sample+0x39/0xe0 [ice] [...] [ +0.002715] Call Trace: [ +0.002452] [ +0.002021] ? __die_body.cold+0x19/0x29 [ +0.003922] ? die_addr+0x3c/0x60 [ +0.003319] ? exc_general_protection+0x17c/0x400 [ +0.004707] ? asm_exc_general_protection+0x26/0x30 [ +0.004879] ? __ice_update_sample+0x39/0xe0 [ice] [ +0.004835] ice_napi_poll+0x665/0x680 [ice] [ +0.004320] __napi_poll+0x28/0x190 [ +0.003500] net_rx_action+0x198/0x360 [ +0.003752] ? update_rq_clock+0x39/0x220 [ +0.004013] handle_softirqs+0xf1/0x340 [ +0.003840] ? sched_clock_cpu+0xf/0x1f0 [ +0.003925] __irq_exit_rcu+0xc2/0xe0 [ +0.003665] common_interrupt+0x85/0xa0 [ +0.003839] [ +0.002098] [ +0.002106] asm_common_interrupt+0x26/0x40 [ +0.004184] RIP: 0010:cpuidle_enter_state+0xd3/0x690 Fix this by performing the missing unmapping of XDP queues from q_vectors and setting the XDP rings pointer back to NULL after all those queues are released. Also, add an immediate exit from the XDP callback in case of ring preparation failure. Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Dawid Osuchowski Reviewed-by: Przemek Kitszel Reviewed-by: Jacob Keller Signed-off-by: Michal Kubiak Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Jesse Brandeburg Tested-by: Saritha Sanigani (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 47 ++++++++++++++++------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 20d3baf955e3..d97d4b25b30d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2740,6 +2740,27 @@ void ice_map_xdp_rings(struct ice_vsi *vsi) } } +/** + * ice_unmap_xdp_rings - Unmap XDP rings from interrupt vectors + * @vsi: the VSI with XDP rings being unmapped + */ +static void ice_unmap_xdp_rings(struct ice_vsi *vsi) +{ + int v_idx; + + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct ice_tx_ring *ring; + + ice_for_each_tx_ring(ring, q_vector->tx) + if (!ring->tx_buf || !ice_ring_is_xdp(ring)) + break; + + /* restore the value of last node prior to XDP setup */ + q_vector->tx.tx_ring = ring; + } +} + /** * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP * @vsi: VSI to bring up Tx rings used by XDP @@ -2803,7 +2824,7 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, if (status) { dev_err(dev, "Failed VSI LAN queue config for XDP, error: %d\n", status); - goto clear_xdp_rings; + goto unmap_xdp_rings; } /* assign the prog only when it's not already present on VSI; @@ -2819,6 +2840,8 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, ice_vsi_assign_bpf_prog(vsi, prog); return 0; +unmap_xdp_rings: + ice_unmap_xdp_rings(vsi); clear_xdp_rings: ice_for_each_xdp_txq(vsi, i) if (vsi->xdp_rings[i]) { @@ -2835,6 +2858,8 @@ err_map_xdp: mutex_unlock(&pf->avail_q_mutex); devm_kfree(dev, vsi->xdp_rings); + vsi->xdp_rings = NULL; + return -ENOMEM; } @@ -2850,7 +2875,7 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; struct ice_pf *pf = vsi->back; - int i, v_idx; + int i; /* q_vectors are freed in reset path so there's no point in detaching * rings @@ -2858,17 +2883,7 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type) if (cfg_type == ICE_XDP_CFG_PART) goto free_qmap; - ice_for_each_q_vector(vsi, v_idx) { - struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; - struct ice_tx_ring *ring; - - ice_for_each_tx_ring(ring, q_vector->tx) - if (!ring->tx_buf || !ice_ring_is_xdp(ring)) - break; - - /* restore the value of last node prior to XDP setup */ - q_vector->tx.tx_ring = ring; - } + ice_unmap_xdp_rings(vsi); free_qmap: mutex_lock(&pf->avail_q_mutex); @@ -3013,11 +3028,14 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, xdp_ring_err = ice_vsi_determine_xdp_res(vsi); if (xdp_ring_err) { NL_SET_ERR_MSG_MOD(extack, "Not enough Tx resources for XDP"); + goto resume_if; } else { xdp_ring_err = ice_prepare_xdp_rings(vsi, prog, ICE_XDP_CFG_FULL); - if (xdp_ring_err) + if (xdp_ring_err) { NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); + goto resume_if; + } } xdp_features_set_redirect_target(vsi->netdev, true); /* reallocate Rx queues that are used for zero-copy */ @@ -3035,6 +3053,7 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Rx resources failed"); } +resume_if: if (if_running) ret = ice_up(vsi); From 6fa2942578472c9cab13a8fc1dae0d830193e0a1 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 13 May 2025 12:55:28 +0200 Subject: [PATCH 2/5] ice: create new Tx scheduler nodes for new queues only The current implementation of the Tx scheduler tree attempts to create nodes for all Tx queues, ignoring the fact that some queues may already exist in the tree. For example, if the VSI already has 128 Tx queues and the user requests for 16 new queues, the Tx scheduler will compute the tree for 272 queues (128 existing queues + 144 new queues), instead of 144 queues (128 existing queues and 16 new queues). Fix that by modifying the node count calculation algorithm to skip the queues that already exist in the tree. Fixes: 5513b920a4f7 ("ice: Update Tx scheduler tree for VSI multi-Tx queue support") Reviewed-by: Dawid Osuchowski Reviewed-by: Przemek Kitszel Reviewed-by: Jacob Keller Signed-off-by: Michal Kubiak Reviewed-by: Simon Horman Tested-by: Jesse Brandeburg Tested-by: Saritha Sanigani (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 6ca13c5dcb14..6524875b34d3 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -1604,16 +1604,16 @@ ice_sched_get_agg_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, /** * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes * @hw: pointer to the HW struct - * @num_qs: number of queues + * @num_new_qs: number of new queues that will be added to the tree * @num_nodes: num nodes array * * This function calculates the number of VSI child nodes based on the * number of queues. */ static void -ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_qs, u16 *num_nodes) +ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_new_qs, u16 *num_nodes) { - u16 num = num_qs; + u16 num = num_new_qs; u8 i, qgl, vsil; qgl = ice_sched_get_qgrp_layer(hw); @@ -1863,8 +1863,9 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, return status; } - if (new_numqs) - ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes); + ice_sched_calc_vsi_child_nodes(hw, new_numqs - prev_numqs, + new_num_nodes); + /* Keep the max number of queue configuration all the time. Update the * tree only if number of queues > previous number of queues. This may * leave some extra nodes in the tree if number of queues < previous From 73145e6d81070d34a21431c9e0d7aaf2f29ca048 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 13 May 2025 12:55:29 +0200 Subject: [PATCH 3/5] ice: fix rebuilding the Tx scheduler tree for large queue counts The current implementation of the Tx scheduler allows the tree to be rebuilt as the user adds more Tx queues to the VSI. In such a case, additional child nodes are added to the tree to support the new number of queues. Unfortunately, this algorithm does not take into account that the limit of the VSI support node may be exceeded, so an additional node in the VSI layer may be required to handle all the requested queues. Such a scenario occurs when adding XDP Tx queues on machines with many CPUs. Although the driver still respects the queue limit returned by the FW, the Tx scheduler was unable to add those queues to its tree and returned one of the errors below. Such a scenario occurs when adding XDP Tx queues on machines with many CPUs (e.g. at least 321 CPUs, if there is already 128 Tx/Rx queue pairs). Although the driver still respects the queue limit returned by the FW, the Tx scheduler was unable to add those queues to its tree and returned the following errors: Failed VSI LAN queue config for XDP, error: -5 or: Failed to set LAN Tx queue context, error: -22 Fix this problem by extending the tree rebuild algorithm to check if the current VSI node can support the requested number of queues. If it cannot, create as many additional VSI support nodes as necessary to handle all the required Tx queues. Symmetrically, adjust the VSI node removal algorithm to remove all nodes associated with the given VSI. Also, make the search for the next free VSI node more restrictive. That is, add queue group nodes only to the VSI support nodes that have a matching VSI handle. Finally, fix the comment describing the tree update algorithm to better reflect the current scenario. Fixes: b0153fdd7e8a ("ice: update VSI config dynamically") Reviewed-by: Dawid Osuchowski Reviewed-by: Przemek Kitszel Signed-off-by: Michal Kubiak Reviewed-by: Simon Horman Tested-by: Jesse Brandeburg Tested-by: Saritha Sanigani (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_sched.c | 170 +++++++++++++++++---- 1 file changed, 142 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 6524875b34d3..d9d09296d1d4 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -84,6 +84,27 @@ ice_sched_find_node_by_teid(struct ice_sched_node *start_node, u32 teid) return NULL; } +/** + * ice_sched_find_next_vsi_node - find the next node for a given VSI + * @vsi_node: VSI support node to start search with + * + * Return: Next VSI support node, or NULL. + * + * The function returns a pointer to the next node from the VSI layer + * assigned to the given VSI, or NULL if there is no such a node. + */ +static struct ice_sched_node * +ice_sched_find_next_vsi_node(struct ice_sched_node *vsi_node) +{ + unsigned int vsi_handle = vsi_node->vsi_handle; + + while ((vsi_node = vsi_node->sibling) != NULL) + if (vsi_node->vsi_handle == vsi_handle) + break; + + return vsi_node; +} + /** * ice_aqc_send_sched_elem_cmd - send scheduling elements cmd * @hw: pointer to the HW struct @@ -1084,8 +1105,10 @@ ice_sched_add_nodes_to_layer(struct ice_port_info *pi, if (parent->num_children < max_child_nodes) { new_num_nodes = max_child_nodes - parent->num_children; } else { - /* This parent is full, try the next sibling */ - parent = parent->sibling; + /* This parent is full, + * try the next available sibling. + */ + parent = ice_sched_find_next_vsi_node(parent); /* Don't modify the first node TEID memory if the * first node was added already in the above call. * Instead send some temp memory for all other @@ -1528,12 +1551,23 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, /* get the first queue group node from VSI sub-tree */ qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer); while (qgrp_node) { + struct ice_sched_node *next_vsi_node; + /* make sure the qgroup node is part of the VSI subtree */ if (ice_sched_find_node_in_subtree(pi->hw, vsi_node, qgrp_node)) if (qgrp_node->num_children < max_children && qgrp_node->owner == owner) break; qgrp_node = qgrp_node->sibling; + if (qgrp_node) + continue; + + next_vsi_node = ice_sched_find_next_vsi_node(vsi_node); + if (!next_vsi_node) + break; + + vsi_node = next_vsi_node; + qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer); } /* Select the best queue group */ @@ -1779,7 +1813,11 @@ ice_sched_add_vsi_support_nodes(struct ice_port_info *pi, u16 vsi_handle, if (!parent) return -EIO; - if (i == vsil) + /* Do not modify the VSI handle for already existing VSI nodes, + * (if no new VSI node was added to the tree). + * Assign the VSI handle only to newly added VSI nodes. + */ + if (i == vsil && num_added) parent->vsi_handle = vsi_handle; } @@ -1812,6 +1850,41 @@ ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_handle, u8 tc) num_nodes); } +/** + * ice_sched_recalc_vsi_support_nodes - recalculate VSI support nodes count + * @hw: pointer to the HW struct + * @vsi_node: pointer to the leftmost VSI node that needs to be extended + * @new_numqs: new number of queues that has to be handled by the VSI + * @new_num_nodes: pointer to nodes count table to modify the VSI layer entry + * + * This function recalculates the number of supported nodes that need to + * be added after adding more Tx queues for a given VSI. + * The number of new VSI support nodes that shall be added will be saved + * to the @new_num_nodes table for the VSI layer. + */ +static void +ice_sched_recalc_vsi_support_nodes(struct ice_hw *hw, + struct ice_sched_node *vsi_node, + unsigned int new_numqs, u16 *new_num_nodes) +{ + u32 vsi_nodes_cnt = 1; + u32 max_queue_cnt = 1; + u32 qgl, vsil; + + qgl = ice_sched_get_qgrp_layer(hw); + vsil = ice_sched_get_vsi_layer(hw); + + for (u32 i = vsil; i <= qgl; i++) + max_queue_cnt *= hw->max_children[i]; + + while ((vsi_node = ice_sched_find_next_vsi_node(vsi_node)) != NULL) + vsi_nodes_cnt++; + + if (new_numqs > (max_queue_cnt * vsi_nodes_cnt)) + new_num_nodes[vsil] = DIV_ROUND_UP(new_numqs, max_queue_cnt) - + vsi_nodes_cnt; +} + /** * ice_sched_update_vsi_child_nodes - update VSI child nodes * @pi: port information structure @@ -1863,16 +1936,25 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, return status; } + ice_sched_recalc_vsi_support_nodes(hw, vsi_node, + new_numqs, new_num_nodes); ice_sched_calc_vsi_child_nodes(hw, new_numqs - prev_numqs, new_num_nodes); - /* Keep the max number of queue configuration all the time. Update the - * tree only if number of queues > previous number of queues. This may + /* Never decrease the number of queues in the tree. Update the tree + * only if number of queues > previous number of queues. This may * leave some extra nodes in the tree if number of queues < previous * number but that wouldn't harm anything. Removing those extra nodes * may complicate the code if those nodes are part of SRL or * individually rate limited. + * Also, add the required VSI support nodes if the existing ones cannot + * handle the requested new number of queues. */ + status = ice_sched_add_vsi_support_nodes(pi, vsi_handle, tc_node, + new_num_nodes); + if (status) + return status; + status = ice_sched_add_vsi_child_nodes(pi, vsi_handle, tc_node, new_num_nodes, owner); if (status) @@ -2013,6 +2095,58 @@ static bool ice_sched_is_leaf_node_present(struct ice_sched_node *node) return (node->info.data.elem_type == ICE_AQC_ELEM_TYPE_LEAF); } +/** + * ice_sched_rm_vsi_subtree - remove all nodes assigned to a given VSI + * @pi: port information structure + * @vsi_node: pointer to the leftmost node of the VSI to be removed + * @owner: LAN or RDMA + * @tc: TC number + * + * Return: Zero in case of success, or -EBUSY if the VSI has leaf nodes in TC. + * + * This function removes all the VSI support nodes associated with a given VSI + * and its LAN or RDMA children nodes from the scheduler tree. + */ +static int +ice_sched_rm_vsi_subtree(struct ice_port_info *pi, + struct ice_sched_node *vsi_node, u8 owner, u8 tc) +{ + u16 vsi_handle = vsi_node->vsi_handle; + bool all_vsi_nodes_removed = true; + int j = 0; + + while (vsi_node) { + struct ice_sched_node *next_vsi_node; + + if (ice_sched_is_leaf_node_present(vsi_node)) { + ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", tc); + return -EBUSY; + } + while (j < vsi_node->num_children) { + if (vsi_node->children[j]->owner == owner) + ice_free_sched_node(pi, vsi_node->children[j]); + else + j++; + } + + next_vsi_node = ice_sched_find_next_vsi_node(vsi_node); + + /* remove the VSI if it has no children */ + if (!vsi_node->num_children) + ice_free_sched_node(pi, vsi_node); + else + all_vsi_nodes_removed = false; + + vsi_node = next_vsi_node; + } + + /* clean up aggregator related VSI info if any */ + if (all_vsi_nodes_removed) + ice_sched_rm_agg_vsi_info(pi, vsi_handle); + + return 0; +} + /** * ice_sched_rm_vsi_cfg - remove the VSI and its children nodes * @pi: port information structure @@ -2039,7 +2173,6 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) ice_for_each_traffic_class(i) { struct ice_sched_node *vsi_node, *tc_node; - u8 j = 0; tc_node = ice_sched_get_tc_node(pi, i); if (!tc_node) @@ -2049,31 +2182,12 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) if (!vsi_node) continue; - if (ice_sched_is_leaf_node_present(vsi_node)) { - ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", i); - status = -EBUSY; + status = ice_sched_rm_vsi_subtree(pi, vsi_node, owner, i); + if (status) goto exit_sched_rm_vsi_cfg; - } - while (j < vsi_node->num_children) { - if (vsi_node->children[j]->owner == owner) { - ice_free_sched_node(pi, vsi_node->children[j]); - /* reset the counter again since the num - * children will be updated after node removal - */ - j = 0; - } else { - j++; - } - } - /* remove the VSI if it has no children */ - if (!vsi_node->num_children) { - ice_free_sched_node(pi, vsi_node); - vsi_ctx->sched.vsi_node[i] = NULL; + vsi_ctx->sched.vsi_node[i] = NULL; - /* clean up aggregator related VSI info if any */ - ice_sched_rm_agg_vsi_info(pi, vsi_handle); - } if (owner == ICE_SCHED_NODE_OWNER_LAN) vsi_ctx->sched.max_lanq[i] = 0; else From 7292af042bcf22e2c18b96ed250f78498a5b28ab Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Thu, 1 May 2025 17:06:17 +0000 Subject: [PATCH 4/5] idpf: fix a race in txq wakeup Add a helper function to correctly handle the lockless synchronization when the sender needs to block. The paradigm is if (no_resources()) { stop_queue(); barrier(); if (!no_resources()) restart_queue(); } netif_subqueue_maybe_stop already handles the paradigm correctly, but the code split the check for resources in three parts, the first one (descriptors) followed the protocol, but the other two (completions and tx_buf) were only doing the first part and so race prone. Luckily netif_subqueue_maybe_stop macro already allows you to use a function to evaluate the start/stop conditions so the fix only requires the right helper function to evaluate all the conditions at once. The patch removes idpf_tx_maybe_stop_common since it's no longer needed and instead adjusts separately the conditions for singleq and splitq. Note that idpf_tx_buf_hw_update doesn't need to check for resources since that will be covered in idpf_tx_splitq_frame. To reproduce: Reduce the threshold for pending completions to increase the chances of hitting this pause by changing your kernel: drivers/net/ethernet/intel/idpf/idpf_txrx.h -#define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) +#define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 4) Use pktgen to force the host to push small pkts very aggressively: ./pktgen_sample02_multiqueue.sh -i eth1 -s 100 -6 -d $IP -m $MAC \ -p 10000-10000 -t 16 -n 0 -v -x -c 64 Fixes: 6818c4d5b3c2 ("idpf: add splitq start_xmit") Reviewed-by: Jacob Keller Reviewed-by: Madhu Chittim Signed-off-by: Josh Hay Signed-off-by: Brian Vazquez Signed-off-by: Luigi Rizzo Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 9 ++-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 45 +++++++------------ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 ---- 3 files changed, 22 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 2e356dd10812..993c354aa27a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -362,17 +362,18 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; + int csum, tso, needed; unsigned int count; __be16 protocol; - int csum, tso; count = idpf_tx_desc_count_required(tx_q, skb); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); - if (idpf_tx_maybe_stop_common(tx_q, - count + IDPF_TX_DESCS_PER_CACHE_LINE + - IDPF_TX_DESCS_FOR_CTX)) { + needed = count + IDPF_TX_DESCS_PER_CACHE_LINE + IDPF_TX_DESCS_FOR_CTX; + if (!netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, + IDPF_DESC_UNUSED(tx_q), + needed, needed)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); u64_stats_update_begin(&tx_q->stats_sync); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 631679cdaa6f..5cf440e09d0a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2184,6 +2184,19 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } +/* Global conditions to tell whether the txq (and related resources) + * has room to allow the use of "size" descriptors. + */ +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) +{ + if (IDPF_DESC_UNUSED(tx_q) < size || + IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > + IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || + IDPF_TX_BUF_RSV_LOW(tx_q)) + return 0; + return 1; +} + /** * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked @@ -2194,29 +2207,11 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, unsigned int descs_needed) { - if (idpf_tx_maybe_stop_common(tx_q, descs_needed)) - goto out; + if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, + idpf_txq_has_room(tx_q, descs_needed), + 1, 1)) + return 0; - /* If there are too many outstanding completions expected on the - * completion queue, stop the TX queue to give the device some time to - * catch up - */ - if (unlikely(IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > - IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq))) - goto splitq_stop; - - /* Also check for available book keeping buffers; if we are low, stop - * the queue to wait for more completions - */ - if (unlikely(IDPF_TX_BUF_RSV_LOW(tx_q))) - goto splitq_stop; - - return 0; - -splitq_stop: - netif_stop_subqueue(tx_q->netdev, tx_q->idx); - -out: u64_stats_update_begin(&tx_q->stats_sync); u64_stats_inc(&tx_q->q_stats.q_busy); u64_stats_update_end(&tx_q->stats_sync); @@ -2242,12 +2237,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); tx_q->next_to_use = val; - if (idpf_tx_maybe_stop_common(tx_q, IDPF_TX_DESC_NEEDED)) { - u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_inc(&tx_q->q_stats.q_busy); - u64_stats_update_end(&tx_q->stats_sync); - } - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index c779fe71df99..36a0f828a6f8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -1049,12 +1049,4 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rxq, u16 cleaned_count); int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off); -static inline bool idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, - u32 needed) -{ - return !netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, - IDPF_DESC_UNUSED(tx_q), - needed, needed); -} - #endif /* !_IDPF_TXRX_H_ */ From 9dc63d8ff182150d7d7b318ab9389702a2c0a292 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Thu, 8 May 2025 11:47:15 -0700 Subject: [PATCH 5/5] idpf: avoid mailbox timeout delays during reset Mailbox operations are not possible while the driver is in reset. Operations that require MBX exchange with the control plane will result in long delays if executed while a reset is in progress: ethtool -L combined 8& echo 1 > /sys/class/net//device/reset idpf 0000:83:00.0: HW reset detected idpf 0000:83:00.0: Device HW Reset initiated idpf 0000:83:00.0: Transaction timed-out (op:504 cookie:be00 vc_op:504 salt:be timeout:2000ms) idpf 0000:83:00.0: Transaction timed-out (op:508 cookie:bf00 vc_op:508 salt:bf timeout:2000ms) idpf 0000:83:00.0: Transaction timed-out (op:512 cookie:c000 vc_op:512 salt:c0 timeout:2000ms) idpf 0000:83:00.0: Transaction timed-out (op:510 cookie:c100 vc_op:510 salt:c1 timeout:2000ms) idpf 0000:83:00.0: Transaction timed-out (op:509 cookie:c200 vc_op:509 salt:c2 timeout:60000ms) idpf 0000:83:00.0: Transaction timed-out (op:509 cookie:c300 vc_op:509 salt:c3 timeout:60000ms) idpf 0000:83:00.0: Transaction timed-out (op:505 cookie:c400 vc_op:505 salt:c4 timeout:60000ms) idpf 0000:83:00.0: Failed to configure queues for vport 0, -62 Disable mailbox communication in case of a reset, unless it's done during a driver load, where the virtchnl operations are needed to configure the device. Fixes: 8077c727561aa ("idpf: add controlq init and reset checks") Co-developed-by: Joshua Hay Signed-off-by: Joshua Hay Signed-off-by: Emil Tantilov Reviewed-by: Ahmed Zaki Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 18 +++++++++++++----- .../net/ethernet/intel/idpf/idpf_virtchnl.c | 2 +- .../net/ethernet/intel/idpf/idpf_virtchnl.h | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index bab12ecb2df5..4eb20ec2accb 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1801,11 +1801,19 @@ void idpf_vc_event_task(struct work_struct *work) if (test_bit(IDPF_REMOVE_IN_PROG, adapter->flags)) return; - if (test_bit(IDPF_HR_FUNC_RESET, adapter->flags) || - test_bit(IDPF_HR_DRV_LOAD, adapter->flags)) { - set_bit(IDPF_HR_RESET_IN_PROG, adapter->flags); - idpf_init_hard_reset(adapter); - } + if (test_bit(IDPF_HR_FUNC_RESET, adapter->flags)) + goto func_reset; + + if (test_bit(IDPF_HR_DRV_LOAD, adapter->flags)) + goto drv_load; + + return; + +func_reset: + idpf_vc_xn_shutdown(adapter->vcxn_mngr); +drv_load: + set_bit(IDPF_HR_RESET_IN_PROG, adapter->flags); + idpf_init_hard_reset(adapter); } /** diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c index 07a9f5ae34fd..24febaaa8fbb 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c @@ -347,7 +347,7 @@ static void idpf_vc_xn_init(struct idpf_vc_xn_manager *vcxn_mngr) * All waiting threads will be woken-up and their transaction aborted. Further * operations on that object will fail. */ -static void idpf_vc_xn_shutdown(struct idpf_vc_xn_manager *vcxn_mngr) +void idpf_vc_xn_shutdown(struct idpf_vc_xn_manager *vcxn_mngr) { int i; diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h index 3522c1238ea2..77578206bada 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h @@ -150,5 +150,6 @@ int idpf_send_get_stats_msg(struct idpf_vport *vport); int idpf_send_set_sriov_vfs_msg(struct idpf_adapter *adapter, u16 num_vfs); int idpf_send_get_set_rss_key_msg(struct idpf_vport *vport, bool get); int idpf_send_get_set_rss_lut_msg(struct idpf_vport *vport, bool get); +void idpf_vc_xn_shutdown(struct idpf_vc_xn_manager *vcxn_mngr); #endif /* _IDPF_VIRTCHNL_H_ */