Merge branch 'net-improve-core-queue-api-handling-while-device-is-down'

Jakub Kicinski says:

====================
net: improve core queue API handling while device is down

The core netdev_rx_queue_restart() doesn't currently take into account
that the device may be down. The current and proposed queue API
implementations deal with this by rejecting queue API calls while
the device is down. We can do better, in theory we can still allow
devmem binding when the device is down - we shouldn't stop and start
the queues just try to allocate the memory. The reason we allocate
the memory is that memory provider binding checks if any compatible
page pool has been created (page_pool_check_memory_provider()).

Alternatively we could reject installing MP while the device is down
but the MP assignment survives ifdown (so presumably MP doesn't cease
to exist while down), and in general we allow configuration while down.

Previously I thought we need this as a fix, but gve rejects page pool
calls while down, and so did Saeed in the patches he posted. So this
series just makes the core act more sensibly but practically should
be a noop for now.

v1: https://lore.kernel.org/20250205190131.564456-1-kuba@kernel.org
====================

Link: https://patch.msgid.link/20250206225638.1387810-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski
2025-02-07 17:21:05 -08:00
6 changed files with 59 additions and 29 deletions
+4 -6
View File
@@ -645,8 +645,11 @@ nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx)
if (ns->rq_reset_mode > 3)
return -EINVAL;
if (ns->rq_reset_mode == 1)
if (ns->rq_reset_mode == 1) {
if (!netif_running(ns->netdev))
return -ENETDOWN;
return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi);
}
qmem->rq = nsim_queue_alloc();
if (!qmem->rq)
@@ -754,11 +757,6 @@ nsim_qreset_write(struct file *file, const char __user *data,
return -EINVAL;
rtnl_lock();
if (!netif_running(ns->netdev)) {
ret = -ENETDOWN;
goto exit_unlock;
}
if (queue >= ns->netdev->real_num_rx_queues) {
ret = -EINVAL;
goto exit_unlock;
+4
View File
@@ -117,6 +117,10 @@ struct netdev_stat_ops {
*
* @ndo_queue_stop: Stop the RX queue at the specified index. The stopped
* queue's memory is written at the specified address.
*
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
* the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
* be called for an interface which is open.
*/
struct netdev_queue_mgmt_ops {
size_t ndo_queue_mem_size;
+12
View File
@@ -299,6 +299,18 @@ void xdp_do_check_flushed(struct napi_struct *napi);
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif
/* Best effort check that NAPI is not idle (can't be scheduled to run) */
static inline void napi_assert_will_not_race(const struct napi_struct *napi)
{
/* uninitialized instance, can't race */
if (!napi->poll_list.next)
return;
/* SCHED bit is set on disabled instances */
WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
WARN_ON(READ_ONCE(napi->list_owner) != -1);
}
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
+20 -17
View File
@@ -10,28 +10,27 @@
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
void *new_mem, *old_mem;
int err;
if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop ||
!dev->queue_mgmt_ops->ndo_queue_mem_free ||
!dev->queue_mgmt_ops->ndo_queue_mem_alloc ||
!dev->queue_mgmt_ops->ndo_queue_start)
if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free ||
!qops->ndo_queue_mem_alloc || !qops->ndo_queue_start)
return -EOPNOTSUPP;
ASSERT_RTNL();
new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
if (!new_mem)
return -ENOMEM;
old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
if (!old_mem) {
err = -ENOMEM;
goto err_free_new_mem;
}
err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
if (err)
goto err_free_old_mem;
@@ -39,15 +38,19 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
if (err)
goto err_free_new_queue_mem;
err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx);
if (err)
goto err_free_new_queue_mem;
if (netif_running(dev)) {
err = qops->ndo_queue_stop(dev, old_mem, rxq_idx);
if (err)
goto err_free_new_queue_mem;
err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx);
if (err)
goto err_start_queue;
err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
if (err)
goto err_start_queue;
} else {
swap(new_mem, old_mem);
}
dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
qops->ndo_queue_mem_free(dev, old_mem);
kvfree(old_mem);
kvfree(new_mem);
@@ -62,15 +65,15 @@ err_start_queue:
* WARN if we fail to recover the old rx queue, and at least free
* old_mem so we don't also leak that.
*/
if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) {
if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
WARN(1,
"Failed to restart old queue in error path. RX queue %d may be unhealthy.",
rxq_idx);
dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
qops->ndo_queue_mem_free(dev, old_mem);
}
err_free_new_queue_mem:
dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem);
qops->ndo_queue_mem_free(dev, new_mem);
err_free_old_mem:
kvfree(old_mem);
+2 -5
View File
@@ -26,6 +26,7 @@
#include <trace/events/page_pool.h>
#include "dev.h"
#include "mp_dmabuf_devmem.h"
#include "netmem_priv.h"
#include "page_pool_priv.h"
@@ -1147,11 +1148,7 @@ void page_pool_disable_direct_recycling(struct page_pool *pool)
if (!pool->p.napi)
return;
/* To avoid races with recycling and additional barriers make sure
* pool and NAPI are unlinked when NAPI is disabled.
*/
WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
napi_assert_will_not_race(pool->p.napi);
mutex_lock(&page_pools_lock);
WRITE_ONCE(pool->p.napi, NULL);
+17 -1
View File
@@ -35,6 +35,21 @@ def napi_list_check(nf) -> None:
comment=f"queue count after reset queue {q} mode {i}")
def nsim_rxq_reset_down(nf) -> None:
"""
Test that the queue API supports resetting a queue
while the interface is down. We should convert this
test to testing real HW once more devices support
queue API.
"""
with NetdevSimDev(queue_count=4) as nsimdev:
nsim = nsimdev.nsims[0]
ip(f"link set dev {nsim.ifname} down")
for i in [0, 2, 3]:
nsim.dfs_write("queue_reset", f"1 {i}")
def page_pool_check(nf) -> None:
with NetdevSimDev() as nsimdev:
nsim = nsimdev.nsims[0]
@@ -106,7 +121,8 @@ def page_pool_check(nf) -> None:
def main() -> None:
nf = NetdevFamily()
ksft_run([empty_check, lo_check, page_pool_check, napi_list_check],
ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
nsim_rxq_reset_down],
args=(nf, ))
ksft_exit()