From 07970d048cddeb56fa17925d7c37cb2400322aab Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Mon, 14 Apr 2025 10:46:39 -0300
Subject: [PATCH 01/11] vfio/type1: Remove Fine Grained Superpages detection

VFIO is looking to enable an optimization where it can rely on a fast
unmap operation that returns the size of a larger IOPTE. Due to how the
test was constructed this would only ever succeed on the AMDv1 page
table that supported an 8k contiguous size. Nothing else supports this.

Alex says the performance win was fairly minor, so let's remove this
code. Always use iommu_iova_to_phys() to extend contiguous pages.

Signed-off-by: Jason Gunthorpe
Reviewed-by: Alejandro Jimenez
Tested-by: Alejandro Jimenez
Reviewed-by: Kevin Tian
Link: https://lore.kernel.org/r/0-v2-97fa1da8d983+412-vfio_fgsp_jgg@nvidia.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/vfio_iommu_type1.c | 49 +--------------------------------
 1 file changed, 1 insertion(+), 48 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0ac56072af9f..afc1449335c3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -80,7 +80,6 @@ struct vfio_domain {
     struct iommu_domain *domain;
     struct list_head next;
     struct list_head group_list;
-    bool fgsp : 1; /* Fine-grained super pages */
     bool enforce_cache_coherency : 1;
 };
@@ -1095,8 +1094,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
          * may require hardware cache flushing, try to find the
          * largest contiguous physical memory chunk to unmap.
          */
-        for (len = PAGE_SIZE;
-             !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+        for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
             next = iommu_iova_to_phys(domain->domain, iova + len);
             if (next != phys + len)
                 break;
@@ -1833,49 +1831,6 @@ unwind:
     return ret;
 }
 
-/*
- * We change our unmap behavior slightly depending on whether the IOMMU
- * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
- * for practically any contiguous power-of-two mapping we give it. This means
- * we don't need to look for contiguous chunks ourselves to make unmapping
- * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
- * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
- * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
- * hugetlbfs is in use.
- */
-static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
-{
-    int ret, order = get_order(PAGE_SIZE * 2);
-    struct vfio_iova *region;
-    struct page *pages;
-    dma_addr_t start;
-
-    pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
-    if (!pages)
-        return;
-
-    list_for_each_entry(region, regions, list) {
-        start = ALIGN(region->start, PAGE_SIZE * 2);
-        if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
-            continue;
-
-        ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
-                IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
-                GFP_KERNEL_ACCOUNT);
-        if (!ret) {
-            size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
-
-            if (unmapped == PAGE_SIZE)
-                iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
-            else
-                domain->fgsp = true;
-        }
-        break;
-    }
-
-    __free_pages(pages, order);
-}
-
 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
                          struct iommu_group *iommu_group)
 {
@@ -2314,8 +2269,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
         }
     }
 
-    vfio_test_domain_fgsp(domain, &iova_copy);
-
     /* replay mappings on new domains */
     ret = vfio_iommu_replay(iommu, domain);
     if (ret)

From 8bb7170c5a055ea17c6857c256ee73c10ff872eb Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:50 +0800
Subject: [PATCH 02/11] hisi_acc_vfio_pci: fix XQE dma address error

The dma addresses of EQE and AEQE are wrong after migration, which
makes the guest's kernel-mode encryption services fail. Comparing
against the definition of the hardware registers, we found an error in
how the data read from the registers was combined into an address;
the combination order needs to be corrected.

Even after fixing the above problem, we still have an issue where a
guest from an old kernel can get migrated to a new kernel and may end
up with wrong data. In order to ensure that the address is correct
after migration, if an old magic number is detected, the dma address
needs to be updated.
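A minimal sketch of the corrected combination (not part of the patch; the
QM_XQC_* constants are those introduced in the hunks below): each register
holds 32 bits, so the 64-bit DMA address is rebuilt high word first.

    static inline u64 xqc_dma_addr(const u32 *dw)
    {
        u64 addr = dw[QM_XQC_ADDR_HIGH];   /* high 32 bits (dw[0x2]) */

        addr <<= QM_XQC_ADDR_OFFSET;       /* shift up by 32 */
        return addr | dw[QM_XQC_ADDR_LOW]; /* OR in low 32 bits (dw[0x1]) */
    }

The old code shifted dw[1] into the high half and OR'ed in dw[0], which does
not match the register layout (low word at index 0x1, high word at 0x2).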
Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-2-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 41 ++++++++++++++++---
 .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 14 ++++++-
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 451c639299eb..304dbdfa0e95 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -350,6 +350,32 @@ static int vf_qm_func_stop(struct hisi_qm *qm)
     return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0);
 }
 
+static int vf_qm_version_check(struct acc_vf_data *vf_data, struct device *dev)
+{
+    switch (vf_data->acc_magic) {
+    case ACC_DEV_MAGIC_V2:
+        if (vf_data->major_ver != ACC_DRV_MAJOR_VER) {
+            dev_info(dev, "migration driver version<%u.%u> not match!\n",
+                 vf_data->major_ver, vf_data->minor_ver);
+            return -EINVAL;
+        }
+        break;
+    case ACC_DEV_MAGIC_V1:
+        /* Correct dma address */
+        vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
+        vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
+        vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+        vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
+        vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
+        vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
+        break;
+    default:
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
                  struct hisi_acc_vf_migration_file *migf)
 {
@@ -363,7 +389,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
         return 0;
 
-    if (vf_data->acc_magic != ACC_DEV_MAGIC) {
+    ret = vf_qm_version_check(vf_data, dev);
+    if (ret) {
         dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
         return -EINVAL;
     }
@@ -418,7 +445,9 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     int vf_id = hisi_acc_vdev->vf_id;
     int ret;
 
-    vf_data->acc_magic = ACC_DEV_MAGIC;
+    vf_data->acc_magic = ACC_DEV_MAGIC_V2;
+    vf_data->major_ver = ACC_DRV_MAJOR_VER;
+    vf_data->minor_ver = ACC_DRV_MINOR_VER;
     /* Save device id */
     vf_data->dev_id = hisi_acc_vdev->vf_dev->device;
 
@@ -496,12 +525,12 @@ static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
         return -EINVAL;
 
     /* Every reg is 32 bit, the dma address is 64 bit. */
-    vf_data->eqe_dma = vf_data->qm_eqc_dw[1];
+    vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
     vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
-    vf_data->eqe_dma |= vf_data->qm_eqc_dw[0];
-    vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1];
+    vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+    vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
     vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
-    vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0];
+    vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
 
     /* Through SQC_BT/CQC_BT to get sqc and cqc address */
     ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 245d7537b2bc..91002ceeebc1 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -39,6 +39,9 @@
 #define QM_REG_ADDR_OFFSET	0x0004
 
 #define QM_XQC_ADDR_OFFSET	32U
+#define QM_XQC_ADDR_LOW	0x1
+#define QM_XQC_ADDR_HIGH	0x2
+
 #define QM_VF_AEQ_INT_MASK	0x0004
 #define QM_VF_EQ_INT_MASK	0x000c
 #define QM_IFC_INT_SOURCE_V	0x0020
@@ -50,10 +53,15 @@
 #define QM_EQC_DW0	0X8000
 #define QM_AEQC_DW0	0X8020
 
+#define ACC_DRV_MAJOR_VER	1
+#define ACC_DRV_MINOR_VER	0
+
+#define ACC_DEV_MAGIC_V1	0XCDCDCDCDFEEDAACC
+#define ACC_DEV_MAGIC_V2	0xAACCFEEDDECADEDE
+
 struct acc_vf_data {
 #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
     /* QM match information */
-#define ACC_DEV_MAGIC	0XCDCDCDCDFEEDAACC
     u64 acc_magic;
     u32 qp_num;
     u32 dev_id;
@@ -61,7 +69,9 @@ struct acc_vf_data {
     u32 qp_base;
     u32 vf_qm_state;
     /* QM reserved match information */
-    u32 qm_rsv_state[3];
+    u16 major_ver;
+    u16 minor_ver;
+    u32 qm_rsv_state[2];
 
     /* QM RW regs */
     u32 aeq_int_mask;

From 3495cec0787721ba7a9d5c19d0bbb66d182de584 Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:51 +0800
Subject: [PATCH 03/11] hisi_acc_vfio_pci: add eq and aeq interruption restore

To ensure that the task packets of the accelerator device are not lost
during the migration process, an EQ and AEQ command must be sent to the
device after the live migration is completed to update the completion
position of the task queue. This lets the device recheck the completed
task data and, if there are uncollected packets, resend a task
completion interrupt to the software.
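Sketched out, the restore step (mirroring vf_qm_xeqc_save() in the hunk
below) boils down to: the queue head lives in the low 16 bits of the first
(A)EQC context word, and ringing the EQ/AEQ doorbells with that head makes
the device re-scan completed tasks and re-raise any missed interrupt.

    /* heads recovered from the saved EQC/AEQC context words */
    u16 eq_head  = vf_data->qm_eqc_dw[0] & 0xFFFF;
    u16 aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF;

    qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0);   /* update EQ position */
    qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0); /* update AEQ position */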
Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-3-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 304dbdfa0e95..80217aea5475 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -470,6 +470,19 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     return 0;
 }
 
+static void vf_qm_xeqc_save(struct hisi_qm *qm,
+                struct hisi_acc_vf_migration_file *migf)
+{
+    struct acc_vf_data *vf_data = &migf->vf_data;
+    u16 eq_head, aeq_head;
+
+    eq_head = vf_data->qm_eqc_dw[0] & 0xFFFF;
+    qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0);
+
+    aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF;
+    qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0);
+}
+
 static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
                struct hisi_acc_vf_migration_file *migf)
 {
@@ -578,6 +591,9 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
         return -EINVAL;
     migf->total_length = sizeof(struct acc_vf_data);
 
+    /* Save eqc and aeqc interrupt information */
+    vf_qm_xeqc_save(vf_qm, migf);
+
     return 0;
 }

From e63c466398731bb7867f42f44b76fa984de59db2 Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:52 +0800
Subject: [PATCH 04/11] hisi_acc_vfio_pci: bugfix cache write-back issue

At present, the cache write-back is placed in the device data copy
stage, after the device has already been stopped. Writing back the
cache at that stage causes the written-back data to be empty. To ensure
that the cache data is written back successfully, the write-back needs
to be moved into the device-stop stage.
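As a hedged sketch of the corrected ordering (vf_qm_func_stop() and
vf_qm_cache_wb() are this driver's helpers; the interrupt-state check
between them is elided here): the cache is flushed while the device is
being stopped, before vf_qm_state_save() copies any data out.

    /* simplified stop sequence; full error handling as in
     * hisi_acc_vf_stop_device() in the hunks below */
    ret = vf_qm_func_stop(vf_qm);       /* pause the QM first */
    if (!ret)
        ret = vf_qm_cache_wb(vf_qm);    /* flush while stopping,
                                           not during the copy stage */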
Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-4-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 80217aea5475..d96446f499ed 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -566,7 +566,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 {
     struct acc_vf_data *vf_data = &migf->vf_data;
     struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
-    struct device *dev = &vf_qm->pdev->dev;
     int ret;
 
     if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
@@ -580,12 +579,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     vf_data->vf_qm_state = QM_READY;
     hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
 
-    ret = vf_qm_cache_wb(vf_qm);
-    if (ret) {
-        dev_err(dev, "failed to writeback QM Cache!\n");
-        return ret;
-    }
-
     ret = vf_qm_read_data(vf_qm, vf_data);
     if (ret)
         return -EINVAL;
@@ -1012,6 +1005,13 @@ static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev
         dev_err(dev, "failed to check QM INT state!\n");
         return ret;
     }
+
+    ret = vf_qm_cache_wb(vf_qm);
+    if (ret) {
+        dev_err(dev, "failed to writeback QM cache!\n");
+        return ret;
+    }
+
     return 0;
 }

From db6525a8573957faea28850392f4744e5f8f7a53 Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:53 +0800
Subject: [PATCH 05/11] hisi_acc_vfio_pci: bugfix the problem of uninstalling
 driver

In a live migration scenario, if the number of VFs at the destination
is greater than at the source, the recovery operation will fail and
qemu will not be able to complete the process, exiting after shutting
down the device FD. The abnormal close of the fd leaves the live
migration driver's reference counting unbalanced, which then prevents
the driver from being unloaded normally. Therefore, make sure the
migration file descriptor references are always released when the
device is closed.
Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-5-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index d96446f499ed..cadc82419dca 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1508,6 +1508,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
     struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
     struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
 
+    hisi_acc_vf_disable_fds(hisi_acc_vdev);
     mutex_lock(&hisi_acc_vdev->open_mutex);
     hisi_acc_vdev->dev_opened = false;
     iounmap(vf_qm->io_base);

From 2777a40998deb36f96b6afc48bd397cf58a4edf0 Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:54 +0800
Subject: [PATCH 06/11] hisi_acc_vfio_pci: bugfix live migration function
 without VF device driver

If the VF device driver is not loaded in the Guest OS and we attempt to
perform device data migration, the address of the migrated data will be
NULL. The live migration recovery operation on the destination side
will then access a NULL address, which causes access errors. Therefore,
live migration of VMs that have no VF device driver loaded does not
require device data migration. In addition, when the queue address data
obtained by the destination is empty, device queue recovery processing
is not performed.

Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-6-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index cadc82419dca..d12a350440d3 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -426,13 +426,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
         return -EINVAL;
     }
 
-    ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
-    if (ret) {
-        dev_err(dev, "failed to write QM_VF_STATE\n");
-        return ret;
-    }
-
-    hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
     hisi_acc_vdev->match_done = true;
     return 0;
 }
@@ -498,6 +491,20 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     if (migf->total_length < sizeof(struct acc_vf_data))
         return -EINVAL;
 
+    if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
+        !vf_data->sqc_dma || !vf_data->cqc_dma) {
+        dev_info(dev, "resume dma addr is NULL!\n");
+        hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+        return 0;
+    }
+
+    ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
+    if (ret) {
+        dev_err(dev, "failed to write QM_VF_STATE\n");
+        return -EINVAL;
+    }
+    hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
     qm->eqe_dma = vf_data->eqe_dma;
     qm->aeqe_dma = vf_data->aeqe_dma;
     qm->sqc_dma = vf_data->sqc_dma;
@@ -1531,6 +1538,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
     hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
     hisi_acc_vdev->pf_qm = pf_qm;
     hisi_acc_vdev->vf_dev = pdev;
+    hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
     mutex_init(&hisi_acc_vdev->state_mutex);
     mutex_init(&hisi_acc_vdev->open_mutex);

From 1dcf2cf102d72e0bbf0d7533756c9a0ccb6e91d3 Mon Sep 17 00:00:00 2001
From: Longfang Liu
Date: Sat, 10 May 2025 16:11:55 +0800
Subject: [PATCH 07/11] hisi_acc_vfio_pci: update function return values.

In this driver file, many functions call sub-functions and use ret to
store the error code of the sub-functions. However, instead of directly
returning ret to the caller, they return a converted error code, which
prevents the end-user from clearly understanding the root cause of the
error. Therefore, modify the code to directly return the error code of
the sub-functions.

Signed-off-by: Longfang Liu
Reviewed-by: Shameer Kolothum
Link: https://lore.kernel.org/r/20250510081155.55840-7-liulongfang@huawei.com
Signed-off-by: Alex Williamson
---
 .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index d12a350440d3..2149f49aeec7 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -190,9 +190,10 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
     int ret;
 
     /* Check VF state */
-    if (unlikely(hisi_qm_wait_mb_ready(qm))) {
+    ret = hisi_qm_wait_mb_ready(qm);
+    if (unlikely(ret)) {
         dev_err(&qm->pdev->dev, "QM device is not ready to write\n");
-        return -EBUSY;
+        return ret;
     }
 
     ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -325,13 +326,15 @@ static void qm_dev_cmd_init(struct hisi_qm *qm)
 static int vf_qm_cache_wb(struct hisi_qm *qm)
 {
     unsigned int val;
+    int ret;
 
     writel(0x1, qm->io_base + QM_CACHE_WB_START);
-    if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
+    ret = readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
                        val, val & BIT(0), MB_POLL_PERIOD_US,
-                       MB_POLL_TIMEOUT_US)) {
+                       MB_POLL_TIMEOUT_US);
+    if (ret) {
         dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n");
-        return -EINVAL;
+        return ret;
     }
 
     return 0;
@@ -392,7 +395,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     ret = vf_qm_version_check(vf_data, dev);
     if (ret) {
         dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
-        return -EINVAL;
+        return ret;
     }
 
     if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) {
@@ -404,7 +407,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     ret = qm_get_vft(vf_qm, &vf_qm->qp_base);
     if (ret <= 0) {
         dev_err(dev, "failed to get vft qp nums\n");
-        return -EINVAL;
+        return ret;
     }
 
     if (ret != vf_data->qp_num) {
@@ -501,7 +504,7 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
     ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
     if (ret) {
         dev_err(dev, "failed to write QM_VF_STATE\n");
-        return -EINVAL;
+        return ret;
     }
     hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
@@ -542,7 +545,7 @@ static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
 
     ret = qm_get_regs(vf_qm, vf_data);
     if (ret)
-        return -EINVAL;
+        return ret;
 
     /* Every reg is 32 bit, the dma address is 64 bit. */
     vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
@@ -556,13 +559,13 @@
     ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);
     if (ret) {
         dev_err(dev, "failed to read SQC addr!\n");
-        return -EINVAL;
+        return ret;
     }
 
     ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma);
     if (ret) {
         dev_err(dev, "failed to read CQC addr!\n");
-        return -EINVAL;
+        return ret;
     }
 
     return 0;
@@ -588,7 +591,7 @@
     ret = vf_qm_read_data(vf_qm, vf_data);
     if (ret)
-        return -EINVAL;
+        return ret;
 
     migf->total_length = sizeof(struct acc_vf_data);
     /* Save eqc and aeqc interrupt information */
     vf_qm_xeqc_save(vf_qm, migf);
@@ -1379,7 +1382,7 @@ static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vde
     ret = qm_wait_dev_not_ready(vf_qm);
     if (ret) {
         seq_puts(seq, "VF device not ready!\n");
-        return -EBUSY;
+        return ret;
     }
 
     return 0;

From 674ebb64cd5dc40827b8ac8c95bb72d328ed59ff Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 20 May 2025 16:46:30 +0300
Subject: [PATCH 08/11] vfio/mlx5: Explicitly use number of pages instead of
 allocated length

allocated_length is always a multiple of the page size, i.e. PAGE_SIZE
times a number of pages, so let's change the functions to accept a
number of pages directly. This improves code readability, simplifies
buffer handling, and enables combining DMA send/receive operations, as
will be introduced in the next patches.

Tested-by: Jens Axboe
Reviewed-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
Acked-by: Yishai Hadas
Link: https://lore.kernel.org/r/76f39993d2ca0311b3bcfe56038a669d03926815.1747747694.git.leon@kernel.org
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c  | 32 ++++++++++-----------
 drivers/vfio/pci/mlx5/cmd.h  | 10 +++----
 drivers/vfio/pci/mlx5/main.c | 56 +++++++++++++++++++++++-------------
 3 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 11eda6b207f1..377dee7765fb 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -318,8 +318,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
             struct mlx5_vhca_recv_buf *recv_buf,
             u32 *mkey)
 {
-    size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
-                recv_buf->npages;
+    size_t npages = buf ? buf->npages : recv_buf->npages;
     int err = 0, inlen;
     __be64 *mtt;
     void *mkc;
@@ -375,7 +374,7 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
     if (mvdev->mdev_detach)
         return -ENOTCONN;
 
-    if (buf->dmaed || !buf->allocated_length)
+    if (buf->dmaed || !buf->npages)
         return -EINVAL;
 
     ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
@@ -445,7 +444,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
         if (ret)
             goto err_append;
-        buf->allocated_length += filled * PAGE_SIZE;
+        buf->npages += filled;
         /* clean input for another bulk allocation */
         memset(page_list, 0, filled * sizeof(*page_list));
         to_fill = min_t(unsigned int, to_alloc,
@@ -464,8 +463,7 @@ err:
 }
 
 struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
-             size_t length,
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
              enum dma_data_direction dma_dir)
 {
     struct mlx5_vhca_data_buffer *buf;
@@ -477,9 +475,8 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
     buf->dma_dir = dma_dir;
     buf->migf = migf;
-    if (length) {
-        ret = mlx5vf_add_migration_pages(buf,
-                DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+    if (npages) {
+        ret = mlx5vf_add_migration_pages(buf, npages);
         if (ret)
             goto end;
 
@@ -505,8 +502,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
 }
 
 struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
-               size_t length, enum dma_data_direction dma_dir)
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+               enum dma_data_direction dma_dir)
 {
     struct mlx5_vhca_data_buffer *buf, *temp_buf;
     struct list_head free_list;
@@ -521,7 +518,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
     list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
         if (buf->dma_dir == dma_dir) {
             list_del_init(&buf->buf_elm);
-            if (buf->allocated_length >= length) {
+            if (buf->npages >= npages) {
                 spin_unlock_irq(&migf->list_lock);
                 goto found;
             }
@@ -535,7 +532,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
         }
     }
     spin_unlock_irq(&migf->list_lock);
-    buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+    buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);
 
 found:
     while ((temp_buf = list_first_entry_or_null(&free_list,
@@ -716,7 +713,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
     MLX5_SET(save_vhca_state_in, in, op_mod, 0);
     MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
     MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
-    MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+    MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
     MLX5_SET(save_vhca_state_in, in, incremental, inc);
     MLX5_SET(save_vhca_state_in, in, set_track, track);
 
@@ -738,8 +735,11 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
     }
 
     if (!header_buf) {
-        header_buf = mlx5vf_get_data_buffer(migf,
-            sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+        header_buf = mlx5vf_get_data_buffer(
+            migf,
+            DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+                     PAGE_SIZE),
+            DMA_NONE);
         if (IS_ERR(header_buf)) {
             err = PTR_ERR(header_buf);
             goto err_free;

diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index df421dc6de04..7d4a833b6900 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -56,7 +56,7 @@ struct mlx5_vhca_data_buffer {
     struct sg_append_table table;
     loff_t start_pos;
     u64 length;
-    u64 allocated_length;
+    u32 npages;
     u32 mkey;
     enum dma_data_direction dma_dir;
     u8 dmaed:1;
@@ -217,12 +217,12 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
 struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
-             size_t length, enum dma_data_direction dma_dir);
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+             enum dma_data_direction dma_dir);
 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
 struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
-               size_t length, enum dma_data_direction dma_dir);
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+               enum dma_data_direction dma_dir);
 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
 struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
                        unsigned long offset);

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 709543e7eb04..bc0f468f741b 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -308,6 +308,7 @@ static struct mlx5_vhca_data_buffer *
 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
                   u8 index, size_t required_length)
 {
+    u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
     struct mlx5_vhca_data_buffer *buf = migf->buf[index];
     u8 chunk_num;
 
@@ -315,12 +316,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
     chunk_num = buf->stop_copy_chunk_num;
     buf->migf->buf[index] = NULL;
     /* Checking whether the pre-allocated buffer can fit */
-    if (buf->allocated_length >= required_length)
+    if (buf->npages >= npages)
         return buf;
 
     mlx5vf_put_data_buffer(buf);
-    buf = mlx5vf_get_data_buffer(buf->migf, required_length,
-                     DMA_FROM_DEVICE);
+    buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
     if (IS_ERR(buf))
         return buf;
 
@@ -373,7 +373,8 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
     u8 *to_buff;
     int ret;
 
-    header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+    header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
+                        DMA_NONE);
     if (IS_ERR(header_buf))
         return PTR_ERR(header_buf);
 
@@ -388,7 +389,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
     to_buff = kmap_local_page(page);
     memcpy(to_buff, &header, sizeof(header));
     header_buf->length = sizeof(header);
-    data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
+    data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
     memcpy(to_buff + sizeof(header), &data, sizeof(data));
     header_buf->length += sizeof(data);
     kunmap_local(to_buff);
@@ -437,15 +438,20 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
 
     num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
     for (i = 0; i < num_chunks; i++) {
-        buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+        buf = mlx5vf_get_data_buffer(
+            migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
+            DMA_FROM_DEVICE);
         if (IS_ERR(buf)) {
             ret = PTR_ERR(buf);
             goto err;
         }
 
         migf->buf[i] = buf;
-        buf = mlx5vf_get_data_buffer(migf,
-            sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+        buf = mlx5vf_get_data_buffer(
+            migf,
+            DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+                     PAGE_SIZE),
+            DMA_NONE);
         if (IS_ERR(buf)) {
             ret = PTR_ERR(buf);
             goto err;
@@ -553,7 +559,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
      * We finished transferring the current state and the device has a
      * dirty state, save a new state to be ready for.
      */
-    buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+    buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
+                     DMA_FROM_DEVICE);
     if (IS_ERR(buf)) {
         ret = PTR_ERR(buf);
         mlx5vf_mark_err(migf);
@@ -675,8 +682,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 
     if (track) {
         /* leave the allocated buffer ready for the stop-copy phase */
-        buf = mlx5vf_alloc_data_buffer(migf,
-            migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+        buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
+                           DMA_FROM_DEVICE);
         if (IS_ERR(buf)) {
             ret = PTR_ERR(buf);
             goto out_pd;
@@ -917,11 +924,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
             goto out_unlock;
         break;
     case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
-        if (vhca_buf_header->allocated_length < migf->record_size) {
+    {
+        u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
+
+        if (vhca_buf_header->npages < npages) {
             mlx5vf_free_data_buffer(vhca_buf_header);
 
-            migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
-                    migf->record_size, DMA_NONE);
+            migf->buf_header[0] = mlx5vf_alloc_data_buffer(
+                migf, npages, DMA_NONE);
             if (IS_ERR(migf->buf_header[0])) {
                 ret = PTR_ERR(migf->buf_header[0]);
                 migf->buf_header[0] = NULL;
@@ -934,6 +944,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
         vhca_buf_header->start_pos = migf->max_pos;
         migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
         break;
+    }
     case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
         ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
                              &buf, &len, pos, &done);
@@ -944,12 +955,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
     {
         u64 size = max(migf->record_size,
                    migf->stop_copy_prep_size);
+        u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);
 
-        if (vhca_buf->allocated_length < size) {
+        if (vhca_buf->npages < npages) {
             mlx5vf_free_data_buffer(vhca_buf);
 
-            migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
-                    size, DMA_TO_DEVICE);
+            migf->buf[0] = mlx5vf_alloc_data_buffer(
+                migf, npages, DMA_TO_DEVICE);
             if (IS_ERR(migf->buf[0])) {
                 ret = PTR_ERR(migf->buf[0]);
                 migf->buf[0] = NULL;
@@ -1037,8 +1049,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
     }
 
     migf->buf[0] = buf;
-    buf = mlx5vf_alloc_data_buffer(migf,
-        sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+    buf = mlx5vf_alloc_data_buffer(
+        migf,
+        DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+                 PAGE_SIZE),
+        DMA_NONE);
     if (IS_ERR(buf)) {
         ret = PTR_ERR(buf);
         goto out_buf;
@@ -1148,7 +1163,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
                     MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
         if (ret)
             return ERR_PTR(ret);
-        buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+        buf = mlx5vf_get_data_buffer(migf,
+                DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
         if (IS_ERR(buf))
             return ERR_CAST(buf);
         /* pre_copy cleanup */

From ac6c973a480bcbd1161974b6b8ec6fd12e87b506 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 20 May 2025 16:46:31 +0300
Subject: [PATCH 09/11] vfio/mlx5: Rewrite create mkey flow to allow better
 code reuse

Change the creation of the mkey to be performed in multiple steps:
data allocation, DMA setup, and the actual call to HW to create that
mkey. In this new flow, the whole input to the MKEY command is saved,
which eliminates the need to keep an array of pointers holding the DMA
addresses of the receive list and, in future patches, of the send list
too.

Besides reducing memory usage and eliminating unnecessary data movement
when setting up the MKEY input, the code is prepared for future reuse.

Tested-by: Jens Axboe
Reviewed-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
Acked-by: Yishai Hadas
Link: https://lore.kernel.org/r/d4ad0384fbd1e23a607cbbe9e5756748f3a761d9.1747747694.git.leon@kernel.org
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c | 163 ++++++++++++++++++++----------------
 drivers/vfio/pci/mlx5/cmd.h |   4 +-
 2 files changed, 94 insertions(+), 73 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 377dee7765fb..84dc3bc128c6 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -313,39 +313,21 @@ err_exec:
     return ret;
 }
 
-static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-            struct mlx5_vhca_data_buffer *buf,
-            struct mlx5_vhca_recv_buf *recv_buf,
-            u32 *mkey)
+static u32 *alloc_mkey_in(u32 npages, u32 pdn)
 {
-    size_t npages = buf ? buf->npages : recv_buf->npages;
-    int err = 0, inlen;
-    __be64 *mtt;
+    int inlen;
     void *mkc;
     u32 *in;
 
     inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
-        sizeof(*mtt) * round_up(npages, 2);
+        sizeof(__be64) * round_up(npages, 2);
 
-    in = kvzalloc(inlen, GFP_KERNEL);
+    in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
     if (!in)
-        return -ENOMEM;
+        return NULL;
 
     MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
          DIV_ROUND_UP(npages, 2));
-    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-
-    if (buf) {
-        struct sg_dma_page_iter dma_iter;
-
-        for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
-            *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
-    } else {
-        int i;
-
-        for (i = 0; i < npages; i++)
-            *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
-    }
 
     mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
     MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
@@ -359,9 +341,30 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
     MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
     MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
     MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
-    err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
-    kvfree(in);
-    return err;
+
+    return in;
+}
+
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages,
+               struct mlx5_vhca_data_buffer *buf, u32 *mkey_in,
+               u32 *mkey)
+{
+    __be64 *mtt;
+    int inlen;
+
+    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+    if (buf) {
+        struct sg_dma_page_iter dma_iter;
+
+        for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
+            *mtt++ = cpu_to_be64(
+                sg_page_iter_dma_address(&dma_iter));
+    }
+
+    inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+        sizeof(__be64) * round_up(npages, 2);
+
+    return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
+}
 
 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
 {
@@ -374,20 +377,28 @@
     if (mvdev->mdev_detach)
         return -ENOTCONN;
 
-    if (buf->dmaed || !buf->npages)
+    if (buf->mkey_in || !buf->npages)
         return -EINVAL;
 
     ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
     if (ret)
         return ret;
 
-    ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
-    if (ret)
+    buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
+    if (!buf->mkey_in) {
+        ret = -ENOMEM;
         goto err;
+    }
 
-    buf->dmaed = true;
+    ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey);
+    if (ret)
+        goto err_create_mkey;
 
     return 0;
+
+err_create_mkey:
+    kvfree(buf->mkey_in);
+    buf->mkey_in = NULL;
 err:
     dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
     return ret;
@@ -401,8 +412,9 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
     lockdep_assert_held(&migf->mvdev->state_mutex);
     WARN_ON(migf->mvdev->mdev_detach);
 
-    if (buf->dmaed) {
+    if (buf->mkey_in) {
         mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+        kvfree(buf->mkey_in);
         dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
                   buf->dma_dir, 0);
     }
@@ -783,7 +795,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
     if (mvdev->mdev_detach)
         return -ENOTCONN;
 
-    if (!buf->dmaed) {
+    if (!buf->mkey_in) {
         err = mlx5vf_dma_data_buffer(buf);
         if (err)
             return err;
@@ -1384,56 +1396,54 @@ err:
     kvfree(recv_buf->page_list);
     return -ENOMEM;
 }
-
-static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
-                   struct mlx5_vhca_recv_buf *recv_buf)
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+                 u32 *mkey_in)
 {
-    int i, j;
+    dma_addr_t addr;
+    __be64 *mtt;
+    int i;
 
-    recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
-                       sizeof(*recv_buf->dma_addrs),
-                       GFP_KERNEL_ACCOUNT);
-    if (!recv_buf->dma_addrs)
-        return -ENOMEM;
-
-    for (i = 0; i < recv_buf->npages; i++) {
-        recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
-                              recv_buf->page_list[i],
-                              0, PAGE_SIZE,
-                              DMA_FROM_DEVICE);
-        if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
-            goto error;
+    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+    for (i = npages - 1; i >= 0; i--) {
+        addr = be64_to_cpu(mtt[i]);
+        dma_unmap_single(mdev->device, addr, PAGE_SIZE,
+                 DMA_FROM_DEVICE);
     }
+}
+
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+                  struct page **page_list, u32 *mkey_in)
+{
+    dma_addr_t addr;
+    __be64 *mtt;
+    int i;
+
+    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+    for (i = 0; i < npages; i++) {
+        addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE,
+                    DMA_FROM_DEVICE);
+        if (dma_mapping_error(mdev->device, addr))
+            goto error;
+
+        *mtt++ = cpu_to_be64(addr);
+    }
+
     return 0;
 
 error:
-    for (j = 0; j < i; j++)
-        dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
-                 PAGE_SIZE, DMA_FROM_DEVICE);
-
-    kvfree(recv_buf->dma_addrs);
+    unregister_dma_pages(mdev, i, mkey_in);
     return -ENOMEM;
 }
 
-static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
-                      struct mlx5_vhca_recv_buf *recv_buf)
-{
-    int i;
-
-    for (i = 0; i < recv_buf->npages; i++)
-        dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
-                 PAGE_SIZE, DMA_FROM_DEVICE);
-
-    kvfree(recv_buf->dma_addrs);
-}
-
 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
                       struct mlx5_vhca_qp *qp)
 {
     struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
 
     mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
-    unregister_dma_recv_pages(mdev, recv_buf);
+    unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in);
+    kvfree(recv_buf->mkey_in);
     free_recv_pages(&qp->recv_buf);
 }
 
@@ -1449,18 +1459,29 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
     if (err < 0)
         return err;
 
-    err = register_dma_recv_pages(mdev, recv_buf);
-    if (err)
+    recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
+    if (!recv_buf->mkey_in) {
+        err = -ENOMEM;
         goto end;
+    }
 
-    err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+    err = register_dma_pages(mdev, npages, recv_buf->page_list,
+                 recv_buf->mkey_in);
+    if (err)
+        goto err_register_dma;
+
+    err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in,
+              &recv_buf->mkey);
     if (err)
         goto err_create_mkey;
 
     return 0;
 
 err_create_mkey:
-    unregister_dma_recv_pages(mdev, recv_buf);
+    unregister_dma_pages(mdev, npages, recv_buf->mkey_in);
+err_register_dma:
+    kvfree(recv_buf->mkey_in);
+    recv_buf->mkey_in = NULL;
 end:
     free_recv_pages(recv_buf);
     return err;

diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 7d4a833b6900..25dd6ff54591 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -58,8 +58,8 @@ struct mlx5_vhca_data_buffer {
     u64 length;
     u32 npages;
     u32 mkey;
+    u32 *mkey_in;
     enum dma_data_direction dma_dir;
-    u8 dmaed:1;
     u8 stop_copy_chunk_num;
     struct list_head buf_elm;
     struct mlx5_vf_migration_file *migf;
@@ -133,8 +133,8 @@ struct mlx5_vhca_cq {
 struct mlx5_vhca_recv_buf {
     u32 npages;
     struct page **page_list;
-    dma_addr_t *dma_addrs;
     u32 next_rq_offset;
+    u32 *mkey_in;
     u32 mkey;
 };

From 089803c40193ccce21da04d9f047468eba3aa47f Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 20 May 2025 16:46:32 +0300
Subject: [PATCH 10/11] vfio/mlx5: Enable the DMA link API

Remove the intermediate scatter-gather table completely and enable the
new DMA link API.
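The new mapping follows a two-path pattern (a sketch of the
register_dma_pages() logic in the hunks below; error unwind omitted): try
to allocate one contiguous IOVA range and link each page into it, falling
back to independent per-page dma_map_page() mappings when the IOVA path is
unavailable.

    if (dma_iova_try_alloc(dev, &state, 0, npages * PAGE_SIZE)) {
        /* single IOVA range: link each page at its offset, then sync */
        for (i = 0; i < npages; i++)
            err = dma_iova_link(dev, &state, page_to_phys(pages[i]),
                        i * PAGE_SIZE, PAGE_SIZE, dir, 0);
        err = dma_iova_sync(dev, &state, 0, npages * PAGE_SIZE);
    } else {
        /* fallback: per-page mappings, one DMA address each */
        for (i = 0; i < npages; i++)
            addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, dir);
    }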
Tested-by: Jens Axboe
Reviewed-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
Acked-by: Yishai Hadas
Link: https://lore.kernel.org/r/f71638d50c9c79a462f2e0423501b1de77617656.1747747694.git.leon@kernel.org
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c  | 300 +++++++++++++++--------------------
 drivers/vfio/pci/mlx5/cmd.h  |  21 ++-
 drivers/vfio/pci/mlx5/main.c |  31 ----
 3 files changed, 146 insertions(+), 206 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 84dc3bc128c6..5b919a0b2524 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -345,28 +345,80 @@ static u32 *alloc_mkey_in(u32 npages, u32 pdn)
     return in;
 }
 
-static int create_mkey(struct mlx5_core_dev *mdev, u32 npages,
-               struct mlx5_vhca_data_buffer *buf, u32 *mkey_in,
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
                u32 *mkey)
 {
-    __be64 *mtt;
-    int inlen;
-
-    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
-    if (buf) {
-        struct sg_dma_page_iter dma_iter;
-
-        for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
-            *mtt++ = cpu_to_be64(
-                sg_page_iter_dma_address(&dma_iter));
-    }
-
-    inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+    int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
             sizeof(__be64) * round_up(npages, 2);
 
     return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
 }
 
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+                 u32 *mkey_in, struct dma_iova_state *state,
+                 enum dma_data_direction dir)
+{
+    dma_addr_t addr;
+    __be64 *mtt;
+    int i;
+
+    if (dma_use_iova(state)) {
+        dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir,
+                 0);
+    } else {
+        mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in,
+                         klm_pas_mtt);
+        for (i = npages - 1; i >= 0; i--) {
+            addr = be64_to_cpu(mtt[i]);
+            dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir);
+        }
+    }
+}
+
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+                  struct page **page_list, u32 *mkey_in,
+                  struct dma_iova_state *state,
+                  enum dma_data_direction dir)
+{
+    dma_addr_t addr;
+    size_t mapped = 0;
+    __be64 *mtt;
+    int i, err;
+
+    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+    if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
+        addr = state->addr;
+        for (i = 0; i < npages; i++) {
+            err = dma_iova_link(mdev->device, state,
+                        page_to_phys(page_list[i]), mapped,
+                        PAGE_SIZE, dir, 0);
+            if (err)
+                goto error;
+            *mtt++ = cpu_to_be64(addr);
+            addr += PAGE_SIZE;
+            mapped += PAGE_SIZE;
+        }
+        err = dma_iova_sync(mdev->device, state, 0, mapped);
+        if (err)
+            goto error;
+    } else {
+        for (i = 0; i < npages; i++) {
+            addr = dma_map_page(mdev->device, page_list[i], 0,
+                        PAGE_SIZE, dir);
+            err = dma_mapping_error(mdev->device, addr);
+            if (err)
+                goto error;
+            *mtt++ = cpu_to_be64(addr);
+        }
+    }
+    return 0;
+
+error:
+    unregister_dma_pages(mdev, i, mkey_in, state, dir);
+    return err;
+}
+
 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
 {
     struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
@@ -380,98 +432,90 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
     if (buf->mkey_in || !buf->npages)
         return -EINVAL;
 
-    ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
-    if (ret)
-        return ret;
-
     buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
-    if (!buf->mkey_in) {
-        ret = -ENOMEM;
-        goto err;
-    }
+    if (!buf->mkey_in)
+        return -ENOMEM;
 
-    ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey);
+    ret = register_dma_pages(mdev, buf->npages, buf->page_list,
+                 buf->mkey_in, &buf->state, buf->dma_dir);
+    if (ret)
+        goto err_register_dma;
+
+    ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
     if (ret)
         goto err_create_mkey;
 
     return 0;
 
 err_create_mkey:
+    unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state,
+                 buf->dma_dir);
+err_register_dma:
     kvfree(buf->mkey_in);
     buf->mkey_in = NULL;
-err:
-    dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
     return ret;
 }
 
+static void free_page_list(u32 npages, struct page **page_list)
+{
+    int i;
+
+    /* Undo alloc_pages_bulk() */
+    for (i = npages - 1; i >= 0; i--)
+        __free_page(page_list[i]);
+
+    kvfree(page_list);
+}
+
 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
 {
-    struct mlx5_vf_migration_file *migf = buf->migf;
-    struct sg_page_iter sg_iter;
+    struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+    struct mlx5_core_dev *mdev = mvdev->mdev;
 
-    lockdep_assert_held(&migf->mvdev->state_mutex);
-    WARN_ON(migf->mvdev->mdev_detach);
+    lockdep_assert_held(&mvdev->state_mutex);
+    WARN_ON(mvdev->mdev_detach);
 
     if (buf->mkey_in) {
-        mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+        mlx5_core_destroy_mkey(mdev, buf->mkey);
+        unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
+                     &buf->state, buf->dma_dir);
         kvfree(buf->mkey_in);
-        dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
-                  buf->dma_dir, 0);
     }
 
-    /* Undo alloc_pages_bulk() */
-    for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
-        __free_page(sg_page_iter_page(&sg_iter));
-    sg_free_append_table(&buf->table);
+    free_page_list(buf->npages, buf->page_list);
     kfree(buf);
 }
 
-static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
-                      unsigned int npages)
+static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages)
 {
-    unsigned int to_alloc = npages;
-    struct page **page_list;
-    unsigned long filled;
-    unsigned int to_fill;
-    int ret;
+    unsigned int filled, done = 0;
     int i;
 
-    to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
-    page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
-    if (!page_list)
+    *page_list =
+        kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+    if (!*page_list)
         return -ENOMEM;
 
-    do {
-        filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill,
-                      page_list);
-        if (!filled) {
-            ret = -ENOMEM;
+    for (;;) {
+        filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done,
+                      *page_list + done);
+        if (!filled)
             goto err;
-        }
 
-        to_alloc -= filled;
-        ret = sg_alloc_append_table_from_pages(
-            &buf->table, page_list, filled, 0,
-            filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
-            GFP_KERNEL_ACCOUNT);
-        if (ret)
-            goto err_append;
-        buf->npages += filled;
-        /* clean input for another bulk allocation */
-        memset(page_list, 0, filled * sizeof(*page_list));
-        to_fill = min_t(unsigned int, to_alloc,
-                PAGE_SIZE / sizeof(*page_list));
-    } while (to_alloc > 0);
+        done += filled;
+        if (done == npages)
+            break;
+    }
 
-    kvfree(page_list);
     return 0;
 
-err_append:
-    for (i = filled - 1; i >= 0; i--)
-        __free_page(page_list[i]);
 err:
-    kvfree(page_list);
-    return ret;
+    for (i = 0; i < done; i++)
+        __free_page(*page_list[i]);
+
+    kvfree(*page_list);
+    *page_list = NULL;
+    return -ENOMEM;
 }
 
 struct mlx5_vhca_data_buffer *
@@ -488,10 +532,12 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
     buf->dma_dir = dma_dir;
     buf->migf = migf;
     if (npages) {
-        ret = mlx5vf_add_migration_pages(buf, npages);
+        ret = mlx5vf_add_pages(&buf->page_list, npages);
         if (ret)
             goto end;
 
+        buf->npages = npages;
+
         if (dma_dir != DMA_NONE) {
             ret = mlx5vf_dma_data_buffer(buf);
             if (ret)
@@ -1350,101 +1396,16 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
     kfree(qp);
 }
 
-static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
-{
-    int i;
-
-    /* Undo alloc_pages_bulk() */
-    for (i = 0; i < recv_buf->npages; i++)
-        __free_page(recv_buf->page_list[i]);
-
-    kvfree(recv_buf->page_list);
-}
-
-static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
-                unsigned int npages)
-{
-    unsigned int filled = 0, done = 0;
-    int i;
-
-    recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
-                       GFP_KERNEL_ACCOUNT);
-    if (!recv_buf->page_list)
-        return -ENOMEM;
-
-    for (;;) {
-        filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT,
-                      npages - done,
-                      recv_buf->page_list + done);
-        if (!filled)
-            goto err;
-
-        done += filled;
-        if (done == npages)
-            break;
-    }
-
-    recv_buf->npages = npages;
-    return 0;
-
-err:
-    for (i = 0; i < npages; i++) {
-        if (recv_buf->page_list[i])
-            __free_page(recv_buf->page_list[i]);
-    }
-
-    kvfree(recv_buf->page_list);
-    return -ENOMEM;
-}
-static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
-                 u32 *mkey_in)
-{
-    dma_addr_t addr;
-    __be64 *mtt;
-    int i;
-
-    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
-    for (i = npages - 1; i >= 0; i--) {
-        addr = be64_to_cpu(mtt[i]);
-        dma_unmap_single(mdev->device, addr, PAGE_SIZE,
-                 DMA_FROM_DEVICE);
-    }
-}
-
-static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
-                  struct page **page_list, u32 *mkey_in)
-{
-    dma_addr_t addr;
-    __be64 *mtt;
-    int i;
-
-    mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
-
-    for (i = 0; i < npages; i++) {
-        addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE,
-                    DMA_FROM_DEVICE);
-        if (dma_mapping_error(mdev->device, addr))
-            goto error;
-
-        *mtt++ = cpu_to_be64(addr);
-    }
-
-    return 0;
-
-error:
-    unregister_dma_pages(mdev, i, mkey_in);
-    return -ENOMEM;
-}
-
 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
                       struct mlx5_vhca_qp *qp)
 {
     struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
 
     mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
-    unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in);
+    unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
+                 &recv_buf->state, DMA_FROM_DEVICE);
     kvfree(recv_buf->mkey_in);
-    free_recv_pages(&qp->recv_buf);
+    free_page_list(recv_buf->npages, recv_buf->page_list);
 }
 
 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
@@ -1455,10 +1416,12 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
     struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
     int err;
 
-    err = alloc_recv_pages(recv_buf, npages);
-    if (err < 0)
+    err = mlx5vf_add_pages(&recv_buf->page_list, npages);
+    if (err)
         return err;
 
+    recv_buf->npages = npages;
+
     recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
     if (!recv_buf->mkey_in) {
         err = -ENOMEM;
@@ -1466,24 +1429,25 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
     }
 
     err = register_dma_pages(mdev, npages, recv_buf->page_list,
-                 recv_buf->mkey_in);
+                 recv_buf->mkey_in, &recv_buf->state,
+                 DMA_FROM_DEVICE);
     if (err)
         goto err_register_dma;
 
-    err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in,
-              &recv_buf->mkey);
+    err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
     if (err)
         goto err_create_mkey;
 
     return 0;
 
 err_create_mkey:
-    unregister_dma_pages(mdev, npages, recv_buf->mkey_in);
+    unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state,
+                 DMA_FROM_DEVICE);
 err_register_dma:
     kvfree(recv_buf->mkey_in);
     recv_buf->mkey_in = NULL;
 end:
-    free_recv_pages(recv_buf);
+    free_page_list(npages, recv_buf->page_list);
     return err;
 }

diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 25dd6ff54591..d7821b5ca772 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -53,7 +53,8 @@ struct mlx5_vf_migration_header {
 };
 
 struct mlx5_vhca_data_buffer {
-    struct sg_append_table table;
+    struct page **page_list;
+    struct dma_iova_state state;
     loff_t start_pos;
     u64 length;
     u32 npages;
@@ -63,10 +64,6 @@ struct mlx5_vhca_data_buffer {
     u8 stop_copy_chunk_num;
     struct list_head buf_elm;
     struct mlx5_vf_migration_file *migf;
-    /* Optimize mlx5vf_get_migration_page() for sequential access */
-    struct scatterlist *last_offset_sg;
-    unsigned int sg_last_entry;
-    unsigned long last_offset;
 };
 
 struct mlx5vf_async_data {
@@ -133,6 +130,7 @@ struct mlx5_vhca_cq {
 struct mlx5_vhca_recv_buf {
     u32 npages;
     struct page **page_list;
+    struct dma_iova_state state;
     u32 next_rq_offset;
     u32 *mkey_in;
     u32 mkey;
@@ -224,8 +222,17 @@ struct mlx5_vhca_data_buffer *
 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
                enum dma_data_direction dma_dir);
 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
-struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
-                       unsigned long offset);
+static inline struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+              unsigned long offset)
+{
+    int page_entry = offset / PAGE_SIZE;
+
+    if (page_entry >= buf->npages)
+        return NULL;
+
+    return buf->page_list[page_entry];
+}
 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
             enum mlx5_vf_migf_state *last_save_state);

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index bc0f468f741b..93f894fe60d2 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -34,37 +34,6 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
                 core_device);
 }
 
-struct page *
-mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
-              unsigned long offset)
-{
-    unsigned long cur_offset = 0;
-    struct scatterlist *sg;
-    unsigned int i;
-
-    /* All accesses are sequential */
-    if (offset < buf->last_offset || !buf->last_offset_sg) {
-        buf->last_offset = 0;
-        buf->last_offset_sg = buf->table.sgt.sgl;
-        buf->sg_last_entry = 0;
-    }
-
-    cur_offset = buf->last_offset;
-
-    for_each_sg(buf->last_offset_sg, sg,
-            buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
-        if (offset < sg->length + cur_offset) {
-            buf->last_offset_sg = sg;
-            buf->sg_last_entry += i;
-            buf->last_offset = cur_offset;
-            return nth_page(sg_page(sg),
-                    (offset - cur_offset) / PAGE_SIZE);
-        }
-        cur_offset += sg->length;
-    }
-    return NULL;
-}
-
 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 {
     mutex_lock(&migf->lock);

From 4518e5a60c7fbf0cdff393c2681db39d77b4f87e Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Wed, 21 May 2025 11:46:47 +0800
Subject: [PATCH 11/11] vfio/type1: Fix error unwind in migration dirty bitmap
 allocation

When setting up dirty page tracking at the vfio IOMMU backend for
device migration, if an error is encountered allocating a tracking
bitmap, the unwind loop fails to free previously allocated tracking
bitmaps.
This occurs because the wrong loop index is used to generate the
tracking object. This results in unintended memory usage for the life
of the current DMA mappings where bitmaps were successfully allocated.
Use the correct loop index to derive the tracking object for freeing
during unwind.

Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking")
Signed-off-by: Li RongQing
Link: https://lore.kernel.org/r/20250521034647.2877-1-lirongqing@baidu.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/vfio_iommu_type1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index afc1449335c3..1136d7ac6b59 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -292,7 +292,7 @@ static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
             struct rb_node *p;
 
             for (p = rb_prev(n); p; p = rb_prev(p)) {
-                struct vfio_dma *dma = rb_entry(n,
+                struct vfio_dma *dma = rb_entry(p,
                             struct vfio_dma, node);
 
                 vfio_dma_bitmap_free(dma);