From 474d9f32766a104fc8703c9a76815769139edd60 Mon Sep 17 00:00:00 2001 From: GuokaiXu Date: Mon, 4 Sep 2023 10:32:36 +0800 Subject: [PATCH 01/52] iommufd: Fix spelling errors in comments requres -> requires dramtically -> dramatically Link: https://lore.kernel.org/r/31680D47D9533D91+20230904023236.GA12494@xgk8823 Signed-off-by: GuokaiXu Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 2 +- tools/testing/selftests/iommu/iommufd_fail_nth.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 33d08600be13..b7249ffc6750 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1729,7 +1729,7 @@ TEST_F(vfio_compat_mock_domain, map) ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); - /* UNMAP_FLAG_ALL requres 0 iova/size */ + /* UNMAP_FLAG_ALL requires 0 iova/size */ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL; EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index a220ca2a689d..36e7aa4f615c 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -105,7 +105,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, /* * This is just an arbitrary limit based on the current kernel - * situation. Changes in the kernel can dramtically change the number of + * situation. Changes in the kernel can dramatically change the number of * required fault injection sites, so if this hits it doesn't * necessarily mean a test failure, just that the limit has to be made * bigger. 
From bb812e0069ce5de9af2a7910951c8c95632cebe3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 18 Sep 2023 18:16:37 -0700 Subject: [PATCH 02/52] iommufd/selftest: Iterate idev_ids in mock_domain's alloc_hwpt test The point in iterating variant->mock_domains is to test the idev_ids[0] and idev_ids[1]. So use it instead of keeping testing idev_ids[0] only. Link: https://lore.kernel.org/r/20230919011637.16483-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index b7249ffc6750..f094a08a17ed 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1407,7 +1407,7 @@ TEST_F(iommufd_mock_domain, alloc_hwpt) uint32_t stddev_id; uint32_t hwpt_id; - test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id); + test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, &hwpt_id); test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); From 909f4abd1097769d024c3a9c2e59c2fbe5d2d0c0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:23 -0700 Subject: [PATCH 03/52] iommu: Add new iommu op to create domains owned by userspace Introduce a new iommu_domain op to create domains owned by userspace, e.g. through IOMMUFD. These domains have a few different properties compared to kernel owned domains: - They may be PAGING domains, but created with special parameters. For instance aperture size changes/number of levels, different IOPTE formats, or other things necessary to make a vIOMMU work - We have to track all the memory allocations with GFP_KERNEL_ACCOUNT to make the cgroup sandbox stronger - Device-specialty domains, such as NESTED domains can be created by IOMMUFD. 
The new op clearly says the domain is being created by IOMMUFD, that the domain is intended for userspace use, and it provides a way to pass user flags or a driver specific uAPI structure to customize the created domain to exactly what the vIOMMU userspace driver requires. iommu drivers that cannot support VFIO/IOMMUFD should not support this op. This includes any driver that cannot provide a fully functional PAGING domain. This new op for now is only supposed to be used by IOMMUFD, hence no wrapper for it. IOMMUFD would call the callback directly. As for domain free, IOMMUFD would use iommu_domain_free(). Link: https://lore.kernel.org/r/20230928071528.26258-2-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c50a769d569a..3861d66b65c1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -234,7 +234,15 @@ struct iommu_iotlb_gather { * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate iommu domain + * @domain_alloc: allocate and return an iommu domain if success. Otherwise + * NULL is returned. The domain is not fully initialized until + * the caller iommu_domain_alloc() returns. + * @domain_alloc_user: Allocate an iommu domain corresponding to the input + * parameters as defined in include/uapi/linux/iommufd.h. + * Unlike @domain_alloc, it is called only by IOMMUFD and + * must fully initialize the new domain before return. + * Upon success, a domain is returned. Upon failure, + * ERR_PTR must be returned. 
* @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -267,6 +275,7 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); From 7975b722087fa23ff3ad1ff4998b8572a7e17e84 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:24 -0700 Subject: [PATCH 04/52] iommufd: Use the domain_alloc_user() op for domain allocation Make IOMMUFD use the domain_alloc_user() op by default for iommu_domain creation. IOMMUFD needs to support iommu_domain allocation with parameters from userspace in nested support, and a driver is expected to implement everything under this op. If the iommu driver doesn't provide domain_alloc_user callback then IOMMUFD falls back to using iommu_domain_alloc() with an UNMANAGED type if possible. 
Link: https://lore.kernel.org/r/20230928071528.26258-3-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index cf2c1504e20d..48874f896521 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -5,6 +5,7 @@ #include #include +#include "../iommu-priv.h" #include "iommufd_private.h" void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) @@ -74,6 +75,7 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, bool immediate_attach) { + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hw_pagetable *hwpt; int rc; @@ -88,10 +90,19 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, refcount_inc(&ioas->obj.users); hwpt->ioas = ioas; - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; - goto out_abort; + if (ops->domain_alloc_user) { + hwpt->domain = ops->domain_alloc_user(idev->dev, 0); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + } else { + hwpt->domain = iommu_domain_alloc(idev->dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } } /* From 89d63875d80ea127280c60dd4cd101af1d9b6557 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:25 -0700 Subject: [PATCH 05/52] iommufd: Flow user flags for domain allocation to domain_alloc_user() Extends iommufd_hw_pagetable_alloc() to accept user flags, the uAPI will provide the flags. 
Link: https://lore.kernel.org/r/20230928071528.26258-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 9 ++++++--- drivers/iommu/iommufd/iommufd_private.h | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ce78c3671539..e88fa73a45e6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -540,7 +540,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, } hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, - immediate_attach); + 0, immediate_attach); if (IS_ERR(hwpt)) { destroy_hwpt = ERR_CAST(hwpt); goto out_unlock; diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 48874f896521..5be7a31cbd9c 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -61,6 +61,7 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt * * Allocate a new iommu_domain and return it as a hw_pagetable. 
The HWPT @@ -73,7 +74,8 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) */ struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach) + struct iommufd_device *idev, u32 flags, + bool immediate_attach) { const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hw_pagetable *hwpt; @@ -91,7 +93,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt->ioas = ioas; if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, 0); + hwpt->domain = ops->domain_alloc_user(idev->dev, flags); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -166,7 +168,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } mutex_lock(&ioas->mutex); - hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); + hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, + idev, cmd->flags, false); if (IS_ERR(hwpt)) { rc = PTR_ERR(hwpt); goto out_unlock; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2c58670011fe..3064997a0181 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -242,7 +242,8 @@ struct iommufd_hw_pagetable { struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach); + struct iommufd_device *idev, u32 flags, + bool immediate_attach); int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); From 4ff542163397073f86eda484318d61980ff1031d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:26 -0700 Subject: [PATCH 06/52] iommufd: Support allocating nested parent domain Extend IOMMU_HWPT_ALLOC to allocate domains to be used as 
parent (stage-2) in nested translation. Add IOMMU_HWPT_ALLOC_NEST_PARENT to the uAPI. Link: https://lore.kernel.org/r/20230928071528.26258-5-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 5 ++++- include/uapi/linux/iommufd.h | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 5be7a31cbd9c..8b3d2875d642 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -83,6 +83,9 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); + if (flags && !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); if (IS_ERR(hwpt)) return hwpt; @@ -154,7 +157,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) struct iommufd_ioas *ioas; int rc; - if (cmd->flags || cmd->__reserved) + if ((cmd->flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) || cmd->__reserved) return -EOPNOTSUPP; idev = iommufd_get_device(ucmd, cmd->dev_id); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b4ba0c0cbab6..4a7c5c8fdbb4 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -347,10 +347,20 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a domain which can serve + * as the parent domain in the nesting + * configuration. 
+ */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for * @pt_id: The IOAS to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT From 408663619fcfc89c087df65b362c91bf0a0be617 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:27 -0700 Subject: [PATCH 07/52] iommufd/selftest: Add domain_alloc_user() support in iommu mock Add mock_domain_alloc_user() and a new test case for IOMMU_HWPT_ALLOC_NEST_PARENT. Link: https://lore.kernel.org/r/20230928071528.26258-6-yi.l.liu@intel.com Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 19 +++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 24 +++++++++++++++---- .../selftests/iommu/iommufd_fail_nth.c | 2 +- tools/testing/selftests/iommu/iommufd_utils.h | 11 ++++++--- 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 56506d5753f1..fe7e3c7d933a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -146,6 +146,8 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) return info; } +static const struct iommu_ops mock_ops; + static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) { struct mock_iommu_domain *mock; @@ -162,10 +164,26 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + mock->domain.ops = mock_ops.default_domain_ops; + mock->domain.type = iommu_domain_type; 
xa_init(&mock->pfns); return &mock->domain; } +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags) +{ + struct iommu_domain *domain; + + if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); + + domain = mock_domain_alloc(IOMMU_DOMAIN_UNMANAGED); + if (!domain) + domain = ERR_PTR(-ENOMEM); + return domain; +} + static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = @@ -307,6 +325,7 @@ static const struct iommu_ops mock_ops = { .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, .domain_alloc = mock_domain_alloc, + .domain_alloc_user = mock_domain_alloc_user, .capable = mock_domain_capable, .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, .device_group = generic_device_group, diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index f094a08a17ed..c5eca2fee42c 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -114,6 +114,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_destroy, IOMMU_DESTROY); TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO); + TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC); TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC); TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES); TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS); @@ -1404,13 +1405,28 @@ TEST_F(iommufd_mock_domain, alloc_hwpt) int i; for (i = 0; i != variant->mock_domains; i++) { + uint32_t hwpt_id[2]; uint32_t stddev_id; - uint32_t hwpt_id; - test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_err_hwpt_alloc(EOPNOTSUPP, + self->idev_ids[i], self->ioas_id, + ~IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[0]); + test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, + 0, &hwpt_id[0]); + test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[1]); + + /* Do a 
hw_pagetable rotation test */ + test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[0]); + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[0])); + test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[1]); + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[1])); + test_cmd_mock_domain_replace(self->stdev_ids[i], self->ioas_id); + test_ioctl_destroy(hwpt_id[1]); + + test_cmd_mock_domain(hwpt_id[0], &stddev_id, NULL, NULL); test_ioctl_destroy(stddev_id); - test_ioctl_destroy(hwpt_id); + test_ioctl_destroy(hwpt_id[0]); } } diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 36e7aa4f615c..31386be42439 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -615,7 +615,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info))) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id)) + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index e0753d03ecaa..be4970a84977 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -103,10 +103,11 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, pt_id, NULL)) static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, - __u32 *hwpt_id) + __u32 flags, __u32 *hwpt_id) { struct iommu_hwpt_alloc cmd = { .size = sizeof(cmd), + .flags = flags, .dev_id = device_id, .pt_id = pt_id, }; @@ -120,8 +121,12 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, return 0; } -#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 
hwpt_id)) +#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, \ + pt_id, flags, hwpt_id)) +#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc(self->fd, device_id, \ + pt_id, flags, hwpt_id)) static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) From c97d1b20d3835178bcd0e3a86c20ce4e36b6d80c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:28 -0700 Subject: [PATCH 08/52] iommu/vt-d: Add domain_alloc_user op Add the domain_alloc_user() op implementation. It supports allocating domains to be used as parent under nested translation. Unlike other drivers VT-D uses only a single page table format so it only needs to check if the HW can support nesting. Link: https://lore.kernel.org/r/20230928071528.26258-7-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5db283c17e0d..017aed5813d8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4074,6 +4074,33 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } +static struct iommu_domain * +intel_iommu_domain_alloc_user(struct device *dev, u32 flags) +{ + struct iommu_domain *domain; + struct intel_iommu *iommu; + + if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); + + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return ERR_PTR(-ENODEV); + + if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * domain_alloc_user op needs to fully initialize a domain + * before return, so uses iommu_domain_alloc() here for + * simple. 
+ */ + domain = iommu_domain_alloc(dev->bus); + if (!domain) + domain = ERR_PTR(-ENOMEM); + return domain; +} + static void intel_iommu_domain_free(struct iommu_domain *domain) { if (domain != &si_domain->domain && domain != &blocking_domain) @@ -4807,6 +4834,7 @@ const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, + .domain_alloc_user = intel_iommu_domain_alloc_user, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, From 266dcae34d8f44c3bbab00e227f8b14517682bb7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sun, 15 Oct 2023 00:46:48 -0700 Subject: [PATCH 09/52] iommufd/selftest: Rework TEST_LENGTH to test min_size explicitly TEST_LENGTH passing ".size = sizeof(struct _struct) - 1" expects -EINVAL from "if (ucmd.user_size < op->min_size)" check in iommufd_fops_ioctl(). This has been working when min_size is exactly the size of the structure. However, if the size of the structure becomes larger than min_size, i.e. the passing size above is larger than min_size, that min_size sanity no longer works. Since the first test in TEST_LENGTH() was to test that min_size sanity routine, rework it to support a min_size calculation, rather than using the full size of the structure. 
Link: https://lore.kernel.org/r/20231015074648.24185-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 29 ++++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index c5eca2fee42c..6323153d277b 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail) TEST_F(iommufd, cmd_length) { -#define TEST_LENGTH(_struct, _ioctl) \ +#define TEST_LENGTH(_struct, _ioctl, _last) \ { \ + size_t min_size = offsetofend(struct _struct, _last); \ struct { \ struct _struct cmd; \ uint8_t extra; \ - } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \ + } cmd = { .cmd = { .size = min_size - 1 }, \ .extra = UINT8_MAX }; \ int old_errno; \ int rc; \ @@ -112,17 +113,19 @@ TEST_F(iommufd, cmd_length) } \ } - TEST_LENGTH(iommu_destroy, IOMMU_DESTROY); - TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO); - TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC); - TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC); - TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES); - TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS); - TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP); - TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY); - TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP); - TEST_LENGTH(iommu_option, IOMMU_OPTION); - TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS); + TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id); + TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved); + TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved); + TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id); + TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES, + out_iova_alignment); + TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS, + allowed_iovas); + TEST_LENGTH(iommu_ioas_map, 
IOMMU_IOAS_MAP, iova); + TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY, src_iova); + TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length); + TEST_LENGTH(iommu_option, IOMMU_OPTION, val64); + TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved); #undef TEST_LENGTH } From b5f9e63278d6f32789478acf1ed41d21d92b36cf Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 17 Oct 2023 11:15:52 -0700 Subject: [PATCH 10/52] iommufd: Correct IOMMU_HWPT_ALLOC_NEST_PARENT description The IOMMU_HWPT_ALLOC_NEST_PARENT flag is used to allocate a HWPT. Though a HWPT holds a domain in the core structure, it is still quite confusing to describe it using "domain" in the uAPI kdoc. Correct it to "HWPT". Fixes: 4ff542163397 ("iommufd: Support allocating nested parent domain") Link: https://lore.kernel.org/r/20231017181552.12667-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4a7c5c8fdbb4..be7a95042677 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -349,9 +349,8 @@ struct iommu_vfio_ioas { /** * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation - * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a domain which can serve - * as the parent domain in the nesting - * configuration. + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. 
*/ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, From 53f0b020218fcc0a56a11df39630dbd379e4d9a6 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:52 +0100 Subject: [PATCH 11/52] vfio/iova_bitmap: Export more API symbols In preparation for moving iova_bitmap into iommufd, export the rest of the API symbols that could be used by modules, namely: iova_bitmap_alloc iova_bitmap_free iova_bitmap_for_each Link: https://lore.kernel.org/r/20231024135109.73787-2-joao.m.martins@oracle.com Suggested-by: Alex Williamson Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Signed-off-by: Jason Gunthorpe --- drivers/vfio/iova_bitmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 0848f920efb7..f54b56388e00 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -268,6 +268,7 @@ err: iova_bitmap_free(bitmap); return ERR_PTR(rc); } +EXPORT_SYMBOL_GPL(iova_bitmap_alloc); /** * iova_bitmap_free() - Frees an IOVA bitmap object @@ -289,6 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) kfree(bitmap); } +EXPORT_SYMBOL_GPL(iova_bitmap_free); /* * Returns the remaining bitmap indexes from mapped_total_index to process for @@ -387,6 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, return ret; } +EXPORT_SYMBOL_GPL(iova_bitmap_for_each); /** * iova_bitmap_set() - Records an IOVA range in bitmap From 8c9c727b6142325ed5697240fceb99cbeb4ac2ec Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:53 +0100 Subject: [PATCH 12/52] vfio: Move iova_bitmap into iommufd Both VFIO and IOMMUFD will need iova bitmap for storing dirties and walking the user bitmaps, so move it into the common dependency, IOMMUFD. In doing so, create the symbol IOMMUFD_DRIVER which designates the builtin code that will be used by drivers when selected. 
Today this means MLX5_VFIO_PCI and PDS_VFIO_PCI. IOMMU drivers will do the same (in future patches) when supporting dirty tracking and select IOMMUFD_DRIVER accordingly. Given that the symbol may be disabled, add header definitions in iova_bitmap.h for when IOMMUFD_DRIVER=n. Link: https://lore.kernel.org/r/20231024135109.73787-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Brett Creeley Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Signed-off-by: Jason Gunthorpe --- drivers/iommu/Kconfig | 4 +++ drivers/iommu/iommufd/Makefile | 1 + drivers/{vfio => iommu/iommufd}/iova_bitmap.c | 0 drivers/vfio/Makefile | 3 +-- drivers/vfio/pci/mlx5/Kconfig | 1 + drivers/vfio/pci/pds/Kconfig | 1 + include/linux/iova_bitmap.h | 26 +++++++++++++++++++ 7 files changed, 34 insertions(+), 2 deletions(-) rename drivers/{vfio => iommu/iommufd}/iova_bitmap.c (100%) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2b12b583ef4b..5cc869db1b79 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -7,6 +7,10 @@ config IOMMU_IOVA config IOMMU_API bool +config IOMMUFD_DRIVER + bool + default n + menuconfig IOMMU_SUPPORT bool "IOMMU Hardware Support" depends on MMU diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 8aeba81800c5..34b446146961 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -11,3 +11,4 @@ iommufd-y := \ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o +obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o diff --git a/drivers/vfio/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c similarity index 100% rename from drivers/vfio/iova_bitmap.c rename to drivers/iommu/iommufd/iova_bitmap.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index c82ea032d352..68c05705200f 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VFIO) 
+= vfio.o -vfio-y += vfio_main.o \ - iova_bitmap.o +vfio-y += vfio_main.o vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o vfio-$(CONFIG_VFIO_GROUP) += group.o vfio-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig index 7088edc4fb28..c3ced56b7787 100644 --- a/drivers/vfio/pci/mlx5/Kconfig +++ b/drivers/vfio/pci/mlx5/Kconfig @@ -3,6 +3,7 @@ config MLX5_VFIO_PCI tristate "VFIO support for MLX5 PCI devices" depends on MLX5_CORE select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides migration support for MLX5 devices using the VFIO framework. diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig index 407b3fd32733..fff368a8183b 100644 --- a/drivers/vfio/pci/pds/Kconfig +++ b/drivers/vfio/pci/pds/Kconfig @@ -5,6 +5,7 @@ config PDS_VFIO_PCI tristate "VFIO support for PDS PCI devices" depends on PDS_CORE select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides generic PCI support for PDS devices using the VFIO framework. 
diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h index c006cf0a25f3..1c338f5e5b7a 100644 --- a/include/linux/iova_bitmap.h +++ b/include/linux/iova_bitmap.h @@ -7,6 +7,7 @@ #define _IOVA_BITMAP_H_ #include +#include struct iova_bitmap; @@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, unsigned long iova, size_t length, void *opaque); +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, unsigned long page_size, u64 __user *data); @@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn); void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length); +#else +static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, + size_t length, + unsigned long page_size, + u64 __user *data) +{ + return NULL; +} + +static inline void iova_bitmap_free(struct iova_bitmap *bitmap) +{ +} + +static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + return -EOPNOTSUPP; +} + +static inline void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ +} +#endif #endif From 13578d4ebe8be1c16146f37c0c91f2579611cff2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:54 +0100 Subject: [PATCH 13/52] iommufd/iova_bitmap: Move symbols to IOMMUFD namespace Have the IOVA bitmap exported symbols adhere to the IOMMUFD symbol export convention i.e. using the IOMMUFD namespace. In doing so, import the namespace in the current users. This means VFIO and the vfio-pci drivers that use iova_bitmap_set(). 
Link: https://lore.kernel.org/r/20231024135109.73787-4-joao.m.martins@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Brett Creeley Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 8 ++++---- drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/pds/pci_drv.c | 1 + drivers/vfio/vfio_main.c | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index f54b56388e00..0a92c9eeaf7f 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -268,7 +268,7 @@ err: iova_bitmap_free(bitmap); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(iova_bitmap_alloc); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD); /** * iova_bitmap_free() - Frees an IOVA bitmap object @@ -290,7 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) kfree(bitmap); } -EXPORT_SYMBOL_GPL(iova_bitmap_free); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD); /* * Returns the remaining bitmap indexes from mapped_total_index to process for @@ -389,7 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, return ret; } -EXPORT_SYMBOL_GPL(iova_bitmap_for_each); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); /** * iova_bitmap_set() - Records an IOVA range in bitmap @@ -423,4 +423,4 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, cur_bit += nbits; } while (cur_bit <= last_bit); } -EXPORT_SYMBOL_GPL(iova_bitmap_set); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 42ec574a8622..5cf2b491d15a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1376,6 +1376,7 @@ static struct pci_driver mlx5vf_pci_driver = { module_pci_driver(mlx5vf_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy
"); MODULE_AUTHOR("Yishai Hadas "); diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index ab4b5958e413..dd8c00c895a2 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = { module_pci_driver(pds_vfio_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); MODULE_AUTHOR("Brett Creeley "); MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 40732e8ed4c6..a96d97da367d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1693,6 +1693,7 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); +MODULE_IMPORT_NS(IOMMUFD); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); From 750e2e902b7180cb82d2f9b1e372e32087bb8b1b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:55 +0100 Subject: [PATCH 14/52] iommu: Add iommu_domain ops for dirty tracking Add to iommu domain operations a set of callbacks to perform dirty tracking, particularly to start and stop tracking and to read and clear the dirty data. Drivers are generally expected to dynamically change their translation structures to toggle the tracking and flush some form of control state structure that stands in the IOVA translation path. Though it's not mandatory, as drivers can also enable dirty tracking at boot, and just clear the dirty bits before setting dirty tracking. For each of the newly added IOMMU core APIs: iommu_cap::IOMMU_CAP_DIRTY_TRACKING: new device iommu_capable value when probing for capabilities of the device. .set_dirty_tracking(): an iommu driver is expected to change its translation structures and enable dirty tracking for the devices in the iommu_domain. For drivers making dirty tracking always-enabled, it should just return 0.
.read_and_clear_dirty(): an iommu driver is expected to walk the pagetables for the iova range passed in and use iommu_dirty_bitmap_record() to record dirty info per IOVA. When detecting that a given IOVA is dirty it should also clear its dirty state from the PTE, *unless* the flag IOMMU_DIRTY_NO_CLEAR is passed in -- flushing is steered from the caller of the domain_op via iotlb_gather. The iommu core APIs use the same data structure in use for dirty tracking for VFIO device dirty (struct iova_bitmap) abstracted by iommu_dirty_bitmap_record() helper function. domain::dirty_ops: IOMMU domains will store the dirty ops depending on whether the iommu device supports dirty tracking or not. iommu drivers can then use this field to figure out if the dirty tracking is supported+enforced on attach. The enforcement is enabled via domain_alloc_user() which is done via IOMMUFD hwpt flag introduced later. Link: https://lore.kernel.org/r/20231024135109.73787-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- include/linux/io-pgtable.h | 4 +++ include/linux/iommu.h | 70 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 1b7a44b35616..25142a0e2fc2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -166,6 +166,10 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); }; /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3861d66b65c1..1d42bdb37cbc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define IOMMU_READ (1 << 0) @@
-37,6 +38,7 @@ struct bus_type; struct device; struct iommu_domain; struct iommu_domain_ops; +struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_fault_event; @@ -95,6 +97,8 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; + const struct iommu_dirty_ops *dirty_ops; + unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; @@ -133,6 +137,7 @@ enum iommu_cap { * usefully support the non-strict DMA flush queue. */ IOMMU_CAP_DEFERRED_FLUSH, + IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ }; /* These are the possible reserved region types */ @@ -227,6 +232,35 @@ struct iommu_iotlb_gather { bool queued; }; +/** + * struct iommu_dirty_bitmap - Dirty IOVA bitmap state + * @bitmap: IOVA bitmap + * @gather: Range information for a pending IOTLB flush + */ +struct iommu_dirty_bitmap { + struct iova_bitmap *bitmap; + struct iommu_iotlb_gather *gather; +}; + +/* Read but do not clear any dirty bits */ +#define IOMMU_DIRTY_NO_CLEAR (1 << 0) + +/** + * struct iommu_dirty_ops - domain specific dirty tracking operations + * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain + * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled + * into a bitmap, with a bit represented as a page. + * Reads the dirty PTE bits and clears it from IO + * pagetables. 
+ */ +struct iommu_dirty_ops { + int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled); + int (*read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); +}; + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability @@ -641,6 +675,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return gather && gather->queued; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ + if (gather) + iommu_iotlb_gather_init(gather); + + dirty->bitmap = bitmap; + dirty->gather = gather; +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ + if (dirty->bitmap) + iova_bitmap_set(dirty->bitmap, iova, length); + + if (dirty->gather) + iommu_iotlb_gather_add_range(dirty->gather, iova, length); +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -746,6 +802,8 @@ struct iommu_fwspec {}; struct iommu_device {}; struct iommu_fault_param {}; struct iommu_iotlb_gather {}; +struct iommu_dirty_bitmap {}; +struct iommu_dirty_ops {}; static inline bool iommu_present(const struct bus_type *bus) { @@ -978,6 +1036,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return false; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } From 5f9bdbf4c65860cc8b9c544d92bfd76fbea8d9c5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 
2023 14:50:56 +0100 Subject: [PATCH 15/52] iommufd: Add a flag to enforce dirty tracking on attach Throughout the lifetime of an IOMMU domain that wants to use dirty tracking, some guarantees are needed such that any device attached to the iommu_domain supports dirty tracking. The idea is to handle a case where the IOMMUs in the system are asymmetric feature-wise and thus the capability may not be supported for all devices. The enforcement is done by adding a flag into HWPT_ALLOC namely: IOMMU_HWPT_ALLOC_DIRTY_TRACKING .. passed in HWPT_ALLOC ioctl() flags. The enforcement is done by creating an iommu_domain via domain_alloc_user() and validating the requested flags against what the device IOMMU advertises as supported (and failing accordingly). Advertising the new IOMMU domain feature flag requires that the individual iommu driver capability is supported when a future device attachment happens. Link: https://lore.kernel.org/r/20231024135109.73787-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 4 +++- include/uapi/linux/iommufd.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8b3d2875d642..dd50ca9e2c09 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -157,7 +157,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) struct iommufd_ioas *ioas; int rc; - if ((cmd->flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) || cmd->__reserved) + if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING)) || + cmd->__reserved) return -EOPNOTSUPP; idev = iommufd_get_device(ucmd, cmd->dev_id); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index be7a95042677..c76248410120 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -351,9 +351,12 @@ struct
iommu_vfio_ioas { * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; /** From e2a4b294784957fc28ecb1fed8a7e69da18eb18d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:57 +0100 Subject: [PATCH 16/52] iommufd: Add IOMMU_HWPT_SET_DIRTY_TRACKING Every IOMMU driver should be able to implement the needed iommu domain ops to control dirty tracking. Connect a hw_pagetable to the IOMMU core dirty tracking ops, specifically the ability to enable/disable dirty tracking on an IOMMU domain (hw_pagetable id). To that end add an io_pagetable kernel API to toggle dirty tracking: * iopt_set_dirty_tracking(iopt, [domain], state) The intended caller of this is via the hw_pagetable object that is created. Internally it will ensure the leftover dirty state is cleared /right before/ dirty tracking starts. This is also useful for iommu drivers which may decide that dirty tracking is always-enabled at boot without wanting to toggle dynamically via corresponding iommu domain op. 
Link: https://lore.kernel.org/r/20231024135109.73787-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 24 +++++++++++ drivers/iommu/iommufd/io_pagetable.c | 54 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 12 ++++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 28 +++++++++++++ 5 files changed, 121 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index dd50ca9e2c09..c3b7bd9bfcbb 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -196,3 +196,27 @@ out_put_idev: iommufd_put_object(&idev->obj); return rc; } + +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + bool enable; + + if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) + return rc; + + hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + ioas = hwpt->ioas; + enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; + + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt->domain, enable); + + iommufd_put_object(&hwpt->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 3a598182b761..41c2efb6ff15 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -412,6 +412,60 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +static int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iopt_area *area; + int ret = 0; + + 
lockdep_assert_held_read(&iopt->iova_rwsem); + + iommu_dirty_bitmap_init(&dirty, NULL, &gather); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (!area->pages) + continue; + + ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), + iopt_area_length(area), 0, + &dirty); + if (ret) + break; + } + + iommu_iotlb_sync(domain, &gather); + return ret; +} + +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + int ret = 0; + + if (!ops) + return -EOPNOTSUPP; + + down_read(&iopt->iova_rwsem); + + /* Clear dirty bits from PTEs to ensure a clean snapshot */ + if (enable) { + ret = iopt_clear_dirty_data(iopt, domain); + if (ret) + goto out_unlock; + } + + ret = ops->set_dirty_tracking(domain, enable); + +out_unlock: + up_read(&iopt->iova_rwsem); + return ret; +} + int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 3064997a0181..b09750848da6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,7 @@ #include #include #include +#include struct iommu_domain; struct iommu_group; @@ -70,6 +71,9 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable); + void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, @@ -240,6 +244,14 @@ struct iommufd_hw_pagetable { struct list_head hwpt_item; }; +static inline struct iommufd_hw_pagetable * 
+iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HW_PAGETABLE), + struct iommufd_hw_pagetable, obj); +} +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index e71523cbd0de..46fedd779714 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -342,6 +343,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, + struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c76248410120..5c82b68c88f3 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -47,6 +47,7 @@ enum { IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, }; /** @@ -453,4 +454,31 @@ struct iommu_hw_info { __u32 __reserved; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + 
IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) #endif From b9a60d6f850e4470017b60f731220a58cda199aa Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:58 +0100 Subject: [PATCH 17/52] iommufd: Add IOMMU_HWPT_GET_DIRTY_BITMAP Connect a hw_pagetable to the IOMMU core dirty tracking read_and_clear_dirty iommu domain op. It exposes all of the functionality for the UAPI that reads the dirtied IOVAs while clearing the Dirty bits from the PTEs. In doing so, add an IO pagetable API iopt_read_and_clear_dirty_data() that performs the reading of dirty IOPTEs for a given IOVA range and then copies them back to the userspace bitmap. Underneath it uses the IOMMU domain kernel API which will read the dirty bits, as well as atomically clearing the IOPTE dirty bit and flushing the IOTLB at the end. The IOVA bitmaps usage takes care of the iteration of the bitmaps user pages efficiently and without copies. Within the iterator function we iterate over io-pagetable contiguous areas that have been mapped. Contrary to the past incarnation of a similar interface in VFIO the IOVA range to be scanned is tied in to the bitmap size, thus the application needs to pass an appropriately sized bitmap address taking into account the iova range being passed *and* page size ... as opposed to allowing bitmap-iova != iova.
Link: https://lore.kernel.org/r/20231024135109.73787-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 22 +++++ drivers/iommu/iommufd/io_pagetable.c | 113 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 10 +++ drivers/iommu/iommufd/main.c | 4 + include/uapi/linux/iommufd.h | 35 ++++++++ 5 files changed, 184 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index c3b7bd9bfcbb..7316f69110ef 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -220,3 +220,25 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) iommufd_put_object(&hwpt->obj); return rc; } + +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + + if ((cmd->flags || cmd->__reserved)) + return -EOPNOTSUPP; + + hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + ioas = hwpt->ioas; + rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain, + cmd->flags, cmd); + + iommufd_put_object(&hwpt->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 41c2efb6ff15..255264e796fb 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "io_pagetable.h" #include "double_span.h" @@ -412,6 +413,118 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +struct iova_bitmap_fn_arg { + struct io_pagetable *iopt; + struct iommu_domain *domain; + struct iommu_dirty_bitmap *dirty; +}; + +static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, + unsigned long 
iova, size_t length, + void *opaque) +{ + struct iopt_area *area; + struct iopt_area_contig_iter iter; + struct iova_bitmap_fn_arg *arg = opaque; + struct iommu_domain *domain = arg->domain; + struct iommu_dirty_bitmap *dirty = arg->dirty; + const struct iommu_dirty_ops *ops = domain->dirty_ops; + unsigned long last_iova = iova + length - 1; + int ret; + + iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + ret = ops->read_and_clear_dirty(domain, iter.cur_iova, + last - iter.cur_iova + 1, 0, + dirty); + if (ret) + return ret; + } + + if (!iopt_area_contig_done(&iter)) + return -EINVAL; + return 0; +} + +static int +iommu_read_and_clear_dirty(struct iommu_domain *domain, + struct io_pagetable *iopt, unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iova_bitmap_fn_arg arg; + struct iova_bitmap *iter; + int ret = 0; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, + bitmap->page_size, + u64_to_user_ptr(bitmap->data)); + if (IS_ERR(iter)) + return -ENOMEM; + + iommu_dirty_bitmap_init(&dirty, iter, &gather); + + arg.iopt = iopt; + arg.domain = domain; + arg.dirty = &dirty; + iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); + + iommu_iotlb_sync(domain, &gather); + iova_bitmap_free(iter); + + return ret; +} + +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + size_t iommu_pgsize = iopt->iova_alignment; + u64 last_iova; + + if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) + return -EOVERFLOW; + + if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) + return -EOVERFLOW; + + if ((bitmap->iova & (iommu_pgsize - 1)) || + ((last_iova + 1) & (iommu_pgsize - 1))) + return 
-EINVAL; + + if (!bitmap->page_size) + return -EINVAL; + + if ((bitmap->iova & (bitmap->page_size - 1)) || + ((last_iova + 1) & (bitmap->page_size - 1))) + return -EINVAL; + + return 0; +} + +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + int ret; + + ret = iommufd_check_iova_range(iopt, bitmap); + if (ret) + return ret; + + down_read(&iopt->iova_rwsem); + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); + up_read(&iopt->iova_rwsem); + + return ret; +} + static int iopt_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index b09750848da6..034129130db3 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include struct iommu_domain; @@ -71,6 +73,10 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap); int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable); @@ -226,6 +232,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. 
This user object @@ -252,6 +260,8 @@ iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_hw_pagetable, obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); + struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 46fedd779714..d50f42a730aa 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -343,6 +344,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, + struct iommu_hwpt_get_dirty_bitmap, data), IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, @@ -555,5 +558,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS(IOMMUFD_INTERNAL); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 5c82b68c88f3..dce38e32ca84 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -48,6 +48,7 @@ enum { IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, }; /** @@ -481,4 +482,38 @@ struct 
iommu_hwpt_set_dirty_tracking { }; #define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Must be zero + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. + * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + #endif From 7623683857e52b75184d37862c70f1230aef2edd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:59 +0100 Subject: [PATCH 18/52] iommufd: Add capabilities to IOMMU_GET_HW_INFO Extend IOMMUFD_CMD_GET_HW_INFO op to query generic iommu capabilities for a given device. Capabilities are IOMMU agnostic and use device_iommu_capable() API passing one of the IOMMU_CAP_*. Enumerate IOMMU_CAP_DIRTY_TRACKING for now in the out_capabilities field returned back to userspace. 
Link: https://lore.kernel.org/r/20231024135109.73787-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 4 ++++ include/uapi/linux/iommufd.h | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index e88fa73a45e6..2a41fd2b6ef8 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1185,6 +1185,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ cmd->data_len = data_len; + cmd->out_capabilities = 0; + if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dce38e32ca84..036ebc6c19cf 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -418,6 +418,20 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD, }; +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -429,6 +443,8 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. 
* @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -453,6 +469,7 @@ struct iommu_hw_info { __aligned_u64 data_uptr; __u32 out_data_type; __u32 __reserved; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) From 609848132c71316df3260d1ec066539c21bba585 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:00 +0100 Subject: [PATCH 19/52] iommufd: Add a flag to skip clearing of IOPTE dirty VFIO has an operation where it unmaps an IOVA while returning a bitmap with the dirty data. In reality the operation doesn't quite query the IO pagetables for whether the PTE was dirty or not. Instead it marks as dirty anything that was mapped, doing so in one syscall. In IOMMUFD the equivalent is done in two operations by querying with GET_DIRTY_IOVA followed by UNMAP_IOVA. However, this would incur two TLB flushes given that after clearing dirty bits IOMMU implementations require invalidating their IOTLB, plus another invalidation needed for the UNMAP. To allow dirty bits to be queried faster, add a flag (IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR) that requests to not clear the dirty bits from the PTE (but just reading them), under the expectation that the next operation is the unmap. An alternative is to unmap and just perpetually mark as dirty as that's the same behaviour as today. So here equivalent functionality can be provided with unmap alone, and if real dirty info is required it will amortize the cost while querying. There's still a race against DMA where in theory the unmap of the IOVA (when the guest invalidates the IOTLB via emulated iommu) would race against the VF performing DMA on the same IOVA. As discussed in [0], we are accepting to resolve this race as throwing away the DMA and it doesn't matter if it hit physical DRAM or not, the VM can't tell if we threw it away because the DMA was blocked or because we failed to copy the DRAM.
[0] https://lore.kernel.org/linux-iommu/20220502185239.GR8364@nvidia.com/ Link: https://lore.kernel.org/r/20231024135109.73787-10-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- drivers/iommu/iommufd/io_pagetable.c | 9 +++++++-- include/uapi/linux/iommufd.h | 15 ++++++++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 7316f69110ef..72a5269984b0 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -228,7 +228,8 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; - if ((cmd->flags || cmd->__reserved)) + if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || + cmd->__reserved) return -EOPNOTSUPP; hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 255264e796fb..9f060abe53b6 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -414,6 +414,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, } struct iova_bitmap_fn_arg { + unsigned long flags; struct io_pagetable *iopt; struct iommu_domain *domain; struct iommu_dirty_bitmap *dirty; @@ -430,13 +431,14 @@ static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, struct iommu_dirty_bitmap *dirty = arg->dirty; const struct iommu_dirty_ops *ops = domain->dirty_ops; unsigned long last_iova = iova + length - 1; + unsigned long flags = arg->flags; int ret; iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); ret = ops->read_and_clear_dirty(domain, iter.cur_iova, - last - iter.cur_iova + 1, 0, + last - iter.cur_iova + 1, flags, dirty); 
if (ret) return ret; @@ -470,12 +472,15 @@ iommu_read_and_clear_dirty(struct iommu_domain *domain, iommu_dirty_bitmap_init(&dirty, iter, &gather); + arg.flags = flags; arg.iopt = iopt; arg.domain = domain; arg.dirty = &dirty; iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); - iommu_iotlb_sync(domain, &gather); + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) + iommu_iotlb_sync(domain, &gather); + iova_bitmap_free(iter); return ret; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 036ebc6c19cf..c44eecf5d318 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -500,11 +500,24 @@ struct iommu_hwpt_set_dirty_tracking { #define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + /** * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) * @hwpt_id: HW pagetable ID that represents the IOMMU domain - * @flags: Must be zero + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags * @__reserved: Must be 0 * @iova: base IOVA of the bitmap first bit * @length: IOVA range size From 134288158a415cd863b1c32c7dcddc0a1dc32aab Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:01 +0100 Subject: [PATCH 20/52] iommu/amd: Add domain_alloc_user based domain allocation Add the domain_alloc_user op implementation. 
To that end, refactor amd_iommu_domain_alloc() to receive a dev pointer and flags, while renaming it too, such that it becomes a common function shared with domain_alloc_user() implementation. The sole difference with domain_alloc_user() is that we initialize also other fields that iommu_domain_alloc() does. It lets it return the iommu domain correctly initialized in one function. This is in preparation to add dirty enforcement on AMD implementation of domain_alloc_user. Link: https://lore.kernel.org/r/20231024135109.73787-11-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Suravee Suthikulpanit Signed-off-by: Jason Gunthorpe --- drivers/iommu/amd/iommu.c | 44 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 95bd7c25ba6f..667e23b0ab0d 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "amd_iommu.h" #include "../dma-iommu.h" @@ -2155,28 +2156,64 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) +static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, + struct device *dev, u32 flags) { struct protection_domain *domain; + struct amd_iommu *iommu = NULL; + + if (dev) { + iommu = rlookup_amd_iommu(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + } /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. 
*/ if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return NULL; + return ERR_PTR(-EINVAL); domain = protection_domain_alloc(type); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + if (iommu) { + domain->domain.type = type; + domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; + } + return &domain->domain; } +static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) +{ + struct iommu_domain *domain; + + domain = do_iommu_domain_alloc(type, NULL, 0); + if (IS_ERR(domain)) + return NULL; + + return domain; +} + +static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, + u32 flags) +{ + unsigned int type = IOMMU_DOMAIN_UNMANAGED; + + if (flags) + return ERR_PTR(-EOPNOTSUPP); + + return do_iommu_domain_alloc(type, dev, flags); +} + static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2464,6 +2501,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, + .domain_alloc_user = amd_iommu_domain_alloc_user, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, From 421a511a293fe1c73b37f6147c6676c4ee6efa04 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:02 +0100 Subject: [PATCH 21/52] iommu/amd: Access/Dirty bit support in IOPTEs IOMMU advertises Access/Dirty bits if the extended feature register reports it. Relevant AMD IOMMU SDM ref[0] "1.3.8 Enhanced Support for Access and Dirty Bits" To enable it set the DTE flag in bits 7 and 8 to enable access, or access+dirty. 
With that, the IOMMU starts marking the D and A flags on every Memory Request or ATS translation request. It is on the VMM side to steer whether to enable dirty tracking or not, rather than wrongly doing it in the IOMMU. Relevant AMD IOMMU SDM ref [0], "Table 7. Device Table Entry (DTE) Field Definitions" particularly the entry "HAD". Actually toggling it on and off is relatively simple: set the 2 bits in the DTE and flush the device DTE cache. To get what's dirtied use existing AMD io-pgtable support, by walking the pagetables over each IOVA, with fetch_pte(). The IOTLB flushing is left to the caller (much like unmap), and iommu_dirty_bitmap_record() is the one adding page-ranges to invalidate. This allows the caller to batch the flush over a big span of IOVA space, without the iommu wondering about when to flush. Worthwhile sections from AMD IOMMU SDM: "2.2.3.1 Host Access Support" "2.2.3.2 Host Dirty Support" For details on how the IOMMU hardware updates the dirty bit, and what it expects from its subsequent clearing by the CPU, see: "2.2.7.4 Updating Accessed and Dirty Bits in the Guest Address Tables" "2.2.7.5 Clearing Accessed and Dirty Bits" Quoting the SDM: "The setting of accessed and dirty status bits in the page tables is visible to both the CPU and the peripheral when sharing guest page tables. The IOMMU interlocked operations to update A and D bits must be 64-bit operations and naturally aligned on a 64-bit boundary" .. and for the IOMMU update sequence of the Dirty bit, it essentially states: 1. Decodes the read and write intent from the memory access. 2. If P=0 in the page descriptor, fail the access. 3. Compare the A & D bits in the descriptor with the read and write intent in the request. 4. If the A or D bits need to be updated in the descriptor: * Start atomic operation. * Read the descriptor as a 64-bit access. * If the descriptor no longer appears to require an update, release the atomic lock with no further action and continue to step 5. * Calculate the new A & D bits.
* Write the descriptor as a 64-bit access. * End atomic operation. 5. Continue to the next stage of translation or to the memory access. Access/Dirty bits readout also needs to consider the non-default page-sizes (aka replicated PTEs as mentioned by the manual), as AMD supports all powers of two (except 512G) page sizes. Select IOMMUFD_DRIVER only if IOMMUFD is enabled considering that IOMMU dirty tracking requires IOMMUFD. Link: https://lore.kernel.org/r/20231024135109.73787-12-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Suravee Suthikulpanit Signed-off-by: Jason Gunthorpe --- drivers/iommu/amd/Kconfig | 1 + drivers/iommu/amd/amd_iommu_types.h | 12 ++++ drivers/iommu/amd/io_pgtable.c | 68 +++++++++++++++++++ drivers/iommu/amd/iommu.c | 102 +++++++++++++++++++++++++++- 4 files changed, 182 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 9b5fc3356bf2..8bd4c3b183ec 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -10,6 +10,7 @@ config AMD_IOMMU select IOMMU_API select IOMMU_IOVA select IOMMU_IO_PGTABLE + select IOMMUFD_DRIVER if IOMMUFD depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 7dc30c2b56b3..dec4e5c2b66b 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -97,7 +97,9 @@ #define FEATURE_GATS_MASK (3ULL) #define FEATURE_GAM_VAPIC BIT_ULL(21) #define FEATURE_GIOSUP BIT_ULL(48) +#define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) +#define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) #define FEATURE_PASID_SHIFT 32 @@ -212,6 +214,7 @@ /* macros and definitions for device table entries */ #define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_HAD 0x07 #define DEV_ENTRY_PPR 0x34 #define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e @@ -370,10 +373,16 @@ #define PTE_LEVEL_PAGE_SIZE(level) \ (1ULL << (12 + (9 * (level)))) +/* + * The IOPTE dirty bit + */ +#define IOMMU_PTE_HD_BIT (6) + /* * Bit value definition for I/O PTE fields */ #define IOMMU_PTE_PR BIT_ULL(0) +#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) #define IOMMU_PTE_U BIT_ULL(59) #define IOMMU_PTE_FC BIT_ULL(60) #define IOMMU_PTE_IR BIT_ULL(61) @@ -384,6 +393,7 @@ */ #define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_TV BIT_ULL(1) +#define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX_SHIFT (56) @@ -413,6 +423,7 @@ #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) +#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) @@ -563,6 +574,7 @@ struct protection_domain { int nid; /* Node ID */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ + bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 2892aa1b4dc1..6c0621f6f572 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo return (__pte & ~offset_mask) | (iova & offset_mask); } +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, + unsigned long flags) +{ + bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; + bool dirty = false; + int i, count; + + /* + * 2.2.3.2 Host Dirty Support + * When a non-default page size is used , software must OR the + * Dirty bits in all of the replicated host PTEs used to map + * the page. 
The IOMMU does not guarantee the Dirty bits are + * set in all of the replicated PTEs. Any portion of the page + * may have been written even if the Dirty bit is set in only + * one of the replicated PTEs. + */ + count = PAGE_SIZE_PTE_COUNT(size); + for (i = 0; i < count && test_only; i++) { + if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { + dirty = true; + break; + } + } + + for (i = 0; i < count && !test_only; i++) { + if (test_and_clear_bit(IOMMU_PTE_HD_BIT, + (unsigned long *)&ptep[i])) { + dirty = true; + } + } + + return dirty; +} + +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long end = iova + size - 1; + + do { + unsigned long pgsize = 0; + u64 *ptep, pte; + + ptep = fetch_pte(pgtable, iova, &pgsize); + if (ptep) + pte = READ_ONCE(*ptep); + if (!ptep || !IOMMU_PTE_PRESENT(pte)) { + pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); + iova += pgsize; + continue; + } + + /* + * Mark the whole IOVA range as dirty even if only one of + * the replicated PTEs were marked dirty. 
+ */ + if (pte_test_and_clear_dirty(ptep, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + /* * ---------------------------------------------------- */ @@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo pgtable->iop.ops.map_pages = iommu_v1_map_pages; pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; + pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; return &pgtable->iop; } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 667e23b0ab0d..caad10f9cee3 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -66,6 +66,7 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; +const struct iommu_dirty_ops amd_dirty_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; @@ -1611,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, pte_root |= 1ULL << DEV_ENTRY_PPR; } + if (domain->dirty_tracking) + pte_root |= DTE_FLAG_HAD; + if (domain->flags & PD_IOMMUV2_MASK) { u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); u64 glx = domain->glx; @@ -2156,9 +2160,15 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + return iommu && (iommu->features & FEATURE_HDSUP); +} + static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, struct device *dev, u32 flags) { + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; struct amd_iommu *iommu = NULL; @@ -2175,6 +2185,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) return ERR_PTR(-EINVAL); + if (dirty_tracking && !amd_iommu_hd_support(iommu)) + return 
ERR_PTR(-EOPNOTSUPP); + domain = protection_domain_alloc(type); if (!domain) return ERR_PTR(-ENOMEM); @@ -2187,6 +2200,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, domain->domain.type = type; domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; domain->domain.ops = iommu->iommu.ops->default_domain_ops; + + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; } return &domain->domain; @@ -2208,7 +2224,7 @@ static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, { unsigned int type = IOMMU_DOMAIN_UNMANAGED; - if (flags) + if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) return ERR_PTR(-EOPNOTSUPP); return do_iommu_domain_alloc(type, dev, flags); @@ -2251,6 +2267,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, dev_data->defer_attach = false; + /* + * Restrict to devices with compatible IOMMU hardware support + * when enforcement of dirty tracking is enabled. + */ + if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) + return -EINVAL; + if (dev_data->domain) detach_device(dev); @@ -2369,6 +2392,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: { + struct amd_iommu *iommu = rlookup_amd_iommu(dev); + + return amd_iommu_hd_support(iommu); + } default: break; } @@ -2376,6 +2404,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data; + bool domain_flush = false; + struct amd_iommu *iommu; + unsigned long flags; + u64 pte_root; + + spin_lock_irqsave(&pdomain->lock, flags); + if (!(pdomain->dirty_tracking ^ enable)) { + spin_unlock_irqrestore(&pdomain->lock, flags); + return 0; + } + + list_for_each_entry(dev_data, 
&pdomain->dev_list, list) { + iommu = rlookup_amd_iommu(dev_data->dev); + if (!iommu) + continue; + + dev_table = get_dev_table(iommu); + pte_root = dev_table[dev_data->devid].data[0]; + + pte_root = (enable ? pte_root | DTE_FLAG_HAD : + pte_root & ~DTE_FLAG_HAD); + + /* Flush device DTE */ + dev_table[dev_data->devid].data[0] = pte_root; + device_flush_dte(dev_data); + domain_flush = true; + } + + /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ + if (domain_flush) { + amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_complete(pdomain); + } + pdomain->dirty_tracking = enable; + spin_unlock_irqrestore(&pdomain->lock, flags); + + return 0; +} + +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + unsigned long lflags; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + spin_lock_irqsave(&pdomain->lock, lflags); + if (!pdomain->dirty_tracking && dirty->bitmap) { + spin_unlock_irqrestore(&pdomain->lock, lflags); + return -EINVAL; + } + spin_unlock_irqrestore(&pdomain->lock, lflags); + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2498,6 +2593,11 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } +const struct iommu_dirty_ops amd_dirty_ops = { + .set_dirty_tracking = amd_iommu_set_dirty_tracking, + .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, +}; + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, From f35f22cc760eb2c7034bf53251399685d611e03f Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:03 +0100 Subject: [PATCH 22/52] iommu/vt-d: Access/Dirty 
bit support for SS domains IOMMU advertises Access/Dirty bits for second-stage page table if the extended capability DMAR register reports it (ECAP, mnemonic ECAP.SSADS). The first stage table is compatible with CPU page table thus A/D bits are implicitly supported. Relevant Intel IOMMU SDM ref for first stage table "3.6.2 Accessed, Extended Accessed, and Dirty Flags" and second stage table "3.7.2 Accessed and Dirty Flags". First stage page table is enabled by default so it's allowed to set dirty tracking and no control bits are needed; it just returns 0. To use SSADS, set bit 9 (SSADE) in the scalable-mode PASID table entry and flush the IOTLB via pasid_flush_caches() following the manual. Relevant SDM refs: "3.7.2 Accessed and Dirty Flags" "6.5.3.3 Guidance to Software for Invalidations, Table 23. Guidance to Software for Invalidations" PTE dirty bit is located in bit 9 and it's cached in the IOTLB so flush IOTLB to make sure IOMMU attempts to set the dirty bit again. Note that iommu_dirty_bitmap_record() will add the IOVA to iotlb_gather and thus the caller of the iommu op will flush the IOTLB. The relevant part of the manual covering hardware translation is chapter 6, with special mention of: "6.2.3.1 Scalable-Mode PASID-Table Entry Programming Considerations" "6.2.4 IOTLB" Select IOMMUFD_DRIVER only if IOMMUFD is enabled, given that IOMMU dirty tracking requires IOMMUFD.
Link: https://lore.kernel.org/r/20231024135109.73787-13-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 103 +++++++++++++++++++++++++++++++++- drivers/iommu/intel/iommu.h | 16 ++++++ drivers/iommu/intel/pasid.c | 109 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 4 ++ 5 files changed, 232 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 2e56bd79f589..f5348b80652b 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -15,6 +15,7 @@ config INTEL_IOMMU select DMA_OPS select IOMMU_API select IOMMU_IOVA + select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE select SWIOTLB diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 017aed5813d8..eb92a201cc0b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -300,6 +300,7 @@ static int iommu_skip_te_disable; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; +const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -4079,8 +4080,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) { struct iommu_domain *domain; struct intel_iommu *iommu; + bool dirty_tracking; - if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); iommu = device_to_iommu(dev, NULL, NULL); @@ -4090,6 +4093,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); + dirty_tracking = (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING); + if (dirty_tracking && !ssads_supported(iommu)) + return ERR_PTR(-EOPNOTSUPP); + /* * domain_alloc_user op needs to fully 
initialize a domain * before return, so uses iommu_domain_alloc() here for @@ -4098,6 +4105,15 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) domain = iommu_domain_alloc(dev->bus); if (!domain) domain = ERR_PTR(-ENOMEM); + + if (!IS_ERR(domain) && dirty_tracking) { + if (to_dmar_domain(domain)->use_first_level) { + iommu_domain_free(domain); + return ERR_PTR(-EOPNOTSUPP); + } + domain->dirty_ops = &intel_dirty_ops; + } + return domain; } @@ -4121,6 +4137,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; + if (domain->dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -4375,6 +4394,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); + case IOMMU_CAP_DIRTY_TRACKING: + return ssads_supported(info->iommu); default: return false; } @@ -4772,6 +4793,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; + if (domain->dirty_ops) + return -EINVAL; + if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; @@ -4830,6 +4854,83 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) return vtd; } +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + int ret; + + spin_lock(&dmar_domain->lock); + if (dmar_domain->dirty_tracking == enable) + goto out_unlock; + + list_for_each_entry(info, &dmar_domain->devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, + info->domain, info->dev, + 
IOMMU_NO_PASID, enable); + if (ret) + goto err_unwind; + } + + dmar_domain->dirty_tracking = enable; +out_unlock: + spin_unlock(&dmar_domain->lock); + + return 0; + +err_unwind: + list_for_each_entry(info, &dmar_domain->devices, link) + intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, + info->dev, IOMMU_NO_PASID, + dmar_domain->dirty_tracking); + spin_unlock(&dmar_domain->lock); + return ret; +} + +static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long end = iova + size - 1; + unsigned long pgsize; + + /* + * IOMMUFD core calls into a dirty tracking disabled domain without an + * IOVA bitmap set in order to clean dirty bits in all PTEs that might + * have occurred when we stopped dirty tracking. This ensures that we + * never inherit dirtied bits from a previous cycle. + */ + if (!dmar_domain->dirty_tracking && dirty->bitmap) + return -EINVAL; + + do { + struct dma_pte *pte; + int lvl = 0; + + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, + GFP_ATOMIC); + pgsize = level_size(lvl) << VTD_PAGE_SHIFT; + if (!pte || !dma_pte_present(pte)) { + iova += pgsize; + continue; + } + + if (dma_sl_pte_test_and_clear_dirty(pte, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + +const struct iommu_dirty_ops intel_dirty_ops = { + .set_dirty_tracking = intel_iommu_set_dirty_tracking, + .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, +}; + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index c18fb699c87a..3bb569146229 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -48,6 +48,9 @@ #define DMA_FL_PTE_DIRTY BIT_ULL(6) #define 
DMA_FL_PTE_XD BIT_ULL(63) +#define DMA_SL_PTE_DIRTY_BIT 9 +#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) + #define ADDR_WIDTH_5LEVEL (57) #define ADDR_WIDTH_4LEVEL (48) @@ -539,6 +542,8 @@ enum { #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) +#define ssads_supported(iommu) (sm_supported(iommu) && \ + ecap_slads((iommu)->ecap)) struct pasid_entry; struct pasid_state_entry; @@ -592,6 +597,7 @@ struct dmar_domain { * otherwise, goes through the second * level. */ + u8 dirty_tracking:1; /* Dirty tracking is enabled */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ @@ -781,6 +787,16 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, + unsigned long flags) +{ + if (flags & IOMMU_DIRTY_NO_CLEAR) + return (pte->val & DMA_SL_PTE_DIRTY) != 0; + + return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, + (unsigned long *)&pte->val); +} + static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8f92b92f3d2a..b9264b9174e8 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) WRITE_ONCE(*ptr, (old & ~mask) | bits); } +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + /* * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode * PASID entry. @@ -335,6 +340,36 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe) pasid_set_bits(&pe->val[0], 1 << 1, 0); } +/* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. 
+ */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + /* * Setup the WPE(Write Protect Enable) field (Bit 132) of a * scalable mode PASID entry. @@ -627,6 +662,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); @@ -636,6 +673,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return 0; } +/* + * Set up dirty tracking on a second only or nested translation type. 
+ */ +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid, + bool enabled) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + did = domain_id_iommu(domain, iommu); + pgtt = pasid_pte_get_pgtt(pte); + if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && + pgtt != PASID_ENTRY_PGTT_NESTED) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, + "Dirty tracking not supported on translation type %d\n", + pgtt); + return -EOPNOTSUPP; + } + + if (pasid_get_ssade(pte) == enabled) { + spin_unlock(&iommu->lock); + return 0; + } + + if (enabled) + pasid_set_ssade(pte); + else + pasid_clear_ssade(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * From VT-d spec table 25 "Guidance to Software for Invalidations": + * + * - PASID-selective-within-Domain PASID-cache invalidation + * If (PGTT=SS or Nested) + * - Domain-selective IOTLB invalidation + * Else + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); + + return 0; +} + /* * Set up the scalable mode pasid entry for passthrough translation type. 
*/ diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 4e9e68c3c388..958050b093aa 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -106,6 +106,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid, + bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); From e04b23c8d4ed977dbab4a4159f9e4d9a878b5c65 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:04 +0100 Subject: [PATCH 23/52] iommufd/selftest: Expand mock_domain with dev_flags Expand the mock_domain test to be able to manipulate the device capabilities. This allows testing with a mock device that does not advertise dirty tracking support, and thus makes sure the enforce_dirty test behaves as expected. To avoid breaking the IOMMUFD_TEST UABI, replicate the mock_domain struct and add an input dev_flags at the end.
Link: https://lore.kernel.org/r/20231024135109.73787-14-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 12 ++++++++ drivers/iommu/iommufd/selftest.c | 11 +++++-- tools/testing/selftests/iommu/iommufd_utils.h | 29 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 3f3644375bf1..9817edcd8968 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -19,6 +19,7 @@ enum { IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, + IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, }; enum { @@ -40,6 +41,10 @@ enum { MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, }; +enum { + MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -56,6 +61,13 @@ struct iommu_test_cmd { /* out_idev_id is the standard iommufd_bind object */ __u32 out_idev_id; } mock_domain; + struct { + __u32 out_stdev_id; + __u32 out_hwpt_id; + __u32 out_idev_id; + /* Expand mock_domain to set mock device flags */ + __u32 dev_flags; + } mock_domain_flags; struct { __u32 pt_id; } mock_domain_replace; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index fe7e3c7d933a..bd3704b28bfb 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -96,6 +96,7 @@ enum selftest_obj_type { struct mock_dev { struct device dev; + unsigned long flags; }; struct selftest_obj { @@ -381,7 +382,7 @@ static void mock_dev_release(struct device *dev) kfree(mdev); } -static struct mock_dev *mock_dev_create(void) +static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; int rc; @@ -391,6 +392,7 @@ static struct mock_dev *mock_dev_create(void) return ERR_PTR(-ENOMEM); device_initialize(&mdev->dev); + mdev->flags = 
dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; @@ -426,6 +428,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, struct iommufd_device *idev; struct selftest_obj *sobj; u32 pt_id = cmd->id; + u32 dev_flags = 0; u32 idev_id; int rc; @@ -436,7 +439,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, sobj->idev.ictx = ucmd->ictx; sobj->type = TYPE_IDEV; - sobj->idev.mock_dev = mock_dev_create(); + if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS) + dev_flags = cmd->mock_domain_flags.dev_flags; + + sobj->idev.mock_dev = mock_dev_create(dev_flags); if (IS_ERR(sobj->idev.mock_dev)) { rc = PTR_ERR(sobj->idev.mock_dev); goto out_sobj; @@ -1019,6 +1025,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->add_reserved.start, cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: + case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS: return iommufd_test_mock_domain(ucmd, cmd); case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: return iommufd_test_mock_domain_replace( diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index be4970a84977..1e0736adc991 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -74,6 +74,35 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ stdev_id, hwpt_id, NULL)) +static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id, + __u32 stdev_flags, __u32 *stdev_id, + __u32 *hwpt_id, __u32 *idev_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, + .id = ioas_id, + .mock_domain_flags = { .dev_flags = stdev_flags }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + if (stdev_id) + *stdev_id = cmd.mock_domain_flags.out_stdev_id; + assert(cmd.id != 0); + if (hwpt_id) + *hwpt_id = 
cmd.mock_domain_flags.out_hwpt_id; + if (idev_id) + *idev_id = cmd.mock_domain_flags.out_idev_id; + return 0; +} +#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \ + stdev_id, hwpt_id, NULL)) + static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, __u32 *hwpt_id) { From 266ce58989ba05e2a24460fdbf402d766c2e3870 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:05 +0100 Subject: [PATCH 24/52] iommufd/selftest: Test IOMMU_HWPT_ALLOC_DIRTY_TRACKING In order to selftest the iommu domain dirty enforcing, implement the necessary mock_domain support and add a new dev_flags to test that hwpt_alloc/attach_device fails as expected. Expand the existing mock_domain fixture with an enforce_dirty test that exercises the hwpt_alloc and device attachment. Link: https://lore.kernel.org/r/20231024135109.73787-15-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 37 +++++++++++++- tools/testing/selftests/iommu/iommufd.c | 49 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 3 ++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index bd3704b28bfb..78362f2334f5 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -119,6 +119,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain) static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) + return -EINVAL; + return 0; } @@ -147,6 +152,25 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) return info; } +static int
mock_domain_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + return 0; +} + +static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + return 0; +} + +const struct iommu_dirty_ops dirty_ops = { + .set_dirty_tracking = mock_domain_set_dirty_tracking, + .read_and_clear_dirty = mock_domain_read_and_clear_dirty, +}; + static const struct iommu_ops mock_ops; static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) @@ -174,12 +198,20 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) static struct iommu_domain * mock_domain_alloc_user(struct device *dev, u32 flags) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct iommu_domain *domain; - if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return ERR_PTR(-EOPNOTSUPP); domain = mock_domain_alloc(IOMMU_DOMAIN_UNMANAGED); + if (domain && !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) + domain->dirty_ops = &dirty_ops; if (!domain) domain = ERR_PTR(-ENOMEM); return domain; @@ -387,6 +419,9 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) struct mock_dev *mdev; int rc; + if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + return ERR_PTR(-EINVAL); + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6323153d277b..6bebba183426 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1433,6 +1433,55 @@ TEST_F(iommufd_mock_domain, alloc_hwpt) } } +FIXTURE(iommufd_dirty_tracking) +{ + int fd; + uint32_t ioas_id; + uint32_t hwpt_id; + 
uint32_t stdev_id; + uint32_t idev_id; +}; + +FIXTURE_SETUP(iommufd_dirty_tracking) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + + test_ioctl_ioas_alloc(&self->ioas_id); + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); +} + +FIXTURE_TEARDOWN(iommufd_dirty_tracking) +{ + teardown_iommufd(self->fd, _metadata); +} + +TEST_F(iommufd_dirty_tracking, enforce_dirty) +{ + uint32_t ioas_id, stddev_id, idev_id; + uint32_t hwpt_id, _hwpt_id; + uint32_t dev_flags; + + /* Regular case */ + dev_flags = MOCK_FLAGS_DEVICE_NO_DIRTY; + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_err_mock_domain_flags(EINVAL, hwpt_id, dev_flags, &stddev_id, + NULL); + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); + + /* IOMMU device does not support dirty tracking */ + test_ioctl_ioas_alloc(&ioas_id); + test_cmd_mock_domain_flags(ioas_id, dev_flags, &stddev_id, &_hwpt_id, + &idev_id); + test_err_hwpt_alloc(EOPNOTSUPP, idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_ioctl_destroy(stddev_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 1e0736adc991..4ddafa29e638 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -98,6 +98,9 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id, *idev_id = cmd.mock_domain_flags.out_idev_id; return 0; } +#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \ + stdev_id, hwpt_id, idev_id)) #define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \ EXPECT_ERRNO(_errno, \ _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, 
\ From 7adf267d66d1d737ea8318976fd1ce93733fd3a4 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:06 +0100 Subject: [PATCH 25/52] iommufd/selftest: Test IOMMU_HWPT_SET_DIRTY_TRACKING Change mock_domain to support dirty tracking and add tests to exercise the new SET_DIRTY_TRACKING API in the iommufd_dirty_tracking selftest fixture. Link: https://lore.kernel.org/r/20231024135109.73787-16-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 16 ++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 15 +++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 17 +++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 78362f2334f5..2773275566af 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -24,6 +24,7 @@ static struct platform_device *selftest_iommu_dev; size_t iommufd_test_memory_limit = 65536; enum { + MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, /* @@ -86,6 +87,7 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + unsigned long flags; struct iommu_domain domain; struct xarray pfns; }; @@ -155,6 +157,20 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, bool enable) { + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = mock->flags; + + if (enable && !domain->dirty_ops) + return -EINVAL; + + /* No change? */ + if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK))) + return 0; + + flags = (enable ?
flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK); + + mock->flags = flags; return 0; } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6bebba183426..8c46012006e1 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1482,6 +1482,21 @@ TEST_F(iommufd_dirty_tracking, enforce_dirty) test_ioctl_destroy(stddev_id); } +TEST_F(iommufd_dirty_tracking, set_dirty_tracking) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_cmd_set_dirty_tracking(hwpt_id, true); + test_cmd_set_dirty_tracking(hwpt_id, false); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 4ddafa29e638..e37af6291b22 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -179,6 +179,23 @@ static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, #define test_cmd_access_replace_ioas(access_id, ioas_id) \ ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id)) +static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled) +{ + struct iommu_hwpt_set_dirty_tracking cmd = { + .size = sizeof(cmd), + .flags = enabled ? 
IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + .hwpt_id = hwpt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd); + if (ret) + return -errno; + return 0; +} +#define test_cmd_set_dirty_tracking(hwpt_id, enabled) \ + ASSERT_EQ(0, _test_cmd_set_dirty_tracking(self->fd, hwpt_id, enabled)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From a9af47e382a4d517685cb13c780272e7f300ebc5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:07 +0100 Subject: [PATCH 26/52] iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP Add a new test ioctl for simulating the dirty IOVAs in the mock domain, and implement the mock iommu domain ops that get the dirty tracking supported. The selftest exercises the usual main workflow of: 1) Setting dirty tracking from the iommu domain 2) Read and clear dirty IOPTEs Different fixtures will test different IOVA range sizes, that exercise corner cases of the bitmaps. Link: https://lore.kernel.org/r/20231024135109.73787-17-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 9 ++ drivers/iommu/iommufd/selftest.c | 107 ++++++++++++++- tools/testing/selftests/iommu/iommufd.c | 96 +++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 127 ++++++++++++++++++ 4 files changed, 334 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 9817edcd8968..1f2e93d3d4e8 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -20,6 +20,7 @@ enum { IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, + IOMMU_TEST_OP_DIRTY, }; enum { @@ -107,6 +108,14 @@ struct iommu_test_cmd { struct { __u32 ioas_id; } access_replace_ioas; + struct { + __u32 flags; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + 
__aligned_u64 uptr; + __aligned_u64 out_nr_dirty; + } dirty; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 2773275566af..4eb86025dde9 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -37,6 +37,7 @@ enum { _MOCK_PFN_START = MOCK_PFN_MASK + 1, MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, + MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, }; /* @@ -179,6 +180,31 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long flags, struct iommu_dirty_bitmap *dirty) { + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long i, max = size / MOCK_IO_PAGE_SIZE; + void *ent, *old; + + if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) + return -EINVAL; + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { + unsigned long val; + + /* Clear dirty */ + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + iommu_dirty_bitmap_record(dirty, cur, + MOCK_IO_PAGE_SIZE); + } + } + return 0; } @@ -310,7 +336,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); + /* * iommufd generates unmaps that must be a strict * superset of the map's performend So every starting @@ -320,13 +346,13 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, * passed to map_pages */ if (first) { - WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); first = false; } if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - 
WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); iova += MOCK_IO_PAGE_SIZE; ret += MOCK_IO_PAGE_SIZE; @@ -1053,6 +1079,71 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == __IOMMUFD_ACCESS_RW_SLOW_PATH); +static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, + unsigned long iova, size_t length, + unsigned long page_size, void __user *uptr, + u32 flags) +{ + unsigned long bitmap_size, i, max = length / page_size; + struct iommu_test_cmd *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc, count = 0; + void *tmp; + + if (iova % page_size || length % page_size || !uptr) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + if (!(mock->flags & MOCK_DIRTY_TRACK)) { + rc = -EINVAL; + goto out_put; + } + + bitmap_size = max / BITS_PER_BYTE; + + tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (copy_from_user(tmp, uptr, bitmap_size)) { + rc = -EFAULT; + goto out_free; + } + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * page_size; + void *ent, *old; + + if (!test_bit(i, (unsigned long *)tmp)) + continue; + + ent = xa_load(&mock->pfns, cur / page_size); + if (ent) { + unsigned long val; + + val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / page_size, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + count++; + } + } + + cmd->dirty.out_nr_dirty = count; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_free: + kvfree(tmp); +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); @@ -1118,6 +1209,12 @@ int iommufd_test(struct 
iommufd_ucmd *ucmd) return -EINVAL; iommufd_test_memory_limit = cmd->memory_limit.limit; return 0; + case IOMMU_TEST_OP_DIRTY: + return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova, + cmd->dirty.length, + cmd->dirty.page_size, + u64_to_user_ptr(cmd->dirty.uptr), + cmd->dirty.flags); default: return -EOPNOTSUPP; } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 8c46012006e1..891250acf47e 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1440,13 +1440,47 @@ FIXTURE(iommufd_dirty_tracking) uint32_t hwpt_id; uint32_t stdev_id; uint32_t idev_id; + unsigned long page_size; + unsigned long bitmap_size; + void *bitmap; + void *buffer; +}; + +FIXTURE_VARIANT(iommufd_dirty_tracking) +{ + unsigned long buffer_size; }; FIXTURE_SETUP(iommufd_dirty_tracking) { + void *vrc; + int rc; + self->fd = open("/dev/iommu", O_RDWR); ASSERT_NE(-1, self->fd); + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size); + if (rc || !self->buffer) { + SKIP(return, "Skipping buffer_size=%lu due to errno=%d", + variant->buffer_size, rc); + } + + assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0); + vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + assert(vrc == self->buffer); + + self->page_size = MOCK_PAGE_SIZE; + self->bitmap_size = + variant->buffer_size / self->page_size / BITS_PER_BYTE; + + /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */ + rc = posix_memalign(&self->bitmap, PAGE_SIZE, + self->bitmap_size + MOCK_PAGE_SIZE); + assert(!rc); + assert(self->bitmap); + assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); + test_ioctl_ioas_alloc(&self->ioas_id); test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, &self->idev_id); @@ -1454,9 +1488,41 @@ FIXTURE_SETUP(iommufd_dirty_tracking) FIXTURE_TEARDOWN(iommufd_dirty_tracking) { + munmap(self->buffer, 
variant->buffer_size); + munmap(self->bitmap, self->bitmap_size); teardown_iommufd(self->fd, _metadata); } +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) +{ + /* one u32 index bitmap */ + .buffer_size = 128UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k) +{ + /* one u64 index bitmap */ + .buffer_size = 256UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k) +{ + /* two u64 index and trailing end bitmap */ + .buffer_size = 640UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) +{ + /* 4K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) +{ + /* 8K bitmap (256M IOVA range) */ + .buffer_size = 256UL * 1024UL * 1024UL, +}; + TEST_F(iommufd_dirty_tracking, enforce_dirty) { uint32_t ioas_id, stddev_id, idev_id; @@ -1497,6 +1563,36 @@ TEST_F(iommufd_dirty_tracking, set_dirty_tracking) test_ioctl_destroy(hwpt_id); } +TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, + variant->buffer_size, MOCK_APERTURE_START); + + test_cmd_hwpt_alloc(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + + test_cmd_set_dirty_tracking(hwpt_id, true); + + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap, self->bitmap_size, _metadata); + + /* PAGE_SIZE unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + MOCK_PAGE_SIZE, + self->bitmap_size, _metadata); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git 
a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index e37af6291b22..b129cf23b824 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -16,6 +16,25 @@ /* Hack to make assertions more readable */ #define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD +/* Imported from include/asm-generic/bitops/generic-non-atomic.h */ +#define BITS_PER_BYTE 8 +#define BITS_PER_LONG __BITS_PER_LONG +#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG) + +static inline void set_bit(unsigned int nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p |= mask; +} + +static inline bool test_bit(unsigned int nr, unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1))); +} + static void *buffer; static unsigned long BUFFER_SIZE; @@ -196,6 +215,103 @@ static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled) #define test_cmd_set_dirty_tracking(hwpt_id, enabled) \ ASSERT_EQ(0, _test_cmd_set_dirty_tracking(self->fd, hwpt_id, enabled)) +static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, + __u64 *bitmap) +{ + struct iommu_hwpt_get_dirty_bitmap cmd = { + .size = sizeof(cmd), + .hwpt_id = hwpt_id, + .iova = iova, + .length = length, + .page_size = page_size, + .data = (uintptr_t)bitmap, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd); + if (ret) + return ret; + return 0; +} + +#define test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, \ + bitmap) \ + ASSERT_EQ(0, _test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, \ + page_size, bitmap)) + +static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, + __u64 *bitmap, __u64 *dirty) +{ + struct iommu_test_cmd cmd = { + .size = 
sizeof(cmd), + .op = IOMMU_TEST_OP_DIRTY, + .id = hwpt_id, + .dirty = { + .iova = iova, + .length = length, + .page_size = page_size, + .uptr = (uintptr_t)bitmap, + } + }; + int ret; + + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_DIRTY), &cmd); + if (ret) + return -ret; + if (dirty) + *dirty = cmd.dirty.out_nr_dirty; + return 0; +} + +#define test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, \ + bitmap, nr) \ + ASSERT_EQ(0, \ + _test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, \ + page_size, bitmap, nr)) + +static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, __u64 *bitmap, + __u64 bitmap_size, + struct __test_metadata *_metadata) +{ + unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE; + unsigned long nr = nbits / 2; + __u64 out_dirty = 0; + + /* Mark all even bits as dirty in the mock domain */ + for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) + if (!(i % 2)) + set_bit(i, (unsigned long *)bitmap); + ASSERT_EQ(nr, count); + + test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, + bitmap, &out_dirty); + ASSERT_EQ(nr, out_dirty); + + /* Expect all even bits as dirty in the user bitmap */ + memset(bitmap, 0, bitmap_size); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap); + for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) + ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); + ASSERT_EQ(count, out_dirty); + + memset(bitmap, 0, bitmap_size); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap); + + /* It as read already -- expect all zeroes */ + for (i = 0; i < nbits; i++) + ASSERT_EQ(0, test_bit(i, (unsigned long *)bitmap)); + + return 0; +} +#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ + bitmap_size, _metadata) \ + ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ + page_size, bitmap, bitmap_size, \ + _metadata)) + static int 
_test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { @@ -320,6 +436,17 @@ static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer, IOMMU_IOAS_MAP_READABLE)); \ }) +#define test_ioctl_ioas_map_fixed_id(ioas_id, buffer, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, \ + _test_ioctl_ioas_map( \ + self->fd, ioas_id, buffer, length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + #define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova) \ ({ \ __u64 __iova = iova; \ From ae36fe70cea4d7c177452ab41e6734fa3cbd4ad8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:08 +0100 Subject: [PATCH 27/52] iommufd/selftest: Test out_capabilities in IOMMU_GET_HW_INFO Enumerate the capabilities from the mock device and test whether it advertises as expected. Include it as part of the iommufd_dirty_tracking fixture. Link: https://lore.kernel.org/r/20231024135109.73787-18-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 13 +++++++++- tools/testing/selftests/iommu/iommufd.c | 17 +++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 2 +- tools/testing/selftests/iommu/iommufd_utils.h | 24 ++++++++++++------- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 4eb86025dde9..0eb01d1f9df8 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -376,7 +376,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - return cap == IOMMU_CAP_CACHE_COHERENCY; + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + case IOMMU_CAP_DIRTY_TRACKING: + return !(mdev->flags 
& MOCK_FLAGS_DEVICE_NO_DIRTY); + default: + break; + } + + return false; } static void mock_domain_set_plaform_dma_ops(struct device *dev) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 891250acf47e..f4f8bd17ae67 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1563,6 +1563,23 @@ TEST_F(iommufd_dirty_tracking, set_dirty_tracking) test_ioctl_destroy(hwpt_id); } +TEST_F(iommufd_dirty_tracking, device_dirty_capability) +{ + uint32_t caps = 0; + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_cmd_get_hw_capabilities(self->idev_id, caps, + IOMMU_HW_CAP_DIRTY_TRACKING); + ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING, + caps & IOMMU_HW_CAP_DIRTY_TRACKING); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { uint32_t stddev_id; diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 31386be42439..ff735bdd833e 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -612,7 +612,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) &idev_id)) return -1; - if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info))) + if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index b129cf23b824..2410d06f5a34 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -535,8 +535,8 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) #endif /* @data can be NULL */ -static int 
_test_cmd_get_hw_info(int fd, __u32 device_id, - void *data, size_t data_len) +static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, + size_t data_len, uint32_t *capabilities) { struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; struct iommu_hw_info cmd = { @@ -544,6 +544,7 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, .dev_id = device_id, .data_len = data_len, .data_uptr = (uint64_t)data, + .out_capabilities = 0, }; int ret; @@ -580,14 +581,19 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, assert(!info->flags); } + if (capabilities) + *capabilities = cmd.out_capabilities; + return 0; } -#define test_cmd_get_hw_info(device_id, data, data_len) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ - data, data_len)) +#define test_cmd_get_hw_info(device_id, data, data_len) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \ + data_len, NULL)) -#define test_err_get_hw_info(_errno, device_id, data, data_len) \ - EXPECT_ERRNO(_errno, \ - _test_cmd_get_hw_info(self->fd, device_id, \ - data, data_len)) +#define test_err_get_hw_info(_errno, device_id, data, data_len) \ + EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \ + data_len, NULL)) + +#define test_cmd_get_hw_capabilities(device_id, caps, mask) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) From 0795b305da8902e7d092f90bf9a1a2c98f34b1db Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:09 +0100 Subject: [PATCH 28/52] iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR flag Change test_mock_dirty_bitmaps() to pass a flag where it specifies the flag under test. The test does the same thing as the GET_DIRTY_BITMAP regular test. Except that it tests whether the dirtied bits are fetched all the same a second time, as opposed to observing them cleared. 
Link: https://lore.kernel.org/r/20231024135109.73787-19-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 15 +++++--- tools/testing/selftests/iommu/iommufd.c | 38 ++++++++++++++++++- tools/testing/selftests/iommu/iommufd_utils.h | 26 ++++++++----- 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 0eb01d1f9df8..d8551c9d5b6c 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -193,13 +193,16 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { - unsigned long val; - /* Clear dirty */ - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, + cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } iommu_dirty_bitmap_record(dirty, cur, MOCK_IO_PAGE_SIZE); } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index f4f8bd17ae67..76a4351e3434 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1598,13 +1598,47 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, MOCK_APERTURE_START, self->page_size, - self->bitmap, self->bitmap_size, _metadata); + self->bitmap, self->bitmap_size, 0, _metadata); /* PAGE_SIZE unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, MOCK_APERTURE_START, self->page_size, self->bitmap + MOCK_PAGE_SIZE, - self->bitmap_size, _metadata); + self->bitmap_size, 0, _metadata); + + 
test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + +TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, + variant->buffer_size, MOCK_APERTURE_START); + + test_cmd_hwpt_alloc(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + + test_cmd_set_dirty_tracking(hwpt_id, true); + + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap, self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + + /* Unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + MOCK_PAGE_SIZE, + self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 2410d06f5a34..e263bf80a977 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -217,11 +217,12 @@ static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled) static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length, __u64 iova, size_t page_size, - __u64 *bitmap) + __u64 *bitmap, __u32 flags) { struct iommu_hwpt_get_dirty_bitmap cmd = { .size = sizeof(cmd), .hwpt_id = hwpt_id, + .flags = flags, .iova = iova, .length = length, .page_size = page_size, @@ -236,9 +237,9 @@ static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length, } #define test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, \ - bitmap) \ + bitmap, flags) \ ASSERT_EQ(0, _test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, \ - page_size, bitmap)) + page_size, bitmap, flags)) static 
int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, __u64 iova, size_t page_size, @@ -273,7 +274,7 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, __u64 iova, size_t page_size, __u64 *bitmap, - __u64 bitmap_size, + __u64 bitmap_size, __u32 flags, struct __test_metadata *_metadata) { unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE; @@ -292,25 +293,30 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, /* Expect all even bits as dirty in the user bitmap */ memset(bitmap, 0, bitmap_size); - test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, + flags); for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); ASSERT_EQ(count, out_dirty); memset(bitmap, 0, bitmap_size); - test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, + flags); /* It as read already -- expect all zeroes */ - for (i = 0; i < nbits; i++) - ASSERT_EQ(0, test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i++) { + ASSERT_EQ(!(i % 2) && (flags & + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), + test_bit(i, (unsigned long *)bitmap)); + } return 0; } #define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ - bitmap_size, _metadata) \ + bitmap_size, flags, _metadata) \ ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ page_size, bitmap, bitmap_size, \ - _metadata)) + flags, _metadata)) static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) From 2ccabf81ddff81355bf044bdad3c44e9e6ac32d9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 23 Oct 2023 18:29:58 -0700 Subject: [PATCH 29/52] iommufd: Only 
enforce cache coherency in iommufd_hw_pagetable_alloc According to the conversation in the following link: https://lore.kernel.org/linux-iommu/20231020135501.GG3952@nvidia.com/ The enforce_cache_coherency should be set/enforced in the hwpt allocation routine. The iommu driver in its attach_dev() op should decide whether to reject or not a device that doesn't match with the configuration of cache coherency. Drop the enforce_cache_coherency piece in the attach/replace() and move the remaining "num_devices" piece closer to the refcount that is using it. Accordingly drop its function prototype in the header and mark it static. Also add some extra comments to clarify the expected behaviors. Link: https://lore.kernel.org/r/20231024012958.30842-1-nicolinc@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 20 ++------------------ drivers/iommu/iommufd/hw_pagetable.c | 9 ++++++++- drivers/iommu/iommufd/iommufd_private.h | 1 - 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 2a41fd2b6ef8..0a8867487508 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -337,13 +337,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* Try to upgrade the domain we have */ - if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); - if (rc) - goto err_unlock; - } - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, &idev->igroup->sw_msi_start); if (rc) @@ -413,8 +406,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, { struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; - unsigned int num_devices = 0; struct iommufd_device *cur; + unsigned int num_devices; int rc; mutex_lock(&idev->igroup->lock); @@ -429,16 +422,6 @@ 
iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } - /* Try to upgrade the domain we have */ - list_for_each_entry(cur, &igroup->device_list, group_item) { - num_devices++; - if (cur->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); - if (rc) - goto err_unlock; - } - } - old_hwpt = igroup->hwpt; if (hwpt->ioas != old_hwpt->ioas) { list_for_each_entry(cur, &igroup->device_list, group_item) { @@ -465,6 +448,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, igroup->hwpt = hwpt; + num_devices = list_count_nodes(&igroup->device_list); /* * Move the refcounts held by the device_list to the new hwpt. Retain a * refcount for this thread as the caller will free it. diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 72a5269984b0..c2b742ffeb0c 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -42,7 +42,7 @@ void iommufd_hw_pagetable_abort(struct iommufd_object *obj) iommufd_hw_pagetable_destroy(obj); } -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +static int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) { if (hwpt->enforce_cache_coherency) return 0; @@ -116,6 +116,13 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * doing any maps. It is an iommu driver bug to report * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on * a new domain. + * + * The cache coherency mode must be configured here and unchanged later. + * Note that a HWPT (non-CC) created for a device (non-CC) can be later + * reused by another device (either non-CC or CC). However, A HWPT (CC) + * created for a device (CC) cannot be reused by another device (non-CC) + * but only devices (CC). Instead user space in this case would need to + * allocate a separate HWPT (non-CC). 
*/ if (idev->enforce_cache_coherency) { rc = iommufd_hw_pagetable_enforce_cc(hwpt); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 034129130db3..43410fd53157 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -266,7 +266,6 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, bool immediate_attach); -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * From 54d606816b32401de5431f6776a78b1de135bfa2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:39:29 -0700 Subject: [PATCH 30/52] iommu: Add IOMMU_DOMAIN_NESTED Introduce a new domain type for a user I/O page table, which is nested on top of another user space address represented by a PAGING domain. This new domain can be allocated by the domain_alloc_user op, and attached to a device through the existing iommu_attach_device/group() interfaces. The mappings of a nested domain are managed by user space software, so it is not necessary to have map/unmap callbacks. 
Link: https://lore.kernel.org/r/20231026043938.63898-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1d42bdb37cbc..bc303cb2af37 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -67,6 +67,9 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ +#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested + on a stage-2 translation */ + #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ /* * This are the possible domain-types @@ -93,6 +96,7 @@ struct iommu_domain_geometry { __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) #define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) +#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED) struct iommu_domain { unsigned type; From 9744a7ab62cc7354096aaff788c08b947f86ba60 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 25 Oct 2023 21:39:30 -0700 Subject: [PATCH 31/52] iommufd: Rename IOMMUFD_OBJ_HW_PAGETABLE to IOMMUFD_OBJ_HWPT_PAGING To add a new IOMMUFD_OBJ_HWPT_NESTED, rename the HWPT object to confine it to PAGING hwpts/domains. The following patch will separate the hwpt structure as well. 
Link: https://lore.kernel.org/r/20231026043938.63898-3-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 10 +++++----- drivers/iommu/iommufd/hw_pagetable.c | 2 +- drivers/iommu/iommufd/iommufd_private.h | 4 ++-- drivers/iommu/iommufd/main.c | 2 +- drivers/iommu/iommufd/selftest.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 0a8867487508..449b64e6ef53 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -563,7 +563,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { - case IOMMUFD_OBJ_HW_PAGETABLE: { + case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); @@ -601,8 +601,8 @@ out_put_pt_obj: /** * iommufd_device_attach - Connect a device to an iommu_domain * @idev: device to attach - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This connects the device to an iommu_domain, either automatically or manually * selected. Once this completes the device could do DMA. 
@@ -630,8 +630,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); /** * iommufd_device_replace - Change the device's iommu_domain * @idev: device to change - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This is the same as:: * diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index c2b742ffeb0c..8dc2b39f8cb0 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -86,7 +86,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (flags && !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); - hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HWPT_PAGING); if (IS_ERR(hwpt)) return hwpt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 43410fd53157..70bebad63a74 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -123,7 +123,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_HWPT_PAGING, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -256,7 +256,7 @@ static inline struct iommufd_hw_pagetable * iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, - IOMMUFD_OBJ_HW_PAGETABLE), + IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hw_pagetable, obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index d50f42a730aa..46198e8948d6 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -488,7 +488,7 @@ static const struct iommufd_object_ops 
iommufd_object_ops[] = { [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, - [IOMMUFD_OBJ_HW_PAGETABLE] = { + [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hw_pagetable_destroy, .abort = iommufd_hw_pagetable_abort, }, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d8551c9d5b6c..068928ba7950 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -437,7 +437,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, struct iommufd_object *obj; obj = iommufd_get_object(ucmd->ictx, mockpt_id, - IOMMUFD_OBJ_HW_PAGETABLE); + IOMMUFD_OBJ_HWPT_PAGING); if (IS_ERR(obj)) return ERR_CAST(obj); hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); From 58d84f430dc7f737d21c60906de0f39104c89e9d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 25 Oct 2023 21:39:31 -0700 Subject: [PATCH 32/52] iommufd/device: Wrap IOMMUFD_OBJ_HWPT_PAGING-only configurations Some of the configurations during the attach/replace() should only apply to IOMMUFD_OBJ_HWPT_PAGING. Once IOMMUFD_OBJ_HWPT_NESTED gets introduced in a following patch, keeping them unconditionally in the common routine will not work. Wrap all of those PAGING-only configurations together into helpers. Do a hwpt_is_paging check whenever calling them or their fallback routines. 
Link: https://lore.kernel.org/r/20231026043938.63898-4-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 111 +++++++++++++++++------- drivers/iommu/iommufd/iommufd_private.h | 5 ++ 2 files changed, 86 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 449b64e6ef53..0c844acb15b9 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -325,6 +325,28 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, return 0; } +static int iommufd_hwpt_paging_attach(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, + &idev->igroup->sw_msi_start); + if (rc) + return rc; + + if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_group_setup_msi(idev->igroup, hwpt); + if (rc) { + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + return rc; + } + } + return 0; +} + int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { @@ -337,10 +359,11 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); - if (rc) - goto err_unlock; + if (hwpt_is_paging(hwpt)) { + rc = iommufd_hwpt_paging_attach(hwpt, idev); + if (rc) + goto err_unlock; + } /* * Only attach to the group once for the first device that is in the @@ -350,10 +373,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. 
*/ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; @@ -364,7 +383,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -381,7 +401,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) iommu_detach_group(hwpt->domain, idev->igroup->group); idev->igroup->hwpt = NULL; } - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -400,13 +421,52 @@ iommufd_device_do_attach(struct iommufd_device *idev, return NULL; } +static void +iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hw_pagetable *hwpt) +{ + struct iommufd_device *cur; + + lockdep_assert_held(&igroup->lock); + + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); +} + +static int iommufd_group_do_replace_paging(struct iommufd_group *igroup, + struct iommufd_hw_pagetable *hwpt) +{ + struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_device *cur; + int rc; + + lockdep_assert_held(&igroup->lock); + + if (!hwpt_is_paging(old_hwpt) || hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(igroup, hwpt); + if (rc) + goto err_unresv; + return 0; + +err_unresv: + iommufd_group_remove_reserved_iova(igroup, 
hwpt); + return rc; +} + static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; - struct iommufd_device *cur; unsigned int num_devices; int rc; @@ -422,29 +482,20 @@ iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } - old_hwpt = igroup->hwpt; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, cur->dev, NULL); - if (rc) - goto err_unresv; - } + if (hwpt_is_paging(hwpt)) { + rc = iommufd_group_do_replace_paging(igroup, hwpt); + if (rc) + goto err_unlock; } - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); if (rc) goto err_unresv; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, - cur->dev); - } + old_hwpt = igroup->hwpt; + if (hwpt_is_paging(old_hwpt) && + (!hwpt_is_paging(hwpt) || hwpt->ioas != old_hwpt->ioas)) + iommufd_group_remove_reserved_iova(igroup, old_hwpt); igroup->hwpt = hwpt; @@ -462,8 +513,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); + if (hwpt_is_paging(hwpt)) + iommufd_group_remove_reserved_iova(igroup, hwpt); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 70bebad63a74..776dd41c077f 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -252,6 +252,11 @@ struct iommufd_hw_pagetable { struct list_head hwpt_item; }; 
+static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) +{ + return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; +} + static inline struct iommufd_hw_pagetable * iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) { From 89db31635c87a7856e205c7ebf9f562e4bb206fe Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:32 -0700 Subject: [PATCH 33/52] iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable To prepare for IOMMUFD_OBJ_HWPT_NESTED, derive struct iommufd_hwpt_paging from struct iommufd_hw_pagetable, by leaving the common members in struct iommufd_hw_pagetable. Add a __iommufd_object_alloc and to_hwpt_paging() helpers for the new structure. Then, update "hwpt" to "hwpt_paging" throughout the files, accordingly. Link: https://lore.kernel.org/r/20231026043938.63898-5-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 76 ++++++++------ drivers/iommu/iommufd/hw_pagetable.c | 129 +++++++++++++----------- drivers/iommu/iommufd/iommufd_private.h | 41 +++++--- drivers/iommu/iommufd/main.c | 4 +- drivers/iommu/iommufd/vfio_compat.c | 6 +- 5 files changed, 148 insertions(+), 108 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 0c844acb15b9..a99ce4547353 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); static int iommufd_group_setup_msi(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) + struct iommufd_hwpt_paging *hwpt_paging) { phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; @@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * matches what the IRQ layer actually expects in a newly created * domain. 
*/ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt_paging->common.domain, + sw_msi_start); if (rc) return rc; @@ -320,27 +321,29 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * iommu_get_msi_cookie() can only be called once per domain, * it returns -EBUSY on later calls. */ - hwpt->msi_cookie = true; + hwpt_paging->msi_cookie = true; } return 0; } -static int iommufd_hwpt_paging_attach(struct iommufd_hw_pagetable *hwpt, +static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, struct iommufd_device *idev) { int rc; lockdep_assert_held(&idev->igroup->lock); - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, + rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, + idev->dev, &idev->igroup->sw_msi_start); if (rc) return rc; if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt); + rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); if (rc) { - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, + idev->dev); return rc; } } @@ -360,7 +363,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } if (hwpt_is_paging(hwpt)) { - rc = iommufd_hwpt_paging_attach(hwpt, idev); + rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); if (rc) goto err_unlock; } @@ -384,7 +387,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, return 0; err_unresv: if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -402,7 +406,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) idev->igroup->hwpt = NULL; } if 
(hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -423,18 +428,19 @@ iommufd_device_do_attach(struct iommufd_device *idev, static void iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) + struct iommufd_hwpt_paging *hwpt_paging) { struct iommufd_device *cur; lockdep_assert_held(&igroup->lock); list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } -static int iommufd_group_do_replace_paging(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) +static int +iommufd_group_do_replace_paging(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) { struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; struct iommufd_device *cur; @@ -442,22 +448,23 @@ static int iommufd_group_do_replace_paging(struct iommufd_group *igroup, lockdep_assert_held(&igroup->lock); - if (!hwpt_is_paging(old_hwpt) || hwpt->ioas != old_hwpt->ioas) { + if (!hwpt_is_paging(old_hwpt) || + hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { list_for_each_entry(cur, &igroup->device_list, group_item) { rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, cur->dev, NULL); + &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) goto err_unresv; } } - rc = iommufd_group_setup_msi(igroup, hwpt); + rc = iommufd_group_setup_msi(igroup, hwpt_paging); if (rc) goto err_unresv; return 0; err_unresv: - iommufd_group_remove_reserved_iova(igroup, hwpt); + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); return rc; } @@ -482,8 +489,10 @@ iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } + old_hwpt = igroup->hwpt; if (hwpt_is_paging(hwpt)) { - rc = iommufd_group_do_replace_paging(igroup, hwpt); + 
rc = iommufd_group_do_replace_paging(igroup, + to_hwpt_paging(hwpt)); if (rc) goto err_unlock; } @@ -492,10 +501,11 @@ iommufd_device_do_replace(struct iommufd_device *idev, if (rc) goto err_unresv; - old_hwpt = igroup->hwpt; if (hwpt_is_paging(old_hwpt) && - (!hwpt_is_paging(hwpt) || hwpt->ioas != old_hwpt->ioas)) - iommufd_group_remove_reserved_iova(igroup, old_hwpt); + (!hwpt_is_paging(hwpt) || + to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); igroup->hwpt = hwpt; @@ -514,7 +524,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, return old_hwpt; err_unresv: if (hwpt_is_paging(hwpt)) - iommufd_group_remove_reserved_iova(igroup, hwpt); + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); @@ -542,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, */ bool immediate_attach = do_attach == iommufd_device_do_attach; struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; /* @@ -550,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, * other. 
*/ mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->auto_domain) + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->auto_domain) continue; + hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; destroy_hwpt = (*do_attach)(idev, hwpt); @@ -574,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, - 0, immediate_attach); - if (IS_ERR(hwpt)) { - destroy_hwpt = ERR_CAST(hwpt); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, + immediate_attach); + if (IS_ERR(hwpt_paging)) { + destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; } + hwpt = &hwpt_paging->common; if (!immediate_attach) { destroy_hwpt = (*do_attach)(idev, hwpt); @@ -589,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, destroy_hwpt = NULL; } - hwpt->auto_domain = true; + hwpt_paging->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8dc2b39f8cb0..39b8b625b48d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,56 +8,61 @@ #include "../iommu-priv.h" #include "iommufd_private.h" -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); - if (!list_empty(&hwpt->hwpt_item)) { - mutex_lock(&hwpt->ioas->mutex); - list_del(&hwpt->hwpt_item); - mutex_unlock(&hwpt->ioas->mutex); + if (!list_empty(&hwpt_paging->hwpt_item)) { + mutex_lock(&hwpt_paging->ioas->mutex); + list_del(&hwpt_paging->hwpt_item); + 
mutex_unlock(&hwpt_paging->ioas->mutex); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - if (hwpt->domain) - iommu_domain_free(hwpt->domain); + if (hwpt_paging->common.domain) + iommu_domain_free(hwpt_paging->common.domain); - refcount_dec(&hwpt->ioas->obj.users); + refcount_dec(&hwpt_paging->ioas->obj.users); } -void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +void iommufd_hwpt_paging_abort(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); /* The ioas->mutex must be held until finalize is called. */ - lockdep_assert_held(&hwpt->ioas->mutex); + lockdep_assert_held(&hwpt_paging->ioas->mutex); - if (!list_empty(&hwpt->hwpt_item)) { - list_del_init(&hwpt->hwpt_item); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + if (!list_empty(&hwpt_paging->hwpt_item)) { + list_del_init(&hwpt_paging->hwpt_item); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - iommufd_hw_pagetable_destroy(obj); + iommufd_hwpt_paging_destroy(obj); } -static int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +static int +iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) { - if (hwpt->enforce_cache_coherency) + struct iommu_domain *paging_domain = hwpt_paging->common.domain; + + if (hwpt_paging->enforce_cache_coherency) return 0; - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) + if (paging_domain->ops->enforce_cache_coherency) + hwpt_paging->enforce_cache_coherency = + paging_domain->ops->enforce_cache_coherency( + paging_domain); + if (!hwpt_paging->enforce_cache_coherency) return 
-EINVAL; return 0; } /** - * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for @@ -72,12 +77,13 @@ static int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on * the returned hwpt. */ -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach) +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach) { const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; int rc; @@ -86,14 +92,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (flags && !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); - hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HWPT_PAGING); - if (IS_ERR(hwpt)) - return hwpt; + hwpt_paging = __iommufd_object_alloc( + ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); + if (IS_ERR(hwpt_paging)) + return ERR_CAST(hwpt_paging); + hwpt = &hwpt_paging->common; - INIT_LIST_HEAD(&hwpt->hwpt_item); + INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); - hwpt->ioas = ioas; + hwpt_paging->ioas = ioas; if (ops->domain_alloc_user) { hwpt->domain = ops->domain_alloc_user(idev->dev, flags); @@ -125,7 +133,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * allocate a separate HWPT (non-CC). 
*/ if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging); if (WARN_ON(rc)) goto out_abort; } @@ -142,11 +150,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain); if (rc) goto out_detach; - list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - return hwpt; + list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list); + return hwpt_paging; out_detach: if (immediate_attach) @@ -159,6 +167,7 @@ out_abort: int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; struct iommufd_device *idev; struct iommufd_ioas *ioas; @@ -180,12 +189,13 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } mutex_lock(&ioas->mutex); - hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, - idev, cmd->flags, false); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + hwpt_paging = iommufd_hwpt_paging_alloc(ucmd->ictx, ioas, idev, + cmd->flags, false); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); goto out_unlock; } + hwpt = &hwpt_paging->common; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); @@ -207,7 +217,7 @@ out_put_idev: int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; bool enable; @@ -215,23 +225,24 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) return rc; - hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) - return PTR_ERR(hwpt); + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if 
(IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); - ioas = hwpt->ioas; + ioas = hwpt_paging->ioas; enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; - rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt->domain, enable); + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, + enable); - iommufd_put_object(&hwpt->obj); + iommufd_put_object(&hwpt_paging->common.obj); return rc; } int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; @@ -239,14 +250,14 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) cmd->__reserved) return -EOPNOTSUPP; - hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) - return PTR_ERR(hwpt); + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); - ioas = hwpt->ioas; - rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain, - cmd->flags, cmd); + ioas = hwpt_paging->ioas; + rc = iopt_read_and_clear_dirty_data( + &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); - iommufd_put_object(&hwpt->obj); + iommufd_put_object(&hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 776dd41c077f..cd8da289ed0b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -181,7 +181,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); -#define iommufd_object_alloc(ictx, ptr, type) \ +#define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ @@ -190,6 +190,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +#define 
iommufd_object_alloc(ictx, ptr, type) \ + __iommufd_object_alloc(ictx, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The @@ -243,8 +246,12 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, */ struct iommufd_hw_pagetable { struct iommufd_object obj; - struct iommufd_ioas *ioas; struct iommu_domain *domain; +}; + +struct iommufd_hwpt_paging { + struct iommufd_hw_pagetable common; + struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; @@ -257,33 +264,41 @@ static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; } -static inline struct iommufd_hw_pagetable * -iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +static inline struct iommufd_hwpt_paging * +to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_paging, common); +} + +static inline struct iommufd_hwpt_paging * +iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_PAGING), - struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging, common.obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach); +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); -void 
iommufd_hw_pagetable_destroy(struct iommufd_object *obj); -void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); +void iommufd_hwpt_paging_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - lockdep_assert_not_held(&hwpt->ioas->mutex); - if (hwpt->auto_domain) + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + if (hwpt_paging->auto_domain) iommufd_object_deref_user(ictx, &hwpt->obj); else refcount_dec(&hwpt->obj.users); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 46198e8948d6..ab6675a7f6b4 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -489,8 +489,8 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_ioas_destroy, }, [IOMMUFD_OBJ_HWPT_PAGING] = { - .destroy = iommufd_hw_pagetable_destroy, - .abort = iommufd_hw_pagetable_abort, + .destroy = iommufd_hwpt_paging_destroy, + .abort = iommufd_hwpt_paging_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 6c810bf80f99..538fbf76354d 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -255,7 +255,7 @@ err_put: static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; @@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->enforce_cache_coherency) { + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if 
(!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } From b5021cb264e67baf051569a41debe277c279952b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:33 -0700 Subject: [PATCH 34/52] iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED Allow iommufd_hwpt_alloc() to have a common routine but jump to different allocators corresponding to different user input pt_obj types, either an IOMMUFD_OBJ_IOAS for a PAGING hwpt or an IOMMUFD_OBJ_HWPT_PAGING as the parent for a NESTED hwpt. Also, move the "flags" validation to the hwpt allocator (paging), so that later the hwpt_nested allocator can do its own separate flags validation. Link: https://lore.kernel.org/r/20231026043938.63898-6-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 46 ++++++++++++++++++---------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 39b8b625b48d..6bce9af0cb8d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -82,6 +82,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, bool immediate_attach) { + const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -91,6 +93,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (flags && !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); + if (flags & ~valid_flags) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); @@ -167,35 +171,41 @@ out_abort: int iommufd_hwpt_alloc(struct iommufd_ucmd 
*ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; - struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas = NULL; + struct iommufd_object *pt_obj; struct iommufd_device *idev; - struct iommufd_ioas *ioas; int rc; - if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING)) || - cmd->__reserved) + if (cmd->__reserved) return -EOPNOTSUPP; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); - ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); - if (IS_ERR(ioas)) { - rc = PTR_ERR(ioas); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = -EINVAL; goto out_put_idev; } - mutex_lock(&ioas->mutex); - hwpt_paging = iommufd_hwpt_paging_alloc(ucmd->ictx, ioas, idev, - cmd->flags, false); - if (IS_ERR(hwpt_paging)) { - rc = PTR_ERR(hwpt_paging); - goto out_unlock; + if (pt_obj->type == IOMMUFD_OBJ_IOAS) { + struct iommufd_hwpt_paging *hwpt_paging; + + ioas = container_of(pt_obj, struct iommufd_ioas, obj); + mutex_lock(&ioas->mutex); + hwpt_paging = iommufd_hwpt_paging_alloc(ucmd->ictx, ioas, idev, + cmd->flags, false); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_unlock; + } + hwpt = &hwpt_paging->common; + } else { + rc = -EINVAL; + goto out_put_pt; } - hwpt = &hwpt_paging->common; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); @@ -207,8 +217,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) out_hwpt: iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); out_unlock: - mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + if (ioas) + mutex_unlock(&ioas->mutex); +out_put_pt: + iommufd_put_object(pt_obj); out_put_idev: iommufd_put_object(&idev->obj); return rc; From 2bdabb8e82f564d19eeeb7c83e6b2467af0707cb Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 25 Oct 2023 21:39:34 -0700 Subject: [PATCH 35/52] iommu: Pass in parent domain with 
user_data to domain_alloc_user op domain_alloc_user op already accepts user flags for domain allocation, add a parent domain pointer and a driver specific user data support as well. The user data would be tagged with a type for iommu drivers to add their own driver specific user data per hw_pagetable. Add a struct iommu_user_data as a bundle of data_ptr/data_len/type from an iommufd core uAPI structure. Make the user data opaque to the core, since a userspace driver must match the kernel driver. In the future, if drivers share some common parameter, there would be a generic parameter as well. Link: https://lore.kernel.org/r/20231026043938.63898-7-yi.l.liu@intel.com Signed-off-by: Lu Baolu Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/amd/iommu.c | 9 ++++++--- drivers/iommu/intel/iommu.c | 7 ++++++- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- drivers/iommu/iommufd/selftest.c | 7 ++++++- include/linux/iommu.h | 27 ++++++++++++++++++++++++--- 5 files changed, 44 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index caad10f9cee3..b399c5741378 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2219,12 +2219,15 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) return domain; } -static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, - u32 flags) +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + { unsigned int type = IOMMU_DOMAIN_UNMANAGED; - if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) return ERR_PTR(-EOPNOTSUPP); return do_iommu_domain_alloc(type, dev, flags); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 
eb92a201cc0b..fe67f8d77b09 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4076,7 +4076,9 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) } static struct iommu_domain * -intel_iommu_domain_alloc_user(struct device *dev, u32 flags) +intel_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) { struct iommu_domain *domain; struct intel_iommu *iommu; @@ -4086,6 +4088,9 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); + if (parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) return ERR_PTR(-ENODEV); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 6bce9af0cb8d..198ecbd536f7 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -108,7 +108,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt_paging->ioas = ioas; if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, flags); + hwpt->domain = + ops->domain_alloc_user(idev->dev, flags, NULL, NULL); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 068928ba7950..d71007234896 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -241,7 +241,9 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags) +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) { struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct iommu_domain *domain; @@ -250,6 
+252,9 @@ mock_domain_alloc_user(struct device *dev, u32 flags) (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); + if (parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return ERR_PTR(-EOPNOTSUPP); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bc303cb2af37..2ddd99f55471 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -265,6 +265,21 @@ struct iommu_dirty_ops { struct iommu_dirty_bitmap *dirty); }; +/** + * struct iommu_user_data - iommu driver specific user space data info + * @type: The data type of the user buffer + * @uptr: Pointer to the user buffer for copy_from_user() + * @len: The length of the user buffer in bytes + * + * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h + * @type, @uptr and @len should be just copied from an iommufd core uAPI struct. + */ +struct iommu_user_data { + unsigned int type; + void __user *uptr; + size_t len; +}; + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability @@ -279,8 +294,12 @@ struct iommu_dirty_ops { * parameters as defined in include/uapi/linux/iommufd.h. * Unlike @domain_alloc, it is called only by IOMMUFD and * must fully initialize the new domain before return. - * Upon success, a domain is returned. Upon failure, - * ERR_PTR must be returned. + * Upon success, if the @user_data is valid and the @parent + * points to a kernel-managed domain, the new domain must be + * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be + * NULL while the @user_data can be optionally provided, the + * new domain must support __IOMMU_DOMAIN_PAGING. + * Upon failure, ERR_PTR must be returned. 
* @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -313,7 +332,9 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); - struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags); + struct iommu_domain *(*domain_alloc_user)( + struct device *dev, u32 flags, struct iommu_domain *parent, + const struct iommu_user_data *user_data); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); From bd529dbb661d62bd9f03e44c9fc837d98a190499 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:35 -0700 Subject: [PATCH 36/52] iommufd: Add a nested HW pagetable object IOMMU_HWPT_ALLOC already supports iommu_domain allocation for userspace. But it can only allocate a hw_pagetable that associates to a given IOAS, i.e. only a kernel-managed hw_pagetable of IOMMUFD_OBJ_HWPT_PAGING type. IOMMU drivers can now support user-managed hw_pagetables, for two-stage translation use cases that require user data input from the user space. Add a new IOMMUFD_OBJ_HWPT_NESTED type with its abort/destroy(). Pair it with a new iommufd_hwpt_nested structure and its to_hwpt_nested() helper. Update the to_hwpt_paging() helper, so a NESTED-type hw_pagetable can be handled in the callers, for example iommufd_hw_pagetable_enforce_rr(). Screen the inputs including the parent PAGING-type hw_pagetable that has a need of a new nest_parent flag in the iommufd_hwpt_paging structure. Extend the IOMMU_HWPT_ALLOC ioctl to accept an IOMMU driver specific data input which is tagged by the enum iommu_hwpt_data_type. Also, update the @pt_id to accept hwpt_id too besides an ioas_id. Then, use them to allocate a hw_pagetable of IOMMUFD_OBJ_HWPT_NESTED type using the iommufd_hwpt_nested_alloc() allocator. 
Link: https://lore.kernel.org/r/20231026043938.63898-8-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 3 +- drivers/iommu/iommufd/hw_pagetable.c | 109 ++++++++++++++++++++++-- drivers/iommu/iommufd/iommufd_private.h | 28 ++++-- drivers/iommu/iommufd/main.c | 4 + include/uapi/linux/iommufd.h | 31 ++++++- 5 files changed, 159 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index a99ce4547353..59d3a07300d9 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -588,7 +588,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, } hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, - immediate_attach); + immediate_attach, NULL); if (IS_ERR(hwpt_paging)) { destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; @@ -628,6 +628,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { + case IOMMUFD_OBJ_HWPT_NESTED: case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 198ecbd536f7..2abbeafdbd22 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -44,6 +44,22 @@ void iommufd_hwpt_paging_abort(struct iommufd_object *obj) iommufd_hwpt_paging_destroy(obj); } +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) +{ + struct iommufd_hwpt_nested *hwpt_nested = + container_of(obj, struct iommufd_hwpt_nested, common.obj); + + if (hwpt_nested->common.domain) + iommu_domain_free(hwpt_nested->common.domain); + + refcount_dec(&hwpt_nested->parent->common.obj.users); +} + +void iommufd_hwpt_nested_abort(struct iommufd_object 
*obj) +{ + iommufd_hwpt_nested_destroy(obj); +} + static int iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) { @@ -68,6 +84,8 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) * @idev: Device to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt + * @user_data: The user provided driver specific data describing the domain to + * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain @@ -80,7 +98,8 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, - bool immediate_attach) + bool immediate_attach, + const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING; @@ -91,7 +110,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); - if (flags && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); @@ -106,10 +125,11 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); hwpt_paging->ioas = ioas; + hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (ops->domain_alloc_user) { - hwpt->domain = - ops->domain_alloc_user(idev->dev, flags, NULL, NULL); + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, + user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -169,9 +189,70 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a 
device + * @ictx: iommufd context + * @parent: Parent PAGING-type hwpt to associate the domain with + * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as + * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of + * being a parent. + */ +static struct iommufd_hwpt_nested * +iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *parent, + struct iommufd_device *idev, u32 flags, + const struct iommu_user_data *user_data) +{ + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags || !user_data->len || !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (parent->auto_domain || !parent->nest_parent) + return ERR_PTR(-EINVAL); + + hwpt_nested = __iommufd_object_alloc( + ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + refcount_inc(&parent->common.obj.users); + hwpt_nested->parent = parent; + + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + parent->common.domain, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hw_pagetable *hwpt; struct iommufd_ioas *ioas = NULL; struct iommufd_object *pt_obj; @@ -180,6 +261,8 @@ 
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (cmd->__reserved) return -EOPNOTSUPP; + if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) @@ -196,13 +279,27 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); - hwpt_paging = iommufd_hwpt_paging_alloc(ucmd->ictx, ioas, idev, - cmd->flags, false); + hwpt_paging = iommufd_hwpt_paging_alloc( + ucmd->ictx, ioas, idev, cmd->flags, false, + user_data.len ? &user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; } hwpt = &hwpt_paging->common; + } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_nested *hwpt_nested; + + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, + container_of(pt_obj, struct iommufd_hwpt_paging, + common.obj), + idev, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index cd8da289ed0b..a74cfefffbc6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -124,6 +124,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -255,10 +256,16 @@ struct iommufd_hwpt_paging { bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; + bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; +struct iommufd_hwpt_nested { + struct iommufd_hw_pagetable common; + struct iommufd_hwpt_paging *parent; +}; + static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) { return hwpt->obj.type == 
IOMMUFD_OBJ_HWPT_PAGING; @@ -283,25 +290,32 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, - bool immediate_attach); + bool immediate_attach, + const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); +void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); - lockdep_assert_not_held(&hwpt_paging->ioas->mutex); - if (hwpt_paging->auto_domain) - iommufd_object_deref_user(ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + + if (hwpt_paging->auto_domain) { + iommufd_object_deref_user(ictx, &hwpt->obj); + return; + } + } + refcount_dec(&hwpt->obj.users); } struct iommufd_group { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ab6675a7f6b4..45b9d40773b1 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -492,6 +492,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, }, + [IOMMUFD_OBJ_HWPT_NESTED] = { + .destroy = iommufd_hwpt_nested_destroy, + .abort = iommufd_hwpt_nested_abort, + }, #ifdef 
CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c44eecf5d318..d816deac906f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -361,20 +361,44 @@ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. 
+ * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -383,6 +407,9 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) From e9d36c07bb787840e4813fb09a929a17d522a69f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:36 -0700 Subject: [PATCH 37/52] iommu: Add iommu_copy_struct_from_user helper Wrap up the data type/pointer/len sanity and a copy_struct_from_user call for iommu drivers to copy driver specific data via struct iommu_user_data. And expect it to be used in the domain_alloc_user op for example. Link: https://lore.kernel.org/r/20231026043938.63898-9-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2ddd99f55471..8fb1b41b4d15 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -280,6 +280,46 @@ struct iommu_user_data { size_t len; }; +/** + * __iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @dst_data. Must match with @src_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. 
+ * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user( + void *dst_data, const struct iommu_user_data *src_data, + unsigned int data_type, size_t data_len, size_t min_len) +{ + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (src_data->type != data_type) + return -EINVAL; + if (src_data->len < min_len || data_len < src_data->len) + return -EINVAL; + return copy_struct_from_user(dst_data, data_len, src_data->uptr, + src_data->len); +} + +/** + * iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @kdst. Must match with @user_data->type + * @min_last: The last member of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \ + __iommu_copy_struct_from_user(kdst, user_data, data_type, \ + sizeof(*kdst), \ + offsetofend(typeof(*kdst), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability From 65fe32f7a4472e19331a524b9c980b3444dd20a2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:37 -0700 Subject: [PATCH 38/52] iommufd/selftest: Add nested domain allocation for mock domain Add nested domain support in the ->domain_alloc_user op with some proper sanity checks. Then, add a domain_nested_ops for all nested domains and split the get_md_pagetable helper into paging and nested helpers. Also, add an iotlb as a testing property of a nested domain. 
Link: https://lore.kernel.org/r/20231026043938.63898-10-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 18 ++++ drivers/iommu/iommufd/selftest.c | 152 +++++++++++++++++++++------ 2 files changed, 140 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 1f2e93d3d4e8..7910fbe1962d 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -46,6 +46,11 @@ enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, }; +enum { + MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3, + MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -130,4 +135,17 @@ struct iommu_test_hw_info { __u32 test_reg; }; +/* Should not be equal to any defined value in enum iommu_hwpt_data_type */ +#define IOMMU_HWPT_DATA_SELFTEST 0xdead +#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef + +/** + * struct iommu_hwpt_selftest + * + * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT + */ +struct iommu_hwpt_selftest { + __u32 iotlb; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d71007234896..6684ab4cdc7a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -93,6 +93,12 @@ struct mock_iommu_domain { struct xarray pfns; }; +struct mock_iommu_domain_nested { + struct iommu_domain domain; + struct mock_iommu_domain *parent; + u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; +}; + enum selftest_obj_type { TYPE_IDEV, }; @@ -217,54 +223,99 @@ const struct iommu_dirty_ops dirty_ops = { }; static const struct iommu_ops mock_ops; +static struct iommu_domain_ops domain_nested_ops; -static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +static struct iommu_domain * +__mock_domain_alloc_paging(unsigned int iommu_domain_type, bool needs_dirty_ops) { struct mock_iommu_domain 
*mock; - if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) - return &mock_blocking_domain; - - if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - mock = kzalloc(sizeof(*mock), GFP_KERNEL); if (!mock) - return NULL; + return ERR_PTR(-ENOMEM); mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; mock->domain.ops = mock_ops.default_domain_ops; + if (needs_dirty_ops) + mock->domain.dirty_ops = &dirty_ops; mock->domain.type = iommu_domain_type; xa_init(&mock->pfns); return &mock->domain; } +static struct iommu_domain * +__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, + const struct iommu_hwpt_selftest *user_cfg) +{ + struct mock_iommu_domain_nested *mock_nested; + int i; + + mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); + if (!mock_nested) + return ERR_PTR(-ENOMEM); + mock_nested->parent = mock_parent; + mock_nested->domain.ops = &domain_nested_ops; + mock_nested->domain.type = IOMMU_DOMAIN_NESTED; + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) + mock_nested->iotlb[i] = user_cfg->iotlb; + return &mock_nested->domain; +} + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + struct iommu_domain *domain; + + if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) + return &mock_blocking_domain; + if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + domain = __mock_domain_alloc_paging(iommu_domain_type, false); + if (IS_ERR(domain)) + domain = NULL; + return domain; +} + static struct iommu_domain * mock_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, const struct iommu_user_data *user_data) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - struct iommu_domain *domain; + struct mock_iommu_domain *mock_parent; + struct iommu_hwpt_selftest user_cfg; + int rc; - if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | 
IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + /* must be mock_domain */ + if (!parent) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + + if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + return __mock_domain_alloc_paging(IOMMU_DOMAIN_UNMANAGED, + has_dirty_flag); + } + + /* must be mock_domain_nested */ + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) return ERR_PTR(-EOPNOTSUPP); + if (!parent || parent->ops != mock_ops.default_domain_ops) + return ERR_PTR(-EINVAL); - if (parent || user_data) - return ERR_PTR(-EOPNOTSUPP); + mock_parent = container_of(parent, struct mock_iommu_domain, domain); + if (!mock_parent) + return ERR_PTR(-EINVAL); - if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && - (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) - return ERR_PTR(-EOPNOTSUPP); + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); - domain = mock_domain_alloc(IOMMU_DOMAIN_UNMANAGED); - if (domain && !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) - domain->dirty_ops = &dirty_ops; - if (!domain) - domain = ERR_PTR(-ENOMEM); - return domain; + return __mock_domain_alloc_nested(mock_parent, &user_cfg); } static void mock_domain_free(struct iommu_domain *domain) @@ -434,19 +485,41 @@ static const struct iommu_ops mock_ops = { }, }; +static void mock_domain_free_nested(struct iommu_domain *domain) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + + kfree(mock_nested); +} + +static struct iommu_domain_ops domain_nested_ops = { + .free = mock_domain_free_nested, + .attach_dev = mock_domain_nop_attach, +}; + +static inline struct iommufd_hw_pagetable * 
+__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type) +{ + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type); + if (IS_ERR(obj)) + return ERR_CAST(obj); + return container_of(obj, struct iommufd_hw_pagetable, obj); +} + static inline struct iommufd_hw_pagetable * get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, struct mock_iommu_domain **mock) { struct iommufd_hw_pagetable *hwpt; - struct iommufd_object *obj; - obj = iommufd_get_object(ucmd->ictx, mockpt_id, - IOMMUFD_OBJ_HWPT_PAGING); - if (IS_ERR(obj)) - return ERR_CAST(obj); - hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - if (hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || + hwpt->domain->ops != mock_ops.default_domain_ops) { iommufd_put_object(&hwpt->obj); return ERR_PTR(-EINVAL); } @@ -454,6 +527,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } +static inline struct iommufd_hw_pagetable * +get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain_nested **mock_nested) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || + hwpt->domain->ops != &domain_nested_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + return hwpt; +} + struct mock_bus_type { struct bus_type bus; struct notifier_block nb; From 55a01657cbee07d772b1d3cb144f867a326e4673 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:38 -0700 Subject: [PATCH 39/52] iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs The IOMMU_HWPT_ALLOC ioctl now 
supports passing user_data to allocate a user-managed domain for nested HWPTs. Add its coverage for that. Also, update _test_cmd_hwpt_alloc() and add test_cmd/err_hwpt_alloc_nested(). Link: https://lore.kernel.org/r/20231026043938.63898-11-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 115 ++++++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 3 +- tools/testing/selftests/iommu/iommufd_utils.h | 30 +++-- 3 files changed, 140 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 76a4351e3434..6ed328c863c4 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -264,6 +264,121 @@ TEST_F(iommufd_ioas, ioas_destroy) } } +TEST_F(iommufd_ioas, alloc_hwpt_nested) +{ + const uint32_t min_data_len = + offsetofend(struct iommu_hwpt_selftest, iotlb); + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id[2] = {}; + uint32_t parent_hwpt_id = 0; + uint32_t parent_hwpt_id_not_work = 0; + uint32_t test_hwpt_id = 0; + + if (self->device_id) { + /* Negative tests */ + test_err_hwpt_alloc(ENOENT, self->ioas_id, self->device_id, 0, + &test_hwpt_id); + test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, + &test_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &parent_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &parent_hwpt_id_not_work); + + /* Negative nested tests */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_NONE, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(EOPNOTSUPP, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST + 1, &data, + sizeof(data)); + 
test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + min_data_len - 1); + test_err_hwpt_alloc_nested(EFAULT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, NULL, + sizeof(data)); + test_err_hwpt_alloc_nested( + EOPNOTSUPP, self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id_not_work, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Allocate two nested hwpts sharing one common parent hwpt */ + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Negative test: a nested hwpt on top of a nested hwpt */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + nested_hwpt_id[0], 0, &test_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + /* Negative test: parent hwpt now cannot be freed */ + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, parent_hwpt_id)); + + /* Attach device to nested_hwpt_id[0] that then will be busy */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[0])); + + /* Switch from nested_hwpt_id[0] to nested_hwpt_id[1] */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[1]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[1])); + test_ioctl_destroy(nested_hwpt_id[0]); + + /* Detach from nested_hwpt_id[1] and destroy it */ + test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); + test_ioctl_destroy(nested_hwpt_id[1]); + + /* Detach from the parent hw_pagetable and destroy it */ + 
test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(parent_hwpt_id); + test_ioctl_destroy(parent_hwpt_id_not_work); + } else { + test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0, + &parent_hwpt_id); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[0]); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[1]); + } +} + TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index ff735bdd833e..f590417cd67a 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -615,7 +615,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id)) + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index e263bf80a977..050e9751321c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -154,13 +154,17 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, pt_id, NULL)) static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, - __u32 flags, __u32 *hwpt_id) + __u32 flags, __u32 *hwpt_id, __u32 data_type, + void *data, size_t data_len) { struct 
iommu_hwpt_alloc cmd = { .size = sizeof(cmd), .flags = flags, .dev_id = device_id, .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uint64_t)data, }; int ret; @@ -172,12 +176,24 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, return 0; } -#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, \ - pt_id, flags, hwpt_id)) -#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ - EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc(self->fd, device_id, \ - pt_id, flags, hwpt_id)) +#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ + 0)) +#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0)) + +#define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) +#define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) From a2cdecdf9d234455fdfc8f539bbf5818711bc29d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 24 Oct 2023 08:00:11 -0700 Subject: [PATCH 40/52] iommu/vt-d: Enhance capability check for nested parent domain allocation This adds the scalable mode check before allocating the nested parent domain as checking nested capability is not enough. User may turn off scalable mode which also means no nested support even if the hardware supports it. 
Fixes: c97d1b20d383 ("iommu/vt-d: Add domain_alloc_user op") Link: https://lore.kernel.org/r/20231024150011.44642-1-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index fe67f8d77b09..cb64759b3d95 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4095,7 +4095,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, if (!iommu) return ERR_PTR(-ENODEV); - if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap)) + if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); dirty_tracking = (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 3bb569146229..cc7301579773 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -544,6 +544,8 @@ enum { ecap_pasid((iommu)->ecap)) #define ssads_supported(iommu) (sm_supported(iommu) && \ ecap_slads((iommu)->ecap)) +#define nested_supported(iommu) (sm_supported(iommu) && \ + ecap_nest((iommu)->ecap)) struct pasid_entry; struct pasid_state_entry; From 82b6661c9c35e60946dee536545b4848f25eafab Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 25 Oct 2023 21:42:09 -0700 Subject: [PATCH 41/52] iommufd: Add data structure for Intel VT-d stage-1 domain allocation This adds IOMMU_HWPT_DATA_VTD_S1 for stage-1 hw_pagetable of Intel VT-d and the corresponding data structure for userspace specified parameter for the domain allocation. 
Link: https://lore.kernel.org/r/20231026044216.64964-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index d816deac906f..3ce5ee5f09b6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -361,12 +361,42 @@ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, }; /** From 04f261ac2356ee8962fbd67e38a35e86cbe3c5d8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:10 -0700 Subject: [PATCH 42/52] iommu/vt-d: Extend dmar_domain to support nested domain The nested domain fields are exclusive to those that used for a DMA remapping domain. Use union to avoid memory waste. 
Link: https://lore.kernel.org/r/20231026044216.64964-3-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.h | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index cc7301579773..244f111ea0bb 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -25,6 +25,7 @@ #include #include +#include /* * VT-d hardware uses 4KiB page size regardless of host page size. @@ -605,15 +606,38 @@ struct dmar_domain { struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ - struct dma_pte *pgd; /* virtual address */ - int gaw; /* max guest address width */ - - /* adjusted guest address width, 0 is level 2 30-bit */ - int agaw; int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ - u64 max_addr; /* maximum mapped address */ + union { + /* DMA remapping domain */ + struct { + /* virtual address */ + struct dma_pte *pgd; + /* max guest address width */ + int gaw; + /* + * adjusted guest address width: + * 0: level 2 30-bit + * 1: level 3 39-bit + * 2: level 4 48-bit + * 3: level 5 57-bit + */ + int agaw; + /* maximum mapped address */ + u64 max_addr; + }; + + /* Nested user domain */ + struct { + /* parent page table which the user domain is nested on */ + struct dmar_domain *s2_domain; + /* user page table pointer (in GPA) */ + unsigned long s1_pgtbl; + /* page table attributes */ + struct iommu_hwpt_vtd_s1 s1_cfg; + }; + }; struct iommu_domain domain; /* generic domain data structure for iommu core */ From 79ae1eccd3f7fb010064c0f6242da8f8944c21fd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:11 -0700 Subject: [PATCH 43/52] iommu/vt-d: Add helper for nested domain allocation This adds helper for accepting user 
parameters and allocate a nested domain. Link: https://lore.kernel.org/r/20231026044216.64964-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/iommu.h | 2 ++ drivers/iommu/intel/nested.c | 62 ++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/intel/nested.c diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 7af3b8a4f2a0..5dabf081a779 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 244f111ea0bb..f59a9374f62d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -884,6 +884,8 @@ void *alloc_pgtable_page(int node, gfp_t gfp); void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c new file mode 100644 index 000000000000..56bb205fca06 --- /dev/null +++ b/drivers/iommu/intel/nested.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nested.c - nested mode translation support + * + * Copyright (C) 2023 Intel Corporation + * + * Author: Lu Baolu + * Jacob Pan + * Yi Liu + */ + +#define 
pr_fmt(fmt) "DMAR: " fmt + +#include + +#include "iommu.h" + +static void intel_nested_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_nested_domain_ops = { + .free = intel_nested_domain_free, +}; + +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct iommu_hwpt_vtd_s1 vtd; + struct dmar_domain *domain; + int ret; + + /* Must be nested domain */ + if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) + return ERR_PTR(-EOPNOTSUPP); + if (parent->ops != intel_iommu_ops.default_domain_ops) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&vtd, user_data, + IOMMU_HWPT_DATA_VTD_S1, __reserved); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->use_first_level = true; + domain->s2_domain = s2_domain; + domain->s1_pgtbl = vtd.pgtbl_addr; + domain->s1_cfg = vtd; + domain->domain.ops = &intel_nested_domain_ops; + domain->domain.type = IOMMU_DOMAIN_NESTED; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + spin_lock_init(&domain->lock); + xa_init(&domain->iommu_array); + + return &domain->domain; +} From 111bf85c68f6edb2d06c6705faab9d1649348bdb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:12 -0700 Subject: [PATCH 44/52] iommu/vt-d: Add helper to setup pasid nested translation The configurations are passed in from the user when the user domain is allocated. This helper interprets these configurations according to the data structure defined in uapi/linux/iommufd.h. The EINVAL error will be returned if any of the configurations are not compatible with the hardware capabilities. The caller can retry with another compatible user domain.
The encoding of fields of each pasid entry is defined in section 9.6 of the VT-d spec. Link: https://lore.kernel.org/r/20231026044216.64964-5-yi.l.liu@intel.com Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/pasid.c | 112 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 2 + 2 files changed, 114 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index b9264b9174e8..74e8e4c17e81 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -370,6 +370,15 @@ static inline bool pasid_get_ssade(struct pasid_entry *pe) return pasid_get_bits(&pe->val[0]) & (1 << 9); } +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + /* * Setup the WPE(Write Protect Enable) field (Bit 132) of a * scalable mode PASID entry. @@ -437,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -822,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, if (!cap_caching_mode(iommu->cap)) devtlb_invalidation_with_pasid(iommu, dev, pasid); } + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. 
The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct dma_pte *pgd = s2_domain->pgd; + struct pasid_entry *pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + 
pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 958050b093aa..dd37611175cc 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -113,6 +113,8 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); From d86724d4dc45ba2ed80eebb704e12bb71c35d901 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 25 Oct 2023 21:42:13 -0700 Subject: [PATCH 45/52] iommu/vt-d: Make domain attach helpers to be extern This makes the helpers visible to nested.c. 
Link: https://lore.kernel.org/r/20231026044216.64964-6-yi.l.liu@intel.com Suggested-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 15 ++++++--------- drivers/iommu/intel/iommu.h | 7 +++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cb64759b3d95..292baa64188b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) -static void device_block_translation(struct device *dev); static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); @@ -561,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) } /* Some capabilities may be different across iommus */ -static void domain_update_iommu_cap(struct dmar_domain *domain) +void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); @@ -1779,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) return domain; } -static int domain_attach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; unsigned long ndomains; @@ -1829,8 +1827,7 @@ err_unlock: return ret; } -static void domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; @@ -3975,7 +3972,7 @@ static void dmar_remove_one_dev_info(struct device *dev) * all DMA requests without PASID from the device are blocked. 
If the page * table has been set, clean up the data structures. */ -static void device_block_translation(struct device *dev) +void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -4128,8 +4125,8 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(to_dmar_domain(domain)); } -static int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f59a9374f62d..6a97711f947a 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -878,6 +878,13 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void device_block_translation(struct device *dev); +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev); +void domain_update_iommu_cap(struct dmar_domain *domain); + int dmar_ir_support(void); void *alloc_pgtable_page(int node, gfp_t gfp); From 9838f2bb6b6be1e648b9377fc97ee7b18d9f2fbf Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 25 Oct 2023 21:42:14 -0700 Subject: [PATCH 46/52] iommu/vt-d: Set the nested domain to a device This adds the helper for setting the nested domain to a device, hence enabling nested domain usage on Intel VT-d.
Link: https://lore.kernel.org/r/20231026044216.64964-7-yi.l.liu@intel.com Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/nested.c | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 56bb205fca06..b560ab76e126 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -12,8 +12,61 @@ #define pr_fmt(fmt) "DMAR: " fmt #include +#include +#include #include "iommu.h" +#include "pasid.h" + +static int intel_nested_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret = 0; + + if (info->domain) + device_block_translation(dev); + + if (iommu->agaw < dmar_domain->s2_domain->agaw) { + dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); + return -ENODEV; + } + + /* + * Stage-1 domain cannot work alone, it is nested on a s2_domain. + * The s2_domain will be used in nested translation, hence needs + * to ensure the s2_domain is compatible with this IOMMU. 
+ */ + ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + if (ret) { + dev_err_ratelimited(dev, "s2 domain is not compatible\n"); + return ret; + } + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) { + dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); + return ret; + } + + ret = intel_pasid_setup_nested(iommu, dev, + IOMMU_NO_PASID, dmar_domain); + if (ret) { + domain_detach_iommu(dmar_domain, iommu); + dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); + return ret; + } + + info->domain = dmar_domain; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&info->link, &dmar_domain->devices); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return 0; +} static void intel_nested_domain_free(struct iommu_domain *domain) { @@ -21,6 +74,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain) } static const struct iommu_domain_ops intel_nested_domain_ops = { + .attach_dev = intel_nested_attach_dev, .free = intel_nested_domain_free, }; From b41e38e225398191aaa0f1115d6234f57ffd0741 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:15 -0700 Subject: [PATCH 47/52] iommu/vt-d: Add nested domain allocation This adds the support for IOMMU_HWPT_DATA_VTD_S1 type. And 'nested_parent' is added to mark the nested parent domain to sanitize the input parent domain. 
Link: https://lore.kernel.org/r/20231026044216.64964-8-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 39 ++++++++++++++++++------------------ drivers/iommu/intel/iommu.h | 1 + drivers/iommu/intel/nested.c | 3 ++- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 292baa64188b..4ce372d5d4f3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4077,38 +4077,39 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, const struct iommu_user_data *user_data) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + struct intel_iommu *iommu = info->iommu; struct iommu_domain *domain; - struct intel_iommu *iommu; - bool dirty_tracking; + + /* Must be NESTING domain */ + if (parent) { + if (!nested_supported(iommu) || flags) + return ERR_PTR(-EOPNOTSUPP); + return intel_nested_domain_alloc(parent, user_data); + } if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); - - if (parent || user_data) + if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); - - iommu = device_to_iommu(dev, NULL, NULL); - if (!iommu) - return ERR_PTR(-ENODEV); - - if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !nested_supported(iommu)) - return ERR_PTR(-EOPNOTSUPP); - - dirty_tracking = (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING); - if (dirty_tracking && !ssads_supported(iommu)) + if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); /* - * domain_alloc_user op needs to fully initialize a domain - * before return, so uses iommu_domain_alloc() here for - * simple. 
+ * domain_alloc_user op needs to fully initialize a domain before + * return, so uses iommu_domain_alloc() here for simple. */ domain = iommu_domain_alloc(dev->bus); if (!domain) - domain = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); - if (!IS_ERR(domain) && dirty_tracking) { + if (nested_parent) + to_dmar_domain(domain)->nested_parent = true; + + if (dirty_tracking) { if (to_dmar_domain(domain)->use_first_level) { iommu_domain_free(domain); return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 6a97711f947a..ba9be915eb84 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -601,6 +601,7 @@ struct dmar_domain { * level. */ u8 dirty_tracking:1; /* Dirty tracking is enabled */ + u8 nested_parent:1; /* Has other domains nested on it */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index b560ab76e126..b5a5563ab32c 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -89,7 +89,8 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, /* Must be nested domain */ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) return ERR_PTR(-EOPNOTSUPP); - if (parent->ops != intel_iommu_ops.default_domain_ops) + if (parent->ops != intel_iommu_ops.default_domain_ops || + !s2_domain->nested_parent) return ERR_PTR(-EINVAL); ret = iommu_copy_struct_from_user(&vtd, user_data, From 03476e687eb07b94f7cdb07cd3c7c4304b6c58b3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:16 -0700 Subject: [PATCH 48/52] iommu/vt-d: Disallow read-only mappings to nest parent domain When remapping hardware is configured by system software in scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended Access bit if enabled) in first-stage page-table entries even 
when second-stage mappings indicate that corresponding first-stage page-table is Read-Only. As a result, contents of pages designated by VMM as Read-Only can be modified by IOMMU via PML5E (PML4E for 4-level tables) access as part of address translation process due to DMAs issued by Guest. This disallows read-only mappings in the domain that is supposed to be used as a nested parent. See the Sapphire Rapids Specification Update [1], erratum SPR17, for details. Userspace can learn of this limitation by checking the IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 flag reported in the IOMMU_GET_HW_INFO ioctl. [1] https://www.intel.com/content/www/us/en/content-details/772415/content-details.html Link: https://lore.kernel.org/r/20231026044216.64964-9-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 6 ++++++ include/uapi/linux/iommufd.h | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4ce372d5d4f3..a2c429855cc0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2194,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { + pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { @@ -4850,6 +4855,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) if (!vtd) return ERR_PTR(-ENOMEM); + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); diff --git
a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3ce5ee5f09b6..0b2bc6252e2c 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -443,10 +443,20 @@ struct iommu_hwpt_alloc { }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. + * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec From e7250ab7ca4998fe026f2149805b03e09dc32498 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 28 Oct 2023 01:29:42 +0900 Subject: [PATCH 49/52] iommufd: Fix missing update of domains_itree after splitting iopt_area In iopt_area_split(), if the original iopt_area has filled a domain and is linked to domains_itree, pages_nodes have to be properly reinserted. Otherwise the domains_itree becomes corrupted and we will UAF. 
Fixes: 51fe6141f0f6 ("iommufd: Data structure to provide IOVA to PFN mapping") Link: https://lore.kernel.org/r/20231027162941.2864615-2-den@valinux.co.jp Cc: stable@vger.kernel.org Signed-off-by: Koichiro Den Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 9f060abe53b6..c3e7791f8201 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1220,6 +1220,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (WARN_ON(rc)) goto err_remove_lhs; + /* + * If the original area has filled a domain, domains_itree has to be + * updated. + */ + if (area->storage_domain) { + interval_tree_remove(&area->pages_node, &pages->domains_itree); + interval_tree_insert(&lhs->pages_node, &pages->domains_itree); + interval_tree_insert(&rhs->pages_node, &pages->domains_itree); + } + lhs->storage_domain = area->storage_domain; lhs->pages = area->pages; rhs->storage_domain = area->storage_domain; From 361d744ddd61de065fbeb042aaed590d32dd92ec Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 30 Oct 2023 11:26:33 -0300 Subject: [PATCH 50/52] iommufd: Add iopt_area_alloc() We never initialize the two interval tree nodes, and zero fill is not the same as RB_CLEAR_NODE. This can hide issues where we missed adding the area to the trees. Factor out the allocation and clear the two nodes. 
Fixes: 51fe6141f0f6 ("iommufd: Data structure to provide IOVA to PFN mapping") Link: https://lore.kernel.org/r/20231030145035.GG691768@ziepe.ca Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.c | 18 +++++++++++++++--- drivers/iommu/iommufd/pages.c | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index c3e7791f8201..504ac1b01b2d 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -222,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, return 0; } +static struct iopt_area *iopt_area_alloc(void) +{ + struct iopt_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!area) + return NULL; + RB_CLEAR_NODE(&area->node.rb); + RB_CLEAR_NODE(&area->pages_node.rb); + return area; +} + static int iopt_alloc_area_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, @@ -232,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int rc = 0; list_for_each_entry(elm, pages_list, next) { - elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); + elm->area = iopt_area_alloc(); if (!elm->area) return -ENOMEM; } @@ -1177,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; - lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + lhs = iopt_area_alloc(); if (!lhs) return -ENOMEM; - rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + rhs = iopt_area_alloc(); if (!rhs) { rc = -ENOMEM; goto err_free_lhs; diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 8d9aa297c117..528f356238b3 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) 
area, domain, iopt_area_index(area), iopt_area_last_index(area)); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); area->storage_domain = NULL; From 2e22aac3ea9cfc0ec3209c96644f60c1806a8117 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 30 Oct 2023 11:34:46 +0000 Subject: [PATCH 51/52] iommufd/selftest: Fix page-size check in iommufd_test_dirty() iommufd_test_dirty()/IOMMU_TEST_OP_DIRTY sets the dirty bits in the mock domain implementation that the userspace side validates against what it obtains via the UAPI. However in introducing iommufd_test_dirty() it forgot to validate page_size being 0 leading to two possible divide-by-zero problems: one at the beginning when calculating @max and while calculating the IOVA in the XArray PFN tracking list. While at it, validate the length to require non-zero value as well, as we can't be allocating a 0-sized bitmap. 
Link: https://lore.kernel.org/r/20231030113446.7056-1-joao.m.martins@oracle.com Reported-by: syzbot+25dc7383c30ecdc83c38@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-iommu/00000000000005f6aa0608b9220f@google.com/ Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 6684ab4cdc7a..a11d29f368ff 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1195,14 +1195,15 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, unsigned long page_size, void __user *uptr, u32 flags) { - unsigned long bitmap_size, i, max = length / page_size; + unsigned long bitmap_size, i, max; struct iommu_test_cmd *cmd = ucmd->cmd; struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; int rc, count = 0; void *tmp; - if (iova % page_size || length % page_size || !uptr) + if (!page_size || !length || iova % page_size || length % page_size || + !uptr) return -EINVAL; hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); @@ -1214,6 +1215,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, goto out_put; } + max = length / page_size; bitmap_size = max / BITS_PER_BYTE; tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); From b2b67c997bf74453f3469d8b54e4859f190943bd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 30 Oct 2023 15:11:20 -0300 Subject: [PATCH 52/52] iommufd: Organize the mock domain alloc functions closer to Joerg's tree Patches in Joerg's iommu tree convert the mock driver to use domain_alloc_paging() and clash badly with the way the selftest changes for nesting were structured. Massage the selftest so that it looks closer to the code after the domain_alloc_paging() conversion to ease the merge.
Change __mock_domain_alloc_paging() into mock_domain_alloc_paging() in the same way as the iommu tree. The merge resolution then trivially takes both and deletes mock_domain_alloc(). Link: https://lore.kernel.org/r/0-v1-90a855762c96+19de-mock_merge_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 35 +++++++++++++++----------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index a11d29f368ff..d43a87737c1e 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -20,6 +20,8 @@ static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; static struct platform_device *selftest_iommu_dev; +static const struct iommu_ops mock_ops; +static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; @@ -222,24 +224,18 @@ const struct iommu_dirty_ops dirty_ops = { .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; -static const struct iommu_ops mock_ops; -static struct iommu_domain_ops domain_nested_ops; - -static struct iommu_domain * -__mock_domain_alloc_paging(unsigned int iommu_domain_type, bool needs_dirty_ops) +static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); if (!mock) - return ERR_PTR(-ENOMEM); + return NULL; mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; mock->domain.ops = mock_ops.default_domain_ops; - if (needs_dirty_ops) - mock->domain.dirty_ops = &dirty_ops; - mock->domain.type = iommu_domain_type; + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); return &mock->domain; } @@ -264,16 +260,11 @@ __mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, 
static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) { - struct iommu_domain *domain; - if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) return &mock_blocking_domain; - if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - domain = __mock_domain_alloc_paging(iommu_domain_type, false); - if (IS_ERR(domain)) - domain = NULL; - return domain; + if (iommu_domain_type == IOMMU_DOMAIN_UNMANAGED) + return mock_domain_alloc_paging(NULL); + return NULL; } static struct iommu_domain * @@ -290,14 +281,20 @@ mock_domain_alloc_user(struct device *dev, u32 flags, struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); if (user_data || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - return __mock_domain_alloc_paging(IOMMU_DOMAIN_UNMANAGED, - has_dirty_flag); + domain = mock_domain_alloc_paging(NULL); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + container_of(domain, struct mock_iommu_domain, domain) + ->domain.dirty_ops = &dirty_ops; + return domain; } /* must be mock_domain_nested */