drm/xe: Convert existing drm_exec transactions for exhaustive eviction

Convert existing drm_exec transactions, like GT pagefault validation,
non-LR exec() IOCTL and the rebind worker to support
exhaustive eviction using the xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check error return from xe_validation_guard()

v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v1
Link: https://lore.kernel.org/r/20250908101246.65025-6-thomas.hellstrom@linux.intel.com
This commit is contained in:
Thomas Hellström
2025-09-08 12:12:38 +02:00
parent 1710cd5c8c
commit 8f25e5abcb
4 changed files with 75 additions and 106 deletions
+8 -12
View File
@@ -120,10 +120,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
struct drm_exec *exec = &vm_exec.exec;
u32 i, num_syncs, num_ufence = 0;
struct xe_validation_ctx ctx;
struct xe_sched_job *job;
struct xe_vm *vm;
bool write_locked, skip_retry = false;
ktime_t end = 0;
int err = 0;
struct xe_hw_engine_group *group;
enum xe_hw_engine_group_execution_mode mode, previous_mode;
@@ -251,17 +251,12 @@ retry:
if (err)
goto err_unlock_list;
vm_exec.vm = &vm->gpuvm;
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
if (xe_vm_in_lr_mode(vm)) {
drm_exec_init(exec, vm_exec.flags, 0);
} else {
err = drm_gpuvm_exec_lock(&vm_exec);
if (err) {
if (xe_vm_validate_should_retry(exec, err, &end))
err = -EAGAIN;
if (!xe_vm_in_lr_mode(vm)) {
vm_exec.vm = &vm->gpuvm;
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
if (err)
goto err_unlock_list;
}
}
if (xe_vm_is_closed_or_banned(q->vm)) {
@@ -355,7 +350,8 @@ err_put_job:
if (err)
xe_sched_job_put(job);
err_exec:
drm_exec_fini(exec);
if (!xe_vm_in_lr_mode(vm))
xe_validation_ctx_fini(&ctx);
err_unlock_list:
up_read(&vm->lock);
if (err == -EAGAIN && !skip_retry)
+9 -11
View File
@@ -96,9 +96,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
{
struct xe_vm *vm = xe_vma_vm(vma);
struct xe_tile *tile = gt_to_tile(gt);
struct xe_validation_ctx ctx;
struct drm_exec exec;
struct dma_fence *fence;
ktime_t end = 0;
int err, needs_vram;
lockdep_assert_held_write(&vm->lock);
@@ -127,12 +127,11 @@ retry_userptr:
}
/* Lock VM and BOs dma-resv */
drm_exec_init(&exec, 0, 0);
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
drm_exec_until_all_locked(&exec) {
err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram);
drm_exec_retry_on_contention(&exec);
if (xe_vm_validate_should_retry(&exec, err, &end))
err = -EAGAIN;
xe_validation_retry_on_oom(&ctx, &err);
if (err)
goto unlock_dma_resv;
@@ -143,8 +142,7 @@ retry_userptr:
xe_vm_set_validation_exec(vm, NULL);
if (IS_ERR(fence)) {
err = PTR_ERR(fence);
if (xe_vm_validate_should_retry(&exec, err, &end))
err = -EAGAIN;
xe_validation_retry_on_oom(&ctx, &err);
goto unlock_dma_resv;
}
}
@@ -153,7 +151,7 @@ retry_userptr:
dma_fence_put(fence);
unlock_dma_resv:
drm_exec_fini(&exec);
xe_validation_ctx_fini(&ctx);
if (err == -EAGAIN)
goto retry_userptr;
@@ -535,6 +533,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
{
struct xe_device *xe = gt_to_xe(gt);
struct xe_tile *tile = gt_to_tile(gt);
struct xe_validation_ctx ctx;
struct drm_exec exec;
struct xe_vm *vm;
struct xe_vma *vma;
@@ -564,15 +563,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
goto unlock_vm;
/* Lock VM and BOs dma-resv */
drm_exec_init(&exec, 0, 0);
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
drm_exec_until_all_locked(&exec) {
ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram);
drm_exec_retry_on_contention(&exec);
if (ret)
break;
xe_validation_retry_on_oom(&ctx, &ret);
}
drm_exec_fini(&exec);
xe_validation_ctx_fini(&ctx);
unlock_vm:
up_read(&vm->lock);
xe_vm_put(vm);
+58 -81
View File
@@ -210,6 +210,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
.num_fences = 1,
};
struct drm_exec *exec = &vm_exec.exec;
struct xe_validation_ctx ctx;
struct dma_fence *pfence;
int err;
bool wait;
@@ -217,7 +218,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
down_write(&vm->lock);
err = drm_gpuvm_exec_lock(&vm_exec);
err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
if (err)
goto out_up_write;
@@ -249,7 +250,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
xe_svm_notifier_unlock(vm);
out_fini:
drm_exec_fini(exec);
xe_validation_ctx_fini(&ctx);
out_up_write:
up_write(&vm->lock);
@@ -313,39 +314,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
/* TODO: Inform user the VM is banned */
}
/**
 * xe_vm_validate_should_retry() - Decide whether a failed validation
 * should be retried.
 * @exec: The drm_exec object used for locking before validation.
 *        Not referenced by this function's body.
 * @err: The error code produced by the validation attempt.
 * @end: Deadline cookie. Must be 0 before the first call and must be
 *       passed unchanged to subsequent calls of the same retry sequence.
 *
 * Only -ENOMEM is considered retryable. On the first -ENOMEM of a
 * sequence (@end still 0) a deadline of now plus
 * XE_VM_REBIND_RETRY_TIMEOUT_MS is stored in @end. As long as that
 * deadline has not been reached, the function sleeps for 20 ms and asks
 * the caller to retry.
 *
 * Return: true if the caller should retry; false if @err is not -ENOMEM
 * or the deadline has passed.
 */
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
{
	ktime_t now;

	if (err == -ENOMEM) {
		now = ktime_get();

		/* Arm the deadline once; later calls reuse the stored value. */
		if (!*end)
			*end = ktime_add_ms(now, XE_VM_REBIND_RETRY_TIMEOUT_MS);

		if (ktime_before(now, *end)) {
			/* Back off briefly before the caller tries again. */
			msleep(20);
			return true;
		}
	}

	return false;
}
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
{
struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
@@ -476,10 +444,10 @@ void xe_vm_resume_rebind_worker(struct xe_vm *vm)
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
struct xe_validation_ctx ctx;
struct drm_exec exec;
unsigned int fence_count = 0;
LIST_HEAD(preempt_fences);
ktime_t end = 0;
int err = 0;
long wait;
int __maybe_unused tries = 0;
@@ -507,18 +475,19 @@ retry:
goto out_unlock_outer;
}
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
(struct xe_val_flags) {.interruptible = true});
if (err)
goto out_unlock_outer;
drm_exec_until_all_locked(&exec) {
bool done = false;
err = xe_preempt_work_begin(&exec, vm, &done);
drm_exec_retry_on_contention(&exec);
xe_validation_retry_on_oom(&ctx, &err);
if (err || done) {
drm_exec_fini(&exec);
if (err && xe_vm_validate_should_retry(&exec, err, &end))
err = -EAGAIN;
xe_validation_ctx_fini(&ctx);
goto out_unlock_outer;
}
}
@@ -566,7 +535,7 @@ retry:
xe_svm_notifier_unlock(vm);
out_unlock:
drm_exec_fini(&exec);
xe_validation_ctx_fini(&ctx);
out_unlock_outer:
if (err == -EAGAIN) {
trace_xe_vm_rebind_worker_retry(vm);
@@ -1164,20 +1133,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
{
struct xe_device *xe = xe_vma_vm(vma)->xe;
struct xe_validation_ctx ctx;
struct drm_exec exec;
int err;
int err = 0;
drm_exec_init(&exec, 0, 0);
drm_exec_until_all_locked(&exec) {
xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
err = xe_vm_lock_vma(&exec, vma);
drm_exec_retry_on_contention(&exec);
if (XE_WARN_ON(err))
break;
xe_vma_destroy(vma, NULL);
}
xe_vma_destroy(vma, NULL);
drm_exec_fini(&exec);
xe_assert(xe, !err);
}
struct xe_vma *
@@ -2383,6 +2351,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
struct xe_vma_mem_attr *attr, unsigned int flags)
{
struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
struct xe_validation_ctx ctx;
struct drm_exec exec;
struct xe_vma *vma;
int err = 0;
@@ -2390,9 +2359,9 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
lockdep_assert_held_write(&vm->lock);
if (bo) {
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
drm_exec_until_all_locked(&exec) {
err = 0;
err = 0;
xe_validation_guard(&ctx, &vm->xe->val, &exec,
(struct xe_val_flags) {.interruptible = true}, err) {
if (!bo->vm) {
err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
drm_exec_retry_on_contention(&exec);
@@ -2401,27 +2370,35 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
err = drm_exec_lock_obj(&exec, &bo->ttm.base);
drm_exec_retry_on_contention(&exec);
}
if (err) {
drm_exec_fini(&exec);
if (err)
return ERR_PTR(err);
vma = xe_vma_create(vm, bo, op->gem.offset,
op->va.addr, op->va.addr +
op->va.range - 1, attr, flags);
if (IS_ERR(vma))
return vma;
if (!bo->vm) {
err = add_preempt_fences(vm, bo);
if (err) {
prep_vma_destroy(vm, vma, false);
xe_vma_destroy(vma, NULL);
}
}
}
if (err)
return ERR_PTR(err);
} else {
vma = xe_vma_create(vm, NULL, op->gem.offset,
op->va.addr, op->va.addr +
op->va.range - 1, attr, flags);
if (IS_ERR(vma))
return vma;
if (xe_vma_is_userptr(vma))
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
}
vma = xe_vma_create(vm, bo, op->gem.offset,
op->va.addr, op->va.addr +
op->va.range - 1, attr, flags);
if (IS_ERR(vma))
goto err_unlock;
if (xe_vma_is_userptr(vma))
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
else if (!xe_vma_has_no_bo(vma) && !bo->vm)
err = add_preempt_fences(vm, bo);
err_unlock:
if (bo)
drm_exec_fini(&exec);
if (err) {
prep_vma_destroy(vm, vma, false);
xe_vma_destroy_unlocked(vma);
@@ -3220,21 +3197,23 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
struct xe_vma_ops *vops)
{
struct xe_validation_ctx ctx;
struct drm_exec exec;
struct dma_fence *fence;
int err;
int err = 0;
lockdep_assert_held_write(&vm->lock);
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(&exec) {
xe_validation_guard(&ctx, &vm->xe->val, &exec,
((struct xe_val_flags) {
.interruptible = true,
.exec_ignore_duplicates = true,
}), err) {
err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
drm_exec_retry_on_contention(&exec);
if (err) {
fence = ERR_PTR(err);
goto unlock;
}
xe_validation_retry_on_oom(&ctx, &err);
if (err)
return ERR_PTR(err);
xe_vm_set_validation_exec(vm, &exec);
fence = ops_execute(vm, vops);
@@ -3242,15 +3221,13 @@ static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
if (IS_ERR(fence)) {
if (PTR_ERR(fence) == -ENODATA)
vm_bind_ioctl_ops_fini(vm, vops, NULL);
goto unlock;
return fence;
}
vm_bind_ioctl_ops_fini(vm, vops, fence);
}
unlock:
drm_exec_fini(&exec);
return fence;
return err ? ERR_PTR(err) : fence;
}
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
-2
View File
@@ -260,8 +260,6 @@ static inline void xe_vm_reactivate_rebind(struct xe_vm *vm)
}
}
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,