drm/xe: Convert existing drm_exec transactions for exhaustive eviction
Convert existing drm_exec transactions, like GT pagefault validation,
non-LR exec() IOCTL and the rebind worker, to support exhaustive
eviction using the xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check error return from xe_validation_guard()
v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v1
Link: https://lore.kernel.org/r/20250908101246.65025-6-thomas.hellstrom@linux.intel.com
This commit is contained in:
@@ -120,10 +120,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
|
||||
struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
|
||||
struct drm_exec *exec = &vm_exec.exec;
|
||||
u32 i, num_syncs, num_ufence = 0;
|
||||
struct xe_validation_ctx ctx;
|
||||
struct xe_sched_job *job;
|
||||
struct xe_vm *vm;
|
||||
bool write_locked, skip_retry = false;
|
||||
ktime_t end = 0;
|
||||
int err = 0;
|
||||
struct xe_hw_engine_group *group;
|
||||
enum xe_hw_engine_group_execution_mode mode, previous_mode;
|
||||
@@ -251,17 +251,12 @@ retry:
|
||||
if (err)
|
||||
goto err_unlock_list;
|
||||
|
||||
vm_exec.vm = &vm->gpuvm;
|
||||
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
|
||||
if (xe_vm_in_lr_mode(vm)) {
|
||||
drm_exec_init(exec, vm_exec.flags, 0);
|
||||
} else {
|
||||
err = drm_gpuvm_exec_lock(&vm_exec);
|
||||
if (err) {
|
||||
if (xe_vm_validate_should_retry(exec, err, &end))
|
||||
err = -EAGAIN;
|
||||
if (!xe_vm_in_lr_mode(vm)) {
|
||||
vm_exec.vm = &vm->gpuvm;
|
||||
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
|
||||
err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
|
||||
if (err)
|
||||
goto err_unlock_list;
|
||||
}
|
||||
}
|
||||
|
||||
if (xe_vm_is_closed_or_banned(q->vm)) {
|
||||
@@ -355,7 +350,8 @@ err_put_job:
|
||||
if (err)
|
||||
xe_sched_job_put(job);
|
||||
err_exec:
|
||||
drm_exec_fini(exec);
|
||||
if (!xe_vm_in_lr_mode(vm))
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
err_unlock_list:
|
||||
up_read(&vm->lock);
|
||||
if (err == -EAGAIN && !skip_retry)
|
||||
|
||||
@@ -96,9 +96,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
|
||||
{
|
||||
struct xe_vm *vm = xe_vma_vm(vma);
|
||||
struct xe_tile *tile = gt_to_tile(gt);
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
struct dma_fence *fence;
|
||||
ktime_t end = 0;
|
||||
int err, needs_vram;
|
||||
|
||||
lockdep_assert_held_write(&vm->lock);
|
||||
@@ -127,12 +127,11 @@ retry_userptr:
|
||||
}
|
||||
|
||||
/* Lock VM and BOs dma-resv */
|
||||
drm_exec_init(&exec, 0, 0);
|
||||
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
if (xe_vm_validate_should_retry(&exec, err, &end))
|
||||
err = -EAGAIN;
|
||||
xe_validation_retry_on_oom(&ctx, &err);
|
||||
if (err)
|
||||
goto unlock_dma_resv;
|
||||
|
||||
@@ -143,8 +142,7 @@ retry_userptr:
|
||||
xe_vm_set_validation_exec(vm, NULL);
|
||||
if (IS_ERR(fence)) {
|
||||
err = PTR_ERR(fence);
|
||||
if (xe_vm_validate_should_retry(&exec, err, &end))
|
||||
err = -EAGAIN;
|
||||
xe_validation_retry_on_oom(&ctx, &err);
|
||||
goto unlock_dma_resv;
|
||||
}
|
||||
}
|
||||
@@ -153,7 +151,7 @@ retry_userptr:
|
||||
dma_fence_put(fence);
|
||||
|
||||
unlock_dma_resv:
|
||||
drm_exec_fini(&exec);
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
if (err == -EAGAIN)
|
||||
goto retry_userptr;
|
||||
|
||||
@@ -535,6 +533,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
|
||||
{
|
||||
struct xe_device *xe = gt_to_xe(gt);
|
||||
struct xe_tile *tile = gt_to_tile(gt);
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
struct xe_vm *vm;
|
||||
struct xe_vma *vma;
|
||||
@@ -564,15 +563,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
|
||||
goto unlock_vm;
|
||||
|
||||
/* Lock VM and BOs dma-resv */
|
||||
drm_exec_init(&exec, 0, 0);
|
||||
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
if (ret)
|
||||
break;
|
||||
xe_validation_retry_on_oom(&ctx, &ret);
|
||||
}
|
||||
|
||||
drm_exec_fini(&exec);
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
unlock_vm:
|
||||
up_read(&vm->lock);
|
||||
xe_vm_put(vm);
|
||||
|
||||
+58
-81
@@ -210,6 +210,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
|
||||
.num_fences = 1,
|
||||
};
|
||||
struct drm_exec *exec = &vm_exec.exec;
|
||||
struct xe_validation_ctx ctx;
|
||||
struct dma_fence *pfence;
|
||||
int err;
|
||||
bool wait;
|
||||
@@ -217,7 +218,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
|
||||
xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
|
||||
|
||||
down_write(&vm->lock);
|
||||
err = drm_gpuvm_exec_lock(&vm_exec);
|
||||
err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
|
||||
if (err)
|
||||
goto out_up_write;
|
||||
|
||||
@@ -249,7 +250,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
|
||||
xe_svm_notifier_unlock(vm);
|
||||
|
||||
out_fini:
|
||||
drm_exec_fini(exec);
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
out_up_write:
|
||||
up_write(&vm->lock);
|
||||
|
||||
@@ -313,39 +314,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
|
||||
/* TODO: Inform user the VM is banned */
|
||||
}
|
||||
|
||||
/**
|
||||
* xe_vm_validate_should_retry() - Whether to retry after a validate error.
|
||||
* @exec: The drm_exec object used for locking before validation. It is
|
||||
* not accessed by this function.
|
||||
* @err: The error returned from ttm_bo_validate(). Only -ENOMEM can lead
|
||||
* to a retry recommendation.
|
||||
* @end: A ktime_t cookie that should be set to 0 before first use and
|
||||
* that should be reused on subsequent calls. While it is 0, a call with
|
||||
* -ENOMEM sets it to the current time plus XE_VM_REBIND_RETRY_TIMEOUT_MS;
|
||||
* afterwards it serves as the deadline for further retries.
|
||||
*
|
||||
* With multiple active VMs, under memory pressure, it is possible that
|
||||
* ttm_bo_validate() runs into -EDEADLK and in such case returns -ENOMEM.
|
||||
* Until ttm properly handles locking in such scenarios, best thing the
|
||||
* driver can do is retry with a timeout.
|
||||
*
|
||||
* If @err is -ENOMEM and the deadline stored in @end has not yet passed,
|
||||
* sleep for 20 ms and recommend a retry. This function does not unlock
|
||||
* or otherwise touch @exec.
|
||||
* NOTE(review): releasing the locked objects before the rerun appears to
|
||||
* be left to the caller - confirm against the call sites.
|
||||
*
|
||||
* Return: true if a retry after drm_exec_init() is recommended;
|
||||
* false otherwise.
|
||||
*/
|
||||
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
|
||||
{
|
||||
ktime_t cur;
|
||||
|
||||
if (err != -ENOMEM)
|
||||
return false;
|
||||
|
||||
cur = ktime_get();
|
||||
*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
|
||||
if (!ktime_before(cur, *end))
|
||||
return false;
|
||||
|
||||
msleep(20);
|
||||
return true;
|
||||
}
|
||||
|
||||
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
|
||||
{
|
||||
struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
|
||||
@@ -476,10 +444,10 @@ void xe_vm_resume_rebind_worker(struct xe_vm *vm)
|
||||
static void preempt_rebind_work_func(struct work_struct *w)
|
||||
{
|
||||
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
unsigned int fence_count = 0;
|
||||
LIST_HEAD(preempt_fences);
|
||||
ktime_t end = 0;
|
||||
int err = 0;
|
||||
long wait;
|
||||
int __maybe_unused tries = 0;
|
||||
@@ -507,18 +475,19 @@ retry:
|
||||
goto out_unlock_outer;
|
||||
}
|
||||
|
||||
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
|
||||
err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
|
||||
(struct xe_val_flags) {.interruptible = true});
|
||||
if (err)
|
||||
goto out_unlock_outer;
|
||||
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
bool done = false;
|
||||
|
||||
err = xe_preempt_work_begin(&exec, vm, &done);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
xe_validation_retry_on_oom(&ctx, &err);
|
||||
if (err || done) {
|
||||
drm_exec_fini(&exec);
|
||||
if (err && xe_vm_validate_should_retry(&exec, err, &end))
|
||||
err = -EAGAIN;
|
||||
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
goto out_unlock_outer;
|
||||
}
|
||||
}
|
||||
@@ -566,7 +535,7 @@ retry:
|
||||
xe_svm_notifier_unlock(vm);
|
||||
|
||||
out_unlock:
|
||||
drm_exec_fini(&exec);
|
||||
xe_validation_ctx_fini(&ctx);
|
||||
out_unlock_outer:
|
||||
if (err == -EAGAIN) {
|
||||
trace_xe_vm_rebind_worker_retry(vm);
|
||||
@@ -1164,20 +1133,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
|
||||
|
||||
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
|
||||
{
|
||||
struct xe_device *xe = xe_vma_vm(vma)->xe;
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
int err;
|
||||
int err = 0;
|
||||
|
||||
drm_exec_init(&exec, 0, 0);
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
|
||||
err = xe_vm_lock_vma(&exec, vma);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
if (XE_WARN_ON(err))
|
||||
break;
|
||||
xe_vma_destroy(vma, NULL);
|
||||
}
|
||||
|
||||
xe_vma_destroy(vma, NULL);
|
||||
|
||||
drm_exec_fini(&exec);
|
||||
xe_assert(xe, !err);
|
||||
}
|
||||
|
||||
struct xe_vma *
|
||||
@@ -2383,6 +2351,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
|
||||
struct xe_vma_mem_attr *attr, unsigned int flags)
|
||||
{
|
||||
struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
struct xe_vma *vma;
|
||||
int err = 0;
|
||||
@@ -2390,9 +2359,9 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
|
||||
lockdep_assert_held_write(&vm->lock);
|
||||
|
||||
if (bo) {
|
||||
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
err = 0;
|
||||
err = 0;
|
||||
xe_validation_guard(&ctx, &vm->xe->val, &exec,
|
||||
(struct xe_val_flags) {.interruptible = true}, err) {
|
||||
if (!bo->vm) {
|
||||
err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
@@ -2401,27 +2370,35 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
|
||||
err = drm_exec_lock_obj(&exec, &bo->ttm.base);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
}
|
||||
if (err) {
|
||||
drm_exec_fini(&exec);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
vma = xe_vma_create(vm, bo, op->gem.offset,
|
||||
op->va.addr, op->va.addr +
|
||||
op->va.range - 1, attr, flags);
|
||||
if (IS_ERR(vma))
|
||||
return vma;
|
||||
|
||||
if (!bo->vm) {
|
||||
err = add_preempt_fences(vm, bo);
|
||||
if (err) {
|
||||
prep_vma_destroy(vm, vma, false);
|
||||
xe_vma_destroy(vma, NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
} else {
|
||||
vma = xe_vma_create(vm, NULL, op->gem.offset,
|
||||
op->va.addr, op->va.addr +
|
||||
op->va.range - 1, attr, flags);
|
||||
if (IS_ERR(vma))
|
||||
return vma;
|
||||
|
||||
if (xe_vma_is_userptr(vma))
|
||||
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
|
||||
}
|
||||
vma = xe_vma_create(vm, bo, op->gem.offset,
|
||||
op->va.addr, op->va.addr +
|
||||
op->va.range - 1, attr, flags);
|
||||
if (IS_ERR(vma))
|
||||
goto err_unlock;
|
||||
|
||||
if (xe_vma_is_userptr(vma))
|
||||
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
|
||||
else if (!xe_vma_has_no_bo(vma) && !bo->vm)
|
||||
err = add_preempt_fences(vm, bo);
|
||||
|
||||
err_unlock:
|
||||
if (bo)
|
||||
drm_exec_fini(&exec);
|
||||
|
||||
if (err) {
|
||||
prep_vma_destroy(vm, vma, false);
|
||||
xe_vma_destroy_unlocked(vma);
|
||||
@@ -3220,21 +3197,23 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
|
||||
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
|
||||
struct xe_vma_ops *vops)
|
||||
{
|
||||
struct xe_validation_ctx ctx;
|
||||
struct drm_exec exec;
|
||||
struct dma_fence *fence;
|
||||
int err;
|
||||
int err = 0;
|
||||
|
||||
lockdep_assert_held_write(&vm->lock);
|
||||
|
||||
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
|
||||
DRM_EXEC_IGNORE_DUPLICATES, 0);
|
||||
drm_exec_until_all_locked(&exec) {
|
||||
xe_validation_guard(&ctx, &vm->xe->val, &exec,
|
||||
((struct xe_val_flags) {
|
||||
.interruptible = true,
|
||||
.exec_ignore_duplicates = true,
|
||||
}), err) {
|
||||
err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
|
||||
drm_exec_retry_on_contention(&exec);
|
||||
if (err) {
|
||||
fence = ERR_PTR(err);
|
||||
goto unlock;
|
||||
}
|
||||
xe_validation_retry_on_oom(&ctx, &err);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
xe_vm_set_validation_exec(vm, &exec);
|
||||
fence = ops_execute(vm, vops);
|
||||
@@ -3242,15 +3221,13 @@ static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
|
||||
if (IS_ERR(fence)) {
|
||||
if (PTR_ERR(fence) == -ENODATA)
|
||||
vm_bind_ioctl_ops_fini(vm, vops, NULL);
|
||||
goto unlock;
|
||||
return fence;
|
||||
}
|
||||
|
||||
vm_bind_ioctl_ops_fini(vm, vops, fence);
|
||||
}
|
||||
|
||||
unlock:
|
||||
drm_exec_fini(&exec);
|
||||
return fence;
|
||||
return err ? ERR_PTR(err) : fence;
|
||||
}
|
||||
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
|
||||
|
||||
|
||||
@@ -260,8 +260,6 @@ static inline void xe_vm_reactivate_rebind(struct xe_vm *vm)
|
||||
}
|
||||
}
|
||||
|
||||
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
|
||||
|
||||
int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
|
||||
|
||||
int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
|
||||
|
||||
Reference in New Issue
Block a user