From 0c079d3f88df5f8286cd5c91b54bdac7c819be85 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 6 Dec 2022 16:11:41 +0000 Subject: [PATCH] drm/i915: improve the catch-all evict to handle lock contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The catch-all evict can fail due to object lock contention, since it only goes as far as trylocking the object, due to us already holding the vm->mutex. Doing a full object lock here can deadlock, since the vm->mutex is always our inner lock. Add another execbuf pass which drops the vm->mutex and then tries to grab the object will the full lock, before then retrying the eviction. This should be good enough for now to fix the immediate regression with userspace seeing -ENOSPC from execbuf due to contended object locks during GTT eviction. Testcase: igt@gem_ppgtt@shrink-vs-evict-* Fixes: 7e00897be8bf ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.") References: https://gitlab.freedesktop.org/drm/intel/-/issues/7627 References: https://gitlab.freedesktop.org/drm/intel/-/issues/7570 References: https://bugzilla.mozilla.org/show_bug.cgi?id=1779558 Signed-off-by: Matthew Auld Cc: Maarten Lankhorst Cc: Thomas Hellström Cc: Tvrtko Ursulin Cc: Andrzej Hajda Cc: Mani Milani Cc: # v5.18+ Revision 1 of https://patchwork.freedesktop.org/series/111686/ --- .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 25 +++++++++++-- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +- drivers/gpu/drm/i915/i915_gem_evict.c | 37 ++++++++++++++----- drivers/gpu/drm/i915/i915_gem_evict.h | 4 +- drivers/gpu/drm/i915/i915_vma.c | 2 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- 6 files changed, 56 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 845023c14eb36f..094e92ed28db4f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -741,25 +741,44 @@ static int eb_reserve(struct i915_execbuffer *eb) * * Defragmenting is skipped if all objects are pinned at a fixed location. */ - for (pass = 0; pass <= 2; pass++) { + for (pass = 0; pass <= 3; pass++) { int pin_flags = PIN_USER | PIN_VALIDATE; if (pass == 0) pin_flags |= PIN_NONBLOCK; if (pass >= 1) - unpinned = eb_unbind(eb, pass == 2); + unpinned = eb_unbind(eb, pass >= 2); if (pass == 2) { err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) { - err = i915_gem_evict_vm(eb->context->vm, &eb->ww); + err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL); mutex_unlock(&eb->context->vm->mutex); } if (err) return err; } + if (pass == 3) { +retry: + err = mutex_lock_interruptible(&eb->context->vm->mutex); + if (!err) { + struct drm_i915_gem_object *busy_bo = NULL; + + err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo); + mutex_unlock(&eb->context->vm->mutex); + if (err && busy_bo) { + err = i915_gem_object_lock(busy_bo, &eb->ww); + i915_gem_object_put(busy_bo); + if (!err) + goto retry; + } + } + if (err) + return err; + } + list_for_each_entry(ev, &eb->unbound, bind_link) { err = eb_reserve_vma(eb, ev, pin_flags); if (err) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 73d9eda1d6b7a6..c83d98e1dc5da0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -369,7 +369,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) if (vma == ERR_PTR(-ENOSPC)) { ret = mutex_lock_interruptible(&ggtt->vm.mutex); if (!ret) { - ret = i915_gem_evict_vm(&ggtt->vm, &ww); + ret = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); mutex_unlock(&ggtt->vm.mutex); } if (ret) diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c index f025ee4fa52618..a4b4d9b7d26c7a 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/i915_gem_evict.c @@ -416,6 +416,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, * @vm: Address space to cleanse * @ww: An optional struct i915_gem_ww_ctx. If not NULL, i915_gem_evict_vm * will be able to evict vma's locked by the ww as well. + * @busy_bo: Optional pointer to struct drm_i915_gem_object. If not NULL, then + * in the event i915_gem_evict_vm() is unable to trylock an object for eviction, + * then @busy_bo will point to it. -EBUSY is also returned. The caller must drop + * the vm->mutex, before trying again to acquire the contended lock. The caller + * also owns a reference to the object. * * This function evicts all vmas from a vm. * @@ -425,7 +430,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, * To clarify: This is for freeing up virtual address space, not for freeing * memory in e.g. the shrinker. */ -int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) +int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww, + struct drm_i915_gem_object **busy_bo) { int ret = 0; @@ -457,15 +463,22 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) * the resv is shared among multiple objects, we still * need the object ref. */ - if (dying_vma(vma) || + if (!i915_gem_object_get_rcu(vma->obj) || (ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx))) { __i915_vma_pin(vma); list_add(&vma->evict_link, &locked_eviction_list); continue; } - if (!i915_gem_object_trylock(vma->obj, ww)) + if (!i915_gem_object_trylock(vma->obj, ww)) { + if (busy_bo) { + *busy_bo = vma->obj; /* holds ref */ + ret = -EBUSY; + break; + } + i915_gem_object_put(vma->obj); continue; + } __i915_vma_pin(vma); list_add(&vma->evict_link, &eviction_list); @@ -473,25 +486,29 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) if (list_empty(&eviction_list) && list_empty(&locked_eviction_list)) break; - ret = 0; /* Unbind locked objects first, before unlocking the eviction_list */ list_for_each_entry_safe(vma, vn, &locked_eviction_list, evict_link) { __i915_vma_unpin(vma); - if (ret == 0) + if (ret == 0) { ret = __i915_vma_unbind(vma); - if (ret != -EINTR) /* "Get me out of here!" */ - ret = 0; + if (ret != -EINTR) /* "Get me out of here!" */ + ret = 0; + } + if (!dying_vma(vma)) + i915_gem_object_put(vma->obj); } list_for_each_entry_safe(vma, vn, &eviction_list, evict_link) { __i915_vma_unpin(vma); - if (ret == 0) + if (ret == 0) { ret = __i915_vma_unbind(vma); - if (ret != -EINTR) /* "Get me out of here!" */ - ret = 0; + if (ret != -EINTR) /* "Get me out of here!" */ + ret = 0; + } i915_gem_object_unlock(vma->obj); + i915_gem_object_put(vma->obj); } } while (ret == 0); diff --git a/drivers/gpu/drm/i915/i915_gem_evict.h b/drivers/gpu/drm/i915/i915_gem_evict.h index e593c530f9bd7a..bf0ee0e4fe6088 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.h +++ b/drivers/gpu/drm/i915/i915_gem_evict.h @@ -11,6 +11,7 @@ struct drm_mm_node; struct i915_address_space; struct i915_gem_ww_ctx; +struct drm_i915_gem_object; int __must_check i915_gem_evict_something(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww, @@ -23,6 +24,7 @@ int __must_check i915_gem_evict_for_node(struct i915_address_space *vm, struct drm_mm_node *node, unsigned int flags); int i915_gem_evict_vm(struct i915_address_space *vm, - struct i915_gem_ww_ctx *ww); + struct i915_gem_ww_ctx *ww, + struct drm_i915_gem_object **busy_bo); #endif /* __I915_GEM_EVICT_H__ */ diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index f17c09ead7d778..4d06875de14a14 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1569,7 +1569,7 @@ static int __i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, * locked objects when called from execbuf when pinning * is removed. This would probably regress badly. */ - i915_gem_evict_vm(vm, NULL); + i915_gem_evict_vm(vm, NULL, NULL); mutex_unlock(&vm->mutex); } } while (1); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c index 8c6517d29b8e0c..37068542aafe7f 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c @@ -344,7 +344,7 @@ static int igt_evict_vm(void *arg) /* Everything is pinned, nothing should happen */ mutex_lock(&ggtt->vm.mutex); - err = i915_gem_evict_vm(&ggtt->vm, NULL); + err = i915_gem_evict_vm(&ggtt->vm, NULL, NULL); mutex_unlock(&ggtt->vm.mutex); if (err) { pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n", @@ -356,7 +356,7 @@ static int igt_evict_vm(void *arg) for_i915_gem_ww(&ww, err, false) { mutex_lock(&ggtt->vm.mutex); - err = i915_gem_evict_vm(&ggtt->vm, &ww); + err = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); mutex_unlock(&ggtt->vm.mutex); }