[Intel-gfx] [PATCH v7] drm/i915: Update error capture code to avoid using the current vma state
Thomas Hellström
thomas.hellstrom at linux.intel.com
Tue Dec 7 20:54:09 UTC 2021
Hi, John.
On 12/7/21 21:46, John Harrison wrote:
> On 11/29/2021 12:22, Thomas Hellström wrote:
>> With asynchronous migrations, the vma state may be several migrations
>> ahead of the state that matches the request we're capturing.
>> Address that by introducing an i915_vma_snapshot structure that
>> can be used to snapshot relevant state at request submission.
>> In order to make sure we access the correct memory, the snapshots take
>> references on relevant sg-tables and memory regions.
>>
>> Also move the capture list allocation out of the fence signaling
>> critical path and use the CONFIG_DRM_I915_CAPTURE_ERROR define to
>> avoid compiling in members and functions used for error capture
>> when they're not used.
>>
>> Finally, introduce lockdep annotation.
>>
>> v4:
>> - Break out the capture allocation mode change to a separate patch.
>> v5:
>> - Fix compilation error in the !CONFIG_DRM_I915_CAPTURE_ERROR case
>> (kernel test robot)
>> v6:
>> - Use #if IS_ENABLED() instead of #ifdef to match driver style.
>> - Move yet another change of allocation mode to the separate patch.
>> - Commit message rework due to patch reordering.
>> v7:
>> - Adjust for removal of region refcounting.
>>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> Reviewed-by: Ramalingam C <ramalingam.c at intel.com>
>> ---
>> drivers/gpu/drm/i915/Makefile | 1 +
>> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 135 +++++++++++---
>> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 8 +-
>> drivers/gpu/drm/i915/i915_gpu_error.c | 176 +++++++++++++-----
>> drivers/gpu/drm/i915/i915_request.c | 63 +++++--
>> drivers/gpu/drm/i915/i915_request.h | 20 +-
>> drivers/gpu/drm/i915/i915_vma_snapshot.c | 134 +++++++++++++
>> drivers/gpu/drm/i915/i915_vma_snapshot.h | 112 +++++++++++
>> 8 files changed, 554 insertions(+), 95 deletions(-)
>> create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.c
>> create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.h
>>
>> diff --git a/drivers/gpu/drm/i915/Makefile
>> b/drivers/gpu/drm/i915/Makefile
>> index 3f57e85d4846..3b5857da4123 100644
>> --- a/drivers/gpu/drm/i915/Makefile
>> +++ b/drivers/gpu/drm/i915/Makefile
>> @@ -173,6 +173,7 @@ i915-y += \
>> i915_trace_points.o \
>> i915_ttm_buddy_manager.o \
>> i915_vma.o \
>> + i915_vma_snapshot.o \
>> intel_wopcm.o
>> # general-purpose microcontroller (GuC) support
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> index 9f7c6ecadb90..6a0ed537c199 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> @@ -29,6 +29,7 @@
>> #include "i915_gem_ioctls.h"
>> #include "i915_trace.h"
>> #include "i915_user_extensions.h"
>> +#include "i915_vma_snapshot.h"
>> struct eb_vma {
>> struct i915_vma *vma;
>> @@ -307,11 +308,15 @@ struct i915_execbuffer {
>> struct eb_fence *fences;
>> unsigned long num_fences;
>> +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>> + struct i915_capture_list *capture_lists[MAX_ENGINE_INSTANCE + 1];
>> +#endif
>> };
>> static int eb_parse(struct i915_execbuffer *eb);
>> static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
>> static void eb_unpin_engine(struct i915_execbuffer *eb);
>> +static void eb_capture_release(struct i915_execbuffer *eb);
>> static inline bool eb_use_cmdparser(const struct i915_execbuffer
>> *eb)
>> {
>> @@ -1054,6 +1059,7 @@ static void eb_release_vmas(struct
>> i915_execbuffer *eb, bool final)
>> i915_vma_put(vma);
>> }
>> + eb_capture_release(eb);
>> eb_unpin_engine(eb);
>> }
>> @@ -1891,36 +1897,113 @@ eb_find_first_request_added(struct
>> i915_execbuffer *eb)
>> return NULL;
>> }
>> -static int eb_move_to_gpu(struct i915_execbuffer *eb)
>> +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>> +
>> +/* Stage with GFP_KERNEL allocations before we enter the signaling
>> critical path */
>> +static void eb_capture_stage(struct i915_execbuffer *eb)
>> {
>> const unsigned int count = eb->buffer_count;
>> - unsigned int i = count;
>> - int err = 0, j;
>> + unsigned int i = count, j;
>> + struct i915_vma_snapshot *vsnap;
>> while (i--) {
>> struct eb_vma *ev = &eb->vma[i];
>> struct i915_vma *vma = ev->vma;
>> unsigned int flags = ev->flags;
>> - struct drm_i915_gem_object *obj = vma->obj;
>> - assert_vma_held(vma);
>> + if (!(flags & EXEC_OBJECT_CAPTURE))
>> + continue;
>> - if (flags & EXEC_OBJECT_CAPTURE) {
>> + vsnap = i915_vma_snapshot_alloc(GFP_KERNEL);
>> + if (!vsnap)
>> + continue;
>> +
>> + i915_vma_snapshot_init(vsnap, vma, "user");
>> + for_each_batch_create_order(eb, j) {
>> struct i915_capture_list *capture;
>> - for_each_batch_create_order(eb, j) {
>> - if (!eb->requests[j])
>> - break;
>> + capture = kmalloc(sizeof(*capture), GFP_KERNEL);
>> + if (!capture)
>> + continue;
>> - capture = kmalloc(sizeof(*capture), GFP_KERNEL);
>> - if (capture) {
>> - capture->next =
>> - eb->requests[j]->capture_list;
>> - capture->vma = vma;
>> - eb->requests[j]->capture_list = capture;
>> - }
>> - }
>> + capture->next = eb->capture_lists[j];
>> + capture->vma_snapshot = i915_vma_snapshot_get(vsnap);
>> + eb->capture_lists[j] = capture;
>> + }
>> + i915_vma_snapshot_put(vsnap);
>> + }
>> +}
>> +
>> +/* Commit once we're in the critical path */
>> +static void eb_capture_commit(struct i915_execbuffer *eb)
>> +{
>> + unsigned int j;
>> +
>> + for_each_batch_create_order(eb, j) {
>> + struct i915_request *rq = eb->requests[j];
>> +
>> + if (!rq)
>> + break;
>> +
>> + rq->capture_list = eb->capture_lists[j];
>> + eb->capture_lists[j] = NULL;
>> + }
>> +}
>> +
>> +/*
>> + * Release anything that didn't get committed due to errors.
>> + * The capture_list will otherwise be freed at request retire.
>> + */
>> +static void eb_capture_release(struct i915_execbuffer *eb)
>> +{
>> + unsigned int j;
>> +
>> + for_each_batch_create_order(eb, j) {
>> + if (eb->capture_lists[j]) {
>> + i915_request_free_capture_list(eb->capture_lists[j]);
>> + eb->capture_lists[j] = NULL;
>> }
>> + }
>> +}
>> +
>> +static void eb_capture_list_clear(struct i915_execbuffer *eb)
>> +{
>> + memset(eb->capture_lists, 0, sizeof(eb->capture_lists));
>> +}
>> +
>> +#else
>> +
>> +static void eb_capture_stage(struct i915_execbuffer *eb)
>> +{
>> +}
>> +
>> +static void eb_capture_commit(struct i915_execbuffer *eb)
>> +{
>> +}
>> +
>> +static void eb_capture_release(struct i915_execbuffer *eb)
>> +{
>> +}
>> +
>> +static void eb_capture_list_clear(struct i915_execbuffer *eb)
>> +{
>> +}
>> +
>> +#endif
>> +
>> +static int eb_move_to_gpu(struct i915_execbuffer *eb)
>> +{
>> + const unsigned int count = eb->buffer_count;
>> + unsigned int i = count;
>> + int err = 0, j;
>> +
>> + while (i--) {
>> + struct eb_vma *ev = &eb->vma[i];
>> + struct i915_vma *vma = ev->vma;
>> + unsigned int flags = ev->flags;
>> + struct drm_i915_gem_object *obj = vma->obj;
>> +
>> + assert_vma_held(vma);
>> /*
>> * If the GPU is not _reading_ through the CPU cache, we need
>> @@ -2001,6 +2084,8 @@ static int eb_move_to_gpu(struct
>> i915_execbuffer *eb)
>> /* Unconditionally flush any chipset caches (for streaming
>> writes). */
>> intel_gt_chipset_flush(eb->gt);
>> + eb_capture_commit(eb);
>> +
>> return 0;
>> err_skip:
>> @@ -3143,13 +3228,14 @@ eb_requests_create(struct i915_execbuffer
>> *eb, struct dma_fence *in_fence,
>> }
>> /*
>> - * Whilst this request exists, batch_obj will be on the
>> - * active_list, and so will hold the active reference. Only
>> when
>> - * this request is retired will the batch_obj be moved onto
>> - * the inactive_list and lose its active reference. Hence we do
>> - * not need to explicitly hold another reference here.
>> + * Not really on stack, but we don't want to call
>> + * kfree on the batch_snapshot when we put it, so use the
>> + * _onstack interface.
>> */
>> - eb->requests[i]->batch = eb->batches[i]->vma;
>> + if (eb->batches[i]->vma)
>> + i915_vma_snapshot_init_onstack(&eb->requests[i]->batch_snapshot,
>> + eb->batches[i]->vma,
>> + "batch");
>> if (eb->batch_pool) {
>> GEM_BUG_ON(intel_context_is_parallel(eb->context));
>> intel_gt_buffer_pool_mark_active(eb->batch_pool,
>> @@ -3198,6 +3284,8 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>> eb.fences = NULL;
>> eb.num_fences = 0;
>> + eb_capture_list_clear(&eb);
>> +
>> memset(eb.requests, 0, sizeof(struct i915_request *) *
>> ARRAY_SIZE(eb.requests));
>> eb.composite_fence = NULL;
>> @@ -3284,6 +3372,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>> }
>> ww_acquire_done(&eb.ww.ctx);
>> + eb_capture_stage(&eb);
>> out_fence = eb_requests_create(&eb, in_fence, out_fence_fd);
>> if (IS_ERR(out_fence)) {
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> index 332756036007..f2ccd5b53d42 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> @@ -1676,14 +1676,18 @@ static void
>> intel_engine_print_registers(struct intel_engine_cs *engine,
>> static void print_request_ring(struct drm_printer *m, struct
>> i915_request *rq)
>> {
>> + struct i915_vma_snapshot *vsnap = &rq->batch_snapshot;
>> void *ring;
>> int size;
>> + if (!i915_vma_snapshot_present(vsnap))
>> + vsnap = NULL;
>> +
>> drm_printf(m,
>> "[head %04x, postfix %04x, tail %04x, batch
>> 0x%08x_%08x]:\n",
>> rq->head, rq->postfix, rq->tail,
>> - rq->batch ? upper_32_bits(rq->batch->node.start) : ~0u,
>> - rq->batch ? lower_32_bits(rq->batch->node.start) : ~0u);
>> + vsnap ? upper_32_bits(vsnap->gtt_offset) : ~0u,
>> + vsnap ? lower_32_bits(vsnap->gtt_offset) : ~0u);
>> size = rq->tail - rq->head;
>> if (rq->tail < rq->head)
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index b1e4ce0f798f..c61f9aaa40f9 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -48,6 +48,7 @@
>> #include "i915_gpu_error.h"
>> #include "i915_memcpy.h"
>> #include "i915_scatterlist.h"
>> +#include "i915_vma_snapshot.h"
>> #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL |
>> __GFP_NOWARN)
>> #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
>> @@ -1012,8 +1013,7 @@ void __i915_gpu_coredump_free(struct kref
>> *error_ref)
>> static struct i915_vma_coredump *
>> i915_vma_coredump_create(const struct intel_gt *gt,
>> - const struct i915_vma *vma,
>> - const char *name,
>> + const struct i915_vma_snapshot *vsnap,
>> struct i915_vma_compress *compress)
>> {
>> struct i915_ggtt *ggtt = gt->ggtt;
>> @@ -1024,7 +1024,7 @@ i915_vma_coredump_create(const struct intel_gt
>> *gt,
>> might_sleep();
>> - if (!vma || !vma->pages || !compress)
>> + if (!vsnap || !vsnap->pages || !compress)
>> return NULL;
>> dst = kmalloc(sizeof(*dst), ALLOW_FAIL);
>> @@ -1037,12 +1037,12 @@ i915_vma_coredump_create(const struct
>> intel_gt *gt,
>> }
>> INIT_LIST_HEAD(&dst->page_list);
>> - strcpy(dst->name, name);
>> + strcpy(dst->name, vsnap->name);
>> dst->next = NULL;
>> - dst->gtt_offset = vma->node.start;
>> - dst->gtt_size = vma->node.size;
>> - dst->gtt_page_sizes = vma->page_sizes.gtt;
>> + dst->gtt_offset = vsnap->gtt_offset;
>> + dst->gtt_size = vsnap->gtt_size;
>> + dst->gtt_page_sizes = vsnap->page_sizes;
>> dst->unused = 0;
>> ret = -EINVAL;
>> @@ -1050,7 +1050,7 @@ i915_vma_coredump_create(const struct intel_gt
>> *gt,
>> void __iomem *s;
>> dma_addr_t dma;
>> - for_each_sgt_daddr(dma, iter, vma->pages) {
>> + for_each_sgt_daddr(dma, iter, vsnap->pages) {
>> mutex_lock(&ggtt->error_mutex);
>> ggtt->vm.insert_page(&ggtt->vm, dma, slot,
>> I915_CACHE_NONE, 0);
>> @@ -1068,11 +1068,11 @@ i915_vma_coredump_create(const struct
>> intel_gt *gt,
>> if (ret)
>> break;
>> }
>> - } else if (__i915_gem_object_is_lmem(vma->obj)) {
>> - struct intel_memory_region *mem = vma->obj->mm.region;
>> + } else if (vsnap->mr && vsnap->mr->type != INTEL_MEMORY_SYSTEM) {
>> + struct intel_memory_region *mem = vsnap->mr;
>> dma_addr_t dma;
>> - for_each_sgt_daddr(dma, iter, vma->pages) {
>> + for_each_sgt_daddr(dma, iter, vsnap->pages) {
>> void __iomem *s;
>> s = io_mapping_map_wc(&mem->iomap,
>> @@ -1088,7 +1088,7 @@ i915_vma_coredump_create(const struct intel_gt
>> *gt,
>> } else {
>> struct page *page;
>> - for_each_sgt_page(page, iter, vma->pages) {
>> + for_each_sgt_page(page, iter, vsnap->pages) {
>> void *s;
>> drm_clflush_pages(&page, 1);
>> @@ -1324,37 +1324,71 @@ static bool record_context(struct
>> i915_gem_context_coredump *e,
>> struct intel_engine_capture_vma {
>> struct intel_engine_capture_vma *next;
>> - struct i915_vma *vma;
>> + struct i915_vma_snapshot *vsnap;
>> char name[16];
>> + bool lockdep_cookie;
>> };
>> static struct intel_engine_capture_vma *
>> -capture_vma(struct intel_engine_capture_vma *next,
>> - struct i915_vma *vma,
>> - const char *name,
>> - gfp_t gfp)
>> +capture_vma_snapshot(struct intel_engine_capture_vma *next,
>> + struct i915_vma_snapshot *vsnap,
>> + gfp_t gfp)
>> {
>> struct intel_engine_capture_vma *c;
>> - if (!vma)
>> + if (!i915_vma_snapshot_present(vsnap))
>> return next;
>> c = kmalloc(sizeof(*c), gfp);
>> if (!c)
>> return next;
>> - if (!i915_active_acquire_if_busy(&vma->active)) {
>> + if (!i915_vma_snapshot_resource_pin(vsnap, &c->lockdep_cookie)) {
>> kfree(c);
>> return next;
>> }
>> - strcpy(c->name, name);
>> - c->vma = vma; /* reference held while active */
>> + strcpy(c->name, vsnap->name);
>> + c->vsnap = vsnap;
>> + i915_vma_snapshot_get(vsnap);
>> c->next = next;
>> return c;
>> }
>> +static struct intel_engine_capture_vma *
>> +capture_vma(struct intel_engine_capture_vma *next,
>> + struct i915_vma *vma,
>> + const char *name,
>> + gfp_t gfp)
>> +{
>> + struct i915_vma_snapshot *vsnap;
>> +
>> + if (!vma)
>> + return next;
>> +
>> + /*
>> + * If the vma isn't pinned, then the vma should be snapshotted
>> + * to a struct i915_vma_snapshot at command submission time.
>> + * Not here.
>> + */
>> + GEM_WARN_ON(!i915_vma_is_pinned(vma));
>> + if (!i915_vma_is_pinned(vma))
>> + return next;
>> +
>> + vsnap = i915_vma_snapshot_alloc(gfp);
>> + if (!vsnap)
>> + return next;
>> +
>> + i915_vma_snapshot_init(vsnap, vma, name);
>> + next = capture_vma_snapshot(next, vsnap, gfp);
>> +
>> + /* FIXME: Replace on async unbind. */
>> + i915_vma_snapshot_put(vsnap);
>> +
>> + return next;
>> +}
>> +
>> static struct intel_engine_capture_vma *
>> capture_user(struct intel_engine_capture_vma *capture,
>> const struct i915_request *rq,
>> @@ -1363,7 +1397,7 @@ capture_user(struct intel_engine_capture_vma
>> *capture,
>> struct i915_capture_list *c;
>> for (c = rq->capture_list; c; c = c->next)
>> - capture = capture_vma(capture, c->vma, "user", gfp);
>> + capture = capture_vma_snapshot(capture, c->vma_snapshot, gfp);
>> return capture;
>> }
>> @@ -1377,6 +1411,36 @@ static void add_vma(struct
>> intel_engine_coredump *ee,
>> }
>> }
>> +static struct i915_vma_coredump *
>> +create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma,
>> + const char *name, struct i915_vma_compress *compress)
>> +{
>> + struct i915_vma_coredump *ret = NULL;
>> + struct i915_vma_snapshot tmp;
>> + bool lockdep_cookie;
>> +
>> + if (!vma)
>> + return NULL;
>> +
>> + i915_vma_snapshot_init_onstack(&tmp, vma, name);
>> + if (i915_vma_snapshot_resource_pin(&tmp, &lockdep_cookie)) {
>> + ret = i915_vma_coredump_create(gt, &tmp, compress);
>> + i915_vma_snapshot_resource_unpin(&tmp, lockdep_cookie);
>> + }
>> + i915_vma_snapshot_put_onstack(&tmp);
>> +
>> + return ret;
>> +}
> <snip>
>
>> @@ -1554,10 +1620,8 @@ gt_record_uc(struct intel_gt_coredump *gt,
>> */
>> error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
>> error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
>> - error_uc->guc_log =
>> - i915_vma_coredump_create(gt->_gt,
>> - uc->guc.log.vma, "GuC log buffer",
>> - compress);
>> + error_uc->guc_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
>> + "GuC log buffer", compress);
> <snip>
>
>> +bool i915_vma_snapshot_resource_pin(struct i915_vma_snapshot *vsnap,
>> + bool *lockdep_cookie)
>> +{
>> + bool pinned = i915_active_acquire_if_busy(vsnap->vma_resource);
>> +
>> + if (pinned)
>> + *lockdep_cookie = dma_fence_begin_signalling();
>> +
>> + return pinned;
>> +}
>>
>
> This change has broken the capture of GuC logs. The snapshot wrapper
> requires the vma to be in the active list. The GuC log is never in the
> active list as it has nothing to do with requests. Thus
> create_vma_coredump() always returns a null pointer and there is no
> log in the crash dump.
Hm. Sorry about that.
I'll take a look tomorrow to see what's going on. It's quite likely that
the last patch in the series which was temporarily dropped masked this bug.
/Thomas
>
> John.
>
More information about the Intel-gfx
mailing list