[Intel-gfx] [PATCH 07/24] drm/i915: Rely on spinlock protection for GPU error capture

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Jul 16 10:48:28 UTC 2019


On 15/07/2019 09:09, Chris Wilson wrote:
> Trust that we now have adequate protection over the low level structures
> via the engine->active.lock to allow ourselves to capture the GPU error
> state without the heavy hammer of stop_machine(). Sadly this does mean
> that we have to forgo some of the lesser used information (not derived
> from the active state) that is not controlled by the active locks.
> 
> A useful side-effect is that this allows us to restore error capturing
> for Braswell and Broxton.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c   |   5 -
>   drivers/gpu/drm/i915/i915_gpu_error.c | 503 +++++++++++---------------
>   drivers/gpu/drm/i915/i915_gpu_error.h |  17 -
>   3 files changed, 207 insertions(+), 318 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 220aba5a94d2..5c9c7cae4817 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2966,11 +2966,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
>   		ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
>   		if (ggtt->vm.clear_range != nop_clear_range)
>   			ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
> -
> -		/* Prevent recursively calling stop_machine() and deadlocks. */
> -		dev_info(dev_priv->drm.dev,
> -			 "Disabling error capture for VT-d workaround\n");
> -		i915_disable_error_state(dev_priv, -ENODEV);
>   	}
>   
>   	ggtt->invalidate = gen6_ggtt_invalidate;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index c5b89bf4d616..026ee1f5536d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -30,7 +30,6 @@
>   #include <linux/ascii85.h>
>   #include <linux/nmi.h>
>   #include <linux/scatterlist.h>
> -#include <linux/stop_machine.h>
>   #include <linux/utsname.h>
>   #include <linux/zlib.h>
>   
> @@ -46,6 +45,9 @@
>   #include "i915_scatterlist.h"
>   #include "intel_csr.h"
>   
> +#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> +#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
> +
>   static inline const struct intel_engine_cs *
>   engine_lookup(const struct drm_i915_private *i915, unsigned int id)
>   {
> @@ -67,26 +69,6 @@ engine_name(const struct drm_i915_private *i915, unsigned int id)
>   	return __engine_name(engine_lookup(i915, id));
>   }
>   
> -static const char *tiling_flag(int tiling)
> -{
> -	switch (tiling) {
> -	default:
> -	case I915_TILING_NONE: return "";
> -	case I915_TILING_X: return " X";
> -	case I915_TILING_Y: return " Y";
> -	}
> -}
> -
> -static const char *dirty_flag(int dirty)
> -{
> -	return dirty ? " dirty" : "";
> -}
> -
> -static const char *purgeable_flag(int purgeable)
> -{
> -	return purgeable ? " purgeable" : "";
> -}
> -
>   static void __sg_set_buf(struct scatterlist *sg,
>   			 void *addr, unsigned int len, loff_t it)
>   {
> @@ -114,7 +96,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
>   	if (e->cur == e->end) {
>   		struct scatterlist *sgl;
>   
> -		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
> +		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
>   		if (!sgl) {
>   			e->err = -ENOMEM;
>   			return false;
> @@ -134,7 +116,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
>   	}
>   
>   	e->size = ALIGN(len + 1, SZ_64K);
> -	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
> +	e->buf = kmalloc(e->size, ALLOW_FAIL);
>   	if (!e->buf) {
>   		e->size = PAGE_ALIGN(len + 1);
>   		e->buf = kmalloc(e->size, GFP_KERNEL);
> @@ -211,47 +193,127 @@ i915_error_printer(struct drm_i915_error_state_buf *e)
>   	return p;
>   }
>   
> +/* single threaded page allocator with a reserved stash for emergencies */
> +struct pool {
> +	void *stash[31];

Why 31?

> +	unsigned int count;
> +};
> +
> +static void *__alloc_page(gfp_t gfp)
> +{
> +	return (void *)__get_free_page(gfp);
> +}
> +
> +static void pool_fini(struct pool *p)
> +{
> +	while (p->count--)
> +		free_page((unsigned long)p->stash[p->count]);
> +}
> +
> +static int pool_refill(struct pool *p, gfp_t gfp)
> +{
> +	while (p->count < ARRAY_SIZE(p->stash)) {
> +		p->stash[p->count] = __alloc_page(gfp);
> +		if (!p->stash[p->count])
> +			return -ENOMEM;
> +
> +		p->count++;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pool_init(struct pool *p, gfp_t gfp)
> +{
> +	int err;
> +
> +	p->count = 0;
> +
> +	err = pool_refill(p, gfp);
> +	if (err)
> +		pool_fini(p);
> +
> +	return err;
> +}
> +
> +static void *pool_alloc(struct pool *p, gfp_t gfp)
> +{
> +	void *ptr;
> +
> +	ptr = __alloc_page(gfp);
> +	if (ptr)
> +		return ptr;
> +
> +	if (p->count)
> +		return p->stash[--p->count];
> +
> +	return NULL;
> +}
> +
> +static void pool_free(struct pool *p, void *page)
> +{
> +	if (p->count < ARRAY_SIZE(p->stash)) {
> +		p->stash[p->count++] = page;
> +		return;
> +	}
> +
> +	free_page((unsigned long)page);
> +}
> +
>   #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>   
>   struct compress {
> +	struct pool pool;
>   	struct z_stream_s zstream;
>   	void *tmp;
>   };
>   
>   static bool compress_init(struct compress *c)
>   {
> -	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
> +	struct z_stream_s *zstream = &c->zstream;
>   
> -	zstream->workspace =
> -		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> -			GFP_ATOMIC | __GFP_NOWARN);
> -	if (!zstream->workspace)
> +	if (pool_init(&c->pool, ALLOW_FAIL))
>   		return false;
>   
> -	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> -		kfree(zstream->workspace);
> +	zstream->workspace =
> +		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> +			ALLOW_FAIL);
> +	if (!zstream->workspace) {
> +		pool_fini(&c->pool);
>   		return false;
>   	}
>   
>   	c->tmp = NULL;
>   	if (i915_has_memcpy_from_wc())
> -		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> +		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);

So 31 stashed pages will be enough to cover for failed atomic
allocations, or have I missed a point?

What determines that 31 is enough?
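
If the idea is that gem_record_rings() tops the stash back up with
pool_refill(..., ALLOW_FAIL) before entering each engine's atomic
section, so the reserve only has to cover a single engine's captures
between refills, then a named constant with a comment would help.
A sketch (constant name and comment wording are mine):

	/*
	 * Pages held in reserve for when ATOMIC_MAYFAIL allocations fail
	 * during capture; topped back up with ALLOW_FAIL before each
	 * engine in gem_record_rings().
	 */
	#define POOL_RESERVE_PAGES 31

	struct pool {
		void *stash[POOL_RESERVE_PAGES];
		unsigned int count;
	};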

>   
>   	return true;
>   }
>   
> -static void *compress_next_page(struct drm_i915_error_object *dst)
> +static bool compress_start(struct compress *c)
> +{
> +	struct z_stream_s *zstream = &c->zstream;
> +	void *workspace = zstream->workspace;
> +
> +	memset(zstream, 0, sizeof(*zstream));
> +	zstream->workspace = workspace;
> +
> +	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
> +}
> +
> +static void *compress_next_page(struct compress *c,
> +				struct drm_i915_error_object *dst)
>   {
> -	unsigned long page;
> +	void *page;
>   
>   	if (dst->page_count >= dst->num_pages)
>   		return ERR_PTR(-ENOSPC);
>   
> -	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> +	page = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
>   	if (!page)
>   		return ERR_PTR(-ENOMEM);
>   
> -	return dst->pages[dst->page_count++] = (void *)page;
> +	return dst->pages[dst->page_count++] = page;
>   }
>   
>   static int compress_page(struct compress *c,
> @@ -267,7 +329,7 @@ static int compress_page(struct compress *c,
>   
>   	do {
>   		if (zstream->avail_out == 0) {
> -			zstream->next_out = compress_next_page(dst);
> +			zstream->next_out = compress_next_page(c, dst);
>   			if (IS_ERR(zstream->next_out))
>   				return PTR_ERR(zstream->next_out);
>   
> @@ -295,7 +357,7 @@ static int compress_flush(struct compress *c,
>   	do {
>   		switch (zlib_deflate(zstream, Z_FINISH)) {
>   		case Z_OK: /* more space requested */
> -			zstream->next_out = compress_next_page(dst);
> +			zstream->next_out = compress_next_page(c, dst);
>   			if (IS_ERR(zstream->next_out))
>   				return PTR_ERR(zstream->next_out);
>   
> @@ -316,15 +378,17 @@ static int compress_flush(struct compress *c,
>   	return 0;
>   }
>   
> -static void compress_fini(struct compress *c,
> -			  struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
>   {
> -	struct z_stream_s *zstream = &c->zstream;
> +	zlib_deflateEnd(&c->zstream);
> +}
>   
> -	zlib_deflateEnd(zstream);
> -	kfree(zstream->workspace);
> +static void compress_fini(struct compress *c)
> +{
> +	kfree(c->zstream.workspace);
>   	if (c->tmp)
> -		free_page((unsigned long)c->tmp);
> +		pool_free(&c->pool, c->tmp);
> +	pool_fini(&c->pool);
>   }
>   
>   static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -335,9 +399,18 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>   #else
>   
>   struct compress {
> +	struct pool pool;
>   };
>   
>   static bool compress_init(struct compress *c)
> +{
> +	if (pool_init(&c->pool, ALLOW_FAIL))
> +		return false;
> +
> +	return true;

return pool_init() == 0 ?
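
i.e. a minimal sketch, using the names from the patch:

	static bool compress_init(struct compress *c)
	{
		return pool_init(&c->pool, ALLOW_FAIL) == 0;
	}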

> +}
> +
> +static bool compress_start(struct compress *c)
>   {
>   	return true;
>   }
> @@ -346,14 +419,12 @@ static int compress_page(struct compress *c,
>   			 void *src,
>   			 struct drm_i915_error_object *dst)
>   {
> -	unsigned long page;
>   	void *ptr;
>   
> -	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> -	if (!page)
> +	ptr = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
> +	if (!ptr)
>   		return -ENOMEM;
>   
> -	ptr = (void *)page;
>   	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
>   		memcpy(ptr, src, PAGE_SIZE);
>   	dst->pages[dst->page_count++] = ptr;
> @@ -367,11 +438,15 @@ static int compress_flush(struct compress *c,
>   	return 0;
>   }
>   
> -static void compress_fini(struct compress *c,
> -			  struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
>   {
>   }
>   
> +static void compress_fini(struct compress *c)
> +{
> +	pool_fini(&c->pool);
> +}
> +
>   static void err_compression_marker(struct drm_i915_error_state_buf *m)
>   {
>   	err_puts(m, "~");
> @@ -379,36 +454,6 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>   
>   #endif
>   
> -static void print_error_buffers(struct drm_i915_error_state_buf *m,
> -				const char *name,
> -				struct drm_i915_error_buffer *err,
> -				int count)
> -{
> -	err_printf(m, "%s [%d]:\n", name, count);
> -
> -	while (count--) {
> -		err_printf(m, "    %08x_%08x %8u %02x %02x",
> -			   upper_32_bits(err->gtt_offset),
> -			   lower_32_bits(err->gtt_offset),
> -			   err->size,
> -			   err->read_domains,
> -			   err->write_domain);
> -		err_puts(m, tiling_flag(err->tiling));
> -		err_puts(m, dirty_flag(err->dirty));
> -		err_puts(m, purgeable_flag(err->purgeable));
> -		err_puts(m, err->userptr ? " userptr" : "");
> -		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
> -
> -		if (err->name)
> -			err_printf(m, " (name: %d)", err->name);
> -		if (err->fence_reg != I915_FENCE_REG_NONE)
> -			err_printf(m, " (fence: %d)", err->fence_reg);
> -
> -		err_puts(m, "\n");
> -		err++;
> -	}
> -}

So no active and pinned bos any more.

Can you put in the commit message what data is gone and what remains? And
some notes on how that affects the usefulness of the error state.

Regards,

Tvrtko

> -
>   static void error_print_instdone(struct drm_i915_error_state_buf *m,
>   				 const struct drm_i915_error_engine *ee)
>   {
> @@ -734,33 +779,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>   			error_print_engine(m, &error->engine[i], error->epoch);
>   	}
>   
> -	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
> -		char buf[128];
> -		int len, first = 1;
> -
> -		if (!error->active_vm[i])
> -			break;
> -
> -		len = scnprintf(buf, sizeof(buf), "Active (");
> -		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
> -			if (error->engine[j].vm != error->active_vm[i])
> -				continue;
> -
> -			len += scnprintf(buf + len, sizeof(buf), "%s%s",
> -					 first ? "" : ", ",
> -					 m->i915->engine[j]->name);
> -			first = 0;
> -		}
> -		scnprintf(buf + len, sizeof(buf), ")");
> -		print_error_buffers(m, buf,
> -				    error->active_bo[i],
> -				    error->active_bo_count[i]);
> -	}
> -
> -	print_error_buffers(m, "Pinned (global)",
> -			    error->pinned_bo,
> -			    error->pinned_bo_count);
> -
>   	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>   		const struct drm_i915_error_engine *ee = &error->engine[i];
>   
> @@ -974,10 +992,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
>   		kfree(ee->requests);
>   	}
>   
> -	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
> -		kfree(error->active_bo[i]);
> -	kfree(error->pinned_bo);
> -
>   	kfree(error->overlay);
>   	kfree(error->display);
>   
> @@ -990,12 +1004,12 @@ void __i915_gpu_state_free(struct kref *error_ref)
>   
>   static struct drm_i915_error_object *
>   i915_error_object_create(struct drm_i915_private *i915,
> -			 struct i915_vma *vma)
> +			 struct i915_vma *vma,
> +			 struct compress *compress)
>   {
>   	struct i915_ggtt *ggtt = &i915->ggtt;
>   	const u64 slot = ggtt->error_capture.start;
>   	struct drm_i915_error_object *dst;
> -	struct compress compress;
>   	unsigned long num_pages;
>   	struct sgt_iter iter;
>   	dma_addr_t dma;
> @@ -1006,22 +1020,21 @@ i915_error_object_create(struct drm_i915_private *i915,
>   
>   	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
>   	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
> -	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
> -		      GFP_ATOMIC | __GFP_NOWARN);
> +	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ATOMIC_MAYFAIL);
>   	if (!dst)
>   		return NULL;
>   
> +	if (!compress_start(compress)) {
> +		kfree(dst);
> +		return NULL;
> +	}
> +
>   	dst->gtt_offset = vma->node.start;
>   	dst->gtt_size = vma->node.size;
>   	dst->num_pages = num_pages;
>   	dst->page_count = 0;
>   	dst->unused = 0;
>   
> -	if (!compress_init(&compress)) {
> -		kfree(dst);
> -		return NULL;
> -	}
> -
>   	ret = -EINVAL;
>   	for_each_sgt_dma(dma, iter, vma->pages) {
>   		void __iomem *s;
> @@ -1029,69 +1042,23 @@ i915_error_object_create(struct drm_i915_private *i915,
>   		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
>   
>   		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
> -		ret = compress_page(&compress, (void  __force *)s, dst);
> +		ret = compress_page(compress, (void  __force *)s, dst);
>   		io_mapping_unmap_atomic(s);
>   		if (ret)
>   			break;
>   	}
>   
> -	if (ret || compress_flush(&compress, dst)) {
> +	if (ret || compress_flush(compress, dst)) {
>   		while (dst->page_count--)
> -			free_page((unsigned long)dst->pages[dst->page_count]);
> +			pool_free(&compress->pool, dst->pages[dst->page_count]);
>   		kfree(dst);
>   		dst = NULL;
>   	}
> +	compress_finish(compress);
>   
> -	compress_fini(&compress, dst);
>   	return dst;
>   }
>   
> -static void capture_bo(struct drm_i915_error_buffer *err,
> -		       struct i915_vma *vma)
> -{
> -	struct drm_i915_gem_object *obj = vma->obj;
> -
> -	err->size = obj->base.size;
> -	err->name = obj->base.name;
> -
> -	err->gtt_offset = vma->node.start;
> -	err->read_domains = obj->read_domains;
> -	err->write_domain = obj->write_domain;
> -	err->fence_reg = vma->fence ? vma->fence->id : -1;
> -	err->tiling = i915_gem_object_get_tiling(obj);
> -	err->dirty = obj->mm.dirty;
> -	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
> -	err->userptr = obj->userptr.mm != NULL;
> -	err->cache_level = obj->cache_level;
> -}
> -
> -static u32 capture_error_bo(struct drm_i915_error_buffer *err,
> -			    int count, struct list_head *head,
> -			    unsigned int flags)
> -#define ACTIVE_ONLY BIT(0)
> -#define PINNED_ONLY BIT(1)
> -{
> -	struct i915_vma *vma;
> -	int i = 0;
> -
> -	list_for_each_entry(vma, head, vm_link) {
> -		if (!vma->obj)
> -			continue;
> -
> -		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
> -			continue;
> -
> -		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
> -			continue;
> -
> -		capture_bo(err++, vma);
> -		if (++i == count)
> -			break;
> -	}
> -
> -	return i;
> -}
> -
>   /*
>    * Generate a semi-unique error code. The code is not meant to have meaning, The
>    * code's only purpose is to try to prevent false duplicated bug reports by
> @@ -1281,7 +1248,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
>   	if (!count)
>   		return;
>   
> -	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
> +	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
>   	if (!ee->requests)
>   		return;
>   
> @@ -1349,8 +1316,10 @@ static void record_context(struct drm_i915_error_context *e,
>   	e->active = atomic_read(&ctx->active_count);
>   }
>   
> -static void request_record_user_bo(struct i915_request *request,
> -				   struct drm_i915_error_engine *ee)
> +static void
> +request_record_user_bo(struct i915_request *request,
> +		       struct drm_i915_error_engine *ee,
> +		       struct compress *compress)
>   {
>   	struct i915_capture_list *c;
>   	struct drm_i915_error_object **bo;
> @@ -1362,18 +1331,20 @@ static void request_record_user_bo(struct i915_request *request,
>   	if (!max)
>   		return;
>   
> -	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> +	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
>   	if (!bo) {
>   		/* If we can't capture everything, try to capture something. */
>   		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
> -		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> +		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
>   	}
>   	if (!bo)
>   		return;
>   
>   	count = 0;
>   	for (c = request->capture_list; c; c = c->next) {
> -		bo[count] = i915_error_object_create(request->i915, c->vma);
> +		bo[count] = i915_error_object_create(request->i915,
> +						     c->vma,
> +						     compress);
>   		if (!bo[count])
>   			break;
>   		if (++count == max)
> @@ -1386,7 +1357,8 @@ static void request_record_user_bo(struct i915_request *request,
>   
>   static struct drm_i915_error_object *
>   capture_object(struct drm_i915_private *dev_priv,
> -	       struct drm_i915_gem_object *obj)
> +	       struct drm_i915_gem_object *obj,
> +	       struct compress *compress)
>   {
>   	if (obj && i915_gem_object_has_pages(obj)) {
>   		struct i915_vma fake = {
> @@ -1396,13 +1368,14 @@ capture_object(struct drm_i915_private *dev_priv,
>   			.obj = obj,
>   		};
>   
> -		return i915_error_object_create(dev_priv, &fake);
> +		return i915_error_object_create(dev_priv, &fake, compress);
>   	} else {
>   		return NULL;
>   	}
>   }
>   
> -static void gem_record_rings(struct i915_gpu_state *error)
> +static void
> +gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
>   {
>   	struct drm_i915_private *i915 = error->i915;
>   	int i;
> @@ -1420,6 +1393,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   
>   		ee->engine_id = i;
>   
> +		/* Refill our page pool before entering atomic section */
> +		pool_refill(&compress->pool, ALLOW_FAIL);
> +
>   		error_record_engine_registers(error, engine, ee);
>   		error_record_engine_execlists(engine, ee);
>   
> @@ -1429,8 +1405,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			struct i915_gem_context *ctx = request->gem_context;
>   			struct intel_ring *ring = request->ring;
>   
> -			ee->vm = ctx->vm ?: &engine->gt->ggtt->vm;
> -
>   			record_context(&ee->context, ctx);
>   
>   			/* We need to copy these to an anonymous buffer
> @@ -1438,17 +1412,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			 * by userspace.
>   			 */
>   			ee->batchbuffer =
> -				i915_error_object_create(i915, request->batch);
> +				i915_error_object_create(i915,
> +							 request->batch,
> +							 compress);
>   
>   			if (HAS_BROKEN_CS_TLB(i915))
>   				ee->wa_batchbuffer =
>   				  i915_error_object_create(i915,
> -							   engine->gt->scratch);
> -			request_record_user_bo(request, ee);
> +							   engine->gt->scratch,
> +							   compress);
> +			request_record_user_bo(request, ee, compress);
>   
>   			ee->ctx =
>   				i915_error_object_create(i915,
> -							 request->hw_context->state);
> +							 request->hw_context->state,
> +							 compress);
>   
>   			error->simulated |=
>   				i915_gem_context_no_error_capture(ctx);
> @@ -1460,7 +1438,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			ee->cpu_ring_head = ring->head;
>   			ee->cpu_ring_tail = ring->tail;
>   			ee->ringbuffer =
> -				i915_error_object_create(i915, ring->vma);
> +				i915_error_object_create(i915,
> +							 ring->vma,
> +							 compress);
>   
>   			engine_record_requests(engine, request, ee);
>   		}
> @@ -1468,89 +1448,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   
>   		ee->hws_page =
>   			i915_error_object_create(i915,
> -						 engine->status_page.vma);
> -
> -		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
> -
> -		ee->default_state = capture_object(i915, engine->default_state);
> -	}
> -}
> -
> -static void gem_capture_vm(struct i915_gpu_state *error,
> -			   struct i915_address_space *vm,
> -			   int idx)
> -{
> -	struct drm_i915_error_buffer *active_bo;
> -	struct i915_vma *vma;
> -	int count;
> -
> -	count = 0;
> -	list_for_each_entry(vma, &vm->bound_list, vm_link)
> -		if (i915_vma_is_active(vma))
> -			count++;
> -
> -	active_bo = NULL;
> -	if (count)
> -		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
> -	if (active_bo)
> -		count = capture_error_bo(active_bo,
> -					 count, &vm->bound_list,
> -					 ACTIVE_ONLY);
> -	else
> -		count = 0;
> +						 engine->status_page.vma,
> +						 compress);
>   
> -	error->active_vm[idx] = vm;
> -	error->active_bo[idx] = active_bo;
> -	error->active_bo_count[idx] = count;
> -}
> -
> -static void capture_active_buffers(struct i915_gpu_state *error)
> -{
> -	int cnt = 0, i, j;
> -
> -	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
> -	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
> -	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
> -
> -	/* Scan each engine looking for unique active contexts/vm */
> -	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> -		struct drm_i915_error_engine *ee = &error->engine[i];
> -		bool found;
> -
> -		if (!ee->vm)
> -			continue;
> +		ee->wa_ctx =
> +			i915_error_object_create(i915,
> +						 engine->wa_ctx.vma,
> +						 compress);
>   
> -		found = false;
> -		for (j = 0; j < i && !found; j++)
> -			found = error->engine[j].vm == ee->vm;
> -		if (!found)
> -			gem_capture_vm(error, ee->vm, cnt++);
> +		ee->default_state =
> +			capture_object(i915, engine->default_state, compress);
>   	}
>   }
>   
> -static void capture_pinned_buffers(struct i915_gpu_state *error)
> -{
> -	struct i915_address_space *vm = &error->i915->ggtt.vm;
> -	struct drm_i915_error_buffer *bo;
> -	struct i915_vma *vma;
> -	int count;
> -
> -	count = 0;
> -	list_for_each_entry(vma, &vm->bound_list, vm_link)
> -		count++;
> -
> -	bo = NULL;
> -	if (count)
> -		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
> -	if (!bo)
> -		return;
> -
> -	error->pinned_bo_count =
> -		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
> -	error->pinned_bo = bo;
> -}
> -
> -static void capture_uc_state(struct i915_gpu_state *error)
> +static void
> +capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
>   {
>   	struct drm_i915_private *i915 = error->i915;
>   	struct i915_error_uc *error_uc = &error->uc;
> @@ -1567,9 +1479,11 @@ static void capture_uc_state(struct i915_gpu_state *error)
>   	 * As modparams are generally accesible from the userspace make
>   	 * explicit copies of the firmware paths.
>   	 */
> -	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, GFP_ATOMIC);
> -	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, GFP_ATOMIC);
> -	error_uc->guc_log = i915_error_object_create(i915, uc->guc.log.vma);
> +	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
> +	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
> +	error_uc->guc_log = i915_error_object_create(i915,
> +						     uc->guc.log.vma,
> +						     compress);
>   }
>   
>   /* Capture all registers which don't fit into another category. */
> @@ -1753,56 +1667,53 @@ static void capture_finish(struct i915_gpu_state *error)
>   	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
>   }
>   
> -static int capture(void *data)
> -{
> -	struct i915_gpu_state *error = data;
> -
> -	error->time = ktime_get_real();
> -	error->boottime = ktime_get_boottime();
> -	error->uptime = ktime_sub(ktime_get(),
> -				  error->i915->gt.last_init_time);
> -	error->capture = jiffies;
> -
> -	capture_params(error);
> -	capture_gen_state(error);
> -	capture_uc_state(error);
> -	capture_reg_state(error);
> -	gem_record_fences(error);
> -	gem_record_rings(error);
> -	capture_active_buffers(error);
> -	capture_pinned_buffers(error);
> -
> -	error->overlay = intel_overlay_capture_error_state(error->i915);
> -	error->display = intel_display_capture_error_state(error->i915);
> -
> -	error->epoch = capture_find_epoch(error);
> -
> -	capture_finish(error);
> -	return 0;
> -}
> -
>   #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
>   
>   struct i915_gpu_state *
>   i915_capture_gpu_state(struct drm_i915_private *i915)
>   {
>   	struct i915_gpu_state *error;
> +	struct compress compress;
>   
>   	/* Check if GPU capture has been disabled */
>   	error = READ_ONCE(i915->gpu_error.first_error);
>   	if (IS_ERR(error))
>   		return error;
>   
> -	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> +	error = kzalloc(sizeof(*error), ALLOW_FAIL);
>   	if (!error) {
>   		i915_disable_error_state(i915, -ENOMEM);
>   		return ERR_PTR(-ENOMEM);
>   	}
>   
> +	if (!compress_init(&compress)) {
> +		kfree(error);
> +		i915_disable_error_state(i915, -ENOMEM);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>   	kref_init(&error->ref);
>   	error->i915 = i915;
>   
> -	stop_machine(capture, error, NULL);
> +	error->time = ktime_get_real();
> +	error->boottime = ktime_get_boottime();
> +	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
> +	error->capture = jiffies;
> +
> +	capture_params(error);
> +	capture_gen_state(error);
> +	capture_uc_state(error, &compress);
> +	capture_reg_state(error);
> +	gem_record_fences(error);
> +	gem_record_rings(error, &compress);
> +
> +	error->overlay = intel_overlay_capture_error_state(i915);
> +	error->display = intel_display_capture_error_state(i915);
> +
> +	error->epoch = capture_find_epoch(error);
> +
> +	capture_finish(error);
> +	compress_fini(&compress);
>   
>   	return error;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 85f06bc5da05..a24c35107d16 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -85,7 +85,6 @@ struct i915_gpu_state {
>   		/* Software tracked state */
>   		bool idle;
>   		unsigned long hangcheck_timestamp;
> -		struct i915_address_space *vm;
>   		int num_requests;
>   		u32 reset_count;
>   
> @@ -161,22 +160,6 @@ struct i915_gpu_state {
>   		} vm_info;
>   	} engine[I915_NUM_ENGINES];
>   
> -	struct drm_i915_error_buffer {
> -		u32 size;
> -		u32 name;
> -		u64 gtt_offset;
> -		u32 read_domains;
> -		u32 write_domain;
> -		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
> -		u32 tiling:2;
> -		u32 dirty:1;
> -		u32 purgeable:1;
> -		u32 userptr:1;
> -		u32 cache_level:3;
> -	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
> -	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
> -	struct i915_address_space *active_vm[I915_NUM_ENGINES];
> -
>   	struct scatterlist *sgl, *fit;
>   };
>   
> 