[Intel-gfx] [PATCH 11/20] drm/i915: Rely on spinlock protection for GPU error capture
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Jul 22 13:40:33 UTC 2019
On 18/07/2019 08:00, Chris Wilson wrote:
> Trust that we now have adequate protection over the low level structures
> via the engine->active.lock to allow ourselves to capture the GPU error
> state without the heavy hammer of stop_machine(). Sadly this does mean
> that we have to forgo some of the lesser used information (not derived
> from the active state) that is not controlled by the active locks. This
> includes the list of buffers in the ppGTT and pinned globally in the
> GGTT. Originally this was used to manually verify relocations, but
> hasn't been required for some time, and modern Mesa now has the habit of
> ensuring that all interesting buffers within a batch are captured in their
> entirety (that is, the auxiliary state buffers, but not the textures).
>
> A useful side-effect is that this allows us to restore error capturing
> for Braswell and Broxton.
>
> v2: Use pagevec for a typical random number
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_gtt.c | 5 -
> drivers/gpu/drm/i915/i915_gpu_error.c | 499 +++++++++++---------------
> drivers/gpu/drm/i915/i915_gpu_error.h | 17 -
> 3 files changed, 203 insertions(+), 318 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 8f33d3e52b9e..500f3cdcc4b3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2966,11 +2966,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
> ggtt->vm.insert_page = bxt_vtd_ggtt_insert_page__BKL;
> if (ggtt->vm.clear_range != nop_clear_range)
> ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
> -
> - /* Prevent recursively calling stop_machine() and deadlocks. */
> - dev_info(dev_priv->drm.dev,
> - "Disabling error capture for VT-d workaround\n");
> - i915_disable_error_state(dev_priv, -ENODEV);
> }
>
> ggtt->invalidate = gen6_ggtt_invalidate;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 24835be300bc..131a52c428de 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -29,8 +29,8 @@
>
> #include <linux/ascii85.h>
> #include <linux/nmi.h>
> +#include <linux/pagevec.h>
> #include <linux/scatterlist.h>
> -#include <linux/stop_machine.h>
> #include <linux/utsname.h>
> #include <linux/zlib.h>
>
> @@ -46,6 +46,9 @@
> #include "i915_scatterlist.h"
> #include "intel_csr.h"
>
> +#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> +#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
> +
> static inline const struct intel_engine_cs *
> engine_lookup(const struct drm_i915_private *i915, unsigned int id)
> {
> @@ -67,26 +70,6 @@ engine_name(const struct drm_i915_private *i915, unsigned int id)
> return __engine_name(engine_lookup(i915, id));
> }
>
> -static const char *tiling_flag(int tiling)
> -{
> - switch (tiling) {
> - default:
> - case I915_TILING_NONE: return "";
> - case I915_TILING_X: return " X";
> - case I915_TILING_Y: return " Y";
> - }
> -}
> -
> -static const char *dirty_flag(int dirty)
> -{
> - return dirty ? " dirty" : "";
> -}
> -
> -static const char *purgeable_flag(int purgeable)
> -{
> - return purgeable ? " purgeable" : "";
> -}
> -
> static void __sg_set_buf(struct scatterlist *sg,
> void *addr, unsigned int len, loff_t it)
> {
> @@ -114,7 +97,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
> if (e->cur == e->end) {
> struct scatterlist *sgl;
>
> - sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
> + sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
> if (!sgl) {
> e->err = -ENOMEM;
> return false;
> @@ -134,7 +117,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
> }
>
> e->size = ALIGN(len + 1, SZ_64K);
> - e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
> + e->buf = kmalloc(e->size, ALLOW_FAIL);
> if (!e->buf) {
> e->size = PAGE_ALIGN(len + 1);
> e->buf = kmalloc(e->size, GFP_KERNEL);
> @@ -211,47 +194,125 @@ i915_error_printer(struct drm_i915_error_state_buf *e)
> return p;
> }
>
> +/* single threaded page allocator with a reserved stash for emergencies */
> +static void *__alloc_page(gfp_t gfp)
> +{
> + return (void *)__get_free_page(gfp);
> +}
> +
> +static void pool_fini(struct pagevec *pv)
> +{
> + pagevec_release(pv);
> +}
> +
> +static int pool_refill(struct pagevec *pv, gfp_t gfp)
> +{
> + while (pagevec_space(pv)) {
> + struct page *p;
> +
> + p = alloc_page(gfp);
> + if (!p)
> + return -ENOMEM;
> +
> + pagevec_add(pv, p);
> + }
> +
> + return 0;
> +}
> +
> +static int pool_init(struct pagevec *pv, gfp_t gfp)
> +{
> + int err;
> +
> + pagevec_init(pv);
> +
> + err = pool_refill(pv, gfp);
> + if (err)
> + pool_fini(pv);
> +
> + return err;
> +}
> +
> +static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
> +{
> + void *ptr;
> +
> + ptr = __alloc_page(gfp);
> + if (ptr)
> + return ptr;
> +
> + if (!pagevec_count(pv))
> + return NULL;
> +
> + return page_address(pv->pages[--pv->nr]);
> +}
> +
> +static void pool_free(struct pagevec *pv, void *addr)
> +{
> + struct page *p = virt_to_page(addr);
> +
> + if (pagevec_space(pv)) {
> + pv->pages[pv->nr++] = p;
This open-codes pagevec_add(pv, p) — better to call it directly.
> + return;
> + }
> +
> + __free_pages(p, 0);
An if-else would probably be more readable than the if-return here. It
could also use __free_page(p) to avoid the trailing ", 0".
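I.e. something like this (untested sketch, same behaviour):

	static void pool_free(struct pagevec *pv, void *addr)
	{
		struct page *p = virt_to_page(addr);

		/* stash the page for reuse if there is room, else free it */
		if (pagevec_space(pv))
			pagevec_add(pv, p);
		else
			__free_page(p);
	}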
> +}
> +
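As an aside, the resulting allocation pattern is pleasingly simple.
Roughly, paraphrasing the call sites later in this patch:

	/* before entering the atomic section: top up the emergency stash */
	pool_refill(&compress->pool, ALLOW_FAIL);

	/* in atomic context: try the page allocator, fall back to the stash */
	ptr = pool_alloc(&compress->pool, ATOMIC_MAYFAIL);

	/* hand pages back to the stash (freed outright if it is full) */
	pool_free(&compress->pool, ptr);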
> #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>
> struct compress {
> + struct pagevec pool;
> struct z_stream_s zstream;
> void *tmp;
> };
>
> static bool compress_init(struct compress *c)
> {
> - struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
> + struct z_stream_s *zstream = &c->zstream;
>
> - zstream->workspace =
> - kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> - GFP_ATOMIC | __GFP_NOWARN);
> - if (!zstream->workspace)
> + if (pool_init(&c->pool, ALLOW_FAIL))
> return false;
>
> - if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> - kfree(zstream->workspace);
> + zstream->workspace =
> + kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> + ALLOW_FAIL);
> + if (!zstream->workspace) {
> + pool_fini(&c->pool);
> return false;
> }
>
> c->tmp = NULL;
> if (i915_has_memcpy_from_wc())
> - c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> + c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
>
> return true;
> }
>
> -static void *compress_next_page(struct drm_i915_error_object *dst)
> +static bool compress_start(struct compress *c)
> {
> - unsigned long page;
> + struct z_stream_s *zstream = &c->zstream;
> + void *workspace = zstream->workspace;
> +
> + memset(zstream, 0, sizeof(*zstream));
> + zstream->workspace = workspace;
> +
> + return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
> +}
> +
> +static void *compress_next_page(struct compress *c,
> + struct drm_i915_error_object *dst)
> +{
> + void *page;
>
> if (dst->page_count >= dst->num_pages)
> return ERR_PTR(-ENOSPC);
>
> - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> + page = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
> if (!page)
> return ERR_PTR(-ENOMEM);
>
> - return dst->pages[dst->page_count++] = (void *)page;
> + return dst->pages[dst->page_count++] = page;
> }
>
> static int compress_page(struct compress *c,
> @@ -267,7 +328,7 @@ static int compress_page(struct compress *c,
>
> do {
> if (zstream->avail_out == 0) {
> - zstream->next_out = compress_next_page(dst);
> + zstream->next_out = compress_next_page(c, dst);
> if (IS_ERR(zstream->next_out))
> return PTR_ERR(zstream->next_out);
>
> @@ -295,7 +356,7 @@ static int compress_flush(struct compress *c,
> do {
> switch (zlib_deflate(zstream, Z_FINISH)) {
> case Z_OK: /* more space requested */
> - zstream->next_out = compress_next_page(dst);
> + zstream->next_out = compress_next_page(c, dst);
> if (IS_ERR(zstream->next_out))
> return PTR_ERR(zstream->next_out);
>
> @@ -316,15 +377,17 @@ static int compress_flush(struct compress *c,
> return 0;
> }
>
> -static void compress_fini(struct compress *c,
> - struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
> {
> - struct z_stream_s *zstream = &c->zstream;
> + zlib_deflateEnd(&c->zstream);
> +}
>
> - zlib_deflateEnd(zstream);
> - kfree(zstream->workspace);
> +static void compress_fini(struct compress *c)
> +{
> + kfree(c->zstream.workspace);
> if (c->tmp)
> - free_page((unsigned long)c->tmp);
> + pool_free(&c->pool, c->tmp);
> + pool_fini(&c->pool);
> }
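For my own notes, the zlib state lifecycle now splits cleanly into
sleeping and atomic halves — sketching the call sequence as I read it
from this patch:

	/* once per error capture, may sleep: zlib workspace + page pool */
	compress_init(&compress);

	/* per object, from i915_error_object_create(): */
	compress_start(&compress);          /* re-init the zstream */
	compress_page(&compress, src, dst); /* repeated for each page */
	compress_flush(&compress, dst);
	compress_finish(&compress);         /* zlib_deflateEnd() */

	/* once at the end: free workspace, release stashed pages */
	compress_fini(&compress);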
>
> static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -335,9 +398,15 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
> #else
>
> struct compress {
> + struct pagevec pool;
> };
>
> static bool compress_init(struct compress *c)
> +{
> + return pool_init(&c->pool, ALLOW_FAIL) == 0;
> +}
> +
> +static bool compress_start(struct compress *c)
> {
> return true;
> }
> @@ -346,14 +415,12 @@ static int compress_page(struct compress *c,
> void *src,
> struct drm_i915_error_object *dst)
> {
> - unsigned long page;
> void *ptr;
>
> - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> - if (!page)
> + ptr = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
> + if (!ptr)
> return -ENOMEM;
>
> - ptr = (void *)page;
> if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
> memcpy(ptr, src, PAGE_SIZE);
> dst->pages[dst->page_count++] = ptr;
> @@ -367,9 +434,13 @@ static int compress_flush(struct compress *c,
> return 0;
> }
>
> -static void compress_fini(struct compress *c,
> - struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
> +{
> +}
> +
> +static void compress_fini(struct compress *c)
> {
> + pool_fini(&c->pool);
> }
>
> static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -379,36 +450,6 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>
> #endif
>
> -static void print_error_buffers(struct drm_i915_error_state_buf *m,
> - const char *name,
> - struct drm_i915_error_buffer *err,
> - int count)
> -{
> - err_printf(m, "%s [%d]:\n", name, count);
> -
> - while (count--) {
> - err_printf(m, " %08x_%08x %8u %02x %02x",
> - upper_32_bits(err->gtt_offset),
> - lower_32_bits(err->gtt_offset),
> - err->size,
> - err->read_domains,
> - err->write_domain);
> - err_puts(m, tiling_flag(err->tiling));
> - err_puts(m, dirty_flag(err->dirty));
> - err_puts(m, purgeable_flag(err->purgeable));
> - err_puts(m, err->userptr ? " userptr" : "");
> - err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
> -
> - if (err->name)
> - err_printf(m, " (name: %d)", err->name);
> - if (err->fence_reg != I915_FENCE_REG_NONE)
> - err_printf(m, " (fence: %d)", err->fence_reg);
> -
> - err_puts(m, "\n");
> - err++;
> - }
> -}
> -
> static void error_print_instdone(struct drm_i915_error_state_buf *m,
> const struct drm_i915_error_engine *ee)
> {
> @@ -734,33 +775,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
> error_print_engine(m, &error->engine[i], error->epoch);
> }
>
> - for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
> - char buf[128];
> - int len, first = 1;
> -
> - if (!error->active_vm[i])
> - break;
> -
> - len = scnprintf(buf, sizeof(buf), "Active (");
> - for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
> - if (error->engine[j].vm != error->active_vm[i])
> - continue;
> -
> - len += scnprintf(buf + len, sizeof(buf), "%s%s",
> - first ? "" : ", ",
> - m->i915->engine[j]->name);
> - first = 0;
> - }
> - scnprintf(buf + len, sizeof(buf), ")");
> - print_error_buffers(m, buf,
> - error->active_bo[i],
> - error->active_bo_count[i]);
> - }
> -
> - print_error_buffers(m, "Pinned (global)",
> - error->pinned_bo,
> - error->pinned_bo_count);
> -
> for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> const struct drm_i915_error_engine *ee = &error->engine[i];
>
> @@ -974,10 +988,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
> kfree(ee->requests);
> }
>
> - for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
> - kfree(error->active_bo[i]);
> - kfree(error->pinned_bo);
> -
> kfree(error->overlay);
> kfree(error->display);
>
> @@ -990,12 +1000,12 @@ void __i915_gpu_state_free(struct kref *error_ref)
>
> static struct drm_i915_error_object *
> i915_error_object_create(struct drm_i915_private *i915,
> - struct i915_vma *vma)
> + struct i915_vma *vma,
> + struct compress *compress)
> {
> struct i915_ggtt *ggtt = &i915->ggtt;
> const u64 slot = ggtt->error_capture.start;
> struct drm_i915_error_object *dst;
> - struct compress compress;
> unsigned long num_pages;
> struct sgt_iter iter;
> dma_addr_t dma;
> @@ -1006,22 +1016,21 @@ i915_error_object_create(struct drm_i915_private *i915,
>
> num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
> num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
> - dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
> - GFP_ATOMIC | __GFP_NOWARN);
> + dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ATOMIC_MAYFAIL);
> if (!dst)
> return NULL;
>
> + if (!compress_start(compress)) {
> + kfree(dst);
> + return NULL;
> + }
> +
> dst->gtt_offset = vma->node.start;
> dst->gtt_size = vma->node.size;
> dst->num_pages = num_pages;
> dst->page_count = 0;
> dst->unused = 0;
>
> - if (!compress_init(&compress)) {
> - kfree(dst);
> - return NULL;
> - }
> -
> ret = -EINVAL;
> for_each_sgt_dma(dma, iter, vma->pages) {
> void __iomem *s;
> @@ -1029,69 +1038,23 @@ i915_error_object_create(struct drm_i915_private *i915,
> ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
>
> s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
> - ret = compress_page(&compress, (void __force *)s, dst);
> + ret = compress_page(compress, (void __force *)s, dst);
> io_mapping_unmap_atomic(s);
> if (ret)
> break;
> }
>
> - if (ret || compress_flush(&compress, dst)) {
> + if (ret || compress_flush(compress, dst)) {
> while (dst->page_count--)
> - free_page((unsigned long)dst->pages[dst->page_count]);
> + pool_free(&compress->pool, dst->pages[dst->page_count]);
> kfree(dst);
> dst = NULL;
> }
> + compress_finish(compress);
>
> - compress_fini(&compress, dst);
> return dst;
> }
>
> -static void capture_bo(struct drm_i915_error_buffer *err,
> - struct i915_vma *vma)
> -{
> - struct drm_i915_gem_object *obj = vma->obj;
> -
> - err->size = obj->base.size;
> - err->name = obj->base.name;
> -
> - err->gtt_offset = vma->node.start;
> - err->read_domains = obj->read_domains;
> - err->write_domain = obj->write_domain;
> - err->fence_reg = vma->fence ? vma->fence->id : -1;
> - err->tiling = i915_gem_object_get_tiling(obj);
> - err->dirty = obj->mm.dirty;
> - err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
> - err->userptr = obj->userptr.mm != NULL;
> - err->cache_level = obj->cache_level;
> -}
> -
> -static u32 capture_error_bo(struct drm_i915_error_buffer *err,
> - int count, struct list_head *head,
> - unsigned int flags)
> -#define ACTIVE_ONLY BIT(0)
> -#define PINNED_ONLY BIT(1)
> -{
> - struct i915_vma *vma;
> - int i = 0;
> -
> - list_for_each_entry(vma, head, vm_link) {
> - if (!vma->obj)
> - continue;
> -
> - if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
> - continue;
> -
> - if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
> - continue;
> -
> - capture_bo(err++, vma);
> - if (++i == count)
> - break;
> - }
> -
> - return i;
> -}
> -
> /*
> * Generate a semi-unique error code. The code is not meant to have meaning, The
> * code's only purpose is to try to prevent false duplicated bug reports by
> @@ -1281,7 +1244,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
> if (!count)
> return;
>
> - ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
> + ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
> if (!ee->requests)
> return;
>
> @@ -1349,8 +1312,10 @@ static void record_context(struct drm_i915_error_context *e,
> e->active = atomic_read(&ctx->active_count);
> }
>
> -static void request_record_user_bo(struct i915_request *request,
> - struct drm_i915_error_engine *ee)
> +static void
> +request_record_user_bo(struct i915_request *request,
> + struct drm_i915_error_engine *ee,
> + struct compress *compress)
> {
> struct i915_capture_list *c;
> struct drm_i915_error_object **bo;
> @@ -1362,18 +1327,20 @@ static void request_record_user_bo(struct i915_request *request,
> if (!max)
> return;
>
> - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> + bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
> if (!bo) {
> /* If we can't capture everything, try to capture something. */
> max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
> - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> + bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
> }
> if (!bo)
> return;
>
> count = 0;
> for (c = request->capture_list; c; c = c->next) {
> - bo[count] = i915_error_object_create(request->i915, c->vma);
> + bo[count] = i915_error_object_create(request->i915,
> + c->vma,
> + compress);
> if (!bo[count])
> break;
> if (++count == max)
> @@ -1386,7 +1353,8 @@ static void request_record_user_bo(struct i915_request *request,
>
> static struct drm_i915_error_object *
> capture_object(struct drm_i915_private *dev_priv,
> - struct drm_i915_gem_object *obj)
> + struct drm_i915_gem_object *obj,
> + struct compress *compress)
> {
> if (obj && i915_gem_object_has_pages(obj)) {
> struct i915_vma fake = {
> @@ -1396,13 +1364,14 @@ capture_object(struct drm_i915_private *dev_priv,
> .obj = obj,
> };
>
> - return i915_error_object_create(dev_priv, &fake);
> + return i915_error_object_create(dev_priv, &fake, compress);
> } else {
> return NULL;
> }
> }
>
> -static void gem_record_rings(struct i915_gpu_state *error)
> +static void
> +gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
> {
> struct drm_i915_private *i915 = error->i915;
> int i;
> @@ -1420,6 +1389,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
>
> ee->engine_id = i;
>
> + /* Refill our page pool before entering atomic section */
> + pool_refill(&compress->pool, ALLOW_FAIL);
> +
> error_record_engine_registers(error, engine, ee);
> error_record_engine_execlists(engine, ee);
>
> @@ -1429,8 +1401,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
> struct i915_gem_context *ctx = request->gem_context;
> struct intel_ring *ring = request->ring;
>
> - ee->vm = request->hw_context->vm;
> -
> record_context(&ee->context, ctx);
>
> /* We need to copy these to an anonymous buffer
> @@ -1438,17 +1408,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
> * by userspace.
> */
> ee->batchbuffer =
> - i915_error_object_create(i915, request->batch);
> + i915_error_object_create(i915,
> + request->batch,
> + compress);
>
> if (HAS_BROKEN_CS_TLB(i915))
> ee->wa_batchbuffer =
> i915_error_object_create(i915,
> - engine->gt->scratch);
> - request_record_user_bo(request, ee);
> + engine->gt->scratch,
> + compress);
> + request_record_user_bo(request, ee, compress);
>
> ee->ctx =
> i915_error_object_create(i915,
> - request->hw_context->state);
> + request->hw_context->state,
> + compress);
>
> error->simulated |=
> i915_gem_context_no_error_capture(ctx);
> @@ -1460,7 +1434,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
> ee->cpu_ring_head = ring->head;
> ee->cpu_ring_tail = ring->tail;
> ee->ringbuffer =
> - i915_error_object_create(i915, ring->vma);
> + i915_error_object_create(i915,
> + ring->vma,
> + compress);
>
> engine_record_requests(engine, request, ee);
> }
> @@ -1468,89 +1444,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
>
> ee->hws_page =
> i915_error_object_create(i915,
> - engine->status_page.vma);
> -
> - ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
> -
> - ee->default_state = capture_object(i915, engine->default_state);
> - }
> -}
> -
> -static void gem_capture_vm(struct i915_gpu_state *error,
> - struct i915_address_space *vm,
> - int idx)
> -{
> - struct drm_i915_error_buffer *active_bo;
> - struct i915_vma *vma;
> - int count;
> -
> - count = 0;
> - list_for_each_entry(vma, &vm->bound_list, vm_link)
> - if (i915_vma_is_active(vma))
> - count++;
> -
> - active_bo = NULL;
> - if (count)
> - active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
> - if (active_bo)
> - count = capture_error_bo(active_bo,
> - count, &vm->bound_list,
> - ACTIVE_ONLY);
> - else
> - count = 0;
> + engine->status_page.vma,
> + compress);
>
> - error->active_vm[idx] = vm;
> - error->active_bo[idx] = active_bo;
> - error->active_bo_count[idx] = count;
> -}
> -
> -static void capture_active_buffers(struct i915_gpu_state *error)
> -{
> - int cnt = 0, i, j;
> -
> - BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
> - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
> - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
> -
> - /* Scan each engine looking for unique active contexts/vm */
> - for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> - struct drm_i915_error_engine *ee = &error->engine[i];
> - bool found;
> -
> - if (!ee->vm)
> - continue;
> + ee->wa_ctx =
> + i915_error_object_create(i915,
> + engine->wa_ctx.vma,
> + compress);
>
> - found = false;
> - for (j = 0; j < i && !found; j++)
> - found = error->engine[j].vm == ee->vm;
> - if (!found)
> - gem_capture_vm(error, ee->vm, cnt++);
> + ee->default_state =
> + capture_object(i915, engine->default_state, compress);
> }
> }
>
> -static void capture_pinned_buffers(struct i915_gpu_state *error)
> -{
> - struct i915_address_space *vm = &error->i915->ggtt.vm;
> - struct drm_i915_error_buffer *bo;
> - struct i915_vma *vma;
> - int count;
> -
> - count = 0;
> - list_for_each_entry(vma, &vm->bound_list, vm_link)
> - count++;
> -
> - bo = NULL;
> - if (count)
> - bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
> - if (!bo)
> - return;
> -
> - error->pinned_bo_count =
> - capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
> - error->pinned_bo = bo;
> -}
> -
> -static void capture_uc_state(struct i915_gpu_state *error)
> +static void
> +capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
> {
> struct drm_i915_private *i915 = error->i915;
> struct i915_error_uc *error_uc = &error->uc;
> @@ -1567,9 +1475,11 @@ static void capture_uc_state(struct i915_gpu_state *error)
> * As modparams are generally accesible from the userspace make
> * explicit copies of the firmware paths.
> */
> - error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, GFP_ATOMIC);
> - error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, GFP_ATOMIC);
> - error_uc->guc_log = i915_error_object_create(i915, uc->guc.log.vma);
> + error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
> + error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
> + error_uc->guc_log = i915_error_object_create(i915,
> + uc->guc.log.vma,
> + compress);
> }
>
> /* Capture all registers which don't fit into another category. */
> @@ -1753,56 +1663,53 @@ static void capture_finish(struct i915_gpu_state *error)
> ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
> }
>
> -static int capture(void *data)
> -{
> - struct i915_gpu_state *error = data;
> -
> - error->time = ktime_get_real();
> - error->boottime = ktime_get_boottime();
> - error->uptime = ktime_sub(ktime_get(),
> - error->i915->gt.last_init_time);
> - error->capture = jiffies;
> -
> - capture_params(error);
> - capture_gen_state(error);
> - capture_uc_state(error);
> - capture_reg_state(error);
> - gem_record_fences(error);
> - gem_record_rings(error);
> - capture_active_buffers(error);
> - capture_pinned_buffers(error);
> -
> - error->overlay = intel_overlay_capture_error_state(error->i915);
> - error->display = intel_display_capture_error_state(error->i915);
> -
> - error->epoch = capture_find_epoch(error);
> -
> - capture_finish(error);
> - return 0;
> -}
> -
> #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
>
> struct i915_gpu_state *
> i915_capture_gpu_state(struct drm_i915_private *i915)
> {
> struct i915_gpu_state *error;
> + struct compress compress;
>
> /* Check if GPU capture has been disabled */
> error = READ_ONCE(i915->gpu_error.first_error);
> if (IS_ERR(error))
> return error;
>
> - error = kzalloc(sizeof(*error), GFP_ATOMIC);
> + error = kzalloc(sizeof(*error), ALLOW_FAIL);
> if (!error) {
> i915_disable_error_state(i915, -ENOMEM);
> return ERR_PTR(-ENOMEM);
> }
>
> + if (!compress_init(&compress)) {
> + kfree(error);
> + i915_disable_error_state(i915, -ENOMEM);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> kref_init(&error->ref);
> error->i915 = i915;
>
> - stop_machine(capture, error, NULL);
> + error->time = ktime_get_real();
> + error->boottime = ktime_get_boottime();
> + error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
> + error->capture = jiffies;
> +
> + capture_params(error);
> + capture_gen_state(error);
> + capture_uc_state(error, &compress);
> + capture_reg_state(error);
> + gem_record_fences(error);
> + gem_record_rings(error, &compress);
> +
> + error->overlay = intel_overlay_capture_error_state(i915);
> + error->display = intel_display_capture_error_state(i915);
> +
> + error->epoch = capture_find_epoch(error);
> +
> + capture_finish(error);
> + compress_fini(&compress);
>
> return error;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 85f06bc5da05..a24c35107d16 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -85,7 +85,6 @@ struct i915_gpu_state {
> /* Software tracked state */
> bool idle;
> unsigned long hangcheck_timestamp;
> - struct i915_address_space *vm;
> int num_requests;
> u32 reset_count;
>
> @@ -161,22 +160,6 @@ struct i915_gpu_state {
> } vm_info;
> } engine[I915_NUM_ENGINES];
>
> - struct drm_i915_error_buffer {
> - u32 size;
> - u32 name;
> - u64 gtt_offset;
> - u32 read_domains;
> - u32 write_domain;
> - s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
> - u32 tiling:2;
> - u32 dirty:1;
> - u32 purgeable:1;
> - u32 userptr:1;
> - u32 cache_level:3;
> - } *active_bo[I915_NUM_ENGINES], *pinned_bo;
> - u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
> - struct i915_address_space *active_vm[I915_NUM_ENGINES];
> -
> struct scatterlist *sgl, *fit;
> };
>
>
Otherwise,
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko