[PATCH 01/45] drm/i915: Use memcpy_from_wc for GPU error capture

Tue Dec 6 12:02:50 UTC 2016

On 06/12/2016 00:12, Chris Wilson wrote:
> On all platforms we now always read the contents of buffers via the GTT,
> i.e. using WC cpu access. Reads are slow, but they can be accelerated
> with an internal read buffer using sse4.1 (movntqda). This is our
> i915_memcpy_from_wc() routine which also checks for sse4.1 support and
> so we can fallback to using a regular slow memcpy if we need to.
>
> When compressing the pages, the reads are currently done inside zlib's
> fill_window() routine and so we must copy the page into a temporary
> which is then already inside the CPU cache and fast for zlib's
> compression. When not compressing the pages, we don't need a temporary
> and can just use the accelerated read from WC into the destination.

I suppose this is not very performance interesting so just a cool factor. :)

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gpu_error.c | 75 +++++++++++++++++++++--------------
>  1 file changed, 46 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a14f7badc337..b5d1a9947a94 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -176,33 +176,42 @@ static void i915_error_puts(struct drm_i915_error_state_buf *e,
>
>  #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>
> -static bool compress_init(struct z_stream_s *zstream)
> +struct compress {
> +	struct z_stream_s zstream;
> +	void *tmp;
> +};
> +
> +static bool compress_init(struct compress *c)
>  {
> -	memset(zstream, 0, sizeof(*zstream));
> +	c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> +	memset(&c->zstream, 0, sizeof(c->zstream));
>
> -	zstream->workspace =
> +	c->zstream.workspace =
>  		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
>  			GFP_ATOMIC | __GFP_NOWARN);
> -	if (!zstream->workspace)
> +	if (!c->zstream.workspace)
>  		return false;
>
> -	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> -		kfree(zstream->workspace);
> +	if (zlib_deflateInit(&c->zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> +		kfree(c->zstream.workspace);
>  		return false;
>  	}
>
>  	return true;
>  }
>
> -static int compress_page(struct z_stream_s *zstream,
> +static int compress_page(struct compress *c,
>  			 void *src,
>  			 struct drm_i915_error_object *dst)
>  {

zstream local for a smaller diff? And probably more readable code.

> -	zstream->next_in = src;
> -	zstream->avail_in = PAGE_SIZE;
> +	if (!i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
> +		memcpy(c->tmp, src, PAGE_SIZE);

if (c->tmp && !i915_memcpy_from_wc())

But also, if i915_memcpy_from_wc is not available we could skip the 
memcpy and just set next_in to src? Or this is a separate part of the 
perf opt, I mean faster than letting zlib read from the mapping directly?

> +
> +	c->zstream.next_in = c->tmp;
> +	c->zstream.avail_in = PAGE_SIZE;
>
>  	do {
> -		if (zstream->avail_out == 0) {
> +		if (c->zstream.avail_out == 0) {
>  			unsigned long page;
>
>  			page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> @@ -211,31 +220,33 @@ static int compress_page(struct z_stream_s *zstream,
>
>  			dst->pages[dst->page_count++] = (void *)page;
>
> -			zstream->next_out = (void *)page;
> -			zstream->avail_out = PAGE_SIZE;
> +			c->zstream.next_out = (void *)page;
> +			c->zstream.avail_out = PAGE_SIZE;
>  		}
>
> -		if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK)
> +		if (zlib_deflate(&c->zstream, Z_SYNC_FLUSH) != Z_OK)
>  			return -EIO;
> -	} while (zstream->avail_in);
> +	} while (c->zstream.avail_in);
>
>  	/* Fallback to uncompressed if we increase size? */
> -	if (0 && zstream->total_out > zstream->total_in)
> +	if (0 && c->zstream.total_out > c->zstream.total_in)
>  		return -E2BIG;

Uh oh! :) TODO/FIXME or cannot happen?

>
>  	return 0;
>  }
>
> -static void compress_fini(struct z_stream_s *zstream,
> +static void compress_fini(struct compress *c,
>  			  struct drm_i915_error_object *dst)
>  {

zstream local again would be good I think.

>  	if (dst) {
> -		zlib_deflate(zstream, Z_FINISH);
> -		dst->unused = zstream->avail_out;
> +		zlib_deflate(&c->zstream, Z_FINISH);
> +		dst->unused = c->zstream.avail_out;
>  	}
>
> -	zlib_deflateEnd(zstream);
> -	kfree(zstream->workspace);
> +	zlib_deflateEnd(&c->zstream);
> +	kfree(c->zstream.workspace);
> +
> +	free_page((unsigned long)c->tmp);
>  }
>
>  static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -245,28 +256,34 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>
>  #else
>
> -static bool compress_init(struct z_stream_s *zstream)
> +struct compress {
> +};
> +
> +static bool compress_init(struct compress *c)
>  {
>  	return true;
>  }
>
> -static int compress_page(struct z_stream_s *zstream,
> +static int compress_page(struct compress *c,
>  			 void *src,
>  			 struct drm_i915_error_object *dst)
>  {
>  	unsigned long page;
> +	void *ptr;
>
>  	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
>  	if (!page)
>  		return -ENOMEM;
>
> -	dst->pages[dst->page_count++] =
> -		memcpy((void *)page, src, PAGE_SIZE);
> +	ptr = (void *)page;
> +	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
> +		memcpy(ptr, src, PAGE_SIZE);
> +	dst->pages[dst->page_count++] = ptr;
>
>  	return 0;
>  }
>
> -static void compress_fini(struct z_stream_s *zstream,
> +static void compress_fini(struct compress *c,
>  			  struct drm_i915_error_object *dst)
>  {
>  }
> @@ -784,7 +801,7 @@ i915_error_object_create(struct drm_i915_private *i915,
>  	struct i915_ggtt *ggtt = &i915->ggtt;
>  	const u64 slot = ggtt->error_capture.start;
>  	struct drm_i915_error_object *dst;
> -	struct z_stream_s zstream;
> +	struct compress compress;
>  	unsigned long num_pages;
>  	struct sgt_iter iter;
>  	dma_addr_t dma;
> @@ -804,7 +821,7 @@ i915_error_object_create(struct drm_i915_private *i915,
>  	dst->page_count = 0;
>  	dst->unused = 0;
>
> -	if (!compress_init(&zstream)) {
> +	if (!compress_init(&compress)) {
>  		kfree(dst);
>  		return NULL;
>  	}
> @@ -817,7 +834,7 @@ i915_error_object_create(struct drm_i915_private *i915,
>  				       I915_CACHE_NONE, 0);
>
>  		s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
> -		ret = compress_page(&zstream, (void  __force *)s, dst);
> +		ret = compress_page(&compress, (void  __force *)s, dst);
>  		io_mapping_unmap_atomic(s);
>
>  		if (ret)
> @@ -832,7 +849,7 @@ i915_error_object_create(struct drm_i915_private *i915,
>  	dst = NULL;
>
>  out:
> -	compress_fini(&zstream, dst);
> +	compress_fini(&compress, dst);
>  	ggtt->base.clear_range(&ggtt->base, slot, PAGE_SIZE);
>  	return dst;
>  }
>

No other observations.

Regards,

Tvrtko