[PATCH 01/45] drm/i915: Use memcpy_from_wc for GPU error capture
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Dec 6 12:02:50 UTC 2016
On 06/12/2016 00:12, Chris Wilson wrote:
> On all platforms we now always read the contents of buffers via the GTT,
> i.e. using WC cpu access. Reads are slow, but they can be accelerated
> with an internal read buffer using sse4.1 (movntqda). This is our
> i915_memcpy_from_wc() routine which also checks for sse4.1 support and
> so we can fallback to using a regular slow memcpy if we need to.
>
> When compressing the pages, the reads are currently done inside zlib's
> fill_window() routine and so we must copy the page into a temporary
> which is then already inside the CPU cache and fast for zlib's
> compression. When not compressing the pages, we don't need a temporary
> and can just use the accelerated read from WC into the destination.
I suppose this is not very performance interesting so just a cool factor. :)
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gpu_error.c | 75 +++++++++++++++++++++--------------
> 1 file changed, 46 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a14f7badc337..b5d1a9947a94 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -176,33 +176,42 @@ static void i915_error_puts(struct drm_i915_error_state_buf *e,
>
> #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>
> -static bool compress_init(struct z_stream_s *zstream)
> +struct compress {
> + struct z_stream_s zstream;
> + void *tmp;
> +};
> +
> +static bool compress_init(struct compress *c)
> {
> - memset(zstream, 0, sizeof(*zstream));
> + c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> + memset(&c->zstream, 0, sizeof(c->zstream));
>
> - zstream->workspace =
> + c->zstream.workspace =
> kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> GFP_ATOMIC | __GFP_NOWARN);
> - if (!zstream->workspace)
> + if (!c->zstream.workspace)
> return false;
>
> - if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> - kfree(zstream->workspace);
> + if (zlib_deflateInit(&c->zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> + kfree(c->zstream.workspace);
> return false;
> }
>
> return true;
> }
>
> -static int compress_page(struct z_stream_s *zstream,
> +static int compress_page(struct compress *c,
> void *src,
> struct drm_i915_error_object *dst)
> {
zstream local for a smaller diff? And probably more readable code.
> - zstream->next_in = src;
> - zstream->avail_in = PAGE_SIZE;
> + if (!i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
> + memcpy(c->tmp, src, PAGE_SIZE);
if (c->tmp && !i915_memcpy_from_wc())
But also, if i915_memcpy_from_wc is not available we could skip the
memcpy and just set next_in to src? Or this is a separate part of the
perf opt, I mean faster than letting zlib read from the mapping directly?
> +
> + c->zstream.next_in = c->tmp;
> + c->zstream.avail_in = PAGE_SIZE;
>
> do {
> - if (zstream->avail_out == 0) {
> + if (c->zstream.avail_out == 0) {
> unsigned long page;
>
> page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> @@ -211,31 +220,33 @@ static int compress_page(struct z_stream_s *zstream,
>
> dst->pages[dst->page_count++] = (void *)page;
>
> - zstream->next_out = (void *)page;
> - zstream->avail_out = PAGE_SIZE;
> + c->zstream.next_out = (void *)page;
> + c->zstream.avail_out = PAGE_SIZE;
> }
>
> - if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK)
> + if (zlib_deflate(&c->zstream, Z_SYNC_FLUSH) != Z_OK)
> return -EIO;
> - } while (zstream->avail_in);
> + } while (c->zstream.avail_in);
>
> /* Fallback to uncompressed if we increase size? */
> - if (0 && zstream->total_out > zstream->total_in)
> + if (0 && c->zstream.total_out > c->zstream.total_in)
> return -E2BIG;
Uh oh! :) TODO/FIXME or cannot happen?
>
> return 0;
> }
>
> -static void compress_fini(struct z_stream_s *zstream,
> +static void compress_fini(struct compress *c,
> struct drm_i915_error_object *dst)
> {
zstream local again would be good I think.
> if (dst) {
> - zlib_deflate(zstream, Z_FINISH);
> - dst->unused = zstream->avail_out;
> + zlib_deflate(&c->zstream, Z_FINISH);
> + dst->unused = c->zstream.avail_out;
> }
>
> - zlib_deflateEnd(zstream);
> - kfree(zstream->workspace);
> + zlib_deflateEnd(&c->zstream);
> + kfree(c->zstream.workspace);
> +
> + free_page((unsigned long)c->tmp);
> }
>
> static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -245,28 +256,34 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>
> #else
>
> -static bool compress_init(struct z_stream_s *zstream)
> +struct compress {
> +};
> +
> +static bool compress_init(struct compress *c)
> {
> return true;
> }
>
> -static int compress_page(struct z_stream_s *zstream,
> +static int compress_page(struct compress *c,
> void *src,
> struct drm_i915_error_object *dst)
> {
> unsigned long page;
> + void *ptr;
>
> page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> if (!page)
> return -ENOMEM;
>
> - dst->pages[dst->page_count++] =
> - memcpy((void *)page, src, PAGE_SIZE);
> + ptr = (void *)page;
> + if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
> + memcpy(ptr, src, PAGE_SIZE);
> + dst->pages[dst->page_count++] = ptr;
>
> return 0;
> }
>
> -static void compress_fini(struct z_stream_s *zstream,
> +static void compress_fini(struct compress *c,
> struct drm_i915_error_object *dst)
> {
> }
> @@ -784,7 +801,7 @@ i915_error_object_create(struct drm_i915_private *i915,
> struct i915_ggtt *ggtt = &i915->ggtt;
> const u64 slot = ggtt->error_capture.start;
> struct drm_i915_error_object *dst;
> - struct z_stream_s zstream;
> + struct compress compress;
> unsigned long num_pages;
> struct sgt_iter iter;
> dma_addr_t dma;
> @@ -804,7 +821,7 @@ i915_error_object_create(struct drm_i915_private *i915,
> dst->page_count = 0;
> dst->unused = 0;
>
> - if (!compress_init(&zstream)) {
> + if (!compress_init(&compress)) {
> kfree(dst);
> return NULL;
> }
> @@ -817,7 +834,7 @@ i915_error_object_create(struct drm_i915_private *i915,
> I915_CACHE_NONE, 0);
>
> s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
> - ret = compress_page(&zstream, (void __force *)s, dst);
> + ret = compress_page(&compress, (void __force *)s, dst);
> io_mapping_unmap_atomic(s);
>
> if (ret)
> @@ -832,7 +849,7 @@ i915_error_object_create(struct drm_i915_private *i915,
> dst = NULL;
>
> out:
> - compress_fini(&zstream, dst);
> + compress_fini(&compress, dst);
> ggtt->base.clear_range(&ggtt->base, slot, PAGE_SIZE);
> return dst;
> }
>
No other observations.
Regards,
Tvrtko
More information about the Intel-gfx-trybot
mailing list