[Mesa-dev] [PATCH 3/3] swr: Add path to draw directly from client memory without copy.

Rowley, Timothy O timothy.o.rowley at intel.com
Wed Jul 12 19:49:01 UTC 2017


> On Jul 11, 2017, at 8:20 PM, Bruce Cherniak <bruce.cherniak at intel.com> wrote:
> 
> If the size of the client memory copy is too large, don't copy.  The draw
> will access the user buffer directly and then block.  This is faster and
> more efficient than queuing many large client draws.
> 
> Applications that use large draws from client arrays benefit from this.
> VMD is an example.
> 
> The threshold for this path defaults to 32 KB (32768 bytes).  This value
> can be overridden by setting the environment variable SWR_CLIENT_COPY_LIMIT.
> ---
> src/gallium/drivers/swr/swr_context.h  |  1 +
> src/gallium/drivers/swr/swr_draw.cpp   |  9 +++++++++
> src/gallium/drivers/swr/swr_screen.cpp | 10 +++++++++
> src/gallium/drivers/swr/swr_screen.h   |  2 ++
> src/gallium/drivers/swr/swr_state.cpp  | 37 ++++++++++++++++++++++++----------
> 5 files changed, 48 insertions(+), 11 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
> index 3ff4bf3e2f..ab3057af96 100644
> --- a/src/gallium/drivers/swr/swr_context.h
> +++ b/src/gallium/drivers/swr/swr_context.h
> @@ -51,6 +51,7 @@
> #define SWR_NEW_FRAMEBUFFER (1 << 15)
> #define SWR_NEW_CLIP (1 << 16)
> #define SWR_NEW_SO (1 << 17)
> +#define SWR_LARGE_CLIENT_DRAW (1<<18) // Indicates client draw will block
> 
> namespace std
> {
> diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp
> index f26b8e873c..cbd1558624 100644
> --- a/src/gallium/drivers/swr/swr_draw.cpp
> +++ b/src/gallium/drivers/swr/swr_draw.cpp
> @@ -188,6 +188,15 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
>                        info->instance_count,
>                        info->start,
>                        info->start_instance);
> +
> +   /* On large client-buffer draw, we used client buffer directly, without
> +    * copy.  Block until draw is finished.
> +    * VMD is an example application that benefits from this. */
> +   if (ctx->dirty & SWR_LARGE_CLIENT_DRAW) {
> +      struct swr_screen *screen = swr_screen(pipe->screen);
> +      swr_fence_submit(ctx, screen->flush_fence);
> +      swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
> +   }
> }
> 
> 
> diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
> index 9b3897ce6b..8be09697e6 100644
> --- a/src/gallium/drivers/swr/swr_screen.cpp
> +++ b/src/gallium/drivers/swr/swr_screen.cpp
> @@ -1066,6 +1066,16 @@ swr_destroy_screen(struct pipe_screen *p_screen)
> static void
> swr_validate_env_options(struct swr_screen *screen)
> {
> +   /* The client_copy_limit sets a maximum on the amount of user-buffer memory
> +    * copied to scratch space on a draw.  Past this, the draw will access
> +    * user-buffer directly and then block.  This is faster than queuing many
> +    * large client draws. */
> +   screen->client_copy_limit = 32768;
> +   int client_copy_limit =
> +      debug_get_num_option("SWR_CLIENT_COPY_LIMIT", 32768);

Could you move the default value (32768, which currently appears twice in this hunk) into a macro defined at the top of the file, so it is defined in one place and can be easily spotted in the future?

> +   if (client_copy_limit > 0)
> +      screen->client_copy_limit = client_copy_limit;
> +
>    /* XXX msaa under development, disable by default for now */
>    screen->msaa_max_count = 0; /* was SWR_MAX_NUM_MULTISAMPLES; */
> 
> diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
> index dc1bb47f02..6d6d1cb87d 100644
> --- a/src/gallium/drivers/swr/swr_screen.h
> +++ b/src/gallium/drivers/swr/swr_screen.h
> @@ -43,8 +43,10 @@ struct swr_screen {
> 
>    struct sw_winsys *winsys;
> 
> +   /* Configurable environment settings */
>    boolean msaa_force_enable;
>    uint8_t msaa_max_count;
> +   uint32_t client_copy_limit;
> 
>    HANDLE hJitMgr;
> };
> diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
> index 45c9c213e5..6c406a37ec 100644
> --- a/src/gallium/drivers/swr/swr_state.cpp
> +++ b/src/gallium/drivers/swr/swr_state.cpp
> @@ -1267,12 +1267,20 @@ swr_update_derived(struct pipe_context *pipe,
>             partial_inbounds = 0;
>             min_vertex_index = info.min_index;
> 
> -            /* Copy only needed vertices to scratch space */
>             size = AlignUp(size, 4);
> -            const void *ptr = (const uint8_t *) vb->buffer.user + base;
> -            ptr = (uint8_t *)swr_copy_to_scratch_space(
> -               ctx, &ctx->scratch->vertex_buffer, ptr, size);
> -            p_data = (const uint8_t *)ptr - base;
> +            /* If size of client memory copy is too large, don't copy. The
> +             * draw will access user-buffer directly and then block.  This is
> +             * faster than queuing many large client draws. */
> +            if (size >= screen->client_copy_limit) {
> +               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
> +               p_data = (const uint8_t *) vb->buffer.user;
> +            } else {
> +               /* Copy only needed vertices to scratch space */
> +               const void *ptr = (const uint8_t *) vb->buffer.user + base;
> +               ptr = (uint8_t *)swr_copy_to_scratch_space(
> +                     ctx, &ctx->scratch->vertex_buffer, ptr, size);
> +               p_data = (const uint8_t *)ptr - base;
> +            }
>          }
> 
>          swrVertexBuffers[i] = {0};
> @@ -1311,12 +1319,19 @@ swr_update_derived(struct pipe_context *pipe,
> 
>             size = info.count * pitch;
>             size = AlignUp(size, 4);
> -
> -            /* Copy indices to scratch space */
> -            const void *ptr = info.index.user;
> -            ptr = swr_copy_to_scratch_space(
> -               ctx, &ctx->scratch->index_buffer, ptr, size);
> -            p_data = (const uint8_t *)ptr;
> +            /* If size of client memory copy is too large, don't copy. The
> +             * draw will access user-buffer directly and then block.  This is
> +             * faster than queuing many large client draws. */
> +            if (size >= screen->client_copy_limit) {
> +               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
> +               p_data = (const uint8_t *) info.index.user;
> +            } else {
> +               /* Copy indices to scratch space */
> +               const void *ptr = info.index.user;
> +               ptr = swr_copy_to_scratch_space(
> +                     ctx, &ctx->scratch->index_buffer, ptr, size);
> +               p_data = (const uint8_t *)ptr;
> +            }
>          }
> 
>          SWR_INDEX_BUFFER_STATE swrIndexBuffer;
> -- 
> 2.11.0
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev



More information about the mesa-dev mailing list