[Mesa-dev] [PATCH 12/88] i965: add initial implementation of on disk shader cache

Mon Sep 26 02:43:30 UTC 2016

On Saturday, September 24, 2016 3:24:53 PM PDT Timothy Arceri wrote:
> This uses the recently-added cache.c to write out the final linked
> binary for vertex and fragment shader programs.
> 
> This is based off the initial implementation done by Carl.
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources   |   1 +
>  src/mesa/drivers/dri/i965/brw_shader_cache.c | 390 +++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_state.h        |   7 +
>  3 files changed, 398 insertions(+)
>  create mode 100644 src/mesa/drivers/dri/i965/brw_shader_cache.c
> 
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index df90cb4..bd2bd37 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -147,6 +147,7 @@ i965_FILES = \
>  	brw_sf_emit.c \
>  	brw_sf.h \
>  	brw_sf_state.c \
> +	brw_shader_cache.cpp \
>  	brw_state_batch.c \
>  	brw_state_cache.c \
>  	brw_state_dump.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_shader_cache.c b/src/mesa/drivers/dri/i965/brw_shader_cache.c
> new file mode 100644
> index 0000000..aba45b6
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_shader_cache.c
> @@ -0,0 +1,390 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include <util/macros.h>
> +#include <util/mesa-sha1.h>
> +#include <main/mtypes.h>
> +#include <compiler/glsl/glsl_parser_extras.h>
> +#include <compiler/glsl/ir_uniform.h>
> +#include <compiler/glsl/cache.h>
> +#include <compiler/glsl/blob.h>
> +
> +#include "brw_state.h"
> +#include "brw_wm.h"
> +#include "brw_vs.h"
> +#include "brw_context.h"
> +
> +static void
> +gen_vs_sha1(struct brw_context *brw, struct gl_shader_program *prog,
> +            struct brw_vs_prog_key *vs_key, unsigned char *vs_sha1)
> +{
> +   char sha1_buf[41];
> +   unsigned char sha1[20];
> +   char manifest[256];
> +   int offset = 0;
> +
> +   offset += snprintf(manifest, sizeof(manifest), "program: %s\n",
> +                      _mesa_sha1_format(sha1_buf, prog->sha1));
> +
> +   _mesa_sha1_compute(vs_key, sizeof *vs_key, sha1);
> +   offset += snprintf(manifest + offset, sizeof(manifest) - offset,
> +                      "vs_key: %s\n", _mesa_sha1_format(sha1_buf, sha1));
> +
> +   _mesa_sha1_compute(manifest, strlen(manifest), vs_sha1);
> +}

The VS/TCS/TES/GS code is basically identical...you could avoid a lot of
duplication by doing...

/*
 * Compute the cache key for a single shader stage's binary.
 *
 * Builds a short text manifest -- the program's SHA-1 line followed by a
 * "<STAGE>_key: <sha1>" line for the stage's program key -- and hashes that
 * manifest into out_sha1.
 *
 * brw      - driver context (unused here; kept for parity with the
 *            per-stage gen_*_sha1 functions this is meant to replace).
 * prog     - linked program whose SHA-1 seeds the manifest.
 * stage    - shader stage the key belongs to.
 * key      - the stage's program key; the caller must populate it first.
 * out_sha1 - receives the resulting digest (assumed to be a 20-byte
 *            buffer, like the local sha1 scratch buffer below).
 *
 * format_program_sha1() and key_size() are helpers proposed together with
 * this sketch and not shown here: the former is assumed to write the
 * "program: <sha1>\n" line into manifest and advance offset, the latter to
 * return the size of the key struct for the given stage.
 */
static void
gen_shader_sha1(struct brw_context *brw, struct gl_shader_program *prog,
                unsigned stage, void *key, unsigned char *out_sha1)
{
   char sha1_buf[41];
   unsigned char sha1[20];
   char manifest[256];
   int offset = 0;

   format_program_sha1(prog, manifest, sizeof(manifest), &offset);

   /* Hash the raw key bytes, then record the digest in the manifest. */
   _mesa_sha1_compute(key, key_size(stage), sha1);
   offset += snprintf(manifest + offset, sizeof(manifest) - offset,
                      "%s_key: %s\n",
                      _mesa_shader_stage_to_abbrev(stage),
                      _mesa_sha1_format(sha1_buf, sha1));

   /* The digest of the whole manifest is the cache key for this stage. */
   _mesa_sha1_compute(manifest, strlen(manifest), out_sha1);
}

assuming you move the key initialization for TCS/TES/GS to the caller,
which would make it more consistent with the VS anyway.  (Here, key_size
is a helper function that returns sizeof(struct brw_vs_prog_key) etc.
for the given stage.)

(Also assuming you're OK with using "VS_key" rather than "vs_key"...)

> +
> +static void
> +gen_wm_sha1(struct brw_context *brw, struct gl_shader_program *prog,
> +            struct brw_vs_prog_key *vs_key, struct brw_wm_prog_key *wm_key,
> +            unsigned char *wm_sha1)
> +{
> +   char sha1_buf[41];
> +   unsigned char sha1[20];
> +   char manifest[256];
> +   int offset = 0;
> +
> +   offset += snprintf(manifest, sizeof(manifest), "program: %s\n",
> +                      _mesa_sha1_format(sha1_buf, prog->sha1));
> +
> +   brw_wm_populate_key(brw, wm_key);
> +   _mesa_sha1_compute(wm_key, sizeof *wm_key, sha1);
> +   offset += snprintf(manifest + offset, sizeof(manifest) - offset,
> +                      "wm_key: %s\n", _mesa_sha1_format(sha1_buf, sha1));
> +
> +   _mesa_sha1_compute(manifest, strlen(manifest), wm_sha1);
> +
> +}

I don't know why this function is (eventually) monkeying around with the
VUE map coming out of the GS stage based on VS outputs written.  I can't
imagine it works once you're caching TES/GS...

> +
> +static void
> +load_program_data(struct gl_shader_program *prog, struct blob_reader *binary,
> +                  struct brw_stage_prog_data *prog_data,
> +                  gl_shader_stage stage, struct gl_context *ctx)
> +{
> +   static const gl_constant_value zero = { 0 };
> +
> +   intptr_t parameter_values_base = blob_read_intptr(binary);
> +   intptr_t uniform_data_slots_base = blob_read_intptr(binary);
> +
> +   uint32_t nr_params = blob_read_uint32(binary);
> +   assert(nr_params == prog_data->nr_params);
> +
> +   prog_data->param = rzalloc_array(NULL, const gl_constant_value *,
> +                                    nr_params);
> +   if (ctx->_Shader->Flags & GLSL_CACHE_INFO) {
> +      fprintf(stderr, "Allocating %d prog_data->params (%p)\n",
> +              prog_data->nr_params, prog_data->param);
> +   }
> +
> +   for (unsigned i = 0; i < nr_params; i++) {
> +      intptr_t param = blob_read_intptr(binary);
> +      ptrdiff_t p_offset, u_offset;
> +      struct gl_program_parameter_list *param_list =
> +         prog->_LinkedShaders[stage]->Program->Parameters;
> +
> +      p_offset = (param - parameter_values_base) / sizeof(gl_constant_value);
> +      u_offset = (param - uniform_data_slots_base) / sizeof(gl_constant_value);
> +      
> +      if (p_offset >= 0 && p_offset < 4 * param_list->NumParameters) {
> +         prog_data->param[i] =
> +            ((gl_constant_value *) param_list->ParameterValues) + p_offset;
> +      } else if (u_offset >= 0 && u_offset < prog->NumUniformDataSlots) {
> +         prog_data->param[i] = prog->UniformDataSlots + u_offset;
> +      } else {
> +         prog_data->param[i] = &zero;
> +      }
> +   }
> +
> +   uint32_t nr_pull_params = blob_read_uint32(binary);
> +   assert(nr_pull_params == prog_data->nr_pull_params);
> +
> +   prog_data->pull_param = rzalloc_array(NULL, const gl_constant_value *,
> +                                         nr_pull_params);
> +
> +   for (unsigned i = 0; i < nr_pull_params; i++) {
> +      intptr_t pull_param = blob_read_intptr(binary);
> +      /* FIXME: We need to fixup pull_params pointers here. */
> +   }
> +
> +}
> +
> +static void
> +upload_cached_vs(struct brw_context *brw, struct blob_reader *binary,
> +                 struct gl_shader_program *prog,
> +                 struct brw_vs_prog_key *vs_key)
> +{
> +   struct brw_vs_prog_data *vs_prog_data;
> +   struct brw_stage_prog_data *prog_data;
> +
> +   /* Read VS program from blob. */
> +   size_t vs_program_size = blob_read_uint32(binary);
> +   uint8_t *vs_program = blob_read_bytes(binary, vs_program_size);
> +
> +   /* Read VS program_data from blob and fixup params pointers. */
> +   size_t vs_prog_data_size = blob_read_uint32(binary);
> +   assert(vs_prog_data_size == sizeof *vs_prog_data);
> +
> +   vs_prog_data = blob_read_bytes(binary, vs_prog_data_size);
> +   prog_data = &vs_prog_data->base.base;
> +
> +   load_program_data(prog, binary, prog_data, MESA_SHADER_VERTEX, &brw->ctx);
> +
> +   struct brw_vertex_program *vp =
> +      (struct brw_vertex_program *)brw->vertex_program;
> +   brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
> +                    vs_key, sizeof(struct brw_vs_prog_key),
> +                    vs_program, vs_program_size,
> +                    vs_prog_data, vs_prog_data_size,
> +                    &brw->vs.base.prog_offset, &brw->vs.prog_data, vp);
> +}
> +
> +static void
> +upload_cached_wm(struct brw_context *brw, struct blob_reader *binary,
> +                 struct gl_shader_program *prog,
> +                 struct brw_wm_prog_key *wm_key)
> +{
> +   struct brw_wm_prog_data *wm_prog_data;
> +   struct brw_stage_prog_data *prog_data;
> +
> +   /* Read WM program from blob. */
> +   size_t wm_program_size = blob_read_uint32(binary);
> +   uint8_t *wm_program = blob_read_bytes(binary, wm_program_size);
> +
> +   /* Read WM program_data from blob and fixup params pointers. */
> +   size_t wm_prog_data_size = blob_read_uint32(binary);
> +   assert(wm_prog_data_size == sizeof *wm_prog_data);
> +
> +   wm_prog_data = blob_read_bytes(binary, wm_prog_data_size);
> +   prog_data = &wm_prog_data->base;
> +
> +   load_program_data(prog, binary, prog_data, MESA_SHADER_FRAGMENT,
> +                     &brw->ctx);
> +
> +   struct brw_fragment_program *wp =
> +      (struct brw_fragment_program *)brw->fragment_program;
> +   brw_upload_cache(&brw->cache, BRW_CACHE_FS_PROG,
> +                    wm_key, sizeof(struct brw_wm_prog_key),
> +                    wm_program, wm_program_size,
> +                    wm_prog_data, wm_prog_data_size,
> +                    &brw->wm.base.prog_offset, &brw->wm.prog_data, wp);
> +}

It seems like you could do a similar treatment here...again,
upload_cached_{vs,tcs,tes,gs,wm} are basically identical...making a
more general function might even allow you to drop some switches in
the caller...

> +
> +void
> +upload_cached_program(struct brw_context *brw, gl_shader_stage stage)
> +{
> +   char sha1_buf[41];
> +   unsigned char binary_sha1[20];
> +   size_t size;
> +   uint8_t *buffer;
> +   struct blob_reader binary;
> +   struct gl_shader_program *prog;
> +   struct brw_wm_prog_key wm_key;
> +   struct brw_vs_prog_key vs_key;
> +
> +   struct program_cache *cache = brw->ctx.Cache;
> +   if (cache == NULL)
> +      return;
> +
> +   prog = brw->ctx.Shader.ActiveProgram;
> +   if (prog == NULL)
> +      return;
> +
> +   brw_vs_populate_key(brw, &vs_key);
> +   switch (stage) {
> +   case MESA_SHADER_VERTEX:
> +      gen_vs_sha1(brw, prog, &vs_key, binary_sha1);
> +      break;
> +   case MESA_SHADER_FRAGMENT:
> +      gen_wm_sha1(brw, prog, &vs_key, &wm_key, binary_sha1);
> +      break;
> +   }
> +
> +   buffer = cache_get(cache, binary_sha1, &size);
> +   if (buffer == NULL)
> +      goto FAIL;
> +
> +   if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> +      fprintf(stderr, "attempting to populate bo cache with binary: %s\n",
> +              _mesa_sha1_format(sha1_buf, binary_sha1));
> +   }
> +
> +   blob_reader_init(&binary, buffer, size);
> +
> +   switch (stage) {
> +   case MESA_SHADER_VERTEX:
> +      upload_cached_vs(brw, &binary, prog, &vs_key);
> +      break;
> +   case MESA_SHADER_FRAGMENT:
> +      upload_cached_wm(brw, &binary, prog, &wm_key);
> +      break;
> +   }
> +
> +   if (binary.current != binary.end || binary.overrun) {
> +      if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> +         fprintf(stderr, "Error reading program from cache (did not read "
> +                 "every byte written)\n");
> +      }
> +      goto FAIL;
> +   }
> +
> +   if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> +      fprintf(stderr, "%s: Successfully read every byte written!\n",
> +              __FUNCTION__);
> +   }
> +   prog->program_written_to_cache = true;
> +
> +   free(buffer);
> +   return;
> +
> +FAIL:
> +   /*FIXME: Fall back and compile from source here. */
> +   prog->program_written_to_cache = false;
> +   free(buffer);
> +}
> +
> +static void
> +write_program_data(struct gl_shader_program *prog, struct blob *binary,
> +                   struct brw_stage_prog_data *prog_data,
> +                   gl_shader_stage stage)
> +{
> +   /* Include variable-length params from end of brw_stage_prog_data as well.
> +    *
> +    * Before writing either of the params or pull_params arrays, we first
> +    * write out the addresses of the ParameterValues and UniformDataSlots
> +    * storage. The pointers within params will be pointers to within one of
> +    * these blocks of storage. So we can use the addresses of this storage
> +    * together with the pointer values to correctly construct pointers to the
> +    * actual storage when the program data is loaded from the cache.
> +    */
> +   blob_write_intptr(binary,
> +                     (intptr_t) prog->_LinkedShaders[stage]->
> +                      Program->Parameters->ParameterValues);
> +
> +   blob_write_intptr(binary, (intptr_t) prog->UniformDataSlots);
> +
> +   blob_write_uint32(binary, prog_data->nr_params);
> +
> +   for (unsigned i = 0; i < prog_data->nr_params; i++) {
> +      blob_write_intptr(binary, (intptr_t) prog_data->param[i]);
> +   }
> +
> +   blob_write_uint32(binary, prog_data->nr_pull_params);
> +   for (unsigned i = 0; i < prog_data->nr_pull_params; i++) {
> +      blob_write_intptr(binary, (intptr_t) prog_data->pull_param[i]);
> +   }
> +}
> +
> +void
> +write_cached_program(struct brw_context *brw)
> +{
> +   struct blob *binary;
> +   uint8_t *blob_cursor;
> +   size_t program_size;
> +   struct gl_shader_program *prog;
> +   struct program_cache *cache;
> +   char buf[41];
> +
> +   cache = brw->ctx.Cache;
> +   if (cache == NULL)
> +      return;
> +
> +   prog = brw->ctx.Shader.ActiveProgram;
> +   if (prog == NULL)
> +      return;
> +
> +   if (prog->program_written_to_cache)
> +      return;
> +
> +   struct brw_vs_prog_key vs_key;
> +   brw_vs_populate_key(brw, &vs_key);
> +
> +   if (prog->_LinkedShaders[MESA_SHADER_VERTEX]) {
> +      unsigned char vs_sha1[20];
> +
> +      binary = blob_create (NULL);
> +      if (binary == NULL)
> +         return;
> +
> +      gen_vs_sha1(brw, prog, &vs_key, vs_sha1);
> +
> +      /* Write VS program to blob. */
> +      program_size = brw->vs.prog_data->program_size;
> +
> +      blob_write_uint32(binary, program_size);
> +
> +      blob_cursor = blob_reserve_bytes(binary, program_size);
> +      drm_intel_bo_get_subdata(brw->cache.bo, brw->vs.base.prog_offset,
> +                               program_size, blob_cursor);
> +
> +      /* Write VS program_data to blob. */
> +      blob_write_uint32(binary, sizeof *brw->vs.prog_data);
> +      blob_write_bytes(binary, brw->vs.prog_data, sizeof *brw->vs.prog_data);
> +
> +      write_program_data(prog, binary, &brw->vs.prog_data->base.base,
> +                         MESA_SHADER_VERTEX);
> +
> +      if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> +         fprintf(stderr, "putting binary in cache: %s\n",
> +                 _mesa_sha1_format(buf, vs_sha1));
> +      }
> +
> +      cache_put(cache, vs_sha1, binary->data, binary->size);
> +      ralloc_free (binary);
> +   }
> +
> +   if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) {
> +      struct brw_wm_prog_key wm_key;
> +      unsigned char wm_sha1[20];
> +
> +      binary = blob_create (NULL);
> +      if (binary == NULL)
> +         return;
> +
> +      gen_wm_sha1(brw, prog, &vs_key, &wm_key, wm_sha1);
> +
> +      /* Write WM program to blob. */
> +      program_size = brw->wm.prog_data->program_size;
> +
> +      blob_write_uint32(binary, program_size);
> +
> +      blob_cursor = blob_reserve_bytes(binary, program_size);
> +      drm_intel_bo_get_subdata(brw->cache.bo, brw->wm.base.prog_offset,
> +                               program_size, blob_cursor);
> +
> +      /* Write WM program_data to blob. */
> +      blob_write_uint32(binary, sizeof *brw->wm.prog_data);
> +      blob_write_bytes(binary, brw->wm.prog_data, sizeof *brw->wm.prog_data);
> +
> +      write_program_data(prog, binary, &brw->wm.prog_data->base,
> +                         MESA_SHADER_FRAGMENT);
> +
> +      if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> +         fprintf(stderr, "putting binary in cache: %s\n",
> +                 _mesa_sha1_format(buf, wm_sha1));
> +      }
> +
> +      cache_put(cache, wm_sha1, binary->data, binary->size);
> +      ralloc_free (binary);
> +   }

Likewise...this code is screaming for a helper function.  It's basically
five copies of the same thing...

> +
> +   prog->program_written_to_cache = true;
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index aba9508..2a11a55 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -196,6 +196,13 @@ void brw_upload_state_base_address(struct brw_context *brw);
>  void gen8_write_pma_stall_bits(struct brw_context *brw,
>                                 uint32_t pma_stall_bits);
>  
> +/* brw_shader_cache.h */
> +void
> +upload_cached_program(struct brw_context *brw, gl_shader_stage stage);
> +
> +void
> +write_cached_program(struct brw_context *brw);
> +
>  /***********************************************************************
>   * brw_state.c
>   */
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160925/879839c3/attachment.sig>