[Mesa-dev] [PATCH v3 21/34] i965: add initial implementation of on disk shader cache
Kenneth Graunke
kenneth at whitecape.org
Sun Oct 29 08:02:47 UTC 2017
On Sunday, October 22, 2017 1:01:29 PM PDT Jordan Justen wrote:
> From: Timothy Arceri <timothy.arceri at collabora.com>
>
> This uses the recently-added disk_cache.c to write out the final
> linked binary for vertex and fragment shader programs.
>
> This is based off the initial implementation done by Carl Worth.
>
> v2:
> * Squash 'i965: add image param shader cache support'
> * Squash 'i965: add shader cache support for pull param pointers'
> * Substantially simplified by a rework on top of Jason's 2975e4c56a7a.
> * Rename load_program_data to read_program_data. (Jason)
>
> v3:
> * Simplify and align program read/write. (Jason)
>
> [jordan.l.justen at intel.com: *_cached_program => brw_disk_cache_*_program]
> [jordan.l.justen at intel.com: brw_shader_cache.c => brw_disk_cache.c]
> [jordan.l.justen at intel.com: don't map to write program when LLC is present]
> [jordan.l.justen at intel.com: set program_written_to_cache on read from cache]
> [jordan.l.justen at intel.com: only try cache when status is linking_skipped]
> [jordan.l.justen at intel.com: rework based on uniforms rework 2975e4c56a7a]
> [jordan.l.justen at intel.com: Simplify and align program read/write]
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> ---
> src/mesa/drivers/dri/i965/Makefile.sources | 1 +
> src/mesa/drivers/dri/i965/brw_disk_cache.c | 329 +++++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/brw_state.h | 5 +
> src/mesa/drivers/dri/i965/meson.build | 1 +
> 4 files changed, 336 insertions(+)
> create mode 100644 src/mesa/drivers/dri/i965/brw_disk_cache.c
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 053d89b81ec..2980cdb3c54 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -14,6 +14,7 @@ i965_FILES = \
> brw_cs.h \
> brw_curbe.c \
> brw_defines.h \
> + brw_disk_cache.c \
> brw_draw.c \
> brw_draw.h \
> brw_draw_upload.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_disk_cache.c b/src/mesa/drivers/dri/i965/brw_disk_cache.c
> new file mode 100644
> index 00000000000..186cbe83706
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_disk_cache.c
> @@ -0,0 +1,329 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "compiler/blob.h"
> +#include "compiler/glsl/ir_uniform.h"
> +#include "compiler/glsl/shader_cache.h"
> +#include "main/mtypes.h"
> +#include "util/disk_cache.h"
> +#include "util/macros.h"
> +#include "util/mesa-sha1.h"
> +
> +#include "brw_context.h"
> +#include "brw_state.h"
> +#include "brw_vs.h"
> +#include "brw_wm.h"
> +
> +static void
> +gen_shader_sha1(struct brw_context *brw, struct gl_program *prog,
> + gl_shader_stage stage, void *key, unsigned char *out_sha1)
> +{
> + char sha1_buf[41];
> + unsigned char sha1[20];
> + char manifest[256];
> + int offset = 0;
> +
> + _mesa_sha1_format(sha1_buf, prog->sh.data->sha1);
> + offset += snprintf(manifest, sizeof(manifest), "program: %s\n", sha1_buf);
> +
> + _mesa_sha1_compute(key, brw_prog_key_size(stage), sha1);
> + _mesa_sha1_format(sha1_buf, sha1);
> + offset += snprintf(manifest + offset, sizeof(manifest) - offset,
> + "%s_key: %s\n", _mesa_shader_stage_to_abbrev(stage),
> + sha1_buf);
> +
> + _mesa_sha1_compute(manifest, strlen(manifest), out_sha1);
> +}
> +
> +static void
> +write_blob_program_data(struct blob *binary, const void *program,
> + size_t program_size,
> + struct brw_stage_prog_data *prog_data,
> + size_t prog_data_size)
> +{
> + /* Write program to blob. */
> + blob_write_uint32(binary, program_size);
> + blob_write_bytes(binary, program, program_size);
> +
> + /* Write program_data to blob. */
> + blob_write_uint32(binary, prog_data_size);
> + blob_write_bytes(binary, prog_data, prog_data_size);
> +
> + /* Write push params */
> + blob_write_bytes(binary, prog_data->param,
> + sizeof(uint32_t) * prog_data->nr_params);
> +
> + /* Write pull params */
> + blob_write_bytes(binary, prog_data->pull_param,
> + sizeof(uint32_t) * prog_data->nr_pull_params);
> +}
> +
> +static bool
> +read_blob_program_data(struct blob_reader *binary,
> + struct gl_program *prog, gl_shader_stage stage,
> + const uint8_t **program, size_t *program_size,
> + struct brw_stage_prog_data *prog_data)
> +{
Hmm, I'm kind of surprised to see you writing and reading the program
and prog_data sizes here. If you wrote:
1. prog_data
2. assembly
3. push params
4. pull params
Then you could read it back:
1. Read prog_data:
blob_copy_bytes(binary, prog_data, brw_prog_data_size(stage));
2. Read the assembly:
*program = blob_read_bytes(binary, prog_data->program_size);
3. Read the push params.
4. Read the pull params.
Though, maybe by writing the sizes explicitly you're trying to be more
defensive, and have an extra consistency check?
> + /* Read shader program from blob. */
> + *program_size = blob_read_uint32(binary);
> + *program = blob_read_bytes(binary, *program_size);
I was going to suggest adding more binary->overrun checks here, but...
I guess they're not really necessary. blob already prevents you from
reading too much, so these will just get bogus data...which we'll
happily ignore thanks to the !binary->overrun check at the end.
> +
> + /* Read shader prog_data from blob. */
> + size_t prog_data_size = blob_read_uint32(binary);
> + if (binary->overrun || prog_data_size != brw_prog_data_size(stage))
> + return false;
I don't think you actually need the binary->overrun check here...if you
overran, the size wouldn't match.
> + blob_copy_bytes(binary, prog_data, prog_data_size);
> + if (binary->overrun)
> + return false;
This overrun check is useful: you don't want to accidentally allocate
0 bytes.
> +
> + /* Read push params */
> + prog_data->param = rzalloc_array(NULL, uint32_t, prog_data->nr_params);
> + blob_copy_bytes(binary, prog_data->param,
> + sizeof(uint32_t) * prog_data->nr_params);
> +
> + /* Read pull params */
> + prog_data->pull_param = rzalloc_array(NULL, uint32_t,
> + prog_data->nr_pull_params);
> + blob_copy_bytes(binary, prog_data->pull_param,
> + sizeof(uint32_t) * prog_data->nr_pull_params);
> +
> + return (binary->current == binary->end && !binary->overrun);
> +}
> +
> +static bool
> +read_and_upload(struct brw_context *brw, struct disk_cache *cache,
> + struct gl_program *prog, gl_shader_stage stage)
> +{
> + unsigned char binary_sha1[20];
> +
> + brw_any_prog_key prog_key;
> +
> + switch (stage) {
> + case MESA_SHADER_VERTEX:
> + brw_vs_populate_key(brw, &prog_key.vs);
> + /* We don't care what instance of the program it is for the disk cache
> + * hash lookup, so set the id to 0 for the sha1 hashing.
> + * program_string_id will be set by the SET_UPLOAD_PARAMS macro below.
> + */
> + prog_key.vs.program_string_id = 0;
> + break;
> + case MESA_SHADER_FRAGMENT:
> + brw_wm_populate_key(brw, &prog_key.wm);
> + prog_key.wm.program_string_id = 0;
> + break;
> + default:
> + unreachable("Unsupported stage!");
> + }
> +
> + gen_shader_sha1(brw, prog, stage, &prog_key, binary_sha1);
> +
> + size_t buffer_size;
> + uint8_t *buffer = disk_cache_get(cache, binary_sha1, &buffer_size);
> + if (buffer == NULL) {
> + if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> + char sha1_buf[41];
> + _mesa_sha1_format(sha1_buf, binary_sha1);
> + fprintf(stderr, "No cached %s binary found for: %s\n",
> + _mesa_shader_stage_to_abbrev(stage), sha1_buf);
> + }
> + return false;
> + }
> +
> + if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> + char sha1_buf[41];
> + _mesa_sha1_format(sha1_buf, binary_sha1);
> + fprintf(stderr, "attempting to populate bo cache with binary: %s\n",
> + sha1_buf);
> + }
> +
> + struct blob_reader binary;
> + blob_reader_init(&binary, buffer, buffer_size);
> +
> + size_t program_size;
> + const uint8_t *program;
> + struct brw_stage_prog_data *prog_data =
> + ralloc_size(NULL, sizeof(brw_any_prog_data));
> + if (!read_blob_program_data(&binary, prog, stage, &program, &program_size,
> + prog_data)) {
> + /* Something very bad has gone wrong discard the item from the cache and
> + * rebuild from source.
> + */
> + assert(!"Invalid i965 shader disk cache item!");
This is probably useful for initial debugging, but we might want to drop
it when merging...couldn't this actually trigger due to, say, ext4
randomly truncating your cache files to 0 bytes? We'll still have the
fprintf in that case...
> +
> + if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> + fprintf(stderr, "Error reading program from cache (invalid i965 "
> + "cache item)\n");
> + }
> +
> + disk_cache_remove(cache, binary_sha1);
> + free(buffer);
> + return false;
> + }
> +
> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
> + enum brw_cache_id cache_id;
> + unsigned max_threads;
> + struct brw_stage_state *stage_state;
> +
> + #define SET_UPLOAD_PARAMS(sh, sh_caps, prog) \
> + do { \
> + prog_key.sh.program_string_id = prog->id; \
> + cache_id = BRW_CACHE_##sh_caps##_PROG; \
> + max_threads = devinfo->max_##sh##_threads; \
> + stage_state = &brw->sh.base; \
> + } while(0)
> +
> + switch (stage) {
> + case MESA_SHADER_VERTEX: {
> + struct brw_program *vp = (struct brw_program *) prog;
> + SET_UPLOAD_PARAMS(vs, VS, vp);
> + break;
> + }
> + case MESA_SHADER_FRAGMENT: {
> + struct brw_program *wp = (struct brw_program *) prog;
> + SET_UPLOAD_PARAMS(wm, FS, wp);
> + break;
> + }
> + default:
> + unreachable("Unsupported stage!");
> + }
> +
> + brw_alloc_stage_scratch(brw, stage_state, prog_data->total_scratch,
> + max_threads);
This will severely underallocate scratch for compute shaders.
devinfo->max_cs_threads is the number of threads per subslice.
You need to multiply by the number of subslices as well. See brw_cs.c.
You'll also need to account for the WaCSScratchSize:hsw workaround.
It probably makes sense to refactor brw_alloc_stage_scratch() and
eliminate the last parameter:
brw_alloc_stage_scratch(brw, stage_state, prog_data->total_scratch);
then have it do switch (stage_state->stage) and internally calculate the
number of threads / scratch IDs. That way, you don't need to duplicate
that logic here. I'd do that as a separate patch, before this one in
the series.
> +
> + brw_upload_cache(&brw->cache, cache_id, &prog_key, brw_prog_key_size(stage),
> + program, program_size, prog_data, brw_prog_data_size(stage),
> + &stage_state->prog_offset, &stage_state->prog_data);
> +
> + prog->program_written_to_cache = true;
> +
> + free(buffer);
> +
> + return true;
> +}
> +
> +bool
> +brw_disk_cache_upload_program(struct brw_context *brw, gl_shader_stage stage)
> +{
> + struct disk_cache *cache = brw->ctx.Cache;
> + if (cache == NULL)
> + return false;
> +
> + struct gl_program *prog = brw->ctx._Shader->CurrentProgram[stage];
> + if (prog == NULL)
> + return false;
> +
> + if (prog->sh.data->LinkStatus != linking_skipped)
> + goto FAIL;
> +
> + if (!read_and_upload(brw, cache, prog, stage))
> + goto FAIL;
> +
> + if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> + fprintf(stderr, "read gen program from cache\n");
> + }
> +
> + return true;
> +
> +FAIL:
Let's use lowercase "fail:" - it's more common than "FAIL:".
> + /*FIXME: Fall back and compile from source here. */
> + return false;
> +}
> +
> +static void
> +write_program_data(struct brw_context *brw, struct gl_program *prog,
> + void *key, struct brw_stage_prog_data *prog_data,
> + size_t program_size, size_t prog_data_size,
> + uint32_t prog_offset, struct disk_cache *cache,
> + gl_shader_stage stage)
> +{
> + struct blob binary;
> + blob_init(&binary);
> +
> + const void *program_map;
> + if (brw->screen->devinfo.has_llc) {
> + program_map = brw->cache.map + prog_offset;
> + } else {
> + program_map = brw_bo_map(brw, brw->cache.bo, MAP_READ);
> + if (unlikely(!program_map)) {
> + _mesa_error_no_memory(__func__);
> + return;
> + }
> + program_map += prog_offset;
> + }
> +
> + write_blob_program_data(&binary, program_map, program_size, prog_data,
> + prog_data_size);
> +
> + if (!brw->screen->devinfo.has_llc) {
> + brw_bo_unmap(brw->cache.bo);
> + }
This needs rebasing, non-LLC systems started using persistent mappings
back in July (commit 0044de931f9a82bcf33a4c12851709788361d990).
Fortunately, it's a bit easier now! All you need is:
/* TODO: Use streaming reads on non-LLC platforms? */
const void *program_map = brw->cache.map + prog_offset;
write_blob_program_data(&binary, program_map, program_size, prog_data,
prog_data_size);
No need for mapping/unmapping anymore.
Reading from the program cache is going to be slow (uncached reads) on
non-LLC, though. We could use _mesa_streaming_load_memcpy to improve
this...or, ideally, we wouldn't read it out of the program cache at all.
We'd just upload the assembly we generated on the CPU directly.
I think it's fine to leave that as a TODO for now.
> +
> + unsigned char sha1[20];
> + char buf[41];
> + gen_shader_sha1(brw, prog, stage, key, sha1);
> + _mesa_sha1_format(buf, sha1);
> + if (brw->ctx._Shader->Flags & GLSL_CACHE_INFO) {
> + fprintf(stderr, "putting binary in cache: %s\n", buf);
> + }
> +
> + disk_cache_put(cache, sha1, binary.data, binary.size, NULL);
> +
> + prog->program_written_to_cache = true;
> + blob_finish(&binary);
> +}
> +
> +void
> +brw_disk_cache_write_program(struct brw_context *brw)
> +{
> + struct disk_cache *cache = brw->ctx.Cache;
> + if (cache == NULL)
> + return;
> +
> + struct gl_program *prog =
> + brw->ctx._Shader->CurrentProgram[MESA_SHADER_VERTEX];
> + if (prog && !prog->program_written_to_cache) {
> + struct brw_vs_prog_key vs_key;
> + brw_vs_populate_key(brw, &vs_key);
> + vs_key.program_string_id = 0;
> +
> + write_program_data(brw, prog, &vs_key, brw->vs.base.prog_data,
> + brw->vs.base.prog_data->program_size,
> + sizeof(struct brw_vs_prog_data),
> + brw->vs.base.prog_offset, cache,
> + MESA_SHADER_VERTEX);
> + }
> +
> + prog = brw->ctx._Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
> + if (prog && !prog->program_written_to_cache) {
> + struct brw_wm_prog_key wm_key;
> + brw_wm_populate_key(brw, &wm_key);
> + wm_key.program_string_id = 0;
> +
> + write_program_data(brw, prog, &wm_key, brw->wm.base.prog_data,
> + brw->wm.base.prog_data->program_size,
> + sizeof(struct brw_wm_prog_data),
> + brw->wm.base.prog_offset, cache,
> + MESA_SHADER_FRAGMENT);
> + }
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 8db354cf232..6f2e0501b4b 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -131,6 +131,11 @@ void brw_upload_state_base_address(struct brw_context *brw);
> void gen8_write_pma_stall_bits(struct brw_context *brw,
> uint32_t pma_stall_bits);
>
> +/* brw_disk_cache.c */
> +bool brw_disk_cache_upload_program(struct brw_context *brw,
> + gl_shader_stage stage);
> +void brw_disk_cache_write_program(struct brw_context *brw);
> +
> /***********************************************************************
> * brw_state.c
> */
> diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
> index 144a254bd64..09e1179adc4 100644
> --- a/src/mesa/drivers/dri/i965/meson.build
> +++ b/src/mesa/drivers/dri/i965/meson.build
> @@ -34,6 +34,7 @@ files_i965 = files(
> 'brw_cs.h',
> 'brw_curbe.c',
> 'brw_defines.h',
> + 'brw_disk_cache.c',
> 'brw_draw.c',
> 'brw_draw.h',
> 'brw_draw_upload.c',
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20171029/28eb162e/attachment.sig>
More information about the mesa-dev
mailing list