Mesa (master): freedreno+turnip: Upload large shader constants as a UBO.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Nov 16 22:33:03 UTC 2020


Module: Mesa
Branch: master
Commit: 1f44053301339e64ef070ac8ad81a0ba3f073310
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1f44053301339e64ef070ac8ad81a0ba3f073310

Author: Eric Anholt <eric at anholt.net>
Date:   Tue Jul  7 11:56:35 2020 -0700

freedreno+turnip: Upload large shader constants as a UBO.

Right now, if the shader indirectly indexes some large constant array, we
see NIR load_consts (usually from the const file) of the array's entire
contents into general registers, followed by indirection on those GPRs.
This often results in register allocation failures, as it's easy to go
beyond the ~256 dwords of registers per invocation.

By moving the large constants to a UBO, we can load an arbitrary number of
them.  They can also theoretically be moved to the constant reg file (~2k
dwords), though you're unlikely to hit this path without an indirect load
on your large constant, and we don't yet let UBO indirect loads get moved
to constant regs.

This possibly won't work out right if we have 16-bit load_constants, but
until the other MRs currently in flight land, we won't see any 16-bit
temps being lowered to load_constant.

This allows 2 kerbal-space-program shaders to compile that previously
would fail, and fixes the new dEQP-VK and -GLES2 tests I wrote that
dynamically index a 40-element temporary array of float/vec2/vec3/vec4
with constant element initializers.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2789
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5810>

---

 .gitlab-ci/deqp-freedreno-a307-fails.txt        |  4 --
 .gitlab-ci/deqp-freedreno-a630-fails.txt        |  4 --
 src/freedreno/computerator/ir3_asm.c            |  2 +-
 src/freedreno/ir3/ir3.c                         | 25 +++++--
 src/freedreno/ir3/ir3.h                         |  7 ++
 src/freedreno/ir3/ir3_disk_cache.c              |  8 ++-
 src/freedreno/ir3/ir3_nir.c                     | 14 ++++
 src/freedreno/ir3/ir3_nir.h                     |  1 +
 src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c  | 91 +++++++++++++++++++++++++
 src/freedreno/ir3/ir3_shader.h                  | 11 +++
 src/freedreno/vulkan/tu_cmd_buffer.c            | 12 +++-
 src/freedreno/vulkan/tu_pipeline.c              | 76 ++++++++++++++++-----
 src/gallium/drivers/freedreno/a6xx/fd6_const.c  | 10 +++
 src/gallium/drivers/freedreno/ir3/ir3_const.h   | 53 +++++++++++++-
 src/gallium/drivers/freedreno/ir3/ir3_gallium.c |  6 +-
 15 files changed, 283 insertions(+), 41 deletions(-)

diff --git a/.gitlab-ci/deqp-freedreno-a307-fails.txt b/.gitlab-ci/deqp-freedreno-a307-fails.txt
index 060d10cc869..fa6a12dba16 100644
--- a/.gitlab-ci/deqp-freedreno-a307-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a307-fails.txt
@@ -388,10 +388,6 @@ dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail
 dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt
index 4d8c2a69960..2a555e22936 100644
--- a/.gitlab-ci/deqp-freedreno-a630-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt
@@ -1,8 +1,4 @@
 
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail
diff --git a/src/freedreno/computerator/ir3_asm.c b/src/freedreno/computerator/ir3_asm.c
index e1e845a9a7c..a976bede5e7 100644
--- a/src/freedreno/computerator/ir3_asm.c
+++ b/src/freedreno/computerator/ir3_asm.c
@@ -42,7 +42,7 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
 	kernel->base.num_bufs = kernel->info.num_bufs;
 	memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
 
-	unsigned sz = v->info.sizedwords * 4;
+	unsigned sz = v->info.size;
 
 	v->bo = fd_bo_new(c->dev, sz,
 			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 84aa8eb46d9..2f2612d40c6 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -942,15 +942,24 @@ void * ir3_assemble(struct ir3_shader_variant *v)
 	 * doesn't try to decode the following data as instructions (such as the
 	 * next stage's shader in turnip)
 	 */
-	info->sizedwords = MAX2(v->instrlen * compiler->instr_align,
-			instr_count + 4) * sizeof(instr_t) / 4;
+	info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) *
+		sizeof(instr_t);
+	info->sizedwords = info->size / 4;
+
+	if (v->constant_data_size) {
+		/* Make sure that where we're about to place the constant_data is safe
+		 * to indirectly upload from.
+		 */
+		info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16);
+		info->size = info->constant_data_offset + v->constant_data_size;
+	}
 
 	/* Pad out the size so that when turnip uploads the shaders in
 	 * sequence, the starting offset of the next one is properly aligned.
 	 */
-	info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4);
+	info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
 
-	ptr = dwords = rzalloc_size(v, 4 * info->sizedwords);
+	ptr = dwords = rzalloc_size(v, info->size);
 
 	foreach_block (block, &shader->block_list) {
 		unsigned sfu_delay = 0;
@@ -1003,6 +1012,14 @@ void * ir3_assemble(struct ir3_shader_variant *v)
 		}
 	}
 
+	/* Append the immediates after the end of the program.  This lets us emit
+	 * the immediates as an indirect load, while avoiding creating another BO.
+	 */
+	if (v->constant_data_size)
+		memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
+	ralloc_free(v->constant_data);
+	v->constant_data = NULL;
+
 	return ptr;
 
 fail:
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index cb42636f285..262f2a28dcf 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -45,6 +45,13 @@ struct ir3_block;
 
 struct ir3_info {
 	void *data;              /* used internally in ir3 assembler */
+	/* Size in bytes of the shader binary, including NIR constants and
+	 * padding
+	 */
+	uint32_t size;
+	/* byte offset from start of the shader to the NIR constant data. */
+	uint32_t constant_data_offset;
+	/* Size in dwords of the instructions. */
 	uint16_t sizedwords;
 	uint16_t instrs_count;   /* expanded to account for rpt's */
 	uint16_t nops_count;     /* # of nop instructions, including nopN */
diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c
index 78726710758..29a2c8c2157 100644
--- a/src/freedreno/ir3/ir3_disk_cache.c
+++ b/src/freedreno/ir3/ir3_disk_cache.c
@@ -126,8 +126,8 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
 	 * pointers need special handling:
 	 */
 
-	v->bin = rzalloc_size(v, 4 * v->info.sizedwords);
-	blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords);
+	v->bin = rzalloc_size(v, v->info.size);
+	blob_copy_bytes(blob, v->bin, v->info.size);
 
 	if (!v->binning_pass) {
 		blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
@@ -147,7 +147,9 @@ store_variant(struct blob *blob, struct ir3_shader_variant *v)
 	 * pointers need special handling:
 	 */
 
-	blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords);
+	blob_write_bytes(blob, v->bin, v->info.size);
+
+	/* No saving constant_data, it's already baked into bin at this point. */
 
 	if (!v->binning_pass) {
 		blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 29ab29691e0..d6d891a9560 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -495,11 +495,25 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 		progress |= OPT(s, nir_lower_tex, &tex_options);
 	}
 
+	/* Move large constant variables to the constants attached to the NIR
+	 * shader, which we will upload in the immediates range.  This generates
+	 * amuls, so we need to clean those up after.
+	 *
+	 * Passing no size_align, we would get packed values, which if we end up
+	 * having to load with LDC would result in extra reads to unpack from
+	 * straddling loads.  Align everything to vec4 to avoid that, though we
+	 * could theoretically do better.
+	 */
+	OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
+	OPT_V(s, ir3_nir_lower_load_constant, so);
+
 	if (!so->binning_pass)
 		OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
 
 	progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
 
+	OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
 	/* UBO offset lowering has to come after we've decided what will
 	 * be left as load_ubo
 	 */
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index d716e530493..17dc4aa155c 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -59,6 +59,7 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
 
 void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
 		struct ir3_const_state *const_state);
+bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_fixup_load_uniform(nir_shader *nir);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 8e7f9aa29d1..a1c06b90819 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -530,3 +530,94 @@ ir3_nir_fixup_load_uniform(nir_shader *nir)
 			fixup_load_uniform_filter, fixup_load_uniform_instr,
 			NULL);
 }
+static nir_ssa_def *
+ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
+{
+	struct ir3_const_state *const_state = data;
+	nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+	/* Pick a UBO index (once; -1 means unassigned) to use as our constant
+	 * data.  Skip UBO 0 since that's reserved for gallium's cb0.
+	 */
+	if (const_state->constant_data_ubo == -1) {
+		if (b->shader->info.num_ubos == 0)
+			b->shader->info.num_ubos++;
+		const_state->constant_data_ubo = b->shader->info.num_ubos++;
+	}
+
+	unsigned num_components = instr->num_components;
+	if (nir_dest_bit_size(instr->dest) == 16) {
+		/* We can't do 16b loads -- either from LDC (32-bit only in any of our
+		 * traces, and disasm that doesn't look like it really supports it) or
+		 * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
+		 * automatic 32b-to-16b conversions when we ask for 16b from it).
+		 * Instead, we'll load 32b from a UBO and unpack from there.
+		 */
+		num_components = DIV_ROUND_UP(num_components, 2);
+	}
+	unsigned base = nir_intrinsic_base(instr);
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+	load->num_components = num_components;
+	nir_ssa_dest_init(&load->instr, &load->dest,
+			load->num_components, 32,
+			instr->dest.ssa.name);
+
+	load->src[0] = nir_src_for_ssa(nir_imm_int(b,
+					const_state->constant_data_ubo));
+	load->src[1] = nir_src_for_ssa(nir_iadd_imm(b,
+					nir_ssa_for_src(b, instr->src[0], 1), base));
+
+	nir_intrinsic_set_align(load,
+			nir_intrinsic_align_mul(instr),
+			nir_intrinsic_align_offset(instr));
+	nir_intrinsic_set_range_base(load, base);
+	nir_intrinsic_set_range(load, nir_intrinsic_range(instr));
+
+	nir_builder_instr_insert(b, &load->instr);
+
+	nir_ssa_def *result = &load->dest.ssa;
+	if (nir_dest_bit_size(instr->dest) == 16) {
+		result = nir_bitcast_vector(b, result, 16);
+		result = nir_channels(b, result, BITSET_MASK(instr->num_components));
+	}
+
+	return result;
+}
+
+static bool
+ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
+{
+        return (instr->type == nir_instr_type_intrinsic &&
+                nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+}
+
+/* Lowers load_constant intrinsics to UBO accesses for the general "upload to
+ * const file or leave as UBO access" code.  Returns whether progress was made.
+ */
+bool
+ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
+{
+	struct ir3_const_state *const_state = ir3_const_state(v);
+
+	const_state->constant_data_ubo = -1;
+
+	bool progress = nir_shader_lower_instructions(nir,
+			ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
+			const_state);
+
+	if (progress) {
+		struct ir3_compiler *compiler = v->shader->compiler;
+
+		/* Save a size-aligned copy of the NIR constant data to the variant
+		 * for inclusion in the final assembly.
+		 */
+		v->constant_data_size = align(nir->constant_data_size,
+				compiler->const_upload_unit * 4 * sizeof(uint32_t));
+		v->constant_data = rzalloc_size(v, v->constant_data_size);
+		memcpy(v->constant_data, nir->constant_data,
+				nir->constant_data_size);
+	}
+
+	return progress;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 36aba4facc7..bba3c627da3 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -157,6 +157,9 @@ struct ir3_const_state {
 	unsigned num_ubos;
 	unsigned num_driver_params;   /* scalar */
 
+	/* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+	int32_t constant_data_ubo;
+
 	struct {
 		/* user const start at zero */
 		unsigned ubo;
@@ -504,6 +507,12 @@ struct ir3_shader_variant {
 	gl_shader_stage type;
 	struct ir3_shader *shader;
 
+	/* variant's copy of nir->constant_data (since we don't track the NIR in
+	 * the variant, and shader->nir is before the opt pass).  Moves to v->bin
+	 * after assembly.
+	 */
+	void *constant_data;
+
 	/*
 	 * Below here is serialized when written to disk cache:
 	 */
@@ -525,6 +534,8 @@ struct ir3_shader_variant {
 
 	struct ir3_info info;
 
+	uint32_t constant_data_size;
+
 	/* Levels of nesting of flow control:
 	 */
 	unsigned branchstack;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index dfcaca99f42..41d9c81858b 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -3013,7 +3013,8 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
 {
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
-   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
+   const struct ir3_const_state *const_state = &link->const_state;
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
 
    if (link->push_consts.count > 0) {
       unsigned num_units = link->push_consts.count;
@@ -3048,9 +3049,14 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
       debug_assert((offset % 16) == 0);
 
       /* Dig out the descriptor from the descriptor state and read the VA from
-       * it.
+       * it.  All our UBOs are bindless with the exception of the NIR
+       * constant_data, which is uploaded once in the pipeline.
        */
-      assert(state->range[i].ubo.bindless);
+      if (!state->range[i].ubo.bindless) {
+         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
+         continue;
+      }
+
       uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
          descriptors_state->dynamic_descriptors :
          descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index b2f8c636682..dde112391da 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -453,19 +453,61 @@ tu6_emit_xs_config(struct tu_cs *cs,
     */
    size = MIN2(size + base, xs->constlen) - base;
 
-   if (size <= 0)
-      return;
+   if (size > 0) {
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(size));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                  CP_LOAD_STATE6_0_NUM_UNIT(size));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_array(cs, const_state->immediates, size * 4);
+   }
+
+   if (const_state->constant_data_ubo != -1) {
+      uint64_t iova = binary_iova + xs->info.constant_data_offset;
 
-   tu_cs_emit_array(cs, const_state->immediates, size * 4);
+      /* Upload UBO state for the constant data. */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs,
+                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
+      tu_cs_emit_qw(cs,
+                    iova |
+                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
+
+      /* Upload the constant data to the const file if needed. */
+      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
+
+      for (int i = 0; i < ubo_state->num_enabled; i++) {
+         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
+             ubo_state->range[i].ubo.bindless) {
+            continue;
+         }
+
+         uint32_t start = ubo_state->range[i].start;
+         uint32_t end = ubo_state->range[i].end;
+         uint32_t size = MIN2(end - start,
+                              (16 * xs->constlen) - ubo_state->range[i].offset);
+
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+         tu_cs_emit(cs,
+                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+         tu_cs_emit_qw(cs, iova + start);
+      }
+   }
 }
 
 static void
@@ -1939,12 +1981,12 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
    if (builder) {
       for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
          if (builder->variants[i])
-            size += builder->variants[i]->info.sizedwords;
+            size += builder->variants[i]->info.size / 4;
       }
 
-      size += builder->binning_variant->info.sizedwords;
+      size += builder->binning_variant->info.size / 4;
    } else {
-      size += compute->info.sizedwords;
+      size += compute->info.size / 4;
    }
 
    tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
@@ -2016,12 +2058,12 @@ tu_upload_variant(struct tu_pipeline *pipeline,
       return 0;
 
    /* this expects to get enough alignment because shaders are allocated first
-    * and sizedwords is always aligned correctly
+    * and total size is always aligned correctly
     * note: an assert in tu6_emit_xs_config validates the alignment
     */
-   tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
+   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
 
-   memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
+   memcpy(memory.map, variant->bin, variant->info.size);
    return memory.iova;
 }
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
index 020fbf532d2..78b7b05a32c 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
@@ -248,6 +248,16 @@ fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 	OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
 	for (int i = 0; i < num_ubos; i++) {
+		/* NIR constant data is packed into the end of the shader. */
+		if (i == const_state->constant_data_ubo) {
+			int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
+			OUT_RELOC(ring, v->bo,
+					v->info.constant_data_offset,
+					(uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+					0);
+			continue;
+		}
+
 		struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
 		/* If we have user pointers (constbuf 0, aka GL uniforms), upload them
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index 2c9c56041b5..4dc36c47c5d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -106,6 +106,44 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state,
 	}
 }
 
+/**
+ * Uploads the referenced subranges of the nir constant_data, sourced from the
+ * variant's BO at info.constant_data_offset, to the hardware's constant buffer.
+ */
+static inline void
+ir3_emit_constant_data(struct fd_screen *screen,
+		const struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	const struct ir3_const_state *const_state = ir3_const_state(v);
+	const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+	for (unsigned i = 0; i < state->num_enabled; i++) {
+		unsigned ubo = state->range[i].ubo.block;
+		if (ubo != const_state->constant_data_ubo)
+			continue;
+
+		uint32_t size = state->range[i].end - state->range[i].start;
+
+		/* Pre-a6xx, we might have ranges enabled in the shader that aren't
+		 * used in the binning variant.
+		 */
+		if (16 * v->constlen <= state->range[i].offset)
+			continue;
+
+		/* and even if the start of the range fits within constlen, the
+		 * end may not, so clamp the upload size:
+		 */
+		size = MIN2(size, (16 * v->constlen) - state->range[i].offset);
+
+		if (size == 0)
+			continue;
+
+		emit_const_bo(ring, v, state->range[i].offset / 4,
+				v->info.constant_data_offset + state->range[i].start,
+				size / 4, v->bo);
+	}
+}
+
 /**
  * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access
  * outside of these ranges will be done using full UBO accesses in the
@@ -121,8 +159,10 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *
 	for (unsigned i = 0; i < state->num_enabled; i++) {
 		assert(!state->range[i].ubo.bindless);
 		unsigned ubo = state->range[i].ubo.block;
-		if (!(constbuf->enabled_mask & (1 << ubo)))
+		if (!(constbuf->enabled_mask & (1 << ubo)) ||
+				ubo == const_state->constant_data_ubo) {
 			continue;
+		}
 		struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
 
 		uint32_t size = state->range[i].end - state->range[i].start;
@@ -176,6 +216,12 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_bo *bos[params];
 
 		for (uint32_t i = 0; i < params; i++) {
+			if (i == const_state->constant_data_ubo) {
+				bos[i] = v->bo;
+				offsets[i] = v->info.constant_data_offset;
+				continue;
+			}
+
 			struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
 			/* If we have user pointers (constbuf 0, aka GL uniforms), upload
@@ -299,6 +345,11 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v
 
 	if (size > 0)
 		emit_const_user(ring, v, base, size, const_state->immediates);
+
+	/* NIR constant data has the same lifetime as immediates, so upload it
+	 * now, too.
+	 */
+	ir3_emit_constant_data(screen, v, ring);
 }
 
 static inline void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index cb28ed559cf..5a79a7692cc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -86,9 +86,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
 
 	assert(!v->bo);
 
-	unsigned sz = v->info.sizedwords * 4;
-
-	v->bo = fd_bo_new(compiler->dev, sz,
+	v->bo = fd_bo_new(compiler->dev, v->info.size,
 			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
 			DRM_FREEDRENO_GEM_TYPE_KMEM,
 			"%s:%s", ir3_shader_stage(v), info->name);
@@ -96,7 +94,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
 	/* Always include shaders in kernel crash dumps. */
 	fd_bo_mark_for_dump(v->bo);
 
-	memcpy(fd_bo_map(v->bo), v->bin, sz);
+	memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
 }
 
 struct ir3_shader_variant *



More information about the mesa-commit mailing list