Mesa (master): ir3: Calculate max_waves and threadsize

Mon Mar 22 18:19:59 UTC 2021

Module: Mesa
Branch: master
Commit: fd7960e1915dd43f42478cb165cd5367459a8629
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fd7960e1915dd43f42478cb165cd5367459a8629

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Wed Mar 10 13:03:16 2021 +0100

ir3: Calculate max_waves and threadsize

max_waves is just for shader-db stats for now, but threadsize will
replace the various mechanisms used to determine threadsize across the
different gen's. Calculating these correctly entails adding a bunch of
details about the sizes of various things to ir3. In the future we will
use the guts of the max_waves calculation to inform RA decisions as
well, which is why the max_waves calculation is broken up into register
dependent/independent pieces.

Something should be said about the units of reg_size_vec4. These units
were chosen for two reasons:

1. As said in the comment, it makes some calculations easier.
2. For a4xx/a5xx, where we don't know as much because we haven't done
   the same sorts of experiments to probe for the HW configuration, it
   corresponds more directly to things that are known. The existing code
   switches to the smaller threadsize when r24.x or higher is used,
   which translates directly to a reg_size_vec4 of 48. If we chose
   different units (e.g. multiplying by wave_granularity and/or
   threadsize_base), then to match the same behavior we'd have to set
   reg_size_vec4 based on some other parameters that aren't 100% known.
   If someone comes along and updates them, they might inadvertently
   break it.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9498>

---

 src/freedreno/ir3/ir3.c          | 107 +++++++++++++++++++++++++++++++++++++++
 src/freedreno/ir3/ir3.h          |   5 ++
 src/freedreno/ir3/ir3_compiler.c |  35 +++++++++++++
 src/freedreno/ir3/ir3_compiler.h |  38 ++++++++++++++
 4 files changed, 185 insertions(+)

diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 0cff9fae04e..4945e7a968d 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -107,6 +107,99 @@ collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
 	}
 }
 
+static bool /* true when this variant should run with the doubled threadsize */
+should_double_threadsize(struct ir3_shader_variant *v,
+						 unsigned regs_count)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	switch (v->type) {
+	case MESA_SHADER_COMPUTE: {
+		unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+
+		/* Before a6xx (the figures here are for a5xx): if the workgroup size is
+		 * greater than the maximum number of threads per core with 32 threads
+		 * per wave (512), we have to use the doubled threadsize because
+		 * otherwise the workgroup wouldn't fit; a variable local size is
+		 * treated the same way. For smaller workgroup sizes, we follow the
+		 * blob and use the smaller threadsize. */
+		if (compiler->gpu_id < 600) {
+			return v->local_size_variable || threads_per_wg >
+				compiler->threadsize_base * compiler->max_waves;
+		}
+
+		/* On a6xx, we prefer the larger threadsize unless the workgroup is
+		 * small enough that it would be useless. Because threadsize_base is
+		 * bumped to 64, we don't have to worry about the workgroup fitting,
+		 * unlike the a5xx case. Anything not rejected here falls through to
+		 * the register check shared with fragment shaders. */
+		if (!v->local_size_variable) {
+			if (threads_per_wg <= compiler->threadsize_base)
+				return false;
+		}
+	}
+	/* fallthrough */
+	case MESA_SHADER_FRAGMENT: {
+		/* Doubling the threadsize doubles register use; check it still fits the regfile */
+		return regs_count * 2 <= compiler->reg_size_vec4;
+	}
+
+	default:
+		/* On a6xx+, it's impossible to use a doubled wavesize in the geometry
+		 * stages - the bit doesn't exist. The blob never used it for the VS
+		 * on earlier gen's anyway.
+		 */
+		return false;
+	}
+}
+
+/* Get the maximum number of waves that could be used even if this shader
+ * didn't use any registers: compiler->max_waves, lowered by the shared
+ * memory limit (compute only) and by the branchstack limit. */
+static unsigned
+get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	unsigned max_waves = compiler->max_waves;
+
+	/* If this is a compute shader, compute the limit based on shared size */
+	if (v->type == MESA_SHADER_COMPUTE) {
+		/* Shared is allocated in chunks of 1k */
+		unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
+		if (shared_per_wg > 0 && !v->local_size_variable) { /* needs shared usage and a fixed workgroup size */
+			unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
+			unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+			unsigned waves_per_wg = /* rounded up, in groups of wave_granularity waves */
+				DIV_ROUND_UP(threads_per_wg,
+					compiler->threadsize_base *
+					(double_threadsize ? 2 : 1) * compiler->wave_granularity);
+			max_waves = /* multiply by wave_granularity to get back to the units of max_waves */
+				MIN2(max_waves, waves_per_wg * wgs_per_core * compiler->wave_granularity);
+		}
+	}
+
+	/* Compute the limit based on branchstack */
+	if (v->branchstack > 0) { /* no limit without branchstack use; also avoids dividing by zero */
+		unsigned branchstack_max_waves =
+			compiler->branchstack_size / v->branchstack *
+			compiler->wave_granularity;
+		max_waves = MIN2(max_waves, branchstack_max_waves);
+	}
+
+	return max_waves;
+}
+
+/* Get the maximum number of waves that could be launched limited by reg size.
+ * A reg_count of 0 imposes no limit, so compiler->max_waves is returned. */
+static unsigned
+get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
+							unsigned reg_count, bool double_threadsize)
+{
+	return reg_count ?
+		(compiler->reg_size_vec4 / (reg_count * (double_threadsize ? 2 : 1)) *
+		 compiler->wave_granularity) :
+		compiler->max_waves;
+}
+
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
@@ -200,6 +293,20 @@ ir3_collect_info(struct ir3_shader_variant *v)
 			}
 		}
 	}
+
+	/* TODO: for a5xx and below, is there a separate regfile for
+	 * half-registers?
+	 */
+	unsigned regs_count =
+		info->max_reg + 1 + (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+
+	info->double_threadsize = should_double_threadsize(v, regs_count);
+	unsigned reg_independent_max_waves =
+		get_reg_independent_max_waves(v, info->double_threadsize);
+	unsigned reg_dependent_max_waves =
+		get_reg_dependent_max_waves(compiler, regs_count, info->double_threadsize);
+	info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+	assert(info->max_waves <= v->shader->compiler->max_waves);
 }
 
 static struct ir3_register * reg_create(struct ir3 *shader,
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index c869844fa38..bc11f89b94b 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -64,6 +64,11 @@ struct ir3_info {
 	int8_t   max_reg;   /* highest GPR # used by shader */
 	int8_t   max_half_reg;
 	int16_t  max_const;
+	/* This is the maximum # of waves that can be executed at once in one core,
+	 * assuming that they are all executing this shader.
+	 */
+	int8_t   max_waves;
+	bool     double_threadsize;
 	bool     multi_dword_ldp_stp;
 
 	/* number of sync bits: */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index c27e8bcedfe..ed8b43364c5 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -79,6 +79,13 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 	compiler->gpu_id = gpu_id;
 	compiler->set = ir3_ra_alloc_reg_set(compiler, false);
 
+	/* All known GPU's have 32k local memory (aka shared) */
+	compiler->local_mem_size = 32 * 1024;
+	/* TODO see if older GPU's were different here */
+	compiler->branchstack_size = 64;
+	compiler->wave_granularity = 2;
+	compiler->max_waves = 16;
+
 	if (compiler->gpu_id >= 600) {
 		compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true);
 		compiler->samgq_workaround = true;
@@ -123,6 +130,34 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 		compiler->max_const_safe = 256;
 	}
 
+	if (compiler->gpu_id == 650) {
+		/* This changed mid-generation for a650, so that using r32.x and above
+		 * requires using the smallest threadsize.
+		 */
+		compiler->reg_size_vec4 = 64;
+	} else if (compiler->gpu_id >= 600) {
+		compiler->reg_size_vec4 = 96;
+	} else if (compiler->gpu_id >= 400) {
+		/* On a4xx-a5xx, using r24.x and above requires using the smallest
+		 * threadsize.
+		 */
+		compiler->reg_size_vec4 = 48;
+	} else {
+		/* TODO: confirm this */
+		compiler->reg_size_vec4 = 96;
+	}
+
+	if (compiler->gpu_id >= 600) {
+		compiler->threadsize_base = 64;
+	} else if (compiler->gpu_id >= 400) {
+		/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
+		 * 1.1 subgroupSize which is 32.
+		 */
+		compiler->threadsize_base = 32;
+	} else {
+		compiler->threadsize_base = 8;
+	}
+
 	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		compiler->flat_bypass = true;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 54a78f37726..6f7058f37e5 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -107,6 +107,44 @@ struct ir3_compiler {
 	 */
 	uint32_t const_upload_unit;
 
+	/* The base number of threads per wave. Some stages may be able to double
+	 * this.
+	 */
+	uint32_t threadsize_base;
+
+	/* On at least a6xx, waves are always launched in pairs. In calculations
+	 * about occupancy, we pretend that each wave pair is actually one wave,
+	 * which simplifies many of the calculations, but means we have to
+	 * multiply threadsize_base by this number.
+	 */
+	uint32_t wave_granularity;
+
+	/* The maximum number of simultaneous waves per core. */
+	uint32_t max_waves;
+
+	/* This is the theoretical maximum number of vec4 registers that one wave of
+	 * the base threadsize could use. To get the actual size of the register
+	 * file in bytes one would need to compute:
+	 *
+	 * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
+	 *
+	 * However this number is more often what we actually need. For example, a
+	 * max_reg more than half of this will result in a doubled threadsize
+	 * being impossible (because double-sized waves take up twice as many
+	 * registers). Also, the formula for the occupancy given a particular
+	 * register footprint is simpler.
+	 *
+	 * It is in vec4 units because the register file is allocated
+	 * with vec4 granularity, so it's in the same units as max_reg.
+	 */
+	uint32_t reg_size_vec4;
+
+	/* The size of local memory in bytes */
+	uint32_t local_mem_size;
+
+	/* The total number of branch stack entries, divided by wave_granularity. */
+	uint32_t branchstack_size;
+
 	/* Whether clip+cull distances are supported */
 	bool has_clip_cull;