Mesa (main): intel/fs,vec4: Drop uniform compaction and pull constant support
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Dec 10 21:36:55 UTC 2021
Module: Mesa
Branch: main
Commit: 8f3c100d61c673115880cc4c2d9a3d5d2ad7db3d
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=8f3c100d61c673115880cc4c2d9a3d5d2ad7db3d
Author: Jason Ekstrand <jason at jlekstrand.net>
Date: Fri Dec 3 21:34:06 2021 -0600
intel/fs,vec4: Drop uniform compaction and pull constant support
The only driver using these was i965 and it's gone now. This is all
dead code.
Reviewed-by: Caio Oliveira <caio.oliveira at intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14056>
---
src/gallium/drivers/crocus/crocus_disk_cache.c | 3 -
src/gallium/drivers/crocus/crocus_program_cache.c | 4 +-
src/gallium/drivers/crocus/crocus_screen.c | 2 -
src/gallium/drivers/iris/iris_disk_cache.c | 3 -
src/gallium/drivers/iris/iris_program.c | 1 -
src/gallium/drivers/iris/iris_screen.c | 2 -
src/intel/compiler/brw_compiler.h | 14 -
src/intel/compiler/brw_fs.cpp | 331 +---------------------
src/intel/compiler/brw_fs.h | 6 -
src/intel/compiler/brw_fs_nir.cpp | 4 +-
src/intel/compiler/brw_fs_visitor.cpp | 2 -
src/intel/compiler/brw_vec4.cpp | 289 +------------------
src/intel/compiler/brw_vec4.h | 7 -
src/intel/compiler/brw_vec4_gs_visitor.cpp | 1 -
src/intel/compiler/brw_vec4_visitor.cpp | 140 ---------
src/intel/vulkan/anv_device.c | 2 -
16 files changed, 20 insertions(+), 791 deletions(-)
diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c
index 037136ec43d..3a3d302da88 100644
--- a/src/gallium/drivers/crocus/crocus_disk_cache.c
+++ b/src/gallium/drivers/crocus/crocus_disk_cache.c
@@ -181,9 +181,6 @@ crocus_disk_cache_retrieve(struct crocus_context *ice,
}
prog_data->param = NULL;
- prog_data->pull_param = NULL;
- assert(prog_data->nr_pull_params == 0);
-
if (prog_data->nr_params) {
prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
blob_copy_bytes(&blob, prog_data->param,
diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c
index 52d8bbf0b4c..11c3e400290 100644
--- a/src/gallium/drivers/crocus/crocus_program_cache.c
+++ b/src/gallium/drivers/crocus/crocus_program_cache.c
@@ -224,10 +224,8 @@ crocus_upload_shader(struct crocus_context *ice,
shader->bt = *bt;
ralloc_steal(shader, shader->prog_data);
- if (prog_data_size > 16) {
+ if (prog_data_size > 16)
ralloc_steal(shader->prog_data, prog_data->param);
- ralloc_steal(shader->prog_data, prog_data->pull_param);
- }
ralloc_steal(shader, shader->streamout);
ralloc_steal(shader, shader->system_values);
diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c
index f4e37bcdcee..9e2a7004d42 100644
--- a/src/gallium/drivers/crocus/crocus_screen.c
+++ b/src/gallium/drivers/crocus/crocus_screen.c
@@ -779,9 +779,7 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config)
screen->compiler = brw_compiler_create(screen, &screen->devinfo);
screen->compiler->shader_debug_log = crocus_shader_debug_log;
screen->compiler->shader_perf_log = crocus_shader_perf_log;
- screen->compiler->supports_pull_constants = false;
screen->compiler->supports_shader_constants = false;
- screen->compiler->compact_params = false;
screen->compiler->constant_buffer_0_is_relative = true;
if (screen->devinfo.ver >= 7) {
diff --git a/src/gallium/drivers/iris/iris_disk_cache.c b/src/gallium/drivers/iris/iris_disk_cache.c
index 2ad12002a61..520bfd83397 100644
--- a/src/gallium/drivers/iris/iris_disk_cache.c
+++ b/src/gallium/drivers/iris/iris_disk_cache.c
@@ -207,9 +207,6 @@ iris_disk_cache_retrieve(struct iris_screen *screen,
}
prog_data->param = NULL;
- prog_data->pull_param = NULL;
- assert(prog_data->nr_pull_params == 0);
-
if (prog_data->nr_params) {
prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
blob_copy_bytes(&blob, prog_data->param,
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 2e5227ad33e..5a21228a8a7 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -90,7 +90,6 @@ iris_finalize_program(struct iris_compiled_shader *shader,
ralloc_steal(shader, shader->prog_data);
ralloc_steal(shader->prog_data, (void *)prog_data->relocs);
ralloc_steal(shader->prog_data, prog_data->param);
- ralloc_steal(shader->prog_data, prog_data->pull_param);
ralloc_steal(shader, shader->streamout);
ralloc_steal(shader, shader->system_values);
}
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index 870bb24aaff..09c69594578 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -839,9 +839,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
screen->compiler = brw_compiler_create(screen, &screen->devinfo);
screen->compiler->shader_debug_log = iris_shader_debug_log;
screen->compiler->shader_perf_log = iris_shader_perf_log;
- screen->compiler->supports_pull_constants = false;
screen->compiler->supports_shader_constants = true;
- screen->compiler->compact_params = false;
screen->compiler->indirect_ubos_use_sampler = screen->devinfo.ver < 12;
screen->l3_config_3d = iris_get_default_l3_config(&screen->devinfo, false);
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 4c932fb46a4..113c00eb7d1 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -91,24 +91,12 @@ struct brw_compiler {
*/
bool constant_buffer_0_is_relative;
- /**
- * Whether or not the driver supports pull constants. If not, the compiler
- * will attempt to push everything.
- */
- bool supports_pull_constants;
-
/**
* Whether or not the driver supports NIR shader constants. This controls
* whether nir_opt_large_constants will be run.
*/
bool supports_shader_constants;
- /**
- * Whether or not the driver wants uniform params to be compacted by the
- * back-end compiler.
- */
- bool compact_params;
-
/**
* Whether or not the driver wants variable group size to be lowered by the
* back-end compiler.
@@ -775,7 +763,6 @@ struct brw_stage_prog_data {
struct brw_ubo_range ubo_ranges[4];
GLuint nr_params; /**< number of float params/constants */
- GLuint nr_pull_params;
gl_shader_stage stage;
@@ -822,7 +809,6 @@ struct brw_stage_prog_data {
* above.
*/
uint32_t *param;
- uint32_t *pull_param;
/* Whether shader uses atomic operations. */
bool uses_atomic_load_store;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 99c6979dbdf..3325bac0025 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1234,7 +1234,6 @@ void
fs_visitor::import_uniforms(fs_visitor *v)
{
this->push_constant_loc = v->push_constant_loc;
- this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->subgroup_id = v->subgroup_id;
for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
@@ -1801,7 +1800,6 @@ fs_visitor::assign_curb_setup()
uint64_t want_zero = used & stage_prog_data->zero_push_reg;
if (want_zero) {
- assert(!compiler->compact_params);
fs_builder ubld = bld.exec_all().group(8, 0).at(
cfg->first_block(), cfg->first_block()->start());
@@ -2396,109 +2394,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo,
return -1;
}
-/**
- * Struct for handling complex alignments.
- *
- * A complex alignment is stored as multiplier and an offset. A value is
- * considered to be aligned if it is {offset} larger than a multiple of {mul}.
- * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
- * following:
- *
- * N | cplx_align_apply({8, 2}, N)
- * ----+-----------------------------
- * 4 | 6
- * 6 | 6
- * 8 | 14
- * 10 | 14
- * 12 | 14
- * 14 | 14
- * 16 | 22
- */
-struct cplx_align {
- unsigned mul:4;
- unsigned offset:4;
-};
-
-#define CPLX_ALIGN_MAX_MUL 8
-
-static void
-cplx_align_assert_sane(struct cplx_align a)
-{
- assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
- assert(a.offset < a.mul);
-}
-
-/**
- * Combines two alignments to produce a least multiple of sorts.
- *
- * The returned alignment is the smallest (in terms of multiplier) such that
- * anything aligned to both a and b will be aligned to the new alignment.
- * This function will assert-fail if a and b are not compatible, i.e. if the
- * offset parameters are such that no common alignment is possible.
- */
-static struct cplx_align
-cplx_align_combine(struct cplx_align a, struct cplx_align b)
-{
- cplx_align_assert_sane(a);
- cplx_align_assert_sane(b);
-
- /* Assert that the alignments agree. */
- assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
-
- return a.mul > b.mul ? a : b;
-}
-
-/**
- * Apply a complex alignment
- *
- * This function will return the smallest number greater than or equal to
- * offset that is aligned to align.
- */
-static unsigned
-cplx_align_apply(struct cplx_align align, unsigned offset)
-{
- return ALIGN(offset - align.offset, align.mul) + align.offset;
-}
-
-#define UNIFORM_SLOT_SIZE 4
-
-struct uniform_slot_info {
- /** True if the given uniform slot is live */
- unsigned is_live:1;
-
- /** True if this slot and the next slot must remain contiguous */
- unsigned contiguous:1;
-
- struct cplx_align align;
-};
-
-static void
-mark_uniform_slots_read(struct uniform_slot_info *slots,
- unsigned num_slots, unsigned alignment)
-{
- assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
- assert(alignment <= CPLX_ALIGN_MAX_MUL);
-
- /* We can't align a slot to anything less than the slot size */
- alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
-
- struct cplx_align align = {alignment, 0};
- cplx_align_assert_sane(align);
-
- for (unsigned i = 0; i < num_slots; i++) {
- slots[i].is_live = true;
- if (i < num_slots - 1)
- slots[i].contiguous = true;
-
- align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
- if (slots[i].align.mul == 0) {
- slots[i].align = align;
- } else {
- slots[i].align = cplx_align_combine(slots[i].align, align);
- }
- }
-}
-
/**
* Assign UNIFORM file registers to either push constants or pull constants.
*
@@ -2512,197 +2407,12 @@ void
fs_visitor::assign_constant_locations()
{
/* Only the first compile gets to decide on locations. */
- if (push_constant_loc) {
- assert(pull_constant_loc);
+ if (push_constant_loc)
return;
- }
-
- if (compiler->compact_params) {
- struct uniform_slot_info slots[uniforms + 1];
- memset(slots, 0, sizeof(slots));
-
- foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
- for (int i = 0 ; i < inst->sources; i++) {
- if (inst->src[i].file != UNIFORM)
- continue;
-
- /* NIR tightly packs things so the uniform number might not be
- * aligned (if we have a double right after a float, for
- * instance). This is fine because the process of re-arranging
- * them will ensure that things are properly aligned. The offset
- * into that uniform, however, must be aligned.
- *
- * In Vulkan, we have explicit offsets but everything is crammed
- * into a single "variable" so inst->src[i].nr will always be 0.
- * Everything will be properly aligned relative to that one base.
- */
- assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
-
- unsigned u = inst->src[i].nr +
- inst->src[i].offset / UNIFORM_SLOT_SIZE;
-
- if (u >= uniforms)
- continue;
-
- unsigned slots_read;
- if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
- slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
- } else {
- unsigned bytes_read = inst->components_read(i) *
- type_sz(inst->src[i].type);
- slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
- }
-
- assert(u + slots_read <= uniforms);
- mark_uniform_slots_read(&slots[u], slots_read,
- type_sz(inst->src[i].type));
- }
- }
- int subgroup_id_index = get_subgroup_id_param_index(devinfo,
- stage_prog_data);
-
- /* Only allow 16 registers (128 uniform components) as push constants.
- *
- * Just demote the end of the list. We could probably do better
- * here, demoting things that are rarely used in the program first.
- *
- * If changing this value, note the limitation about total_regs in
- * brw_curbe.c.
- */
- unsigned int max_push_components = 16 * 8;
- if (subgroup_id_index >= 0)
- max_push_components--; /* Save a slot for the thread ID */
-
- /* We push small arrays, but no bigger than 16 floats. This is big
- * enough for a vec4 but hopefully not large enough to push out other
- * stuff. We should probably use a better heuristic at some point.
- */
- const unsigned int max_chunk_size = 16;
-
- unsigned int num_push_constants = 0;
- unsigned int num_pull_constants = 0;
-
- push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
- /* Default to -1 meaning no location */
- memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
- memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
-
- int chunk_start = -1;
- struct cplx_align align;
- for (unsigned u = 0; u < uniforms; u++) {
- if (!slots[u].is_live) {
- assert(chunk_start == -1);
- continue;
- }
-
- /* Skip subgroup_id_index to put it in the last push register. */
- if (subgroup_id_index == (int)u)
- continue;
-
- if (chunk_start == -1) {
- chunk_start = u;
- align = slots[u].align;
- } else {
- /* Offset into the chunk */
- unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
-
- /* Shift the slot alignment down by the chunk offset so it is
- * comparable with the base chunk alignment.
- */
- struct cplx_align slot_align = slots[u].align;
- slot_align.offset =
- (slot_align.offset - chunk_offset) & (align.mul - 1);
-
- align = cplx_align_combine(align, slot_align);
- }
-
- /* Sanity check the alignment */
- cplx_align_assert_sane(align);
-
- if (slots[u].contiguous)
- continue;
-
- /* Adjust the alignment to be in terms of slots, not bytes */
- assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
- assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
- align.mul /= UNIFORM_SLOT_SIZE;
- align.offset /= UNIFORM_SLOT_SIZE;
-
- unsigned push_start_align = cplx_align_apply(align, num_push_constants);
- unsigned chunk_size = u - chunk_start + 1;
- if ((!compiler->supports_pull_constants && u < UBO_START) ||
- (chunk_size < max_chunk_size &&
- push_start_align + chunk_size <= max_push_components)) {
- /* Align up the number of push constants */
- num_push_constants = push_start_align;
- for (unsigned i = 0; i < chunk_size; i++)
- push_constant_loc[chunk_start + i] = num_push_constants++;
- } else {
- /* We need to pull this one */
- num_pull_constants = cplx_align_apply(align, num_pull_constants);
- for (unsigned i = 0; i < chunk_size; i++)
- pull_constant_loc[chunk_start + i] = num_pull_constants++;
- }
-
- /* Reset the chunk and start again */
- chunk_start = -1;
- }
-
- /* Add the CS local thread ID uniform at the end of the push constants */
- if (subgroup_id_index >= 0)
- push_constant_loc[subgroup_id_index] = num_push_constants++;
-
- /* As the uniforms are going to be reordered, stash the old array and
- * create two new arrays for push/pull params.
- */
- uint32_t *param = stage_prog_data->param;
- stage_prog_data->nr_params = num_push_constants;
- if (num_push_constants) {
- stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
- num_push_constants);
- } else {
- stage_prog_data->param = NULL;
- }
- assert(stage_prog_data->nr_pull_params == 0);
- assert(stage_prog_data->pull_param == NULL);
- if (num_pull_constants > 0) {
- stage_prog_data->nr_pull_params = num_pull_constants;
- stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
- num_pull_constants);
- }
-
- /* Up until now, the param[] array has been indexed by reg + offset
- * of UNIFORM registers. Move pull constants into pull_param[] and
- * condense param[] to only contain the uniforms we chose to push.
- *
- * NOTE: Because we are condensing the params[] array, we know that
- * push_constant_loc[i] <= i and we can do it in one smooth loop without
- * having to make a copy.
- */
- for (unsigned int i = 0; i < uniforms; i++) {
- uint32_t value = param[i];
- if (pull_constant_loc[i] != -1) {
- stage_prog_data->pull_param[pull_constant_loc[i]] = value;
- } else if (push_constant_loc[i] != -1) {
- stage_prog_data->param[push_constant_loc[i]] = value;
- }
- }
- ralloc_free(param);
- } else {
- /* If we don't want to compact anything, just set up dummy push/pull
- * arrays. All the rest of the compiler cares about are these arrays.
- */
- push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
- for (unsigned u = 0; u < uniforms; u++)
- push_constant_loc[u] = u;
-
- memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
- }
+ push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ for (unsigned u = 0; u < uniforms; u++)
+ push_constant_loc[u] = u;
/* Now that we know how many regular uniforms we'll push, reduce the
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
@@ -2733,33 +2443,22 @@ fs_visitor::get_pull_locs(const fs_reg &src,
{
assert(src.file == UNIFORM);
- if (src.nr >= UBO_START) {
- const struct brw_ubo_range *range =
- &prog_data->ubo_ranges[src.nr - UBO_START];
-
- /* If this access is in our (reduced) range, use the push data. */
- if (src.offset / 32 < range->length)
- return false;
+ if (src.nr < UBO_START)
+ return false;
- *out_surf_index = prog_data->binding_table.ubo_start + range->block;
- *out_pull_index = (32 * range->start + src.offset) / 4;
+ const struct brw_ubo_range *range =
+ &prog_data->ubo_ranges[src.nr - UBO_START];
- prog_data->has_ubo_pull = true;
- return true;
- }
-
- const unsigned location = src.nr + src.offset / 4;
+ /* If this access is in our (reduced) range, use the push data. */
+ if (src.offset / 32 < range->length)
+ return false;
- if (location < uniforms && pull_constant_loc[location] != -1) {
- /* A regular uniform push constant */
- *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
- *out_pull_index = pull_constant_loc[location];
+ *out_surf_index = prog_data->binding_table.ubo_start + range->block;
+ *out_pull_index = (32 * range->start + src.offset) / 4;
- prog_data->has_ubo_pull = true;
- return true;
- }
+ prog_data->has_ubo_pull = true;
- return false;
+ return true;
}
/**
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 160a1e4d952..1faee57f23a 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -369,12 +369,6 @@ public:
/** Byte-offset for the next available spot in the scratch space buffer. */
unsigned last_scratch;
- /**
- * Array mapping UNIFORM register numbers to the pull parameter index,
- * or -1 if this uniform register isn't being uploaded as a pull constant.
- */
- int *pull_constant_loc;
-
/**
* Array mapping UNIFORM register numbers to the push parameter index,
* or -1 if this uniform register isn't being uploaded as a push constant.
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 2d3b81a2363..671dfa8e2ce 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -103,10 +103,8 @@ void
fs_visitor::nir_setup_uniforms()
{
/* Only the first compile gets to set up uniforms. */
- if (push_constant_loc) {
- assert(pull_constant_loc);
+ if (push_constant_loc)
return;
- }
uniforms = nir->num_uniforms / 4;
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 4de37671b90..7f8d69a7d0a 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -126,7 +126,6 @@ fs_visitor::emit_dummy_fs()
/* We don't have any uniforms. */
stage_prog_data->nr_params = 0;
- stage_prog_data->nr_pull_params = 0;
stage_prog_data->curb_read_length = 0;
stage_prog_data->dispatch_grf_start_reg = 2;
wm_prog_data->dispatch_grf_start_reg_16 = 2;
@@ -1192,7 +1191,6 @@ fs_visitor::init()
this->uniforms = 0;
this->last_scratch = 0;
- this->pull_constant_loc = NULL;
this->push_constant_loc = NULL;
this->shader_stats.scheduler_mode = NULL;
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 72165932c55..3d70f920500 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -604,194 +604,6 @@ vec4_visitor::split_uniform_registers()
}
}
-/* This function returns the register number where we placed the uniform */
-static int
-set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
- const int src, const int size, const int channel_size,
- int *new_loc, int *new_chan,
- int *new_chans_used)
-{
- int dst;
- /* Find the lowest place we can slot this uniform in. */
- for (dst = 0; dst < nr_uniforms; dst++) {
- if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
- break;
- }
-
- assert(dst < nr_uniforms);
-
- new_loc[src] = dst;
- new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
- new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;
-
- *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
- return dst;
-}
-
-void
-vec4_visitor::pack_uniform_registers()
-{
- if (!compiler->compact_params)
- return;
-
- uint8_t chans_used[this->uniforms];
- int new_loc[this->uniforms];
- int new_chan[this->uniforms];
- bool is_aligned_to_dvec4[this->uniforms];
- int new_chans_used[this->uniforms];
- int channel_sizes[this->uniforms];
-
- memset(chans_used, 0, sizeof(chans_used));
- memset(new_loc, 0, sizeof(new_loc));
- memset(new_chan, 0, sizeof(new_chan));
- memset(new_chans_used, 0, sizeof(new_chans_used));
- memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
- memset(channel_sizes, 0, sizeof(channel_sizes));
-
- /* Find which uniform vectors are actually used by the program. We
- * expect unused vector elements when we've moved array access out
- * to pull constants, and from some GLSL code generators like wine.
- */
- foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
- unsigned readmask;
- switch (inst->opcode) {
- case VEC4_OPCODE_PACK_BYTES:
- case BRW_OPCODE_DP4:
- case BRW_OPCODE_DPH:
- readmask = 0xf;
- break;
- case BRW_OPCODE_DP3:
- readmask = 0x7;
- break;
- case BRW_OPCODE_DP2:
- readmask = 0x3;
- break;
- default:
- readmask = inst->dst.writemask;
- break;
- }
-
- for (int i = 0 ; i < 3; i++) {
- if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
- continue;
-
- assert(type_sz(inst->src[i].type) % 4 == 0);
- int channel_size = type_sz(inst->src[i].type) / 4;
-
- int reg = inst->src[i].nr;
- for (int c = 0; c < 4; c++) {
- if (!(readmask & (1 << c)))
- continue;
-
- unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
- unsigned used = MAX2(chans_used[reg], channel * channel_size);
- if (used <= 4) {
- chans_used[reg] = used;
- channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size);
- } else {
- is_aligned_to_dvec4[reg] = true;
- is_aligned_to_dvec4[reg + 1] = true;
- chans_used[reg + 1] = used - 4;
- channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size);
- }
- }
- }
-
- if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
- inst->src[0].file == UNIFORM) {
- assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
- assert(inst->src[0].subnr == 0);
-
- unsigned bytes_read = inst->src[2].ud;
- assert(bytes_read % 4 == 0);
- unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
-
- /* We just mark every register touched by a MOV_INDIRECT as being
- * fully used. This ensures that it doesn't broken up piecewise by
- * the next part of our packing algorithm.
- */
- int reg = inst->src[0].nr;
- int channel_size = type_sz(inst->src[0].type) / 4;
- for (unsigned i = 0; i < vec4s_read; i++) {
- chans_used[reg + i] = 4;
- channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size);
- }
- }
- }
-
- int new_uniform_count = 0;
-
- /* As the uniforms are going to be reordered, take the data from a temporary
- * copy of the original param[].
- */
- uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params);
- memcpy(param, stage_prog_data->param,
- sizeof(uint32_t) * stage_prog_data->nr_params);
-
- /* Now, figure out a packing of the live uniform vectors into our
- * push constants. Start with dvec{3,4} because they are aligned to
- * dvec4 size (2 vec4).
- */
- for (int src = 0; src < uniforms; src++) {
- int size = chans_used[src];
-
- if (size == 0 || !is_aligned_to_dvec4[src])
- continue;
-
- /* dvec3 are aligned to dvec4 size, apply the alignment of the size
- * to 4 to avoid moving last component of a dvec3 to the available
- * location at the end of a previous dvec3. These available locations
- * could be filled by smaller variables in next loop.
- */
- size = ALIGN(size, 4);
- int dst = set_push_constant_loc(uniforms, &new_uniform_count,
- src, size, channel_sizes[src],
- new_loc, new_chan,
- new_chans_used);
- /* Move the references to the data */
- for (int j = 0; j < size; j++) {
- stage_prog_data->param[dst * 4 + new_chan[src] + j] =
- param[src * 4 + j];
- }
- }
-
- /* Continue with the rest of data, which is aligned to vec4. */
- for (int src = 0; src < uniforms; src++) {
- int size = chans_used[src];
-
- if (size == 0 || is_aligned_to_dvec4[src])
- continue;
-
- int dst = set_push_constant_loc(uniforms, &new_uniform_count,
- src, size, channel_sizes[src],
- new_loc, new_chan,
- new_chans_used);
- /* Move the references to the data */
- for (int j = 0; j < size; j++) {
- stage_prog_data->param[dst * 4 + new_chan[src] + j] =
- param[src * 4 + j];
- }
- }
-
- ralloc_free(param);
- this->uniforms = new_uniform_count;
- stage_prog_data->nr_params = new_uniform_count * 4;
-
- /* Now, update the instructions for our repacked uniforms. */
- foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
- for (int i = 0 ; i < 3; i++) {
- int src = inst->src[i].nr;
-
- if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
- continue;
-
- int chan = new_chan[src] / channel_sizes[src];
- inst->src[i].nr = new_loc[src];
- inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan);
- }
- }
-}
-
/**
* Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
*
@@ -910,97 +722,6 @@ vec4_visitor::opt_algebraic()
return progress;
}
-/**
- * Only a limited number of hardware registers may be used for push
- * constants, so this turns access to the overflowed constants into
- * pull constants.
- */
-void
-vec4_visitor::move_push_constants_to_pull_constants()
-{
- int pull_constant_loc[this->uniforms];
-
- const int max_uniform_components = push_length * 8;
-
- if (this->uniforms * 4 <= max_uniform_components)
- return;
-
- assert(compiler->supports_pull_constants);
- assert(compiler->compact_params);
-
- /* If we got here, we also can't have any push ranges */
- for (unsigned i = 0; i < 4; i++)
- assert(prog_data->base.ubo_ranges[i].length == 0);
-
- /* Make some sort of choice as to which uniforms get sent to pull
- * constants. We could potentially do something clever here like
- * look for the most infrequently used uniform vec4s, but leave
- * that for later.
- */
- for (int i = 0; i < this->uniforms * 4; i += 4) {
- pull_constant_loc[i / 4] = -1;
-
- if (i >= max_uniform_components) {
- uint32_t *values = &stage_prog_data->param[i];
-
- /* Try to find an existing copy of this uniform in the pull
- * constants if it was part of an array access already.
- */
- for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
- int matches;
-
- for (matches = 0; matches < 4; matches++) {
- if (stage_prog_data->pull_param[j + matches] != values[matches])
- break;
- }
-
- if (matches == 4) {
- pull_constant_loc[i / 4] = j / 4;
- break;
- }
- }
-
- if (pull_constant_loc[i / 4] == -1) {
- assert(stage_prog_data->nr_pull_params % 4 == 0);
- pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
-
- for (int j = 0; j < 4; j++) {
- stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
- values[j];
- }
- }
- }
- }
-
- /* Now actually rewrite usage of the things we've moved to pull
- * constants.
- */
- foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
- for (int i = 0 ; i < 3; i++) {
- if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START ||
- pull_constant_loc[inst->src[i].nr] == -1)
- continue;
-
- int uniform = inst->src[i].nr;
-
- const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
- glsl_type::dvec4_type : glsl_type::vec4_type;
- dst_reg temp = dst_reg(this, temp_type);
-
- emit_pull_constant_load(block, inst, temp, inst->src[i],
- pull_constant_loc[uniform], src_reg());
-
- inst->src[i].file = temp.file;
- inst->src[i].nr = temp.nr;
- inst->src[i].offset %= 16;
- inst->src[i].reladdr = NULL;
- }
- }
-
- /* Repack push constants to remove the now-unused ones. */
- pack_uniform_registers();
-}
-
/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
@@ -1842,15 +1563,13 @@ vec4_visitor::setup_uniforms(int reg)
/* It's possible that uniform compaction will shrink further than expected
* so we re-compute the layout and set up our UBO push starts.
*/
- const unsigned old_push_length = push_length;
+ ASSERTED const unsigned old_push_length = push_length;
push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
for (unsigned i = 0; i < 4; i++) {
ubo_push_start[i] = push_length;
push_length += stage_prog_data->ubo_ranges[i].length;
}
- assert(push_length <= old_push_length);
- if (push_length < old_push_length)
- assert(compiler->compact_params);
+ assert(push_length == old_push_length);
/* The pre-gfx6 VS requires that some push constants get loaded no
* matter what, or the GPU would hang.
@@ -2738,10 +2457,8 @@ vec4_visitor::run()
* often do repeated subexpressions for those.
*/
move_grf_array_access_to_scratch();
- move_uniform_array_access_to_pull_constants();
+ split_uniform_registers();
- pack_uniform_registers();
- move_push_constants_to_pull_constants();
split_virtual_grfs();
#define OPT(pass, args...) ({ \
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
index f27e3d3c4ad..2529d69c974 100644
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@@ -138,9 +138,7 @@ public:
void spill_reg(unsigned spill_reg);
void move_grf_array_access_to_scratch();
void move_uniform_array_access_to_pull_constants();
- void move_push_constants_to_pull_constants();
void split_uniform_registers();
- void pack_uniform_registers();
void setup_push_ranges();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
void split_virtual_grfs();
@@ -292,11 +290,6 @@ public:
int base_offset);
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
int base_offset);
- void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
- dst_reg dst,
- src_reg orig_src,
- int base_offset,
- src_reg indirect);
void emit_pull_constant_load_reg(dst_reg dst,
src_reg surf_index,
src_reg offset,
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
index 1b55e9234e2..aa396eaab70 100644
--- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -889,7 +889,6 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
memcpy(prog_data->base.base.param, param,
sizeof(uint32_t) * param_count);
prog_data->base.base.nr_params = param_count;
- prog_data->base.base.nr_pull_params = 0;
ralloc_free(param);
}
}
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
index 3ad8868ac5f..8bfb7ee872a 100644
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -1592,146 +1592,6 @@ vec4_visitor::move_grf_array_access_to_scratch()
}
}
-/**
- * Emits an instruction before @inst to load the value named by @orig_src
- * from the pull constant buffer (surface) at @base_offset to @temp.
- */
-void
-vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
- dst_reg temp, src_reg orig_src,
- int base_offset, src_reg indirect)
-{
- assert(orig_src.offset % 16 == 0);
- const unsigned index = prog_data->base.binding_table.pull_constants_start;
-
- /* For 64bit loads we need to emit two 32-bit load messages and we also
- * we need to shuffle the 32-bit data result into proper 64-bit data. To do
- * that we emit the 32-bit loads into a temporary and we shuffle the result
- * into the original destination.
- */
- dst_reg orig_temp = temp;
- bool is_64bit = type_sz(orig_src.type) == 8;
- if (is_64bit) {
- assert(type_sz(temp.type) == 8);
- dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
- temp = retype(temp_df, BRW_REGISTER_TYPE_F);
- }
-
- src_reg src = orig_src;
- for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
- int reg_offset = base_offset + src.offset / 16;
-
- src_reg offset;
- if (indirect.file != BAD_FILE) {
- offset = src_reg(this, glsl_type::uint_type);
- emit_before(block, inst, ADD(dst_reg(offset), indirect,
- brw_imm_ud(reg_offset * 16)));
- } else {
- offset = brw_imm_d(reg_offset * 16);
- }
-
- emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
- brw_imm_ud(index),
- offset,
- block, inst);
-
- src = byte_offset(src, 16);
- }
-
- if (is_64bit) {
- temp = retype(temp, BRW_REGISTER_TYPE_DF);
- shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
- }
-}
-
-/**
- * Implements array access of uniforms by inserting a
- * PULL_CONSTANT_LOAD instruction.
- *
- * Unlike temporary GRF array access (where we don't support it due to
- * the difficulty of doing relative addressing on instruction
- * destinations), we could potentially do array access of uniforms
- * that were loaded in GRF space as push constants. In real-world
- * usage we've seen, though, the arrays being used are always larger
- * than we could load as push constants, so just always move all
- * uniform array access out to a pull constant buffer.
- */
-void
-vec4_visitor::move_uniform_array_access_to_pull_constants()
-{
- /* The vulkan dirver doesn't support pull constants other than UBOs so
- * everything has to be pushed regardless.
- */
- if (!compiler->supports_pull_constants) {
- split_uniform_registers();
- return;
- }
-
- /* Allocate the pull_params array */
- assert(stage_prog_data->nr_pull_params == 0);
- stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
- this->uniforms * 4);
-
- int pull_constant_loc[this->uniforms];
- memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
-
- /* First, walk through the instructions and determine which things need to
- * be pulled. We mark something as needing to be pulled by setting
- * pull_constant_loc to 0.
- */
- foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
- /* We only care about MOV_INDIRECT of a uniform */
- if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
- inst->src[0].file != UNIFORM)
- continue;
-
- int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
-
- for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
- pull_constant_loc[uniform_nr + j] = 0;
- }
-
- /* Next, we walk the list of uniforms and assign real pull constant
- * locations and set their corresponding entries in pull_param.
- */
- for (int j = 0; j < this->uniforms; j++) {
- if (pull_constant_loc[j] < 0)
- continue;
-
- pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
-
- for (int i = 0; i < 4; i++) {
- stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
- = stage_prog_data->param[j * 4 + i];
- }
- }
-
- /* Finally, we can walk through the instructions and lower MOV_INDIRECT
- * instructions to actual uniform pulls.
- */
- foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
- /* We only care about MOV_INDIRECT of a uniform */
- if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
- inst->src[0].file != UNIFORM)
- continue;
-
- int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
-
- assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
-
- emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
- pull_constant_loc[uniform_nr], inst->src[1]);
- inst->remove(block);
- }
-
- /* Now there are no accesses of the UNIFORM file with a reladdr, so
- * no need to track them as larger-than-vec4 objects. This will be
- * relied on in cutting out unused uniform vectors from push
- * constants.
- */
- split_uniform_registers();
-}
-
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index bcaddf88a99..939bca12986 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -974,11 +974,9 @@ anv_physical_device_try_create(struct anv_instance *instance,
}
device->compiler->shader_debug_log = compiler_debug_log;
device->compiler->shader_perf_log = compiler_perf_log;
- device->compiler->supports_pull_constants = false;
device->compiler->constant_buffer_0_is_relative =
device->info.ver < 8 || !device->has_context_isolation;
device->compiler->supports_shader_constants = true;
- device->compiler->compact_params = false;
device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
isl_device_init(&device->isl_dev, &device->info);
More information about the mesa-commit mailing list