[Mesa-dev] [PATCH v2 10/15] SQUASH: i965/fs: Rework fs_visitor::lower_load_payload
Jason Ekstrand
jason at jlekstrand.net
Tue May 5 18:28:13 PDT 2015
Instead of the complicated and broken-by-design pile of heuristics we had
before, we now have a straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 171 ++++++++++++++-------------
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 -
2 files changed, 90 insertions(+), 84 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ee520bb..e7095c9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3419,99 +3419,108 @@ fs_visitor::lower_load_payload()
{
bool progress = false;
- int vgrf_to_reg[alloc.count];
- int reg_count = 0;
- for (unsigned i = 0; i < alloc.count; ++i) {
- vgrf_to_reg[i] = reg_count;
- reg_count += alloc.sizes[i];
- }
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ continue;
- struct {
- bool written:1; /* Whether this register has ever been written */
- bool force_writemask_all:1;
- bool force_sechalf:1;
- } metadata[reg_count];
- memset(metadata, 0, sizeof(metadata));
+ assert(inst->dst.file == MRF || inst->dst.file == GRF);
+ assert(inst->saturate == false);
- foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
- if (inst->dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
- bool force_sechalf = inst->force_sechalf &&
- !inst->force_writemask_all;
- bool toggle_sechalf = inst->dst.width == 16 &&
- type_sz(inst->dst.type) == 4 &&
- !inst->force_writemask_all;
- for (int i = 0; i < inst->regs_written; ++i) {
- metadata[dst_reg + i].written = true;
- metadata[dst_reg + i].force_sechalf = force_sechalf;
- metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
- force_sechalf = (toggle_sechalf != force_sechalf);
+ fs_reg dst = inst->dst;
+
+ /* Get rid of COMPR4. We'll add it back in if we need it */
+ if (dst.file == MRF)
+ dst.reg = dst.reg & ~BRW_MRF_COMPR4;
+
+ dst.width = 8;
+ for (uint8_t i = 0; i < inst->header_size; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
+ mov_src.width = 8;
+ fs_inst *mov = MOV(mov_dst, mov_src);
+ mov->force_writemask_all = true;
+ inst->insert_before(block, mov);
}
+ dst = offset(dst, 1);
}
- if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
- assert(inst->dst.file == MRF || inst->dst.file == GRF);
- fs_reg dst = inst->dst;
-
- for (int i = 0; i < inst->sources; i++) {
- dst.width = inst->src[i].effective_width;
- dst.type = inst->src[i].type;
-
- if (inst->src[i].file == BAD_FILE) {
- /* Do nothing but otherwise increment as normal */
- } else if (dst.file == MRF &&
- dst.width == 8 &&
- devinfo->has_compr4 &&
- i + 4 < inst->sources &&
- inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
- fs_reg compr4_dst = dst;
- compr4_dst.reg += BRW_MRF_COMPR4;
- compr4_dst.width = 16;
- fs_reg compr4_src = inst->src[i];
- compr4_src.width = 16;
- fs_inst *mov = MOV(compr4_dst, compr4_src);
- mov->force_writemask_all = true;
- inst->insert_before(block, mov);
- /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
- inst->src[i + 4].file = BAD_FILE;
- } else {
- fs_inst *mov = MOV(dst, inst->src[i]);
- if (inst->src[i].file == GRF) {
- int src_reg = vgrf_to_reg[inst->src[i].reg] +
- inst->src[i].reg_offset;
- mov->force_sechalf = metadata[src_reg].force_sechalf;
- mov->force_writemask_all = metadata[src_reg].force_writemask_all;
+ dst.width = inst->exec_size;
+ if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
+ inst->exec_size > 8) {
+ /* In this case, the payload portion of the LOAD_PAYLOAD isn't
+ * a straightforward copy. Instead, the result of the
+ * LOAD_PAYLOAD is treated as interleaved and the first four
+ * non-header sources are unpacked as:
+ *
+ * m + 0: r0
+ * m + 1: g0
+ * m + 2: b0
+ * m + 3: a0
+ * m + 4: r1
+ * m + 5: g1
+ * m + 6: b1
+ * m + 7: a1
+ *
+ * This is used for gen <= 5 fb writes.
+ */
+ assert(inst->exec_size == 16);
+ assert(inst->header_size + 4 <= inst->sources);
+ for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ if (devinfo->has_compr4) {
+ fs_reg compr4_dst = retype(dst, inst->src[i].type);
+ compr4_dst.reg |= BRW_MRF_COMPR4;
+
+ fs_inst *mov = MOV(compr4_dst, inst->src[i]);
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
} else {
- /* We don't have any useful metadata for immediates or
- * uniforms. Assume that any of the channels of the
- * destination may be used.
- */
- assert(inst->src[i].file == IMM ||
- inst->src[i].file == UNIFORM);
- mov->force_writemask_all = true;
- }
-
- if (dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
- const bool force_writemask = mov->force_writemask_all;
- metadata[dst_reg].force_writemask_all = force_writemask;
- metadata[dst_reg].force_sechalf = mov->force_sechalf;
- if (dst.width * type_sz(dst.type) > 32) {
- assert(!mov->force_sechalf);
- metadata[dst_reg + 1].force_writemask_all = force_writemask;
- metadata[dst_reg + 1].force_sechalf = !force_writemask;
- }
+ /* Platform doesn't have COMPR4. We have to fake it */
+ fs_reg mov_dst = retype(dst, inst->src[i].type);
+ mov_dst.width = 8;
+
+ fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
+
+ mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
+ mov->force_writemask_all = inst->force_writemask_all;
+ mov->force_sechalf = true;
+ inst->insert_before(block, mov);
}
-
- inst->insert_before(block, mov);
}
- dst = offset(dst, 1);
+ dst.reg++;
}
- inst->remove(block);
- progress = true;
+ /* The loop above only ever incremented us through the first set
+ * of 4 registers. However, thanks to the magic of COMPR4, we
+ * actually wrote to the first 8 registers, so we need to take
+ * that into account now.
+ */
+ dst.reg += 4;
+
+ /* The COMPR4 code took care of the first 4 sources. We'll let
+ * the regular path handle any remaining sources. Yes, we are
+ * modifying the instruction but we're about to delete it so
+ * this really doesn't hurt anything.
+ */
+ inst->header_size += 4;
}
+
+ for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_inst *mov = MOV(retype(dst, inst->src[i].type),
+ inst->src[i]);
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
+ }
+ dst = offset(dst, 1);
+ }
+
+ inst->remove(block);
+ progress = true;
}
if (progress)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index c3ccc9d..772285e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -3602,9 +3602,6 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
color.type, color.width);
inst = emit(MOV(dst[len], offset(color, i)));
inst->saturate = key->clamp_fragment_color;
- } else if (color.width == 16) {
- /* We need two BAD_FILE slots for a 16-wide color */
- len++;
}
len++;
}
--
2.3.6
More information about the mesa-dev
mailing list