[Mesa-dev] [PATCH 14/12] i965/fs: Copy propagate partial reads.
Matt Turner
mattst88 at gmail.com
Mon Sep 29 09:50:36 PDT 2014
On Fri, Sep 26, 2014 at 12:24 PM, Jason Ekstrand <jason at jlekstrand.net> wrote:
> This commit reworks copy propagation a bit to support propagating the
> copying of partial registers. This comes up every time we have pull
> constants because we do a pull constant read immediately followed by a move
> to splat the one component of the out to 8 or 16-wide. This allows us to
> eliminate the copy and simply use the one component of the register.
>
> Shader DB results:
>
> total instructions in shared programs: 5044937 -> 5044428 (-0.01%)
> instructions in affected programs: 66112 -> 65603 (-0.77%)
> GAINED: 0
> LOST: 0
>
> Signed-off-by: Jason Ekstrand <jason.ekstrand at intel.com>
> ---
> src/mesa/drivers/dri/i965/brw_fs.h | 1 +
> .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 83 ++++++++++++++++------
> 2 files changed, 64 insertions(+), 20 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 50b5fc1..9b63114 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -337,6 +337,7 @@ public:
> bool opt_cse_local(bblock_t *block);
> bool opt_copy_propagate();
> bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
> + bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
> bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block,
> exec_list *acp);
> void opt_drop_redundant_mov_to_flags();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> index e5816df..a97dc04 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> @@ -277,24 +277,30 @@ is_logic_op(enum opcode opcode)
> bool
> fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
> {
> + if (inst->src[arg].file != GRF)
> + return false;
> +
> if (entry->src.file == IMM)
> return false;
> + assert(entry->src.file == GRF || entry->src.file == UNIFORM);
>
> if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
> inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
> return false;
>
> - /* Bail if inst is reading more than entry is writing. */
> - if ((inst->regs_read(this, arg) * inst->src[arg].stride *
> - type_sz(inst->src[arg].type)) > type_sz(entry->dst.type))
> + assert(entry->dst.file == GRF);
> + if (inst->src[arg].reg != entry->dst.reg)
> return false;
>
> - if (inst->src[arg].file != entry->dst.file ||
> - inst->src[arg].reg != entry->dst.reg ||
> - inst->src[arg].reg_offset != entry->dst.reg_offset ||
> - inst->src[arg].subreg_offset != entry->dst.subreg_offset) {
> + /* Bail if inst is reading a range that isn't contained in the range
> + * that entry is writing.
> + */
> + int reg_size = dispatch_width * sizeof(float);
> + if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
> + (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset +
> + inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) >
> + (entry->dst.reg_offset + 1) * reg_size)
> return false;
> - }
>
> /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
> if (inst->conditional_mod &&
> @@ -361,11 +367,39 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
>
> inst->src[arg].file = entry->src.file;
> inst->src[arg].reg = entry->src.reg;
> - inst->src[arg].reg_offset = entry->src.reg_offset;
> - inst->src[arg].subreg_offset = entry->src.subreg_offset;
> inst->src[arg].stride *= entry->src.stride;
> inst->saturate = inst->saturate || entry->saturate;
>
> + switch (entry->src.file) {
> + case BAD_FILE:
> + case HW_REG:
> + case UNIFORM:
> + inst->src[arg].reg_offset = entry->src.reg_offset;
> + inst->src[arg].subreg_offset = entry->src.subreg_offset;
> + break;
> + case GRF:
> + {
> + /* In this case, we have to deal with mapping parts of vgrfs to
> + * other parts of vgrfs so we have to do some reg_offset magic.
> + */
> +
> + /* Compute the offset of inst->src[arg] relative to inst->dst */
> + assert(entry->dst.subreg_offset == 0);
> + int rel_offset = inst->src[arg].reg_offset - entry->dst.reg_offset;
> + int rel_suboffset = inst->src[arg].subreg_offset;
> +
> + /* Compute the final register offset (in bytes) */
> + int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset;
> + offset += rel_offset * reg_size + rel_suboffset;
> + inst->src[arg].reg_offset = offset / reg_size;
> + inst->src[arg].subreg_offset = offset % reg_size;
> + }
> + break;
> + default:
> + unreachable("Invalid register file");
> + break;
> + }
> +
> if (!inst->src[arg].abs) {
> inst->src[arg].abs = entry->src.abs;
> inst->src[arg].negate ^= entry->src.negate;
> @@ -375,9 +409,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
> }
>
>
> -static bool
> -try_constant_propagate(struct brw_context *brw, fs_inst *inst,
> - acp_entry *entry)
> +bool
> +fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
> {
> bool progress = false;
>
> @@ -385,12 +418,22 @@ try_constant_propagate(struct brw_context *brw, fs_inst *inst,
> return false;
>
> for (int i = inst->sources - 1; i >= 0; i--) {
> - if (inst->src[i].file != entry->dst.file ||
> - inst->src[i].reg != entry->dst.reg ||
> - inst->src[i].reg_offset != entry->dst.reg_offset ||
> - inst->src[i].subreg_offset != entry->dst.subreg_offset ||
> - inst->src[i].type != entry->dst.type ||
> - inst->src[i].stride > 1)
> + if (inst->src[i].file != GRF)
> + continue;
> +
> + assert(entry->dst.file == GRF);
> + if (inst->src[i].reg != entry->dst.reg ||
> + inst->src[i].type != entry->dst.type)
> + continue;
> +
> + /* Bail if inst is reading a range that isn't contained in the range
> + * that entry is writing.
> + */
> + int reg_size = dispatch_width * sizeof(float);
> + if (inst->src[i].reg_offset < entry->dst.reg_offset ||
> + (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset +
> + inst->regs_read(this, i) * inst->src[i].stride * reg_size) >
> + (entry->dst.reg_offset + 1) * reg_size)
> continue;
Does this bit, the constant propagation code, actually benefit from
this? I guess it'd let an immediate loaded with a mov(16) be constant
propagated into two SIMD8 instructions? That seems plausible, I was
just a little surprised at first.
More information about the mesa-dev
mailing list