[Mesa-dev] [PATCH 2/6] i965/vec4/generator: use 1-Oword Block Read/Write messages for DF scratch writes/reads
Samuel Iglesias Gonsálvez
siglesias at igalia.com
Thu Jun 15 11:15:06 UTC 2017
Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
---
src/intel/compiler/brw_eu_defines.h | 2 +
src/intel/compiler/brw_shader.cpp | 5 +
src/intel/compiler/brw_vec4.cpp | 7 ++
src/intel/compiler/brw_vec4.h | 8 ++
src/intel/compiler/brw_vec4_generator.cpp | 136 +++++++++++++++++++++++++++
src/intel/compiler/brw_vec4_reg_allocate.cpp | 6 +-
src/intel/compiler/brw_vec4_visitor.cpp | 49 ++++++++++
7 files changed, 212 insertions(+), 1 deletion(-)
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 1af835d47e..3c148de0fa 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -436,6 +436,8 @@ enum opcode {
VEC4_OPCODE_PICK_HIGH_32BIT,
VEC4_OPCODE_SET_LOW_32BIT,
VEC4_OPCODE_SET_HIGH_32BIT,
+ VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW,
+ VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH,
FS_OPCODE_DDX_COARSE,
FS_OPCODE_DDX_FINE,
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index 53d0742d2e..248feacbd2 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -296,6 +296,11 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
case FS_OPCODE_PACK:
return "pack";
+
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ return "gen4_scratch_read_1word_low";
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
+ return "gen4_scratch_read_1word_high";
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return "gen4_scratch_read";
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index b443effca9..b6d409eea2 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -259,6 +259,8 @@ bool
vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
{
switch (opcode) {
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
case VEC4_OPCODE_DOUBLE_TO_F32:
case VEC4_OPCODE_DOUBLE_TO_D32:
@@ -335,6 +337,9 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
return 1;
case VS_OPCODE_PULL_CONSTANT_LOAD:
return 2;
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
+ return 1;
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return 2;
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -2091,6 +2096,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
{
/* Do not split some instructions that require special handling */
switch (inst->opcode) {
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
return inst->exec_size;
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
index d828da02ea..a5b45aca21 100644
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@@ -214,6 +214,9 @@ public:
enum brw_conditional_mod condition);
vec4_instruction *IF(enum brw_predicate predicate);
EMIT1(SCRATCH_READ)
+ vec4_instruction *DF_IVB_SCRATCH_READ(const dst_reg &dst, const src_reg &src0,
+ bool low);
+
EMIT2(SCRATCH_WRITE)
EMIT3(LRP)
EMIT1(BFREV)
@@ -294,6 +297,11 @@ public:
dst_reg dst,
src_reg orig_src,
int base_offset);
+ void emit_1grf_df_ivb_scratch_read(bblock_t *block,
+ vec4_instruction *inst,
+ dst_reg temp, src_reg orig_src,
+ int base_offset, bool first_grf);
+
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
int base_offset);
void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 334933d15a..3bb931385a 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1133,6 +1133,73 @@ generate_unpack_flags(struct brw_codegen *p,
}
static void
+generate_scratch_read_1oword(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ bool low)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ assert(devinfo->gen >= 7 && inst->exec_size == 4 &&
+ type_sz(dst.type) == 8);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+ if (!low) {
+ /* Read second GRF (offset in OWORDs) */
+ for (int i = 0; i < 2; i++) {
+ brw_oword_block_read_scratch(p,
+ dst,
+ brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i + 32, false, true);
+ if (i == 0) {
+ /* The scratch read message writes the 128 MSB (OWORD1 HIGH) of
+ * the destination. We need to move them to dst.0 so we can
+ * read the pending 128 bits without using a temporary register.
+ */
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ struct brw_reg tmp =
+ stride(suboffset(dst, 16 / type_sz(dst.type)),
+ 4, 4, 1);
+
+ brw_set_default_mask_control(p, true);
+ brw_MOV(p, dst, tmp);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ }
+ }
+ } else {
+ /* Read first GRF (offset in OWORDs) */
+ for (int i = 1; i >= 0; i--) {
+ brw_oword_block_read_scratch(p,
+ dst,
+ brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i, true, false);
+
+ if (i == 1) {
+ /* The scratch read message writes the 128 LSB (OWORD1 LOW) of
+ * the destination. We need to move them to dst.4 so we can
+ * read the pending 128 bits without using a temporary register.
+ */
+ struct brw_reg tmp = stride(dst, 4, 4, 1);
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_set_default_mask_control(p, true);
+ brw_MOV(p,
+ suboffset(dst, 16 / type_sz(dst.type)),
+ tmp);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ }
+ }
+ }
+
+ brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ return;
+}
+
+static void
generate_scratch_read(struct brw_codegen *p,
vec4_instruction *inst,
struct brw_reg dst,
@@ -1143,6 +1210,16 @@ generate_scratch_read(struct brw_codegen *p,
gen6_resolve_implied_move(p, &header, inst->base_mrf);
+ if (devinfo->gen >= 7 && inst->exec_size == 4 &&
+ type_sz(dst.type) == 8) {
+ /* First read second GRF (offset in OWORDs) */
+ struct brw_reg dst_high = suboffset(dst, 32 / type_sz(dst.type));
+ generate_scratch_read_1oword(p, inst, dst_high, index, false);
+ /* Now read first GRF (data from first vertex) */
+ generate_scratch_read_1oword(p, inst, dst, index, true);
+ return;
+ }
+
generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
index);
@@ -1192,6 +1269,57 @@ generate_scratch_write(struct brw_codegen *p,
struct brw_reg header = brw_vec8_grf(0, 0);
bool write_commit;
+ if (devinfo->gen >= 7 && inst->exec_size == 4 &&
+ type_sz(src.type) == 8) {
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* The messages only works with group == 0, we use the group to know which
+ * message emit (1-OWORD LOW or 1-OWORD HIGH).
+ */
+ brw_set_default_group(p, 0);
+
+ if (inst->group == 0) {
+ for (int i = 0; i < 2; i++) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_set_default_mask_control(p, true);
+ struct brw_reg temp =
+ retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
+ temp = stride(temp, 4, 4, 1);
+
+ brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 0),
+ temp);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+ /* Offset in OWORDs */
+ brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i, true, false);
+ }
+ } else {
+ for (int i = 0; i < 2; i++) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+
+ brw_set_default_mask_control(p, true);
+ struct brw_reg temp =
+ retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
+ temp = stride(temp, 4, 4, 1);
+
+ brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 4),
+ temp);
+
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+ /* Offset in OWORDs */
+ brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i + 32, false, true);
+ }
+ }
+ brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ return;
+ }
+
/* If the instruction is predicated, we'll predicate the send, not
* the header setup.
*/
@@ -1780,6 +1908,14 @@ generate_code(struct brw_codegen *p,
generate_vs_urb_write(p, inst);
break;
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ generate_scratch_read_1oword(p, inst, dst, src[0], true);
+ fill_count++;
+ break;
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
+ generate_scratch_read_1oword(p, inst, dst, src[0], false);
+ fill_count++;
+ break;
case SHADER_OPCODE_GEN4_SCRATCH_READ:
generate_scratch_read(p, inst, dst, src[0]);
fill_count++;
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index a0ba77b867..ec5ba10e86 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -332,7 +332,9 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
* reusing scratch_reg for this instruction.
*/
if (prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
- prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ)
+ prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ ||
+ prev_inst->opcode == VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW ||
+ prev_inst->opcode == VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH)
continue;
/* If the previous instruction does not write to scratch_reg, then check
@@ -467,6 +469,8 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
loop_scale /= 10;
break;
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
+ case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
for (int i = 0; i < 3; i++) {
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
index 22ee4dd1c4..37ae31c0d5 100644
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -264,6 +264,24 @@ vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
}
vec4_instruction *
+vec4_visitor::DF_IVB_SCRATCH_READ(const dst_reg &dst,
+ const src_reg &index,
+ bool first_grf)
+{
+ vec4_instruction *inst;
+ enum opcode op = first_grf ?
+ VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW :
+ VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH;
+
+ inst = new(mem_ctx) vec4_instruction(op,
+ dst, index);
+ inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
+ inst->mlen = 1;
+
+ return inst;
+}
+
+vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
const src_reg &index)
{
@@ -1472,6 +1490,37 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
/**
* Emits an instruction before @inst to load the value named by @orig_src
+ * from scratch space at @base_offset to @temp. This instruction only reads
+ * DF value on IVB, one GRF each time.
+ *
+ * @base_offset is measured in 32-byte units (the size of a register).
+ * @first_grf indicates if we want to read first vertex data (true) or
+ * the second (false).
+ */
+void
+vec4_visitor::emit_1grf_df_ivb_scratch_read(bblock_t *block,
+ vec4_instruction *inst,
+ dst_reg temp, src_reg orig_src,
+ int base_offset, bool first_grf)
+{
+ assert(orig_src.offset % REG_SIZE == 0);
+ src_reg index = get_scratch_offset(block, inst, 0, base_offset);
+
+ assert(devinfo->gen == 7 && !devinfo->is_haswell && type_sz(temp.type) == 8);
+ temp.offset = 0;
+ vec4_instruction *read = DF_IVB_SCRATCH_READ(temp, index, first_grf);
+ read->exec_size = 4;
+ /* The instruction will use group 0 but a different message depending of the
+ * vertex data to load.
+ */
+ read->group = 0;
+ read->offset = base_offset;
+ read->size_written = 1;
+ emit_before(block, inst, read);
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
* from scratch space at @base_offset to @temp.
*
* @base_offset is measured in 32-byte units (the size of a register).
--
2.11.0
More information about the mesa-dev
mailing list