[Mesa-dev] [PATCH 1/7] i965: Introduce the BROADCAST pseudo-opcode.

Francisco Jerez currojerez at riseup.net
Fri Feb 20 11:48:58 PST 2015


The BROADCAST instruction picks the channel of its first source that is
selected by the index passed in as its second source.  This will be used
in situations where all channels of the same SIMD thread have to agree
on the value of something, e.g. a surface binding table index.
---
 src/mesa/drivers/dri/i965/brw_defines.h          |  6 ++
 src/mesa/drivers/dri/i965/brw_eu.h               |  6 ++
 src/mesa/drivers/dri/i965/brw_eu_emit.c          | 77 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  4 ++
 src/mesa/drivers/dri/i965/brw_shader.cpp         |  3 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  4 ++
 6 files changed, 100 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 17c27dd..d4930e3 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -911,6 +911,12 @@ enum opcode {
 
    SHADER_OPCODE_URB_WRITE_SIMD8,
 
+   /**
+    * Pick the channel from its first source register given by the index
+    * specified as second source.  Useful for variable indexing of surfaces.
+    */
+   SHADER_OPCODE_BROADCAST,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index a94ea42..2505480 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -413,6 +413,12 @@ brw_pixel_interpolator_query(struct brw_compile *p,
                              unsigned msg_length,
                              unsigned response_length);
 
+void
+brw_broadcast(struct brw_compile *p,
+              struct brw_reg dst,
+              struct brw_reg src,
+              struct brw_reg idx);
+
 /***********************************************************************
  * brw_eu_util.c:
  */
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 1d6fd67..d7e3995 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2854,6 +2854,83 @@ brw_pixel_interpolator_query(struct brw_compile *p,
    brw_inst_set_pi_message_data(brw, insn, data);
 }
 
+void
+brw_broadcast(struct brw_compile *p,
+              struct brw_reg dst,
+              struct brw_reg src,
+              struct brw_reg idx)
+{
+   const struct brw_context *brw = p->brw;
+   const bool align1 = (brw_inst_access_mode(brw, p->current) == BRW_ALIGN_1);
+   brw_inst *inst;
+   /* Copy the src channel selected by idx to dst; src must be a direct GRF. */
+   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
+          src.address_mode == BRW_ADDRESS_DIRECT);
+
+   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
+       idx.file == BRW_IMMEDIATE_VALUE) {
+      /* Trivial: the source is already uniform or the index is an immediate,
+       * so one MOV of the selected component suffices.  The optimizer should
+       * normally keep us from getting here, but asserting would be mean.
+       */
+      const unsigned i = (idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0);
+      brw_MOV(p, dst,
+              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
+               stride(suboffset(src, 4 * i), 0, 4, 1)));
+
+   } else {
+      if (align1) {
+         const struct brw_reg addr =
+            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+         const unsigned offset = src.nr * REG_SIZE + src.subnr;
+         /* Limit in bytes of the signed indirect addressing immediate. */
+         const unsigned limit = 512;
+
+         brw_push_insn_state(p);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+         /* Scale the index by the component size and horizontal stride. */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, vec1(idx),
+                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* The indirect addressing immediate can only reach up to limit
+          * bytes, so when the source register lies above that, add the
+          * limit-aligned part of its offset to the address register instead.
+          */
+         if (offset >= limit)
+            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+
+         brw_pop_insn_state(p);
+
+         /* Use indirect addressing to fetch the specified component. */
+         brw_MOV(p, dst,
+                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
+                        src.type));
+
+      } else {
+         /* In SIMD4x2 mode the index can be either zero or one.  Replicate it
+          * to all bits of a flag register via a MOV.NZ to the null register,
+          */
+         inst = brw_MOV(p,
+                        brw_null_reg(),
+                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
+         brw_inst_set_pred_control(brw, inst, BRW_PREDICATE_NONE);
+         brw_inst_set_cond_modifier(brw, inst, BRW_CONDITIONAL_NZ);
+         brw_inst_set_flag_reg_nr(brw, inst, 1);
+
+         /* and use predicated SEL to pick the right channel. */
+         inst = brw_SEL(p, dst,
+                        stride(suboffset(src, 4), 0, 4, 1),
+                        stride(src, 0, 4, 1));
+         brw_inst_set_pred_control(brw, inst, BRW_PREDICATE_NORMAL);
+         brw_inst_set_flag_reg_nr(brw, inst, 1);
+      }
+   }
+}
+
 /**
  * This instruction is generated as a single-channel align1 instruction by
  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 86cc667..b611641 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -2015,6 +2015,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          generate_set_simd4x2_offset(inst, dst, src[0]);
          break;
 
+      case SHADER_OPCODE_BROADCAST:
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
       case FS_OPCODE_SET_OMASK:
          generate_set_omask(inst, dst, src[0]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index fbb20bc..bbb5532 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -471,6 +471,9 @@ brw_instruction_name(enum opcode op)
    case SHADER_OPCODE_URB_WRITE_SIMD8:
       return "gen8_urb_write_simd8";
 
+   case SHADER_OPCODE_BROADCAST:
+      return "broadcast";
+
    case VEC4_OPCODE_MOV_BYTES:
       return "mov_bytes";
    case VEC4_OPCODE_PACK_BYTES:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 67c8285..60384c3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1508,6 +1508,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
          generate_untyped_surface_read(inst, dst, src[0]);
          break;
 
+      case SHADER_OPCODE_BROADCAST:
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
          generate_unpack_flags(dst);
          break;
-- 
2.1.3



More information about the mesa-dev mailing list