<div dir="ltr">On 2 December 2013 11:39, Francisco Jerez <span dir="ltr"><<a href="mailto:currojerez@riseup.net" target="_blank">currojerez@riseup.net</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote">
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">---<br>
 src/mesa/drivers/dri/i965/brw_defines.h            |   4 +<br>
 src/mesa/drivers/dri/i965/brw_eu.h                 |  25 ++++<br>
 src/mesa/drivers/dri/i965/brw_eu_emit.c            | 166 +++++++++++++++++++++<br>
 src/mesa/drivers/dri/i965/brw_fs.cpp               |   3 +<br>
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |  18 +++<br>
 .../drivers/dri/i965/brw_schedule_instructions.cpp |   3 +<br>
 src/mesa/drivers/dri/i965/brw_shader.cpp           |   2 +<br>
 src/mesa/drivers/dri/i965/brw_vec4.cpp             |   3 +<br>
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp   |  18 +++<br>
 9 files changed, 242 insertions(+)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h<br>
index 988b07e..631473a 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_defines.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_defines.h<br>
@@ -780,6 +780,10 @@ enum opcode {<br>
    SHADER_OPCODE_UNTYPED_SURFACE_READ,<br>
    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,<br>
<br>
+   SHADER_OPCODE_TYPED_ATOMIC,<br>
+   SHADER_OPCODE_TYPED_SURFACE_READ,<br>
+   SHADER_OPCODE_TYPED_SURFACE_WRITE,<br>
+<br>
    SHADER_OPCODE_GEN4_SCRATCH_READ,<br>
    SHADER_OPCODE_GEN4_SCRATCH_WRITE,<br>
    SHADER_OPCODE_GEN7_SCRATCH_READ,<br>
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h<br>
index e17dc49..17822ce 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_eu.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_eu.h<br>
@@ -383,6 +383,31 @@ brw_untyped_surface_write(struct brw_compile *p,<br>
                           unsigned msg_length,<br>
                           unsigned num_channels);<br>
<br>
+void<br>
+brw_typed_atomic(struct brw_compile *p,<br>
+                 struct brw_reg dst,<br>
+                 struct brw_reg mrf,<br>
+                 struct brw_reg surface,<br>
+                 unsigned atomic_op,<br>
+                 unsigned msg_length,<br>
+                 bool response_expected);<br>
+<br>
+void<br>
+brw_typed_surface_read(struct brw_compile *p,<br>
+                       struct brw_reg dst,<br>
+                       struct brw_reg mrf,<br>
+                       struct brw_reg surface,<br>
+                       unsigned msg_length,<br>
+                       unsigned num_channels);<br>
+<br>
+void<br>
+brw_typed_surface_write(struct brw_compile *p,<br>
+                        struct brw_reg dst,<br>
+                        struct brw_reg mrf,<br>
+                        struct brw_reg surface,<br>
+                        unsigned msg_length,<br>
+                        unsigned num_channels);<br>
+<br>
 /***********************************************************************<br>
  * brw_eu_util.c:<br>
  */<br>
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
index 13dd59a..772be7a 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
@@ -2753,6 +2753,172 @@ brw_untyped_surface_write(struct brw_compile *p,<br>
    brw_send_indirect_message(p, sfid, dst, mrf, desc);<br>
 }<br>
<br>
+static void<br>
+brw_set_dp_typed_atomic_message(struct brw_compile *p,<br>
+                                struct brw_instruction *insn,<br>
+                                unsigned atomic_op,<br>
+                                bool response_expected)<br>
+{<br>
+   const unsigned access_mode = p->current->header.access_mode;<br>
+   const unsigned compression_control = p->current->header.compression_control;<br>
+<br>
+   if (p->brw->is_haswell) {<br>
+      if (access_mode == BRW_ALIGN_1) {<br>
+         if (compression_control == GEN6_COMPRESSION_2Q)<br>
+            insn->bits3.ud |= 1 << 12; /* Use high 8 slots of the sample mask */<br>
+<br>
+         insn->bits3.gen7_dp.msg_type =<br>
+            HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP;<br>
+      } else {<br>
+         insn->bits3.gen7_dp.msg_type =<br>
+            HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2;<br>
+      }<br>
+<br>
+   } else {<br>
+      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_RC_TYPED_ATOMIC_OP;<br>
+<br>
+      if (compression_control == GEN6_COMPRESSION_2Q)<br>
+         insn->bits3.ud |= 1 << 12; /* Use high 8 slots of the sample mask */<br></blockquote><div><br></div><div>As in the previous patch, it would be nice to have a brief reminder of why it is safe to use a SIMD8 operation in a SIMD4x2 shader.  A similar comment applies to brw_set_dp_typed_surface_{read,write}_message().<br>
</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+   }<br>
+<br>
+   if (response_expected)<br>
+      insn->bits3.ud |= 1 << 13; /* Return data expected */<br>
+<br>
+   insn->bits3.ud |= atomic_op << 8;<br>
+}<br>
+<br>
+void<br>
+brw_typed_atomic(struct brw_compile *p,<br>
+                 struct brw_reg dst,<br>
+                 struct brw_reg mrf,<br>
+                 struct brw_reg surface,<br>
+                 unsigned atomic_op,<br>
+                 unsigned msg_length,<br>
+                 bool response_expected) {<br>
+   const unsigned sfid = (p->brw->is_haswell ? HSW_SFID_DATAPORT_DATA_CACHE_1 :<br>
+                          GEN6_SFID_DATAPORT_RENDER_CACHE);<br>
+   struct brw_reg desc = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);<br>
+   struct brw_instruction *insn;<br>
+<br>
+   insn = brw_load_indirect_message_descriptor(<br>
+      p, desc, surface, msg_length,<br>
+      brw_surface_payload_size(p, response_expected, p->brw->is_haswell, false),<br>
+      true);<br></blockquote><div><br></div><div>My concern from patch 9 about passing surface directly to brw_load_indirect_message_descriptor() applies here as well.  A similar comment applies to brw_typed_surface_{read,write}().<br>
</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+   brw_set_dp_typed_atomic_message(<br>
+      p, insn, atomic_op, response_expected);<br>
+<br>
+   brw_send_indirect_message(p, sfid, dst, mrf, desc);<br>
+}<br>
+<br>
+static void<br>
+brw_set_dp_typed_surface_read_message(struct brw_compile *p,<br>
+                                      struct brw_instruction *insn,<br>
+                                      unsigned num_channels)<br>
+{<br>
+   const unsigned access_mode = p->current->header.access_mode;<br>
+   const unsigned compression_control = p->current->header.compression_control;<br>
+<br>
+   if (p->brw->is_haswell) {<br>
+      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ;<br>
+<br>
+      if (access_mode == BRW_ALIGN_1) {<br>
+         if (compression_control == GEN6_COMPRESSION_2Q)<br>
+            insn->bits3.ud |= 2 << 12; /* Use high 8 slots of the sample mask */<br>
+         else<br>
+            insn->bits3.ud |= 1 << 12; /* Use low 8 slots of the sample mask */<br>
+      }<br>
+<br>
+   } else {<br>
+      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_READ;<br>
+<br>
+      if (access_mode == BRW_ALIGN_1) {<br>
+         if (compression_control == GEN6_COMPRESSION_2Q)<br>
+            insn->bits3.ud |= 1 << 13; /* Use high 8 slots of the sample mask */<br>
+      }<br>
+   }<br>
+<br>
+   /* Set mask of unused channels. */<br>
+   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;<br>
+}<br>
+<br>
+void<br>
+brw_typed_surface_read(struct brw_compile *p,<br>
+                       struct brw_reg dst,<br>
+                       struct brw_reg mrf,<br>
+                       struct brw_reg surface,<br>
+                       unsigned msg_length,<br>
+                       unsigned num_channels)<br>
+{<br>
+   const unsigned sfid = (p->brw->is_haswell ? HSW_SFID_DATAPORT_DATA_CACHE_1 :<br>
+                          GEN6_SFID_DATAPORT_RENDER_CACHE);<br>
+   struct brw_reg desc = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);<br>
+   struct brw_instruction *insn;<br>
+<br>
+   insn = brw_load_indirect_message_descriptor(<br>
+      p, desc, surface, msg_length,<br>
+      brw_surface_payload_size(p, num_channels, p->brw->is_haswell, false),<br>
+      true);<br>
+<br>
+   brw_set_dp_typed_surface_read_message(<br>
+      p, insn, num_channels);<br>
+<br>
+   brw_send_indirect_message(p, sfid, dst, mrf, desc);<br>
+}<br>
+<br>
+static void<br>
+brw_set_dp_typed_surface_write_message(struct brw_compile *p,<br>
+                                       struct brw_instruction *insn,<br>
+                                       unsigned num_channels)<br>
+{<br>
+   const unsigned access_mode = p->current->header.access_mode;<br>
+   const unsigned compression_control = p->current->header.compression_control;<br>
+<br>
+   if (p->brw->is_haswell) {<br>
+      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE;<br>
+<br>
+      if (access_mode == BRW_ALIGN_1) {<br>
+         if (compression_control == GEN6_COMPRESSION_2Q)<br>
+            insn->bits3.ud |= 2 << 12; /* Use high 8 slots of the sample mask */<br>
+         else<br>
+            insn->bits3.ud |= 1 << 12; /* Use low 8 slots of the sample mask */<br>
+      }<br>
+<br>
+   } else {<br>
+      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE;<br>
+<br>
+      if (access_mode == BRW_ALIGN_1) {<br>
+         if (compression_control == GEN6_COMPRESSION_2Q)<br>
+            insn->bits3.ud |= 1 << 13; /* Use high 8 slots of the sample mask */<br>
+      }<br>
+   }<br>
+<br>
+   /* Set mask of unused channels. */<br>
+   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;<br>
+}<br>
+<br>
+void<br>
+brw_typed_surface_write(struct brw_compile *p,<br>
+                        struct brw_reg dst,<br>
+                        struct brw_reg mrf,<br>
+                        struct brw_reg surface,<br>
+                        unsigned msg_length,<br>
+                        unsigned num_channels)<br>
+{<br>
+   const unsigned sfid = (p->brw->is_haswell ? HSW_SFID_DATAPORT_DATA_CACHE_1 :<br>
+                          GEN6_SFID_DATAPORT_RENDER_CACHE);<br>
+   struct brw_reg desc = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);<br>
+   struct brw_instruction *insn;<br>
+<br>
+   insn = brw_load_indirect_message_descriptor(<br>
+      p, desc, surface, msg_length, 0, true);<br>
+<br>
+   brw_set_dp_typed_surface_write_message(<br>
+      p, insn, num_channels);<br>
+<br>
+   brw_send_indirect_message(p, sfid, dst, mrf, desc);<br>
+}<br>
+<br>
 /**<br>
  * This instruction is generated as a single-channel align1 instruction by<br>
  * both the VS and FS stages when using INTEL_DEBUG=shader_time.<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
index 721162f..20cb4b9 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
@@ -782,6 +782,9 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)<br>
    case SHADER_OPCODE_UNTYPED_ATOMIC:<br>
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:<br>
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:<br>
+   case SHADER_OPCODE_TYPED_ATOMIC:<br>
+   case SHADER_OPCODE_TYPED_SURFACE_READ:<br>
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:<br>
       return 0;<br>
    default:<br>
       assert(!"not reached");<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
index 2ebb90a..9601183 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
@@ -1696,6 +1696,24 @@ fs_generator::generate_code(exec_list *instructions)<br>
                                    src[0], inst->mlen, src[1].dw1.ud);<br>
          break;<br>
<br>
+      case SHADER_OPCODE_TYPED_ATOMIC:<br>
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);<br>
+         brw_typed_atomic(p, dst, brw_message_reg(inst->base_mrf),<br>
+                          src[0], src[1].dw1.ud, inst->mlen, true);<br>
+         break;<br>
+<br>
+      case SHADER_OPCODE_TYPED_SURFACE_READ:<br>
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);<br>
+         brw_typed_surface_read(p, dst, brw_message_reg(inst->base_mrf),<br>
+                                src[0], inst->mlen, src[1].dw1.ud);<br>
+         break;<br>
+<br>
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:<br>
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);<br>
+         brw_typed_surface_write(p, dst, brw_message_reg(inst->base_mrf),<br>
+                                 src[0], inst->mlen, src[1].dw1.ud);<br>
+         break;<br>
+<br></blockquote><div><br></div><div>As in the previous patch, I'm concerned that brw_mark_surface_used() isn't being called.  Same comment applies to brw_vec4_generator.cpp.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

       case FS_OPCODE_SET_SIMD4X2_OFFSET:<br>
          generate_set_simd4x2_offset(inst, dst, src[0]);<br>
          break;<br>
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
index 39b63bc..0e4d8f1 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
@@ -338,6 +338,7 @@ schedule_node::set_latency_gen7(bool is_haswell)<br>
       break;<br>
<br>
    case SHADER_OPCODE_UNTYPED_ATOMIC:<br>
+   case SHADER_OPCODE_TYPED_ATOMIC:<br>
       /* Test code:<br>
        *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };<br>
        *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };<br>
@@ -357,6 +358,8 @@ schedule_node::set_latency_gen7(bool is_haswell)<br>
<br>
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:<br>
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:<br>
+   case SHADER_OPCODE_TYPED_SURFACE_READ:<br>
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:<br>
       /* Test code:<br>
        *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };<br>
        *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };<br></blockquote><div><br></div><div>As in the previous patch, I don't think writes have the same latency as reads.<br><br></div>
<div>With those issues addressed, the patch is:<br><br>Reviewed-by: Paul Berry <<a href="mailto:stereotype441@gmail.com">stereotype441@gmail.com</a>><br></div></div></div></div>