[Mesa-dev] [PATCH 5/5] i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.

Kenneth Graunke kenneth at whitecape.org
Fri Apr 22 05:32:09 UTC 2016


Unlike most shader stages, the Hull Shader hardware makes us explicitly
tell it how many threads to dispatch and manually configure the channel
mask.  One perk of this is that we have a lot of flexibility - we can
run it in either SIMD4x2 or SIMD8 mode.

Treating it as SIMD8 means that shaders with 8 or fewer output vertices
(which is overwhelmingly the common case) can be handled by a single
thread.  This has several intriguing properties:

- Accessing input arrays with gl_InvocationID as the index is a simple
  SIMD8 URB read with g1 as the header.  No indirect addressing required.
- Barriers are no-ops.
- We could potentially do output shadowing to combine writes, as the
  concurrency concerns are gone.  (We don't do this yet, though.)

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.c     |   4 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp         |  97 ++++++++
 src/mesa/drivers/dri/i965/brw_fs.h           |   5 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp     | 356 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |   3 +
 src/mesa/drivers/dri/i965/brw_tcs.c          |   3 +-
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp   |  59 ++++-
 7 files changed, 512 insertions(+), 15 deletions(-)

Shockingly, this appears to cut instruction counts in Unigine Heaven
(-2.5 to 5.5%), Synmark (-31%), and Tessmark (-37%).  It increases
instruction counts in Shadow of Mordor (up to +57%) - but again, this
is running in scalar mode, so larger instruction counts are expected :)
I also have a bunch of optimizations in progress that will help those.

Cycle counts look pretty good too.

This patch leaves it off by default because I haven't properly benchmarked
it yet.  I fully expect we'll turn it on by default.

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 4496699..93a30a5 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -152,7 +152,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", false);
    compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
@@ -194,6 +195,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
    compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;
 
    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5d6a107..be5edb8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1758,6 +1758,21 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
+void
 fs_visitor::assign_tes_urb_setup()
 {
    assert(stage == MESA_SHADER_TESS_EVAL);
@@ -5463,6 +5478,88 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
 }
 
 bool
+fs_visitor::run_tcs_single_patch()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   struct brw_tcs_prog_data *tcs_prog_data =
+      (struct brw_tcs_prog_data *) prog_data;
+
+   /* r1-r4 contain the ICP handles. */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   /* Initialize gl_InvocationID */
+   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+   bld.MOV(channels_ud, channels_uw);
+
+   if (tcs_prog_data->instances == 1) {
+      invocation_id = channels_ud;
+   } else {
+      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+              brw_imm_ud(INTEL_MASK(23, 17)));
+      bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
+
+      bld.ADD(invocation_id, instance_times_8, channels_ud);
+   }
+
+   /* Fix the dispatch mask */
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.CMP(bld.null_reg_ud(), invocation_id,
+              brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   emit_nir_code();
+
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   /* Emit EOT write; set TR DS Cache bit */
+   fs_reg srcs[3] = {
+      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+      fs_reg(brw_imm_ud(0)),
+   };
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
+
+   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+                            bld.null_reg_ud(), payload);
+   inst->mlen = 3;
+   inst->base_mrf = -1;
+   inst->eot = true;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tcs_single_patch_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
+bool
 fs_visitor::run_tes()
 {
    assert(stage == MESA_SHADER_TESS_EVAL);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index bcd2e3e..f24c78a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -110,6 +110,7 @@ public:
 
    bool run_fs(bool do_rep_send);
    bool run_vs(gl_clip_plane *clip_planes);
+   bool run_tcs_single_patch();
    bool run_tes();
    bool run_gs();
    bool run_cs();
@@ -126,6 +127,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
+   void assign_tcs_single_patch_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling);
@@ -249,6 +251,8 @@ public:
                        nir_ssa_undef_instr *instr);
    void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
                               nir_intrinsic_instr *instr);
+   void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
    void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
                               nir_intrinsic_instr *instr);
    void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
@@ -404,6 +408,7 @@ public:
    fs_reg userplane[MAX_CLIP_PLANES];
    fs_reg final_gs_vertex_count;
    fs_reg control_data_bits;
+   fs_reg invocation_id;
 
    unsigned grf_used;
    bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index cf4f782..e617083 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -114,6 +114,9 @@ fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
 void
 fs_visitor::nir_setup_outputs()
 {
+   if (stage == MESA_SHADER_TESS_CTRL)
+      return;
+
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
@@ -232,6 +235,8 @@ emit_system_values_block(nir_block *block, void *void_visitor)
          break;
 
       case nir_intrinsic_load_invocation_id:
+         if (v->stage == MESA_SHADER_TESS_CTRL)
+            break;
          assert(v->stage == MESA_SHADER_GEOMETRY);
          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
          if (reg->file == BAD_FILE) {
@@ -452,6 +457,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       case MESA_SHADER_VERTEX:
          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
+      case MESA_SHADER_TESS_CTRL:
+         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
       case MESA_SHADER_TESS_EVAL:
          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
@@ -1901,6 +1909,354 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
 }
 
 void
+fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data =
+      (struct brw_tcs_prog_data *) prog_data;
+
+   fs_reg dst;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dst = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_invocation_id:
+      bld.MOV(retype(dst, invocation_id.type), invocation_id);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
+              brw_imm_d(tcs_key->input_vertices));
+      break;
+
+   case nir_intrinsic_barrier: {
+      if (tcs_prog_data->instances == 1)
+         break;
+
+      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
+
+      const fs_builder fwa_bld = bld.exec_all();
+
+      /* Zero the message header */
+      fwa_bld.MOV(m0, brw_imm_ud(0u));
+
+      /* Copy "Barrier ID" from r0.2, bits 16:13 */
+      fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+                  brw_imm_ud(INTEL_MASK(16, 13)));
+
+      /* Shift it up to bits 27:24. */
+      fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
+
+      /* Set the Barrier Count and the enable bit */
+      fwa_bld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
+
+      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should never give us these.");
+      break;
+
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      const nir_src &vertex_src = instr->src[0];
+      nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+
+      fs_inst *inst;
+
+      fs_reg icp_handle;
+
+      if (vertex_const) {
+         /* Emit a MOV to resolve <0,1,0> regioning. */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
+                                     vertex_const->i32[0] & 7),
+                        BRW_REGISTER_TYPE_UD));
+      } else if (tcs_prog_data->instances == 1 &&
+                 vertex_src.is_ssa &&
+                 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
+                 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
+         /* For the common case of only 1 instance, an array index of
+          * gl_InvocationID means reading g1.  Skip all the indirect work.
+          */
+         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* Each ICP handle is a single DWord (4 bytes) */
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Start at g1.  We might read up to 4 registers. */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes,
+                  brw_imm_ud(4 * REG_SIZE));
+      }
+
+      if (indirect_offset.file == BAD_FILE) {
+         /* Constant indexing - use global offset. */
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+         inst->offset = imm_offset;
+         inst->mlen = 1;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = { icp_handle, indirect_offset };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 2;
+         inst->regs_written = instr->num_components;
+      }
+
+      /* Copy the temporary to the destination to deal with writemasking.
+       *
+       * Also attempt to deal with gl_PointSize being in the .w component.
+       */
+      if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+         inst->dst = bld.vgrf(dst.type, 4);
+         inst->regs_written = 4;
+         bld.MOV(dst, offset(inst->dst, bld, 3));
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Replicate the patch handle to all enabled channels */
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(patch_handle,
+                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         if (imm_offset == 0) {
+            /* This is a read of gl_TessLevelInner[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            dst.type = BRW_REGISTER_TYPE_F;
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS: {
+               /* DWords 3-2 (reversed) */
+               fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
+               inst->offset = 0;
+               inst->mlen = 1;
+               inst->base_mrf = -1;
+               inst->regs_written = 4;
+
+               /* dst.xy = tmp.wz */
+               bld.MOV(dst,                 offset(tmp, bld, 3));
+               bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
+               break;
+            }
+            case GL_TRIANGLES:
+               /* DWord 4; hardcode offset = 1 and regs_written = 1 */
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
+               inst->offset = 1;
+               inst->mlen = 1;
+               inst->base_mrf = -1;
+               inst->regs_written = 1;
+               break;
+            case GL_ISOLINES:
+               /* All channels are undefined. */
+               break;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+         } else if (imm_offset == 1) {
+            /* This is a read of gl_TessLevelOuter[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            dst.type = BRW_REGISTER_TYPE_F;
+
+            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
+            inst->offset = 1;
+            inst->mlen = 1;
+            inst->base_mrf = -1;
+            inst->regs_written = 4;
+
+            /* Reswizzle: WZYX */
+            fs_reg srcs[4] = {
+               offset(tmp, bld, 3),
+               offset(tmp, bld, 2),
+               offset(tmp, bld, 1),
+               offset(tmp, bld, 0),
+            };
+
+            unsigned num_components;
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS:
+               num_components = 4;
+               break;
+            case GL_TRIANGLES:
+               num_components = 3;
+               break;
+            case GL_ISOLINES:
+               /* Isolines are not reversed; swizzle .zw -> .xy */
+               srcs[0] = offset(tmp, bld, 2);
+               srcs[1] = offset(tmp, bld, 3);
+               num_components = 2;
+               break;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+            bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->mlen = 2;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+      unsigned mask = instr->const_index[1];
+      unsigned header_regs = 0;
+      fs_reg srcs[7];
+      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+      if (indirect_offset.file != BAD_FILE) {
+         srcs[header_regs++] = indirect_offset;
+      } else if (tcs_key->program_string_id != 0) {
+         if (imm_offset == 0) {
+            value.type = BRW_REGISTER_TYPE_F;
+
+            mask &= (1 << tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
+
+            /* This is a write to gl_TessLevelInner[], which lives in the
+             * Patch URB header.  The layout depends on the domain.
+             */
+            switch (tcs_key->tes_primitive_mode) {
+            case GL_QUADS:
+               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+                * We use an XXYX swizzle to put .xy in the .wz channels
+                * (reversed), and use a .zw writemask.
+                */
+               mask = writemask_for_backwards_vector(mask);
+               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+               break;
+            case GL_TRIANGLES:
+               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+                * writemask to X and bump the URB offset by 1.
+                */
+               imm_offset = 1;
+               break;
+            case GL_ISOLINES:
+               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+               return;
+            default:
+               unreachable("Bogus tessellation domain");
+            }
+         } else if (imm_offset == 1) {
+            /* This is a write to gl_TessLevelOuter[] which lives in the
+             * Patch URB Header at DWords 4-7.  However, it's reversed, so
+             * instead of .xyzw we have .wzyx.
+             */
+            value.type = BRW_REGISTER_TYPE_F;
+
+            mask &= (1 << tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
+
+            if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
+               /* Isolines .xy should be stored in .zw, in order. */
+               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+               mask <<= 2;
+            } else {
+               /* Other domains are reversed; store .wzyx instead of .xyzw */
+               swiz = BRW_SWIZZLE_WZYX;
+               mask = writemask_for_backwards_vector(mask);
+            }
+         }
+      }
+
+      if (mask == 0)
+         break;
+
+      unsigned num_components = _mesa_fls(mask);
+      enum opcode opcode;
+
+      if (mask != WRITEMASK_XYZW) {
+         srcs[header_regs++] = brw_imm_ud(mask << 16);
+         opcode = indirect_offset.file != BAD_FILE ?
+            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+      } else {
+         opcode = indirect_offset.file != BAD_FILE ?
+            SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
+            SHADER_OPCODE_URB_WRITE_SIMD8;
+      }
+
+      for (unsigned i = 0; i < num_components; i++) {
+         if (mask & (1 << i))
+            srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i));
+      }
+
+      unsigned mlen = header_regs + num_components;
+
+      fs_reg payload =
+         bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+      bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
+
+      fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
+      inst->offset = imm_offset;
+      inst->mlen = mlen;
+      inst->base_mrf = -1;
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index daabf70..41a9b12 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1014,6 +1014,9 @@ fs_visitor::init()
    case MESA_SHADER_VERTEX:
       key_tex = &((const brw_vs_prog_key *) key)->tex;
       break;
+   case MESA_SHADER_TESS_CTRL:
+      key_tex = &((const brw_tcs_prog_key *) key)->tex;
+      break;
    case MESA_SHADER_TESS_EVAL:
       key_tex = &((const brw_tes_prog_key *) key)->tex;
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 0117ffe..98ed2b2 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -214,7 +214,8 @@ brw_codegen_tcs_prog(struct brw_context *brw,
       prog_data.base.base.nr_image_params = tcs->NumImages;
 
       brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
-                                  &prog_data.base.base, false);
+                                  &prog_data.base.base,
+                                  compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
    } else {
       /* Upload the Patch URB Header as the first two uniforms.
        * Do the annoying scrambling so the shader doesn't have to.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 17e3448..79cf93e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -29,6 +29,7 @@
 
 #include "brw_nir.h"
 #include "brw_vec4_tcs.h"
+#include "brw_fs.h"
 
 namespace brw {
 
@@ -452,7 +453,10 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
-   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
+   if (is_scalar)
+      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
+   else
+      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -493,20 +497,49 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &vue_prog_data->vue_map);
    }
 
-   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
-                      nir, mem_ctx, shader_time_index, &input_vue_map);
-   if (!v.run()) {
-      if (error_str)
-         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-      return NULL;
-   }
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, NULL, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tcs_single_patch()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
 
-   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
-      v.dump_instructions();
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_CTRL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation control shader %s",
+                                        nir->info.label ? nir->info.label
+                                                        : "unnamed",
+                                        nir->info.name));
+      }
+
+      g.generate_code(v.cfg, 8);
 
-   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                     &prog_data->base, v.cfg,
-                                     final_assembly_size);
+      return g.get_assembly(final_assembly_size);
+   } else {
+      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                         nir, mem_ctx, shader_time_index, &input_vue_map);
+      if (!v.run()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+         v.dump_instructions();
+
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                        &prog_data->base, v.cfg,
+                                        final_assembly_size);
+   }
 }
 
 
-- 
2.8.0



More information about the mesa-dev mailing list