[Mesa-dev] [PATCH] i965: Split nir_emit_intrinsic by stage with a general fallback.

Thu Nov 5 21:53:42 PST 2015

Many intrinsics only apply to a particular stage (such as discard).
In other cases, we may want to interpret them differently based on
the stage (such as load_primitive_id or load_input).

The current method isn't that pretty - we handle all intrinsics in
one giant function.  Sometimes we assert on stage, sometimes we forget.
Different behaviors are handled via if-ladders based on stage.

This commit introduces new nir_emit_<stage>_intrinsic() functions,
and makes nir_emit_instr() call those.  In turn, those fall back to
the generic nir_emit_intrinsic() function for cases they don't want
to handle specially.

This makes it clear which intrinsics only exist in one stage, and makes
it easy to handle inputs/outputs differently for various stages.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h       |   8 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 950 +++++++++++++++++--------------
 2 files changed, 531 insertions(+), 427 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 2dfcab1..8a93b56 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -263,6 +263,14 @@ public:
                             nir_load_const_instr *instr);
    void nir_emit_undef(const brw::fs_builder &bld,
                        nir_ssa_undef_instr *instr);
+   void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index e7a39ff..7f033f2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -360,7 +360,22 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_intrinsic:
-      nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr));
+      switch (stage) {
+      case MESA_SHADER_VERTEX:
+         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_GEOMETRY:
+         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_FRAGMENT:
+         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_COMPUTE:
+         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      default:
+         unreachable("unsupported shader stage");
+      }
       break;
 
    case nir_instr_type_tex:
@@ -1568,220 +1583,118 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
 }
 
 void
-fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
 {
+   assert(stage == MESA_SHADER_VERTEX);
+
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_dest(instr->dest);
 
-   bool has_indirect = false;
-
    switch (instr->intrinsic) {
-   case nir_intrinsic_discard:
-   case nir_intrinsic_discard_if: {
-      /* We track our discarded pixels in f0.1.  By predicating on it, we can
-       * update just the flag bits that aren't yet discarded.  If there's no
-       * condition, we emit a CMP of g0 != g0, so all currently executing
-       * channels will get turned off.
-       */
-      fs_inst *cmp;
-      if (instr->intrinsic == nir_intrinsic_discard_if) {
-         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
-                       fs_reg(0), BRW_CONDITIONAL_Z);
-      } else {
-         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
-                                       BRW_REGISTER_TYPE_UW));
-         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
-      }
-      cmp->predicate = BRW_PREDICATE_NORMAL;
-      cmp->flag_subreg = 1;
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
 
-      if (devinfo->gen >= 6) {
-         emit_discard_jump();
-      }
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
       break;
    }
 
-   case nir_intrinsic_atomic_counter_inc:
-   case nir_intrinsic_atomic_counter_dec:
-   case nir_intrinsic_atomic_counter_read: {
-      using namespace surface_access;
-
-      /* Get the arguments of the atomic intrinsic. */
-      const fs_reg offset = get_nir_src(instr->src[0]);
-      const unsigned surface = (stage_prog_data->binding_table.abo_start +
-                                instr->const_index[0]);
-      fs_reg tmp;
-
-      /* Emit a surface read or atomic op. */
-      switch (instr->intrinsic) {
-      case nir_intrinsic_atomic_counter_read:
-         tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
-         break;
-
-      case nir_intrinsic_atomic_counter_inc:
-         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
-                                   fs_reg(), 1, 1, BRW_AOP_INC);
-         break;
-
-      case nir_intrinsic_atomic_counter_dec:
-         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
-                                   fs_reg(), 1, 1, BRW_AOP_PREDEC);
-         break;
-
-      default:
-         unreachable("Unreachable");
-      }
-
-      /* Assign the result. */
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
-
-      /* Mark the surface as used. */
-      brw_mark_surface_used(stage_prog_data, surface);
+   default:
+      nir_emit_intrinsic(bld, instr);
       break;
    }
+}
 
-   case nir_intrinsic_image_load:
-   case nir_intrinsic_image_store:
-   case nir_intrinsic_image_atomic_add:
-   case nir_intrinsic_image_atomic_min:
-   case nir_intrinsic_image_atomic_max:
-   case nir_intrinsic_image_atomic_and:
-   case nir_intrinsic_image_atomic_or:
-   case nir_intrinsic_image_atomic_xor:
-   case nir_intrinsic_image_atomic_exchange:
-   case nir_intrinsic_image_atomic_comp_swap: {
-      using namespace image_access;
-
-      /* Get the referenced image variable and type. */
-      const nir_variable *var = instr->variables[0]->var;
-      const glsl_type *type = var->type->without_array();
-      const brw_reg_type base_type = get_image_base_type(type);
-
-      /* Get some metadata from the image intrinsic. */
-      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      const unsigned arr_dims = type->sampler_array ? 1 : 0;
-      const unsigned surf_dims = type->coordinate_components() - arr_dims;
-      const mesa_format format =
-         (var->data.image.write_only ? MESA_FORMAT_NONE :
-          _mesa_get_shader_image_format(var->data.image.format));
+void
+fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
 
-      /* Get the arguments of the image intrinsic. */
-      const fs_reg image = get_nir_image_deref(instr->variables[0]);
-      const fs_reg addr = retype(get_nir_src(instr->src[0]),
-                                 BRW_REGISTER_TYPE_UD);
-      const fs_reg src0 = (info->num_srcs >= 3 ?
-                           retype(get_nir_src(instr->src[2]), base_type) :
-                           fs_reg());
-      const fs_reg src1 = (info->num_srcs >= 4 ?
-                           retype(get_nir_src(instr->src[3]), base_type) :
-                           fs_reg());
-      fs_reg tmp;
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
 
-      /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_load)
-         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      assert(stage == MESA_SHADER_GEOMETRY);
+      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
 
-      else if (instr->intrinsic == nir_intrinsic_image_store)
-         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
+   case nir_intrinsic_load_input_indirect:
+   case nir_intrinsic_load_input:
+      unreachable("load_input intrinsics are invalid for the GS stage");
 
-      else
-         tmp = emit_image_atomic(bld, image, addr, src0, src1,
-                                 surf_dims, arr_dims, info->dest_components,
-                                 get_image_atomic_op(instr->intrinsic, type));
+   case nir_intrinsic_load_per_vertex_input_indirect:
+      assert(!"Not allowed");
+   case nir_intrinsic_load_per_vertex_input:
+      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+                         instr->num_components);
+      break;
 
-      /* Assign the result. */
-      for (unsigned c = 0; c < info->dest_components; ++c)
-         bld.MOV(offset(retype(dest, base_type), bld, c),
-                 offset(tmp, bld, c));
+   case nir_intrinsic_emit_vertex_with_counter:
+      emit_gs_vertex(instr->src[0], instr->const_index[0]);
       break;
-   }
 
-   case nir_intrinsic_memory_barrier: {
-      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
-      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
-         ->regs_written = 2;
+   case nir_intrinsic_end_primitive_with_counter:
+      emit_gs_end_primitive(instr->src[0]);
       break;
-   }
 
-   case nir_intrinsic_shader_clock: {
-      /* We cannot do anything if there is an event, so ignore it for now */
-      fs_reg shader_clock = get_timestamp(bld);
-      const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
+   case nir_intrinsic_set_vertex_count:
+      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
+      break;
 
-      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
+   case nir_intrinsic_load_invocation_id: {
+      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
       break;
    }
 
-   case nir_intrinsic_image_size: {
-      /* Get the referenced image variable and type. */
-      const nir_variable *var = instr->variables[0]->var;
-      const glsl_type *type = var->type->without_array();
-
-      /* Get the size of the image. */
-      const fs_reg image = get_nir_image_deref(instr->variables[0]);
-      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-      /* For 1DArray image types, the array index is stored in the Z component.
-       * Fix this by swizzling the Z component to the Y component.
-       */
-      const bool is_1d_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
-                  type->sampler_array;
-
-      /* For CubeArray images, we should count the number of cubes instead
-       * of the number of faces. Fix it by dividing the (Z component) by 6.
-       */
-      const bool is_cube_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-                  type->sampler_array;
-
-      /* Copy all the components. */
-      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      for (unsigned c = 0; c < info->dest_components; ++c) {
-         if ((int)c >= type->coordinate_components()) {
-             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     fs_reg(1));
-         } else if (c == 1 && is_1d_array_image) {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, 2));
-         } else if (c == 2 && is_cube_array_image) {
-            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
-                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     offset(size, bld, c), fs_reg(6));
-         } else {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, c));
-         }
-       }
-
+   default:
+      nir_emit_intrinsic(bld, instr);
       break;
    }
+}
 
-   case nir_intrinsic_image_samples:
-      /* The driver does not support multi-sampled images. */
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
-      break;
+void
+fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data =
+      (struct brw_wm_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
 
+   switch (instr->intrinsic) {
    case nir_intrinsic_load_front_face:
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
               *emit_frontfacing_interpolation());
       break;
 
-   case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id()");
-
-   case nir_intrinsic_load_primitive_id:
-      assert(stage == MESA_SHADER_GEOMETRY);
-      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
-              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+   case nir_intrinsic_load_sample_pos: {
+      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+      assert(sample_pos.file != BAD_FILE);
+      dest.type = sample_pos.type;
+      bld.MOV(dest, sample_pos);
+      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
       break;
+   }
 
-   case nir_intrinsic_load_vertex_id_zero_base:
-   case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id:
-   case nir_intrinsic_load_invocation_id:
    case nir_intrinsic_load_sample_mask_in:
    case nir_intrinsic_load_sample_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
@@ -1792,19 +1705,438 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_sample_pos: {
-      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
-      assert(sample_pos.file != BAD_FILE);
-      dest.type = sample_pos.type;
-      bld.MOV(dest, sample_pos);
-      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
-      break;
-   }
-
-   case nir_intrinsic_load_uniform_indirect:
-      has_indirect = true;
-      /* fallthrough */
-   case nir_intrinsic_load_uniform: {
+   case nir_intrinsic_discard:
+   case nir_intrinsic_discard_if: {
+      /* We track our discarded pixels in f0.1.  By predicating on it, we can
+       * update just the flag bits that aren't yet discarded.  If there's no
+       * condition, we emit a CMP of g0 != g0, so all currently executing
+       * channels will get turned off.
+       */
+      fs_inst *cmp;
+      if (instr->intrinsic == nir_intrinsic_discard_if) {
+         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
+                       fs_reg(0), BRW_CONDITIONAL_Z);
+      } else {
+         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UW));
+         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
+      }
+      cmp->predicate = BRW_PREDICATE_NORMAL;
+      cmp->flag_subreg = 1;
+
+      if (devinfo->gen >= 6) {
+         emit_discard_jump();
+      }
+      break;
+   }
+
+   case nir_intrinsic_interp_var_at_centroid:
+   case nir_intrinsic_interp_var_at_sample:
+   case nir_intrinsic_interp_var_at_offset: {
+      /* Handle ARB_gpu_shader5 interpolation intrinsics
+       *
+       * It's worth a quick word of explanation as to why we handle the full
+       * variable-based interpolation intrinsic rather than a lowered version
+       * with like we do for other inputs.  We have to do that because the way
+       * we set up inputs doesn't allow us to use the already setup inputs for
+       * interpolation.  At the beginning of the shader, we go through all of
+       * the input variables and do the initial interpolation and put it in
+       * the nir_inputs array based on its location as determined in
+       * nir_lower_io.  If the input isn't used, dead code cleans up and
+       * everything works fine.  However, when we get to the ARB_gpu_shader5
+       * interpolation intrinsics, we need to reinterpolate the input
+       * differently.  If we used an intrinsic that just had an index it would
+       * only give us the offset into the nir_inputs array.  However, this is
+       * useless because that value is post-interpolation and we need
+       * pre-interpolation.  In order to get the actual location of the bits
+       * we get from the vertex fetching hardware, we need the variable.
+       */
+      wm_prog_data->pulls_bary = true;
+
+      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+      const glsl_interp_qualifier interpolation =
+         (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_interp_var_at_centroid:
+         emit_pixel_interpolater_send(bld,
+                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
+                                      dst_xy,
+                                      fs_reg(), /* src */
+                                      fs_reg(0u),
+                                      interpolation);
+         break;
+
+      case nir_intrinsic_interp_var_at_sample: {
+         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+
+         if (const_sample) {
+            unsigned msg_data = const_sample->i[0] << 4;
+
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(msg_data),
+                                         interpolation);
+         } else {
+            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+                                             BRW_REGISTER_TYPE_UD);
+
+            if (nir_src_is_dynamically_uniform(instr->src[0])) {
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               emit_pixel_interpolater_send(bld,
+                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                            dst_xy,
+                                            fs_reg(), /* src */
+                                            msg_data,
+                                            interpolation);
+            } else {
+               /* Make a loop that sends a message to the pixel interpolater
+                * for the sample number in each live channel. If there are
+                * multiple channels with the same sample number then these
+                * will be handled simultaneously with a single interation of
+                * the loop.
+                */
+               bld.emit(BRW_OPCODE_DO);
+
+               /* Get the next live sample number into sample_id_reg */
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+               /* Set the flag register so that we can perform the send
+                * message on all channels that have the same sample number
+                */
+               bld.CMP(bld.null_reg_ud(),
+                       sample_src, sample_id,
+                       BRW_CONDITIONAL_EQ);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               fs_inst *inst =
+                  emit_pixel_interpolater_send(bld,
+                                               FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                               dst_xy,
+                                               fs_reg(), /* src */
+                                               msg_data,
+                                               interpolation);
+               set_predicate(BRW_PREDICATE_NORMAL, inst);
+
+               /* Continue the loop if there are any live channels left */
+               set_predicate_inv(BRW_PREDICATE_NORMAL,
+                                 true, /* inverse */
+                                 bld.emit(BRW_OPCODE_WHILE));
+            }
+         }
+
+         break;
+      }
+
+      case nir_intrinsic_interp_var_at_offset: {
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+         if (const_offset) {
+            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
+            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
+
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(off_x | (off_y << 4)),
+                                         interpolation);
+         } else {
+            fs_reg src = vgrf(glsl_type::ivec2_type);
+            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
+                                       BRW_REGISTER_TYPE_F);
+            for (int i = 0; i < 2; i++) {
+               fs_reg temp = vgrf(glsl_type::float_type);
+               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
+               fs_reg itemp = vgrf(glsl_type::int_type);
+               bld.MOV(itemp, temp);  /* float to int */
+
+               /* Clamp the upper end of the range to +7/16.
+                * ARB_gpu_shader5 requires that we support a maximum offset
+                * of +0.5, which isn't representable in a S0.4 value -- if
+                * we didn't clamp it, we'd end up with -8/16, which is the
+                * opposite of what the shader author wanted.
+                *
+                * This is legal due to ARB_gpu_shader5's quantization
+                * rules:
+                *
+                * "Not all values of <offset> may be supported; x and y
+                * offsets may be rounded to fixed-point values with the
+                * number of fraction bits given by the
+                * implementation-dependent constant
+                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
+                */
+               set_condmod(BRW_CONDITIONAL_L,
+                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
+            }
+
+            const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+            emit_pixel_interpolater_send(bld,
+                                         opcode,
+                                         dst_xy,
+                                         src,
+                                         fs_reg(0u),
+                                         interpolation);
+         }
+         break;
+      }
+
+      default:
+         unreachable("Invalid intrinsic");
+      }
+
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
+         src.type = dest.type;
+
+         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+   struct brw_cs_prog_data *cs_prog_data =
+      (struct brw_cs_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_barrier:
+      emit_barrier();
+      cs_prog_data->uses_barrier = true;
+      break;
+
+   case nir_intrinsic_load_local_invocation_id:
+   case nir_intrinsic_load_work_group_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      for (unsigned i = 0; i < 3; i++)
+         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
+      break;
+   }
+
+   case nir_intrinsic_load_num_work_groups: {
+      const unsigned surface =
+         cs_prog_data->binding_table.work_groups_start;
+
+      cs_prog_data->uses_num_work_groups = true;
+
+      fs_reg surf_index = fs_reg(surface);
+      brw_mark_surface_used(prog_data, surface);
+
+      /* Read the 3 GLuint components of gl_NumWorkGroups */
+      for (unsigned i = 0; i < 3; i++) {
+         fs_reg read_result =
+            emit_untyped_read(bld, surf_index,
+                              fs_reg(i << 2),
+                              1 /* dims */, 1 /* size */,
+                              BRW_PREDICATE_NONE);
+         read_result.type = dest.type;
+         bld.MOV(dest, read_result);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+{
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   bool has_indirect = false;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_dec:
+   case nir_intrinsic_atomic_counter_read: {
+      using namespace surface_access;
+
+      /* Get the arguments of the atomic intrinsic. */
+      const fs_reg offset = get_nir_src(instr->src[0]);
+      const unsigned surface = (stage_prog_data->binding_table.abo_start +
+                                instr->const_index[0]);
+      fs_reg tmp;
+
+      /* Emit a surface read or atomic op. */
+      switch (instr->intrinsic) {
+      case nir_intrinsic_atomic_counter_read:
+         tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
+         break;
+
+      case nir_intrinsic_atomic_counter_inc:
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_INC);
+         break;
+
+      case nir_intrinsic_atomic_counter_dec:
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_PREDEC);
+         break;
+
+      default:
+         unreachable("Unreachable");
+      }
+
+      /* Assign the result. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
+
+      /* Mark the surface as used. */
+      brw_mark_surface_used(stage_prog_data, surface);
+      break;
+   }
+
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_min:
+   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap: {
+      using namespace image_access;
+
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+      const brw_reg_type base_type = get_image_base_type(type);
+
+      /* Get some metadata from the image intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      const unsigned arr_dims = type->sampler_array ? 1 : 0;
+      const unsigned surf_dims = type->coordinate_components() - arr_dims;
+      const mesa_format format =
+         (var->data.image.write_only ? MESA_FORMAT_NONE :
+          _mesa_get_shader_image_format(var->data.image.format));
+
+      /* Get the arguments of the image intrinsic. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg addr = retype(get_nir_src(instr->src[0]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg src0 = (info->num_srcs >= 3 ?
+                           retype(get_nir_src(instr->src[2]), base_type) :
+                           fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 4 ?
+                           retype(get_nir_src(instr->src[3]), base_type) :
+                           fs_reg());
+      fs_reg tmp;
+
+      /* Emit an image load, store or atomic op. */
+      if (instr->intrinsic == nir_intrinsic_image_load)
+         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+
+      else if (instr->intrinsic == nir_intrinsic_image_store)
+         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
+
+      else
+         tmp = emit_image_atomic(bld, image, addr, src0, src1,
+                                 surf_dims, arr_dims, info->dest_components,
+                                 get_image_atomic_op(instr->intrinsic, type));
+
+      /* Assign the result. */
+      for (unsigned c = 0; c < info->dest_components; ++c)
+         bld.MOV(offset(retype(dest, base_type), bld, c),
+                 offset(tmp, bld, c));
+      break;
+   }
+
+   case nir_intrinsic_memory_barrier: {
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
+      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+         ->regs_written = 2;
+      break;
+   }
+
+   case nir_intrinsic_shader_clock: {
+      /* We cannot do anything if there is an event, so ignore it for now */
+      fs_reg shader_clock = get_timestamp(bld);
+      const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
+
+      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
+      break;
+   }
+
+   case nir_intrinsic_image_size: {
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+
+      /* Get the size of the image. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+      /* For 1DArray image types, the array index is stored in the Z component.
+       * Fix this by swizzling the Z component to the Y component.
+       */
+      const bool is_1d_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
+                  type->sampler_array;
+
+      /* For CubeArray images, we should count the number of cubes instead
+       * of the number of faces. Fix it by dividing the (Z component) by 6.
+       */
+      const bool is_cube_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+                  type->sampler_array;
+
+      /* Copy all the components. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      for (unsigned c = 0; c < info->dest_components; ++c) {
+         if ((int)c >= type->coordinate_components()) {
+             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     fs_reg(1));
+         } else if (c == 1 && is_1d_array_image) {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, 2));
+         } else if (c == 2 && is_cube_array_image) {
+            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
+                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     offset(size, bld, c), fs_reg(6));
+         } else {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, c));
+         }
+       }
+
+      break;
+   }
+
+   case nir_intrinsic_image_samples:
+      /* The driver does not support multi-sampled images. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
+      break;
+
+   case nir_intrinsic_load_uniform_indirect:
+      has_indirect = true;
+      /* fallthrough */
+   case nir_intrinsic_load_uniform: {
       fs_reg uniform_reg(UNIFORM, instr->const_index[0]);
       uniform_reg.reg_offset = instr->const_index[1];
 
@@ -1952,185 +2284,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_per_vertex_input_indirect:
-      assert(!"Not allowed");
-      /* fallthrough */
-   case nir_intrinsic_load_per_vertex_input:
-      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
-                         instr->num_components);
-      break;
-
-   /* Handle ARB_gpu_shader5 interpolation intrinsics
-    *
-    * It's worth a quick word of explanation as to why we handle the full
-    * variable-based interpolation intrinsic rather than a lowered version
-    * with like we do for other inputs.  We have to do that because the way
-    * we set up inputs doesn't allow us to use the already setup inputs for
-    * interpolation.  At the beginning of the shader, we go through all of
-    * the input variables and do the initial interpolation and put it in
-    * the nir_inputs array based on its location as determined in
-    * nir_lower_io.  If the input isn't used, dead code cleans up and
-    * everything works fine.  However, when we get to the ARB_gpu_shader5
-    * interpolation intrinsics, we need to reinterpolate the input
-    * differently.  If we used an intrinsic that just had an index it would
-    * only give us the offset into the nir_inputs array.  However, this is
-    * useless because that value is post-interpolation and we need
-    * pre-interpolation.  In order to get the actual location of the bits
-    * we get from the vertex fetching hardware, we need the variable.
-    */
-   case nir_intrinsic_interp_var_at_centroid:
-   case nir_intrinsic_interp_var_at_sample:
-   case nir_intrinsic_interp_var_at_offset: {
-      assert(stage == MESA_SHADER_FRAGMENT);
-
-      ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
-
-      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
-      const glsl_interp_qualifier interpolation =
-         (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
-
-      switch (instr->intrinsic) {
-      case nir_intrinsic_interp_var_at_centroid:
-         emit_pixel_interpolater_send(bld,
-                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
-                                      dst_xy,
-                                      fs_reg(), /* src */
-                                      fs_reg(0u),
-                                      interpolation);
-         break;
-
-      case nir_intrinsic_interp_var_at_sample: {
-         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
-
-         if (const_sample) {
-            unsigned msg_data = const_sample->i[0] << 4;
-
-            emit_pixel_interpolater_send(bld,
-                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                         dst_xy,
-                                         fs_reg(), /* src */
-                                         fs_reg(msg_data),
-                                         interpolation);
-         } else {
-            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
-                                             BRW_REGISTER_TYPE_UD);
-
-            if (nir_src_is_dynamically_uniform(instr->src[0])) {
-               const fs_reg sample_id = bld.emit_uniformize(sample_src);
-               const fs_reg msg_data = vgrf(glsl_type::uint_type);
-               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
-               emit_pixel_interpolater_send(bld,
-                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                            dst_xy,
-                                            fs_reg(), /* src */
-                                            msg_data,
-                                            interpolation);
-            } else {
-               /* Make a loop that sends a message to the pixel interpolater
-                * for the sample number in each live channel. If there are
-                * multiple channels with the same sample number then these
-                * will be handled simultaneously with a single interation of
-                * the loop.
-                */
-               bld.emit(BRW_OPCODE_DO);
-
-               /* Get the next live sample number into sample_id_reg */
-               const fs_reg sample_id = bld.emit_uniformize(sample_src);
-
-               /* Set the flag register so that we can perform the send
-                * message on all channels that have the same sample number
-                */
-               bld.CMP(bld.null_reg_ud(),
-                       sample_src, sample_id,
-                       BRW_CONDITIONAL_EQ);
-               const fs_reg msg_data = vgrf(glsl_type::uint_type);
-               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
-               fs_inst *inst =
-                  emit_pixel_interpolater_send(bld,
-                                               FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                               dst_xy,
-                                               fs_reg(), /* src */
-                                               msg_data,
-                                               interpolation);
-               set_predicate(BRW_PREDICATE_NORMAL, inst);
-
-               /* Continue the loop if there are any live channels left */
-               set_predicate_inv(BRW_PREDICATE_NORMAL,
-                                 true, /* inverse */
-                                 bld.emit(BRW_OPCODE_WHILE));
-            }
-         }
-
-         break;
-      }
-
-      case nir_intrinsic_interp_var_at_offset: {
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-
-         if (const_offset) {
-            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
-            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
-
-            emit_pixel_interpolater_send(bld,
-                                         FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
-                                         dst_xy,
-                                         fs_reg(), /* src */
-                                         fs_reg(off_x | (off_y << 4)),
-                                         interpolation);
-         } else {
-            fs_reg src = vgrf(glsl_type::ivec2_type);
-            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
-                                       BRW_REGISTER_TYPE_F);
-            for (int i = 0; i < 2; i++) {
-               fs_reg temp = vgrf(glsl_type::float_type);
-               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
-               fs_reg itemp = vgrf(glsl_type::int_type);
-               bld.MOV(itemp, temp);  /* float to int */
-
-               /* Clamp the upper end of the range to +7/16.
-                * ARB_gpu_shader5 requires that we support a maximum offset
-                * of +0.5, which isn't representable in a S0.4 value -- if
-                * we didn't clamp it, we'd end up with -8/16, which is the
-                * opposite of what the shader author wanted.
-                *
-                * This is legal due to ARB_gpu_shader5's quantization
-                * rules:
-                *
-                * "Not all values of <offset> may be supported; x and y
-                * offsets may be rounded to fixed-point values with the
-                * number of fraction bits given by the
-                * implementation-dependent constant
-                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
-                */
-               set_condmod(BRW_CONDITIONAL_L,
-                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
-            }
-
-            const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
-            emit_pixel_interpolater_send(bld,
-                                         opcode,
-                                         dst_xy,
-                                         src,
-                                         fs_reg(0u),
-                                         interpolation);
-         }
-         break;
-      }
-
-      default:
-         unreachable("Invalid intrinsic");
-      }
-
-      for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
-         src.type = dest.type;
-
-         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
-         dest = offset(dest, bld, 1);
-      }
-      break;
-   }
-
    case nir_intrinsic_store_ssbo_indirect:
       has_indirect = true;
       /* fallthrough */
@@ -2212,23 +2365,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_barrier:
-      emit_barrier();
-      if (stage == MESA_SHADER_COMPUTE)
-         ((struct brw_cs_prog_data *) prog_data)->uses_barrier = true;
-      break;
-
-   case nir_intrinsic_load_local_invocation_id:
-   case nir_intrinsic_load_work_group_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      fs_reg val = nir_system_values[sv];
-      assert(val.file != BAD_FILE);
-      dest.type = val.type;
-      for (unsigned i = 0; i < 3; i++)
-         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
-      break;
-   }
-
    case nir_intrinsic_ssbo_atomic_add:
       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
       break;
@@ -2282,46 +2418,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_num_work_groups: {
-      assert(devinfo->gen >= 7);
-      assert(stage == MESA_SHADER_COMPUTE);
-
-      struct brw_cs_prog_data *cs_prog_data =
-         (struct brw_cs_prog_data *) prog_data;
-      const unsigned surface =
-         cs_prog_data->binding_table.work_groups_start;
-
-      cs_prog_data->uses_num_work_groups = true;
-
-      fs_reg surf_index = fs_reg(surface);
-      brw_mark_surface_used(prog_data, surface);
-
-      /* Read the 3 GLuint components of gl_NumWorkGroups */
-      for (unsigned i = 0; i < 3; i++) {
-         fs_reg read_result =
-            emit_untyped_read(bld, surf_index,
-                              fs_reg(i << 2),
-                              1 /* dims */, 1 /* size */,
-                              BRW_PREDICATE_NONE);
-         read_result.type = dest.type;
-         bld.MOV(dest, read_result);
-         dest = offset(dest, bld, 1);
-      }
-      break;
-   }
-
-   case nir_intrinsic_emit_vertex_with_counter:
-      emit_gs_vertex(instr->src[0], instr->const_index[0]);
-      break;
-
-   case nir_intrinsic_end_primitive_with_counter:
-      emit_gs_end_primitive(instr->src[0]);
-      break;
-
-   case nir_intrinsic_set_vertex_count:
-      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
-      break;
-
    default:
       unreachable("unknown intrinsic");
    }
-- 
2.6.2