[Mesa-dev] [PATCH 14/22] intel/compiler: Do image load/store lowering to NIR

Fri Aug 17 20:06:20 UTC 2018

This commit moves our storage image format conversion codegen into NIR
instead of doing it in the back-end.  This has the advantage of letting
us run it through NIR's optimizer, which is pretty effective at shrinking
things down.  In the common case of rgba8, the number of instructions
emitted after NIR is done with it is half of what it was with the
lowering happening in the back-end.  On the downside, the back-end's
lowering is able to use predicates directly, whereas the NIR lowering
has to use IFs.

Shader-db results on Kaby Lake:

    total instructions in shared programs: 15166910 -> 15166872 (<.01%)
    instructions in affected programs: 5895 -> 5857 (-0.64%)
    helped: 15
    HURT: 0

Clearly, we don't have much image load/store happening in the shaders
in shader-db...
---
 src/compiler/nir/nir_intrinsics.py            |    9 +
 src/intel/Makefile.sources                    |    1 +
 src/intel/compiler/brw_fs_nir.cpp             |  128 +-
 src/intel/compiler/brw_fs_surface_builder.cpp | 1030 -----------------
 src/intel/compiler/brw_fs_surface_builder.h   |   20 -
 src/intel/compiler/brw_nir.h                  |    3 +
 .../compiler/brw_nir_lower_image_load_store.c |  824 +++++++++++++
 src/intel/compiler/meson.build                |    1 +
 src/intel/vulkan/anv_pipeline.c               |    2 +
 src/mesa/drivers/dri/i965/brw_program.c       |    2 +
 10 files changed, 899 insertions(+), 1121 deletions(-)
 create mode 100644 src/intel/compiler/brw_nir_lower_image_load_store.c

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 63c602c8874..45872e00c55 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -310,6 +310,15 @@ intrinsic("image_deref_atomic_comp_swap", src_comp=[1, 4, 1, 1, 1], dest_comp=1)
 intrinsic("image_deref_size",    src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER])
 intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
 
+# Intel-specific query for loading from the brw_image_param struct passed
+# into the shader as a uniform.  The source is a deref of the image variable.
+# The const index (BASE) specifies which of the six parameters to load.
+intrinsic("image_deref_load_param_intel", src_comp=[1], dest_comp=0,
+          indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_load_raw_intel", src_comp=[1, 1], dest_comp=0,
+          flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_store_raw_intel", src_comp=[1, 1, 0])
+
 # Vulkan descriptor set intrinsics
 #
 # The Vulkan API uses a different binding model from GL.  In the Vulkan
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 5f6cd96825b..d10c4511734 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -84,6 +84,7 @@ COMPILER_FILES = \
 	compiler/brw_nir_analyze_ubo_ranges.c \
 	compiler/brw_nir_attribute_workarounds.c \
 	compiler/brw_nir_lower_cs_intrinsics.c \
+	compiler/brw_nir_lower_image_load_store.c \
 	compiler/brw_nir_opt_peephole_ffma.c \
 	compiler/brw_nir_tcs_workarounds.c \
 	compiler/brw_packed_float.c \
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6e9a5829d3b..021a31d069c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3871,58 +3871,89 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_image_deref_atomic_xor:
    case nir_intrinsic_image_deref_atomic_exchange:
    case nir_intrinsic_image_deref_atomic_comp_swap: {
-      using namespace image_access;
-
       if (stage == MESA_SHADER_FRAGMENT &&
           instr->intrinsic != nir_intrinsic_image_deref_load)
          brw_wm_prog_data(prog_data)->has_side_effects = true;
 
       /* Get the referenced image variable and type. */
       nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-      const nir_variable *var = nir_deref_instr_get_variable(deref);
-      const glsl_type *type = var->type->without_array();
-      const brw_reg_type base_type = get_image_base_type(type);
+      const glsl_type *type = deref->type;
 
       /* Get some metadata from the image intrinsic. */
       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      const unsigned arr_dims = type->sampler_array ? 1 : 0;
-      const unsigned surf_dims = type->coordinate_components() - arr_dims;
-      const unsigned format = var->data.image.format;
+      const unsigned dims = type->coordinate_components();
       const unsigned dest_components = nir_intrinsic_dest_components(instr);
 
       /* Get the arguments of the image intrinsic. */
       const fs_reg image = get_nir_image_deref(deref);
-      const fs_reg addr = retype(get_nir_src(instr->src[1]),
-                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg coords = retype(get_nir_src(instr->src[1]),
+                                   BRW_REGISTER_TYPE_UD);
       const fs_reg src0 = (info->num_srcs >= 4 ?
-                           retype(get_nir_src(instr->src[3]), base_type) :
-                           fs_reg());
+                           get_nir_src(instr->src[3]) : fs_reg());
       const fs_reg src1 = (info->num_srcs >= 5 ?
-                           retype(get_nir_src(instr->src[4]), base_type) :
-                           fs_reg());
+                           get_nir_src(instr->src[4]) : fs_reg());
       fs_reg tmp;
 
       /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_deref_load)
-         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
-
-      else if (instr->intrinsic == nir_intrinsic_image_deref_store)
-         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
-                          var->data.image.write_only ? GL_NONE : format);
-
-      else
-         tmp = emit_image_atomic(bld, image, addr, src0, src1,
-                                 surf_dims, arr_dims, dest_components,
+      if (instr->intrinsic == nir_intrinsic_image_deref_load) {
+         tmp = emit_typed_read(bld, image, coords, dims,
+                               instr->num_components);
+      } else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
+         emit_typed_write(bld, image, coords, src0, dims,
+                          instr->num_components);
+      } else {
+         tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1,
                                  get_image_atomic_op(instr->intrinsic, type));
+      }
 
       /* Assign the result. */
       for (unsigned c = 0; c < dest_components; ++c) {
-         bld.MOV(offset(retype(dest, base_type), bld, c),
-               offset(tmp, bld, c));
+         bld.MOV(offset(retype(dest, tmp.type), bld, c),
+                 offset(tmp, bld, c));
+      }
+      break;
+   }
+
+   case nir_intrinsic_image_deref_load_param_intel: {
+      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+      const fs_reg image = get_nir_image_deref(deref);
+      const fs_reg param = offset(image, bld, nir_intrinsic_base(instr) * 4);
+      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
+         bld.MOV(offset(retype(dest, param.type), bld, c),
+                 offset(param, bld, c));
+      }
+      break;
+   }
+
+   case nir_intrinsic_image_deref_load_raw_intel: {
+      const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+      const fs_reg addr = retype(get_nir_src(instr->src[1]),
+                                 BRW_REGISTER_TYPE_UD);
+
+      fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
+                                     instr->num_components);
+
+      for (unsigned c = 0; c < instr->num_components; ++c) {
+         bld.MOV(offset(retype(dest, tmp.type), bld, c),
+                 offset(tmp, bld, c));
       }
       break;
    }
 
+   case nir_intrinsic_image_deref_store_raw_intel: {
+      const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+      const fs_reg addr = retype(get_nir_src(instr->src[1]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg data = retype(get_nir_src(instr->src[2]),
+                                 BRW_REGISTER_TYPE_UD);
+
+      brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      emit_untyped_write(bld, image, addr, data, 1,
+                         instr->num_components);
+      break;
+   }
+
    case nir_intrinsic_group_memory_barrier:
    case nir_intrinsic_memory_barrier_shared:
    case nir_intrinsic_memory_barrier_atomic_counter:
@@ -3945,51 +3976,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_image_deref_size: {
-      /* Get the referenced image variable and type. */
-      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-      const nir_variable *var = nir_deref_instr_get_variable(deref);
-      const glsl_type *type = var->type->without_array();
-
-      /* Get the size of the image. */
-      const fs_reg image = get_nir_image_deref(deref);
-      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-      /* For 1DArray image types, the array index is stored in the Z component.
-       * Fix this by swizzling the Z component to the Y component.
-       */
-      const bool is_1d_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
-                  type->sampler_array;
-
-      /* For CubeArray images, we should count the number of cubes instead
-       * of the number of faces. Fix it by dividing the (Z component) by 6.
-       */
-      const bool is_cube_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-                  type->sampler_array;
-
-      /* Copy all the components. */
-      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
-         if ((int)c >= type->coordinate_components()) {
-             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     brw_imm_d(1));
-         } else if (c == 1 && is_1d_array_image) {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, 2));
-         } else if (c == 2 && is_cube_array_image) {
-            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
-                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     offset(size, bld, c), brw_imm_d(6));
-         } else {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, c));
-         }
-       }
-
-      break;
-   }
-
    case nir_intrinsic_image_deref_samples:
       /* The driver does not support multi-sampled images. */
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp
index c346ef9e701..65322a9410f 100644
--- a/src/intel/compiler/brw_fs_surface_builder.cpp
+++ b/src/intel/compiler/brw_fs_surface_builder.cpp
@@ -182,1033 +182,3 @@ namespace brw {
       }
    }
 }
-
-namespace {
-   namespace image_format_info {
-      /* The higher compiler layers use the GL enums for image formats even if
-       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
-       * enum before we can use them.
-       */
-      static enum isl_format
-      isl_format_for_gl_format(uint32_t gl_format)
-      {
-         switch (gl_format) {
-         case GL_R8:             return ISL_FORMAT_R8_UNORM;
-         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
-         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
-         case GL_R8I:            return ISL_FORMAT_R8_SINT;
-         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
-         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
-         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
-         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
-         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
-         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
-         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
-         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
-         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
-         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
-         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
-         case GL_R16:            return ISL_FORMAT_R16_UNORM;
-         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
-         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
-         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
-         case GL_R16I:           return ISL_FORMAT_R16_SINT;
-         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
-         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
-         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
-         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
-         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
-         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
-         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
-         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
-         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
-         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
-         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
-         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
-         case GL_R32I:           return ISL_FORMAT_R32_SINT;
-         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
-         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
-         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
-         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
-         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
-         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
-         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
-         default:
-            assert(!"Invalid image format");
-            return ISL_FORMAT_UNSUPPORTED;
-         }
-      }
-
-      /**
-       * Simple 4-tuple of scalars used to pass around per-color component
-       * values.
-       */
-      struct color_u {
-         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
-         {
-         }
-
-         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
-            r(r), g(g), b(b), a(a)
-         {
-         }
-
-         unsigned
-         operator[](unsigned i) const
-         {
-            const unsigned xs[] = { r, g, b, a };
-            return xs[i];
-         }
-
-         unsigned r, g, b, a;
-      };
-
-      /**
-       * Return the per-channel bitfield widths for a given image format.
-       */
-      inline color_u
-      get_bit_widths(isl_format format)
-      {
-         const isl_format_layout *fmtl = isl_format_get_layout(format);
-
-         return color_u(fmtl->channels.r.bits,
-                        fmtl->channels.g.bits,
-                        fmtl->channels.b.bits,
-                        fmtl->channels.a.bits);
-      }
-
-      /**
-       * Return the per-channel bitfield shifts for a given image format.
-       */
-      inline color_u
-      get_bit_shifts(isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         return color_u(0, widths.r, widths.r + widths.g,
-                        widths.r + widths.g + widths.b);
-      }
-
-      /**
-       * Return true if all present components have the same bit width.
-       */
-      inline bool
-      is_homogeneous(isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         return ((widths.g == 0 || widths.g == widths.r) &&
-                 (widths.b == 0 || widths.b == widths.r) &&
-                 (widths.a == 0 || widths.a == widths.r));
-      }
-
-      /**
-       * Return true if the format conversion boils down to a trivial copy.
-       */
-      inline bool
-      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
-      {
-         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
-                 format == isl_lower_storage_image_format(devinfo, format);
-      }
-
-      /**
-       * Return true if the hardware natively supports some format with
-       * compatible bitfield layout, but possibly different data types.
-       */
-      inline bool
-      has_supported_bit_layout(const gen_device_info *devinfo,
-                               isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         const color_u lower_widths = get_bit_widths(
-            isl_lower_storage_image_format(devinfo, format));
-
-         return (widths.r == lower_widths.r &&
-                 widths.g == lower_widths.g &&
-                 widths.b == lower_widths.b &&
-                 widths.a == lower_widths.a);
-      }
-
-      /**
-       * Return true if we are required to spread individual components over
-       * several components of the format used by the hardware (RG32 and
-       * friends implemented as RGBA16UI).
-       */
-      inline bool
-      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
-      {
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-
-         return (isl_format_get_num_channels(format) <
-                 isl_format_get_num_channels(lower_format));
-      }
-
-      /**
-       * Return true if the hardware returns garbage in the unused high bits
-       * of each component.  This may happen on IVB because we rely on the
-       * undocumented behavior that typed reads from surfaces of the
-       * unsupported R8 and R16 formats return useful data in their least
-       * significant bits.
-       */
-      inline bool
-      has_undefined_high_bits(const gen_device_info *devinfo,
-                              isl_format format)
-      {
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-
-         return (devinfo->gen == 7 && !devinfo->is_haswell &&
-                 (lower_format == ISL_FORMAT_R16_UINT ||
-                  lower_format == ISL_FORMAT_R8_UINT));
-      }
-
-      /**
-       * Return true if the format represents values as signed integers
-       * requiring sign extension when unpacking.
-       */
-      inline bool
-      needs_sign_extension(isl_format format)
-      {
-         return isl_format_has_snorm_channel(format) ||
-                isl_format_has_sint_channel(format);
-      }
-   }
-
-   namespace image_validity {
-      /**
-       * Check whether the bound image is suitable for untyped access.
-       */
-      static brw_predicate
-      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
-                               brw_predicate pred)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
-
-         if (devinfo->gen == 7 && !devinfo->is_haswell) {
-            /* Check whether the first stride component (i.e. the Bpp value)
-             * is greater than four, what on Gen7 indicates that a surface of
-             * type RAW has been bound for untyped access.  Reading or writing
-             * to a surface of type other than RAW using untyped surface
-             * messages causes a hang on IVB and VLV.
-             */
-            set_predicate(pred,
-                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
-                                  BRW_CONDITIONAL_G));
-
-            return BRW_PREDICATE_NORMAL;
-         } else {
-            /* More recent generations handle the format mismatch
-             * gracefully.
-             */
-            return pred;
-         }
-      }
-
-      /**
-       * Check whether there is an image bound at the given index and write
-       * the comparison result to f0.0.  Returns an appropriate predication
-       * mode to use on subsequent image operations.
-       */
-      static brw_predicate
-      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-         if (devinfo->gen == 7 && !devinfo->is_haswell) {
-            /* Check the first component of the size field to find out if the
-             * image is bound.  Necessary on IVB for typed atomics because
-             * they don't seem to respect null surfaces and will happily
-             * corrupt or read random memory when no image is bound.
-             */
-            bld.CMP(bld.null_reg_ud(),
-                    retype(size, BRW_REGISTER_TYPE_UD),
-                    brw_imm_d(0), BRW_CONDITIONAL_NZ);
-
-            return BRW_PREDICATE_NORMAL;
-         } else {
-            /* More recent platforms implement compliant behavior when a null
-             * surface is bound.
-             */
-            return BRW_PREDICATE_NONE;
-         }
-      }
-
-      /**
-       * Check whether the provided coordinates are within the image bounds
-       * and write the comparison result to f0.0.  Returns an appropriate
-       * predication mode to use on subsequent image operations.
-       */
-      static brw_predicate
-      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
-                        const fs_reg &addr, unsigned dims)
-      {
-         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-         for (unsigned c = 0; c < dims; ++c)
-            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
-                          bld.CMP(bld.null_reg_ud(),
-                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
-                                  offset(size, bld, c),
-                                  BRW_CONDITIONAL_L));
-
-         return BRW_PREDICATE_NORMAL;
-      }
-   }
-
-   namespace image_coordinates {
-      /**
-       * Return the total number of coordinates needed to address a texel of
-       * the surface, which may be more than the sum of \p surf_dims and \p
-       * arr_dims if padding is required.
-       */
-      static unsigned
-      num_image_coordinates(const fs_builder &bld,
-                            unsigned surf_dims, unsigned arr_dims,
-                            isl_format format)
-      {
-         /* HSW in vec4 mode and our software coordinate handling for untyped
-          * reads want the array index to be at the Z component.
-          */
-         const bool array_index_at_z =
-            format != ISL_FORMAT_UNSUPPORTED &&
-            !isl_has_matching_typed_storage_image_format(
-               bld.shader->devinfo, format);
-         const unsigned zero_dims =
-            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
-
-         return surf_dims + zero_dims + arr_dims;
-      }
-
-      /**
-       * Transform image coordinates into the form expected by the
-       * implementation.
-       */
-      static fs_reg
-      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
-                             unsigned surf_dims, unsigned arr_dims,
-                             isl_format format)
-      {
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (dims > surf_dims + arr_dims) {
-            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
-            /* The array index is required to be passed in as the Z component,
-             * insert a zero at the Y component to shift it to the right
-             * position.
-             *
-             * FINISHME: Factor out this frequently recurring pattern into a
-             * helper function.
-             */
-            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
-            const fs_reg dst = bld.vgrf(addr.type, dims);
-            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
-            return dst;
-         } else {
-            return addr;
-         }
-      }
-
-      /**
-       * Calculate the offset in memory of the texel given by \p coord.
-       *
-       * This is meant to be used with untyped surface messages to access a
-       * tiled surface, what involves taking into account the tiling and
-       * swizzling modes of the surface manually so it will hopefully not
-       * happen very often.
-       *
-       * The tiling algorithm implemented here matches either the X or Y
-       * tiling layouts supported by the hardware depending on the tiling
-       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
-       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
-       * explanation of the hardware tiling format.
-       */
-      static fs_reg
-      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
-                               const fs_reg &coord, unsigned dims)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
-         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
-         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
-         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
-         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-         /* Shift the coordinates by the fixed surface offset.  It may be
-          * non-zero if the image is a single slice of a higher-dimensional
-          * surface, or if a non-zero mipmap level of the surface is bound to
-          * the pipeline.  The offset needs to be applied here rather than at
-          * surface state set-up time because the desired slice-level may
-          * start mid-tile, so simply shifting the surface base address
-          * wouldn't give a well-formed tiled surface in the general case.
-          */
-         for (unsigned c = 0; c < 2; ++c)
-            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
-                    (c < dims ?
-                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
-                     fs_reg(brw_imm_d(0))));
-
-         /* The layout of 3-D textures in memory is sort-of like a tiling
-          * format.  At each miplevel, the slices are arranged in rows of
-          * 2^level slices per row.  The slice row is stored in tmp.y and
-          * the slice within the row is stored in tmp.x.
-          *
-          * The layout of 2-D array textures and cubemaps is much simpler:
-          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
-          * stored in memory as an array of slices, each one being a 2-D
-          * arrangement of miplevels, or as a 2D arrangement of miplevels,
-          * each one being an array of slices.  In either case the separation
-          * between slices of the same LOD is equal to the qpitch value
-          * provided as stride.w.
-          *
-          * This code can be made to handle either 2D arrays and 3D textures
-          * by passing in the miplevel as tile.z for 3-D textures and 0 in
-          * tile.z for 2-D array textures.
-          *
-          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
-          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
-          * of the hardware 3D texture and 2D array layouts.
-          */
-         if (dims > 2) {
-            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
-             * index.
-             */
-            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
-                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
-            bld.SHR(offset(tmp, bld, 1),
-                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
-                    offset(tile, bld, 2));
-
-            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
-             * slice offset.
-             */
-            for (unsigned c = 0; c < 2; ++c) {
-               bld.MUL(offset(tmp, bld, c),
-                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
-               bld.ADD(offset(addr, bld, c),
-                       offset(addr, bld, c), offset(tmp, bld, c));
-            }
-         }
-
-         if (dims > 1) {
-            /* Calculate the major/minor x and y indices.  In order to
-             * accommodate both X and Y tiling, the Y-major tiling format is
-             * treated as being a bunch of narrow X-tiles placed next to each
-             * other.  This means that the tile width for Y-tiling is actually
-             * the width of one sub-column of the Y-major tile where each 4K
-             * tile has 8 512B sub-columns.
-             *
-             * The major Y value is the row of tiles in which the pixel lives.
-             * The major X value is the tile sub-column in which the pixel
-             * lives; for X tiling, this is the same as the tile column, for Y
-             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
-             * are the position within the sub-column.
-             */
-            for (unsigned c = 0; c < 2; ++c) {
-               /* Calculate the minor x and y indices. */
-               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
-                       brw_imm_d(0), offset(addr, bld, c));
-
-               /* Calculate the major x and y indices. */
-               bld.SHR(offset(major, bld, c),
-                       offset(addr, bld, c), offset(tile, bld, c));
-            }
-
-            /* Calculate the texel index from the start of the tile row and
-             * the vertical coordinate of the row.
-             * Equivalent to:
-             *   tmp.x = (major.x << tile.y << tile.x) +
-             *           (minor.y << tile.x) + minor.x
-             *   tmp.y = major.y << tile.y
-             */
-            bld.SHL(tmp, major, offset(tile, bld, 1));
-            bld.ADD(tmp, tmp, offset(minor, bld, 1));
-            bld.SHL(tmp, tmp, offset(tile, bld, 0));
-            bld.ADD(tmp, tmp, minor);
-            bld.SHL(offset(tmp, bld, 1),
-                    offset(major, bld, 1), offset(tile, bld, 1));
-
-            /* Add it to the start of the tile row. */
-            bld.MUL(offset(tmp, bld, 1),
-                    offset(tmp, bld, 1), offset(stride, bld, 1));
-            bld.ADD(tmp, tmp, offset(tmp, bld, 1));
-
-            /* Multiply by the Bpp value. */
-            bld.MUL(dst, tmp, stride);
-
-            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
-               /* Take into account the two dynamically specified shifts.
-                * Both need are used to implement swizzling of X-tiled
-                * surfaces.  For Y-tiled surfaces only one bit needs to be
-                * XOR-ed with bit 6 of the memory address, so a swz value of
-                * 0xff (actually interpreted as 31 by the hardware) will be
-                * provided to cause the relevant bit of tmp.y to be zero and
-                * turn the first XOR into the identity.  For linear surfaces
-                * or platforms lacking address swizzling both shifts will be
-                * 0xff causing the relevant bits of both tmp.x and .y to be
-                * zero, what effectively disables swizzling.
-                */
-               for (unsigned c = 0; c < 2; ++c)
-                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
-
-               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
-               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
-               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
-               bld.XOR(dst, dst, tmp);
-            }
-
-         } else {
-            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
-             * non-zero even if the image is one-dimensional because a
-             * vertical offset may have been applied above to select a
-             * non-zero slice or level of a higher-dimensional texture.
-             */
-            bld.MUL(offset(addr, bld, 1),
-                    offset(addr, bld, 1), offset(stride, bld, 1));
-            bld.ADD(addr, addr, offset(addr, bld, 1));
-            bld.MUL(dst, addr, stride);
-         }
-
-         return dst;
-      }
-   }
-
-   namespace image_format_conversion {
-      using image_format_info::color_u;
-
-      namespace {
-         /**
-          * Maximum representable value in an unsigned integer with the given
-          * number of bits.
-          */
-         inline unsigned
-         scale(unsigned n)
-         {
-            return (1 << n) - 1;
-         }
-      }
-
-      /**
-       * Pack the vector \p src in a bitfield given the per-component bit
-       * shifts and widths.  Note that bitfield components are not allowed to
-       * cross 32-bit boundaries.
-       */
-      static fs_reg
-      emit_pack(const fs_builder &bld, const fs_reg &src,
-                const color_u &shifts, const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         bool seen[4] = {};
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-               /* Shift each component left to the correct bitfield position. */
-               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
-
-               /* Add everything up. */
-               if (seen[shifts[c] / 32]) {
-                  bld.OR(offset(dst, bld, shifts[c] / 32),
-                         offset(dst, bld, shifts[c] / 32), tmp);
-               } else {
-                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
-                  seen[shifts[c] / 32] = true;
-               }
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Unpack a vector from the bitfield \p src given the per-component bit
-       * shifts and widths.  Note that bitfield components are not allowed to
-       * cross 32-bit boundaries.
-       */
-      static fs_reg
-      emit_unpack(const fs_builder &bld, const fs_reg &src,
-                  const color_u &shifts, const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(src.type, 4);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Shift left to discard the most significant bits. */
-               bld.SHL(offset(dst, bld, c),
-                       offset(src, bld, shifts[c] / 32),
-                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
-
-               /* Shift back to the least significant bits using an arithmetic
-                * shift to get sign extension on signed types.
-                */
-               bld.ASR(offset(dst, bld, c),
-                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert an integer vector into another integer vector of the
-       * specified bit widths, properly handling overflow.
-       */
-      static fs_reg
-      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
-                              const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(
-            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
-         assert(src.type == dst.type);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Clamp to the maximum value. */
-               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
-                               brw_imm_d((int)scale(widths[c] - s)),
-                               BRW_CONDITIONAL_L);
-
-               /* Clamp to the minimum value. */
-               if (is_signed)
-                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
-                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
-                                  BRW_CONDITIONAL_GE);
-
-               /* Mask off all but the bits we actually want.  Otherwise, if
-                * we pass a negative number into the hardware when it's
-                * expecting something like UINT8, it will happily clamp it to
-                * +255 for us.
-                */
-               if (is_signed && widths[c] < 32)
-                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_d(scale(widths[c])));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert a normalized fixed-point vector of the specified signedness
-       * and bit widths into a floating point vector.
-       */
-      static fs_reg
-      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
-                               const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Convert to float. */
-               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
-               /* Divide by the normalization constants. */
-               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
-                       brw_imm_f(1.0f / scale(widths[c] - s)));
-
-               /* Clamp to the minimum value. */
-               if (is_signed)
-                  bld.emit_minmax(offset(dst, bld, c),
-                                  offset(dst, bld, c), brw_imm_f(-1.0f),
-                                  BRW_CONDITIONAL_GE);
-            }
-         }
-         return dst;
-      }
-
-      /**
-       * Convert a floating-point vector into a normalized fixed-point vector
-       * of the specified signedness and bit widths.
-       */
-      static fs_reg
-      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
-                             const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(
-            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Clamp the normalized floating-point argument. */
-               if (is_signed) {
-                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
-                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
-
-                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
-                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
-               } else {
-                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
-                                             offset(src, bld, c)));
-               }
-
-               /* Multiply by the normalization constants. */
-               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
-                       brw_imm_f((float)scale(widths[c] - s)));
-
-               /* Convert to integer. */
-               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
-               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
-
-               /* Mask off all but the bits we actually want.  Otherwise, if
-                * we pass a negative number into the hardware when it's
-                * expecting something like UINT8, it will happily clamp it to
-                * +255 for us.
-                */
-               if (is_signed && widths[c] < 32)
-                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_d(scale(widths[c])));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert a floating point vector of the specified bit widths into a
-       * 32-bit floating point vector.
-       */
-      static fs_reg
-      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
-                              const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
-               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
-                * This works because they have a 5-bit exponent just like the
-                * 16-bit floating point format, and they have no sign bit.
-                */
-               if (widths[c] < 16)
-                  bld.SHL(offset(dst, bld, c),
-                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
-
-               /* Convert to 32-bit floating point. */
-               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
-            }
-         }
-
-         return fdst;
-      }
-
-      /**
-       * Convert a vector into a floating point vector of the specified bit
-       * widths.
-       */
-      static fs_reg
-      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
-                            const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
-
-               /* Clamp to the minimum value. */
-               if (widths[c] < 16)
-                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
-                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
-
-               /* Convert to 16-bit floating-point. */
-               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
-
-               /* Discard the least significant bits to get floating point
-                * numbers of the requested width.  This works because the
-                * 10-bit and 11-bit floating point formats have a 5-bit
-                * exponent just like the 16-bit format, and they have no sign
-                * bit.
-                */
-               if (widths[c] < 16)
-                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_ud(15 - widths[c]));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Fill missing components of a vector with 0, 0, 0, 1.
-       */
-      static fs_reg
-      emit_pad(const fs_builder &bld, const fs_reg &src,
-               const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(src.type, 4);
-         const unsigned pad[] = { 0, 0, 0, 1 };
-
-         for (unsigned c = 0; c < 4; ++c)
-            bld.MOV(offset(dst, bld, c),
-                    widths[c] ? offset(src, bld, c)
-                              : fs_reg(brw_imm_ud(pad[c])));
-
-         return dst;
-      }
-   }
-}
-
-namespace brw {
-   namespace image_access {
-      /**
-       * Load a vector from a surface of the given format and dimensionality
-       * at the given coordinates.  \p surf_dims and \p arr_dims give the
-       * number of non-array and array coordinates of the image respectively.
-       */
-      fs_reg
-      emit_image_load(const fs_builder &bld,
-                      const fs_reg &image, const fs_reg &addr,
-                      unsigned surf_dims, unsigned arr_dims,
-                      unsigned gl_format)
-      {
-         using namespace image_format_info;
-         using namespace image_format_conversion;
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const isl_format format = isl_format_for_gl_format(gl_format);
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-         fs_reg tmp;
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
-            /* Hopefully we get here most of the time... */
-            tmp = emit_typed_read(bld, image, saddr, dims,
-                                  isl_format_get_num_channels(lower_format));
-         } else {
-            /* Untyped surface reads return 32 bits of the surface per
-             * component, without any sort of unpacking or type conversion,
-             */
-            const unsigned size = isl_format_get_layout(format)->bpb / 32;
-            /* they don't properly handle out of bounds access, so we have to
-             * check manually if the coordinates are valid and predicate the
-             * surface read on the result,
-             */
-            const brw_predicate pred =
-               emit_untyped_image_check(bld, image,
-                                        emit_bounds_check(bld, image,
-                                                          saddr, dims));
-
-            /* and they don't know about surface coordinates, we need to
-             * convert them to a raw memory offset.
-             */
-            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
-
-            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
-
-            /* An out of bounds surface access should give zero as result. */
-            for (unsigned c = 0; c < size; ++c)
-               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
-                                           offset(tmp, bld, c), brw_imm_d(0)));
-         }
-
-         /* Set the register type to D instead of UD if the data type is
-          * represented as a signed integer in memory so that sign extension
-          * is handled correctly by unpack.
-          */
-         if (needs_sign_extension(format))
-            tmp = retype(tmp, BRW_REGISTER_TYPE_D);
-
-         if (!has_supported_bit_layout(devinfo, format)) {
-            /* Unpack individual vector components from the bitfield if the
-             * hardware is unable to do it for us.
-             */
-            if (has_split_bit_layout(devinfo, format))
-               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
-                               get_bit_widths(lower_format));
-            else
-               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
-                                 get_bit_widths(format));
-
-         } else if ((needs_sign_extension(format) &&
-                     !is_conversion_trivial(devinfo, format)) ||
-                    has_undefined_high_bits(devinfo, format)) {
-            /* Perform a trivial unpack even though the bit layout matches in
-             * order to get the most significant bits of each component
-             * initialized properly.
-             */
-            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
-                              get_bit_widths(format));
-         }
-
-         if (!isl_format_has_int_channel(format)) {
-            if (is_conversion_trivial(devinfo, format)) {
-               /* Just need to cast the vector to the target type. */
-               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
-            } else {
-               /* Do the right sort of type conversion to float. */
-               if (isl_format_has_float_channel(format))
-                  tmp = emit_convert_from_float(
-                     bld, tmp, get_bit_widths(format));
-               else
-                  tmp = emit_convert_from_scaled(
-                     bld, tmp, get_bit_widths(format),
-                     isl_format_has_snorm_channel(format));
-            }
-         }
-
-         /* Initialize missing components of the result. */
-         return emit_pad(bld, tmp, get_bit_widths(format));
-      }
-
-      /**
-       * Store a vector in a surface of the given format and dimensionality at
-       * the given coordinates.  \p surf_dims and \p arr_dims give the number
-       * of non-array and array coordinates of the image respectively.
-       */
-      void
-      emit_image_store(const fs_builder &bld, const fs_reg &image,
-                       const fs_reg &addr, const fs_reg &src,
-                       unsigned surf_dims, unsigned arr_dims,
-                       unsigned gl_format)
-      {
-         using namespace image_format_info;
-         using namespace image_format_conversion;
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         const isl_format format = isl_format_for_gl_format(gl_format);
-         const gen_device_info *devinfo = bld.shader->devinfo;
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (gl_format == GL_NONE) {
-            /* We don't know what the format is, but that's fine because it
-             * implies write-only access, and typed surface writes are always
-             * able to take care of type conversion and packing for us.
-             */
-            emit_typed_write(bld, image, saddr, src, dims, 4);
-
-         } else {
-            const isl_format lower_format =
-               isl_lower_storage_image_format(devinfo, format);
-            fs_reg tmp = src;
-
-            if (!is_conversion_trivial(devinfo, format)) {
-               /* Do the right sort of type conversion. */
-               if (isl_format_has_float_channel(format))
-                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
-
-               else if (isl_format_has_int_channel(format))
-                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
-                                                isl_format_has_sint_channel(format));
-
-               else
-                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
-                                               isl_format_has_snorm_channel(format));
-            }
-
-            /* We're down to bit manipulation at this point. */
-            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
-
-            if (!has_supported_bit_layout(devinfo, format)) {
-               /* Pack the vector components into a bitfield if the hardware
-                * is unable to do it for us.
-                */
-               if (has_split_bit_layout(devinfo, format))
-                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
-                                    get_bit_widths(lower_format));
-
-               else
-                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
-                                  get_bit_widths(format));
-            }
-
-            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
-               /* Hopefully we get here most of the time... */
-               emit_typed_write(bld, image, saddr, tmp, dims,
-                                isl_format_get_num_channels(lower_format));
-
-            } else {
-               /* Untyped surface writes store 32 bits of the surface per
-                * component, without any sort of packing or type conversion,
-                */
-               const unsigned size = isl_format_get_layout(format)->bpb / 32;
-
-               /* they don't properly handle out of bounds access, so we have
-                * to check manually if the coordinates are valid and predicate
-                * the surface write on the result,
-                */
-               const brw_predicate pred =
-                  emit_untyped_image_check(bld, image,
-                                           emit_bounds_check(bld, image,
-                                                             saddr, dims));
-
-               /* and, phew, they don't know about surface coordinates, we
-                * need to convert them to a raw memory offset.
-                */
-               const fs_reg laddr = emit_address_calculation(
-                  bld, image, saddr, dims);
-
-               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
-            }
-         }
-      }
-
-      /**
-       * Perform an atomic read-modify-write operation in a surface of the
-       * given dimensionality at the given coordinates.  \p surf_dims and \p
-       * arr_dims give the number of non-array and array coordinates of the
-       * image respectively.  Main building block of the imageAtomic GLSL
-       * built-ins.
-       */
-      fs_reg
-      emit_image_atomic(const fs_builder &bld,
-                        const fs_reg &image, const fs_reg &addr,
-                        const fs_reg &src0, const fs_reg &src1,
-                        unsigned surf_dims, unsigned arr_dims,
-                        unsigned rsize, unsigned op)
-      {
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         /* Avoid performing an atomic operation on an unbound surface. */
-         const brw_predicate pred = emit_typed_atomic_check(bld, image);
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
-                                   ISL_FORMAT_R32_UINT);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims,
-                                  ISL_FORMAT_R32_UINT);
-
-         /* Thankfully we can do without untyped atomics here. */
-         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
-                                              dims, rsize, op, pred);
-
-         /* An unbound surface access should give zero as result. */
-         if (rsize && pred)
-            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
-
-         return retype(tmp, src0.type);
-      }
-   }
-}
diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h
index 194d61d4892..7b7ab6be80b 100644
--- a/src/intel/compiler/brw_fs_surface_builder.h
+++ b/src/intel/compiler/brw_fs_surface_builder.h
@@ -78,25 +78,5 @@ namespace brw {
                                 unsigned bit_size,
                                 brw_predicate pred = BRW_PREDICATE_NONE);
    }
-
-   namespace image_access {
-      fs_reg
-      emit_image_load(const fs_builder &bld,
-                      const fs_reg &image, const fs_reg &addr,
-                      unsigned surf_dims, unsigned arr_dims,
-                      unsigned gl_format);
-
-      void
-      emit_image_store(const fs_builder &bld, const fs_reg &image,
-                       const fs_reg &addr, const fs_reg &src,
-                       unsigned surf_dims, unsigned arr_dims,
-                       unsigned gl_format);
-      fs_reg
-      emit_image_atomic(const fs_builder &bld,
-                        const fs_reg &image, const fs_reg &addr,
-                        const fs_reg &src0, const fs_reg &src1,
-                        unsigned surf_dims, unsigned arr_dims,
-                        unsigned rsize, unsigned op);
-   }
 }
 #endif
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 00b61731526..30c10117dfd 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -114,6 +114,9 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
                                GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
 
+bool brw_nir_lower_image_load_store(nir_shader *nir,
+                                    const struct gen_device_info *devinfo);
+
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_compiler *compiler,
                                 bool is_scalar);
diff --git a/src/intel/compiler/brw_nir_lower_image_load_store.c b/src/intel/compiler/brw_nir_lower_image_load_store.c
new file mode 100644
index 00000000000..45a30a041be
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_image_load_store.c
@@ -0,0 +1,824 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+/* Map a GL image-format enum (used by the upper compiler layers even for
+ * SPIR-V/Vulkan shaders) onto the equivalent ISL format. */
+static enum isl_format
+isl_format_for_gl_format(uint32_t gl_format)
+{
+   static const struct { uint32_t gl; enum isl_format isl; } formats[] = {
+      { GL_R8,             ISL_FORMAT_R8_UNORM },
+      { GL_R8_SNORM,       ISL_FORMAT_R8_SNORM },
+      { GL_R8UI,           ISL_FORMAT_R8_UINT },
+      { GL_R8I,            ISL_FORMAT_R8_SINT },
+      { GL_RG8,            ISL_FORMAT_R8G8_UNORM },
+      { GL_RG8_SNORM,      ISL_FORMAT_R8G8_SNORM },
+      { GL_RG8UI,          ISL_FORMAT_R8G8_UINT },
+      { GL_RG8I,           ISL_FORMAT_R8G8_SINT },
+      { GL_RGBA8,          ISL_FORMAT_R8G8B8A8_UNORM },
+      { GL_RGBA8_SNORM,    ISL_FORMAT_R8G8B8A8_SNORM },
+      { GL_RGBA8UI,        ISL_FORMAT_R8G8B8A8_UINT },
+      { GL_RGBA8I,         ISL_FORMAT_R8G8B8A8_SINT },
+      { GL_R11F_G11F_B10F, ISL_FORMAT_R11G11B10_FLOAT },
+      { GL_RGB10_A2,       ISL_FORMAT_R10G10B10A2_UNORM },
+      { GL_RGB10_A2UI,     ISL_FORMAT_R10G10B10A2_UINT },
+      { GL_R16,            ISL_FORMAT_R16_UNORM },
+      { GL_R16_SNORM,      ISL_FORMAT_R16_SNORM },
+      { GL_R16F,           ISL_FORMAT_R16_FLOAT },
+      { GL_R16UI,          ISL_FORMAT_R16_UINT },
+      { GL_R16I,           ISL_FORMAT_R16_SINT },
+      { GL_RG16,           ISL_FORMAT_R16G16_UNORM },
+      { GL_RG16_SNORM,     ISL_FORMAT_R16G16_SNORM },
+      { GL_RG16F,          ISL_FORMAT_R16G16_FLOAT },
+      { GL_RG16UI,         ISL_FORMAT_R16G16_UINT },
+      { GL_RG16I,          ISL_FORMAT_R16G16_SINT },
+      { GL_RGBA16,         ISL_FORMAT_R16G16B16A16_UNORM },
+      { GL_RGBA16_SNORM,   ISL_FORMAT_R16G16B16A16_SNORM },
+      { GL_RGBA16F,        ISL_FORMAT_R16G16B16A16_FLOAT },
+      { GL_RGBA16UI,       ISL_FORMAT_R16G16B16A16_UINT },
+      { GL_RGBA16I,        ISL_FORMAT_R16G16B16A16_SINT },
+      { GL_R32F,           ISL_FORMAT_R32_FLOAT },
+      { GL_R32UI,          ISL_FORMAT_R32_UINT },
+      { GL_R32I,           ISL_FORMAT_R32_SINT },
+      { GL_RG32F,          ISL_FORMAT_R32G32_FLOAT },
+      { GL_RG32UI,         ISL_FORMAT_R32G32_UINT },
+      { GL_RG32I,          ISL_FORMAT_R32G32_SINT },
+      { GL_RGBA32F,        ISL_FORMAT_R32G32B32A32_FLOAT },
+      { GL_RGBA32UI,       ISL_FORMAT_R32G32B32A32_UINT },
+      { GL_RGBA32I,        ISL_FORMAT_R32G32B32A32_SINT },
+      { GL_NONE,           ISL_FORMAT_UNSUPPORTED },
+   };
+   for (unsigned i = 0; i < sizeof(formats) / sizeof(formats[0]); i++)
+      if (formats[i].gl == gl_format)
+         return formats[i].isl;
+   assert(!"Invalid image format");
+   return ISL_FORMAT_UNSUPPORTED;
+}
+
+static nir_ssa_def *
+_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
+{
+   unsigned width;   /* vector width of the requested parameter */
+   switch (offset) {
+   case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET:
+      width = 1;
+      break;
+   case BRW_IMAGE_PARAM_OFFSET_OFFSET:
+   case BRW_IMAGE_PARAM_SWIZZLING_OFFSET:
+      width = 2;
+      break;
+   case BRW_IMAGE_PARAM_TILING_OFFSET:
+   case BRW_IMAGE_PARAM_SIZE_OFFSET:
+      width = 3;
+      break;
+   case BRW_IMAGE_PARAM_STRIDE_OFFSET:
+      width = 4;
+      break;
+   default:
+      unreachable("Invalid param offset");
+   }
+   nir_intrinsic_instr *param =
+      nir_intrinsic_instr_create(b->shader,
+                                 nir_intrinsic_image_deref_load_param_intel);
+   param->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   param->num_components = width;
+   nir_intrinsic_set_base(param, offset / 4);
+   nir_ssa_dest_init(&param->instr, &param->dest, width, 32, NULL);
+   nir_builder_instr_insert(b, &param->instr);
+   return &param->dest.ssa;
+}
+
+/* Load the image parameter at BRW_IMAGE_PARAM_<o>_OFFSET for deref d. */
+#define load_image_param(b, d, o) \
+   _load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET)
+
+static nir_ssa_def *
+sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord)
+{
+   const bool is_1d_array =
+      glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
+      glsl_sampler_type_is_array(deref->type);
+   if (!is_1d_array) {
+      unsigned n = glsl_get_sampler_coordinate_components(deref->type);
+      return nir_channels(b, coord, (1 << n) - 1);
+   }
+   /* Widen (x, layer) to (x, 0, layer) so 1D arrays look like 2D arrays. */
+   return nir_vec3(b, nir_channel(b, coord, 0), nir_imm_int(b, 0),
+                      nir_channel(b, coord, 1));
+}
+
+static nir_ssa_def *
+image_coord_is_in_bounds(nir_builder *b, const struct gen_device_info *devinfo,
+                         nir_deref_instr *deref, nir_ssa_def *coord)
+{
+   nir_ssa_def *pos = sanitize_image_coord(b, deref, coord);
+   nir_ssa_def *extent = load_image_param(b, deref, SIZE);
+   nir_ssa_def *below = nir_ilt(b, pos, extent);
+
+   /* AND together the per-component comparison results. */
+   nir_ssa_def *all_below = nir_imm_int(b, NIR_TRUE);
+   for (unsigned c = 0; c != pos->num_components; ++c)
+      all_below = nir_iand(b, all_below, nir_channel(b, below, c));
+   return all_below;
+}
+
+/** Calculate the offset in memory of the texel given by \p coord.
+ *
+ * This is meant to be used with untyped surface messages to access a tiled
+ * surface, which involves taking into account the tiling and swizzling modes
+ * of the surface manually so it will hopefully not happen very often.
+ *
+ * The tiling algorithm implemented here matches either the X or Y tiling
+ * layouts supported by the hardware depending on the tiling coefficients
+ * passed to the program as uniforms.  See Volume 1 Part 2 Section 4.5
+ * "Address Tiling Function" of the IVB PRM for an in-depth explanation of
+ * the hardware tiling format.
+ */
+static nir_ssa_def *
+image_address(nir_builder *b, const struct gen_device_info *devinfo,
+              nir_deref_instr *deref, nir_ssa_def *coord)
+{
+   coord = sanitize_image_coord(b, deref, coord);
+
+   nir_ssa_def *offset = load_image_param(b, deref, OFFSET);
+   nir_ssa_def *tiling = load_image_param(b, deref, TILING);
+   nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+
+   /* Shift the coordinates by the fixed surface offset.  It may be non-zero
+    * if the image is a single slice of a higher-dimensional surface, or if a
+    * non-zero mipmap level of the surface is bound to the pipeline.  The
+    * offset needs to be applied here rather than at surface state set-up time
+    * because the desired slice-level may start mid-tile, so simply shifting
+    * the surface base address wouldn't give a well-formed tiled surface in
+    * the general case.
+    */
+   nir_ssa_def *xypos = (coord->num_components == 1) ?
+                        nir_vec2(b, coord, nir_imm_int(b, 0)) :
+                        nir_channels(b, coord, 0x3);
+   xypos = nir_iadd(b, xypos, offset);
+
+   /* The layout of 3-D textures in memory is sort-of like a tiling
+    * format.  At each miplevel, the slices are arranged in rows of
+    * 2^level slices per row.  The slice row is stored in z_y and
+    * the slice within the row is stored in z_x.
+    *
+    * The layout of 2-D array textures and cubemaps is much simpler:
+    * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+    * stored in memory as an array of slices, each one being a 2-D
+    * arrangement of miplevels, or as a 2D arrangement of miplevels,
+    * each one being an array of slices.  In either case the separation
+    * between slices of the same LOD is equal to the qpitch value
+    * provided as stride.w.
+    *
+    * This code can be made to handle either 2D arrays and 3D textures
+    * by passing in the miplevel as tiling.z for 3-D textures and 0 in
+    * tiling.z for 2-D array textures.
+    *
+    * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+    * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+    * of the hardware 3D texture and 2D array layouts.
+    */
+   if (coord->num_components > 2) {
+      /* Decompose z into a major (z_y) and a minor (z_x)
+       * index.
+       */
+      nir_ssa_def *z = nir_channel(b, coord, 2);
+      nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
+                                  nir_channel(b, tiling, 2));
+      nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
+
+      /* Take into account the horizontal (z_x) and vertical (z_y)
+       * slice offset.
+       */
+      xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
+                                             nir_channels(b, stride, 0xc)));
+   }
+
+   nir_ssa_def *addr;
+   if (coord->num_components > 1) {
+      /* Calculate the major/minor x and y indices.  In order to
+       * accommodate both X and Y tiling, the Y-major tiling format is
+       * treated as being a bunch of narrow X-tiles placed next to each
+       * other.  This means that the tile width for Y-tiling is actually
+       * the width of one sub-column of the Y-major tile where each 4K
+       * tile has 8 512B sub-columns.
+       *
+       * The major Y value is the row of tiles in which the pixel lives.
+       * The major X value is the tile sub-column in which the pixel
+       * lives; for X tiling, this is the same as the tile column, for Y
+       * tiling, each tile has 8 sub-columns.  The minor X and Y indices
+       * are the position within the sub-column.
+       */
+
+      /* Calculate the minor x and y indices. */
+      nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
+                                       nir_channels(b, tiling, 0x3));
+      nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3));
+
+      /* Calculate the texel index from the start of the tile row and the
+       * vertical coordinate of the row.
+       * Equivalent to:
+       *   idx_x = (major.x << tiling.y << tiling.x) +
+       *           (minor.y << tiling.x) + minor.x
+       *   idx_y = major.y << tiling.y
+       */
+      nir_ssa_def *idx_x, *idx_y;
+      idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
+      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
+      idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
+      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
+      idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
+
+      /* Add it to the start of the tile row. */
+      nir_ssa_def *idx;
+      idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
+      idx = nir_iadd(b, idx, idx_x);
+
+      addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+
+      if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+         /* Take into account the two dynamically specified shifts.  Both
+          * are used to implement swizzling of X-tiled surfaces.  For Y-tiled
+          * surfaces only one bit needs to be XOR-ed with bit 6 of the memory
+          * address, so a swz value of 0xff (actually interpreted as 31 by the
+          * hardware) will be provided to cause the relevant bit of tmp.y to
+          * be zero and turn the first XOR into the identity.  For linear
+          * surfaces or platforms lacking address swizzling both shifts will
+          * be 0xff causing the relevant bits of both tmp.x and .y to be zero,
+          * which effectively disables swizzling.
+          */
+         nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING);
+         nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
+         nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
+
+         /* XOR bit 6 of (shift0 ^ shift1) into the memory address. */
+         nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
+                                        nir_imm_int(b, 1 << 6));
+         addr = nir_ixor(b, addr, bit);
+      }
+   } else {
+      /* Multiply by the Bpp/stride value.  Note that xypos.y may be
+       * non-zero even if the image is one-dimensional because a vertical
+       * offset may have been applied above to select a non-zero slice or
+       * level of a higher-dimensional texture.
+       */
+      nir_ssa_def *idx;
+      idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
+      idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
+      addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+   }
+
+   return addr;
+}
+
+struct format_info {
+   const struct isl_format_layout *fmtl;   /* ISL layout of the format */
+   unsigned chans;                         /* number of channels */
+   unsigned bits[4];                       /* r, g, b, a bit widths */
+};
+
+static struct format_info
+get_format_info(enum isl_format fmt)
+{
+   /* Record the layout pointer and copy out the per-channel bit widths in
+    * r, g, b, a order.
+    */
+   struct format_info info;
+   info.fmtl = isl_format_get_layout(fmt);
+   info.chans = isl_format_get_num_channels(fmt);
+   info.bits[0] = info.fmtl->channels.r.bits;
+   info.bits[1] = info.fmtl->channels.g.bits;
+   info.bits[2] = info.fmtl->channels.b.bits;
+   info.bits[3] = info.fmtl->channels.a.bits;
+
+   return info;
+}
+
+static nir_ssa_def *
+nir_zero_vec(nir_builder *b, unsigned num_components)
+{
+   /* Build a 32-bit immediate vector whose components are all zero. */
+   nir_const_value zeros;
+   memset(&zeros, 0, sizeof zeros);
+   return nir_build_imm(b, num_components, 32, zeros);
+}
+
+static nir_ssa_def *
+convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo,
+                       nir_ssa_def *color,
+                       enum isl_format image_fmt, enum isl_format lower_fmt,
+                       unsigned dest_components)
+{
+   if (image_fmt == lower_fmt)
+      goto expand_vec; /* same format: only widen to dest_components */
+
+   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+      assert(lower_fmt == ISL_FORMAT_R32_UINT);
+      color = nir_format_unpack_11f11f10f(b, color);
+      goto expand_vec;
+   }
+
+   struct format_info image = get_format_info(image_fmt);
+   struct format_info lower = get_format_info(lower_fmt);
+
+   const bool needs_sign_extension =
+      isl_format_has_snorm_channel(image_fmt) ||
+      isl_format_has_sint_channel(image_fmt);
+
+   /* We only check the red channel to detect if we need to pack/unpack */
+   assert(image.bits[0] != lower.bits[0] ||
+          memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
+
+   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+      if (needs_sign_extension)
+         color = nir_format_unpack_sint(b, color, image.bits, image.chans);
+      else
+         color = nir_format_unpack_uint(b, color, image.bits, image.chans);
+   } else {
+      /* All these formats are homogeneous */
+      for (unsigned i = 1; i < image.chans; i++)
+         assert(image.bits[i] == image.bits[0]);
+
+      /* On IVB, we rely on the undocumented behavior that typed reads from
+       * surfaces of the unsupported R8 and R16 formats return useful data in
+       * their least significant bits.  However, the data in the high bits is
+       * garbage so we have to discard it.
+       */
+      if (devinfo->gen == 7 && !devinfo->is_haswell &&
+          (lower_fmt == ISL_FORMAT_R16_UINT ||
+           lower_fmt == ISL_FORMAT_R8_UINT))
+         color = nir_format_mask_uvec(b, color, lower.bits);
+
+      if (image.bits[0] != lower.bits[0]) {
+         color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
+                                                  image.bits[0]);
+      }
+
+      if (needs_sign_extension)
+         color = nir_format_sign_extend_ivec(b, color, image.bits);
+   }
+
+   switch (image.fmtl->channels.r.type) {
+   case ISL_UNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_unorm_to_float(b, color, image.bits);
+      break;
+
+   case ISL_SNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_snorm_to_float(b, color, image.bits);
+      break;
+
+   case ISL_SFLOAT:
+      if (image.bits[0] == 16)
+         color = nir_unpack_half_2x16_split_x(b, color);
+      break;
+
+   case ISL_UINT:
+   case ISL_SINT:
+      break; /* integer data is returned as-is */
+
+   default:
+      unreachable("Invalid image channel type");
+   }
+
+expand_vec:
+   assert(dest_components == 1 || dest_components == 4);
+   assert(color->num_components <= dest_components);
+   if (color->num_components == dest_components)
+      return color;
+
+   nir_ssa_def *comps[4];
+   for (unsigned i = 0; i < color->num_components; i++)
+      comps[i] = nir_channel(b, color, i);
+
+   for (unsigned i = color->num_components; i < 3; i++)
+      comps[i] = nir_imm_int(b, 0); /* absent color channels read as 0 */
+
+   if (color->num_components < 4) { /* absent alpha reads as 1 */
+      if (isl_format_has_int_channel(image_fmt))
+         comps[3] = nir_imm_int(b, 1);
+      else
+         comps[3] = nir_imm_float(b, 1);
+   }
+
+   return nir_vec(b, comps, dest_components);
+}
+
+static bool
+lower_image_load_instr(nir_builder *b,
+                       const struct gen_device_info *devinfo,
+                       nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   const enum isl_format image_fmt =
+      isl_format_for_gl_format(var->data.image.format);
+
+   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+      const enum isl_format lower_fmt =
+         isl_lower_storage_image_format(devinfo, image_fmt);
+      const unsigned dest_components = intrin->num_components;
+
+      /* Use an undef to hold the uses of the load while we do the color
+       * conversion.
+       */
+      nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
+
+      intrin->num_components = isl_format_get_num_channels(lower_fmt);
+      intrin->dest.ssa.num_components = intrin->num_components;
+
+      nir_ssa_def *value;
+      if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check the first component of the size field to find out if the
+          * image is bound.  Necessary on IVB because it doesn't seem to
+          * respect null surfaces and will hang when no image is bound.
+          */
+         b->cursor = nir_instr_remove(&intrin->instr);
+         nir_ssa_def *size = load_image_param(b, deref, SIZE);
+         nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), nir_imm_int(b, 0)));
+         nir_builder_instr_insert(b, &intrin->instr);
+         nir_push_else(b, NULL);
+         nir_ssa_def *zero = nir_zero_vec(b, intrin->num_components);
+         nir_pop_if(b, NULL);
+         value = nir_if_phi(b, &intrin->dest.ssa, zero);
+      } else {
+         b->cursor = nir_after_instr(&intrin->instr);
+         value = &intrin->dest.ssa;
+      }
+
+      nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
+                                                  image_fmt, lower_fmt,
+                                                  dest_components);
+
+      nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color));
+      nir_instr_remove(placeholder->parent_instr);
+   } else {
+      const struct isl_format_layout *image_fmtl =
+         isl_format_get_layout(image_fmt);
+      /* We have a matching typed format for everything 32b and below */
+      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+                                ISL_FORMAT_R32G32_UINT :
+                                ISL_FORMAT_R32G32B32A32_UINT;
+      const unsigned dest_components = intrin->num_components;
+
+      b->cursor = nir_instr_remove(&intrin->instr);
+
+      nir_ssa_def *coord = intrin->src[1].ssa;
+      nir_push_if(b, image_coord_is_in_bounds(b, devinfo, deref, coord));
+
+      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_image_deref_load_raw_intel);
+      load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+      load->src[1] = nir_src_for_ssa(addr);
+      load->num_components = image_fmtl->bpb / 32; /* one per 32-bit dword */
+      nir_ssa_dest_init(&load->instr, &load->dest,
+                        load->num_components, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_push_else(b, NULL);
+
+      nir_ssa_def *zero = nir_zero_vec(b, load->num_components);
+
+      nir_pop_if(b, NULL);
+
+      nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero);
+
+      nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
+                                                  image_fmt, raw_fmt,
+                                                  dest_components);
+
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color));
+   }
+
+   return true;
+}
+
+static nir_ssa_def *
+convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo,
+                        nir_ssa_def *color,
+                        enum isl_format image_fmt, enum isl_format lower_fmt)
+{
+   struct format_info image = get_format_info(image_fmt);
+   struct format_info lower = get_format_info(lower_fmt);
+
+   color = nir_channels(b, color, (1 << image.chans) - 1);
+
+   if (image_fmt == lower_fmt)
+      return color; /* formats match: nothing to convert */
+
+   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+      assert(lower_fmt == ISL_FORMAT_R32_UINT);
+      return nir_format_pack_11f11f10f(b, color);
+   }
+
+   switch (image.fmtl->channels.r.type) {
+   case ISL_UNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_float_to_unorm(b, color, image.bits);
+      break;
+
+   case ISL_SNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_float_to_snorm(b, color, image.bits);
+      break;
+
+   case ISL_SFLOAT:
+      if (image.bits[0] == 16) {
+         nir_ssa_def *f16comps[4];
+         for (unsigned i = 0; i < image.chans; i++) {
+            f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i),
+                                                      nir_imm_float(b, 0));
+         }
+         color = nir_vec(b, f16comps, image.chans);
+      }
+      break;
+
+   case ISL_UINT:
+      if (image.bits[0] < 32) {
+         nir_const_value max;
+         for (unsigned i = 0; i < image.chans; i++) {
+            assert(image.bits[i] < 32);
+            max.u32[i] = (1u << image.bits[i]) - 1; /* unsigned n-bit max */
+         }
+         color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max));
+      }
+      break;
+
+   case ISL_SINT:
+      if (image.bits[0] < 32) {
+         nir_const_value min, max;
+         for (unsigned i = 0; i < image.chans; i++) {
+            assert(image.bits[i] < 32);
+            max.i32[i] = (1 << (image.bits[i] - 1)) - 1; /* signed n-bit max */
+            min.i32[i] = -(1 << (image.bits[i] - 1));    /* signed n-bit min */
+         }
+         color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max));
+         color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min));
+      }
+      break;
+
+   default:
+      unreachable("Invalid image channel type");
+   }
+
+   if (image.bits[0] < 32 &&
+       (isl_format_has_snorm_channel(image_fmt) ||
+        isl_format_has_sint_channel(image_fmt)))
+      color = nir_format_mask_uvec(b, color, image.bits);
+
+   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+      color = nir_format_pack_uint(b, color, image.bits, image.chans);
+   } else {
+      /* All these formats are homogeneous */
+      for (unsigned i = 1; i < image.chans; i++)
+         assert(image.bits[i] == image.bits[0]);
+
+      if (image.bits[0] != lower.bits[0]) {
+         color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
+                                                  lower.bits[0]);
+      }
+   }
+
+   return color;
+}
+
+static bool
+lower_image_store_instr(nir_builder *b,
+                        const struct gen_device_info *devinfo,
+                        nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   /* For write-only surfaces, we trust that the hardware can just do the
+    * conversion for us.
+    */
+   if (var->data.image.write_only)
+      return false;
+
+   const enum isl_format image_fmt =
+      isl_format_for_gl_format(var->data.image.format);
+
+   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+      const enum isl_format lower_fmt =
+         isl_lower_storage_image_format(devinfo, image_fmt);
+
+      /* Color conversion goes before the store */
+      b->cursor = nir_before_instr(&intrin->instr);
+
+      nir_ssa_def *color = convert_color_for_store(b, devinfo,
+                                                   intrin->src[3].ssa,
+                                                   image_fmt, lower_fmt);
+      intrin->num_components = isl_format_get_num_channels(lower_fmt);
+      nir_instr_rewrite_src(&intrin->instr, &intrin->src[3],
+                            nir_src_for_ssa(color));
+
+      if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check the first component of the size field to find out if the
+          * image is bound.  Necessary on IVB because it doesn't seem to
+          * respect null surfaces and will hang when no image is bound.  In
+          * the raw case below, we do a bounds check which will fail if the
+          * size is zero.
+          */
+         b->cursor = nir_instr_remove(&intrin->instr);
+         nir_ssa_def *size = load_image_param(b, deref, SIZE);
+         nir_ssa_def *zero = nir_imm_int(b, 0);
+         nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
+         nir_builder_instr_insert(b, &intrin->instr);
+         nir_pop_if(b, NULL);
+      }
+   } else {
+      const struct isl_format_layout *image_fmtl =
+         isl_format_get_layout(image_fmt);
+      /* We have a matching typed format for everything 32b and below */
+      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+                                ISL_FORMAT_R32G32_UINT :
+                                ISL_FORMAT_R32G32B32A32_UINT;
+
+      b->cursor = nir_instr_remove(&intrin->instr);
+
+      nir_ssa_def *coord = intrin->src[1].ssa;
+      nir_push_if(b, image_coord_is_in_bounds(b, devinfo, deref, coord));
+
+      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+      nir_ssa_def *color = convert_color_for_store(b, devinfo,
+                                                   intrin->src[3].ssa,
+                                                   image_fmt, raw_fmt);
+
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_image_deref_store_raw_intel);
+      store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+      store->src[1] = nir_src_for_ssa(addr);
+      store->src[2] = nir_src_for_ssa(color);
+      store->num_components = image_fmtl->bpb / 32; /* one per 32-bit dword */
+      nir_builder_instr_insert(b, &store->instr);
+
+      nir_pop_if(b, NULL);
+   }
+
+   return true;
+}
+
+static bool
+lower_image_atomic_instr(nir_builder *b,
+                         const struct gen_device_info *devinfo,
+                         nir_intrinsic_instr *intrin)
+{
+   /* Haswell and gen8+ are left untouched; only older parts get the guard. */
+   const bool needs_guard = !devinfo->is_haswell && devinfo->gen < 8;
+   if (!needs_guard)
+      return false;
+
+   nir_deref_instr *image = nir_src_as_deref(intrin->src[0]);
+   b->cursor = nir_instr_remove(&intrin->instr);
+
+   /* Park the result's uses on a temporary undef while the atomic moves. */
+   nir_ssa_def *stand_in = nir_ssa_undef(b, 4, 32);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(stand_in));
+
+   /* The first component of the size parameter tells whether an image is
+    * bound.  Necessary on IVB for typed atomics because they don't seem to
+    * respect null surfaces and will happily corrupt or read random memory
+    * when no image is bound.
+    */
+   nir_ssa_def *extent = load_image_param(b, image, SIZE);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_ssa_def *is_bound = nir_ine(b, nir_channel(b, extent, 0), zero);
+   nir_push_if(b, is_bound);
+   nir_builder_instr_insert(b, &intrin->instr);
+   nir_pop_if(b, NULL);
+
+   /* When the guard fails the result is zero. */
+   nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero);
+   nir_ssa_def_rewrite_uses(stand_in, nir_src_for_ssa(result));
+   return true;
+}
+
+static bool
+lower_image_size_instr(nir_builder *b,
+                       const struct gen_device_info *devinfo,
+                       nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *image = nir_src_as_deref(intrin->src[0]);
+   const enum glsl_sampler_dim dim = glsl_get_sampler_dim(image->type);
+   const unsigned coord_comps =
+      glsl_get_sampler_coordinate_components(image->type);
+   const unsigned dest_comps = intrin->dest.ssa.num_components;
+
+   /* The size query is replaced by a read of the SIZE image parameter. */
+   b->cursor = nir_instr_remove(&intrin->instr);
+   nir_ssa_def *size = load_image_param(b, image, SIZE);
+
+   nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL };
+   for (unsigned c = 0; c < coord_comps; c++) {
+      /* The array length for 1D arrays is in .z */
+      const unsigned chan = (c == 1 && dim == GLSL_SAMPLER_DIM_1D) ? 2 : c;
+      comps[c] = nir_channel(b, size, chan);
+
+      /* For cubes, the third component is divided by 6. */
+      if (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE)
+         comps[c] = nir_idiv(b, comps[c], nir_imm_int(b, 6));
+   }
+
+   /* Destination components past the coordinate count are set to 1. */
+   for (unsigned c = coord_comps; c < dest_comps; c++)
+      comps[c] = nir_imm_int(b, 1);
+
+   nir_ssa_def *vec = nir_vec(b, comps, dest_comps);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec));
+   return true;
+}
+
+/* Run the image load/store/atomic/size lowerings over every function in
+ * the shader.  Returns true if any instruction was changed.
+ */
+bool
+brw_nir_lower_image_load_store(nir_shader *shader,
+                               const struct gen_device_info *devinfo)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl == NULL)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+      bool impl_progress = false;
+
+      nir_foreach_block_safe(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_image_deref_load:
+               impl_progress |= lower_image_load_instr(&b, devinfo, intrin);
+               break;
+            case nir_intrinsic_image_deref_store:
+               impl_progress |= lower_image_store_instr(&b, devinfo, intrin);
+               break;
+            case nir_intrinsic_image_deref_atomic_add:
+            case nir_intrinsic_image_deref_atomic_min:
+            case nir_intrinsic_image_deref_atomic_max:
+            case nir_intrinsic_image_deref_atomic_and:
+            case nir_intrinsic_image_deref_atomic_or:
+            case nir_intrinsic_image_deref_atomic_xor:
+            case nir_intrinsic_image_deref_atomic_exchange:
+            case nir_intrinsic_image_deref_atomic_comp_swap:
+               impl_progress |= lower_image_atomic_instr(&b, devinfo, intrin);
+               break;
+            case nir_intrinsic_image_deref_size:
+               impl_progress |= lower_image_size_instr(&b, devinfo, intrin);
+               break;
+            default:
+               break;
+            }
+         }
+      }
+
+      /* The lowerings may wrap instructions in new ifs, which changes the
+       * CFG, so block indices and dominance do not survive a changed impl.
+       */
+      if (impl_progress) {
+         nir_metadata_preserve(function->impl, nir_metadata_none);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 98860c94374..3cdeb6214a8 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -77,6 +77,7 @@ libintel_compiler_files = files(
   'brw_nir_analyze_ubo_ranges.c',
   'brw_nir_attribute_workarounds.c',
   'brw_nir_lower_cs_intrinsics.c',
+  'brw_nir_lower_image_load_store.c',
   'brw_nir_opt_peephole_ffma.c',
   'brw_nir_tcs_workarounds.c',
   'brw_packed_float.c',
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 0fe0c7e296e..19d59b7fbac 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -532,6 +532,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
    if (nir->info.stage != MESA_SHADER_COMPUTE)
       brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
 
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
+
    assert(nir->num_uniforms == prog_data->nr_params * 4);
 
    stage->nir = nir;
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index a669814d0d2..f5ebd3c3b05 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -102,6 +102,8 @@ brw_create_nir(struct brw_context *brw,
 
    nir = brw_preprocess_nir(brw->screen->compiler, nir);
 
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
+
    if (stage == MESA_SHADER_TESS_CTRL) {
       /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
       static const gl_state_index16 tokens[STATE_LENGTH] =
-- 
2.17.1