Mesa (master): aco: reorder VMEM operands in ACO IR

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Jan 29 19:03:19 UTC 2020


Module: Mesa
Branch: master
Commit: 71440ba0f5512fe455be66ca48b253ecc37478a9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=71440ba0f5512fe455be66ca48b253ecc37478a9

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Thu Jan 16 16:54:35 2020 +0100

aco: reorder VMEM operands in ACO IR

For all VMEM instructions, the resource constant is now
in operands[0]. For MIMG instructions, the sampler shares
operands[1] with the write data when the instruction writes memory.
Moving VADDR to be the last MIMG operand is the first step toward
supporting the Navi NSA encoding.

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3602>
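
As an illustration of the new layout, here is a minimal sketch of how
instruction selection builds a sampler-less MIMG load after this change;
it mirrors the instruction-selection hunks below, with resource, coords
and dst standing in for whatever temporaries the caller already has:

   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(
         aco_opcode::image_load, Format::MIMG, 3, 1)};
   load->operands[0] = Operand(resource);    /* SRSRC: T# resource constant */
   load->operands[1] = Operand(s4);          /* SSAMP slot, undefined: no sampler */
   load->operands[2] = Operand(coords);      /* VADDR: now the last operand */
   load->definitions[0] = Definition(dst);   /* VDATA: read result */

For stores and atomics, operands[1] instead carries the VGPR write data,
so SSAMP and VDATA share the same operand slot.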

---

 src/amd/compiler/aco_assembler.cpp             |  20 ++--
 src/amd/compiler/aco_builder_h.py              |   2 +-
 src/amd/compiler/aco_insert_NOPs.cpp           |  17 +--
 src/amd/compiler/aco_insert_waitcnt.cpp        |  11 +-
 src/amd/compiler/aco_instruction_selection.cpp | 146 +++++++++++++------------
 src/amd/compiler/aco_ir.h                      |  17 +--
 src/amd/compiler/aco_optimizer.cpp             |  25 ++---
 src/amd/compiler/aco_register_allocation.cpp   |  11 +-
 src/amd/compiler/aco_scheduler.cpp             |   2 +-
 src/amd/compiler/aco_spill.cpp                 |   8 +-
 src/amd/compiler/aco_validate.cpp              |  26 ++++-
 11 files changed, 153 insertions(+), 132 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 94c41173d6e..7683c8f45af 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -329,10 +329,10 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       }
       encoding |= instr->operands[2].physReg() << 24;
       encoding |= (mubuf->tfe ? 1 : 0) << 23;
-      encoding |= (instr->operands[1].physReg() >> 2) << 16;
+      encoding |= (instr->operands[0].physReg() >> 2) << 16;
       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
       encoding |= (0xFF & reg) << 8;
-      encoding |= (0xFF & instr->operands[0].physReg());
+      encoding |= (0xFF & instr->operands[1].physReg());
       out.push_back(encoding);
       break;
    }
@@ -362,10 +362,10 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       encoding |= instr->operands[2].physReg() << 24;
       encoding |= (mtbuf->tfe ? 1 : 0) << 23;
       encoding |= (mtbuf->slc ? 1 : 0) << 22;
-      encoding |= (instr->operands[1].physReg() >> 2) << 16;
+      encoding |= (instr->operands[0].physReg() >> 2) << 16;
       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
       encoding |= (0xFF & reg) << 8;
-      encoding |= (0xFF & instr->operands[0].physReg());
+      encoding |= (0xFF & instr->operands[1].physReg());
 
       if (ctx.chip_class >= GFX10) {
          encoding |= (((opcode & 0x08) >> 4) << 21); /* MSB of 4-bit OPCODE */
@@ -395,15 +395,15 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       }
       encoding |= (0xF & mimg->dmask) << 8;
       out.push_back(encoding);
-      encoding = (0xFF & instr->operands[0].physReg()); /* VADDR */
+      encoding = (0xFF & instr->operands[2].physReg()); /* VADDR */
       if (!instr->definitions.empty()) {
          encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */
-      } else if (instr->operands.size() == 4) {
-         encoding |= (0xFF & instr->operands[3].physReg()) << 8; /* VDATA */
+      } else if (instr->operands[1].regClass().type() == RegType::vgpr) {
+         encoding |= (0xFF & instr->operands[1].physReg()) << 8; /* VDATA */
       }
-      encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */
-      if (instr->operands.size() > 2)
-         encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */
+      encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */
+      if (instr->operands[1].regClass().type() == RegType::sgpr)
+         encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */
 
       assert(!mimg->d16 || ctx.chip_class >= GFX9);
       encoding |= mimg->d16 ? 1 << 15 : 0;
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 4e1d6f72b63..97c03ac8adf 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -473,7 +473,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
            ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]),
            ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]),
            ("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]),
-           ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes?
+           ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 3), (1, 3)]),
            ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
            ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
            ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]),
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 605ed8a2081..9ff3d580736 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -325,9 +325,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
                              pred->operands[2].physReg() >= 128;
          /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
          bool consider_mimg = pred->format == Format::MIMG &&
-                              pred->operands.size() == 4 &&
-                              pred->operands[3].size() > 2 &&
-                              pred->operands[1].size() != 8;
+                              pred->operands[1].regClass().type() == RegType::vgpr &&
+                              pred->operands[1].size() > 2 &&
+                              pred->operands[0].size() == 4;
          /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
          bool consider_flat = (pred->isFlatOrGlobal() || pred->format == Format::SCRATCH) &&
                               pred->operands.size() == 3 &&
@@ -376,6 +376,7 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
       /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
       for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
          aco_ptr<Instruction>& pred = new_instructions[pred_idx];
+         // TODO: break if something else writes the SGPR
          if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
             continue;
 
@@ -383,16 +384,10 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
             if (def.physReg() > 102)
                continue;
 
-            if (instr->operands.size() > 1 &&
-                regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
-                               def.physReg(), def.size())) {
+            for (const Operand& op : instr->operands) {
+               if (regs_intersect(op.physReg(), op.size(), def.physReg(), def.size()))
                   return 5 + pred_idx - new_idx + 1;
-            }
 
-            if (instr->operands.size() > 2 &&
-                regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
-                               def.physReg(), def.size())) {
-                  return 5 + pred_idx - new_idx + 1;
             }
          }
       }
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 8e1b64bfcd0..5ec9636752d 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -689,11 +689,20 @@ void gen(Instruction* instr, wait_ctx& ctx)
       if (!instr->definitions.empty())
          insert_wait_entry(ctx, instr->definitions[0], ev);
 
-      if (instr->operands.size() == 4 && ctx.chip_class == GFX6) {
+      if (ctx.chip_class == GFX6 &&
+          instr->format != Format::MIMG &&
+          instr->operands.size() == 4) {
          ctx.exp_cnt++;
          update_counters(ctx, event_vmem_gpr_lock);
          insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
+      } else if (ctx.chip_class == GFX6 &&
+                 instr->format == Format::MIMG &&
+                 instr->operands[1].regClass().type() == RegType::vgpr) {
+         ctx.exp_cnt++;
+         update_counters(ctx, event_vmem_gpr_lock);
+         insert_wait_entry(ctx, instr->operands[1], event_vmem_gpr_lock);
       }
+
       break;
    }
    case Format::SOPP: {
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 6734b9c98d3..8d08c416ef0 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2929,8 +2929,8 @@ void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr)
          }
 
          aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
-         mtbuf->operands[0] = vaddr_offset;
-         mtbuf->operands[1] = Operand(esgs_ring);
+         mtbuf->operands[0] = Operand(esgs_ring);
+         mtbuf->operands[1] = vaddr_offset;
          mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->es2gs_offset));
          mtbuf->operands[3] = Operand(elem);
          mtbuf->offen = !vaddr_offset.isUndefined();
@@ -3288,12 +3288,12 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
 
          if (use_mubuf) {
             Instruction *mubuf = bld.mubuf(opcode,
-                                           Definition(fetch_dst), fetch_index, list, soffset,
+                                           Definition(fetch_dst), list, fetch_index, soffset,
                                            fetch_offset, false, true).instr;
             static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
          } else {
             Instruction *mtbuf = bld.mtbuf(opcode,
-                                           Definition(fetch_dst), fetch_index, list, soffset,
+                                           Definition(fetch_dst), list, fetch_index, soffset,
                                            fetch_dfmt, nfmt, fetch_offset, false, true).instr;
             static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
          }
@@ -3487,8 +3487,8 @@ void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
             aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
             mubuf->definitions[0] = bld.def(v1);
             subelems[j] = mubuf->definitions[0].getTemp();
-            mubuf->operands[0] = Operand(offset);
-            mubuf->operands[1] = Operand(esgs_ring);
+            mubuf->operands[0] = Operand(esgs_ring);
+            mubuf->operands[1] = Operand(offset);
             mubuf->operands[2] = Operand(soffset);
             mubuf->offen = true;
             mubuf->offset = const_offset % 4096u;
@@ -3616,8 +3616,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
          lower = bld.tmp(v4);
          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
          mubuf->definitions[0] = Definition(lower);
-         mubuf->operands[0] = vaddr;
-         mubuf->operands[1] = Operand(rsrc);
+         mubuf->operands[0] = Operand(rsrc);
+         mubuf->operands[1] = vaddr;
          mubuf->operands[2] = soffset;
          mubuf->offen = (offset.type() == RegType::vgpr);
          mubuf->glc = glc;
@@ -3651,8 +3651,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
             unreachable("Load SSBO not implemented for this size.");
       }
       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-      mubuf->operands[0] = vaddr;
-      mubuf->operands[1] = Operand(rsrc);
+      mubuf->operands[0] = Operand(rsrc);
+      mubuf->operands[1] = vaddr;
       mubuf->operands[2] = soffset;
       mubuf->offen = (offset.type() == RegType::vgpr);
       mubuf->glc = glc;
@@ -4228,9 +4228,10 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo
                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
                   : 0;
 
-   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
-   load->operands[0] = Operand(coords);
-   load->operands[1] = Operand(fmask_desc_ptr);
+   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
+   load->operands[0] = Operand(fmask_desc_ptr);
+   load->operands[1] = Operand(s4); /* no sampler */
+   load->operands[2] = Operand(coords);
    load->definitions[0] = Definition(fmask);
    load->glc = false;
    load->dlc = false;
@@ -4374,8 +4375,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
          unreachable(">4 channel buffer image load");
       }
       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
-      load->operands[0] = Operand(vindex);
-      load->operands[1] = Operand(rsrc);
+      load->operands[0] = Operand(rsrc);
+      load->operands[1] = Operand(vindex);
       load->operands[2] = Operand((uint32_t) 0);
       Temp tmp;
       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
@@ -4407,9 +4408,10 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
    bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
    aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
 
-   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 2, 1)};
-   load->operands[0] = Operand(coords);
-   load->operands[1] = Operand(resource);
+   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
+   load->operands[0] = Operand(resource);
+   load->operands[1] = Operand(s4); /* no sampler */
+   load->operands[2] = Operand(coords);
    load->definitions[0] = Definition(tmp);
    load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
    load->dlc = load->glc && ctx->options->chip_class >= GFX10;
@@ -4455,8 +4457,8 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
          unreachable(">4 channel buffer image store");
       }
       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
-      store->operands[0] = Operand(vindex);
-      store->operands[1] = Operand(rsrc);
+      store->operands[0] = Operand(rsrc);
+      store->operands[1] = Operand(vindex);
       store->operands[2] = Operand((uint32_t) 0);
       store->operands[3] = Operand(data);
       store->idxen = true;
@@ -4476,11 +4478,10 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
 
-   aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 4, 0)};
-   store->operands[0] = Operand(coords);
-   store->operands[1] = Operand(resource);
-   store->operands[2] = Operand(s4);
-   store->operands[3] = Operand(data);
+   aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
+   store->operands[0] = Operand(resource);
+   store->operands[1] = Operand(data);
+   store->operands[2] = Operand(coords);
    store->glc = glc;
    store->dlc = false;
    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
@@ -4572,8 +4573,8 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
-      mubuf->operands[0] = Operand(vindex);
-      mubuf->operands[1] = Operand(resource);
+      mubuf->operands[0] = Operand(resource);
+      mubuf->operands[1] = Operand(vindex);
       mubuf->operands[2] = Operand((uint32_t)0);
       mubuf->operands[3] = Operand(data);
       if (return_previous)
@@ -4591,11 +4592,10 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
 
    Temp coords = get_image_coords(ctx, instr, type);
    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
-   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
-   mimg->operands[0] = Operand(coords);
-   mimg->operands[1] = Operand(resource);
-   mimg->operands[2] = Operand(s4); /* no sampler */
-   mimg->operands[3] = Operand(data);
+   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
+   mimg->operands[0] = Operand(resource);
+   mimg->operands[1] = Operand(data);
+   mimg->operands[2] = Operand(coords);
    if (return_previous)
       mimg->definitions[0] = Definition(dst);
    mimg->glc = return_previous;
@@ -4661,9 +4661,10 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
 
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
 
-   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
-   mimg->operands[0] = Operand(lod);
-   mimg->operands[1] = Operand(resource);
+   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
+   mimg->operands[0] = Operand(resource);
+   mimg->operands[1] = Operand(s4); /* no sampler */
+   mimg->operands[2] = Operand(lod);
    uint8_t& dmask = mimg->dmask;
    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
@@ -4823,8 +4824,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          }
       } else {
          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
-         store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
-         store->operands[1] = Operand(rsrc);
+         store->operands[0] = Operand(rsrc);
+         store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
          store->operands[3] = Operand(write_data);
          store->offset = start * elem_size_bytes;
@@ -4912,8 +4913,8 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    }
    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
-   mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
-   mubuf->operands[1] = Operand(rsrc);
+   mubuf->operands[0] = Operand(rsrc);
+   mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
    mubuf->operands[3] = Operand(data);
    if (return_previous)
@@ -5021,8 +5022,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
 
          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-         mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
-         mubuf->operands[1] = Operand(rsrc);
+         mubuf->operands[0] = Operand(rsrc);
+         mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
          mubuf->operands[2] = Operand(0u);
          mubuf->glc = glc;
          mubuf->dlc = false;
@@ -5202,8 +5203,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
 
          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
-         mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
-         mubuf->operands[1] = Operand(rsrc);
+         mubuf->operands[0] = Operand(rsrc);
+         mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
          mubuf->operands[2] = Operand(0u);
          mubuf->operands[3] = Operand(write_data);
          mubuf->glc = glc;
@@ -5360,8 +5361,8 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
 
       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
-      mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
-      mubuf->operands[1] = Operand(rsrc);
+      mubuf->operands[0] = Operand(rsrc);
+      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
       mubuf->operands[2] = Operand(0u);
       mubuf->operands[3] = Operand(data);
       if (return_previous)
@@ -5589,12 +5590,12 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
       case 8: {
          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
-                                bld.def(v4), offset, rsrc,
+                                bld.def(v4), rsrc, offset,
                                 ctx->program->scratch_offset, 0, true);
          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
                                                   aco_opcode::buffer_load_dwordx4,
                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
-                                offset, rsrc, ctx->program->scratch_offset, 16, true);
+                                rsrc, offset, ctx->program->scratch_offset, 16, true);
          emit_split_vector(ctx, lower, 2);
          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
@@ -5619,7 +5620,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
    }
 
-   bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
+   bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true);
    emit_split_vector(ctx, dst, instr->num_components);
 }
 
@@ -5680,7 +5681,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
       }
 
-      bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
    }
 }
 
@@ -5784,8 +5785,8 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst
             }
 
             aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
-            mtbuf->operands[0] = vaddr_offset;
-            mtbuf->operands[1] = Operand(gsvs_ring);
+            mtbuf->operands[0] = Operand(gsvs_ring);
+            mtbuf->operands[1] = vaddr_offset;
             mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
             mtbuf->operands[3] = Operand(ctx->outputs.outputs[i][j]);
             mtbuf->offen = !vaddr_offset.isUndefined();
@@ -6110,8 +6111,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
 
          aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
          load->definitions[0] = Definition(sample_pos);
-         load->operands[0] = Operand(addr);
-         load->operands[1] = Operand(rsrc);
+         load->operands[0] = Operand(rsrc);
+         load->operands[1] = Operand(addr);
          load->operands[2] = Operand(0u);
          load->offset = sample_pos_offset;
          load->offen = 0;
@@ -7340,9 +7341,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
       if (tmp_dst.id() == dst.id() && div_by_6)
          tmp_dst = bld.tmp(tmp_dst.regClass());
 
-      tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
-      tex->operands[0] = Operand(as_vgpr(ctx,lod));
-      tex->operands[1] = Operand(resource);
+      tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
+      tex->operands[0] = Operand(resource);
+      tex->operands[1] = Operand(s4); /* no sampler */
+      tex->operands[2] = Operand(as_vgpr(ctx,lod));
       if (ctx->options->chip_class == GFX9 &&
           instr->op == nir_texop_txs &&
           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
@@ -7380,9 +7382,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
    Temp tg4_compare_cube_wa64 = Temp();
 
    if (tg4_integer_workarounds) {
-      tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
-      tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
-      tex->operands[1] = Operand(resource);
+      tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
+      tex->operands[0] = Operand(resource);
+      tex->operands[1] = Operand(s4); /* no sampler */
+      tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
       tex->dim = dim;
       tex->dmask = 0x3;
       tex->da = da;
@@ -7537,8 +7540,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
 
       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-      mubuf->operands[0] = Operand(coords);
-      mubuf->operands[1] = Operand(resource);
+      mubuf->operands[0] = Operand(resource);
+      mubuf->operands[1] = Operand(coords);
       mubuf->operands[2] = Operand((uint32_t) 0);
       mubuf->definitions[0] = Definition(tmp_dst);
       mubuf->idxen = true;
@@ -7556,9 +7559,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        instr->op == nir_texop_fragment_fetch ||
        instr->op == nir_texop_fragment_mask_fetch) {
       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
-      tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
-      tex->operands[0] = Operand(arg);
-      tex->operands[1] = Operand(resource);
+      tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
+      tex->operands[0] = Operand(resource);
+      tex->operands[1] = Operand(s4); /* no sampler */
+      tex->operands[2] = Operand(arg);
       tex->dim = dim;
       tex->dmask = dmask;
       tex->unrm = true;
@@ -7644,9 +7648,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
    }
 
    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
-   tex->operands[0] = Operand(arg);
-   tex->operands[1] = Operand(resource);
-   tex->operands[2] = Operand(sampler);
+   tex->operands[0] = Operand(resource);
+   tex->operands[1] = Operand(sampler);
+   tex->operands[2] = Operand(arg);
    tex->dim = dim;
    tex->dmask = dmask;
    tex->da = da;
@@ -8753,8 +8757,8 @@ static void emit_stream_output(isel_context *ctx,
       }
 
       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
-      store->operands[0] = Operand(so_write_offset[buf]);
-      store->operands[1] = Operand(so_buffers[buf]);
+      store->operands[0] = Operand(so_buffers[buf]);
+      store->operands[1] = Operand(so_write_offset[buf]);
       store->operands[2] = Operand((uint32_t) 0);
       store->operands[3] = Operand(write_data);
       if (offset > 4095) {
@@ -9118,8 +9122,8 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
 
             aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
             mubuf->definitions[0] = bld.def(v1);
-            mubuf->operands[0] = Operand(voffset);
-            mubuf->operands[1] = Operand(gsvs_ring);
+            mubuf->operands[0] = Operand(gsvs_ring);
+            mubuf->operands[1] = Operand(voffset);
             mubuf->operands[2] = Operand(0u);
             mubuf->offen = true;
             mubuf->offset = const_offset;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 3f38e6aadae..1ccaf2a0158 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -781,8 +781,8 @@ struct DS_instruction : public Instruction {
 
 /**
  * Vector Memory Untyped-buffer Instructions
- * Operand(0): VADDR - Address source. Can carry an index and/or offset
- * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
+ * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
+ * Operand(1): VADDR - Address source. Can carry an index and/or offset
  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
  *
@@ -804,8 +804,8 @@ struct MUBUF_instruction : public Instruction {
 
 /**
  * Vector Memory Typed-buffer Instructions
- * Operand(0): VADDR - Address source. Can carry an index and/or offset
- * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
+ * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
+ * Operand(1): VADDR - Address source. Can carry an index and/or offset
  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
  *
@@ -827,10 +827,11 @@ struct MTBUF_instruction : public Instruction {
 
 /**
  * Vector Memory Image Instructions
- * Operand(0): VADDR - Address source. Can carry an offset or an index.
- * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
- * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
- * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
+ * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
+ * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
+ *             or VDATA - Vector GPR for write data.
+ * Operand(2): VADDR - Address source. Can carry an offset or an index.
+ * Definition(0): VDATA - Vector GPR for read result.
  *
  */
 struct MIMG_instruction : public Instruction {
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 9e606c880f4..9ef94d4f697 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -528,9 +528,9 @@ void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 }
 
 /* only covers special cases */
-bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
+bool alu_can_accept_constant(aco_opcode opcode, unsigned operand)
 {
-   switch (instr->opcode) {
+   switch (opcode) {
    case aco_opcode::v_interp_p2_f32:
    case aco_opcode::v_mac_f32:
    case aco_opcode::v_writelane_b32:
@@ -547,12 +547,6 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::v_readfirstlane_b32:
       return operand != 0;
    default:
-      if ((instr->format == Format::MUBUF ||
-           instr->format == Format::MIMG) &&
-          instr->definitions.size() == 1 &&
-          instr->operands.size() == 4) {
-         return operand != 3;
-      }
       return true;
    }
 }
@@ -719,7 +713,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
                break;
             }
          }
-         if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) && !instr->operands[i].isFixed() && can_accept_constant(instr, i)) {
+         if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) &&
+             !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
             instr->operands[i] = get_constant_op(ctx, info.val, info.is_constant_64bit());
             continue;
          }
@@ -754,7 +749,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
                static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true;
             continue;
          }
-         if ((info.is_constant() || info.is_constant_64bit()) && can_accept_constant(instr, i)) {
+         if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) {
             Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit());
             perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
             if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
@@ -780,9 +775,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
          while (info.is_temp())
             info = ctx.info[info.temp.id()];
 
-         if (mubuf->offen && i == 0 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
+         if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
             assert(!mubuf->idxen);
-            instr->operands[i] = Operand(v1);
+            instr->operands[1] = Operand(v1);
             mubuf->offset += info.val;
             mubuf->offen = false;
             continue;
@@ -790,9 +785,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
             instr->operands[2] = Operand((uint32_t) 0);
             mubuf->offset += info.val;
             continue;
-         } else if (mubuf->offen && i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
+         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
             assert(!mubuf->idxen);
-            instr->operands[i].setTemp(base);
+            instr->operands[1].setTemp(base);
             mubuf->offset += offset;
             continue;
          } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
@@ -2698,7 +2693,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
          continue;
       }
 
-      if (!can_accept_constant(instr, i))
+      if (!alu_can_accept_constant(instr->opcode, i))
          continue;
 
       if (ctx.uses[op.tempId()] < literal_uses) {
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 8370effdbc3..55c41dcdb06 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1532,11 +1532,14 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
          } else if (instr->opcode == aco_opcode::s_addk_i32 ||
                     instr->opcode == aco_opcode::s_mulk_i32) {
             instr->definitions[0].setFixed(instr->operands[0].physReg());
-         } else if ((instr->format == Format::MUBUF ||
-                   instr->format == Format::MIMG) &&
-                  instr->definitions.size() == 1 &&
-                  instr->operands.size() == 4) {
+         } else if (instr->format == Format::MUBUF &&
+                    instr->definitions.size() == 1 &&
+                    instr->operands.size() == 4) {
             instr->definitions[0].setFixed(instr->operands[3].physReg());
+         } else if (instr->format == Format::MIMG &&
+                    instr->definitions.size() == 1 &&
+                    instr->operands[1].regClass() == instr->definitions[0].regClass()) {
+            instr->definitions[0].setFixed(instr->operands[1].physReg());
          }
 
          ctx.defs_done.reset();
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index d5f2d913a65..5634a55766c 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -548,7 +548,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       if (current->isVMEM() == candidate->isVMEM()) {
          bool same_resource = true;
          if (current->isVMEM())
-            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
+            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
          bool can_reorder = can_reorder_vmem || can_reorder_candidate;
          int grab_dist = clause_insert_idx - candidate_idx;
          /* We can't easily tell how much this will decrease the def-to-use
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index bfa3c266a76..dfcdab2f003 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1575,9 +1575,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
                      split->definitions[i] = bld.def(v1);
                   bld.insert(split);
                   for (unsigned i = 0; i < temp.size(); i++)
-                     bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
+                     bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
                } else {
-                  bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
+                  bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, temp, offset, false);
                }
             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
                ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
@@ -1641,11 +1641,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
                   for (unsigned i = 0; i < def.size(); i++) {
                      Temp tmp = bld.tmp(v1);
                      vec->operands[i] = Operand(tmp);
-                     bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false);
+                     bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(), scratch_offset, offset + i * 4, false);
                   }
                   bld.insert(vec);
                } else {
-                  bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
+                  bld.mubuf(opcode, def, scratch_rsrc, Operand(), scratch_offset, offset, false);
                }
             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
                uint32_t spill_slot = sgpr_slot[spill_id];
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 8d69952c811..293ec32a330 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -99,8 +99,8 @@ void validate(Program* program, FILE * output)
                bool flat = instr->format == Format::FLAT || instr->format == Format::SCRATCH || instr->format == Format::GLOBAL;
                bool can_be_undef = is_phi(instr) || instr->format == Format::EXP ||
                                    instr->format == Format::PSEUDO_REDUCTION ||
-                                   (flat && i == 1) || (instr->format == Format::MIMG && i == 2) ||
-                                   ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 0);
+                                   (flat && i == 1) || (instr->format == Format::MIMG && i == 1) ||
+                                   ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 1);
                check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
             }
          }
@@ -229,15 +229,29 @@ void validate(Program* program, FILE * output)
             break;
          }
          case Format::MTBUF:
-         case Format::MUBUF:
-         case Format::MIMG: {
+         case Format::MUBUF: {
             check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
-            check(instr->operands[0].hasRegClass() && instr->operands[0].regClass().type() == RegType::vgpr,
+            check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
                   "VADDR must be in vgpr for VMEM instructions", instr.get());
-            check(instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get());
+            check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get());
             check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get());
             break;
          }
+         case Format::MIMG: {
+            check(instr->operands.size() == 3, "MIMG instructions must have exactly 3 operands", instr.get());
+            check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
+                  "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
+            if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr)
+               check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
+            else if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr)
+               check(instr->definitions.empty() || instr->definitions[0].regClass() == instr->operands[1].regClass(),
+                     "MIMG operands[1] (VDATA) must be the same as definitions[0] for atomics", instr.get());
+            check(instr->operands[2].hasRegClass() && instr->operands[2].regClass().type() == RegType::vgpr,
+                  "MIMG operands[2] (VADDR) must be VGPR", instr.get());
+            check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
+                  "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
+            break;
+         }
          case Format::DS: {
             for (const Operand& op : instr->operands) {
                check((op.isTemp() && op.regClass().type() == RegType::vgpr) || op.physReg() == m0,


