[Beignet] [PATCH V2 15/15] Backend: Add A64 subgroup block read/write support

Xiuli Pan xiuli.pan at intel.com
Wed Oct 19 06:37:24 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

For GEN8+ with OpenCL 2.0 we use stateless buffers, so block reads and writes
must use A64 (64-bit address) messages. Add A64 encoders for OWord block read and write.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen8_encoder.cpp     |  70 +++++++++++++++++++++
 backend/src/backend/gen8_encoder.hpp     |   4 ++
 backend/src/backend/gen8_instruction.hpp |  13 ++++
 backend/src/backend/gen_context.cpp      | 103 ++++++++++++++++++++++++-------
 backend/src/backend/gen_defs.hpp         |   3 +
 backend/src/backend/gen_encoder.cpp      |   8 +++
 backend/src/backend/gen_encoder.hpp      |   4 ++
 backend/src/ir/instruction.cpp           |   8 +--
 8 files changed, 184 insertions(+), 29 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 277260f..2f69116 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -637,4 +637,74 @@ namespace gbe
     gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
     gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
   }
+
+  static void setOBlockRWA64(GenEncoder *p, // encoder used to fill the common message descriptor
+                             GenNativeInstruction *insn, // SEND instruction whose bits3 descriptor is written
+                             uint32_t bti, // stored in the descriptor's bti field
+                             uint32_t size, // block-size selector: must be 0, 1, 2, 4 or 8
+                             uint32_t msg_type, // callers pass GEN8_P1_BLOCK_READ_A64 or GEN8_P1_BLOCK_WRITE_A64
+                             uint32_t msg_length, // forwarded to setMessageDescriptor
+                             uint32_t response_length) // forwarded to setMessageDescriptor
+  { // Fills the gen8_block_rw_a64 descriptor fields of an A64 OWord block read/write message.
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+    Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+
+    gen8_insn->bits3.gen8_block_rw_a64.msg_type = msg_type;
+    gen8_insn->bits3.gen8_block_rw_a64.bti = bti;
+    // Reads use sub-type 1 (unaligned OWord block read); every other message type gets sub-type 0
+    gen8_insn->bits3.gen8_block_rw_a64.msg_sub_type = msg_type == GEN8_P1_BLOCK_READ_A64 ? 1 : 0;
+    gen8_insn->bits3.gen8_block_rw_a64.block_size = size <=  2 ? size : (size == 4 ? 3 : 4); // 0->0, 1->1, 2->2, 4->3, 8->4
+    gen8_insn->bits3.gen8_block_rw_a64.header_present = 1; // these messages always carry a header
+  }
+
+  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // Emit an A64 OWord block read of `size` OWords into dst.
+   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1; // only the header register is sent
+    uint32_t rsize = size / 2; // register count: one register holds two OWords
+    uint32_t msgsize = size; // selector handed to setOBlockRWA64
+    // A 1-OWord read fills only half a register, so the selector encodes which half of dst is targeted
+    if (size == 1) {
+      if (dst.subnr == 0)
+        msgsize = 0; // dst starts at sub-register 0
+      else
+        msgsize = 1; // dst starts at a non-zero sub-register
+    }
+    rsize = rsize == 0 ? 1 : rsize; // a 1-OWord read still returns one register
+    const uint32_t response_length = rsize; // Size is in regs
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); // destination is addressed by register number only (subnr 0)
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // message payload starts at the header register
+    this->setSrc1(insn, GenRegister::immud(0));
+    setOBlockRWA64(this,
+                   insn,
+                   bti,
+                   msgsize,
+                   GEN8_P1_BLOCK_READ_A64,
+                   msg_length,
+                   response_length);
+
+  }
+
+  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t size) { // Emit an A64 OWord block write of `size` OWords; payload starts at the header register.
+   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t rsize = size / 2; // data register count: one register holds two OWords
+    rsize = rsize == 0 ? 1 : rsize; // a 1-OWord write still sends one data register
+    const uint32_t msg_length = 1 + rsize; // In registers: one header register plus rsize data registers
+    const uint32_t response_length = 0; // no response registers are requested
+    uint32_t msgsize = size; // selector handed to setOBlockRWA64
+    msgsize = msgsize == 1 ? 0 : msgsize; // a single OWord is always sent as selector 0 (presumably the low half - confirm against the PRM)
+    this->setHeader(insn);
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination
+    setOBlockRWA64(this,
+                   insn,
+                   bti,
+                   msgsize,
+                   GEN8_P1_BLOCK_WRITE_A64,
+                   msg_length,
+                   response_length);
+   }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 12b3765..b0aec3a 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -71,6 +71,10 @@ namespace gbe
                        uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc);
     void MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
               uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc);
+    /*! A64 OWord block read into dst; elemSize is the size in OWords (named `size` in the definition) */
+    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! A64 OWord block write; elemSize is the size in OWords (named `size` in the definition) */
+    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
   };
 }
 #endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 549948a..e76ecaa 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -604,6 +604,19 @@ union Gen8NativeInstruction
         uint32_t end_of_thread:1;
       } gen7_msg_gw;
 
+    struct { // 32-bit message descriptor layout for A64 block read/write messages
+        uint32_t bti:8; // binding table index
+        uint32_t block_size:3; // encoded OWord block size selector
+        uint32_t msg_sub_type:2; // 00 OWord block R/W, 01 unaligned OWord block read, 10 OWord dual block R/W, 11 HWord block R/W
+        uint32_t ignored:1;
+        uint32_t msg_type:5;  // 10100 A64 block read, 10101 A64 block write
+        uint32_t header_present:1; // set when the message carries a header
+        uint32_t response_length:5; // response length field
+        uint32_t msg_length:4; // message length field
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen8_block_rw_a64;
+
       struct {
         uint32_t jip:32;
       } gen8_branch;
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 6bb0f22..e10d89b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3502,14 +3502,20 @@ namespace gbe
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst= ra->genReg(insn.dst(1));
+    const GenRegister addrreg = ra->genReg(insn.src(0));
     uint32_t type = dst.type;
     uint32_t typesize = typeSize(type);
-    const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
     const uint32_t vec_size = insn.extra.elem;
     const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
     const uint32_t simdWidth = p->curr.execWidth;
+    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
+    GenRegister headeraddr;
+    bool isA64 = insn.getbti() == 255;
+    if (isA64)
+      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+    else
+      headeraddr = GenRegister::offset(header, 0, 2*4);
 
     // Make header
     p->push();
@@ -3525,7 +3531,9 @@ namespace gbe
       p->MOV(headeraddr, addr);
 
       // Put zero in the general state base address
-      p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+      if (!isA64)
+        p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+
     }
     p->pop();
     // Now read the data, oword block read can only work with simd16 and no mask
@@ -3534,7 +3542,12 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
+        if (isA64) {
+          //p->curr.execWidth = 8;
+          p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
+        }
+        else
+          p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
       }
       p->pop();
     } else if (vec_size == 2) {
@@ -3542,7 +3555,10 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
+        if (isA64)
+          p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
+        else
+          p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
       }
       p->pop();
       p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
@@ -3553,7 +3569,10 @@ namespace gbe
         {
           p->curr.execWidth = 16;
           p->curr.noMask = 1;
-          p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
+          if (isA64)
+            p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
+          else
+            p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
         }
         p->pop();
         for (uint32_t j = 0; j < 4; j++)
@@ -3569,7 +3588,10 @@ namespace gbe
             }
             p->pop();
           }
-          p->OBREAD(tmp, header, insn.getbti(), 8);
+          if (isA64)
+            p->OBREADA64(tmp, header, insn.getbti(), 8);
+          else
+            p->OBREAD(tmp, header, insn.getbti(), 8);
           for (uint32_t j = 0; j < 8 / typesize ; j++)
             p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
         }
@@ -3590,7 +3612,10 @@ namespace gbe
           {
             p->curr.execWidth = 16;
             p->curr.noMask = 1;
-            p->OBREAD(tmp, header, insn.getbti(), 8);
+            if (isA64)
+              p->OBREADA64(tmp, header, insn.getbti(), 8);
+            else
+              p->OBREAD(tmp, header, insn.getbti(), 8);
           }
           p->pop();
           for (uint32_t j = 0; j < 16 / typesize; j++)
@@ -3607,7 +3632,10 @@ namespace gbe
             }
             p->pop();
           }
-          p->OBREAD(tmp, header, insn.getbti(), 8);
+          if (isA64)
+            p->OBREADA64(tmp, header, insn.getbti(), 8);
+          else
+            p->OBREAD(tmp, header, insn.getbti(), 8);
           for (uint32_t j = 0; j < 8 / typesize; j++)
             p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
         }
@@ -3616,16 +3644,23 @@ namespace gbe
   }
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+    const GenRegister addrreg = ra->genReg(insn.src(0));
     const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
     uint32_t type = ra->genReg(insn.src(1)).type;
     uint32_t typesize = typeSize(type);
     const uint32_t vec_size = insn.extra.elem;
     const GenRegister tmp = GenRegister::offset(header, 1);
+    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
+    GenRegister headeraddr;
+    bool isA64 = insn.getbti() == 255;
+    if (isA64)
+      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+    else
+      headeraddr = GenRegister::offset(header, 0, 2*4);
     const uint32_t simdWidth = p->curr.execWidth;
     uint32_t tmp_size = simdWidth * vec_size / 8;
     tmp_size = tmp_size > 4 ? 4 : tmp_size;
+    uint32_t offset_size = isA64 ? 128 : 8;
 
     p->push();
       // Copy r0 into the header first
@@ -3636,10 +3671,14 @@ namespace gbe
 
       // Update the header with the current address
       p->curr.execWidth = 1;
-      p->SHR(headeraddr, addr, GenRegister::immud(4));
+      if (isA64)
+        p->MOV(headeraddr, addr);
+      else
+        p->SHR(headeraddr, addr, GenRegister::immud(4));
 
       // Put zero in the general state base address
-      p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
+      if (!isA64)
+        p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
 
     p->pop();
     // Now write the data, oword block write can only work with simd16 and no mask
@@ -3649,7 +3688,10 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
+        if (isA64)
+          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
+        else
+          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
       }
       p->pop();
     } else if (vec_size == 2) {
@@ -3659,7 +3701,10 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
+        if (isA64)
+          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
+        else
+          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
       }
       p->pop();
     } else if (vec_size == 4) {
@@ -3670,7 +3715,10 @@ namespace gbe
         {
           p->curr.execWidth = 16;
           p->curr.noMask = 1;
-          p->OBWRITE(header, insn.getbti(), 2 * typesize);
+          if (isA64)
+            p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
+          else
+            p->OBWRITE(header, insn.getbti(), 2 * typesize);
         }
         p->pop();
       } else {
@@ -3682,11 +3730,14 @@ namespace gbe
             {
               // Update the address in header
               p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
             }
             p->pop();
           }
-          p->OBWRITE(header, insn.getbti(), 8);
+          if (isA64)
+            p->OBWRITEA64(header, insn.getbti(), 8);
+          else
+            p->OBWRITE(header, insn.getbti(), 8);
         }
       }
     } else if (vec_size == 8) {
@@ -3699,7 +3750,7 @@ namespace gbe
             {
               // Update the address in header
               p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
             }
             p->pop();
           }
@@ -3707,7 +3758,10 @@ namespace gbe
           {
             p->curr.execWidth = 16;
             p->curr.noMask = 1;
-            p->OBWRITE(header, insn.getbti(), 8);
+            if (isA64)
+              p->OBWRITEA64(header, insn.getbti(), 8);
+            else
+              p->OBWRITE(header, insn.getbti(), 8);
           }
           p->pop();
         }
@@ -3720,11 +3774,14 @@ namespace gbe
             {
               // Update the address in header
               p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
             }
             p->pop();
           }
-          p->OBWRITE(header, insn.getbti(), 8);
+          if (isA64)
+            p->OBWRITEA64(header, insn.getbti(), 8);
+          else
+            p->OBWRITE(header, insn.getbti(), 8);
         }
       }
     } else NOT_SUPPORTED;
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index bcbb23f..de88e11 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -357,6 +357,9 @@ enum GenMessageTarget {
 #define GEN75_P1_ATOMIC_COUNTER_4X2    12 //1100: Atomic Counter Operation 4X2
 #define GEN75_P1_TYPED_SURFACE_WRITE   13 //1101: Typed Surface Write
 
+#define GEN8_P1_BLOCK_READ_A64        20 //10100: A64 block read message type
+#define GEN8_P1_BLOCK_WRITE_A64       21 //10101: A64 block write message type
+
 /* Data port data cache scratch messages*/
 #define GEN_SCRATCH_READ                  0
 #define GEN_SCRATCH_WRITE                 1
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index a6f8db8..5d5f564 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1338,6 +1338,14 @@ namespace gbe
                 response_length);
   }
 
+  void GenEncoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize) { // Base encoder stub; Gen8Encoder overrides this.
+    NOT_SUPPORTED; // the base encoder provides no A64 block read
+  }
+
+  void GenEncoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize) { // Base encoder stub; Gen8Encoder overrides this.
+    NOT_SUPPORTED; // the base encoder provides no A64 block write
+  }
+
   void GenEncoder::EOT(uint32_t msg) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0f835ca..963c811 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -275,6 +275,10 @@ namespace gbe
     virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
     /*! MBlock write */
     virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! A64 OWord block read; the base implementation is NOT_SUPPORTED and Gen8Encoder overrides it */
+    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! A64 OWord block write; the base implementation is NOT_SUPPORTED and Gen8Encoder overrides it */
+    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
 
     GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 512055c..e722dbe 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1652,12 +1652,8 @@ namespace ir {
             whyNot = "Wrong number of source.";
             return false;
           } else {
-            const RegisterFamily fam = fn.getPointerFamily();
-            for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
-              const Register regID = fn.getRegister(src, srcID);
-              if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
-                return false;
-            }
+            if (UNLIKELY(checkRegisterData(FAMILY_DWORD, fn.getRegister(src, 1), fn, whyNot) == false))
+              return false;
           }
           break;
         default:
-- 
2.7.4



More information about the Beignet mailing list