[Beignet] [PATCH 2/2] Chv: Add chv backend support.

Yang Rong rong.r.yang at intel.com
Sun Mar 29 20:23:56 PDT 2015


The CHV (Cherryview) backend is almost the same as the BDW (Broadwell) backend, but CHV adds some restrictions on long (64-bit) register usage:
1. ARF registers must never be used with a 64-bit datatype.
2. Source and destination horizontal strides must be aligned to the same qword.
3. Source and destination offsets must be the same, except in the case of a scalar source.

Add ChvContext in gen8_context.cpp to handle these restrictions. The CHV encoder is the same as Gen8Encoder.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp       | 147 +++++++++++++++++++++++++++--
 backend/src/backend/gen8_context.hpp       |  23 +++++
 backend/src/backend/gen_insn_selection.cpp |  47 ++++++++-
 backend/src/backend/gen_insn_selection.hpp |   7 ++
 backend/src/backend/gen_program.cpp        |   2 +-
 5 files changed, 216 insertions(+), 10 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 920eb3e..283e362 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -331,7 +331,7 @@ namespace gbe
       return GenRegister::unpacked_ud(reg.nr, reg.subnr + offset);
   }
 
-  static void calculateFullU64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+  void Gen8Context::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                   GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
   {
     src0.type = src1.type = GEN_TYPE_UD;
@@ -377,7 +377,7 @@ namespace gbe
     p->ADD(dst_h, dst_h, tmp);
   }
 
-  static void calculateFullS64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+  void Gen8Context::calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                   GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs, 
                                   GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg)
   {
@@ -395,7 +395,7 @@ namespace gbe
     s0_abs.type = s1_abs.type = GEN_TYPE_L;
     p->MOV(s0_abs, GenRegister::abs(src0));
     p->MOV(s1_abs, GenRegister::abs(src1));
-    calculateFullU64MUL(p, s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
+    calculateFullU64MUL(s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
 
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
@@ -432,11 +432,11 @@ namespace gbe
 
     if(src0.type == GEN_TYPE_UL) {
       GBE_ASSERT(src1.type == GEN_TYPE_UL);
-      calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
     } else {
       GBE_ASSERT(src0.type == GEN_TYPE_L);
       GBE_ASSERT(src1.type == GEN_TYPE_L);
-      calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
                           tmp1, sign, flagReg);
     }
   }
@@ -461,7 +461,7 @@ namespace gbe
       GBE_ASSERT(src2.type == GEN_TYPE_UL);
       dst_l.type = dst_h.type = GEN_TYPE_UL;
       tmp0.type = tmp1.type = GEN_TYPE_UL;
-      calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
 
       /* Inplement the logic:
       dst_l += src2;
@@ -496,7 +496,7 @@ namespace gbe
       GBE_ASSERT(src1.type == GEN_TYPE_L);
       GBE_ASSERT(src2.type == GEN_TYPE_L);
 
-      calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
                           tmp1, sign, flagReg);
 
       GenRegister sum = sign;
@@ -904,4 +904,137 @@ namespace gbe
     memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);
   }
 
+  void ChvContext::newSelection(void) { // Create the CHV-specific instruction selector (SelectionChv) for this context.
+    this->sel = GBE_NEW(SelectionChv, *this);
+  }
+
+  void ChvContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                             GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+  { // Emit a full unsigned 64x64 multiply: dst_h / dst_l receive the high / low 64 bits of the product.
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst_h.type = dst_l.type = GEN_TYPE_UL;
+    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+
+    // s0l_s1h and s0h_s1l are scratch registers; s0h, s1h and tmp below alias s0l_s1h and dst_l.
+
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    GenRegister s0h = unpacked_ud(s0l_s1h); // s0h is only needed before s0l_s1h is written, so it reuses s0l_s1h
+    GenRegister s1h = unpacked_ud(dst_l); // s1h is only needed before dst_l is written, so it reuses dst_l
+    // Stage the high dwords of both sources (4 bytes past the low dwords) in the aliased temporaries.
+    p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
+    p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
+
+    /* High 32 bits X High 32 bits. */
+    p->MUL(dst_h, s0h, s1h);
+    /* High 32 bits X low 32 bits. */
+    p->MUL(s0h_s1l, s0h, s1l);
+    /* Low 32 bits X high 32 bits. */
+    p->MUL(s0l_s1h, s0l, s1h);
+    /* Low 32 bits X low 32 bits. */
+    p->MUL(dst_l, s0l, s1l);
+
+    /*  The largest possible 32x32 product is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), with N = 32.
+        Adding two 32-bit integers to it gives at most
+        2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1,
+        so adding s0l_s1h's low 32 bits and dst_l's high 32 bits to the product s0h_s1l cannot
+        overflow and produces no carry.
+        This way we avoid using the acc register, which has a lot of restrictions. */
+
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+    // Fold the upper 32 bits of s0l*s1h into dst_h.
+    p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32));
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+    p->ADD(dst_h, dst_h, s0l_s1h_h);
+    // Add the upper dword of dst_l (from s0l*s1l) to the middle sum, staged through s0l_s1h.
+    GenRegister dst_l_h = unpacked_ud(s0l_s1h);
+    p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+
+    // s0l_s1h is no longer needed; reuse it as tmp.
+    GenRegister tmp = s0l_s1h;
+    // The low 32 bits of the middle sum become the upper dword of dst_l.
+    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+    p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked);
+    // The bits of the middle sum above bit 31 are added into dst_h.
+    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
+    p->ADD(dst_h, dst_h, tmp);
+  }
+
+  void ChvContext::emitI64MULInstruction(const SelectionInstruction &insn)
+  { // Emit the low 64 bits of src(0) * src(1) into dst(0); dst(1) is used as a scratch register.
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister res = ra->genReg(insn.dst(1));
+    // Sources are read as dwords; the destination and the scratch register are unsigned 64-bit.
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst.type = GEN_TYPE_UL;
+    res.type = GEN_TYPE_UL;
+
+    /* Low 32 bits X low 32 bits. */
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    p->MUL(dst, s0l, s1l);
+
+    /* Low 32 bits X high 32 bits. */
+    GenRegister s1h = unpacked_ud(res);
+    p->MOV(s1h, unpacked_ud(src1, 1));
+    // The cross term only contributes to bits 32..63 of the result, hence the shift by 32.
+    p->MUL(res, s0l, s1h);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+
+    /* High 32 bits X low 32 bits. */
+    GenRegister s0h = unpacked_ud(res);
+    p->MOV(s0h, unpacked_ud(src0, 1));
+    // Second cross term, shifted and accumulated the same way as the first.
+    p->MUL(res, s0h, s1l);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+  }
+
+  void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) { // Bring the a0 registers to new_a0 (sz == 0 means all 16 entries) and cache the new values.
+    int16_t diff = new_a0[0] - this->a0[0];
+    if (sz == 0)
+      sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false; // set when the entries do not all move by the same delta
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[i] < 4096); // range-check entry i itself, not entry 0 again
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(this->a0[0] + diff < 4096 && this->a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) { // uniform delta within range: emit nothing, keep the cached a0
+      return;
+    } else if (!need_reset && sz == 16) { // uniform delta over all 16 entries: a single 16-wide ADD of the delta
+      p->push();
+      p->curr.execWidth = 16;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else { // otherwise rewrite the entries pairwise: two 16-bit values packed into each dword MOV
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+            GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(uint16_t)*sz); // record what a0 now holds
+  }
+
 }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b296a3d..8827955 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -82,6 +82,29 @@ namespace gbe
     virtual void newSelection(void);
     void packLongVec(GenRegister unpacked, GenRegister packed, uint32_t simd);
     void unpackLongVec(GenRegister packed, GenRegister unpacked, uint32_t simd);
+    void calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                             GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs,
+                             GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+  };
+
+  class ChvContext : public Gen8Context // Gen8 context specialised for CHV (Cherryview) devices
+  {
+  public:
+    virtual ~ChvContext(void) { }
+    ChvContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : Gen8Context(unit, name, deviceID, relaxMath) {
+    };
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn); // CHV version of the 64-bit multiply emitter
+
+  protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0); // CHV version of the a0 update
+
+  private:
+    virtual void newSelection(void); // creates a SelectionChv
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l); // CHV version of the full unsigned 64x64 multiply
   };
 }
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7f9c95a..67a1d95 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -361,6 +361,8 @@ namespace gbe
     void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
     bool hasLongType() const { return bHasLongType; }
     void setHasLongType(bool b) { bHasLongType = b; }
+    bool hasLongRegRestrict() { return bLongRegRestrict; }
+    void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
     void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }
     uint32_t getLdMsgOrder()  const { return ldMsgOrder; }
     /*! indicate whether a register is a scalar/uniform register. */
@@ -662,6 +664,7 @@ namespace gbe
     uint16_t currAuxLabel;
     bool bHas32X32Mul;
     bool bHasLongType;
+    bool bLongRegRestrict;
     uint32_t ldMsgOrder;
     INLINE ir::LabelIndex newAuxLabel()
     {
@@ -702,7 +705,7 @@ namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false), bHasLongType(false), ldMsgOrder(LD_MSG_ORDER_IVB)
+    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -1860,6 +1863,12 @@ namespace gbe
     this->opaque->setHasLongType(true);
   }
 
+  SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) { // CHV selector: sets the three capability flags below
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLongRegRestrict(true); // enables the conversion path guarded by hasLongRegRestrict()
+  }
+
   Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
@@ -4030,7 +4039,41 @@ namespace gbe
             sel.MOV(dst, unpacked);
           }
         }
-      } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+      }   else if (sel.hasLongType() && sel.hasLongRegRestrict() && dstFamily == FAMILY_QWORD && srcFamily != FAMILY_QWORD) {
+        // Convert i32/i16/i8/float to i64/double if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+        GenRegister unpacked;
+        GenRegister unpacked_src = src;
+
+        sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+
+          if(srcType == ir::TYPE_FLOAT) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+          } else if(srcFamily == FAMILY_DWORD) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D);
+          } else if(srcFamily == FAMILY_WORD) {
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+          } else if(srcFamily == FAMILY_BYTE) {
+            GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+            tmp = GenRegister::retype(tmp, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+            sel.MOV(tmp, src);
+            unpacked_src = tmp;
+          } else
+            GBE_ASSERT(0);
+
+          sel.MOV(unpacked, unpacked_src);
+        sel.pop();
+        sel.MOV(dst, unpacked);
+      }else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
                  (src.isdf() && dstType == ir::TYPE_FLOAT)) { // float and double conversion
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 6a08180..ee5e46f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -294,6 +294,13 @@ namespace gbe
       Selection8(GenContext &ctx);
   };
 
+  class SelectionChv: public Selection // instruction selector created by ChvContext::newSelection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      SelectionChv(GenContext &ctx);
+  };
+
   class Selection9: public Selection
   {
     public:
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index f53d5fb..c761a2f 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -167,7 +167,7 @@ namespace gbe {
     } else if (IS_BROADWELL(deviceID)) {
       ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
     } else if (IS_CHERRYVIEW(deviceID)) {
-      ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
+      ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
     } else if (IS_SKYLAKE(deviceID)) {
       ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
     }
-- 
2.1.0



More information about the Beignet mailing list