[Beignet] [PATCH V2 5/6] Backend: Add intel_sub_group_shuffle_down/up/xor

Yang, Rong R rong.r.yang at intel.com
Fri Jul 15 08:51:18 UTC 2016


The first 4 patches LGTM; I will push them.

For the intel_sub_group_shuffle_down/up implementation, we'd better handle it at the OpenCL C level.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Thursday, July 7, 2016 11:10
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: [Beignet] [PATCH V2 5/6] Backend: Add
> intel_sub_group_shuffle_down/up/xor
> 
> From: Pan Xiuli <xiuli.pan at intel.com>
> 
> Use a shuffle delta function for down/up, with some flags to switch between
> the current and the down/up src. The flags and index are precalculated in libocl.
> The shuffle delta only handles flag-masking the dst with a different src.
> Use the old shuffle with xor for shuffle_xor.
> 
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 65
> ++++++++++++++++++++++++++++++
>  backend/src/ir/instruction.cpp             | 44 ++++++++++++++++++++
>  backend/src/ir/instruction.hpp             |  9 +++++
>  backend/src/ir/instruction.hxx             |  1 +
>  backend/src/libocl/script/ocl_simd.def     |  9 +++++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 34 ++++++++++++++++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  9 +++++
>  backend/src/llvm/llvm_gen_backend.cpp      | 13 ++++++
>  backend/src/llvm/llvm_gen_ocl_function.hxx |  2 +
>  9 files changed, 186 insertions(+)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index e342161..7b646e0 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -6738,6 +6738,70 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>      DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
>    };
> 
> +  /*! SIMD shuffle delta pattern */
> +  DECL_PATTERN(SimdShuffleDeltaInstruction)
> +  {
> +    bool emitOne(Selection::Opaque &sel, const
> ir::SimdShuffleDeltaInstruction &insn, bool &markChildren) const
> +    {
> +      using namespace ir;
> +      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
> +      const GenRegister srcx = sel.selReg(insn.getSrc(0), TYPE_U32);
> +      const GenRegister srcy = sel.selReg(insn.getSrc(1), TYPE_U32);
> +      const GenRegister index = sel.selReg(insn.getSrc(2), TYPE_U32);
> +      const GenRegister inRange = sel.selReg(insn.getSrc(3), TYPE_U32);
> +      const GenRegister constZero = GenRegister::immud(0);;
> +      const GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> +      bool hasShiftL = false;
> +
> +      sel.push();
> +      sel.curr.predicate = GEN_PREDICATE_NONE;
> +      /* First shuffle for srcx */
> +      if (sel.isScalarReg(insn.getSrc(0))) {
> +        sel.MOV(dst, srcx);
> +      } else {
> +        if (index.file == GEN_IMMEDIATE_VALUE) {
> +          sel.push();
> +          uint32_t offset = index.value.ud % sel.curr.execWidth;
> +          GenRegister reg = GenRegister::subphysicaloffset(srcx, offset);
> +          reg.vstride = GEN_VERTICAL_STRIDE_0;
> +          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
> +          reg.width = GEN_WIDTH_1;
> +          sel.MOV(dst, reg);
> +          sel.push();
> +        } else {
> +          sel.SHL(shiftL, index, GenRegister::immud(0x2));
> +          hasShiftL = true;
> +          sel.SIMD_SHUFFLE(dst, srcx, shiftL);
> +        }
> +      }
> +      sel.curr.flag = 0;
> +      sel.curr.subFlag = 1;
> +      sel.CMP(GEN_CONDITIONAL_EQ, inRange, constZero);
> +      sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +      /* Now shuffle for srcy */
> +      if (sel.isScalarReg(insn.getSrc(1))) {
> +        sel.MOV(dst, srcy);
> +      } else {
> +        if (index.file == GEN_IMMEDIATE_VALUE) {
> +          sel.push();
> +          uint32_t offset = index.value.ud % sel.curr.execWidth;
> +          GenRegister reg = GenRegister::subphysicaloffset(srcy, offset);
> +          reg.vstride = GEN_VERTICAL_STRIDE_0;
> +          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
> +          reg.width = GEN_WIDTH_1;
> +          sel.MOV(dst, reg);
> +          sel.pop();
> +        } else {
> +          if (!hasShiftL)
> +            sel.SHL(shiftL, index, GenRegister::immud(0x2));
> +          sel.SIMD_SHUFFLE(dst, srcy, shiftL);
> +        }
> +      }
> +      sel.pop();
> +      return true;
> +    }
> +    DECL_CTOR(SimdShuffleDeltaInstruction, 1, 1);  };
> 
>    /*! Sort patterns */
>    INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1)
> { @@ -6782,6 +6846,7 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>      this->insert<PrintfInstructionPattern>();
>      this->insert<MediaBlockReadInstructionPattern>();
>      this->insert<MediaBlockWriteInstructionPattern>();
> +    this->insert<SimdShuffleDeltaInstructionPattern>();
> 
>      // Sort all the patterns with the number of instructions they output
>      for (uint32_t op = 0; op < ir::OP_INVALID; ++op) diff --git
> a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index
> ed64580..a274626 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -1136,6 +1136,35 @@ namespace ir {
>        uint8_t vec_size;
>      };
> 
> +    class ALIGNED_INSTRUCTION SimdShuffleDeltaInstruction :
> +      public BasePolicy,
> +      public TupleSrcPolicy<SimdShuffleDeltaInstruction >,
> +      public NDstPolicy<SimdShuffleDeltaInstruction, 1>
> +    {
> +    public:
> +
> +      INLINE SimdShuffleDeltaInstruction(Register dst, Tuple srcTuple, uint8_t
> srcNum) {
> +        this->opcode = OP_SIMD_SHUFFLE_DELTA;
> +        this->dst[0] = dst;
> +        this->src = srcTuple;
> +        this->srcNum = srcNum;
> +      }
> +      INLINE bool wellFormed(const Function &fn, std::string &why) const;
> +      INLINE void out(std::ostream &out, const Function &fn) const {
> +        this->outOpcode(out);
> +        out << " %" << this->getDst(fn, 0);
> +        out << " {";
> +        for (uint32_t i = 0; i < srcNum; ++i)
> +          out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : "");
> +        out << "}";
> +      }
> +
> +      Tuple src;
> +      Register dst[1];
> +      uint8_t srcNum;
> +    };
> +
> +
>  #undef ALIGNED_INSTRUCTION
> 
>      /////////////////////////////////////////////////////////////////////////
> @@ -1679,6 +1708,14 @@ namespace ir {
>        return true;
>      }
> 
> +    INLINE bool SimdShuffleDeltaInstruction::wellFormed(const Function &fn,
> std::string &whyNot) const {
> +      if (this->srcNum != 4) {
> +        whyNot = "Wrong number of source.";
> +        return false;
> +      }
> +      return true;
> +    }
> +
>  #undef CHECK_TYPE
> 
>      /////////////////////////////////////////////////////////////////////////
> @@ -2154,6 +2191,10 @@
> START_INTROSPECTION(MediaBlockWriteInstruction)
>  #include "ir/instruction.hxx"
>  END_INTROSPECTION(MediaBlockWriteInstruction)
> 
> +START_INTROSPECTION(SimdShuffleDeltaInstruction)
> +#include "ir/instruction.hxx"
> +END_INTROSPECTION(SimdShuffleDeltaInstruction)
> +
>  #undef END_INTROSPECTION
>  #undef START_INTROSPECTION
>  #undef DECL_INSN
> @@ -2691,6 +2732,9 @@ DECL_MEM_FN(MemInstruction, void,
> setBtiReg(Register reg), setBtiReg(reg))
>      return internal::MediaBlockWriteInstruction(imageIndex, srcTuple,
> srcNum, vec_size).convert();
>    }
> 
> +  Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t
> srcNum) {
> +    return internal::SimdShuffleDeltaInstruction(dst, srcTuple,
> + srcNum).convert();  }
> 
>    std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
>      const Function &fn = insn.getFunction(); diff --git
> a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index
> b2b0b49..7ee59a2 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -653,6 +653,13 @@ namespace ir {
>      uint8_t getVectorSize() const;
>    };
> 
> +  /*! simd shuffle */
> +  class SimdShuffleDeltaInstruction : public Instruction {
> +  public:
> +    /*! Return true if the given instruction is an instance of this class */
> +    static bool isClassOf(const Instruction &insn);  };
> +
>    /*! Specialize the instruction. Also performs typechecking first based on the
>     *  opcode. Crashes if it fails
>     */
> @@ -889,6 +896,8 @@ namespace ir {
>    Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple
> coord, uint8_t srcNum);
>    /*! media block write */
>    Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum,
> uint8_t vec_size);
> +  /*! sub_group_shuffle_delta dst srctupel */  Instruction
> + SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum);
>  } /* namespace ir */
>  } /* namespace gbe */
> 
> diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
> index 7d755ae..35f9623 100644
> --- a/backend/src/ir/instruction.hxx
> +++ b/backend/src/ir/instruction.hxx
> @@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction)
> DECL_INSN(PRINTF, PrintfInstruction)  DECL_INSN(MBREAD,
> MediaBlockReadInstruction)  DECL_INSN(MBWRITE,
> MediaBlockWriteInstruction)
> +DECL_INSN(SIMD_SHUFFLE_DELTA, SimdShuffleDeltaInstruction)
> diff --git a/backend/src/libocl/script/ocl_simd.def
> b/backend/src/libocl/script/ocl_simd.def
> index e26243e..aa47735 100644
> --- a/backend/src/libocl/script/ocl_simd.def
> +++ b/backend/src/libocl/script/ocl_simd.def
> @@ -2,3 +2,12 @@
>  floatn intel_sub_group_shuffle(floatn x, uint c)  intn
> intel_sub_group_shuffle(intn x, uint c)  uintn intel_sub_group_shuffle(uintn
> x, uint c)
> +floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c) intn
> +intel_sub_group_shuffle_down(intn x, intn y, uint c) uintn
> +intel_sub_group_shuffle_down(uintn x, uintn y, uint c) floatn
> +intel_sub_group_shuffle_up(floatn x, floatn y, uint c) intn
> +intel_sub_group_shuffle_up(intn x, intn y, uint c) uintn
> +intel_sub_group_shuffle_up(uintn x, uintn y, uint c) floatn
> +intel_sub_group_shuffle_xor(floatn x, uint c) intn
> +intel_sub_group_shuffle_xor(intn x, uint c) uintn
> +intel_sub_group_shuffle_xor(uintn x, uint c)
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> index b066502..6aee94e 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> @@ -18,6 +18,7 @@
> 
>  #include "ocl_simd.h"
>  #include "ocl_workitem.h"
> +#include "ocl_as.h"
> 
>  uint get_max_sub_group_size(void)
>  {
> @@ -216,3 +217,36 @@ OVERLOADABLE void
> intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 dat  {
>    __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);  }
> +
> +PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint
> +c, uint inRange); static OVERLOADABLE INLINE uint as_uint(uint x) {
> +  return x;
> +}
> +#define SHUFFLE_DOWN(TYPE) \
> +OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c)
> +{ \
> +  uint inRange = ((int)c + (int)get_sub_group_local_id() <
> +get_max_sub_group_size()); \
> +  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x),
> +as_uint(y), (get_sub_group_local_id() + c) % get_max_sub_group_size(),
> +inRange ));\ }
> +SHUFFLE_DOWN(float)
> +SHUFFLE_DOWN(int)
> +SHUFFLE_DOWN(uint)
> +#undef SHUFFLE_DOWN
> +
> +#define SHUFFLE_UP(TYPE) \
> +OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) {
> +\
> +  uint inRange = ((int)c - (int)get_sub_group_local_id() > 0); \
> +  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x),
> +as_uint(y), (get_max_sub_group_size() + get_sub_group_local_id() - c) %
> +get_max_sub_group_size(), inRange ));\ }
> +SHUFFLE_UP(float)
> +SHUFFLE_UP(int)
> +SHUFFLE_UP(uint)
> +#undef SHUFFLE_UP
> +#define SHUFFLE_XOR(TYPE) \
> +OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
> +  return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) %
> +get_max_sub_group_size()); \ }
> +SHUFFLE_XOR(float)
> +SHUFFLE_XOR(int)
> +SHUFFLE_XOR(uint)
> +#undef SHUFFLE_XOR
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> index 799f772..15da0e7 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> @@ -132,6 +132,15 @@ OVERLOADABLE double
> sub_group_scan_exclusive_max(double x);  OVERLOADABLE float
> intel_sub_group_shuffle(float x, uint c);  OVERLOADABLE int
> intel_sub_group_shuffle(int x, uint c);  OVERLOADABLE uint
> intel_sub_group_shuffle(uint x, uint c);
> +OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint
> +c); OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint
> +c); OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint
> +c); OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y,
> +uint c); OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint
> +c); OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint
> +c); OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
> +OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
> +OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
> 
>  /* blocak read/write */
>  OVERLOADABLE uint intel_sub_group_block_read(const global uint* p); diff
> --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 41cb783..8f0bcea 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -3733,6 +3733,7 @@ namespace gbe
>        case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
>        case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
>        case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
> +      case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
>          this->newRegister(&I);
>          break;
>        case GEN_OCL_PRINTF:
> @@ -4897,6 +4898,18 @@ namespace gbe
>              this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
>            case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
>              this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
> +          case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
> +          {
> +            vector<ir::Register> srcTupleData;
> +            for (uint32_t i = 0; i < 4; ++i) {
> +              srcTupleData.push_back(this->getRegister(*AI));
> +              ++AI;
> +            }
> +            const ir::Register dst = this->getRegister(&I);
> +            ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], 4);
> +            ctx.SIMD_SHUFFLE_DELTA(dst, tuple, 4);
> +            break;
> +          }
>            default: break;
>          }
>        }
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 48a72d1..dbd25b0 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -234,5 +234,7 @@
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2,
> __gen_ocl_sub_group_block_w
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4,
> __gen_ocl_sub_group_block_write_image4)
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8,
> __gen_ocl_sub_group_block_write_image8)
> 
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SHUFFLE_DELTA,
> +__gen_ocl_sub_group_shuffle_delta)
> +
>  // common function
>  DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
> --
> 2.7.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list