[Beignet] [PATCH 3/7] GBE: add ocl 2.0 work_group_barrier support.

Song, Ruiling ruiling.song at intel.com
Tue Apr 5 05:56:39 UTC 2016


Hi Xiuli,

Yes, it is good to keep the number as 6, and I find that I forgot to modify syncStr.
If there is no any other issue, I will send v2 to fix it.

Thanks!
Ruiling
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Tuesday, April 5, 2016 1:33 PM
> To: Song, Ruiling <ruiling.song at intel.com>
> Cc: beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 3/7] GBE: add ocl 2.0 work_group_barrier support.
> 
> Hi Ruiling,
> 
> The patch set LGTM, only a small problem about syncFieldNum.
> 
> Thanks
> Xiuli
> 
> 
> On Fri, Apr 01, 2016 at 02:53:24PM +0800, Ruiling Song wrote:
> > to do an image barrier, we need to:
> > 1. flush L3 RW cache.
> > 2. do a barrier gateway.
> > 3. flush sampler cache.
> >
> > Note the fence argument maybe ORed together.
> > We need to support non-immediate barrier() argument in future.
> >
> > Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> > ---
> >  backend/src/backend/gen8_encoder.cpp       | 24
> ++++++++++++++++++++++++
> >  backend/src/backend/gen8_encoder.hpp       |  2 ++
> >  backend/src/backend/gen8_instruction.hpp   |  6 +++++-
> >  backend/src/backend/gen9_context.cpp       |  9 +++++++--
> >  backend/src/backend/gen_context.cpp        | 11 ++++++++---
> >  backend/src/backend/gen_defs.hpp           |  1 +
> >  backend/src/backend/gen_encoder.cpp        |  8 ++++++--
> >  backend/src/backend/gen_encoder.hpp        |  3 ++-
> >  backend/src/ir/instruction.cpp             |  3 ++-
> >  backend/src/ir/instruction.hpp             |  6 ++++--
> >  backend/src/libocl/include/ocl_sync.h      |  1 +
> >  backend/src/libocl/src/ocl_barrier.ll      | 27 ++-------------------------
> >  backend/src/libocl/src/ocl_sync.cl         |  1 -
> >  backend/src/llvm/llvm_gen_backend.cpp      | 28
> ++++++++++++++++++++++++++--
> >  backend/src/llvm/llvm_gen_ocl_function.hxx |  2 +-
> >  15 files changed, 91 insertions(+), 41 deletions(-)
> >
> > diff --git a/backend/src/backend/gen8_encoder.cpp
> b/backend/src/backend/gen8_encoder.cpp
> > index 9af8cee..a71e44a 100644
> > --- a/backend/src/backend/gen8_encoder.cpp
> > +++ b/backend/src/backend/gen8_encoder.cpp
> > @@ -494,6 +494,30 @@ namespace gbe
> >
> >      this->setSrc1(&insn, GenRegister::immd(jip*8));
> >    }
> > +  void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) {
> > +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> > +    Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
> > +    this->setHeader(insn);
> > +    this->setDst(insn, dst);
> > +    this->setSrc0(insn, dst);
> > +    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
> > +    gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
> > +    gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1;
> > +    gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0;
> > +  }
> > +
> > +  void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
> > +     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> > +     this->setHeader(insn);
> > +     this->setDst(insn, dst);
> > +     this->setSrc0(insn, GenRegister::ud8grf(0,0));
> > +     unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH;
> > +     unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64;
> > +     setSamplerMessage(insn, 0, 0, msg_type,
> > +                       1, 1,
> > +                       true,
> > +                       simd_mode, 0);
> > +  }
> >
> >    void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
> >      Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
> > diff --git a/backend/src/backend/gen8_encoder.hpp
> b/backend/src/backend/gen8_encoder.hpp
> > index d83cde5..81a2e1f 100644
> > --- a/backend/src/backend/gen8_encoder.hpp
> > +++ b/backend/src/backend/gen8_encoder.hpp
> > @@ -38,6 +38,7 @@ namespace gbe
> >
> >      /*! Jump indexed instruction */
> >      virtual void JMPI(GenRegister src, bool longjmp = false);
> > +    virtual void FENCE(GenRegister dst, bool flushRWCache);
> >      /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump
> distance */
> >      virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
> >      virtual void F16TO32(GenRegister dest, GenRegister src0);
> > @@ -59,6 +60,7 @@ namespace gbe
> >      virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned
> char bti,
> >                                        unsigned char msg_type, uint32_t msg_length,
> >                                        bool header_present);
> > +    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
> >      virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
> >      virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
> >      virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
> > diff --git a/backend/src/backend/gen8_instruction.hpp
> b/backend/src/backend/gen8_instruction.hpp
> > index b45376d..67e5dc5 100644
> > --- a/backend/src/backend/gen8_instruction.hpp
> > +++ b/backend/src/backend/gen8_instruction.hpp
> > @@ -540,7 +540,11 @@ union Gen8NativeInstruction
> >        /*! Memory fence */
> >        struct {
> >          uint32_t bti:8;
> > -        uint32_t pad:5;
> > +        uint32_t pad:1;
> > +        uint32_t flush_instruction:1;
> > +        uint32_t flush_texture:1;
> > +        uint32_t flush_constant:1;
> > +        uint32_t flush_rw:1;
> >          uint32_t commit_enable:1;
> >          uint32_t msg_type:4;
> >          uint32_t pad2:1;
> > diff --git a/backend/src/backend/gen9_context.cpp
> b/backend/src/backend/gen9_context.cpp
> > index 326f5a1..aa0c174 100644
> > --- a/backend/src/backend/gen9_context.cpp
> > +++ b/backend/src/backend/gen9_context.cpp
> > @@ -34,9 +34,10 @@ namespace gbe
> >      const GenRegister fenceDst = ra->genReg(insn.dst(0));
> >      uint32_t barrierType = insn.extra.barrierType;
> >      const GenRegister barrierId = ra-
> >genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> > +    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
> >
> > -    if (barrierType == ir::syncGlobalBarrier) {
> > -      p->FENCE(fenceDst);
> > +    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
> > +      p->FENCE(fenceDst, imageFence);
> >        p->MOV(fenceDst, fenceDst);
> >      }
> >      p->push();
> > @@ -53,5 +54,9 @@ namespace gbe
> >        // Now we wait for the other threads
> >        p->WAIT();
> >      p->pop();
> > +    if (imageFence) {
> > +      p->FLUSH_SAMPLERCACHE(fenceDst);
> > +      p->MOV(fenceDst, fenceDst);
> > +    }
> >    }
> >  }
> > diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> > index e55c25e..be78c02 100644
> > --- a/backend/src/backend/gen_context.cpp
> > +++ b/backend/src/backend/gen_context.cpp
> > @@ -1834,9 +1834,10 @@ namespace gbe
> >      const GenRegister fenceDst = ra->genReg(insn.dst(0));
> >      uint32_t barrierType = insn.extra.barrierType;
> >      const GenRegister barrierId = ra-
> >genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> > +    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
> >
> > -    if (barrierType == ir::syncGlobalBarrier) {
> > -      p->FENCE(fenceDst);
> > +    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
> > +      p->FENCE(fenceDst, imageFence);
> >        p->MOV(fenceDst, fenceDst);
> >      }
> >      p->push();
> > @@ -1853,11 +1854,15 @@ namespace gbe
> >        // Now we wait for the other threads
> >        p->WAIT();
> >      p->pop();
> > +    if (imageFence) {
> > +      p->FLUSH_SAMPLERCACHE(fenceDst);
> > +      p->MOV(fenceDst, fenceDst);
> > +    }
> >    }
> >
> >    void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
> >      const GenRegister dst = ra->genReg(insn.dst(0));
> > -    p->FENCE(dst);
> > +    p->FENCE(dst, false);
> >      p->MOV(dst, dst);
> >    }
> >
> > diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> > index 60ebdd6..271aaaf 100644
> > --- a/backend/src/backend/gen_defs.hpp
> > +++ b/backend/src/backend/gen_defs.hpp
> > @@ -424,6 +424,7 @@ enum GenMessageTarget {
> >  #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
> >  #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD           7
> >  #define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO      10
> > +#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH          0x1f
> >
> >  /* for GEN5 only */
> >  #define GEN_SAMPLER_SIMD_MODE_SIMD4X2                   0
> > diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> > index d3e1936..8ca4a5e 100644
> > --- a/backend/src/backend/gen_encoder.cpp
> > +++ b/backend/src/backend/gen_encoder.cpp
> > @@ -203,7 +203,7 @@ namespace gbe
> >                                          unsigned msg_length, unsigned response_length,
> >                                          bool header_present, bool end_of_thread)
> >    {
> > -     setSrc1(inst, GenRegister::immd(0));
> > +     setSrc1(inst, GenRegister::immud(0));
> >       inst->bits3.generic_gen5.header_present = header_present;
> >       inst->bits3.generic_gen5.response_length = response_length;
> >       inst->bits3.generic_gen5.msg_length = msg_length;
> > @@ -957,7 +957,7 @@ namespace gbe
> >       insn->bits3.msg_gateway.notify = notifyN;
> >    }
> >
> > -  void GenEncoder::FENCE(GenRegister dst) {
> > +  void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) {
> >      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> >      this->setHeader(insn);
> >      this->setDst(insn, dst);
> > @@ -1226,6 +1226,10 @@ namespace gbe
> >                         header_present,
> >                         simd_mode, return_format);
> >    }
> > +  void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
> > +    // only Gen8+ support flushing sampler cache
> > +    assert(0);
> > +  }
> >
> >    void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present,
> unsigned char bti)
> >    {
> > diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> > index a4bf90c..d30d55c 100644
> > --- a/backend/src/backend/gen_encoder.hpp
> > +++ b/backend/src/backend/gen_encoder.hpp
> > @@ -144,7 +144,7 @@ namespace gbe
> >      /*! Forward the gateway message. */
> >      void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
> >      /*! Memory fence message (to order loads and stores between threads) */
> > -    void FENCE(GenRegister dst);
> > +    virtual void FENCE(GenRegister dst, bool flushRWCache);
> >      /*! Jump indexed instruction */
> >      virtual void JMPI(GenRegister src, bool longjmp = false);
> >      /*! IF indexed instruction */
> > @@ -216,6 +216,7 @@ namespace gbe
> >                             bool header_present,
> >                             uint32_t simd_mode,
> >                             uint32_t return_format);
> > +    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
> >
> >      /*! TypedWrite instruction for texture */
> >      virtual void TYPED_WRITE(GenRegister header,
> > diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> > index 415c255..9b3f0cd 100644
> > --- a/backend/src/ir/instruction.cpp
> > +++ b/backend/src/ir/instruction.cpp
> > @@ -1202,7 +1202,8 @@ namespace ir {
> >                                   SYNC_LOCAL_READ_FENCE |
> >                                   SYNC_LOCAL_WRITE_FENCE |
> >                                   SYNC_GLOBAL_READ_FENCE |
> > -                                 SYNC_GLOBAL_WRITE_FENCE;
> > +                                 SYNC_GLOBAL_WRITE_FENCE |
> > +                                 SYNC_IMAGE_FENCE;
> >        if (UNLIKELY(this->parameters > maxParams)) {
> >          whyNot = "Invalid parameters for sync instruction";
> >          return false;
> > diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
> > index 1017c19..467192c 100644
> > --- a/backend/src/ir/instruction.hpp
> > +++ b/backend/src/ir/instruction.hpp
> > @@ -503,17 +503,19 @@ namespace ir {
> >      SYNC_LOCAL_WRITE_FENCE  = 1<<2,
> >      SYNC_GLOBAL_READ_FENCE  = 1<<3,
> >      SYNC_GLOBAL_WRITE_FENCE = 1<<4,
> > -    SYNC_INVALID            = 1<<5
> > +    SYNC_IMAGE_FENCE        = 1<<5,
> > +    SYNC_INVALID            = 1<<6
> >    };
> >
> >    /*! 5 bits to encode all possible synchronization capablities */
> > -  static const uint32_t syncFieldNum = 5u;
> > +  static const uint32_t syncFieldNum = 7u;
> 
> It seem this should be 6u if you only added one SYNC_IMAGE_FENCE
> 
> >
> >    /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
> >    static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC
> |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
> >
> >    /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
> >    static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC |
> SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
> > +  static const uint32_t syncImageBarrier =  SYNC_WORKGROUP_EXEC |
> SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE |
> SYNC_IMAGE_FENCE;
> >
> >    /*! Sync instructions are used to order loads and stores for a given memory
> >     *  space and/or to serialize threads at a given point in the program
> > diff --git a/backend/src/libocl/include/ocl_sync.h
> b/backend/src/libocl/include/ocl_sync.h
> > index a2f0764..1487353 100644
> > --- a/backend/src/libocl/include/ocl_sync.h
> > +++ b/backend/src/libocl/include/ocl_sync.h
> > @@ -27,5 +27,6 @@ OVERLOADABLE void barrier(cl_mem_fence_flags flags);
> >  OVERLOADABLE void mem_fence(cl_mem_fence_flags flags);
> >  OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags);
> >  OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags);
> > +#define work_group_barrier barrier
> >  cl_mem_fence_flags get_fence(void *ptr);
> >  #endif  /* __OCL_SYNC_H__ */
> > diff --git a/backend/src/libocl/src/ocl_barrier.ll
> b/backend/src/libocl/src/ocl_barrier.ll
> > index 31087ba..009bd21 100644
> > --- a/backend/src/libocl/src/ocl_barrier.ll
> > +++ b/backend/src/libocl/src/ocl_barrier.ll
> > @@ -11,32 +11,9 @@ declare i32 @_get_local_mem_fence() nounwind
> alwaysinline
> >  declare i32 @_get_global_mem_fence() nounwind alwaysinline
> >  declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
> >  declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
> > -declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline
> noduplicate
> > +declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
> >
> >  define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
> > -  %1 = icmp eq i32 %flags, 3
> > -  br i1 %1, label %barrier_local_global, label %barrier_local_check
> > -
> > -barrier_local_global:
> > -  call void @__gen_ocl_barrier_local_and_global()
> > -  br label %done
> > -
> > -barrier_local_check:
> > -  %2 = icmp eq i32 %flags, 1
> > -  br i1 %2, label %barrier_local, label %barrier_global_check
> > -
> > -barrier_local:
> > -  call void @__gen_ocl_barrier_local()
> > -  br label %done
> > -
> > -barrier_global_check:
> > -  %3 = icmp eq i32 %flags, 2
> > -  br i1 %3, label %barrier_global, label %done
> > -
> > -barrier_global:
> > -  call void @__gen_ocl_barrier_global()
> > -  br label %done
> > -
> > -done:
> > +  call void @__gen_ocl_barrier(i32 %flags)
> >    ret void
> >  }
> > diff --git a/backend/src/libocl/src/ocl_sync.cl
> b/backend/src/libocl/src/ocl_sync.cl
> > index b85db3d..cac5cfd 100644
> > --- a/backend/src/libocl/src/ocl_sync.cl
> > +++ b/backend/src/libocl/src/ocl_sync.cl
> > @@ -20,7 +20,6 @@
> >
> >  void __gen_ocl_barrier_local(void);
> >  void __gen_ocl_barrier_global(void);
> > -void __gen_ocl_barrier_local_and_global(void);
> >
> >  OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
> >  }
> > diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> > index a746d80..2b6875c 100644
> > --- a/backend/src/llvm/llvm_gen_backend.cpp
> > +++ b/backend/src/llvm/llvm_gen_backend.cpp
> > @@ -3654,7 +3654,7 @@ namespace gbe
> >        case GEN_OCL_FORCE_SIMD16:
> >        case GEN_OCL_LBARRIER:
> >        case GEN_OCL_GBARRIER:
> > -      case GEN_OCL_LGBARRIER:
> > +      case GEN_OCL_BARRIER:
> >          ctx.getFunction().setUseSLM(true);
> >          break;
> >        case GEN_OCL_WRITE_IMAGE_I:
> > @@ -4344,7 +4344,31 @@ namespace gbe
> >            case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
> >            case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
> >            case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
> > -          case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier |
> ir::syncGlobalBarrier); break;
> > +          case GEN_OCL_BARRIER:
> > +          {
> > +            Constant *CPV = dyn_cast<Constant>(*AI);
> > +            unsigned syncFlag = 0;
> > +            if (CPV) {
> > +              const ir::Immediate &x = processConstantImm(CPV);
> > +              unsigned barrierArg = x.getIntegerValue();
> > +              if (barrierArg & 0x1) {
> > +                syncFlag |= ir::syncLocalBarrier;
> > +              }
> > +              if (barrierArg & 0x2) {
> > +                syncFlag |= ir::syncGlobalBarrier;
> > +              }
> > +              if (barrierArg & 0x4) {
> > +                syncFlag |= ir::syncImageBarrier;
> > +              }
> > +            } else {
> > +              // FIXME we default it to do global fence and barrier.
> > +              // we need to do runtime check here.
> > +              syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier;
> > +            }
> > +
> > +            ctx.SYNC(syncFlag);
> > +            break;
> > +          }
> >            case GEN_OCL_ATOMIC_ADD0:
> >            case GEN_OCL_ATOMIC_ADD1: this-
> >emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
> >            case GEN_OCL_ATOMIC_SUB0:
> > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> > index 09feb1a..a47afdf 100644
> > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> > @@ -30,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
> >  // Barrier function
> >  DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
> >  DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
> > -DECL_LLVM_GEN_FUNCTION(LGBARRIER,
> __gen_ocl_barrier_local_and_global)
> > +DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier)
> >
> >  // To force SIMD8/16 compilation
> >  DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
> > --
> > 2.4.1
> >
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list