[Beignet] [PATCH 3/3] add ocl 2.0 work_group_barrier support.

Ruiling Song ruiling.song at intel.com
Fri Mar 4 08:33:41 UTC 2016


to do an image barrier, we need to:
1. flush L3 RW cache.
2. do a barrier gateway.
3. flush sampler cache.

Note the fence argument maybe ORed together.
We need to support non-immediate barrier() argument.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen8_encoder.cpp       | 24 ++++++++++++++++++++++++
 backend/src/backend/gen8_encoder.hpp       |  2 ++
 backend/src/backend/gen8_instruction.hpp   |  6 +++++-
 backend/src/backend/gen9_context.cpp       |  9 +++++++--
 backend/src/backend/gen_context.cpp        | 11 ++++++++---
 backend/src/backend/gen_defs.hpp           |  1 +
 backend/src/backend/gen_encoder.cpp        |  8 ++++++--
 backend/src/backend/gen_encoder.hpp        |  3 ++-
 backend/src/ir/instruction.cpp             |  3 ++-
 backend/src/ir/instruction.hpp             |  6 ++++--
 backend/src/libocl/include/ocl_sync.h      |  1 +
 backend/src/libocl/src/ocl_barrier.ll      | 27 ++-------------------------
 backend/src/libocl/src/ocl_sync.cl         |  1 -
 backend/src/llvm/llvm_gen_backend.cpp      | 28 ++++++++++++++++++++++++++--
 backend/src/llvm/llvm_gen_ocl_function.hxx |  2 +-
 15 files changed, 91 insertions(+), 41 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 9af8cee..a71e44a 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -494,6 +494,30 @@ namespace gbe
 
     this->setSrc1(&insn, GenRegister::immd(jip*8));
   }
+  void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+    this->setHeader(insn);
+    this->setDst(insn, dst);
+    this->setSrc0(insn, dst);
+    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
+    gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+    gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1;
+    gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0;
+  }
+
+  void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, GenRegister::ud8grf(0,0));
+     unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH;
+     unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64;
+     setSamplerMessage(insn, 0, 0, msg_type,
+                       1, 1,
+                       true,
+                       simd_mode, 0);
+  }
 
   void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index d83cde5..81a2e1f 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -38,6 +38,7 @@ namespace gbe
 
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
+    virtual void FENCE(GenRegister dst, bool flushRWCache);
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
     virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
     virtual void F16TO32(GenRegister dest, GenRegister src0);
@@ -59,6 +60,7 @@ namespace gbe
     virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                       unsigned char msg_type, uint32_t msg_length,
                                       bool header_present);
+    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
     virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
     virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
     virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index b45376d..67e5dc5 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -540,7 +540,11 @@ union Gen8NativeInstruction
       /*! Memory fence */
       struct {
         uint32_t bti:8;
-        uint32_t pad:5;
+        uint32_t pad:1;
+        uint32_t flush_instruction:1;
+        uint32_t flush_texture:1;
+        uint32_t flush_constant:1;
+        uint32_t flush_rw:1;
         uint32_t commit_enable:1;
         uint32_t msg_type:4;
         uint32_t pad2:1;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 326f5a1..aa0c174 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -34,9 +34,10 @@ namespace gbe
     const GenRegister fenceDst = ra->genReg(insn.dst(0));
     uint32_t barrierType = insn.extra.barrierType;
     const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
 
-    if (barrierType == ir::syncGlobalBarrier) {
-      p->FENCE(fenceDst);
+    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+      p->FENCE(fenceDst, imageFence);
       p->MOV(fenceDst, fenceDst);
     }
     p->push();
@@ -53,5 +54,9 @@ namespace gbe
       // Now we wait for the other threads
       p->WAIT();
     p->pop();
+    if (imageFence) {
+      p->FLUSH_SAMPLERCACHE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
   }
 }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 57adaf1..cdfd98e 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1834,9 +1834,10 @@ namespace gbe
     const GenRegister fenceDst = ra->genReg(insn.dst(0));
     uint32_t barrierType = insn.extra.barrierType;
     const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
 
-    if (barrierType == ir::syncGlobalBarrier) {
-      p->FENCE(fenceDst);
+    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+      p->FENCE(fenceDst, imageFence);
       p->MOV(fenceDst, fenceDst);
     }
     p->push();
@@ -1853,11 +1854,15 @@ namespace gbe
       // Now we wait for the other threads
       p->WAIT();
     p->pop();
+    if (imageFence) {
+      p->FLUSH_SAMPLERCACHE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
   }
 
   void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
-    p->FENCE(dst);
+    p->FENCE(dst, false);
     p->MOV(dst, dst);
   }
 
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 60ebdd6..271aaaf 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -424,6 +424,7 @@ enum GenMessageTarget {
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD           7
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO      10
+#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH          0x1f
 
 /* for GEN5 only */
 #define GEN_SAMPLER_SIMD_MODE_SIMD4X2                   0
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index d3e1936..8ca4a5e 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,7 @@ namespace gbe
                                         unsigned msg_length, unsigned response_length,
                                         bool header_present, bool end_of_thread)
   {
-     setSrc1(inst, GenRegister::immd(0));
+     setSrc1(inst, GenRegister::immud(0));
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
@@ -957,7 +957,7 @@ namespace gbe
      insn->bits3.msg_gateway.notify = notifyN;
   }
 
-  void GenEncoder::FENCE(GenRegister dst) {
+  void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setHeader(insn);
     this->setDst(insn, dst);
@@ -1226,6 +1226,10 @@ namespace gbe
                        header_present,
                        simd_mode, return_format);
   }
+  void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+    // only Gen8+ support flushing sampler cache
+    assert(0);
+  }
 
   void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
   {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index a4bf90c..d30d55c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -144,7 +144,7 @@ namespace gbe
     /*! Forward the gateway message. */
     void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
     /*! Memory fence message (to order loads and stores between threads) */
-    void FENCE(GenRegister dst);
+    virtual void FENCE(GenRegister dst, bool flushRWCache);
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! IF indexed instruction */
@@ -216,6 +216,7 @@ namespace gbe
                            bool header_present,
                            uint32_t simd_mode,
                            uint32_t return_format);
+    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
 
     /*! TypedWrite instruction for texture */
     virtual void TYPED_WRITE(GenRegister header,
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 415c255..9b3f0cd 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1202,7 +1202,8 @@ namespace ir {
                                  SYNC_LOCAL_READ_FENCE |
                                  SYNC_LOCAL_WRITE_FENCE |
                                  SYNC_GLOBAL_READ_FENCE |
-                                 SYNC_GLOBAL_WRITE_FENCE;
+                                 SYNC_GLOBAL_WRITE_FENCE |
+                                 SYNC_IMAGE_FENCE;
       if (UNLIKELY(this->parameters > maxParams)) {
         whyNot = "Invalid parameters for sync instruction";
         return false;
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 1017c19..467192c 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -503,17 +503,19 @@ namespace ir {
     SYNC_LOCAL_WRITE_FENCE  = 1<<2,
     SYNC_GLOBAL_READ_FENCE  = 1<<3,
     SYNC_GLOBAL_WRITE_FENCE = 1<<4,
-    SYNC_INVALID            = 1<<5
+    SYNC_IMAGE_FENCE        = 1<<5,
+    SYNC_INVALID            = 1<<6
   };
 
   /*! 5 bits to encode all possible synchronization capablities */
-  static const uint32_t syncFieldNum = 5u;
+  static const uint32_t syncFieldNum = 7u;
 
   /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
   static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
 
   /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
   static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
+  static const uint32_t syncImageBarrier =  SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | SYNC_IMAGE_FENCE;
 
   /*! Sync instructions are used to order loads and stores for a given memory
    *  space and/or to serialize threads at a given point in the program
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index 2ea1183..ec5bb8c 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -24,6 +24,7 @@
 // Synchronization functions
 /////////////////////////////////////////////////////////////////////////////
 OVERLOADABLE void barrier(cl_mem_fence_flags flags);
+#define work_group_barrier barrier
 void mem_fence(cl_mem_fence_flags flags);
 void read_mem_fence(cl_mem_fence_flags flags);
 void write_mem_fence(cl_mem_fence_flags flags);
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 31087ba..009bd21 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -11,32 +11,9 @@ declare i32 @_get_local_mem_fence() nounwind alwaysinline
 declare i32 @_get_global_mem_fence() nounwind alwaysinline
 declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
-declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
 
 define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
-  %1 = icmp eq i32 %flags, 3
-  br i1 %1, label %barrier_local_global, label %barrier_local_check
-
-barrier_local_global:
-  call void @__gen_ocl_barrier_local_and_global()
-  br label %done
-
-barrier_local_check:
-  %2 = icmp eq i32 %flags, 1
-  br i1 %2, label %barrier_local, label %barrier_global_check
-
-barrier_local:
-  call void @__gen_ocl_barrier_local()
-  br label %done
-
-barrier_global_check:
-  %3 = icmp eq i32 %flags, 2
-  br i1 %3, label %barrier_global, label %done
-
-barrier_global:
-  call void @__gen_ocl_barrier_global()
-  br label %done
-
-done:
+  call void @__gen_ocl_barrier(i32 %flags)
   ret void
 }
diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl
index f4459f4..c679d51 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -20,7 +20,6 @@
 
 void __gen_ocl_barrier_local(void);
 void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
 
 void mem_fence(cl_mem_fence_flags flags) {
 }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 32f8fe3..03988f3 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3522,7 +3522,7 @@ namespace gbe
       case GEN_OCL_FORCE_SIMD16:
       case GEN_OCL_LBARRIER:
       case GEN_OCL_GBARRIER:
-      case GEN_OCL_LGBARRIER:
+      case GEN_OCL_BARRIER:
         ctx.getFunction().setUseSLM(true);
         break;
       case GEN_OCL_WRITE_IMAGE_I:
@@ -4191,7 +4191,31 @@ namespace gbe
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
-          case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+          case GEN_OCL_BARRIER:
+          {
+            Constant *CPV = dyn_cast<Constant>(*AI);
+            unsigned syncFlag = 0;
+            if (CPV) {
+              const ir::Immediate &x = processConstantImm(CPV);
+              unsigned barrierArg = x.getIntegerValue();
+              if (barrierArg & 0x1) {
+                syncFlag |= ir::syncLocalBarrier;
+              }
+              if (barrierArg & 0x2) {
+                syncFlag |= ir::syncGlobalBarrier;
+              }
+              if (barrierArg & 0x4) {
+                syncFlag |= ir::syncImageBarrier;
+              }
+            } else {
+              // FIXME we default it to do global fence and barrier.
+              // we need to do runtime check here.
+              syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier;
+            }
+
+            ctx.SYNC(syncFlag);
+            break;
+          }
           case GEN_OCL_ATOMIC_ADD0:
           case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
           case GEN_OCL_ATOMIC_SUB0:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 92d4ea3..040e1f5 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -27,7 +27,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
 DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
-DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
+DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier)
 
 // To force SIMD8/16 compilation
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
-- 
2.4.1



More information about the Beignet mailing list