[Beignet] [Patch v2] GBE: Add a customized loop unrolling handling mechanism.

Zhigang Gong zhigang.gong at intel.com
Tue Oct 7 21:58:59 PDT 2014


By default, the unrolling threshold is relatively small.
Thus some relative large loops which access private array
will not be unrolled, thus those private array can't
be scalarized latter. And the private array is allocated
in stack which is extreme slow for Gen backend currently.

To increase the unrolling threshold for all loops is not
a good idea, as most of the loops don't need to do unrolling
for this purpose and a large unrolling threshold will cause
a big code size and unecessary big register pressure which
may lead to register spilling.

So this patch introduce a trade-off pass to identify those
loops which still have private load/store in the outer most
of the loop. Then add a metadata to it to indicate aggresive
unrolling on those loops. Then do another round loop unrolling.

This patch with the previous small patch, can bring significant
performance improvement for some cases. I just tested with some
opencv test cases, and observed it can bring 2x to 10x improvement.

v2:
refine the parent loop unroll analysis method.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/CMakeLists.txt            |   1 +
 backend/src/llvm/llvm_gen_backend.hpp |   4 +
 backend/src/llvm/llvm_to_gen.cpp      |  12 +-
 backend/src/llvm/llvm_unroll.cpp      | 230 ++++++++++++++++++++++++++++++++++
 4 files changed, 245 insertions(+), 2 deletions(-)
 create mode 100644 backend/src/llvm/llvm_unroll.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index e57227a..e408365 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -84,6 +84,7 @@ set (GBE_SRC
     llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
+    llvm/llvm_unroll.cpp
     llvm/llvm_to_gen.hpp
     backend/gen/gen_mesa_disasm.c
     backend/gen_insn_selection.cpp
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index f73aafe..76b60df 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -27,6 +27,7 @@
 #define __GBE_LLVM_GEN_BACKEND_HPP__
 
 #include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "sys/platform.hpp"
 #include "sys/map.hpp"
 #include "sys/hash_map.hpp"
@@ -98,6 +99,9 @@ namespace gbe
   /*! Passer the printf function call. */
   llvm::FunctionPass* createPrintfParserPass();
 
+  /* customized loop unrolling pass. */
+  llvm::LoopPass *createCustomLoopUnrollPass();
+
   /*! Add all the function call of ocl to our bitcode. */
   llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath);
 
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 26d2a49..6fbf41f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -152,9 +152,17 @@ namespace gbe
     MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
     MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
     MPM.add(createLoopDeletionPass());          // Delete dead loops
-    MPM.add(createLoopUnrollPass());          // Unroll small loops
-    if(optLevel > 0)
+    MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+    if(optLevel > 0) {
+      MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+      MPM.add(createGVNPass());                 // Remove redundancies
+    }
+    MPM.add(createCustomLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+    MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+    if(optLevel > 0) {
+      MPM.add(createSROAPass(/*RequiresDomTree*/ false));
       MPM.add(createGVNPass());                 // Remove redundancies
+    }
     MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
     MPM.add(createSCCPPass());                  // Constant prop with SCCP
 
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
new file mode 100644
index 0000000..a27324b
--- /dev/null
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <set>
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class CustomLoopUnroll : public LoopPass
+    {
+    public:
+      static char ID;
+      CustomLoopUnroll() :
+       LoopPass(ID) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+        AU.addRequired<LoopInfo>();
+        AU.addPreserved<LoopInfo>();
+        AU.addRequiredID(LoopSimplifyID);
+        AU.addPreservedID(LoopSimplifyID);
+        AU.addRequiredID(LCSSAID);
+        AU.addPreservedID(LCSSAID);
+        AU.addRequired<ScalarEvolution>();
+        AU.addPreserved<ScalarEvolution>();
+      // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+      // If loop unroll does not preserve dom info then LCSSA pass on next
+      // loop will receive invalid dom info.
+      // For now, recreate dom info, if loop is unrolled.
+      AU.addPreserved<DominatorTreeWrapperPass>();
+
+      }
+
+      // Returns the value associated with the given metadata node name (for
+      // example, "llvm.loop.unroll.count").  If no such named metadata node
+      // exists, then nullptr is returned.
+      static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
+                                                     StringRef Name) {
+        MDNode *LoopID = L->getLoopID();
+        if (!LoopID) return nullptr;
+    
+        // First operand should refer to the loop id itself.
+        assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+        assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+    
+        for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+          const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+          if (!MD) continue;
+    
+          const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+          if (!S) continue;
+    
+          if (Name.equals(S->getString())) {
+            assert(MD->getNumOperands() == 2 &&
+                   "Unroll hint metadata should have two operands.");
+            return cast<ConstantInt>(MD->getOperand(1));
+          }
+        }
+        return nullptr;
+      }
+
+      void setUnrollID(Loop *L, bool enable) {
+        if (!enable && disabledLoops.find(L) != disabledLoops.end())
+           return;
+        LLVMContext &Context = L->getHeader()->getContext();
+        SmallVector<Value *, 2> forceUnroll;
+        forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
+        forceUnroll.push_back(ConstantInt::get(Type::getInt1Ty(Context), enable));
+        MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
+        SmallVector<Value *, 4> Vals;
+        Vals.push_back(NULL);
+        Vals.push_back(forceUnrollNode);
+        MDNode *NewLoopID = MDNode::get(Context, Vals);
+        // Set operand 0 to refer to the loop id itself.
+        NewLoopID->replaceOperandWith(0, NewLoopID);
+        L->setLoopID(NewLoopID);
+        if (!enable)
+          disabledLoops.insert(L);
+      }
+
+      static bool hasPrivateLoadStore(Loop *L) {
+        const std::vector<Loop*> subLoops = L->getSubLoops();
+        std::set<BasicBlock*> subBlocks, blocks;
+
+        for(auto l : subLoops)
+          for(auto bb : l->getBlocks())
+            subBlocks.insert(bb);
+        for(auto bb : L->getBlocks())
+          if (subBlocks.find(bb) == subBlocks.end())
+            blocks.insert(bb);
+        for(auto bb : blocks) {
+          for (BasicBlock::iterator inst = bb->begin(), instE = bb->end(); inst != instE; ++inst) {
+            unsigned addrSpace = 0;
+            if (isa<LoadInst>(*inst)) {
+              LoadInst *ld = cast<LoadInst>(&*inst);
+              addrSpace = ld->getPointerAddressSpace();
+            }
+            else if (isa<StoreInst>(*inst)) {
+              StoreInst *st = cast<StoreInst>(&*inst);
+              addrSpace = st->getPointerAddressSpace();
+            }
+            if (addrSpace == 0)
+              return true;
+          }
+        }
+        return false;
+      }
+      // If one loop has very large self trip count
+      // we don't want to unroll it.
+      // self trip count means trip count divide by the parent's trip count. for example
+      // for (int i = 0; i < 16; i++) {
+      //   for (int j = 0; j < 4; j++) {
+      //     for (int k = 0; k < 2; k++) {
+      //       ...
+      //     }
+      //     ...
+      //   }
+      // The inner loops j and k could be unrolled, but the loop i will not be unrolled.
+      // The return value true means the L could be unrolled, otherwise, it could not
+      // be unrolled.
+      bool handleParentLoops(Loop *L, LPPassManager &LPM) {
+        Loop *currL = L;
+        ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+        BasicBlock *latchBlock = currL->getLoopLatch();
+        unsigned currTripCount = 0;
+        bool shouldUnroll = true;
+        if (latchBlock)
+          currTripCount = SE->getSmallConstantTripCount(L, latchBlock);
+
+        while(currL) {
+          Loop *parentL = currL->getParentLoop();
+          unsigned parentTripCount = 0;
+          if (parentL) {
+            BasicBlock *parentLatchBlock = parentL->getLoopLatch();
+            if (parentLatchBlock)
+              parentTripCount = SE->getSmallConstantTripCount(parentL, parentLatchBlock);
+          }
+          if ((parentTripCount != 0 && currTripCount / parentTripCount > 16) ||
+              (currTripCount > 32)) {
+            if (currL == L)
+              shouldUnroll = false;
+            setUnrollID(currL, false);
+            if (currL != L)
+              LPM.deleteLoopFromQueue(currL);
+          }
+          currL = parentL;
+          currTripCount = parentTripCount;
+        }
+        return shouldUnroll;
+      }
+
+      // Analyze the outermost BBs of this loop, if there are
+      // some private load or store, we change it's loop meta data
+      // to indicate more aggresive unrolling on it.
+      virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+        const ConstantInt *Enable = GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
+        if (Enable)
+          return false;
+        const ConstantInt *Count = GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
+        if (Count)
+          return false;
+
+        if (!handleParentLoops(L, LPM))
+          return false;
+
+        if (!hasPrivateLoadStore(L))
+          return false;
+        setUnrollID(L, true);
+        return true;
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: custom loop unrolling pass";
+      }
+    private:
+      std::set<Loop *> disabledLoops;
+
+    };
+
+    char CustomLoopUnroll::ID = 0;
+
+    LoopPass *createCustomLoopUnrollPass() {
+      return new CustomLoopUnroll();
+    }
+} // end namespace
-- 
1.8.3.2



More information about the Beignet mailing list