[Beignet] [PATCH 2/3] GBE: add legalize pass to handle wide integers

Fri Oct 10 00:01:26 PDT 2014

This legalize pass breaks wide integers such as i128/i256/... into shorter ones.
The question is how to choose the shorter type. From my observation, a wide
integer always originates from shorter values, either through a 'zext' of a
small type or a 'bitcast' of a vector, so we simply use the type it comes from.
Operations on wide integers can then be split into operations on the shorter integers.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/CMakeLists.txt            |    1 +
 backend/src/llvm/llvm_gen_backend.hpp |    3 +
 backend/src/llvm/llvm_legalize.cpp    |  548 +++++++++++++++++++++++++++++++++
 backend/src/llvm/llvm_to_gen.cpp      |    1 +
 4 files changed, 553 insertions(+)
 create mode 100644 backend/src/llvm/llvm_legalize.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index e57227a..d028d53 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -77,6 +77,7 @@ set (GBE_SRC
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
+    llvm/llvm_legalize.cpp
     llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_barrier_nodup.cpp
     llvm/llvm_printf_parser.cpp
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index f73aafe..7ccc4d1 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -92,6 +92,9 @@ namespace gbe
   /*! Remove/add NoDuplicate function attribute for barrier functions. */
   llvm::ModulePass* createBarrierNodupPass(bool);
 
+  /*! Legalize all wide integer instructions */
+  llvm::FunctionPass* createLegalizePass();
+
   /*! Convert the Intrinsic call to gen function */
   llvm::BasicBlockPass *createIntrinsicLoweringPass();
 
diff --git a/backend/src/llvm/llvm_legalize.cpp b/backend/src/llvm/llvm_legalize.cpp
new file mode 100644
index 0000000..b09aa1a
--- /dev/null
+++ b/backend/src/llvm/llvm_legalize.cpp
@@ -0,0 +1,548 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * Legalize unsupported wide integer data types such as i128/i256/...
+ * Right now, the implementation only considers little-endian systems.
+ *
+ */
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include "llvm_gen_backend.hpp"
+
+using namespace llvm;
+
+namespace gbe {
+
+  class Legalize : public FunctionPass {  // function pass that splits integers wider than 64 bits
+  public:
+    Legalize() : FunctionPass(ID){}
+    bool runOnFunction(Function& F) {
+      if (!isKernelFunction(F)) return false;  // only kernel functions are legalized
+      return legalizeFunction(F);
+    }
+    void legalizeICmp(IRBuilder<> &Builder, Instruction *p);     // equality compares only
+    void legalizeShl(IRBuilder<> &Builder, Instruction *p);      // constant shift amounts only
+    void legalizeLShr(IRBuilder<> &Builder, Instruction *p);     // constant shift amounts only
+    void legalizeAnd(IRBuilder<> &Builder, Instruction *p);
+    void legalizeOr(IRBuilder<> &Builder, Instruction *p);
+    void legalizeXor(IRBuilder<> &Builder, Instruction *p);
+    void legalizeBitCast(IRBuilder<> &Builder, Instruction *p);  // vector <-> wide integer
+    void legalizeTrunc(IRBuilder<> &Builder, Instruction *p);
+    void legalizeZExt(IRBuilder<> &Builder, Instruction *p);
+    bool legalizeFunction(Function& F);  // returns true when a wide integer was found
+    void splitLargeInteger(APInt op, Type *splitTy, SmallVector<APInt, 16> &split);
+    void splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split);
+    static char ID;
+  private:
+    std::set<Value *> processed;  // wide-integer instructions and their users; erased at the end
+    std::map<Value *, SmallVector<Value*, 16>> valueMap;  // wide value -> its pieces, lowest bits first
+    typedef std::map<Value*, SmallVector<Value*, 16>>::iterator ValueMapIter;
+  };
+
+  // Cut 'data' (totalBits wide) into subBits-wide pieces and append them to
+  // 'result', least significant piece first. A plain loop is used instead of
+  // recursive halving so that piece counts that are not a power of two
+  // (e.g. 96 bits split into 32-bit pieces) are handled correctly as well.
+  void splitAPInt(APInt &data, SmallVectorImpl<APInt> &result, int totalBits, int subBits) {
+    GBE_ASSERT(subBits > 0 && totalBits % subBits == 0);
+    for (int offset = 0; offset < totalBits; offset += subBits) {
+      // bring the wanted piece down to bit 0, then drop everything above it
+      APInt piece = data.lshr(offset).zextOrTrunc(subBits);
+      result.push_back(piece);
+    }
+  }
+
+  // Break 'data' into pieces as wide as splitTy and append them to 'split'.
+  void Legalize::splitLargeInteger(APInt data, Type *splitTy, SmallVector<APInt, 16> &split) {
+    const unsigned pieceBits = splitTy->getPrimitiveSizeInBits();
+    splitAPInt(data, split, data.getBitWidth(), pieceBits);
+  }
+
+  // Split constant 'c' into splitTy-typed ConstantInt pieces appended to 'split'.
+  void Legalize::splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split) {
+    SmallVector<APInt, 16> pieces;
+    splitLargeInteger(c->getValue(), splitTy, pieces);
+    for (const APInt &piece : pieces)
+      split.push_back(ConstantInt::get(splitTy, piece));
+  }
+
+  // Legalize an equality compare (eq/ne) of two wide integers.
+  // The operands are compared piece by piece and the i1 results are combined:
+  //   eq: every piece must match        -> AND of the piece compares
+  //   ne: one differing piece is enough -> OR of the piece compares
+  // The combined i1 replaces all uses of the original compare. Nothing is added
+  // to valueMap because the result is not a wide integer.
+  void Legalize::legalizeICmp(IRBuilder<> &Builder, Instruction *p) {
+    // cast<> asserts on a wrong instruction kind instead of yielding NULL
+    ICmpInst *IC = cast<ICmpInst>(p);
+    ICmpInst::Predicate pred = IC->getPredicate();
+    // I could not figure out why llvm could generate some
+    // compare instruction on large integers. so here only support equality check
+    GBE_ASSERT(IC->isEquality());
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    // eq/ne are commutative, so keep a possible constant on the right-hand side
+    if (isa<ConstantInt>(op0)) {
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    // the left-hand side must already have been split by an earlier step
+    ValueMapIter iter0 = valueMap.find(op0);
+    GBE_ASSERT(iter0 != valueMap.end());
+    SmallVector<Value*, 16> &opVec0 = iter0->second;
+    GBE_ASSERT(!opVec0.empty());
+
+    // pieces of the right-hand side: split the constant or look them up
+    SmallVector<Value*, 16> opVec1;
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(op1)) {
+      splitConstantInt(CI, opVec0[0]->getType(), opVec1);
+    } else {
+      ValueMapIter iter1 = valueMap.find(op1);
+      GBE_ASSERT(iter1 != valueMap.end());
+      opVec1 = iter1->second;
+    }
+    GBE_ASSERT(opVec0.size() == opVec1.size());
+
+    Value *res = NULL;
+    for (unsigned i = 0; i < opVec0.size(); i++) {
+      Value *tmp = Builder.CreateICmp(pred, opVec0[i], opVec1[i]);
+      if (res != NULL) {
+        // fold this piece into the result accumulated so far
+        if (pred == CmpInst::ICMP_EQ)
+          tmp = Builder.CreateAnd(tmp, res);
+        else
+          tmp = Builder.CreateOr(tmp, res);
+      }
+      res = tmp;
+    }
+
+    // hand the combined i1 over to the users of the original compare
+    p->replaceAllUsesWith(res);
+  }
+
+  // Legalize 'shl' of a wide integer by a constant amount.
+  // Whole pieces shifted out at the bottom are replaced by zero pieces. When the
+  // amount is not a multiple of the piece width, each remaining result piece is
+  // stitched together from two neighbouring source pieces.
+  void Legalize::legalizeShl(IRBuilder<> &Builder, Instruction *p) {
+    // only support known bits shift
+    GBE_ASSERT(isa<ConstantInt>(p->getOperand(1)));
+
+    ValueMapIter iter = valueMap.find(p->getOperand(0));
+    GBE_ASSERT(iter != valueMap.end());
+    SmallVectorImpl<Value*> &v0 = iter->second;
+
+    uint64_t shiftBits = cast<ConstantInt>(p->getOperand(1))->getZExtValue();
+    Type *splitTy = v0[0]->getType();
+
+    unsigned elemNum = v0.size();
+    unsigned szSplit = splitTy->getPrimitiveSizeInBits();
+    // a larger amount would make 'elemNum - shift' wrap around below
+    GBE_ASSERT(shiftBits < (uint64_t)elemNum * szSplit);
+    unsigned shift = shiftBits / szSplit;
+    unsigned unaligned = shiftBits % szSplit;
+
+    SmallVector<Value*, 16> v1;
+    // fill lower bits with zero
+    for (unsigned i = 0; i < shift; i++)
+      v1.push_back(ConstantInt::get(splitTy, 0));
+
+    if (unaligned == 0) {
+      // aligned: the remaining pieces simply move up by 'shift' slots
+      for (unsigned j = 0; j < elemNum - shift; j++)
+        v1.push_back(v0[j]);
+    } else {
+      // shift amounts need the piece type itself; a fixed i32 amount is only
+      // valid when the pieces happen to be i32
+      Value *up = ConstantInt::get(splitTy, unaligned);
+      Value *down = ConstantInt::get(splitTy, szSplit - unaligned);
+      // first one is special, shl is enough.
+      v1.push_back(Builder.CreateShl(v0[0], up));
+      for (unsigned i = 0; i < elemNum - shift - 1; i++) {
+        Value *t0 = Builder.CreateLShr(v0[i], down);
+        Value *t1 = Builder.CreateShl(v0[i + 1], up);
+        v1.push_back(Builder.CreateOr(t0, t1));
+      }
+    }
+    valueMap.insert(std::make_pair(p, v1));
+  }
+
+  // Legalize 'lshr' of a wide integer by a constant amount.
+  // Whole pieces shifted out at the bottom are dropped and the same number of
+  // zero pieces is appended at the top.
+  void Legalize::legalizeLShr(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+    SmallVector<Value*, 16> result;
+
+    GBE_ASSERT(isa<ConstantInt>(op1));
+
+    ValueMapIter iter = valueMap.find(op0);
+    GBE_ASSERT(iter != valueMap.end());
+    SmallVectorImpl<Value*> &opVec = iter->second;
+
+    Type *splitTy = opVec[0]->getType();
+    unsigned elemNum = opVec.size();
+    unsigned szSplit = splitTy->getPrimitiveSizeInBits();
+    int64_t shift = cast<ConstantInt>(op1)->getSExtValue();
+    GBE_ASSERT(shift > 0);
+    // a larger amount would make 'elemNum - elemShift' wrap around below
+    GBE_ASSERT((uint64_t)shift < (uint64_t)elemNum * szSplit);
+    unsigned elemShift = shift / szSplit;
+    unsigned unalign = shift % szSplit;
+
+    if (unalign == 0) {
+      // the shift bits is aligned with the split size
+      for (unsigned s = elemShift; s < elemNum; s++)
+        result.push_back(opVec[s]);
+    } else {
+      // shift amounts need the piece type itself, not a fixed i32
+      Value *down = ConstantInt::get(splitTy, unalign);
+      Value *up = ConstantInt::get(splitTy, szSplit - unalign);
+      // not aligned case
+      for (unsigned s = elemShift; s < elemNum-1; s++) {
+        Value *t0 = Builder.CreateLShr(opVec[s], down);
+        Value *t1 = Builder.CreateShl(opVec[s + 1], up);
+        result.push_back(Builder.CreateOr(t0, t1));
+      }
+      // last element only need lshr
+      result.push_back(Builder.CreateLShr(opVec[elemNum-1], down));
+    }
+
+    // the vacated upper pieces are zero
+    for (unsigned s = 0; s < elemShift; s++)
+      result.push_back(ConstantInt::get(splitTy, 0));
+    valueMap.insert(std::make_pair(p, result));
+  }
+
+  // Legalize 'and' piece by piece, folding pieces that are all-zero or all-one.
+  void Legalize::legalizeAnd(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if ((isa<UndefValue>(op0) || isa<UndefValue>(op1))) {
+      // Special case met in practice:
+      //   %82 = zext i32 %81 to i512
+      //   %mask148 = and i512 undef, -4294967296
+      //   %ins149 = or i512 %mask148, %82
+      // There is no good way to split such an instruction, so it is simply
+      // replaced by zero; later and/or/shr users of the value can be optimized.
+      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
+      return;
+    }
+
+    ConstantInt *k0 = dyn_cast<ConstantInt>(op0);
+    ConstantInt *k1 = dyn_cast<ConstantInt>(op1);
+    if ((k0 && k0->isZero()) || (k1 && k1->isZero())) {
+      // zero & anyValue  ==> zero
+      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
+      return;
+    }
+
+    if (k0) {  // keep a possible constant on the right-hand side
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    ValueMapIter iter = valueMap.find(op0);
+    GBE_ASSERT(iter != valueMap.end() && "'and' operand was never split");
+    SmallVector<Value*, 16> &v0 = iter->second;  // no copy needed, only read
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(cast<ConstantInt>(op1), v0[0]->getType(), v1);
+    } else {
+      ValueMapIter iter1 = valueMap.find(op1);
+      GBE_ASSERT(iter1 != valueMap.end() && "'and' operand was never split");
+      v1 = iter1->second;
+    }
+    GBE_ASSERT(v0.size() == v1.size());
+
+    for (unsigned i = 0; i < v0.size(); i++) {
+      ConstantInt *c0 = dyn_cast<ConstantInt>(v0[i]);
+      ConstantInt *c1 = dyn_cast<ConstantInt>(v1[i]);
+      if ((c0 && c0->isZero()) || (c1 && c1->isZero())) {
+        // zero & anyvalue ==> zero
+        v2.push_back(ConstantInt::get(v0[i]->getType(), 0));
+      } else if (c0 && c0->isMinusOne()) {
+        // 1111s & anyvalue ==> anyvalue
+        v2.push_back(v1[i]);
+      } else if (c1 && c1->isMinusOne()) {
+        v2.push_back(v0[i]);
+      } else {
+        v2.push_back(Builder.CreateAnd(v0[i], v1[i]));
+      }
+    }
+    valueMap.insert(std::make_pair(p, v2));
+  }
+
+  // Legalize 'or' piece by piece, folding pieces that are all-zero or all-one.
+  void Legalize::legalizeOr(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if (isa<ConstantInt>(op0)) {  // keep a possible constant on the right-hand side
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    ValueMapIter iter = valueMap.find(op0);
+    GBE_ASSERT(iter != valueMap.end() && "'or' operand was never split");
+    SmallVector<Value*, 16> &v0 = iter->second;  // no copy needed, only read
+    if (isa<ConstantInt>(op1) && cast<ConstantInt>(op1)->isZero()) {
+      // anyvalue | zero ==> anyvalue: reuse the pieces of the other operand
+      valueMap.insert(std::make_pair(p, v0));
+      return;
+    }
+
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(cast<ConstantInt>(op1), v0[0]->getType(), v1);
+    } else {
+      ValueMapIter iter1 = valueMap.find(op1);
+      GBE_ASSERT(iter1 != valueMap.end() && "'or' operand was never split");
+      v1 = iter1->second;
+    }
+    GBE_ASSERT(v0.size() == v1.size());
+
+    for (unsigned i = 0; i < v0.size(); i++) {
+      ConstantInt *c0 = dyn_cast<ConstantInt>(v0[i]);
+      ConstantInt *c1 = dyn_cast<ConstantInt>(v1[i]);
+      if (c0 && c0->isZero()) {
+        // zero | anyvalue ==> anyvalue
+        v2.push_back(v1[i]);
+      } else if (c1 && c1->isZero()) {
+        v2.push_back(v0[i]);
+      } else if (c0 && c0->isMinusOne()) {
+        // 1111 | anyvalue ==> 1111
+        v2.push_back(c0);
+      } else if (c1 && c1->isMinusOne()) {
+        v2.push_back(c1);
+      } else {
+        v2.push_back(Builder.CreateOr(v0[i], v1[i]));
+      }
+    }
+    valueMap.insert(std::make_pair(p, v2));
+  }
+
+  // Legalize 'xor' piece by piece on the split operands.
+  void Legalize::legalizeXor(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+    if (isa<ConstantInt>(op0)) {  // keep a possible constant on the right-hand side
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+    ValueMapIter iter = valueMap.find(op0);
+    GBE_ASSERT(iter != valueMap.end() && "'xor' operand was never split");
+    SmallVector<Value*, 16> &v0 = iter->second;  // no copy needed, only read
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(cast<ConstantInt>(op1), v0[0]->getType(), v1);
+    } else {
+      ValueMapIter iter1 = valueMap.find(op1);
+      GBE_ASSERT(iter1 != valueMap.end() && "'xor' operand was never split");
+      v1 = iter1->second;
+    }
+    GBE_ASSERT(v0.size() == v1.size());
+    for (unsigned i = 0; i < v0.size(); i++)
+      v2.push_back(Builder.CreateXor(v0[i], v1[i]));
+    valueMap.insert(std::make_pair(p, v2));
+  }
+  // Legalize a bitcast between a vector and a wide integer.
+  //   vector -> wide integer: the pieces are the extracted vector elements
+  //   wide integer -> vector: the vector is rebuilt from the pieces
+  void Legalize::legalizeBitCast(IRBuilder<> &Builder, Instruction *p) {
+    SmallVector<Value*, 16> split;
+    Type *dstTy = p->getType();
+    Type *srcTy = cast<CastInst>(p)->getSrcTy();
+    Type *idxTy = IntegerType::get(p->getContext(), 32);
+
+    if(srcTy->isVectorTy()) {
+      Type *splitTy = cast<VectorType>(srcTy)->getElementType();
+      unsigned elements = srcTy->getPrimitiveSizeInBits()/splitTy->getPrimitiveSizeInBits();
+      // bitcast large integer from vector, so we do extractElement to get split integer
+      for (unsigned i = 0; i < elements; i++) {
+        Value *NV = Builder.CreateExtractElement(p->getOperand(0), ConstantInt::get(idxTy, i));
+        split.push_back(NV);
+      }
+      valueMap.insert(std::make_pair(p, split));
+    } else if (dstTy->isVectorTy()) {
+      //bitcast from large integer to vector, so we do insertElement to build the vector
+      ValueMapIter iter = valueMap.find(p->getOperand(0));
+      GBE_ASSERT(iter != valueMap.end() && "bitcast operand was never split");
+      SmallVectorImpl<Value*> &opVec = iter->second;
+      Type *elemTy = cast<VectorType>(dstTy)->getElementType();
+      GBE_ASSERT(elemTy == opVec[0]->getType());
+      // start from undef and insert one piece per vector element
+      Value *vec = UndefValue::get(dstTy);
+      for (unsigned i = 0; i < opVec.size(); ++i)
+        vec = Builder.CreateInsertElement(vec, opVec[i], ConstantInt::get(idxTy, i));
+      p->replaceAllUsesWith(vec);
+    } else {
+      p->dump(); GBE_ASSERT(0 && "Unsupported bitcast");
+    }
+  }
+
+  // Legalize 'trunc' of a wide integer; the narrow result replaces all uses of p.
+  void Legalize::legalizeTrunc(IRBuilder<> &Builder, Instruction *p) {
+    Type *dstTy = p->getType();
+
+    ValueMapIter iter = valueMap.find(p->getOperand(0));
+    GBE_ASSERT(iter != valueMap.end() && "trunc operand was never split");
+    SmallVector<Value*, 16> &opVec = iter->second;
+    unsigned szSplit = opVec[0]->getType()->getPrimitiveSizeInBits();
+    unsigned szResult = dstTy->getPrimitiveSizeInBits();
+
+    if(szResult > szSplit) {
+      // the needed bits is larger than what is already split,
+      // we have to merge the split Value, use Shl/Or to do it.
+      int endIdx = (szResult + szSplit-1 )/szSplit;
+      GBE_ASSERT((unsigned)endIdx <= opVec.size());
+      Value * prev = ConstantInt::get(dstTy, 0);
+      for (int i = endIdx - 1; i >=0; i--) {
+        Value * res = Builder.CreateZExt(opVec[i], dstTy);
+        if (i > 0)
+          res = Builder.CreateShl(res, i*szSplit);
+        prev = Builder.CreateOr(res, prev);
+      }
+      p->replaceAllUsesWith(prev);  // prev already has type dstTy, no trunc needed
+    } else if (szResult == szSplit) {
+      // same bit width, should use bitcast instead of trunc.
+      p->replaceAllUsesWith(Builder.CreateBitCast(opVec[0], dstTy));
+    } else {
+      // normal case, trunc to a shorter bit width
+      p->replaceAllUsesWith(Builder.CreateTrunc(opVec[0], dstTy));
+    }
+  }
+
+  // Split 'zext' to a wide integer: the source is the lowest piece, the rest is zero.
+  void Legalize::legalizeZExt(IRBuilder<> &Builder, Instruction *p) {
+    Value *narrow = p->getOperand(0);
+    Type *pieceTy = narrow->getType();
+    Type *wideTy = dyn_cast<CastInst>(p)->getDestTy();
+    const int pieces = wideTy->getPrimitiveSizeInBits() / pieceTy->getPrimitiveSizeInBits();
+    SmallVector<Value*, 16> split;
+    split.push_back(narrow);
+    for (int filled = 1; filled < pieces; ++filled)
+      split.push_back(ConstantInt::getSigned(pieceTy, 0));
+    valueMap.insert(std::make_pair(p, split));
+  }
+
+  // Walk every instruction of F. An instruction whose result is an integer wider
+  // than 64 bits is recorded in 'processed' together with all of its users; each
+  // recorded instruction is rewritten by the handler for its opcode and erased at
+  // the end. Returns true when at least one wide integer was found.
+  bool Legalize::legalizeFunction(Function &F) {
+    bool changed = false;
+    for (Function::iterator bb = F.begin(), bbE = F.end(); bb != bbE; ++bb) {
+      IRBuilder<> Builder(bb);
+      for (BasicBlock::iterator it = bb->begin(), itE = bb->end(); it != itE; ++it) {
+        Instruction *insn = it;
+        Type *ty = insn->getType();
+        if(ty->isIntegerTy() && ty->getIntegerBitWidth() > 64) {
+          // result is large integer, push back itself and its users
+          changed = true;
+
+          processed.insert(insn);
+
+          for(Value::use_iterator iter = insn->use_begin(); iter != insn->use_end(); ++iter) {
+            // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
+          #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+            User *theUser = *iter;
+          #else
+            User *theUser = iter->getUser();
+          #endif
+            processed.insert(theUser);
+          }
+        }
+
+        if(processed.find(insn) == processed.end())
+          continue;
+
+        Builder.SetInsertPoint(insn);
+        switch(insn->getOpcode()) {
+          default: { insn->dump(); GBE_ASSERT(false && "Illegal instruction\n"); break;}
+          case Instruction::ICmp:
+            legalizeICmp(Builder, insn);
+            break;
+          case Instruction::Shl:
+            legalizeShl(Builder, insn);
+            break;
+          case Instruction::LShr:
+            legalizeLShr(Builder, insn);
+            break;
+          case Instruction::And:
+            legalizeAnd(Builder, insn);
+            break;
+          case Instruction::Or:
+            legalizeOr(Builder, insn);
+            break;
+          case Instruction::Xor:
+            legalizeXor(Builder, insn);
+            break;
+          case Instruction::BitCast:
+            legalizeBitCast(Builder, insn);
+            break;
+          case Instruction::Trunc:
+            legalizeTrunc(Builder, insn);
+            break;
+          case Instruction::ZExt:
+            legalizeZExt(Builder, insn);
+            break;
+        }
+      }
+    }
+
+    // The set is ordered by pointer value, so a wide value may come before one
+    // of its (also doomed) users. Detach all operands first so that no value is
+    // destroyed while another recorded instruction still refers to it.
+    for (Value *v : processed)
+      if (Instruction *dead = dyn_cast<Instruction>(v))
+        dead->dropAllReferences();
+    for (Value *v : processed)
+      if (Instruction *dead = dyn_cast<Instruction>(v))
+        dead->eraseFromParent();
+
+    processed.clear();
+    valueMap.clear();
+    return changed;
+  }
+
+  FunctionPass* createLegalizePass() {  // factory declared in llvm_gen_backend.hpp
+    return new Legalize();
+  }
+  char Legalize::ID = 0;  // pass identifier handed to the FunctionPass constructor
+};
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index e31421f..a3db919 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -259,6 +259,7 @@ namespace gbe
       passes.add(createGVNPass());                  // Remove redundancies
     passes.add(createPrintfParserPass());
     passes.add(createScalarizePass());        // Expand all vector ops
+    passes.add(createLegalizePass());
     passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
     passes.add(createCFGSimplificationPass());     // Merge & remove BBs
     passes.add(createScalarizePass());        // Expand all vector ops
-- 
1.7.10.4