[Beignet] [PATCH 1/3] Add a scalarize llvm pass.

Wed May 15 21:36:33 PDT 2013

In this pass, expand all normal vector ops to scalar ops, except store/load, image read/write and function's argument. Add fake ExtractElement/InsertElement instructions to avoid dead instruction elimination, and unit valueMap hold the relationship between these fake instructions and real load/store instructions.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/CMakeLists.txt            |    1 +
 backend/src/ir/unit.hpp               |   22 +-
 backend/src/llvm/llvm_gen_backend.cpp |  241 +++-------
 backend/src/llvm/llvm_gen_backend.hpp |   30 +-
 backend/src/llvm/llvm_scalarize.cpp   |  801 +++++++++++++++++++++++++++++++++
 backend/src/llvm/llvm_to_gen.cpp      |    1 +
 6 files changed, 914 insertions(+), 182 deletions(-)
 create mode 100644 backend/src/llvm/llvm_scalarize.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 1829964..183517a 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -83,6 +83,7 @@ else (GBE_USE_BLOB)
     backend/program.h
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
+    llvm/llvm_scalarize.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index ae78638..3b293f5 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -24,9 +24,12 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
+#include "llvm/Value.h"
+
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
 #include "sys/hash_map.hpp"
+#include "sys/map.hpp"
 
 namespace gbe {
 namespace ir {
@@ -41,6 +44,7 @@ namespace ir {
   {
   public:
     typedef hash_map<std::string, Function*> FunctionSet;
+    typedef std::pair<llvm::Value*, uint32_t> ValueIndex;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -71,11 +75,27 @@ namespace ir {
     ConstantSet& getConstantSet(void) { return constantSet; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
+
+    /*! Some values will not be allocated. For example a vector extract and
+     * a vector insertion when scalarize the vector load/store
+     */
+    void newValueProxy(llvm::Value *real,
+                       llvm::Value *fake,
+                       uint32_t realIndex = 0u,
+                       uint32_t fakeIndex = 0u) {
+      const ValueIndex key(fake, fakeIndex);
+      const ValueIndex value(real, realIndex);
+      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+      valueMap[key] = value;
+    }
+    /*! Return the value map */
+    const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
+    map<ValueIndex, ValueIndex> valueMap; //!< fake to real value map for vector load/store
     GBE_CLASS(Unit);
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 8dcf15c..3855011 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -60,7 +60,7 @@
  * dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
  * 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
  * kind of similar, or at least they are similar enough to share the same front
- * end. 
+ * end.
  *
  * Problems
  * ========
@@ -126,10 +126,8 @@
 #include "ir/context.hpp"
 #include "ir/unit.hpp"
 #include "ir/liveness.hpp"
-#include "sys/map.hpp"
 #include "sys/set.hpp"
 #include "sys/cvar.hpp"
-#include <algorithm>
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -207,7 +205,7 @@ namespace gbe
   /*! Type to register family translation */
   static ir::RegisterFamily getFamily(const ir::Context &ctx, const Type *type)
   {
-    GBE_ASSERT(isScalarType(type) == true); 
+    GBE_ASSERT(isScalarType(type) == true);
     if (type == Type::getInt1Ty(type->getContext()))
       return ir::FAMILY_BOOL;
     if (type == Type::getInt8Ty(type->getContext()))
@@ -269,6 +267,8 @@ namespace gbe
   class RegisterTranslator
   {
   public:
+    /*! Indices will be zero for scalar values */
+    typedef std::pair<Value*, uint32_t> ValueIndex;
     RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
 
     /*! Empty the maps */
@@ -289,6 +289,11 @@ namespace gbe
       GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
       valueMap[key] = value;
     }
+    /*! After scalarize pass, there are some valueMap in unit,
+     *  use this function to copy from unit valueMap */
+    void initValueMap(const map<ValueIndex, ValueIndex>& vMap) {
+      valueMap.insert(vMap.begin(), vMap.end());
+    }
     /*! Mostly used for the preallocated registers (lids, gids) */
     void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
       const ValueIndex key(value, index);
@@ -325,10 +330,9 @@ namespace gbe
       };
       return ir::Register();
     }
-    /*! Get the register from the given value at given index possibly iterating
-     *  in the value map to get the final real register
-     */
-    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+
+    /*! iterating in the value map to get the final real register */
+    void getRealValue(Value* &value, uint32_t& index) {
       auto end = valueMap.end();
       for (;;) {
         auto it = valueMap.find(std::make_pair(value, index));
@@ -339,6 +343,14 @@ namespace gbe
           index = it->second.second;
         }
       }
+    }
+
+    /*! Get the register from the given value at given index possibly iterating
+     *  in the value map to get the final real register
+     */
+    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+      getRealValue(value, index);
+
       const auto key = std::make_pair(value, index);
       GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
       return scalarMap[key];
@@ -351,16 +363,8 @@ namespace gbe
     }
     /*! Says if the value exists. Otherwise, it is undefined */
     bool valueExists(Value *value, uint32_t index) {
-      auto end = valueMap.end();
-      for (;;) {
-        auto it = valueMap.find(std::make_pair(value, index));
-        if (it == end)
-          break;
-        else {
-          value = it->second.first;
-          index = it->second.second;
-        }
-      }
+      getRealValue(value, index);
+
       const auto key = std::make_pair(value, index);
       return scalarMap.find(key) != scalarMap.end();
     }
@@ -375,8 +379,6 @@ namespace gbe
       this->insertRegister(reg, key, index);
       return reg;
     }
-    /*! Indices will be zero for scalar values */
-    typedef std::pair<Value*, uint32_t> ValueIndex;
     /*! Map value to ir::Register */
     map<ValueIndex, ir::Register> scalarMap;
     /*! Map values to values when this is only a translation (eq bitcast) */
@@ -384,28 +386,6 @@ namespace gbe
     /*! Actually allocates the registers */
     ir::Context &ctx;
   };
-  /*! All intrinsic Gen functions */
-  enum OCLInstrinsic {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
-  };
-
-  /*! Build the hash map for OCL functions on Gen */
-  struct OCLIntrinsicMap {
-    /*! Build the intrinsic hash map */
-    OCLIntrinsicMap(void) {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
-  map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
-    }
-    /*! Sort intrinsics with their names */
-    hash_map<std::string, OCLInstrinsic> map;
-  };
-
-  /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
-  static const OCLIntrinsicMap instrinsicMap;
 
   /*! Translate LLVM IR code to Gen IR code */
   class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
@@ -423,7 +403,7 @@ namespace gbe
      */
     set<const Value*> conditionSet;
     /*! We visit each function twice. Once to allocate the registers and once to
-     *  emit the Gen IR instructions 
+     *  emit the Gen IR instructions
      */
     enum Pass {
       PASS_EMIT_REGISTERS = 0,
@@ -663,7 +643,7 @@ namespace gbe
     if (dyn_cast<ConstantAggregateZero>(CPV)) {
       return doIt(uint32_t(0)); // XXX Handle type
     } else {
-      if (dyn_cast<ConstantVector>(CPV)) 
+      if (dyn_cast<ConstantVector>(CPV))
         CPV = extractConstantElem(CPV, index);
       GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
 
@@ -756,6 +736,9 @@ namespace gbe
   }
 
   ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+    //the real value may be constant, so get real value before constant check
+    regTranslator.getRealValue(value, elemID);
+
     if (dyn_cast<ConstantExpr>(value)) {
       ConstantExpr *ce = dyn_cast<ConstantExpr>(value);
       if(ce->isCast()) {
@@ -867,6 +850,7 @@ namespace gbe
                 "Returned value for kernel functions is forbidden");
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
+      uint32_t argID = 0;
       Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
       // Insert a new register for each function argument
@@ -875,10 +859,33 @@ namespace gbe
       uint32_t argID = 1; // Start at one actually
       for (; I != E; ++I, ++argID) {
 #else
-      for (; I != E; ++I) {
+      for (; I != E; ++I, ++argID) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
+
+        //add support for vector argument
+        if(type->isVectorTy()) {
+          VectorType *vectorType = cast<VectorType>(type);
+
+          this->newRegister(I);
+          ir::Register reg = getRegister(I, 0);
+
+          Type *elemType = vectorType->getElementType();
+          const uint32_t elemSize = getTypeByteSize(unit, elemType);
+          const uint32_t elemNum = vectorType->getNumElements();
+          //vector's elemType always scalar type
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize);
+
+          ir::Function& fn = ctx.getFunction();
+          for(uint32_t i=1; i < elemNum; i++) {
+            ir::PushLocation argLocation(fn, argID, elemSize*i);
+            reg = getRegister(I, i);
+            ctx.appendPushedConstant(reg, argLocation);  //add to push map for reg alloc
+          }
+          continue;
+        }
+
         GBE_ASSERTM(isScalarType(type) == true,
                     "vector type in the function argument is not supported yet");
         const ir::Register reg = regTranslator.newScalar(I);
@@ -916,7 +923,6 @@ namespace gbe
                 ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize);
                 ctx.getFunction().getImageSet()->append(reg, &ctx);
               break;
-              break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
             }
           }
@@ -1141,6 +1147,7 @@ namespace gbe
 
     ctx.startFunction(F.getName());
     this->regTranslator.clear();
+    this->regTranslator.initValueMap(unit.getValueMap());
     this->labelMap.clear();
     this->emitFunctionPrototype(F);
 
@@ -1495,141 +1502,15 @@ namespace gbe
     ir::Context &ctx;
   };
 
-  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {
-    Value *modified = I.getOperand(0);
-    Value *toInsert = I.getOperand(1);
-    Value *index = I.getOperand(2);
-
-    // Get the index for the insertion
-    Constant *CPV = dyn_cast<Constant>(index);
-    GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
-    auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-    GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
-                "Invalid index type for InsertElement");
-
-    // Crash on overrun
-    VectorType *vectorType = cast<VectorType>(modified->getType());
-    const uint32_t elemNum = vectorType->getNumElements();
-    const uint32_t modifiedID = x.data.u32;
-    GBE_ASSERTM(modifiedID < elemNum, "Out-of-bound index for InsertElement");
-
-    // The source vector is not constant
-    if (!isa<Constant>(modified) || isa<UndefValue>(modified)) {
-       // Non modified values are just proxies
-       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-         if (elemID != modifiedID)
-           regTranslator.newValueProxy(modified, &I, elemID, elemID);
-     }
-     // The source vector is constant
-     else {
-       // Non modified values will use LOADI
-       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-         if (elemID != modifiedID) {
-           const ir::Type type = getType(ctx, toInsert->getType());
-           const ir::Register reg = ctx.reg(getFamily(type));
-           regTranslator.insertRegister(reg, &I, elemID);
-         }
-     }
-
-     // If the element to insert is an immediate we will generate a LOADI.
-     // Otherwise, the value is just a proxy of the inserted value
-     if (dyn_cast<Constant>(toInsert) != NULL) {
-       const ir::Type type = getType(ctx, toInsert->getType());
-       const ir::Register reg = ctx.reg(getFamily(type));
-       regTranslator.insertRegister(reg, &I, modifiedID);
-     } else
-       regTranslator.newValueProxy(toInsert, &I, 0, modifiedID);
-  }
-
-  void GenWriter::emitInsertElement(InsertElementInst &I) {
-    // Note that we check everything in regAllocateInsertElement
-    Value *modified = I.getOperand(0);
-    Value *toInsert = I.getOperand(1);
-    Value *index = I.getOperand(2);
-
-    // Get the index of the value to insert
-    Constant *indexCPV = dyn_cast<Constant>(index);
-    auto x = processConstant<ir::Immediate>(indexCPV, InsertExtractFunctor(ctx));
-    const uint32_t modifiedID = x.data.u32;
-
-    // The source vector is constant. We need to insert LOADI for the unmodified
-    // values
-    if (isa<Constant>(modified) && !isa<UndefValue>(modified)) {
-      VectorType *vectorType = cast<VectorType>(modified->getType());
-      const uint32_t elemNum = vectorType->getNumElements();
-      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-        if (elemID != modifiedID) {
-          Constant *sourceCPV = dyn_cast<Constant>(modified);
-          if (isa<UndefValue>(extractConstantElem(sourceCPV, elemID)) == false) {
-            const ir::ImmediateIndex immIndex = this->newImmediate(sourceCPV, elemID);
-            const ir::Immediate imm = ctx.getImmediate(immIndex);
-            const ir::Register reg = regTranslator.getScalar(&I, elemID);
-            ctx.LOADI(imm.type, reg, immIndex);
-          }
-        }
-    }
-
-    // If the inserted value is not a constant, we just use a proxy
-    if (dyn_cast<Constant>(toInsert) == NULL)
-      return;
-
-    // We need a LOADI if we insert an immediate
-    Constant *toInsertCPV = dyn_cast<Constant>(toInsert);
-    const ir::ImmediateIndex immIndex = this->newImmediate(toInsertCPV);
-    const ir::Immediate imm = ctx.getImmediate(immIndex);
-    const ir::Register reg = regTranslator.getScalar(&I, modifiedID);
-    ctx.LOADI(imm.type, reg, immIndex);
-  }
-
-  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
-    Value *extracted = I.getOperand(0);
-    Value *index = I.getOperand(1);
-    GBE_ASSERTM(isa<Constant>(extracted) == false,
-                "TODO support constant vector for extract");
-    Constant *CPV = dyn_cast<Constant>(index);
-    GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
-    auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-    GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
-                "Invalid index type for InsertElement");
-
-    // Crash on overrun
-    const uint32_t extractedID = x.data.u32;
-#if GBE_DEBUG
-    VectorType *vectorType = cast<VectorType>(extracted->getType());
-    const uint32_t elemNum = vectorType->getNumElements();
-    GBE_ASSERTM(extractedID < elemNum, "Out-of-bound index for InsertElement");
-#endif /* GBE_DEBUG */
-
-    // Easy when the vector is not immediate
-    regTranslator.newValueProxy(extracted, &I, extractedID, 0);
-  }
-
-  void GenWriter::emitExtractElement(ExtractElementInst &I) {
-    // TODO -> insert LOADI when the extracted vector is constant
-  }
+  /*! Because there are still fake insert/extract instruction for
+   *  load/store, so keep empty function here */
+  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+  void GenWriter::emitInsertElement(InsertElementInst &I) {}
 
-  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {
-    Value *first = I.getOperand(0);
-    Value *second = I.getOperand(1);
-    GBE_ASSERTM(!isa<Constant>(first) || isa<UndefValue>(first),
-                "TODO support constant vector for shuffle");
-    GBE_ASSERTM(!isa<Constant>(second) || isa<UndefValue>(second),
-                "TODO support constant vector for shuffle");
-    VectorType *dstType = cast<VectorType>(I.getType());
-    VectorType *srcType = cast<VectorType>(first->getType());
-    const uint32_t dstElemNum = dstType->getNumElements();
-    const uint32_t srcElemNum = srcType->getNumElements();
-    for (uint32_t elemID = 0; elemID < dstElemNum; ++elemID) {
-      uint32_t srcID = I.getMaskValue(elemID);
-      Value *src = first;
-      if (srcID >= srcElemNum) {
-        srcID -= srcElemNum;
-        src = second;
-      }
-      regTranslator.newValueProxy(src, &I, srcID, elemID);
-    }
-  }
+  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {}
 
+  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
 
   void GenWriter::regAllocateSelectInst(SelectInst &I) {
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index c270924..2ad879e 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -28,6 +28,9 @@
 
 #include "llvm/Pass.h"
 #include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <algorithm>
 
 // LLVM Type
 namespace llvm { class Type; }
@@ -37,6 +40,29 @@ namespace gbe
   // Final target of the Gen backend
   namespace ir { class Unit; }
 
+  /*! All intrinsic Gen functions */
+  enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+  };
+
+  /*! Build the hash map for OCL functions on Gen */
+  struct OCLIntrinsicMap {
+    /*! Build the intrinsic hash map */
+    OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+  map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+    }
+    /*! Sort intrinsics with their names */
+    hash_map<std::string, OCLInstrinsic> map;
+  };
+
+  /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
+  static const OCLIntrinsicMap instrinsicMap;
+
   /*! Pad the offset */
   uint32_t getPadding(uint32_t offset, uint32_t align);
 
@@ -55,6 +81,8 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  llvm::FunctionPass* createScalarizePass(ir::Unit &unit);
+
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..453a23a
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ *         Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/**
+ * \file llvm_passes.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ * \author Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/* THIS CODE IS DERIVED FROM GPL LLVM PTX BACKEND. CODE IS HERE:
+ * http://sourceforge.net/scm/?type=git&group_id=319085
+ * Note that however, the original author, Heldge Rhodin, granted me (Benjamin
+ * Segovia) the right to use another license for it (MIT here)
+ */
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+
+  struct VectorValues {
+    VectorValues() : vals()
+    { }
+
+    void setComponent(int c, llvm::Value* val)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      vals[c] = val;
+    }
+    llvm::Value* getComponent(int c)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      assert(vals[c] && "Requesting non-existing component");
+      return vals[c];
+    }
+
+    // {Value* x, Value* y, Value* z, Value* w}
+    llvm::Value* vals[16];
+  };
+
+  class Scalarize : public FunctionPass {
+
+  public:
+    // Standard pass stuff
+    static char ID;
+
+    Scalarize(ir::Unit& unit) : FunctionPass(ID), unit(unit)
+    {
+      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function&);
+    void print(raw_ostream&, const Module* = 0) const;
+    virtual void getAnalysisUsage(AnalysisUsage&) const;
+
+  protected:
+    // An instruction is valid post-scalarization iff it is fully scalar or it
+    // is a gla_loadn
+    bool isValid(const Instruction*);
+
+    // Take an instruction that produces a vector, and scalarize it
+    bool scalarize(Instruction*);
+    bool scalarizePerComponent(Instruction*);
+    bool scalarizeFuncCall(CallInst *);
+    bool scalarizeLoad(LoadInst*);
+    bool scalarizeStore(StoreInst*);
+    //bool scalarizeIntrinsic(IntrinsicInst*);
+    bool scalarizeExtract(ExtractElementInst*);
+    bool scalarizeInsert(InsertElementInst*);
+    bool scalarizeShuffleVector(ShuffleVectorInst*);
+    bool scalarizePHI(PHINode*);
+    void scalarizeArgs(Function& F);
+    // ...
+
+    // Helpers to make the actual multiple scalar calls, one per
+    // component. Updates the given VectorValues's components with the new
+    // Values.
+    void makeScalarizedCalls(Function*, ArrayRef<Value*>, int numComponents, VectorValues&);
+
+    void makePerComponentScalarizedCalls(Instruction*, ArrayRef<Value*>);
+
+    // Makes a scalar form of the given instruction: replaces the operands
+    // and chooses a correct return type
+    Instruction* createScalarInstruction(Instruction* inst, ArrayRef<Value*>);
+
+    // Gather the specified components in the given values. Returns the
+    // component if the given value is a vector, or the scalar itself.
+    void gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs);
+
+    // Get the assigned component for that value. If the value is a scalar,
+    // returns the scalar. If it's a constant, returns that component. If
+    // it's an instruction, returns the vectorValues of that instruction for
+    // that component
+    Value* getComponent(int component, Value*);
+
+    // Used for assertion purposes. Whether we can get the component out with
+    // a getComponent call
+    bool canGetComponent(Value*);
+
+    // Used for assertion purposes. Whether for every operand we can get
+    // components with a getComponent call
+    bool canGetComponentArgs(User*);
+
+    // Delete the instruction in the deadList
+    void dce();
+
+
+    int GetConstantInt(const Value* value);
+    bool IsPerComponentOp(const Instruction* inst);
+    bool IsPerComponentOp(const Value* value);
+
+    //these function used to add extract and insert instructions when load/store etc.
+    void extractFromeVector(Value* insn);
+    Value* InsertToVector(Value* insn, Value* vecValue);
+
+    Type* GetBasicType(Value* value) {
+      return GetBasicType(value->getType());
+    }
+
+    Type* GetBasicType(Type* type) {
+      switch(type->getTypeID()) {
+      case Type::VectorTyID:
+      case Type::ArrayTyID:
+        return GetBasicType(type->getContainedType(0));
+      default:
+        break;
+      }
+      return type;
+    }
+
+    int GetComponentCount(const Type* type)  {
+      if (type->getTypeID() == Type::VectorTyID)
+        return llvm::dyn_cast<VectorType>(type)->getNumElements();
+      else
+        return 1;
+    }
+
+    int GetComponentCount(const Value* value) {
+      return GetComponentCount(value->getType());
+    }
+
+    DenseMap<Value*, VectorValues> vectorVals;
+    Module* module;
+    IRBuilder<>* builder;
+
+    Type* intTy;
+    Type* floatTy;
+    ir::Unit &unit;
+
+    std::vector<Instruction*> deadList;
+
+    // List of vector phis that were not completely scalarized because some
+    // of their operands hadn't before been visited (i.e. loop variant
+    // variables)
+    SmallVector<PHINode*, 16> incompletePhis;
+  };
+
+  Value* Scalarize::getComponent(int component, Value* v)
+  {
+    assert(canGetComponent(v) && "getComponent called on unhandled vector");
+
+    if (v->getType()->isVectorTy()) {
+      if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
+        return c->getElementAsConstant(component);
+      } else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
+        return c->getOperand(component);
+      } else if (isa<ConstantAggregateZero>(v)) {
+        return Constant::getNullValue(GetBasicType(v));
+      } else if (isa<UndefValue>(v)) {
+        return UndefValue::get(GetBasicType(v));
+      } else {
+        return vectorVals[v].getComponent(component);
+      }
+    } else {
+      return v;
+    }
+  }
+
+  bool IsPerComponentOp(const llvm::Value* value)
+  {
+    const llvm::Instruction* inst = llvm::dyn_cast<const llvm::Instruction>(value);
+    return inst && IsPerComponentOp(inst);
+  }
+
+  bool Scalarize::IsPerComponentOp(const Instruction* inst)
+  {
+    //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+    //    return IsPerComponentOp(intr);
+
+    if (inst->isTerminator())
+        return false;
+
+    switch (inst->getOpcode()) {
+
+    // Cast ops are only per-component if they cast back to the same vector
+    // width
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst);
+
+    // Vector ops
+    case Instruction::InsertElement:
+    case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+
+    // Ways of accessing/loading/storing vectors
+    case Instruction::ExtractValue:
+    case Instruction::InsertValue:
+
+    // Memory ops
+    case Instruction::Alloca:
+    case Instruction::Load:
+    case Instruction::Store:
+    case Instruction::GetElementPtr:
+    // Phis are a little special. We consider them not to be per-component
+    // because the mechanism of choice is a single value (what path we took to
+    // get here), and doesn't choose per-component (as select would). The caller
+    // should know to handle phis specially
+    case Instruction::PHI:
+    // Call insts, conservatively are no per-component
+    case Instruction::Call:
+    // Misc
+    case Instruction::LandingPad:  //--- 3.0
+    case Instruction::VAArg:
+      return false;
+    } // end of switch (inst->getOpcode())
+
+    return true;
+  }
+  int Scalarize::GetConstantInt(const Value* value)
+  {
+    const ConstantInt *constantInt = dyn_cast<ConstantInt>(value);
+
+    // this might still be a constant expression, rather than a numeric constant,
+    // e.g., expression with undef's in it, so it was not folded
+    if (! constantInt)
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant");
+
+    return constantInt->getValue().getSExtValue();
+  }
+  bool Scalarize::canGetComponent(Value* v)
+  {
+    if (v->getType()->isVectorTy()) {
+      if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
+        return true;
+      } else {
+        assert((isa<Instruction>(v) || isa<Argument>(v)) && "Non-constant non-instuction?");
+        return vectorVals.count(v);
+      }
+    } else {
+      return true;
+    }
+  }
+
+  bool Scalarize::canGetComponentArgs(User* u)
+  {
+    if (PHINode* phi = dyn_cast<PHINode>(u)) {
+      for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i)
+        if (!canGetComponent(phi->getIncomingValue(i)))
+          return false;
+    } else {
+      for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i)
+        if (!canGetComponent(*i))
+          return false;
+    }
+    return true;
+  }
+
+  void Scalarize::gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs)
+  {
+    componentArgs.clear();
+    for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end(); i != e; ++i)
+      componentArgs.push_back(getComponent(component, *i));
+  }
+
+  Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef<Value*> args)
+  {
+    // TODO: Refine the below into one large switch
+
+    unsigned op = inst->getOpcode();
+    if (inst->isCast()) {
+      assert(args.size() == 1 && "incorrect number of arguments for cast op");
+      return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst));
+    }
+
+    if (inst->isBinaryOp()) {
+      assert(args.size() == 2 && "incorrect number of arguments for binary op");
+      return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]);
+    }
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst)) {
+      PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
+      assert(args.size() % 2 == 0 && "Odd number of arguments for a PHI");
+
+      // Loop over pairs of operands: [Value*, BasicBlock*]
+      for (unsigned int i = 0; i < args.size(); i++) {
+        BasicBlock* bb = phi->getIncomingBlock(i); //dyn_cast<BasicBlock>(args[i+1]);
+        //assert(bb && "Non-basic block incoming block?");
+        res->addIncoming(args[i], bb);
+      }
+
+      return res;
+    }
+
+    if (CmpInst* cmpInst = dyn_cast<CmpInst>(inst)) {
+      assert(args.size() == 2 && "incorrect number of arguments for comparison");
+      return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]);
+    }
+
+    if (isa<SelectInst>(inst)) {
+      assert(args.size() == 3 && "incorrect number of arguments for select");
+      return SelectInst::Create(args[0], args[1], args[2]);
+    }
+
+    if (IntrinsicInst* intr = dyn_cast<IntrinsicInst>(inst)) {
+      if (! IsPerComponentOp(inst))
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic");
+
+      // TODO: Assumption is that all per-component intrinsics have all their
+      // arguments be overloadable. Need to find some way to assert on this
+      // assumption. This is due to how getDeclaration operates; it only takes
+      // a list of types that fit overloadable slots.
+      SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+      // Call instructions have the decl as a last argument, so skip it
+      for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
+        tys.push_back(GetBasicType((*i)->getType()));
+      }
+
+      Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
+      return CallInst::Create(f, args);
+    }
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
+                     //             inst->getOpcodeName());
+    return 0;
+
+  }
+
+
+  void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
+  {
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    for (int i = 0; i < count; ++i) {
+      Value* res;
+      SmallVector<Value*, 8> callArgs(args.begin(), args.end());
+      callArgs.push_back(ConstantInt::get(intTy, i));
+
+      res = builder->CreateCall(f, callArgs);
+      vVals.setComponent(i, res);
+    }
+  }
+
+  void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
+  {
+    int count = GetComponentCount(inst);
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
+           && "not enough arguments passed for instruction");
+
+    VectorValues& vVals = vectorVals[inst];
+
+    for (int i = 0; i < count; ++i) {
+      // Set this component of each arg
+      SmallVector<Value*, 8> callArgs(args.size(), 0);
+      gatherComponents(i, args, callArgs);
+
+      Instruction* res = createScalarInstruction(inst, callArgs);
+
+      vVals.setComponent(i, res);
+      builder->Insert(res);
+    }
+  }
+
+  bool Scalarize::isValid(const Instruction* inst)
+  {
+    // The result
+    if (inst->getType()->isVectorTy())
+        return false;
+
+    // The arguments
+    for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) {
+      const Value* v = (*i);
+      assert(v);
+      if (v->getType()->isVectorTy())
+        return false;
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarize(Instruction* inst)
+  {
+    if (isValid(inst))
+        return false;
+
+    assert(! vectorVals.count(inst) && "We've already scalarized this somehow?");
+    assert((canGetComponentArgs(inst) || isa<PHINode>(inst)) &&
+           "Scalarizing an op whose arguments haven't been scalarized ");
+    builder->SetInsertPoint(inst);
+
+    if (IsPerComponentOp(inst))
+      return scalarizePerComponent(inst);
+
+    if (LoadInst* ld = dyn_cast<LoadInst>(inst))
+      return scalarizeLoad(ld);
+
+    if (CallInst* call = dyn_cast<CallInst>(inst))
+      return scalarizeFuncCall(call);
+
+    if (ExtractElementInst* extr = dyn_cast<ExtractElementInst>(inst))
+      return scalarizeExtract(extr);
+
+    if (InsertElementInst* ins = dyn_cast<InsertElementInst>(inst))
+      return scalarizeInsert(ins);
+
+    if (ShuffleVectorInst* sv = dyn_cast<ShuffleVectorInst>(inst))
+      return scalarizeShuffleVector(sv);
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst))
+      return scalarizePHI(phi);
+
+    if (isa<ExtractValueInst>(inst) || isa<InsertValueInst>(inst))
+      // TODO: need to come up with a struct/array model for scalarization
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops");
+
+    if (StoreInst* st = dyn_cast<StoreInst>(inst))
+      return scalarizeStore(st);
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName());
+    return false;
+  }
+
+  bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv)
+  {
+    //     %res = shuffleVector <n x ty> %foo, <n x ty> bar, <n x i32> <...>
+    // ==> nothing (just make a new VectorValues with the new components)
+    VectorValues& vVals = vectorVals[sv];
+
+    int size = GetComponentCount(sv);
+    int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+    for (int i = 0; i < size; ++i) {
+      int select = sv->getMaskValue(i);
+
+      if (select < 0) {
+        vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+        continue;
+      }
+
+      // Otherwise look up the corresponding component from the correct
+      // source.
+      Value* selectee;
+      if (select < srcSize) {
+        selectee = sv->getOperand(0);
+      } else {
+        // Choose from the second operand
+        select -= srcSize;
+        selectee = sv->getOperand(1);
+      }
+
+      vVals.setComponent(i, getComponent(select, selectee));
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePerComponent(Instruction* inst)
+  {
+    //     dst  = op <n x ty> %foo, <n x ty> %bar
+    // ==> dstx = op ty %foox, ty %barx
+    //     dsty = op ty %fooy, ty %bary
+    //     ...
+
+    SmallVector<Value*, 16> args(inst->op_begin(), inst->op_end());
+
+    makePerComponentScalarizedCalls(inst, args);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePHI(PHINode* phi)
+  {
+    //     dst = phi <n x ty> [ %foo, %bb1 ], [ %bar, %bb2], ...
+    // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ...
+    //     dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ...
+
+    // If the scalar values are all known up-front, then just make the full
+    // phinode now. If they are not yet known (phinode for a loop variant
+    // variable), then deferr the arguments until later
+
+    if (canGetComponentArgs(phi)) {
+      SmallVector<Value*, 8> args(phi->op_begin(), phi->op_end());
+      makePerComponentScalarizedCalls(phi, args);
+    } else {
+      makePerComponentScalarizedCalls(phi, ArrayRef<Value*>());
+      incompletePhis.push_back(phi);
+    }
+
+    return true;
+  }
+
+  void Scalarize::extractFromeVector(Value* insn) {
+    VectorValues& vVals = vectorVals[insn];
+
+    for (int i = 0; i < GetComponentCount(insn); ++i) {
+      Value *cv = ConstantInt::get(intTy, i);
+      Value *EI = builder->CreateExtractElement(insn, cv);
+      vVals.setComponent(i, EI);
+      //unit.fakeInsnMap[EI] = insn;
+      unit.newValueProxy(insn, EI, i, 0);
+    }
+  }
+
+  Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
+    //VectorValues& vVals = vectorVals[writeValue];
+    //unit.vecValuesMap[call] = vectorVals[writeValue];
+
+    //add fake insert instructions to avoid removed
+    Value *II = NULL;
+    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+      Value *vec = II ? II : UndefValue::get(vecValue->getType());
+      Value *cv = ConstantInt::get(intTy, i);
+      II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
+      //unit.vecValuesMap[insn].setComponent(i, getComponent(i, writeValue));
+      //unit.newValueProxy(getComponent(i, vecValue), vecValue, 0, i);
+      //unit.fakeInsnMap[II] = insn;
+    }
+
+    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+      unit.newValueProxy(getComponent(i, vecValue), II, 0, i);
+    }
+    return II;
+  }
+
+  bool Scalarize::scalarizeFuncCall(CallInst* call) {
+    if (Function *F = call->getCalledFunction()) {
+      if (F->getIntrinsicID() != 0) {   //Intrinsic functions
+        NOT_IMPLEMENTED;
+      } else {
+        Value *Callee = call->getCalledValue();
+        const std::string fnName = Callee->getName();
+        auto it = instrinsicMap.map.find(fnName);
+        GBE_ASSERT(it != instrinsicMap.map.end());
+
+        // Get the function arguments
+        CallSite CS(call);
+        CallSite::arg_iterator CI = CS.arg_begin() + 3;
+
+        switch (it->second) {
+          default: break;
+          case GEN_OCL_READ_IMAGE0:
+          case GEN_OCL_READ_IMAGE1:
+          case GEN_OCL_READ_IMAGE2:
+          case GEN_OCL_READ_IMAGE3:
+          case GEN_OCL_READ_IMAGE4:
+          case GEN_OCL_READ_IMAGE5:
+          case GEN_OCL_READ_IMAGE10:
+          case GEN_OCL_READ_IMAGE11:
+          case GEN_OCL_READ_IMAGE12:
+          case GEN_OCL_READ_IMAGE13:
+          case GEN_OCL_READ_IMAGE14:
+          case GEN_OCL_READ_IMAGE15:
+          {
+            extractFromeVector(call);
+            break;
+          }
+          case GEN_OCL_WRITE_IMAGE10:
+          case GEN_OCL_WRITE_IMAGE11:
+          case GEN_OCL_WRITE_IMAGE12:
+          case GEN_OCL_WRITE_IMAGE13:
+          case GEN_OCL_WRITE_IMAGE14:
+          case GEN_OCL_WRITE_IMAGE15:
+            CI++;
+          case GEN_OCL_WRITE_IMAGE0:
+          case GEN_OCL_WRITE_IMAGE1:
+          case GEN_OCL_WRITE_IMAGE2:
+          case GEN_OCL_WRITE_IMAGE3:
+          case GEN_OCL_WRITE_IMAGE4:
+          case GEN_OCL_WRITE_IMAGE5:
+          {
+            *CI = InsertToVector(call, *CI);
+            break;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  bool Scalarize::scalarizeLoad(LoadInst* ld)
+  {
+    extractFromeVector(ld);
+    return false;
+  }
+
+  bool Scalarize::scalarizeStore(StoreInst* st) {
+    st->setOperand(0, InsertToVector(st, st->getValueOperand()));
+    return false;
+  }
+
+  bool Scalarize::scalarizeExtract(ExtractElementInst* extr)
+  {
+    //     %res = extractelement <n X ty> %foo, %i
+    // ==> nothing (just use %foo's %ith component instead of %res)
+
+    if (! isa<Constant>(extr->getOperand(1))) {
+        // TODO: Variably referenced components. Probably handle/emulate through
+        // a series of selects.
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+    //if (isa<Argument>(extr->getOperand(0)))
+    //  return false;
+    int component = GetConstantInt(extr->getOperand(1));
+    Value* v = getComponent(component, extr->getOperand(0));
+    if(extr == v)
+      return false;
+    extr->replaceAllUsesWith(v);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizeInsert(InsertElementInst* ins)
+  {
+    //     %res = insertValue <n x ty> %foo, %i
+    // ==> nothing (just make a new VectorValues with the new component)
+
+    if (! isa<Constant>(ins->getOperand(2))) {
+      // TODO: Variably referenced components. Probably handle/emulate through
+      // a series of selects.
+      NOT_IMPLEMENTED;   //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+
+    int component = GetConstantInt(ins->getOperand(2));
+
+    VectorValues& vVals = vectorVals[ins];
+    for (int i = 0; i < GetComponentCount(ins); ++i) {
+      vVals.setComponent(i, i == component ? ins->getOperand(1)
+                                           : getComponent(i, ins->getOperand(0)));
+    }
+
+    return true;
+  }
+
+  void Scalarize::scalarizeArgs(Function& F)  {
+    if (F.arg_empty())
+      return;
+    ReversePostOrderTraversal<Function*> rpot(&F);
+    BasicBlock::iterator instI = (*rpot.begin())->begin();
+    builder->SetInsertPoint(instI);
+
+    Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+#if LLVM_VERSION_MINOR <= 1
+    const AttrListPtr &PAL = F.getAttributes();
+    uint32_t argID = 1; // Start at one actually
+    for (; I != E; ++I, ++argID) {
+#else
+    for (; I != E; ++I) {
+#endif /* LLVM_VERSION_MINOR <= 1 */
+      Type *type = I->getType();
+
+      if(type->isVectorTy())
+        extractFromeVector(I);
+    }
+    return;
+  }
+
+  bool Scalarize::runOnFunction(Function& F)
+  {
+    switch (F.getCallingConv()) {
+    case CallingConv::PTX_Device:
+      return false;
+    case CallingConv::PTX_Kernel:
+      break;
+    default: GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    bool changed = false;
+    module = F.getParent();
+    intTy = IntegerType::get(module->getContext(), 32);
+    floatTy = Type::getFloatTy(module->getContext());
+    builder = new IRBuilder<>(module->getContext());
+
+    scalarizeArgs(F);
+
+    typedef ReversePostOrderTraversal<Function*> RPOTType;
+    RPOTType rpot(&F);
+    for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
+      for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
+        bool scalarized = scalarize(instI);
+        if (scalarized) {
+          changed = true;
+          // TODO: uncomment when done
+          deadList.push_back(instI);
+        }
+      }
+    }
+
+    // Fill in the incomplete phis
+    for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
+       phiI != phiE; ++phiI) {
+      assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
+
+      // Fill in each component of this phi
+      VectorValues& vVals = vectorVals[*phiI];
+      for (int c = 0; c < GetComponentCount(*phiI); ++c) {
+        PHINode* compPhi = dyn_cast<PHINode>(vVals.getComponent(c));
+        assert(compPhi && "Vector phi got scalarized to non-phis?");
+
+        // Loop over pairs of operands: [Value*, BasicBlock*]
+        for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) {
+          BasicBlock* bb = (*phiI)->getIncomingBlock(i);
+          assert(bb && "Non-basic block incoming block?");
+          compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb);
+        }
+      }
+    }
+
+    dce();
+
+    delete builder;
+    builder = 0;
+
+    return changed;
+  }
+
+  void Scalarize::dce()
+  {
+    //two passes delete for some phinode
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      (*i)->dropAllReferences();
+      if((*i)->use_empty())
+        (*i)->eraseFromParent();
+    }
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      if((*i)->getParent())
+        (*i)->eraseFromParent();
+    }
+    deadList.clear();
+  }
+
+  void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+  {
+  }
+
+  void Scalarize::print(raw_ostream&, const Module*) const
+  {
+      return;
+  }
+  FunctionPass* createScalarizePass(ir::Unit &unit)
+  {
+      return new Scalarize(unit);
+  }
+  char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index ea3d9eb..559cde0 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -69,6 +69,7 @@ namespace gbe
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
       passes.add(createPrintModulePass(&*o));
+    passes.add(createScalarizePass(unit));        // Expand all vector ops
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
-- 
1.7.9.5