[Mesa-dev] [PATCH] R600: Remove AMDILPeepholeOptimizer and replace optimizations with tablegen patterns

Mon May 6 09:48:45 PDT 2013

From: Tom Stellard <thomas.stellard at amd.com>

The BFE optimization was the only one we were actually using, and it was
emitting an intrinsic that we don't support.

https://bugs.freedesktop.org/show_bug.cgi?id=64201
---
 lib/Target/R600/AMDGPUInstructions.td      |   11 +
 lib/Target/R600/AMDGPUTargetMachine.cpp    |    1 -
 lib/Target/R600/AMDILPeepholeOptimizer.cpp | 1215 ----------------------------
 lib/Target/R600/CMakeLists.txt             |    1 -
 lib/Target/R600/R600Instructions.td        |    1 +
 test/CodeGen/R600/bfe_uint.ll              |   26 +
 6 files changed, 38 insertions(+), 1217 deletions(-)
 delete mode 100644 lib/Target/R600/AMDILPeepholeOptimizer.cpp
 create mode 100644 test/CodeGen/R600/bfe_uint.ll

diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index b44d248..d2620b2 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -284,6 +284,17 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
   (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
 >;
 
+// Bitfield extract patterns
+
+def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
+def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
+                            SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+
+class BFEPattern <Instruction BFE> : Pat <
+  (and (srl i32:$x, legalshift32:$y), bfemask:$z),
+  (BFE $x, $y, $z)
+>;
+
 include "R600Instructions.td"
 
 include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 0ec67ce..31fbf32 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() {
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
-  addPass(createAMDGPUPeepholeOpt(*TM));
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
 
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
deleted file mode 100644
index 3a28038..0000000
--- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp
+++ /dev/null
@@ -1,1215 +0,0 @@
-//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
-#include "AMDILDevices.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-
-#include <sstream>
-
-#if 0
-STATISTIC(PointerAssignments, "Number of dynamic pointer "
-    "assigments discovered");
-STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
-#endif
-
-using namespace llvm;
-// The Peephole optimization pass is used to do simple last minute optimizations
-// that are required for correct code or to remove redundant functions
-namespace {
-
-class OpaqueType;
-
-class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
-public:
-  TargetMachine &TM;
-  static char ID;
-  AMDGPUPeepholeOpt(TargetMachine &tm);
-  ~AMDGPUPeepholeOpt();
-  const char *getPassName() const;
-  bool runOnFunction(Function &F);
-  bool doInitialization(Module &M);
-  bool doFinalization(Module &M);
-  void getAnalysisUsage(AnalysisUsage &AU) const;
-protected:
-private:
-  // Function to initiate all of the instruction level optimizations.
-  bool instLevelOptimizations(BasicBlock::iterator *inst);
-  // Quick check to see if we need to dump all of the pointers into the
-  // arena. If this is correct, then we set all pointers to exist in arena. This
-  // is a workaround for aliasing of pointers in a struct/union.
-  bool dumpAllIntoArena(Function &F);
-  // Because I don't want to invalidate any pointers while in the
-  // safeNestedForEachFunction. I push atomic conversions to a vector and handle
-  // it later. This function does the conversions if required.
-  void doAtomicConversionIfNeeded(Function &F);
-  // Because __amdil_is_constant cannot be properly evaluated if
-  // optimizations are disabled, the call's are placed in a vector
-  // and evaluated after the __amdil_image* functions are evaluated
-  // which should allow the __amdil_is_constant function to be
-  // evaluated correctly.
-  void doIsConstCallConversionIfNeeded();
-  bool mChanged;
-  bool mDebug;
-  bool mConvertAtomics;
-  CodeGenOpt::Level optLevel;
-  // Run a series of tests to see if we can optimize a CALL instruction.
-  bool optimizeCallInst(BasicBlock::iterator *bbb);
-  // A peephole optimization to optimize bit extract sequences.
-  bool optimizeBitExtract(Instruction *inst);
-  // A peephole optimization to optimize bit insert sequences.
-  bool optimizeBitInsert(Instruction *inst);
-  bool setupBitInsert(Instruction *base, 
-                      Instruction *&src, 
-                      Constant *&mask, 
-                      Constant *&shift);
-  // Expand the bit field insert instruction on versions of OpenCL that
-  // don't support it.
-  bool expandBFI(CallInst *CI);
-  // Expand the bit field mask instruction on version of OpenCL that 
-  // don't support it.
-  bool expandBFM(CallInst *CI);
-  // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
-  // this case we need to expand them. These functions check for 24bit functions
-  // and then expand.
-  bool isSigned24BitOps(CallInst *CI);
-  void expandSigned24BitOps(CallInst *CI);
-  // One optimization that can occur is that if the required workgroup size is
-  // specified then the result of get_local_size is known at compile time and
-  // can be returned accordingly.
-  bool isRWGLocalOpt(CallInst *CI);
-  // On northern island cards, the division is slightly less accurate than on
-  // previous generations, so we need to utilize a more accurate division. So we
-  // can translate the accurate divide to a normal divide on all other cards.
-  bool convertAccurateDivide(CallInst *CI);
-  void expandAccurateDivide(CallInst *CI);
-  // If the alignment is set incorrectly, it can produce really inefficient
-  // code. This checks for this scenario and fixes it if possible.
-  bool correctMisalignedMemOp(Instruction *inst);
-
-  // If we are in no opt mode, then we need to make sure that
-  // local samplers are properly propagated as constant propagation 
-  // doesn't occur and we need to know the value of kernel defined
-  // samplers at compile time.
-  bool propagateSamplerInst(CallInst *CI);
-
-  // Helper functions
-
-  // Group of functions that recursively calculate the size of a structure based
-  // on it's sub-types.
-  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
-  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
-  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
-  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
-  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
-  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
-  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
-  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
-
-  LLVMContext *mCTX;
-  Function *mF;
-  const AMDGPUSubtarget *mSTM;
-  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
-  SmallVector<CallInst *, 16> isConstVec;
-}; // class AMDGPUPeepholeOpt
-  char AMDGPUPeepholeOpt::ID = 0;
-
-// A template function that has two levels of looping before calling the
-// function with a pointer to the current iterator.
-template<class InputIterator, class SecondIterator, class Function>
-Function safeNestedForEach(InputIterator First, InputIterator Last,
-                              SecondIterator S, Function F) {
-  for ( ; First != Last; ++First) {
-    SecondIterator sf, sl;
-    for (sf = First->begin(), sl = First->end();
-         sf != sl; )  {
-      if (!F(&sf)) {
-        ++sf;
-      } 
-    }
-  }
-  return F;
-}
-
-} // anonymous namespace
-
-namespace llvm {
-  FunctionPass *
-  createAMDGPUPeepholeOpt(TargetMachine &tm) {
-    return new AMDGPUPeepholeOpt(tm);
-  }
-} // llvm namespace
-
-AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
-  : FunctionPass(ID), TM(tm)  {
-  mDebug = DEBUGME;
-  optLevel = TM.getOptLevel();
-
-}
-
-AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()  {
-}
-
-const char *
-AMDGPUPeepholeOpt::getPassName() const  {
-  return "AMDGPU PeepHole Optimization Pass";
-}
-
-bool 
-containsPointerType(Type *Ty)  {
-  if (!Ty) {
-    return false;
-  }
-  switch(Ty->getTypeID()) {
-  default:
-    return false;
-  case Type::StructTyID: {
-    const StructType *ST = dyn_cast<StructType>(Ty);
-    for (StructType::element_iterator stb = ST->element_begin(),
-           ste = ST->element_end(); stb != ste; ++stb) {
-      if (!containsPointerType(*stb)) {
-        continue;
-      }
-      return true;
-    }
-    break;
-  }
-  case Type::VectorTyID:
-  case Type::ArrayTyID:
-    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
-  case Type::PointerTyID:
-    return true;
-  };
-  return false;
-}
-
-bool 
-AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
-  bool dumpAll = false;
-  for (Function::const_arg_iterator cab = F.arg_begin(),
-       cae = F.arg_end(); cab != cae; ++cab) {
-    const Argument *arg = cab;
-    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
-    if (!PT) {
-      continue;
-    }
-    Type *DereferencedType = PT->getElementType();
-    if (!dyn_cast<StructType>(DereferencedType) 
-        ) {
-      continue;
-    }
-    if (!containsPointerType(DereferencedType)) {
-      continue;
-    }
-    // FIXME: Because a pointer inside of a struct/union may be aliased to
-    // another pointer we need to take the conservative approach and place all
-    // pointers into the arena until more advanced detection is implemented.
-    dumpAll = true;
-  }
-  return dumpAll;
-}
-void
-AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
-  if (isConstVec.empty()) {
-    return;
-  }
-  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
-    CallInst *CI = isConstVec[x];
-    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
-    Type *aType = Type::getInt32Ty(*mCTX);
-    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
-      : ConstantInt::get(aType, 0);
-    CI->replaceAllUsesWith(Val);
-    CI->eraseFromParent();
-  }
-  isConstVec.clear();
-}
-void 
-AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
-  // Don't do anything if we don't have any atomic operations.
-  if (atomicFuncs.empty()) {
-    return;
-  }
-  // Change the function name for the atomic if it is required
-  uint32_t size = atomicFuncs.size();
-  for (uint32_t x = 0; x < size; ++x) {
-    atomicFuncs[x].first->setOperand(
-        atomicFuncs[x].first->getNumOperands()-1, 
-        atomicFuncs[x].second);
-
-  }
-  mChanged = true;
-  if (mConvertAtomics) {
-    return;
-  }
-}
-
-bool 
-AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
-  mChanged = false;
-  mF = &MF;
-  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
-  if (mDebug) {
-    MF.dump();
-  }
-  mCTX = &MF.getType()->getContext();
-  mConvertAtomics = true;
-  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
-     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
-                  this));
-
-  doAtomicConversionIfNeeded(MF);
-  doIsConstCallConversionIfNeeded();
-
-  if (mDebug) {
-    MF.dump();
-  }
-  return mChanged;
-}
-
-bool 
-AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
-  Instruction *inst = (*bbb);
-  CallInst *CI = dyn_cast<CallInst>(inst);
-  if (!CI) {
-    return false;
-  }
-  if (isSigned24BitOps(CI)) {
-    expandSigned24BitOps(CI);
-    ++(*bbb);
-    CI->eraseFromParent();
-    return true;
-  }
-  if (propagateSamplerInst(CI)) {
-    return false;
-  }
-  if (expandBFI(CI) || expandBFM(CI)) {
-    ++(*bbb);
-    CI->eraseFromParent();
-    return true;
-  }
-  if (convertAccurateDivide(CI)) {
-    expandAccurateDivide(CI);
-    ++(*bbb);
-    CI->eraseFromParent();
-    return true;
-  }
-
-  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
-  if (calleeName.startswith("__amdil_is_constant")) {
-    // If we do not have optimizations, then this
-    // cannot be properly evaluated, so we add the
-    // call instruction to a vector and process
-    // them at the end of processing after the
-    // samplers have been correctly handled.
-    if (optLevel == CodeGenOpt::None) {
-      isConstVec.push_back(CI);
-      return false;
-    } else {
-      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
-      Type *aType = Type::getInt32Ty(*mCTX);
-      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
-        : ConstantInt::get(aType, 0);
-      CI->replaceAllUsesWith(Val);
-      ++(*bbb);
-      CI->eraseFromParent();
-      return true;
-    }
-  }
-
-  if (calleeName.equals("__amdil_is_asic_id_i32")) {
-    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
-    Type *aType = Type::getInt32Ty(*mCTX);
-    Value *Val = CV;
-    if (Val) {
-      Val = ConstantInt::get(aType, 
-          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
-    } else {
-      Val = ConstantInt::get(aType, 0);
-    }
-    CI->replaceAllUsesWith(Val);
-    ++(*bbb);
-    CI->eraseFromParent();
-    return true;
-  }
-  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
-  if (!F) {
-    return false;
-  } 
-  if (F->getName().startswith("__atom") && !CI->getNumUses() 
-      && F->getName().find("_xchg") == StringRef::npos) {
-    std::string buffer(F->getName().str() + "_noret");
-    F = dyn_cast<Function>(
-          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
-    atomicFuncs.push_back(std::make_pair(CI, F));
-  }
-  
-  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
-      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
-    return false;
-  }
-  if (!mConvertAtomics) {
-    return false;
-  }
-  StringRef name = F->getName();
-  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
-    mConvertAtomics = false;
-  }
-  return false;
-}
-
-bool
-AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, 
-    Instruction *&src, 
-    Constant *&mask, 
-    Constant *&shift) {
-  if (!base) {
-    if (mDebug) {
-      dbgs() << "Null pointer passed into function.\n";
-    }
-    return false;
-  }
-  bool andOp = false;
-  if (base->getOpcode() == Instruction::Shl) {
-    shift = dyn_cast<Constant>(base->getOperand(1));
-  } else if (base->getOpcode() == Instruction::And) {
-    mask = dyn_cast<Constant>(base->getOperand(1));
-    andOp = true;
-  } else {
-    if (mDebug) {
-      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
-    }
-    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
-    return false;
-  }
-  src = dyn_cast<Instruction>(base->getOperand(0));
-  if (!src) {
-    if (mDebug) {
-      dbgs() << "Failed setup since the base operand is not an instruction!\n";
-    }
-    return false;
-  }
-  // If we find an 'and' operation, then we don't need to
-  // find the next operation as we already know the
-  // bits that are valid at this point.
-  if (andOp) {
-    return true;
-  }
-  if (src->getOpcode() == Instruction::Shl && !shift) {
-    shift = dyn_cast<Constant>(src->getOperand(1));
-    src = dyn_cast<Instruction>(src->getOperand(0));
-  } else if (src->getOpcode() == Instruction::And && !mask) {
-    mask = dyn_cast<Constant>(src->getOperand(1));
-  }
-  if (!mask && !shift) {
-    if (mDebug) {
-      dbgs() << "Failed setup since both mask and shift are NULL!\n";
-    }
-    // Did not find a constant mask or a shift.
-    return false;
-  }
-  return true;
-}
-bool
-AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
-  if (!inst) {
-    return false;
-  }
-  if (!inst->isBinaryOp()) {
-    return false;
-  }
-  if (inst->getOpcode() != Instruction::Or) {
-    return false;
-  }
-  if (optLevel == CodeGenOpt::None) {
-    return false;
-  }
-  // We want to do an optimization on a sequence of ops that in the end equals a
-  // single ISA instruction.
-  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
-  // Some simplified versions of this pattern are as follows:
-  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
-  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
-  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
-  // (A & B) | (D << F) when (1 << F) >= B
-  // (A << C) | (D & E) when (1 << C) >= E
-  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
-    // The HD4XXX hardware doesn't support the ubit_insert instruction.
-    return false;
-  }
-  Type *aType = inst->getType();
-  bool isVector = aType->isVectorTy();
-  int numEle = 1;
-  // This optimization only works on 32bit integers.
-  if (aType->getScalarType()
-      != Type::getInt32Ty(inst->getContext())) {
-    return false;
-  }
-  if (isVector) {
-    const VectorType *VT = dyn_cast<VectorType>(aType);
-    numEle = VT->getNumElements();
-    // We currently cannot support more than 4 elements in a intrinsic and we
-    // cannot support Vec3 types.
-    if (numEle > 4 || numEle == 3) {
-      return false;
-    }
-  }
-  // TODO: Handle vectors.
-  if (isVector) {
-    if (mDebug) {
-      dbgs() << "!!! Vectors are not supported yet!\n";
-    }
-    return false;
-  }
-  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
-  Constant *LHSMask = NULL, *RHSMask = NULL;
-  Constant *LHSShift = NULL, *RHSShift = NULL;
-  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
-  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
-  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
-    if (mDebug) {
-      dbgs() << "Found an OR Operation that failed setup!\n";
-      inst->dump();
-      if (LHS) { LHS->dump(); }
-      if (LHSSrc) { LHSSrc->dump(); }
-      if (LHSMask) { LHSMask->dump(); }
-      if (LHSShift) { LHSShift->dump(); }
-    }
-    // There was an issue with the setup for BitInsert.
-    return false;
-  }
-  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
-    if (mDebug) {
-      dbgs() << "Found an OR Operation that failed setup!\n";
-      inst->dump();
-      if (RHS) { RHS->dump(); }
-      if (RHSSrc) { RHSSrc->dump(); }
-      if (RHSMask) { RHSMask->dump(); }
-      if (RHSShift) { RHSShift->dump(); }
-    }
-    // There was an issue with the setup for BitInsert.
-    return false;
-  }
-  if (mDebug) {
-    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
-    dbgs() << "Op:        "; inst->dump();
-    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
-    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
-  }
-  Constant *offset = NULL;
-  Constant *width = NULL;
-  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
-  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
-  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
-  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
-  lhsMaskVal = (LHSMask 
-      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
-  rhsMaskVal = (RHSMask 
-      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
-  lhsShiftVal = (LHSShift 
-      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
-  rhsShiftVal = (RHSShift 
-      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
-  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
-  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
-  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
-  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
-  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
-  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
-    return false;
-  }
-  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
-    offset = ConstantInt::get(aType, lhsMaskOffset, false);
-    width = ConstantInt::get(aType, lhsMaskWidth, false);
-    RHSSrc = RHS;
-    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
-      return false;
-    }
-    if (!LHSShift) {
-      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
-          "MaskShr", LHS);
-    } else if (lhsShiftVal != lhsMaskOffset) {
-      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
-          "MaskShr", LHS);
-    }
-    if (mDebug) {
-      dbgs() << "Optimizing LHS!\n";
-    }
-  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
-    offset = ConstantInt::get(aType, rhsMaskOffset, false);
-    width = ConstantInt::get(aType, rhsMaskWidth, false);
-    LHSSrc = RHSSrc;
-    RHSSrc = LHS;
-    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
-      return false;
-    }
-    if (!RHSShift) {
-      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
-          "MaskShr", RHS);
-    } else if (rhsShiftVal != rhsMaskOffset) {
-      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
-          "MaskShr", RHS);
-    }
-    if (mDebug) {
-      dbgs() << "Optimizing RHS!\n";
-    }
-  } else {
-    if (mDebug) {
-      dbgs() << "Failed constraint 3!\n";
-    }
-    return false;
-  }
-  if (mDebug) {
-    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
-    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
-    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
-    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
-  }
-  if (!offset || !width) {
-    if (mDebug) {
-      dbgs() << "Either width or offset are NULL, failed detection!\n";
-    }
-    return false;
-  }
-  // Lets create the function signature.
-  std::vector<Type *> callTypes;
-  callTypes.push_back(aType);
-  callTypes.push_back(aType);
-  callTypes.push_back(aType);
-  callTypes.push_back(aType);
-  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
-  std::string name = "__amdil_ubit_insert";
-  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
-  Function *Func = 
-    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
-        getOrInsertFunction(StringRef(name), funcType));
-  Value *Operands[4] = {
-    width,
-    offset,
-    LHSSrc,
-    RHSSrc
-  };
-  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
-  if (mDebug) {
-    dbgs() << "Old Inst: ";
-    inst->dump();
-    dbgs() << "New Inst: ";
-    CI->dump();
-    dbgs() << "\n\n";
-  }
-  CI->insertBefore(inst);
-  inst->replaceAllUsesWith(CI);
-  return true;
-}
-
-bool 
-AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
-  if (!inst) {
-    return false;
-  }
-  if (!inst->isBinaryOp()) {
-    return false;
-  }
-  if (inst->getOpcode() != Instruction::And) {
-    return false;
-  }
-  if (optLevel == CodeGenOpt::None) {
-    return false;
-  }
-  // We want to do some simple optimizations on Shift right/And patterns. The
-  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
-  // value smaller than 32 and C is a mask. If C is a constant value, then the
-  // following transformation can occur. For signed integers, it turns into the
-  // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
-  // integers, it turns into the function call dst =
-  // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
-  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
-  // Evergreen hardware.
-  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
-    // This does not work on HD4XXX hardware.
-    return false;
-  }
-  Type *aType = inst->getType();
-  bool isVector = aType->isVectorTy();
-
-  // XXX Support vector types
-  if (isVector) {
-    return false;
-  }
-  int numEle = 1;
-  // This only works on 32bit integers
-  if (aType->getScalarType()
-      != Type::getInt32Ty(inst->getContext())) {
-    return false;
-  }
-  if (isVector) {
-    const VectorType *VT = dyn_cast<VectorType>(aType);
-    numEle = VT->getNumElements();
-    // We currently cannot support more than 4 elements in a intrinsic and we
-    // cannot support Vec3 types.
-    if (numEle > 4 || numEle == 3) {
-      return false;
-    }
-  }
-  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
-  // If the first operand is not a shift instruction, then we can return as it
-  // doesn't match this pattern.
-  if (!ShiftInst || !ShiftInst->isShift()) {
-    return false;
-  }
-  // If we are a shift left, then we need don't match this pattern.
-  if (ShiftInst->getOpcode() == Instruction::Shl) {
-    return false;
-  }
-  bool isSigned = ShiftInst->isArithmeticShift();
-  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
-  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
-  // Lets make sure that the shift value and the and mask are constant integers.
-  if (!AndMask || !ShrVal) {
-    return false;
-  }
-  Constant *newMaskConst;
-  Constant *shiftValConst;
-  if (isVector) {
-    // Handle the vector case
-    std::vector<Constant *> maskVals;
-    std::vector<Constant *> shiftVals;
-    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
-    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
-    Type *scalarType = AndMaskVec->getType()->getScalarType();
-    assert(AndMaskVec->getNumOperands() ==
-           ShrValVec->getNumOperands() && "cannot have a "
-           "combination where the number of elements to a "
-           "shift and an and are different!");
-    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
-      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
-      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
-      if (!AndCI || !ShiftIC) {
-        return false;
-      }
-      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
-      if (!isMask_32(maskVal)) {
-        return false;
-      }
-      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
-      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
-      // If the mask or shiftval is greater than the bitcount, then break out.
-      if (maskVal >= 32 || shiftVal >= 32) {
-        return false;
-      }
-      // If the mask val is greater than the the number of original bits left
-      // then this optimization is invalid.
-      if (maskVal > (32 - shiftVal)) {
-        return false;
-      }
-      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
-      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
-    }
-    newMaskConst = ConstantVector::get(maskVals);
-    shiftValConst = ConstantVector::get(shiftVals);
-  } else {
-    // Handle the scalar case
-    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
-    // This must be a mask value where all lower bits are set to 1 and then any
-    // bit higher is set to 0.
-    if (!isMask_32(maskVal)) {
-      return false;
-    }
-    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
-    // Count the number of bits set in the mask, this is the width of the
-    // resulting bit set that is extracted from the source value.
-    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
-    // If the mask or shift val is greater than the bitcount, then break out.
-    if (maskVal >= 32 || shiftVal >= 32) {
-      return false;
-    }
-    // If the mask val is greater than the the number of original bits left then
-    // this optimization is invalid.
-    if (maskVal > (32 - shiftVal)) {
-      return false;
-    }
-    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
-    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
-  }
-  // Lets create the function signature.
-  std::vector<Type *> callTypes;
-  callTypes.push_back(aType);
-  callTypes.push_back(aType);
-  callTypes.push_back(aType);
-  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
-  std::string name = "llvm.AMDGPU.bit.extract.u32";
-  if (isVector) {
-    name += ".v" + itostr(numEle) + "i32";
-  } else {
-    name += ".";
-  }
-  // Lets create the function.
-  Function *Func = 
-    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
-                       getOrInsertFunction(StringRef(name), funcType));
-  Value *Operands[3] = {
-    ShiftInst->getOperand(0),
-    shiftValConst,
-    newMaskConst
-  };
-  // Lets create the Call with the operands
-  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
-  CI->setDoesNotAccessMemory();
-  CI->insertBefore(inst);
-  inst->replaceAllUsesWith(CI);
-  return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
-  if (!CI) {
-    return false;
-  }
-  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
-  if (!LHS->getName().startswith("__amdil_bfi")) {
-    return false;
-  }
-  Type* type = CI->getOperand(0)->getType();
-  Constant *negOneConst = NULL;
-  if (type->isVectorTy()) {
-    std::vector<Constant *> negOneVals;
-    negOneConst = ConstantInt::get(CI->getContext(), 
-        APInt(32, StringRef("-1"), 10));
-    for (size_t x = 0,
-        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
-      negOneVals.push_back(negOneConst);
-    }
-    negOneConst = ConstantVector::get(negOneVals);
-  } else {
-    negOneConst = ConstantInt::get(CI->getContext(), 
-        APInt(32, StringRef("-1"), 10));
-  }
-  // __amdil_bfi => (A & B) | (~A & C)
-  BinaryOperator *lhs = 
-    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
-        CI->getOperand(1), "bfi_and", CI);
-  BinaryOperator *rhs =
-    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
-        "bfi_not", CI);
-  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
-      "bfi_and", CI);
-  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
-  CI->replaceAllUsesWith(lhs);
-  return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
-  if (!CI) {
-    return false;
-  }
-  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
-  if (!LHS->getName().startswith("__amdil_bfm")) {
-    return false;
-  }
-  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
-  Constant *newMaskConst = NULL;
-  Constant *newShiftConst = NULL;
-  Type* type = CI->getOperand(0)->getType();
-  if (type->isVectorTy()) {
-    std::vector<Constant*> newMaskVals, newShiftVals;
-    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
-    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
-    for (size_t x = 0,
-        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
-      newMaskVals.push_back(newMaskConst);
-      newShiftVals.push_back(newShiftConst);
-    }
-    newMaskConst = ConstantVector::get(newMaskVals);
-    newShiftConst = ConstantVector::get(newShiftVals);
-  } else {
-    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
-    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
-  }
-  BinaryOperator *lhs =
-    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
-        newMaskConst, "bfm_mask", CI);
-  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
-      lhs, "bfm_shl", CI);
-  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
-      newShiftConst, "bfm_sub", CI);
-  BinaryOperator *rhs =
-    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
-        newMaskConst, "bfm_mask", CI);
-  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
-  CI->replaceAllUsesWith(lhs);
-  return true;
-}
-
-bool
-AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
-  Instruction *inst = (*bbb);
-  if (optimizeCallInst(bbb)) {
-    return true;
-  }
-  if (optimizeBitExtract(inst)) {
-    return false;
-  }
-  if (optimizeBitInsert(inst)) {
-    return false;
-  }
-  if (correctMisalignedMemOp(inst)) {
-    return false;
-  }
-  return false;
-}
-bool
-AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
-  LoadInst *linst = dyn_cast<LoadInst>(inst);
-  StoreInst *sinst = dyn_cast<StoreInst>(inst);
-  unsigned alignment;
-  Type* Ty = inst->getType();
-  if (linst) {
-    alignment = linst->getAlignment();
-    Ty = inst->getType();
-  } else if (sinst) {
-    alignment = sinst->getAlignment();
-    Ty = sinst->getValueOperand()->getType();
-  } else {
-    return false;
-  }
-  unsigned size = getTypeSize(Ty);
-  if (size == alignment || size < alignment) {
-    return false;
-  }
-  if (!Ty->isStructTy()) {
-    return false;
-  }
-  if (alignment < 4) {
-    if (linst) {
-      linst->setAlignment(0);
-      return true;
-    } else if (sinst) {
-      sinst->setAlignment(0);
-      return true;
-    }
-  }
-  return false;
-}
-bool 
-AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
-  if (!CI) {
-    return false;
-  }
-  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
-  std::string namePrefix = LHS->getName().substr(0, 14);
-  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
-      && namePrefix != "__amdil__imul24_high") {
-    return false;
-  }
-  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
-    return false;
-  }
-  return true;
-}
-
-void 
-AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
-  assert(isSigned24BitOps(CI) && "Must be a "
-      "signed 24 bit operation to call this function!");
-  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
-  // On 7XX and 8XX we do not have signed 24bit, so we need to
-  // expand it to the following:
-  // imul24 turns into 32bit imul
-  // imad24 turns into 32bit imad
-  // imul24_high turns into 32bit imulhigh
-  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
-    Type *aType = CI->getOperand(0)->getType();
-    bool isVector = aType->isVectorTy();
-    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
-    std::vector<Type*> callTypes;
-    callTypes.push_back(CI->getOperand(0)->getType());
-    callTypes.push_back(CI->getOperand(1)->getType());
-    callTypes.push_back(CI->getOperand(2)->getType());
-    FunctionType *funcType =
-      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
-    std::string name = "__amdil_imad";
-    if (isVector) {
-      name += "_v" + itostr(numEle) + "i32";
-    } else {
-      name += "_i32";
-    }
-    Function *Func = dyn_cast<Function>(
-                       CI->getParent()->getParent()->getParent()->
-                       getOrInsertFunction(StringRef(name), funcType));
-    Value *Operands[3] = {
-      CI->getOperand(0),
-      CI->getOperand(1),
-      CI->getOperand(2)
-    };
-    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
-    nCI->insertBefore(CI);
-    CI->replaceAllUsesWith(nCI);
-  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
-    BinaryOperator *mulOp =
-      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
-          CI->getOperand(1), "imul24", CI);
-    CI->replaceAllUsesWith(mulOp);
-  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
-    Type *aType = CI->getOperand(0)->getType();
-
-    bool isVector = aType->isVectorTy();
-    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
-    std::vector<Type*> callTypes;
-    callTypes.push_back(CI->getOperand(0)->getType());
-    callTypes.push_back(CI->getOperand(1)->getType());
-    FunctionType *funcType =
-      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
-    std::string name = "__amdil_imul_high";
-    if (isVector) {
-      name += "_v" + itostr(numEle) + "i32";
-    } else {
-      name += "_i32";
-    }
-    Function *Func = dyn_cast<Function>(
-                       CI->getParent()->getParent()->getParent()->
-                       getOrInsertFunction(StringRef(name), funcType));
-    Value *Operands[2] = {
-      CI->getOperand(0),
-      CI->getOperand(1)
-    };
-    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
-    nCI->insertBefore(CI);
-    CI->replaceAllUsesWith(nCI);
-  }
-}
-
-bool 
-AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
-  return (CI != NULL
-          && CI->getOperand(CI->getNumOperands() - 1)->getName() 
-          == "__amdil_get_local_size_int");
-}
-
-bool 
-AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
-  if (!CI) {
-    return false;
-  }
-  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
-      && (mSTM->getDeviceName() == "cayman")) {
-    return false;
-  }
-  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) 
-      == "__amdil_improved_div";
-}
-
-void 
-AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
-  assert(convertAccurateDivide(CI)
-         && "expanding accurate divide can only happen if it is expandable!");
-  BinaryOperator *divOp =
-    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
-                           CI->getOperand(1), "fdiv32", CI);
-  CI->replaceAllUsesWith(divOp);
-}
-
-bool
-AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
-  if (optLevel != CodeGenOpt::None) {
-    return false;
-  }
-
-  if (!CI) {
-    return false;
-  }
-
-  unsigned funcNameIdx = 0;
-  funcNameIdx = CI->getNumOperands() - 1;
-  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
-  if (calleeName != "__amdil_image2d_read_norm"
-   && calleeName != "__amdil_image2d_read_unnorm"
-   && calleeName != "__amdil_image3d_read_norm"
-   && calleeName != "__amdil_image3d_read_unnorm") {
-    return false;
-  }
-
-  unsigned samplerIdx = 2;
-  samplerIdx = 1;
-  Value *sampler = CI->getOperand(samplerIdx);
-  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
-  if (!lInst) {
-    return false;
-  }
-
-  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
-    return false;
-  }
-
-  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
-  // If we are loading from what is not a global value, then we
-  // fail and return.
-  if (!gv) {
-    return false;
-  }
-
-  // If we don't have an initializer or we have an initializer and
-  // the initializer is not a 32bit integer, we fail.
-  if (!gv->hasInitializer() 
-      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
-      return false;
-  }
-
-  // Now that we have the global variable initializer, lets replace
-  // all uses of the load instruction with the samplerVal and
-  // reparse the __amdil_is_constant() function.
-  Constant *samplerVal = gv->getInitializer();
-  lInst->replaceAllUsesWith(samplerVal);
-  return true;
-}
-
-bool 
-AMDGPUPeepholeOpt::doInitialization(Module &M)  {
-  return false;
-}
-
-bool 
-AMDGPUPeepholeOpt::doFinalization(Module &M)  {
-  return false;
-}
-
-void 
-AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
-  AU.addRequired<MachineFunctionAnalysis>();
-  FunctionPass::getAnalysisUsage(AU);
-  AU.setPreservesAll();
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
-  size_t size = 0;
-  if (!T) {
-    return size;
-  }
-  switch (T->getTypeID()) {
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-  case Type::LabelTyID:
-    assert(0 && "These types are not supported by this backend");
-  default:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-    size = T->getPrimitiveSizeInBits() >> 3;
-    break;
-  case Type::PointerTyID:
-    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
-    break;
-  case Type::IntegerTyID:
-    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
-    break;
-  case Type::StructTyID:
-    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
-    break;
-  case Type::ArrayTyID:
-    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
-    break;
-  case Type::FunctionTyID:
-    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
-    break;
-  case Type::VectorTyID:
-    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
-    break;
-  };
-  return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
-    bool dereferencePtr) {
-  size_t size = 0;
-  if (!ST) {
-    return size;
-  }
-  Type *curType;
-  StructType::element_iterator eib;
-  StructType::element_iterator eie;
-  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
-    curType = *eib;
-    size += getTypeSize(curType, dereferencePtr);
-  }
-  return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
-    bool dereferencePtr) {
-  return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
-    bool dereferencePtr) {
-    assert(0 && "Should not be able to calculate the size of an function type");
-    return 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
-    bool dereferencePtr) {
-  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
-                                    dereferencePtr) * AT->getNumElements())
-                     : 0);
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
-    bool dereferencePtr) {
-  return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
-    bool dereferencePtr) {
-  if (!PT) {
-    return 0;
-  }
-  Type *CT = PT->getElementType();
-  if (CT->getTypeID() == Type::StructTyID &&
-      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
-    return getTypeSize(dyn_cast<StructType>(CT));
-  } else if (dereferencePtr) {
-    size_t size = 0;
-    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
-      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
-    }
-    return size;
-  } else {
-    return 4;
-  }
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
-    bool dereferencePtr) {
-  //assert(0 && "Should not be able to calculate the size of an opaque type");
-  return 4;
-}
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 2ad2047..97f0a40 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen
   AMDILISelDAGToDAG.cpp
   AMDILISelLowering.cpp
   AMDILNIDevice.cpp
-  AMDILPeepholeOptimizer.cpp
   AMDILSIDevice.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUFrameLowering.cpp
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8a60add..8f47523 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1615,6 +1615,7 @@ let Predicates = [isEGorCayman] in {
                                                i32:$src2))],
     VecALU
   >;
+  def : BFEPattern <BFE_UINT_eg>;
 
   def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
   defm : BFIPatterns <BFI_INT_eg>;
diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
new file mode 100644
index 0000000..92570c3
--- /dev/null
+++ b/test/CodeGen/R600/bfe_uint.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @bfe_def
+; CHECK: BFE_UINT
+define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+entry:
+  %0 = lshr i32 %x, 5
+  %1 = and i32 %0, 15 ; 0xf
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; This program could be implemented using a BFE_UINT instruction; however,
+; since the lshr constant + number of bits in the mask is >= 32, it can also be
+; implemented with an LSHR instruction, which is better because LSHR has fewer
+; operands and requires fewer constants.
+
+; CHECK: @bfe_shift
+; CHECK-NOT: BFE_UINT
+define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+entry:
+  %0 = lshr i32 %x, 16
+  %1 = and i32 %0, 65535 ; 0xffff
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.7.11.4