[Mesa-dev] [PATCH 4/4] AMDGPU: New control flow for SI

Sun Dec 9 09:02:59 PST 2012

This patch replaces the control flow handling with a new
pass which structurize the graph before transforming it to
machine instruction. This has a couple of different advantages
and currently fixes 20 piglit tests without a single regression.

It is now a general purpose transformation that could be not
only be used for SI/R6xx, but also for other hardware
implementations that use a form of structurized control flow.

Signed-off-by: Christian König <deathsimple at vodafone.de>
---
 lib/Target/AMDGPU/AMDGPU.h                  |    3 +-
 lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp  |  725 +++++++++++++++++++++++++++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |   13 +-
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp  |    4 -
 lib/Target/AMDGPU/AMDILInstrInfo.td         |   65 ---
 lib/Target/AMDGPU/R600Instructions.td       |   65 +++
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp |  333 ++++++++++++
 lib/Target/AMDGPU/SIFixSGPRLiveness.cpp     |  179 -------
 lib/Target/AMDGPU/SIISelLowering.cpp        |  118 ++++-
 lib/Target/AMDGPU/SIISelLowering.h          |    2 +-
 lib/Target/AMDGPU/SIInstructions.td         |   72 ++-
 lib/Target/AMDGPU/SIIntrinsics.td           |   10 +
 lib/Target/AMDGPU/SILowerControlFlow.cpp    |  285 +++++++----
 13 files changed, 1491 insertions(+), 383 deletions(-)
 create mode 100644 lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp
 create mode 100644 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
 delete mode 100644 lib/Target/AMDGPU/SIFixSGPRLiveness.cpp

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 40864b0..0f5125d 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -25,13 +25,14 @@ FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 
 // SI Passes
+FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
-FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm);
 
 // Passes common to R600 and SI
+Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp b/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp
new file mode 100644
index 0000000..6ddb5bf
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp
@@ -0,0 +1,725 @@
+//===-- AMDGPUStructurizeCFG.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass implemented in this file transforms the programs control flow graph
+// into a form that's suitable for code generation on hardware that implements
+// control flow by execution masking. This currently includes all AMD GPUs but
+// may as well be useful for other types of hardware.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
+typedef ArrayRef<BasicBlock*> BBVecRef;
+
+typedef SmallVector<RegionNode*, 8> RNVector;
+typedef SmallVector<BasicBlock*, 8> BBVector;
+typedef SmallVector<BBValuePair, 2> BBValueVector;
+
+typedef DenseMap<PHINode *, BBValueVector> PhiMap;
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
+typedef DenseMap<BasicBlock *, unsigned> VisitedMap;
+
+// The name for newly created blocks.
+
+static const char *FlowBlockName = "Flow";
+
+// Transforms the control flow graph on one single entry/exit region at a time.
+//
+// After the transform all "If"/"Then"/"Else" style control flow looks like
+// this:
+//
+// 1
+// ||
+// | |
+// 2 |
+// | /
+// |/   
+// 3
+// ||   Where:
+// | |  1 = "If" block, calculates the condition
+// 4 |  2 = "Then" subregion, runs if the condition is true
+// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the control flow
+// |/   4 = "Else" optional subregion, runs if the condition is false
+// 5    5 = "End" block, also rejoins the control flow
+//
+// Control flow is expressed as a branch where the true exit goes into the
+// "Then"/"Else" region, while the false exit skips the region
+// The condition for the optional "Else" region is expressed as a PHI node.
+// The incomming values of the PHI node are true for the "If" edge and false
+// for the "Then" edge.
+//
+// Additionally to that even complicated loops look like this:
+//
+// 1
+// ||
+// | |
+// 2 ^  Where:
+// | /  1 = "Entry" block
+// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
+// 3    3 = "Flow" block, with back edge to entry block
+// |
+//
+// The back edge of the "Flow" block is always on the false side of the branch
+// while the true side continues the general flow. So the loop condition
+// consist of a network of PHI nodes where the true incoming values expresses
+// breaks and the false values expresses continue states.
+class AMDGPUStructurizeCFG : public RegionPass {
+
+  static char ID;
+
+  Type *Boolean;
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+
+  Function *Function;
+  Region *ParentRegion;
+
+  DominatorTree *DT;
+
+  RNVector Order;
+  VisitedMap Visited;
+  PredMap Predicates;
+  BBPhiMap DeletedPhis;
+  BBVector FlowsInserted;
+
+  BasicBlock *LoopStart;
+  BasicBlock *LoopEnd;
+  BBPredicates LoopPred;
+
+  void orderNodes();
+
+  void buildPredicate(BranchInst *Term, unsigned Idx,
+                      BBPredicates &Pred, bool Invert);
+
+  void analyzeBlock(BasicBlock *BB);
+
+  void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx);
+
+  void collectInfos();
+
+  bool dominatesPredicates(BasicBlock *A, BasicBlock *B);
+
+  void killTerminator(BasicBlock *BB);
+
+  RegionNode *skipChained(RegionNode *Node);
+
+  void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+  BasicBlock *getNextFlow(BasicBlock *Prev);
+
+  bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node);
+
+  BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node);
+
+  void createFlow();
+
+  void insertConditions();
+
+  void rebuildSSA();
+
+public:
+  AMDGPUStructurizeCFG():
+    RegionPass(ID) {
+
+    initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool doInitialization(Region *R, RGPassManager &RGM);
+
+  virtual bool runOnRegion(Region *R, RGPassManager &RGM);
+
+  virtual const char *getPassName() const {
+    return "AMDGPU simplify control flow";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<DominatorTree>();
+    RegionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+char AMDGPUStructurizeCFG::ID = 0;
+
+// Initialize the types and constants used in the pass
+bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+
+  LLVMContext &Context = R->getEntry()->getContext();
+
+  Boolean = Type::getInt1Ty(Context);
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+
+  return false;
+}
+
+// Build up the general order of nodes
+void AMDGPUStructurizeCFG::orderNodes() {
+
+  scc_iterator<Region *> I = scc_begin(ParentRegion),
+                         E = scc_end(ParentRegion);
+  for (Order.clear(); I != E; ++I) {
+    std::vector<RegionNode *> &Nodes = *I;
+    Order.append(Nodes.begin(), Nodes.end());
+  }
+}
+
+// Build blocks and loop predicates
+void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx,
+                                          BBPredicates &Pred, bool Invert) {
+
+  Value *True = Invert ? BoolFalse : BoolTrue;
+  Value *False = Invert ? BoolTrue : BoolFalse;
+
+  RegionInfo *RI = ParentRegion->getRegionInfo();
+  BasicBlock *BB = Term->getParent();
+
+  // Handle the case where multiple regions start at the same block
+  Region *R = BB != ParentRegion->getEntry() ?
+              RI->getRegionFor(BB) : ParentRegion;
+
+  if (R == ParentRegion) {
+    // It's a top level block in our region
+    Value *Cond = True;
+    if (Term->isConditional()) {
+      BasicBlock *Other = Term->getSuccessor(!Idx);
+
+      if (Visited.count(Other)) {
+        if (!Pred.count(Other))
+          Pred[Other] = False;
+
+        if (!Pred.count(BB))
+          Pred[BB] = True;
+        return;
+      }
+      Cond = Term->getCondition();
+
+      if (Idx != Invert)
+        Cond = BinaryOperator::CreateNot(Cond, "", Term);
+    }
+
+    Pred[BB] = Cond;
+
+  } else if (ParentRegion->contains(R)) {
+    // It's a block in a sub region
+    while(R->getParent() != ParentRegion)
+      R = R->getParent();
+
+    Pred[R->getEntry()] = True;
+
+  } else {
+    // It's a branch from outside into our parent region
+    Pred[BB] = True;
+  }
+}
+
+// Analyze the successors of each block and build up predicates
+void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) {
+
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+  BBPredicates &Pred = Predicates[BB];
+
+  for (; PI != PE; ++PI) {
+    BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
+
+    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *Succ = Term->getSuccessor(i);
+      if (Succ != BB)
+        continue;
+      buildPredicate(Term, i, Pred, false);
+    }
+  }
+}
+
+// Analyze the conditions leading to loop to a previous block
+void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) {
+
+  BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+  for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+    BasicBlock *Succ = Term->getSuccessor(i);
+
+    // Ignore it if it's not a back edge
+    if (!Visited.count(Succ))
+      continue;
+
+    buildPredicate(Term, i, LoopPred, true);
+
+    LoopEnd = BB;
+    if (Visited[Succ] < LoopIdx) {
+      LoopIdx = Visited[Succ];
+      LoopStart = Succ;
+    }
+  }
+}
+
+// Collect various loop and predicate infos
+void AMDGPUStructurizeCFG::collectInfos()
+{
+  unsigned Number = 0, LoopIdx = ~0;
+
+  // Reset predicate
+  Predicates.clear();
+
+  // and loop infos
+  LoopStart = LoopEnd = 0;
+  LoopPred.clear();
+
+  RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
+  for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) {
+
+    // Analyze all the conditions leading to a node
+    analyzeBlock((*OI)->getEntry());
+
+    if ((*OI)->isSubRegion())
+      continue;
+
+    // Find the first/last loop nodes and loop predicates
+    analyzeLoop((*OI)->getNodeAs<BasicBlock>(), LoopIdx);
+  }
+}
+
+// Does A dominate all the predicates of B ?
+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) {
+  BBPredicates &Preds = Predicates[B];
+  for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+       PI != PE; ++PI) {
+
+    if (!DT->dominates(A, PI->first))
+      return false;
+  }
+  return true;
+}
+
+// Remove phi values from all successors and the remove the terminator.
+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
+
+  TerminatorInst *Term = BB->getTerminator();
+  if (!Term)
+    return;
+
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+       SI != SE; ++SI) {
+
+    delPhiValues(BB, *SI);
+  }
+
+  Term->eraseFromParent();
+}
+
+// First: Skip forward to the first region node that either isn't a subregion or not
+// dominating it's exit, remove all the skipped nodes from the node order.
+//
+// Second: Handle the first successor directly if the resulting nodes successor
+// predicates are still dominated by the original entry
+RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) {
+
+  BasicBlock *Entry = Node->getEntry();
+
+  // Skip forward as long as it is just a linear flow
+  while (true) {
+    BasicBlock *Entry = Node->getEntry();
+    BasicBlock *Exit;
+
+    if (Node->isSubRegion()) {
+      Exit = Node->getNodeAs<Region>()->getExit();
+    } else {
+      TerminatorInst *Term = Entry->getTerminator();
+      if (Term->getNumSuccessors() != 1)
+        break;
+      Exit = Term->getSuccessor(0);
+    }
+
+    // It's a back edge, break here so we can insert a loop node
+    if (!Visited.count(Exit))
+      return Node;
+
+    // More than node edges are pointing to exit
+    if (!DT->dominates(Entry, Exit))
+      return Node;
+
+    RegionNode *Next = ParentRegion->getNode(Exit);
+    RNVector::iterator I = std::find(Order.begin(), Order.end(), Next);
+    assert(I != Order.end());
+
+    Visited.erase(Next->getEntry());
+    Order.erase(I);
+    Node = Next;
+  }
+
+  BasicBlock *BB = Node->getEntry();
+  TerminatorInst *Term = BB->getTerminator();
+  if (Term->getNumSuccessors() != 2)
+    return Node;
+
+  // Our node has exactly two succesors, check if we can handle
+  // any of them directly
+  BasicBlock *Succ = Term->getSuccessor(0);
+  if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) {
+    Succ = Term->getSuccessor(1);
+    if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ))
+      return Node;
+  } else {
+    BasicBlock *Succ2 = Term->getSuccessor(1);
+    if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] &&
+        dominatesPredicates(Entry, Succ2))
+      Succ = Succ2;
+  }
+
+  RegionNode *Next = ParentRegion->getNode(Succ);
+  RNVector::iterator E = Order.end();
+  RNVector::iterator I = std::find(Order.begin(), E, Next);
+  assert(I != E);
+
+  killTerminator(BB);
+  FlowsInserted.push_back(BB);
+  Visited.erase(Succ);
+  Order.erase(I);
+  return ParentRegion->getNode(wireFlowBlock(BB, Next));
+}
+
+// Remove all PHI values coming from "From" into "To"
+// and remember them in DeletedPhis
+void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+
+  PhiMap &Map = DeletedPhis[To];
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    while (Phi.getBasicBlockIndex(From) != -1) {
+      Value *Deleted = Phi.removeIncomingValue(From, false);
+      Map[&Phi].push_back(std::make_pair(From, Deleted));
+    }
+  }
+}
+
+// Add the PHI values back once we knew the new predecessor
+void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+
+  if (!DeletedPhis.count(To))
+    return;
+
+  PhiMap &Map = DeletedPhis[To];
+  SSAUpdater Updater;
+
+  for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
+
+    PHINode *Phi = I->first;
+    Updater.Initialize(Phi->getType(), "");
+    BasicBlock *Fallback = To;
+    bool HaveFallback = false;
+
+    for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end();
+         VI != VE; ++VI) {
+
+      Updater.AddAvailableValue(VI->first, VI->second);
+      BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first);
+      if (Dom == VI->first)
+        HaveFallback = true;
+      else if (Dom != Fallback)
+        HaveFallback = false;
+      Fallback = Dom;
+    }
+    if (!HaveFallback) {
+      Value *Undef = UndefValue::get(Phi->getType());
+      Updater.AddAvailableValue(Fallback, Undef);
+    }
+
+    Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From);
+  }
+  DeletedPhis.erase(To);
+}
+
+// Create a new flow node and update dominator tree and region info
+BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) {
+
+  LLVMContext &Context = Function->getContext();
+  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+                       Order.back()->getEntry();
+  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+                                        Function, Insert);
+  DT->addNewBlock(Flow, Prev);
+  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+  FlowsInserted.push_back(Flow);
+  return Flow;
+}
+
+// Can we predict that this node will always be called?
+bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev,
+                                             BasicBlock *Node) {
+
+  BBPredicates &Preds = Predicates[Node];
+  bool Dominated = false;
+
+  for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+
+    if (I->second != BoolTrue)
+      return false;
+
+    if (!Dominated && DT->dominates(I->first, Prev))
+      Dominated = true;
+  }
+  return Dominated;
+}
+
+// Wire up the new control flow by inserting or updating the branch
+// instructions at node exits
+BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev,
+                                                RegionNode *Node) {
+
+  BasicBlock *Entry = Node->getEntry();
+
+  if (LoopStart == Entry) {
+    LoopStart = Prev;
+    LoopPred[Prev] = BoolTrue;
+  }
+
+  // Wire it up temporary, skipChained may recurse into us
+  BranchInst::Create(Entry, Prev);
+  DT->changeImmediateDominator(Entry, Prev);
+  addPhiValues(Prev, Entry);
+
+  Node = skipChained(Node);
+
+  BasicBlock *Next = getNextFlow(Prev);
+  if (!isPredictableTrue(Prev, Entry)) {
+    // Let Prev point to entry and next block
+    Prev->getTerminator()->eraseFromParent();
+    BranchInst::Create(Entry, Next, BoolUndef, Prev);
+  } else {
+    DT->changeImmediateDominator(Next, Entry);
+  }
+
+  // Let node exit(s) point to next block
+  if (Node->isSubRegion()) {
+    Region *SubRegion = Node->getNodeAs<Region>();
+    BasicBlock *Exit = SubRegion->getExit();
+
+    // Find all the edges from the sub region to the exit
+    BBVector ToDo;
+    for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
+      if (SubRegion->contains(*I))
+        ToDo.push_back(*I);
+    }
+
+    // Modify the edges to point to the new flow block
+    for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) {
+      delPhiValues(*I, Exit);
+      TerminatorInst *Term = (*I)->getTerminator();
+      Term->replaceUsesOfWith(Exit, Next);
+    }
+
+    // Update the region info
+    SubRegion->replaceExit(Next);
+
+  } else {
+    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+    killTerminator(BB);
+    BranchInst::Create(Next, BB);
+
+    if (BB == LoopEnd)
+      LoopEnd = 0;
+  }
+
+  return Next;
+}
+
+// Destroy node order and visited map, build up flow order instead.
+// After this function control flow looks like it should be, but
+// branches only have undefined conditions.
+void AMDGPUStructurizeCFG::createFlow() {
+
+  DeletedPhis.clear();
+
+  BasicBlock *Prev = Order.pop_back_val()->getEntry();
+  assert(Prev == ParentRegion->getEntry() && "Incorrect node order!");
+  Visited.erase(Prev);
+
+  if (LoopStart == Prev) {
+    // Loop starts at entry, split entry so that we can predicate it
+    BasicBlock::iterator Insert = Prev->getFirstInsertionPt();
+    BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName);
+    DT->addNewBlock(Split, Prev);
+    ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
+    Predicates[Split] = Predicates[Prev];
+    Order.push_back(ParentRegion->getBBNode(Split));
+    LoopPred[Prev] = BoolTrue;
+
+  } else if (LoopStart == Order.back()->getEntry()) {
+    // Loop starts behind entry, split entry so that we can jump to it
+    Instruction *Term = Prev->getTerminator();
+    BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName);
+    DT->addNewBlock(Split, Prev);
+    ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
+    Prev = Split;
+  }
+
+  killTerminator(Prev);
+  FlowsInserted.clear();
+  FlowsInserted.push_back(Prev);
+
+  while (!Order.empty()) {
+    RegionNode *Node = Order.pop_back_val();
+    Visited.erase(Node->getEntry());
+    Prev = wireFlowBlock(Prev, Node);
+    if (LoopStart && !LoopEnd) {
+      // Create an extra loop end node
+      LoopEnd = Prev;
+      Prev = getNextFlow(LoopEnd);
+      BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd);
+      addPhiValues(LoopEnd, LoopStart);
+    }
+  }
+
+  BasicBlock *Exit = ParentRegion->getExit();
+  BranchInst::Create(Exit, Prev);
+  addPhiValues(Prev, Exit);
+  if (DT->dominates(ParentRegion->getEntry(), Exit))
+    DT->changeImmediateDominator(Exit, Prev);
+
+  if (LoopStart && LoopEnd) {
+    BBVector::iterator FI = std::find(FlowsInserted.begin(),
+                                      FlowsInserted.end(),
+                                      LoopStart);
+    for (; *FI != LoopEnd; ++FI) {
+      addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0));
+    }
+  }
+
+  assert(Order.empty());
+  assert(Visited.empty());
+  assert(DeletedPhis.empty());
+}
+
+// Insert the missing branch conditions
+void AMDGPUStructurizeCFG::insertConditions() {
+
+  SSAUpdater PhiInserter;
+
+  for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end();
+       FI != FE; ++FI) {
+
+    BranchInst *Term = cast<BranchInst>((*FI)->getTerminator());
+    if (Term->isUnconditional())
+      continue;
+
+    PhiInserter.Initialize(Boolean, "");
+    PhiInserter.AddAvailableValue(&Function->getEntryBlock(), BoolFalse);
+
+    BasicBlock *Succ = Term->getSuccessor(0);
+    BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ];
+    for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+         PI != PE; ++PI) {
+
+      PhiInserter.AddAvailableValue(PI->first, PI->second);
+    }
+
+    Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI));
+  }
+}
+
+// Handle a rare case where the disintegrated nodes instructions
+// no longer dominate all their uses
+// Not sure if this is really nessasary
+void AMDGPUStructurizeCFG::rebuildSSA() {
+
+  SSAUpdater Updater;
+  for (Region::block_iterator I = ParentRegion->block_begin(),
+                              E = ParentRegion->block_end();
+       I != E; ++I) {
+
+    BasicBlock *BB = *I;
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+         II != IE; ++II) {
+
+      bool Initialized = false;
+      for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
+
+        Next = I->getNext();
+
+        Instruction *User = cast<Instruction>(I->getUser());
+        if (User->getParent() == BB) {
+          continue;
+
+        } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+          if (UserPN->getIncomingBlock(*I) == BB)
+            continue;
+        }
+
+        if (DT->dominates(II, User))
+          continue;
+
+        if (!Initialized) {
+          Value *Undef = UndefValue::get(II->getType());
+          Updater.Initialize(II->getType(), "");
+          Updater.AddAvailableValue(&Function->getEntryBlock(), Undef);
+          Updater.AddAvailableValue(BB, II);
+          Initialized = true;
+        }
+        Updater.RewriteUseAfterInsertions(*I);
+      }
+    }
+  }
+}
+
+// Run the transformation for each region found
+bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+
+  if (R->isTopLevelRegion())
+    return false;
+
+  Function = R->getEntry()->getParent();
+  ParentRegion = R;
+
+  DT = &getAnalysis<DominatorTree>();
+
+  orderNodes();
+  collectInfos();
+  createFlow();
+  insertConditions();
+  rebuildSSA();
+
+  Order.clear();
+  Visited.clear();
+  Predicates.clear();
+  DeletedPhis.clear();
+  FlowsInserted.clear();
+
+  return true;
+}
+
+// Create the pass
+Pass *llvm::createAMDGPUStructurizeCFGPass()
+{
+  return new AMDGPUStructurizeCFG();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e42fa8a..098d42e 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -91,6 +91,11 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 bool
 AMDGPUPassConfig::addPreISel() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createAMDGPUStructurizeCFGPass());
+    addPass(createSIAnnotateControlFlowPass());
+  }
   return false;
 }
 
@@ -107,9 +112,6 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
     addPass(createSIAssignInterpRegsPass(*TM));
   }
   addPass(createAMDGPUConvertToISAPass(*TM));
-  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
-    addPass(createSIFixSGPRLivenessPass(*TM));
-  }
   return false;
 }
 
@@ -124,11 +126,10 @@ bool AMDGPUPassConfig::addPreSched2() {
 }
 
 bool AMDGPUPassConfig::addPreEmitPass() {
-  addPass(createAMDGPUCFGPreparationPass(*TM));
-  addPass(createAMDGPUCFGStructurizerPass(*TM));
-
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
   if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createAMDGPUCFGPreparationPass(*TM));
+    addPass(createAMDGPUCFGStructurizerPass(*TM));
     addPass(createR600ExpandSpecialInstrsPass(*TM));
     addPass(&FinalizeMachineBundlesID);
   } else {
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 1f276dc..568d281 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -2596,7 +2596,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
     case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
     case AMDGPU::BRANCH_COND_i32:
     case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
-    case AMDGPU::SI_IF_NZ: return AMDGPU::SI_IF_NZ;
     default:
       assert(0 && "internal error");
     }
@@ -2608,7 +2607,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
     case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
     case AMDGPU::BRANCH_COND_i32:
     case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
-    case AMDGPU::SI_IF_Z: return AMDGPU::SI_IF_Z;
     default:
       assert(0 && "internal error");
     }
@@ -2658,8 +2656,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
         return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
       case AMDGPU::BRANCH_COND_i32:
       case AMDGPU::BRANCH_COND_f32:
-      case AMDGPU::SI_IF_NZ:
-      case AMDGPU::SI_IF_Z:
       break;
     default:
       return false;
diff --git a/lib/Target/AMDGPU/AMDILInstrInfo.td b/lib/Target/AMDGPU/AMDILInstrInfo.td
index ac67451..e969bbf 100644
--- a/lib/Target/AMDGPU/AMDILInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDILInstrInfo.td
@@ -206,68 +206,3 @@ multiclass BranchInstr2<string name> {
 // Intrinsics support
 //===--------------------------------------------------------------------===//
 include "AMDILIntrinsics.td"
-
-//===--------------------------------------------------------------------===//
-// Instructions support
-//===--------------------------------------------------------------------===//
-//===---------------------------------------------------------------------===//
-// Custom Inserter for Branches and returns, this eventually will be a
-// seperate pass
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
-  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
-      "; Pseudo unconditional branch instruction",
-      [(br bb:$target)]>;
-  defm BRANCH_COND : BranchConditional<IL_brcond>;
-}
-
-//===---------------------------------------------------------------------===//
-// Flow and Program control Instructions
-//===---------------------------------------------------------------------===//
-let isTerminator=1 in {
-  def SWITCH      : ILFormat< (outs), (ins GPRI32:$src),
-  !strconcat("SWITCH", " $src"), []>;
-  def CASE        : ILFormat< (outs), (ins GPRI32:$src),
-      !strconcat("CASE", " $src"), []>;
-  def BREAK       : ILFormat< (outs), (ins),
-      "BREAK", []>;
-  def CONTINUE    : ILFormat< (outs), (ins),
-      "CONTINUE", []>;
-  def DEFAULT     : ILFormat< (outs), (ins),
-      "DEFAULT", []>;
-  def ELSE        : ILFormat< (outs), (ins),
-      "ELSE", []>;
-  def ENDSWITCH   : ILFormat< (outs), (ins),
-      "ENDSWITCH", []>;
-  def ENDMAIN     : ILFormat< (outs), (ins),
-      "ENDMAIN", []>;
-  def END         : ILFormat< (outs), (ins),
-      "END", []>;
-  def ENDFUNC     : ILFormat< (outs), (ins),
-      "ENDFUNC", []>;
-  def ENDIF       : ILFormat< (outs), (ins),
-      "ENDIF", []>;
-  def WHILELOOP   : ILFormat< (outs), (ins),
-      "WHILE", []>;
-  def ENDLOOP     : ILFormat< (outs), (ins),
-      "ENDLOOP", []>;
-  def FUNC        : ILFormat< (outs), (ins),
-      "FUNC", []>;
-  def RETDYN      : ILFormat< (outs), (ins),
-      "RET_DYN", []>;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
-  defm IFC         : BranchInstr2<"IFC">;
-  defm BREAKC      : BranchInstr2<"BREAKC">;
-  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
-}
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index bd5e9d6..db24c29 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1481,6 +1481,71 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
       "RETURN", [(IL_retflag)]>;
 }
 
+//===--------------------------------------------------------------------===//
+// Instructions support
+//===--------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
+// Custom Inserter for Branches and returns, this eventually will be a
+// seperate pass
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond>;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+let isTerminator=1 in {
+  def SWITCH      : ILFormat< (outs), (ins GPRI32:$src),
+  !strconcat("SWITCH", " $src"), []>;
+  def CASE        : ILFormat< (outs), (ins GPRI32:$src),
+      !strconcat("CASE", " $src"), []>;
+  def BREAK       : ILFormat< (outs), (ins),
+      "BREAK", []>;
+  def CONTINUE    : ILFormat< (outs), (ins),
+      "CONTINUE", []>;
+  def DEFAULT     : ILFormat< (outs), (ins),
+      "DEFAULT", []>;
+  def ELSE        : ILFormat< (outs), (ins),
+      "ELSE", []>;
+  def ENDSWITCH   : ILFormat< (outs), (ins),
+      "ENDSWITCH", []>;
+  def ENDMAIN     : ILFormat< (outs), (ins),
+      "ENDMAIN", []>;
+  def END         : ILFormat< (outs), (ins),
+      "END", []>;
+  def ENDFUNC     : ILFormat< (outs), (ins),
+      "ENDFUNC", []>;
+  def ENDIF       : ILFormat< (outs), (ins),
+      "ENDIF", []>;
+  def WHILELOOP   : ILFormat< (outs), (ins),
+      "WHILE", []>;
+  def ENDLOOP     : ILFormat< (outs), (ins),
+      "ENDLOOP", []>;
+  def FUNC        : ILFormat< (outs), (ins),
+      "FUNC", []>;
+  def RETDYN      : ILFormat< (outs), (ins),
+      "RET_DYN", []>;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+  defm IFC         : BranchInstr2<"IFC">;
+  defm BREAKC      : BranchInstr2<"BREAKC">;
+  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
+}
+
 //===----------------------------------------------------------------------===//
 // ISel Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
new file mode 100644
index 0000000..d1cc0b8
--- /dev/null
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -0,0 +1,333 @@
+//===-- SIAnnotateControlFlow.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Annotates the control flow with hardware specific intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+namespace {
+
+// Complex types used in this pass
+typedef std::pair<BasicBlock *, Value *> StackEntry;
+typedef SmallVector<StackEntry, 16> StackVector;
+
+// Intrinsic names the control flow is annotated with
+static const char *IfIntrinsic = "llvm.SI.if";
+static const char *ElseIntrinsic = "llvm.SI.else";
+static const char *BreakIntrinsic = "llvm.SI.break";
+static const char *IfBreakIntrinsic = "llvm.SI.if.break";
+static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
+static const char *LoopIntrinsic = "llvm.SI.loop";
+static const char *EndCfIntrinsic = "llvm.SI.end.cf";
+
+class SIAnnotateControlFlow : public FunctionPass {
+
+  static char ID;
+
+  Type *Boolean;
+  Type *Void;
+  Type *Int64;
+  Type *ReturnStruct;
+
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+  Constant *Int64Zero;
+
+  Constant *If;
+  Constant *Else;
+  Constant *Break;
+  Constant *IfBreak;
+  Constant *ElseBreak;
+  Constant *Loop;
+  Constant *EndCf;
+
+  DominatorTree *DT;
+  StackVector Stack;
+  SSAUpdater PhiInserter;
+
+  bool isTopOfStack(BasicBlock *BB);
+
+  Value *popSaved();
+
+  void push(BasicBlock *BB, Value *Saved);
+
+  bool isElse(PHINode *Phi);
+
+  void eraseIfUnused(PHINode *Phi);
+
+  void openIf(BranchInst *Term);
+
+  void insertElse(BranchInst *Term);
+
+  void handleLoopCondition(Value *Cond);
+
+  void handleLoop(BranchInst *Term);
+
+  void closeControlFlow(BasicBlock *BB);
+
+public:
+  SIAnnotateControlFlow():
+    FunctionPass(ID) { }
+
+  virtual bool doInitialization(Module &M);
+
+  virtual bool runOnFunction(Function &F);
+
+  virtual const char *getPassName() const {
+    return "SI annotate control flow";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<DominatorTree>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+char SIAnnotateControlFlow::ID = 0;
+
+// Initialize all the types and constants used in the pass
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
+
+  LLVMContext &Context = M.getContext();
+
+  Void = Type::getVoidTy(Context);
+  Boolean = Type::getInt1Ty(Context);
+  Int64 = Type::getInt64Ty(Context);
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
+
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+  Int64Zero = ConstantInt::get(Int64, 0);
+
+  If = M.getOrInsertFunction(
+    IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
+
+  Else = M.getOrInsertFunction(
+    ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
+
+  Break = M.getOrInsertFunction(
+    BreakIntrinsic, Int64, Int64, (Type *)0);
+
+  IfBreak = M.getOrInsertFunction(
+    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
+
+  ElseBreak = M.getOrInsertFunction(
+    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
+
+  Loop = M.getOrInsertFunction(
+    LoopIntrinsic, Boolean, Int64, (Type *)0);
+
+  EndCf = M.getOrInsertFunction(
+    EndCfIntrinsic, Void, Int64, (Type *)0);
+
+  return false;
+}
+
+// Is BB the last block saved on the stack ?
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
+  return Stack.back().first == BB;
+}
+
+// Pop the last saved value from the control flow stack
+Value *SIAnnotateControlFlow::popSaved() {
+  return Stack.pop_back_val().second;
+}
+
+// Push a BB and saved value to the control flow stack
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
+  Stack.push_back(std::make_pair(BB, Saved));
+}
+
+// Can the condition represented by this PHI node treated like an "Else" block?
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
+
+  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
+  for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+    if (Phi->getIncomingBlock(i) == IDom) {
+
+      if (Phi->getIncomingValue(i) != BoolTrue)
+        return false;
+
+    } else {
+      if (Phi->getIncomingValue(i) != BoolFalse)
+        return false;
+ 
+    }
+  }
+  return true;
+}
+
+// Erase "Phi" if it is not used any more
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+  if (!Phi->hasNUsesOrMore(1))
+    Phi->eraseFromParent();
+}
+
+// Open a new "If" block
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+// Close the last "If" block and open a new "Else" block
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+// Recursively handle the condition leading to a loop
+void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
+
+  if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+
+    // Handle all non constant incoming values first
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (isa<ConstantInt>(Incoming))
+        continue;
+
+      Phi->setIncomingValue(i, BoolFalse);
+      handleLoopCondition(Incoming);
+    }
+
+    BasicBlock *Parent = Phi->getParent();
+    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
+
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (Incoming != BoolTrue)
+        continue;
+
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (From == IDom) {
+        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
+          Value *Args[] = {
+            OldEnd->getArgOperand(0),
+            PhiInserter.GetValueAtEndOfBlock(Parent)
+          };
+          Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
+          PhiInserter.AddAvailableValue(Parent, Ret);
+          continue;
+        }
+      }
+
+      TerminatorInst *Insert = From->getTerminator();
+      Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
+      Value *Ret = CallInst::Create(Break, Arg, "", Insert);
+      PhiInserter.AddAvailableValue(From, Ret);
+    }
+    eraseIfUnused(Phi);
+
+  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+    BasicBlock *Parent = Inst->getParent();
+    TerminatorInst *Insert = Parent->getTerminator();
+    Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
+    Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
+    PhiInserter.AddAvailableValue(Parent, Ret);
+
+  } else {
+    assert(0 && "Unhandled loop condition!");
+  }
+}
+
+// Handle a back edge (loop)
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+
+  BasicBlock *Target = Term->getSuccessor(1);
+  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+
+  PhiInserter.Initialize(Int64, "");
+  PhiInserter.AddAvailableValue(Target, Broken);
+
+  Value *Cond = Term->getCondition();
+  Term->setCondition(BoolTrue);
+  handleLoopCondition(Cond);
+
+  BasicBlock *BB = Term->getParent();
+  Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
+  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
+       PI != PE; ++PI) {
+
+    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+  }
+
+  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+  push(Term->getSuccessor(0), Arg);
+}
+
+// Close the last opened control flow
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+  CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
+}
+
+// Annotate the control flow with intrinsics so the backend can
+// recognize if/then/else and loops.
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
+  DT = &getAnalysis<DominatorTree>();
+
+  for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
+       E = df_end(&F.getEntryBlock()); I != E; ++I) {
+
+    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+
+    if (!Term || Term->isUnconditional()) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      continue;
+    }
+
+    if (I.nodeVisited(Term->getSuccessor(1))) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      handleLoop(Term);
+      continue;
+    }
+
+    if (isTopOfStack(*I)) {
+      PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
+      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+        insertElse(Term);
+        eraseIfUnused(Phi);
+        continue;
+      }
+      closeControlFlow(*I);
+    }
+    openIf(Term);
+  }
+
+  assert(Stack.empty());
+  return true;
+}
+
+// Create the annotation pass
+FunctionPass *llvm::createSIAnnotateControlFlowPass()
+{
+  return new SIAnnotateControlFlow();
+}
diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
deleted file mode 100644
index 0fecd7a..0000000
--- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-//===-- SIFixSGPRLiveness.cpp - SGPR liveness adjustment ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// SGPRs are not affected by control flow. This pass adjusts SGPR liveness in
-/// so that the register allocator can still correctly allocate them.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIFixSGPRLiveness : public MachineFunctionPass {
-private:
-  static char ID;
-
-  const TargetInstrInfo *TII;
-  MachineRegisterInfo *MRI;
-  MachineDominatorTree *MD;
-  MachinePostDominatorTree *MPD;
-
-  bool isSGPR(const TargetRegisterClass *RegClass) {
-    return RegClass == &AMDGPU::SReg_1RegClass ||
-           RegClass == &AMDGPU::SReg_32RegClass ||
-           RegClass == &AMDGPU::SReg_64RegClass ||
-           RegClass == &AMDGPU::SReg_128RegClass ||
-           RegClass == &AMDGPU::SReg_256RegClass;
-  }
-
-  void addKill(MachineBasicBlock::iterator I, unsigned Reg);
-  MachineBasicBlock *handleUses(unsigned VirtReg, MachineBasicBlock *Begin);
-  void handlePreds(MachineBasicBlock *Begin, MachineBasicBlock *End,
-                   unsigned VirtReg);
-
-  bool handleVirtReg(unsigned VirtReg);
-
-public:
-  SIFixSGPRLiveness(TargetMachine &tm);
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  virtual const char *getPassName() const {
-    return "SI fix SGPR liveness pass";
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-};
-
-} // end anonymous namespace
-
-char SIFixSGPRLiveness::ID = 0;
-
-SIFixSGPRLiveness::SIFixSGPRLiveness(TargetMachine &tm):
-  MachineFunctionPass(ID),
-  TII(tm.getInstrInfo()) {
-  initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
-}
-
-void SIFixSGPRLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.setPreservesCFG();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-void SIFixSGPRLiveness::addKill(MachineBasicBlock::iterator I, unsigned Reg) {
-  MachineBasicBlock *MBB = I->getParent();
-
-  BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)).addReg(Reg);
-}
-
-// Find the common post dominator of all uses
-MachineBasicBlock *SIFixSGPRLiveness::handleUses(unsigned VirtReg,
-                                                 MachineBasicBlock *Begin) {
-  MachineBasicBlock *LastUse = Begin, *End = Begin;
-  bool EndUsesReg = true;
-
-  MachineRegisterInfo::use_iterator i, e;
-  for (i = MRI->use_begin(VirtReg), e = MRI->use_end(); i != e; ++i) {
-    MachineBasicBlock *MBB = i->getParent();
-    if (LastUse == MBB)
-      continue;
-
-    LastUse = MBB;
-    MBB = MPD->findNearestCommonDominator(End, MBB);
-
-    if (MBB == LastUse)
-      EndUsesReg = true;
-    else if (MBB != End)
-      EndUsesReg = false;
-
-    End = MBB;
-  }
-
-  return EndUsesReg ? Begin : End;
-}
-
-// Handles predecessors separately, only add KILLs to dominated ones
-void SIFixSGPRLiveness::handlePreds(MachineBasicBlock *Begin,
-                                    MachineBasicBlock *End,
-                                    unsigned VirtReg) {
-  MachineBasicBlock::pred_iterator i, e;
-  for (i = End->pred_begin(), e = End->pred_end(); i != e; ++i) {
-
-    if (MD->dominates(End, *i))
-      continue; // ignore loops
-
-    if (MD->dominates(*i, Begin))
-      continue; // too far up, abort search
-
-    if (MD->dominates(Begin, *i)) {
-      // found end of livetime
-      addKill((*i)->getFirstTerminator(), VirtReg);
-      continue;
-    }
-
-    handlePreds(Begin, *i, VirtReg);
-  }
-}
-
-bool SIFixSGPRLiveness::handleVirtReg(unsigned VirtReg) {
-
-  MachineInstr *Def = MRI->getVRegDef(VirtReg);
-  if (!Def || MRI->use_empty(VirtReg))
-    return false; // No definition or not used
-
-  MachineBasicBlock *Begin = Def->getParent();
-  MachineBasicBlock *End = handleUses(VirtReg, Begin);
-  if (Begin == End)
-    return false; // Defined and only used in the same block
-
-  if (MD->dominates(Begin, End)) {
-    // Lifetime dominate the end node, just kill it here
-    addKill(End->getFirstNonPHI(), VirtReg);
-  } else {
-    // only some predecessors are dominate, handle them separately
-    handlePreds(Begin, End, VirtReg);
-  }
-  return true;
-}
-
-bool SIFixSGPRLiveness::runOnMachineFunction(MachineFunction &MF) {
-  bool Changes = false;
-
-  MRI = &MF.getRegInfo();
-  MD = &getAnalysis<MachineDominatorTree>();
-  MPD = &getAnalysis<MachinePostDominatorTree>();
-
-  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
-    unsigned VirtReg = TargetRegisterInfo::index2VirtReg(i);
-
-    const TargetRegisterClass *RegClass = MRI->getRegClass(VirtReg);
-    if (!isSGPR(RegClass))
-      continue;
-
-    Changes |= handleVirtReg(VirtReg);
-  }
-
-  return Changes;
-}
-
-FunctionPass *llvm::createSIFixSGPRLivenessPass(TargetMachine &tm) {
-  return new SIFixSGPRLiveness(tm);
-}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 292ce85..599c526 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -44,8 +44,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
 
-  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
-
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
   // We need to custom lower loads from the USER_SGPR address space, so we can
@@ -254,7 +252,7 @@ EVT SITargetLowering::getSetCCResultType(EVT VT) const {
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
@@ -298,27 +296,99 @@ SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
   return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
 }
 
-SDValue SITargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  SDValue CC = Op.getOperand(1);
-  SDValue LHS   = Op.getOperand(2);
-  SDValue RHS   = Op.getOperand(3);
-  SDValue JumpT  = Op.getOperand(4);
-  SDValue CmpValue;
-  SDValue Result;
-  CmpValue = DAG.getNode(
-      ISD::SETCC,
-      Op.getDebugLoc(),
-      MVT::i1,
-      LHS, RHS,
-      CC);
-
-  Result = DAG.getNode(
-      AMDGPUISD::BRANCH_COND,
-      CmpValue.getDebugLoc(),
-      MVT::Other, Chain,
-      JumpT, CmpValue);
-  return Result;
+// Helper function for LowerBRCOND
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+
+  SDNode *Parent = Value.getNode();
+  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
+       I != E; ++I) {
+
+    if (I.getUse().get() != Value)
+      continue;
+
+    if (I->getOpcode() == Opcode)
+      return *I;
+  }
+  return 0;
+}
+
+// This transforms the control flow intrinsics to get the branch destination as
+// last parameter, also switches branch target with BR if the need arise
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  DebugLoc DL = BRCOND.getDebugLoc();
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = 0;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition everything is fine
+    SDNode *SetCC = Intr;
+    assert(SetCC->getConstantOperandVal(1) == 1);
+
+    CondCodeSDNode *CC = cast<CondCodeSDNode>(SetCC->getOperand(2).getNode());
+    assert(CC->get() == ISD::SETNE);
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition
+    BR = findUser(BRCOND, ISD::BR);
+    Target = BR->getOperand(1);
+  }
+
+  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+
+  // Build the result and
+  SmallVector<EVT, 4> Res;
+  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
+    Res.push_back(Intr->getValueType(i));
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(BRCOND.getOperand(0));
+  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
+    Ops.push_back(Intr->getOperand(i));
+  Ops.push_back(Target);
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
+  }
+
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    Chain = DAG.getCopyToReg(
+      Chain, DL,
+      CopyToReg->getOperand(1),
+      SDValue(Result, i - 1),
+      SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+    SDValue(Intr, Intr->getNumValues() - 1),
+    Intr->getOperand(0));
+
+  return Chain;
 }
 
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 27c2a1c..c088112 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -43,9 +43,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
   SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG,
                                            unsigned VCCNode) const;
-  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
 public:
   SITargetLowering(TargetMachine &tm);
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 42fa8e6..51cd5b4 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -696,7 +696,7 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
 let isBranch = 1 in {
 def S_BRANCH : SOPP <
   0x00000002, (ins brtarget:$target), "S_BRANCH",
-  []
+  [(br bb:$target)]
 >;
 
 let DisableEncoding = "$scc" in {
@@ -1095,26 +1095,70 @@ def SI_WQM : InstSI <
 
 } // end usesCustomInserter 
 
-// SI Psuedo branch instructions.  These are used by the CFG structurizer pass
+// SI Psuedo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
-let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0,
-                                                 hasSideEffects = 0 in {
-def SI_IF_NZ : InstSI <
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
+    Uses = [EXEC], Defs = [EXEC] in {
+
+let isBranch = 1, isTerminator = 1 in {
+
+def SI_IF : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_1:$vcc, brtarget:$target),
+  "SI_IF",
+  [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))]
+>;
+
+def SI_ELSE : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src, brtarget:$target),
+  "SI_ELSE",
+  [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
+
+  let Constraints = "$src = $dst";
+}
+
+def SI_LOOP : InstSI <
   (outs),
-  (ins brtarget:$target, SReg_1:$vcc),
-  "SI_BRANCH_NZ",
-  [(IL_brcond bb:$target, SReg_1:$vcc)]
+  (ins SReg_64:$saved, brtarget:$target),
+  "SI_LOOP",
+  [(int_SI_loop SReg_64:$saved, bb:$target)]
 >;
 
-def SI_IF_Z : InstSI <
+} // end isBranch = 1, isTerminator = 1
+
+def SI_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src),
+  "SI_ELSE",
+  [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
+>;
+
+def SI_IF_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_1:$vcc, SReg_64:$src),
+  "SI_IF_BREAK",
+  [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))]
+>;
+
+def SI_ELSE_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src0, SReg_64:$src1),
+  "SI_ELSE_BREAK",
+  [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
+>;
+
+def SI_END_CF : InstSI <
   (outs),
-  (ins brtarget:$target, SReg_1:$vcc),
-  "SI_BRANCH_Z",
-  []
+  (ins SReg_64:$saved),
+  "SI_END_CF",
+  [(int_SI_end_cf SReg_64:$saved)]
 >;
-} // end isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0,
-  //     hasSideEffects = 0
+
+} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
+  // Uses = [EXEC], Defs = [EXEC]
+
 } // end IsCodeGenOnly, isPseudo
 
 /* int_SI_vs_load_input */
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 1008fc4..c322fef 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -39,4 +39,14 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
   def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
 }
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 277b647..594e6f7 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF)
-/// to predicated instructions.
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
 ///
-/// All control flow (except loops) is handled using predicated instructions and
+/// All control flow is handled using predicated instructions and
 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
 /// by writting to the 64-bit EXEC register (each bit corresponds to a
@@ -22,17 +22,17 @@
 ///
 /// For example:
 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
-/// SI_IF_NZ %VCC
+/// %SGPR0 = SI_IF %VCC
 ///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
-/// ELSE
+/// %SGPR0 = SI_ELSE %SGPR0
 ///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
-/// ENDIF
+/// SI_END_CF %SGPR0
 ///
 /// becomes:
 ///
 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
-/// S_CBRANCH_EXECZ label0            // This instruction is an
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
 ///                                   // optimization which allows us to
 ///                                   // branch if all the bits of
 ///                                   // EXEC are zero.
@@ -45,7 +45,7 @@
 ///                                    // instruction again.
 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
 /// label1:
-/// %EXEC = S_OR_B64 %EXEC, %SGPR2     // Re-enable saved exec mask bits
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -65,11 +65,14 @@ class SILowerControlFlowPass : public MachineFunctionPass {
 private:
   static char ID;
   const TargetInstrInfo *TII;
-  std::vector<unsigned> PredicateStack;
-  std::vector<unsigned> UnusedRegisters;
 
-  unsigned allocReg();
-  void freeReg(unsigned Reg);
+  void If(MachineInstr &MI);
+  void Else(MachineInstr &MI);
+  void Break(MachineInstr &MI);
+  void IfBreak(MachineInstr &MI);
+  void ElseBreak(MachineInstr &MI);
+  void Loop(MachineInstr &MI);
+  void EndCf(MachineInstr &MI);
 
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
@@ -91,101 +94,205 @@ FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
   return new SILowerControlFlowPass(tm);
 }
 
+void SILowerControlFlowPass::If(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
+          .addReg(Vcc);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  //BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECZ))
+  //        .addOperand(MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Else(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
+          .addReg(Src); // Saved EXEC
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Dst);
+
+  //BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECZ))
+  //        .addOperand(MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Break(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Vcc)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Saved = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Saved)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Src = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addOperand(MI.getOperand(1))
+          .addReg(AMDGPU::EXEC);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  MI.eraseFromParent();
+}
+
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
-  // Find all the unused registers that can be used for the predicate stack.
-  for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
-                                     S = AMDGPU::SReg_64RegClass.end();
-                                     I != S; ++I) {
-    unsigned Reg = *I;
-    if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
-      UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
-    }
-  }
+  bool HaveCf = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
 
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock &MBB = *BI;
     for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-                               I != MBB.end(); I = Next) {
+         I != MBB.end(); I = Next) {
+
       Next = llvm::next(I);
       MachineInstr &MI = *I;
-      unsigned Reg;
       switch (MI.getOpcode()) {
         default: break;
-        case AMDGPU::SI_IF_NZ:
-          Reg = allocReg();
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-                  Reg)
-                  .addOperand(MI.getOperand(0)); // VCC
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
-                  Reg)
-                  .addReg(Reg)
-                  .addReg(AMDGPU::EXEC);
-          MI.eraseFromParent();
-          PredicateStack.push_back(Reg);
+        case AMDGPU::SI_IF:
+          If(MI);
+          break;
+
+        case AMDGPU::SI_ELSE:
+          Else(MI);
           break;
 
-        case AMDGPU::ELSE:
-          Reg = PredicateStack.back();
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
-                  Reg)
-                  .addReg(Reg);
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
-                  AMDGPU::EXEC)
-                  .addReg(Reg)
-                  .addReg(AMDGPU::EXEC);
-          MI.eraseFromParent();
+        case AMDGPU::SI_BREAK:
+          Break(MI);
           break;
 
-        case AMDGPU::ENDIF:
-          Reg = PredicateStack.back();
-          PredicateStack.pop_back();
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
-                  AMDGPU::EXEC)
-                  .addReg(AMDGPU::EXEC)
-                  .addReg(Reg);
-          freeReg(Reg);
-
-          if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
-              PredicateStack.empty()) {
-            // If the exec mask is non-zero, skip the next two instructions
-            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-                    .addImm(3)
-                    .addReg(AMDGPU::EXEC);
-
-            // Exec mask is zero: Export to NULL target...
-            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
-                    .addImm(0)
-                    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-                    .addImm(0)
-                    .addImm(1)
-                    .addImm(1)
-                    .addReg(AMDGPU::SREG_LIT_0)
-                    .addReg(AMDGPU::SREG_LIT_0)
-                    .addReg(AMDGPU::SREG_LIT_0)
-                    .addReg(AMDGPU::SREG_LIT_0);
-
-            // ... and terminate wavefront
-            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
-          }
-          MI.eraseFromParent();
+        case AMDGPU::SI_IF_BREAK:
+          IfBreak(MI);
+          break;
+
+        case AMDGPU::SI_ELSE_BREAK:
+          ElseBreak(MI);
+          break;
+
+        case AMDGPU::SI_LOOP:
+          Loop(MI);
+          break;
+
+        case AMDGPU::SI_END_CF:
+          HaveCf = true;
+          EndCf(MI);
           break;
       }
     }
   }
-  return true;
-}
 
-unsigned SILowerControlFlowPass::allocReg() {
+  // TODO: What is this good for?
+  unsigned ShaderType = MF.getInfo<SIMachineFunctionInfo>()->ShaderType;
+  if (HaveCf && ShaderType == ShaderType::PIXEL) {
+    for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+         BI != BE; ++BI) {
 
-  assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
-  unsigned Reg = UnusedRegisters.back();
-  UnusedRegisters.pop_back();
-  return Reg;
-}
+      MachineBasicBlock &MBB = *BI;
+      if (MBB.succ_begin() == MBB.succ_end()) {
 
-void SILowerControlFlowPass::freeReg(unsigned Reg) {
+        MachineInstr &MI = *MBB.getFirstNonPHI();
+        DebugLoc DL = MI.getDebugLoc();
 
-  UnusedRegisters.push_back(Reg);
+        // If the exec mask is non-zero, skip the next two instructions
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+               .addImm(3)
+               .addReg(AMDGPU::EXEC);
+
+        // Exec mask is zero: Export to NULL target...
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::EXP))
+                .addImm(0)
+                .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+                .addImm(0)
+                .addImm(1)
+                .addImm(1)
+                .addReg(AMDGPU::SREG_LIT_0)
+                .addReg(AMDGPU::SREG_LIT_0)
+                .addReg(AMDGPU::SREG_LIT_0)
+                .addReg(AMDGPU::SREG_LIT_0);
+
+        // ... and terminate wavefront
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ENDPGM));
+      }
+    }
+  }
+
+  return true;
 }
-- 
1.7.9.5