[Mesa-dev] [PATCH] R600: Factorize code handling Const Read Port limitation

Wed Mar 13 14:59:07 PDT 2013

I fixed the coding style issue.
The <iostream> include was a debug leftover line, it shouldn't be there.

----- Mail original -----
> De : Tom Stellard <tom at stellard.net>
> À : Vincent Lejeune <vljn at ovi.com>
> Cc : llvm-commits at cs.uiuc.edu; mesa-dev at lists.freedesktop.org
> Envoyé le : Mercredi 13 mars 2013 21h49
> Objet : Re: [PATCH] R600: Factorize code handling Const Read Port limitation
> 
> On Wed, Mar 13, 2013 at 09:12:41PM +0100, Vincent Lejeune wrote:
>>  ---
>>   lib/Target/R600/AMDILISelDAGToDAG.cpp    | 34 ++++++++++----
>>   lib/Target/R600/R600InstrInfo.cpp        | 54 ++++++++++++++++++++++
>>   lib/Target/R600/R600InstrInfo.h          |  3 ++
>>   lib/Target/R600/R600MachineScheduler.cpp | 77 
> ++++----------------------------
>>   lib/Target/R600/R600MachineScheduler.h   |  3 +-
>>   test/CodeGen/R600/kcache-fold-2.ll       | 52 +++++++++++++++++++++
>>   6 files changed, 144 insertions(+), 79 deletions(-)
>>   create mode 100644 test/CodeGen/R600/kcache-fold-2.ll
>> 
>>  diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp 
> b/lib/Target/R600/AMDILISelDAGToDAG.cpp
>>  index 0c7880d..05a1ea7 100644
>>  --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
>>  +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
>>  @@ -336,6 +336,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
>>     return Result;
>>   }
>>   
>>  +
> 
> Whitespace
>>   bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
>>       const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
>>     int OperandIdx[] = {
>>  @@ -365,17 +366,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned 
> Opcode,
>>       SDValue Operand = Ops[OperandIdx[i] - 1];
>>       switch (Operand.getOpcode()) {
>>       case AMDGPUISD::CONST_ADDRESS: {
>>  -      if (i == 2)
>>  -        break;
>>         SDValue CstOffset;
>>  -      if (!Operand.getValueType().isVector() &&
>>  -          SelectGlobalValueConstantOffset(Operand.getOperand(0), 
> CstOffset)) {
>>  -        Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, 
> MVT::f32);
>>  -        Ops[SelIdx[i] - 1] = CstOffset;
>>  -        return true;
>>  +      if (Operand.getValueType().isVector() ||
>>  +          !SelectGlobalValueConstantOffset(Operand.getOperand(0), 
> CstOffset))
>>  +        break;
>>  +
>>  +      // Gather others constants values
>>  +      std::vector<unsigned> Consts;
>>  +      for (unsigned j = 0; j < 3; j++) {
>>  +        int SrcIdx = OperandIdx[j];
>>  +        if (SrcIdx < 0)
>>  +          break;
>>  +        if (RegisterSDNode *Reg = 
> dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
>>  +          if (Reg->getReg() == AMDGPU::ALU_CONST) {
>>  +            ConstantSDNode *Cst = 
> dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
>>  +            Consts.push_back(Cst->getZExtValue());
>>  +          }
>>  +        }
>>         }
>>  +
>>  +      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
>>  +      Consts.push_back(Cst->getZExtValue());
>>  +      if (!TII->fitsConstReadLimitations(Consts))
>>  +        break;
>>  +
>>  +      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, 
> MVT::f32);
>>  +      Ops[SelIdx[i] - 1] = CstOffset;
>>  +      return true;
>>         }
>>  -      break;
>>       case ISD::FNEG:
>>         if (NegIdx[i] < 0)
>>           break;
>>  diff --git a/lib/Target/R600/R600InstrInfo.cpp 
> b/lib/Target/R600/R600InstrInfo.cpp
>>  index be3318a..0865098 100644
>>  --- a/lib/Target/R600/R600InstrInfo.cpp
>>  +++ b/lib/Target/R600/R600InstrInfo.cpp
>>  @@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const 
> {
>>             (TargetFlags & R600_InstFlag::OP3));
>>   }
>>   
>>  +bool
>>  +R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> 
> &Consts)
>>  +    const {
>>  +  assert (Consts.size() <= 12 && "Too many operands in 
> instructions group");
>>  +  unsigned Pair1 = 0, Pair2 = 0;
>>  +  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
>>  +    unsigned ReadConstHalf = Consts[i] & 2;
>>  +    unsigned ReadConstIndex = Consts[i] & (~3);
>>  +    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
>>  +    if (!Pair1) {
>>  +      Pair1 = ReadHalfConst;
>>  +      continue;
>>  +    }
>>  +    if (Pair1 == ReadHalfConst)
>>  +      continue;
>>  +    if (!Pair2) {
>>  +      Pair2 = ReadHalfConst;
>>  +      continue;
>>  +    }
>>  +    if (Pair2 != ReadHalfConst)
>>  +      return false;
>>  +  }
>>  +  return true;
>>  +}
>>  +
>>  +bool
>>  +R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) 
> const {
>>  +  std::vector<unsigned> Consts;
>>  +  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
>>  +    const MachineInstr *MI = MIs[i];
>>  +
>>  +    const R600Operands::Ops OpTable[3][2] = {
>>  +      {R600Operands::SRC0, R600Operands::SRC0_SEL},
>>  +      {R600Operands::SRC1, R600Operands::SRC1_SEL},
>>  +      {R600Operands::SRC2, R600Operands::SRC2_SEL},
>>  +    };
>>  +
>>  +    if (!isALUInstr(MI->getOpcode()))
>>  +      continue;
>>  +
>>  +    for (unsigned j = 0; j < 3; j++) {
>>  +      int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
>>  +      if (SrcIdx < 0)
>>  +        break;
>>  +      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
>>  +        unsigned Const = MI->getOperand(
>>  +            getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
>>  +        Consts.push_back(Const);
>>  +      }
>>  +    }
>>  +  }
>>  +  return fitsConstReadLimitations(Consts);
>>  +}
>>  +
>>   DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const 
> TargetMachine *TM,
>>       const ScheduleDAG *DAG) const {
>>     const InstrItineraryData *II = TM->getInstrItineraryData();
>>  diff --git a/lib/Target/R600/R600InstrInfo.h 
> b/lib/Target/R600/R600InstrInfo.h
>>  index efe721c..bf9569e 100644
>>  --- a/lib/Target/R600/R600InstrInfo.h
>>  +++ b/lib/Target/R600/R600InstrInfo.h
>>  @@ -53,6 +53,9 @@ namespace llvm {
>>     /// \returns true if this \p Opcode represents an ALU 
> instruction.
>>     bool isALUInstr(unsigned Opcode) const;
>>   
>>  +  bool fitsConstReadLimitations(const std::vector<unsigned>&) 
> const;
>>  +  bool canBundle(const std::vector<MachineInstr *> &) const;
>>  +
>>     /// \breif Vector instructions are instructions that must fill all
>>     /// instruction slots within an instruction group.
>>     bool isVector(const MachineInstr &MI) const;
>>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp 
> b/lib/Target/R600/R600MachineScheduler.cpp
>>  index ee8d672..b4d9779 100644
>>  --- a/lib/Target/R600/R600MachineScheduler.cpp
>>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
>>  @@ -24,7 +24,7 @@
>>   #include "llvm/PassManager.h"
>>   #include "llvm/Support/raw_ostream.h"
>>   #include <set>
>>  -
>>  +#include <iostream>
> 
> You should double check the LLVM coding style guide, but I think system
> headers need to be sorted.
> 
>>   using namespace llvm;
>>   
>>   static
>>  @@ -222,7 +222,6 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) 
> {
>>     CurInstKind = IDOther;
>>     CurEmitted = 0;
>>     OccupedSlotsMask = 15;
>>  -  memset(InstructionsGroupCandidate, 0, 
> sizeof(InstructionsGroupCandidate));
>>     InstKindLimit[IDAlu] = 120; // 120 minus 8 for security
>>     InstKindLimit[IDOther] = 32;
>>   
>>  @@ -503,79 +502,19 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
>>     }
>>   }
>>   
>>  -class ConstPairs {
>>  -private:
>>  -  unsigned XYPair;
>>  -  unsigned ZWPair;
>>  -public:
>>  -  ConstPairs(unsigned ReadConst[3]) : XYPair(0), ZWPair(0) {
>>  -    for (unsigned i = 0; i < 3; i++) {
>>  -      unsigned ReadConstChan = ReadConst[i] & 3;
>>  -      unsigned ReadConstIndex = ReadConst[i] & (~3);
>>  -      if (ReadConstChan < 2) {
>>  -        if (!XYPair) {
>>  -          XYPair = ReadConstIndex;
>>  -        }
>>  -      } else {
>>  -        if (!ZWPair) {
>>  -          ZWPair = ReadConstIndex;
>>  -        }
>>  -      }
>>  -    }
>>  -  }
>>  -
>>  -  bool isCompatibleWith(const ConstPairs& CP) const {
>>  -    return (!XYPair || !CP.XYPair || CP.XYPair == XYPair) &&
>>  -        (!ZWPair || !CP.ZWPair || CP.ZWPair == ZWPair);
>>  -  }
>>  -};
>>  -
>>  -static
>>  -const ConstPairs getPairs(const R600InstrInfo *TII, const 
> MachineInstr& MI) {
>>  -  unsigned ReadConsts[3] = {0, 0, 0};
>>  -  R600Operands::Ops OpTable[3][2] = {
>>  -    {R600Operands::SRC0, R600Operands::SRC0_SEL},
>>  -    {R600Operands::SRC1, R600Operands::SRC1_SEL},
>>  -    {R600Operands::SRC2, R600Operands::SRC2_SEL},
>>  -  };
>>  -
>>  -  if (!TII->isALUInstr(MI.getOpcode()))
>>  -    return ConstPairs(ReadConsts);
>>  -
>>  -  for (unsigned i = 0; i < 3; i++) {
>>  -    int SrcIdx = TII->getOperandIdx(MI.getOpcode(), OpTable[i][0]);
>>  -    if (SrcIdx < 0)
>>  -      break;
>>  -    if (MI.getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST)
>>  -      ReadConsts[i] =MI.getOperand(
>>  -          TII->getOperandIdx(MI.getOpcode(), OpTable[i][1])).getImm();
>>  -  }
>>  -  return ConstPairs(ReadConsts);
>>  -}
>>  -
>>  -bool
>>  -R600SchedStrategy::isBundleable(const MachineInstr& MI) {
>>  -  const ConstPairs &MIPair = getPairs(TII, MI);
>>  -  for (unsigned i = 0; i < 4; i++) {
>>  -    if (!InstructionsGroupCandidate[i])
>>  -      continue;
>>  -    const ConstPairs &IGPair = getPairs(TII,
>>  -        *InstructionsGroupCandidate[i]->getInstr());
>>  -    if (!IGPair.isCompatibleWith(MIPair))
>>  -      return false;
>>  -  }
>>  -  return true;
>>  -}
>>  -
>>   SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
>>     if (Q.empty())
>>       return NULL;
>>     for (std::vector<SUnit *>::iterator It = Q.begin(), E = Q.end();
>>         It != E; ++It) {
>>       SUnit *SU = *It;
>>  -    if (isBundleable(*SU->getInstr())) {
>>  +    InstructionsGroupCandidate.push_back(SU->getInstr());
>>  +    if (TII->canBundle(InstructionsGroupCandidate)) {
>>  +      InstructionsGroupCandidate.pop_back();
>>         Q.erase(It);
>>         return SU;
>>  +    } else {
>>  +      InstructionsGroupCandidate.pop_back();
>>       }
>>     }
>>     return NULL;
>>  @@ -596,7 +535,7 @@ void R600SchedStrategy::PrepareNextSlot() {
>>     DEBUG(dbgs() << "New Slot\n");
>>     assert (OccupedSlotsMask && "Slot wasn't filled");
>>     OccupedSlotsMask = 0;
>>  -  memset(InstructionsGroupCandidate, 0, 
> sizeof(InstructionsGroupCandidate));
>>  +  InstructionsGroupCandidate.clear();
>>     LoadAlu();
>>   }
>>   
>>  @@ -666,7 +605,7 @@ SUnit* R600SchedStrategy::pickAlu() {
>>           SUnit *SU = AttemptFillSlot(Chan);
>>           if (SU) {
>>             OccupedSlotsMask |= (1 << Chan);
>>  -          InstructionsGroupCandidate[Chan] = SU;
>>  +          InstructionsGroupCandidate.push_back(SU->getInstr());
>>             return SU;
>>           }
>>         }
>>  diff --git a/lib/Target/R600/R600MachineScheduler.h 
> b/lib/Target/R600/R600MachineScheduler.h
>>  index 4b1f6c3..47b1d56 100644
>>  --- a/lib/Target/R600/R600MachineScheduler.h
>>  +++ b/lib/Target/R600/R600MachineScheduler.h
>>  @@ -100,7 +100,7 @@ public:
>>     virtual void releaseBottomNode(SUnit *SU);
>>   
>>   private:
>>  -  SUnit *InstructionsGroupCandidate[4];
>>  +  std::vector<MachineInstr *> InstructionsGroupCandidate;
>>   
>>     int getInstKind(SUnit *SU);
>>     bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) 
> const;
>>  @@ -114,7 +114,6 @@ private:
>>     void AssignSlot(MachineInstr *MI, unsigned Slot);
>>     SUnit* pickAlu();
>>     SUnit* pickOther(int QID);
>>  -  bool isBundleable(const MachineInstr& MI);
>>     void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
>>   };
>>   
>>  diff --git a/test/CodeGen/R600/kcache-fold-2.ll 
> b/test/CodeGen/R600/kcache-fold-2.ll
>>  new file mode 100644
>>  index 0000000..77d0052
>>  --- /dev/null
>>  +++ b/test/CodeGen/R600/kcache-fold-2.ll
> 
> For performance reasons (I think), it's prefered to stick similar tests
> into the same file, so this test should be moved to kcache-fold.ll.
> You will also need to add a CHECK: line for each function name in the
> new file.  Take a look at some of the other tests to see how this is done.
> 
> -Tom
> 
>>  @@ -0,0 +1,52 @@
>>  +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>>  +
>>  +; CHECK-NOT: MOV
>>  +
>>  +define void @main() {
>>  +main_body:
>>  +  %0 = load <4 x float> addrspace(8)* null
>>  +  %1 = extractelement <4 x float> %0, i32 0
>>  +  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x 
> float>] addrspace(8)* null, i64 0, i32 1)
>>  +  %3 = extractelement <4 x float> %2, i32 0
>>  +  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x 
> float>] addrspace(8)* null, i64 0, i32 1)
>>  +  %5 = extractelement <4 x float> %4, i32 1
>>  +  %6 = fcmp ult float %1, 0.000000e+00
>>  +  %7 = select i1 %6, float %3, float %5
>>  +  %8 = load <4 x float> addrspace(8)* null
>>  +  %9 = extractelement <4 x float> %8, i32 1
>>  +  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 2)
>>  +  %11 = extractelement <4 x float> %10, i32 0
>>  +  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 2)
>>  +  %13 = extractelement <4 x float> %12, i32 1
>>  +  %14 = fcmp ult float %9, 0.000000e+00
>>  +  %15 = select i1 %14, float %11, float %13
>>  +  %16 = load <4 x float> addrspace(8)* null
>>  +  %17 = extractelement <4 x float> %16, i32 2
>>  +  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 1)
>>  +  %19 = extractelement <4 x float> %18, i32 3
>>  +  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 1)
>>  +  %21 = extractelement <4 x float> %20, i32 2
>>  +  %22 = fcmp ult float %17, 0.000000e+00
>>  +  %23 = select i1 %22, float %19, float %21
>>  +  %24 = load <4 x float> addrspace(8)* null
>>  +  %25 = extractelement <4 x float> %24, i32 3
>>  +  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 2)
>>  +  %27 = extractelement <4 x float> %26, i32 3
>>  +  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 
> x float>] addrspace(8)* null, i64 0, i32 2)
>>  +  %29 = extractelement <4 x float> %28, i32 2
>>  +  %30 = fcmp ult float %25, 0.000000e+00
>>  +  %31 = select i1 %30, float %27, float %29
>>  +  %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 
> 1.000000e+00)
>>  +  %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 
> 1.000000e+00)
>>  +  %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 
> 1.000000e+00)
>>  +  %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 
> 1.000000e+00)
>>  +  %36 = insertelement <4 x float> undef, float %32, i32 0
>>  +  %37 = insertelement <4 x float> %36, float %33, i32 1
>>  +  %38 = insertelement <4 x float> %37, float %34, i32 2
>>  +  %39 = insertelement <4 x float> %38, float %35, i32 3
>>  +  call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
>>  +  ret void
>>  +}
>>  +
>>  +declare float @llvm.AMDIL.clamp.(float, float, float) readnone
>>  +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
>>  -- 
>>  1.8.1.4
>> 
>>  _______________________________________________
>>  llvm-commits mailing list
>>  llvm-commits at cs.uiuc.edu
>>  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-Factorize-code-handling-Const-Read-Port-limitat.patch
Type: application/octet-stream
Size: 12979 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130313/e0596999/attachment-0001.obj>