[Mesa-dev] [PATCH 2/2] radeon/llvm: add store.vertex.{position, param} intrinsics

Sun Dec 16 20:09:43 PST 2012

I think it would be more efficient and correct to set EOP and change
EXPORT to EXPORT_DONE in a separate pass that handles all exports at
once, instead of iterating through the instructions for every export.
Later this will probably be done by the instruction scheduler (once we
have one). Also, as we discussed on #radeon, I think we shouldn't make
any assumptions about the placement of EXPORTs in the lowering pass: in
theory, other EXPORTs can be in other basic blocks (again, the
instruction scheduler will decide where to place them later). IMO for
now we can simply add a pass (or add the code to some existing pass)
that collects all exports, moves them right before the RETURN (until we
have a proper scheduler), and sets EXPORT_DONE and EOP where needed.

On Sun, 2012-12-16 at 21:01 +0100, Vincent Lejeune wrote:
> ---
>  lib/Target/AMDGPU/R600ISelLowering.cpp        | 51 +++++++++++++++++++++++----
>  lib/Target/AMDGPU/R600Instructions.td         |  5 +++
>  lib/Target/AMDGPU/R600Intrinsics.td           |  4 +++
>  lib/Target/AMDGPU/R600MachineFunctionInfo.cpp |  1 +
>  lib/Target/AMDGPU/R600MachineFunctionInfo.h   |  1 +
>  5 files changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
> index 6c594cc..5b9545e 100644
> --- a/lib/Target/AMDGPU/R600ISelLowering.cpp
> +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
> @@ -350,8 +350,27 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
>    }
>    case AMDGPU::EG_ExportSwz:
>    case AMDGPU::R600_ExportSwz: {
> +    // Instruction is left unmodified if it's not the last one of its type
> +    bool isLastInstructionOfItsType;
> +    {
> +      isLastInstructionOfItsType = true;
> +      unsigned InstExportType = MI->getOperand(1).getImm();
> +      for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
> +          EndBlock = BB->end(); NextExportInst != EndBlock;
> +          NextExportInst = llvm::next(NextExportInst)) {
> +        if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
> +            NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
> +          unsigned CurrentInstExportType = NextExportInst->getOperand(1)
> +              .getImm();
> +          if (CurrentInstExportType == InstExportType) {
> +            isLastInstructionOfItsType = false;
> +            break;
> +          }
> +        }
> +      }
> +    }
>      bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
> -    if (!EOP)
> +    if (!EOP && !isLastInstructionOfItsType)
>        return BB;
>      unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
>      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
> @@ -363,7 +382,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
>              .addOperand(MI->getOperand(5))
>              .addOperand(MI->getOperand(6))
>              .addImm(CfInst)
> -            .addImm(1);
> +            .addImm(EOP);
>      break;
>    }
>    }
> @@ -382,7 +401,7 @@ using namespace llvm::AMDGPUIntrinsic;
>  static SDValue
>  InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
>      unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
> -    SDValue Scalar, SDValue Chain) {
> +    unsigned ArrayBaseOffset, SDValue Scalar, SDValue Chain) {
>    if (!ExportMap[Slot]) {
>      SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
>        DL, MVT::v4f32,
> @@ -420,7 +439,7 @@ InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
>    const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
>        DAG.getConstant(Inst, MVT::i32),
>        DAG.getConstant(Type, MVT::i32),
> -      DAG.getConstant(Slot, MVT::i32),
> +      DAG.getConstant(Slot + ArrayBaseOffset, MVT::i32),
>        DAG.getConstant(Mask, MVT::i32)};
>  
>    DAG.UpdateNodeOperands(ExportInstruction,
> @@ -464,7 +483,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
>  
>        SDNode **OutputsMap = MFI->Outputs;
>        return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
> -          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
> +          RegIndex / 4, RegIndex % 4, 0, 0, 0, Op.getOperand(2),
>            Chain);
>  
>      }
> @@ -498,7 +517,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
>        }
>  
>        return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
> -          RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2),
> +          RegIndex / 4, RegIndex % 4, Inst, 0, 0, Op.getOperand(2),
> +          Chain);
> +    }
> +    case AMDGPUIntrinsic::R600_store_vertex_position: {
> +      MachineFunction &MF = DAG.getMachineFunction();
> +      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
> +      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
> +
> +      SDNode **OutputsMap = MFI->Outputs;
> +      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
> +          RegIndex / 4, RegIndex % 4, 0, 1, 60, Op.getOperand(2),
> +          Chain);
> +    }
> +    case AMDGPUIntrinsic::R600_store_vertex_param: {
> +      MachineFunction &MF = DAG.getMachineFunction();
> +      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
> +      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
> +
> +      SDNode **OutputsMap = MFI->VertexParamOutputs;
> +      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
> +          RegIndex / 4, RegIndex % 4, 0, 2, 0, Op.getOperand(2),
>            Chain);
>      }
>      // default for switch(IntrinsicID)
> diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
> index c3ffe97..3834df5 100644
> --- a/lib/Target/AMDGPU/R600Instructions.td
> +++ b/lib/Target/AMDGPU/R600Instructions.td
> @@ -545,6 +545,11 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
>          (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
>          0, 1, 2, 3, cf_inst, 0)
>    >;
> +  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1),
> +    (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
> +        (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
> +        0, 1, 2, 3, cf_inst, 0)
> +  >;
>  }
>  
>  multiclass SteamOutputExportPattern<Instruction ExportInst,
> diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
> index 0186f9d..069fa07 100644
> --- a/lib/Target/AMDGPU/R600Intrinsics.td
> +++ b/lib/Target/AMDGPU/R600Intrinsics.td
> @@ -21,6 +21,10 @@ let TargetPrefix = "R600", isTarget = 1 in {
>      Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
>    def int_R600_clipvertex :
>      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
> +  def int_R600_store_vertex_position :
> +    Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
> +  def int_R600_store_vertex_param :
> +    Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
>    def int_R600_store_stream_output :
>      Intrinsic<[], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], []>;
>    def int_R600_store_pixel_color :
> diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
> index 93b4608..ecc98ec 100644
> --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
> +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
> @@ -19,6 +19,7 @@ R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
>      HasPerspectiveInterpolation(false) {
>      memset(Outputs, 0, sizeof(Outputs));
>      memset(StreamOutputs, 0, sizeof(StreamOutputs));
> +    memset(VertexParamOutputs, 0, sizeof(StreamOutputs));
>    }
>  
>  unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const {
> diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
> index 6cc875f..f7cede3 100644
> --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h
> +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
> @@ -26,6 +26,7 @@ public:
>    R600MachineFunctionInfo(const MachineFunction &MF);
>    std::vector<unsigned> ReservedRegs;
>    SDNode *Outputs[16];
> +  SDNode *VertexParamOutputs[16];
>    SDNode *StreamOutputs[64][4];
>    BitVector IndirectChannels;
>    bool HasLinearInterpolation;