[Mesa-dev] [PATCH] radeon/llvm: support for f32, v2f32, v3f32 store/load

Tom Stellard tom at stellard.net
Mon Nov 12 13:03:49 PST 2012


On Sun, Nov 11, 2012 at 10:22:41PM +0100, Vincent Lejeune wrote:
> ---
>  lib/Target/AMDGPU/AMDILDevice.cpp      |  4 +-
>  lib/Target/AMDGPU/R600ISelLowering.cpp | 69 ++++++++++++++++++++++++----------
>  lib/Target/AMDGPU/R600Instructions.td  |  4 +-
>  3 files changed, 54 insertions(+), 23 deletions(-)
> 
> diff --git a/lib/Target/AMDGPU/AMDILDevice.cpp b/lib/Target/AMDGPU/AMDILDevice.cpp
> index 3955828..b440aa6 100644
> --- a/lib/Target/AMDGPU/AMDILDevice.cpp
> +++ b/lib/Target/AMDGPU/AMDILDevice.cpp
> @@ -129,8 +129,8 @@ std::string
>  AMDGPUDevice::getDataLayout() const
>  {
>      return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
> -      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
> -      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
> +      "-i32:128:128-i64:128:128-f32:128:128-f64:128:128-f80:128:128"
> +      "-v16:16:16-v24:32:32-v32:128:128-v48:128:128-v64:128:128"
>        "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
>        "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
>        "-n8:16:32:64");

Changing the DataLayout is a little tricky, because it also means we
have to update clang to keep it in sync.  I also need to test this to
make sure it doesn't break any of the compute shaders.

Is changing the DataLayout required for this patch, or does it just
simplify the lowering?


> diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
> index 712dd3f..e6418b2 100644
> --- a/lib/Target/AMDGPU/R600ISelLowering.cpp
> +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
> @@ -94,15 +94,22 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
>    setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
>    setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
>    // Legalize loads and stores to the private address space.
> +  setOperationAction(ISD::LOAD, MVT::f32, Custom);
>    setOperationAction(ISD::LOAD, MVT::i32, Custom);
> +  setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
> +  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
>    setOperationAction(ISD::LOAD, MVT::v4f32, Custom);
>    setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
>    setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
>    setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
>    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
>    setOperationAction(ISD::STORE, MVT::i8, Custom);
> +  setOperationAction(ISD::STORE, MVT::f32, Custom);
>    setOperationAction(ISD::STORE, MVT::i32, Custom);
> +  setOperationAction(ISD::STORE, MVT::v2f32, Custom);
> +  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
>    setOperationAction(ISD::STORE, MVT::v4f32, Custom);
> +  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
>  
>    setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
>  
> @@ -522,6 +529,17 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
>    switch (N->getOpcode()) {
>    default: return;
>    case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));

Someone else mentioned this, but I think you need a break here.

> +  case ISD::LOAD: {
> +    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
> +    Results.push_back(SDValue(Node, 0));
> +    Results.push_back(SDValue(Node, 1));
> +    return;
> +  }
> +  case ISD::STORE:
> +    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
> +    Results.push_back(SDValue(Node, 0));
> +    Results.push_back(SDValue(Node, 1));
> +    return;
>    }
>  }
>  
> @@ -818,26 +836,32 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
>      return SDValue();
>    }
>  
> +  // LLVM generates byte-addresing pointers, but we need to convert this to a
> +  // register index.  Each register holds 16 bytes (4 x 32), so in order to
> +  // get the register index, we need to divide the pointer by 16.
> +  Ptr = DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
> +                    DAG.getConstant(4, MVT::i32));
> +
>    if (VT.isVector()) {
> +    unsigned NumElemVT = VT.getVectorNumElements();
>      EVT ElemVT = VT.getVectorElementType();
>      SDValue Loads[4];
> -    // LLVM generates byte-addresing pointers, but we need to convert this to a
> -    // register index.  Each register holds 16 bytes (4 x 32), so in order to
> -    // get the register index, we need to divide the pointer by 16.
> -    Ptr = DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
> -                      DAG.getConstant(4, MVT::i32));
>  
> -    for (unsigned i = 0; i < 4; ++i) {
> +    for (unsigned i = 0; i < NumElemVT; ++i) {
>        Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
>                               Chain, Ptr,
> -                             DAG.getTargetConstant(i, MVT::i32), // Channel
> +                             DAG.getConstant(i, MVT::i32), // Channel
>                               Op.getOperand(2));
>      }
> -    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Loads, 4);
> +    for (unsigned i = NumElemVT; i < 4; ++i) {
> +      Loads[i] = DAG.getUNDEF(ElemVT);
> +    }
> +    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
> +    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
>    } else {
>      LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
>                                Chain, Ptr,
> -                              DAG.getTargetConstant(0, MVT::i32), // Channel
> +                              DAG.getConstant(0, MVT::i32), // Channel

Is there some reason to change this from a TargetConstant to a regular
Constant?  I prefer TargetConstant's because I figured that if we ever
wanted to do some automatic conversion from constants to inline
constants like AMDGPU::ZERO, AMDGPU::ONE, it would be better to use
target constants for situations like this, where this conversion does not
make sense.

>                                Op.getOperand(2));
>    }
>  
> @@ -863,32 +887,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const
>      return SDValue();
>    }
>  
> +  // LLVM generates byte-addresing pointers, but we need to convert this to a
> +  // register index.  Each register holds 16 bytes (4 x 32), so in order to
> +  // get the register index, we need to divide the pointer by 16.
> +  Ptr = DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
> +                    DAG.getConstant(4, MVT::i32));
> +
>    if (VT.isVector()) {
> +    unsigned NumElemVT = VT.getVectorNumElements();
>      EVT ElemVT = VT.getVectorElementType();
>      SDValue Stores[4];
>  
> -    // LLVM generates byte-addresing pointers, but we need to convert this to a
> -    // register index.  Each register holds 16 bytes (4 x 32), so in order to
> -    // get the register index, we need to divide the pointer by 16.
> -    Ptr = DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
> -                      DAG.getConstant(4, MVT::i32));
> -
> -    for (unsigned i = 0; i < 4; ++i) {
> +    for (unsigned i = 0; i < NumElemVT; ++i) {
>        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
>                                   Value, DAG.getConstant(i, MVT::i32));
>  
>        Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
>                                Chain, Elem, Ptr,
> -                              DAG.getTargetConstant(i, MVT::i32)); // Channel
> +                              DAG.getConstant(i, MVT::i32)); // Channel
>        MFI->IndirectChannels.set(i);
>      }
> -     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, 4);
> +     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
>     } else {
>      if (VT == MVT::i8) {
>        Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
>      }
> +    // We can go here with f32 elements from a v3f32.
> +    // Such elements have a non 16 bytes aligned addresses that we can use
> +    SDValue Channel = DAG.getNode(ISD::AND, DL, MVT::i32, Op.getOperand(2),
> +        DAG.getConstant(15, MVT::i32));
> +    Channel = DAG.getNode(ISD::SRL, DL, MVT::i32, Channel,
> +            DAG.getConstant(2, MVT::i32));

What happens if Op.getOperand(2) is not a compile time constant?

>      Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
> -    DAG.getTargetConstant(0, MVT::i32)); // Channel 
> +        Channel); // Channel 
>      MFI->IndirectChannels.set(0);
>    }
>  
> diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
> index d081824..09183e8 100644
> --- a/lib/Target/AMDGPU/R600Instructions.td
> +++ b/lib/Target/AMDGPU/R600Instructions.td
> @@ -1389,14 +1389,14 @@ class RegisterLoad <ValueType vt> : InstR600 <0x0,
>    (outs R600_Reg32:$dst), (ins FRAMEri:$addr, i32imm:$chan),
>    "RegisterLoad $dst, $addr",
>    [(set (vt R600_Reg32:$dst), (REGISTER_LOAD ADDRIndirect:$addr,
> -                               (i32 timm:$chan)))],
> +                               (i32 imm:$chan)))],
>    NullALU
>  >;
>  
>  class RegisterStore <ValueType vt> : InstR600 <0x0,
>    (outs), (ins R600_Reg32:$val, FRAMEri:$addr, i32imm:$chan),
>    "RegisterStore_i32 $val, $addr",
> -  [(REGISTER_STORE (vt R600_Reg32:$val), ADDRIndirect:$addr, (i32 timm:$chan))],
> +  [(REGISTER_STORE (vt R600_Reg32:$val), ADDRIndirect:$addr, (i32 imm:$chan))],
>    NullALU
>  >;
>  
> -- 
> 1.7.11.7
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list