<div dir="ltr"><blockquote type="cite">From: Tom Stellard <thomas.stellard at <a href="http://amd.com" target="_blank">amd.com</a>> These instructions compare two floating point values and return an integer true (-1) or false (0) value. When compiling code generated by the Mesa GLSL frontend, the SET*_DX10 instructions save us four instructions for most branch decisions that use floating-point comparisons. --- lib/Target/R600/R600ISelLowering.cpp | 108 +++++++++++++++++++------- lib/Target/R600/R600Instructions.td | 52 +++++++++++++ test/CodeGen/R600/fcmp.ll | 4 +- test/CodeGen/R600/set-dx10.ll | 137 ++++++++++++++++++++++++++++++++++ test/CodeGen/R600/unsuported-cc.ll | 24 +++--- 5 files changed, 281 insertions(+), 44 deletions(-) create mode 100644 test/CodeGen/R600/set-dx10.ll diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index abfee16..c4aa172 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -90,7 +90,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); setSchedulingPreference(Sched::VLIW); } @@ -663,9 +665,12 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } // Try to lower to a SET* instruction: - // We need all the operands of SELECT_CC to have the same value type, so if - // necessary we need to change True and False to be the same type as LHS and - // RHS, and then convert the result of the select_cc back to the correct type. + // + // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware, + // but for the other case where CompareVT != VT, all operands of + // SELECT_CC to have the same value type, so we need to change True and False </blockquote> "all operands of SELECT_CC to have". Maybe "need to have"? <blockquote type="cite">+ // to be the same type as LHS and RHS, and then convert the result of the + // select_cc back to the correct type. // Move hardware True/False values to the correct operand. if (isHWTrueValue(False) && isHWFalseValue(True)) { @@ -675,32 +680,17 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } if (isHWTrueValue(True) && isHWFalseValue(False)) { - if (CompareVT != VT) { - if (VT == MVT::f32 && CompareVT == MVT::i32) { - SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - LHS, RHS, - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - CC); - // Convert integer values of true (-1) and false (0) to fp values of - // true (1.0f) and false (0.0f). - SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, - DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); - } else if (VT == MVT::i32 && CompareVT == MVT::f32) { - SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - LHS, RHS, - DAG.getConstantFP(1.0f, MVT::f32), - DAG.getConstantFP(0.0f, MVT::f32), - CC); - // Convert fp values of true (1.0f) and false (0.0f) to integer values - // of true (-1) and false (0). - SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt); - return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg); - } else { - // I don't think there will be any other type pairings. - assert(!"Unhandled operand type parings in SELECT_CC"); - } + if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) { + SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + LHS, RHS, + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + CC); + // Convert integer values of true (-1) and false (0) to fp values of + // true (1.0f) and false (0.0f). + SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, + DAG.getConstant(1, MVT::i32)); + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); } else { // This SELECT_CC is already legal. return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); @@ -1121,6 +1111,35 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } break; } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. + case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, MVT::i32), // True + DAG.getConstant(0, MVT::i32), // Flase + SelectCC.getOperand(4)); // CC + + break; + } // Extract_vec (Build_vector) generated by custom lowering // also needs to be customly combined case ISD::EXTRACT_VECTOR_ELT: { @@ -1140,6 +1159,37 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } } } + + case ISD::SELECT_CC: { + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode() || + cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) { + return SDValue(); + } + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get(); + CCOpcode = ISD::getSetCCInverse( + CCOpcode, LHS.getOperand(0).getValueType().isInteger()); + return DAG.getSelectCC(N->getDebugLoc(), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + CCOpcode); + + } } return SDValue(); } diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 75ac31f..a71434a 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -711,6 +711,34 @@ def SNE : R600_2OP < COND_NE))] >; +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_EQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GE))] +>; + +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_NE))] +>; + def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; @@ -1771,6 +1799,18 @@ def : Pat < (SGE R600_Reg32:$src1, R600_Reg32:$src0) >; +// SETGT_DX10 reverse args +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT), + (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGE_DX10 reverse args +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE), + (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0) +>; + // SETGT_INT reverse args def : Pat < (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT), @@ -1809,12 +1849,24 @@ def : Pat < (SETE R600_Reg32:$src0, R600_Reg32:$src1) >; +//SETE_DX10 - 'true if ordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO), + (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) +>; + //SNE - 'true if unordered' def : Pat < (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO), (SNE R600_Reg32:$src0, R600_Reg32:$src1) >; +//SETNE_DX10 - 'true if ordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO), + (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) +>; + def : Extract_Element <f32, v4f32, R600_Reg128, 0, sel_x>; def : Extract_Element <f32, v4f32, R600_Reg128, 1, sel_y>; def : Extract_Element <f32, v4f32, R600_Reg128, 2, sel_z>; diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll index 1dcd07c..89f5e9e 100644 --- a/test/CodeGen/R600/fcmp.ll +++ b/test/CodeGen/R600/fcmp.ll @@ -1,8 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll new file mode 100644 index 0000000..a9eb22f --- /dev/null +++ b/test/CodeGen/R600/set-dx10.ll @@ -0,0 +1,137 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests check that floating point comparisons which are used by select +; to store integer true (-1) and false (0) values are lowered one of the </blockquote> "lowered one of the " -> "lowered to one of the" <blockquote type="cite">+; SET*DX10 instructions. + +; CHECK: @fcmp_une_select_fptosi +; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_une_select_i32 +; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ueq_select_fptosi +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ueq float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ueq_select_i32 +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ueq float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ugt_select_fptosi +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ugt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ugt_select_i32 +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ugt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_uge_select_fptosi +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp uge float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_uge_select_i32 +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp uge float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ule_select_fptosi +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ule_select_i32 +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ult_select_fptosi +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ult_select_i32 +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/unsuported-cc.ll b/test/CodeGen/R600/unsuported-cc.ll index 5d4c782..b48c591 100644 --- a/test/CodeGen/R600/unsuported-cc.ll +++ b/test/CodeGen/R600/unsuported-cc.ll </blockquote> Not related to this patch, but should we rename this file to unsupported-cc.ll (2 p's). I'm not qualified to do a technical review of this, so all you get is spelling and grammar :) --Aaron W <blockquote type="cite">@@ -24,21 +24,21 @@ entry: ; CHECK: @ult_float ; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) -define void @ult_float(i32 addrspace(1)* %out, float %in) { +define void @ult_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out ret void } ; CHECK: @olt ; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) -define void @olt(i32 addrspace(1)* %out, float %in) { +define void @olt(float addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out ret void } @@ -64,20 +64,20 @@ entry: ; CHECK: @ule_float ; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) -define void @ule_float(i32 addrspace(1)* %out, float %in) { +define void @ule_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out ret void } ; CHECK: @ole ; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) -define void @ole(i32 addrspace(1)* %out, float %in) { +define void @ole(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out ret void } -- 1.7.8.6 </blockquote> </div>