[Beignet] [PATCH v2 1/2] reimplement the LZD instruction in backend.

Song, Ruiling ruiling.song at intel.com
Fri Jan 23 00:51:44 PST 2015


It is a great job. But I have some comment inline.
And also could you help further enable our clz() implementation to use this? Our current clz() in libocl is very inefficient.
>            break;
>            case Intrinsic::ctlz:
>            {
> -            ir::Type srcType = getType(ctx, I.getType());
> +            Type *llvmDstType = I.getType();
> +            ir::Type dstType = getType(ctx, llvmDstType);
> +            Type *llvmSrcType = I.getOperand(0)->getType();
> +            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
> +
>              const ir::Register dst = this->getRegister(&I);
>              const ir::Register src = this->getRegister(I.getOperand(0));
> -            ctx.ALU1(ir::OP_LZD, srcType, dst, src);

Here type_u16 and type_u8 should be very similar, could you try to merge them together? Have duplicate code is not a good idea.
> +            if(srcType == ir::TYPE_U16) {
> +              ir::ImmediateIndex imm;
> +              ir::Type tmpType = ir::TYPE_S32;
> +              const ir::RegisterFamily family = getFamily(tmpType);
> +              imm = ctx.newIntegerImmediate(16, tmpType);
> +              const ir::Register immReg = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S32, immReg, imm);
> +
> +              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> +              ctx.CVT(tmpType, srcType, tmp0, src);
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
> +              ctx.SUB(tmpType, tmp2, tmp1, immReg);
> +              ctx.CVT(dstType, tmpType, dst, tmp2);
> +            }else if(srcType == ir::TYPE_U8) {
> +              ir::ImmediateIndex imm;
> +              ir::Type tmpType = ir::TYPE_U32;
> +              const ir::RegisterFamily family = getFamily(tmpType);
> +              imm = ctx.newIntegerImmediate(24, tmpType);
> +              const ir::Register immReg = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S32, immReg, imm);
> +
> +              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> +              ctx.CVT(tmpType, srcType, tmp0, src);
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
> +              ctx.SUB(tmpType, tmp2, tmp1, immReg);
> +              ctx.CVT(dstType, tmpType, dst, tmp2);
> +            }else if(srcType == ir::TYPE_U64) {
> +              ir::ImmediateIndex imm;
> +              ir::Type tmpType = ir::TYPE_U32;
> +              const ir::RegisterFamily family = getFamily(srcType);
> +              imm = ctx.newIntegerImmediate(32, srcType);
> +              const ir::Register immReg = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S64, immReg, imm);
> +
> +              ir::ImmediateIndex immMask_L =
> ctx.newIntegerImmediate(0xFFFFFFFF, srcType);

The mask immMask_L here seems unnecessary. A convert from i64 to i32 will directly remove the high 32bit. Right?


> +              const ir::Register regMask_L = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S64, regMask_L, immMask_L);
> +              ir::Register qwSrc_L = ctx.reg(getFamily(srcType));
> +              ctx.AND(srcType, qwSrc_L, src, regMask_L);
> +
> +              const ir::RegisterFamily tmpFamily = getFamily(tmpType);
> +              const ir::ImmediateIndex imm32 =
> ctx.newIntegerImmediate(32, tmpType);
> +              const ir::Register imm32Reg = ctx.reg(tmpFamily);
> +              ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32);
> +
> +              ir::Register tmp0 = ctx.reg(getFamily(srcType));
> +              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp3 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp4 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp5 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp6 = ctx.reg(getFamily(tmpType));
> +              ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
> +
> +              ctx.SHR(srcType, tmp0, src, immReg);
> +              ctx.CVT(tmpType, srcType, tmp1, tmp0);
> +
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1);
> +              ctx.LT(tmpType, cmp, tmp2, imm32Reg);
> +
> +              ctx.CVT(tmpType, srcType, tmp3, qwSrc_L);
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3);
> +              ctx.ADD(tmpType, tmp5, tmp4, imm32Reg);
> +
> +              ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5);
> +              ctx.CVT(dstType, tmpType, dst, tmp6);
> +            }
> +            else
> +            {
> +              ctx.ALU1(ir::OP_LZD, dstType, dst, src);
> +            }
>            }
>            break;
>            case Intrinsic::fma:
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list