[Mesa-dev] [PATCH 3/3] nv50, nvc0: handle SQRT lowering inside the driver

Sun Mar 13 15:41:45 UTC 2016

On Sun, Mar 13, 2016 at 7:52 AM, Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
>
>
> On 03/13/2016 04:07 AM, Ilia Mirkin wrote:
>>
>> First off, st/mesa lowers DSQRT incorrectly (it uses CMP to attempt to
>> find out whether the input is less than 0). Secondly the current
>> approach (x * rsq(x)) behaves poorly for x = inf - a NaN is produced
>> instead of inf.
>>
>
> When I had a look at this precision thing, this seemed a bit weird to me but
> I did not investigate much of time on it.
>
>> Instead we switch to the less accurate rcp(rsq(x)) method - this behaves
>> nicely for all valid inputs. We still don't do this for DSQRT since the
>> RSQ/RCP ops are *really* inaccurate, and don't even have Newton-Raphson
>> steps right now. Eventually we should have a separate library function
>> for DSQRT that does it more precisely (and perhaps move this lowering to
>> the post-opt phase).
>>
>> This fixes a number of dEQP precision tests that were expecting better
>> behavior for infinite inputs.
>
>
> Yep, this fixes distance, length, refract and sqrt deqp precision tests (37
> tests). But they are still some fails with atan2, idexp and tanh.
> Are you going to fix them too?

Not today. (that's "ldexp" btw) I glanced at atan2 - looks like it's
returning results out of range... perhaps a min/max on the poly
generated by the lowering logic is in order. Didn't investigate too
closely.

>
> Except that you forgot to remove unused variables:
> codegen/nv50_ir_lowering_nvc0.cpp: In member function ‘bool
> nv50_ir::NVC0LoweringPass::handleSQRT(nv50_ir::Instruction*)’:
> codegen/nv50_ir_lowering_nvc0.cpp:1785:20: warning: unused variable ‘mov’
> [-Wunused-variable]
>        Instruction *mov, *rsq;
>                     ^
> codegen/nv50_ir_lowering_nvc0.cpp:1785:26: warning: unused variable ‘rsq’
> [-Wunused-variable]
>        Instruction *mov, *rsq;
>                           ^
>   CC       nvc0/nvc0_surface.lo

Yeah, I noticed that about 5s after I sent the patch out... already
fixed locally.

>
> This patch is:
>
> Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> Tested-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>

Thanks!

>
>
>>
>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>> ---
>>   .../drivers/nouveau/codegen/nv50_ir_build_util.cpp |  6 +++-
>>   .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |  2 ++
>>   .../nouveau/codegen/nv50_ir_lowering_nv50.cpp      |  7 ++---
>>   .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 32
>> +++++++++++-----------
>>   src/gallium/drivers/nouveau/nv50/nv50_screen.c     |  2 +-
>>   src/gallium/drivers/nouveau/nvc0/nvc0_screen.c     |  2 +-
>>   6 files changed, 28 insertions(+), 23 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
>> index f58cf97..84ebfdb 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
>> @@ -585,6 +585,7 @@ BuildUtil::split64BitOpPostRA(Function *fn,
>> Instruction *i,
>>            return NULL;
>>         srcNr = 2;
>>         break;
>> +   case OP_SELP: srcNr = 3; break;
>>      default:
>>         // TODO when needed
>>         return NULL;
>> @@ -601,7 +602,10 @@ BuildUtil::split64BitOpPostRA(Function *fn,
>> Instruction *i,
>>
>>      for (int s = 0; s < srcNr; ++s) {
>>         if (lo->getSrc(s)->reg.size < 8) {
>> -         hi->setSrc(s, zero);
>> +         if (s == 2)
>> +            hi->setSrc(s, lo->getSrc(s));
>> +         else
>> +            hi->setSrc(s, zero);
>>         } else {
>>            if (lo->getSrc(s)->refCount() > 1)
>>               lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> index b06d86a..d284446 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> @@ -616,6 +616,7 @@ static nv50_ir::operation translateOpcode(uint opcode)
>>
>>      NV50_IR_OPCODE_CASE(RCP, RCP);
>>      NV50_IR_OPCODE_CASE(RSQ, RSQ);
>> +   NV50_IR_OPCODE_CASE(SQRT, SQRT);
>>
>>      NV50_IR_OPCODE_CASE(MUL, MUL);
>>      NV50_IR_OPCODE_CASE(ADD, ADD);
>> @@ -2689,6 +2690,7 @@ Converter::handleInstruction(const struct
>> tgsi_full_instruction *insn)
>>      case TGSI_OPCODE_FLR:
>>      case TGSI_OPCODE_TRUNC:
>>      case TGSI_OPCODE_RCP:
>> +   case TGSI_OPCODE_SQRT:
>>      case TGSI_OPCODE_IABS:
>>      case TGSI_OPCODE_INEG:
>>      case TGSI_OPCODE_NOT:
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
>> index 8752b0c..12c5f69 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
>> @@ -1203,10 +1203,9 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
>>   bool
>>   NV50LoweringPreSSA::handleSQRT(Instruction *i)
>>   {
>> -   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
>> -                                bld.getSSA(), i->getSrc(0));
>> -   i->op = OP_MUL;
>> -   i->setSrc(1, rsq->getDef(0));
>> +   bld.setPosition(i, true);
>> +   i->op = OP_RSQ;
>> +   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
>>
>>      return true;
>>   }
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index d181f15..29b77c9 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -1778,22 +1778,22 @@ NVC0LoweringPass::handleMOD(Instruction *i)
>>   bool
>>   NVC0LoweringPass::handleSQRT(Instruction *i)
>>   {
>> -   Value *pred = bld.getSSA(1, FILE_PREDICATE);
>> -   Value *zero = bld.getSSA();
>> -   Instruction *rsq;
>> -
>> -   bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
>> -   if (i->dType == TYPE_F64)
>> -      zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
>> -   bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0),
>> zero);
>> -   bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P,
>> pred);
>> -   rsq = bld.mkOp1(OP_RSQ, i->dType,
>> -                   bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
>> -   rsq->setPredicate(CC_NOT_P, pred);
>> -   i->op = OP_MUL;
>> -   i->setSrc(1, rsq->getDef(0));
>> -   i->setPredicate(CC_NOT_P, pred);
>> -
>> +   if (i->dType == TYPE_F64) {
>> +      Value *pred = bld.getSSA(1, FILE_PREDICATE);
>> +      Value *zero = bld.loadImm(NULL, 0.0d);
>> +      Value *dst = bld.getSSA(8);
>> +      Instruction *mov, *rsq;
>> +      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
>> +      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0),
>> zero);
>> +      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
>> +      i->op = OP_MUL;
>> +      i->setSrc(1, dst);
>> +      // TODO: Handle this properly with a library function
>> +   } else {
>> +      bld.setPosition(i, true);
>> +      i->op = OP_RSQ;
>> +      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
>> +   }
>>
>>      return true;
>>   }
>> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
>> b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
>> index 28e0ed3..5836bb2 100644
>> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
>> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
>> @@ -305,7 +305,7 @@ nv50_screen_get_shader_param(struct pipe_screen
>> *pscreen, unsigned shader,
>>      case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
>>         return 1;
>>      case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
>> -      return 0;
>> +      return 1;
>>      case PIPE_SHADER_CAP_SUBROUTINES:
>>         return 0; /* please inline, or provide function declarations */
>>      case PIPE_SHADER_CAP_INTEGERS:
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
>> b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
>> index 30afdf2f0..3c5b1da 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
>> @@ -328,7 +328,7 @@ nvc0_screen_get_shader_param(struct pipe_screen
>> *pscreen, unsigned shader,
>>      case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
>>         return 1;
>>      case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
>> -      return 0;
>> +      return 1;
>>      case PIPE_SHADER_CAP_SUBROUTINES:
>>         return 1;
>>      case PIPE_SHADER_CAP_INTEGERS:
>>
>