[Mesa-dev] [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)

Sun Feb 22 20:01:27 PST 2015

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---

Untested beyond compiling a few shaders and checking that the generated
code looks plausible. nvdisasm agrees with envydis's decoding of the new
fp64 instructions.

Will definitely get ahold of a G200 to run tests on before pushing this.

 .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp  | 94 ++++++++++++++++++---
 .../nouveau/codegen/nv50_ir_lowering_nv50.cpp      | 97 +++++++++++++++++++++-
 .../nouveau/codegen/nv50_ir_target_nv50.cpp        |  2 +-
 src/gallium/drivers/nouveau/nv50/nv50_screen.c     |  4 +
 4 files changed, 185 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index b1e7409..7c6f7da 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
    void emitUADD(const Instruction *);
    void emitAADD(const Instruction *);
    void emitFADD(const Instruction *);
+   void emitDADD(const Instruction *);
    void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
+   void emitDMUL(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitDMAD(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
 
@@ -923,11 +926,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
          assert(0);
          break;
       }
-      code[1] |= i->src(0).mod.abs() << 20;
-      code[1] |= i->src(0).mod.neg() << 26;
-      code[1] |= i->src(1).mod.abs() << 19;
-      code[1] |= i->src(1).mod.neg() << 27;
    }
+
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
+   code[1] |= i->src(1).mod.abs() << 19;
+   code[1] |= i->src(1).mod.neg() << 27;
+
    emitForm_MAD(i);
 }
 
@@ -963,6 +968,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMAD(const Instruction *i) // emits OP_MAD/OP_FMA when dType is F64
+{
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); // product is negated iff exactly one factor is
+   const int neg_add = i->src(2).mod.neg(); // negate modifier on the addend
+
+   assert(i->encSize == 8); // only the 8-byte encoding is handled here
+   assert(!i->saturate); // no saturate bit is emitted by this function
+
+   code[1] = 0x40000000; // base opcode words; both words are overwritten, not OR'ed
+   code[0] = 0xe0000000;
+
+   code[1] |= neg_mul << 26; // bit 26 of the high word: negate the product
+   code[1] |= neg_add << 27; // bit 27 of the high word: negate the addend
+
+   roundMode_MAD(i); // rounding mode bits are filled in by the shared MAD helper
+
+   emitForm_MAD(i); // operands are encoded by the shared MAD form helper
+}
+
+void
 CodeEmitterNV50::emitFADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -997,6 +1022,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDADD(const Instruction *i) // emits OP_ADD/OP_SUB when dType is F64
+{
+   const int neg0 = i->src(0).mod.neg(); // negate modifier on the first source
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); // OP_SUB flips the second source's negation
+
+   assert(!(i->src(0).mod | i->src(1).mod).abs()); // abs modifiers are not encoded by this function
+   assert(!i->saturate); // no saturate bit is emitted by this function
+   assert(i->encSize == 8); // only the 8-byte encoding is handled here
+
+   code[1] = 0x60000000; // base opcode words; both words are overwritten, not OR'ed
+   code[0] = 0xe0000000;
+
+   emitForm_ADD(i); // operands are encoded by the shared ADD form helper
+
+   code[1] |= neg0 << 26; // bit 26 of the high word: negate source 0
+   code[1] |= neg1 << 27; // bit 27 of the high word: negate source 1 (or subtract)
+}
+
+void
 CodeEmitterNV50::emitUADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1090,6 +1134,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMUL(const Instruction *i) // emits OP_MUL when dType is F64
+{
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); // result is negated iff exactly one source is
+
+   assert(!i->saturate); // no saturate bit is emitted by this function
+   assert(i->encSize == 8); // only the 8-byte encoding is handled here
+
+   code[1] = 0x80000000; // base opcode words; both words are overwritten, not OR'ed
+   code[0] = 0xe0000000;
+
+   if (neg)
+      code[1] |= 0x08000000; // bit 27 of the high word: negate the product
+
+   roundMode_CVT(i->rnd); // note: uses the CVT rounding helper, unlike emitDMAD's roundMode_MAD
+
+   emitForm_MAD(i); // operands are encoded by the shared MAD form helper
+}
+
+void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
    code[0] = 0x60000000;
@@ -1150,9 +1213,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
    code[0] = 0x30000000;
    code[1] = 0x60000000;
 
-   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
    switch (i->sType) {
+   case TYPE_F64:
+      code[0] = 0xe0000000;
+      code[1] = 0xe0000000;
+      break;
    case TYPE_F32: code[0] |= 0x80000000; break;
    case TYPE_S32: code[1] |= 0x0c000000; break;
    case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1162,6 +1227,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
       assert(0);
       break;
    }
+
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
    if (i->src(0).mod.neg()) code[1] |= 0x04000000;
    if (i->src(1).mod.neg()) code[1] |= 0x08000000;
    if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1725,7 +1793,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDADD(insn);
+      else if (isFloatType(insn->dType))
          emitFADD(insn);
       else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
          emitAADD(insn);
@@ -1733,14 +1803,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMUL(insn);
+      else if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMAD(insn);
+      else if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
@@ -1912,7 +1986,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 {
    const Target::OpInfo &info = targ->getOpInfo(i);
 
-   if (info.minEncSize > 4)
+   if (info.minEncSize > 4 || i->dType == TYPE_F64)
       return 8;
 
    // check constraints on dst and src operands
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 1ad0860..d5dadc2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -314,6 +314,7 @@ private:
    void handleDIV(Instruction *);
    void handleMOD(Instruction *);
    void handleMUL(Instruction *);
+   void handleDRCPRSQ(Instruction *);
    void handleAddrDef(Instruction *);
 
    inline bool isARL(const Instruction *) const;
@@ -552,6 +553,95 @@ NV50LegalizeSSA::handleMOD(Instruction *mod)
    mod->setSrc(1, m);
 }
 
+void
+NV50LegalizeSSA::handleDRCPRSQ(Instruction *i)
+{
+   /* Lower an F64 RCP/RSQ into an initial guess refined by two Newton-Raphson
+    * steps. The guess is the "quake" style approximation for RSQ on |src|:
+    *
+    * 0x5fe6eb50c7b537a9 - num >> 1
+    *
+    * For RCP the RSQ guess is squared first. NOTE(review): the trick is an
+    * integer op on the bit pattern, but the SUB below is TYPE_F64 - confirm.
+    */
+   Value *abs, *guess, *parts[2], *input[2], *shr[4], *pred;
+
+   bld.setPosition(i, false);
+
+   abs = bld.mkOp1v(OP_ABS, TYPE_F64, bld.getSSA(8), i->getSrc(0)); // |src|; the RCP path restores the sign later
+
+   parts[0] = bld.loadImm(NULL, 0xc7b537a9); // low 32 bits of the magic constant
+   parts[1] = bld.loadImm(NULL, 0x5fe6eb50); // high 32 bits of the magic constant
+   guess = bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), parts[0], parts[1]);
+
+   bld.mkSplit(input, 4, abs); // input[1] is treated as the high word below
+   shr[0] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[0], bld.mkImm(1));
+   shr[1] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[1], bld.mkImm(1));
+
+   // Carry across the word boundary: if the bottom bit of the high word was
+   // set, set the high bit of the (already shifted) bottom word.
+   pred = bld.getSSA(1, FILE_FLAGS);
+   bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 1))
+      ->setFlagsDef(0, pred); // result is discarded; only the flags are kept
+   shr[2] = bld.getSSA(4); shr[3] = bld.getSSA(4);
+   bld.mkOp2(OP_OR, TYPE_U32, shr[2], shr[0], bld.loadImm(NULL, 0x80000000))
+      ->setPredicate(CC_S, pred); // NOTE(review): AND with 1 leaves bit 31 clear - confirm CC_S is the right test
+   bld.mkMov(shr[3], shr[0])
+      ->setPredicate(CC_NS, pred);
+   shr[0] = bld.mkOp2v(OP_UNION, TYPE_U32, bld.getSSA(4), shr[2], shr[3]); // join the two predicated results
+
+   guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), guess,
+                      bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), shr[0], shr[1]));
+
+   if (i->op == OP_RCP) {
+      Value *two = bld.getSSA(8), *neg = bld.getSSA(8), *copy = bld.getSSA(8);
+
+      bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); // 2.0 as F64, converted from an F32 immediate
+
+      /* Square the RSQ guess to turn it into a guess for 1 / |input| */
+      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess);
+
+      // RCP: x_{n+1} = 2 * x_n - input * x_n^2 (applied twice, on |input|)
+      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+
+      // Restore the sign of the original source on the output
+      bld.mkSplit(input, 4, i->getSrc(0));
+      bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 0x80000000))
+         ->setFlagsDef(0, (pred = bld.getSSA(1, FILE_FLAGS))); // flags from the top bit of the high word
+      bld.mkOp1(OP_NEG, TYPE_F64, neg, guess)
+         ->setPredicate(CC_S, pred);
+      bld.mkMov(copy, guess)
+         ->setPredicate(CC_NS, pred);
+      guess = bld.mkOp2v(OP_UNION, TYPE_U64, bld.getSSA(8), neg, copy); // join the two predicated results
+   } else {
+      Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
+      bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
+      bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
+
+      half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, abs); // -0.5 * |input|
+      // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) (applied twice)
+      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+                                    three_half));
+      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+                                    three_half));
+   }
+
+   i->op = OP_MOV; // the original instruction becomes a MOV of the refined result
+   i->setSrc(0, guess);
+}
+
+
 bool
 NV50LegalizeSSA::visit(BasicBlock *bb)
 {
@@ -578,6 +668,11 @@ NV50LegalizeSSA::visit(BasicBlock *bb)
       case OP_MUL:
          handleMUL(insn);
          break;
+      case OP_RCP:
+      case OP_RSQ:
+         if (insn->dType == TYPE_F64)
+            handleDRCPRSQ(insn);
+         break;
       default:
          break;
       }
@@ -1162,7 +1257,7 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
 bool
 NV50LoweringPreSSA::handleSQRT(Instruction *i)
 {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+   Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
                                 bld.getSSA(), i->getSrc(0));
    i->op = OP_MUL;
    i->setSrc(1, rsq->getDef(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 178a167..f3d8733 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -388,7 +388,7 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const
 bool
 TargetNV50::isOpSupported(operation op, DataType ty) const
 {
-   if (ty == TYPE_F64 && chipset < 0xa0)
+   if (ty == TYPE_F64 && chipset != 0xa0)
       return false;
 
    switch (op) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index ed07ba4..4532957 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -237,6 +237,8 @@ static int
 nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                              enum pipe_shader_cap param)
 {
+   struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+
    switch (shader) {
    case PIPE_SHADER_VERTEX:
    case PIPE_SHADER_GEOMETRY:
@@ -287,7 +289,9 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
       return MIN2(32, PIPE_MAX_SAMPLERS);
    case PIPE_SHADER_CAP_DOUBLES:
+      return dev->chipset == 0xa0;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+      return dev->chipset == 0xa0;
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       return 0;
    default:
-- 
2.0.5