[Nouveau] [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
Ilia Mirkin
imirkin at alum.mit.edu
Sun Feb 22 20:01:27 PST 2015
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
Untested beyond compiling a few shaders to see if they look like they
might work. nvdisasm agrees with envydis's decoding of these things.
Will definitely get ahold of a G200 to run tests on before pushing this.
.../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 94 ++++++++++++++++++---
.../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 97 +++++++++++++++++++++-
.../nouveau/codegen/nv50_ir_target_nv50.cpp | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_screen.c | 4 +
4 files changed, 185 insertions(+), 12 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index b1e7409..7c6f7da 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
void emitUADD(const Instruction *);
void emitAADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
@@ -923,11 +926,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
assert(0);
break;
}
- code[1] |= i->src(0).mod.abs() << 20;
- code[1] |= i->src(0).mod.neg() << 26;
- code[1] |= i->src(1).mod.abs() << 19;
- code[1] |= i->src(1).mod.neg() << 27;
}
+
+ code[1] |= i->src(0).mod.abs() << 20;
+ code[1] |= i->src(0).mod.neg() << 26;
+ code[1] |= i->src(1).mod.abs() << 19;
+ code[1] |= i->src(1).mod.neg() << 27;
+
emitForm_MAD(i);
}
@@ -963,6 +968,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
}
void
+CodeEmitterNV50::emitDMAD(const Instruction *i)
+{
+ const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+ const int neg_add = i->src(2).mod.neg();
+
+ assert(i->encSize == 8);
+ assert(!i->saturate);
+
+ code[1] = 0x40000000;
+ code[0] = 0xe0000000;
+
+ code[1] |= neg_mul << 26;
+ code[1] |= neg_add << 27;
+
+ roundMode_MAD(i);
+
+ emitForm_MAD(i);
+}
+
+void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
@@ -997,6 +1022,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
}
void
+CodeEmitterNV50::emitDADD(const Instruction *i)
+{
+ const int neg0 = i->src(0).mod.neg();
+ const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+ assert(!(i->src(0).mod | i->src(1).mod).abs());
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x60000000;
+ code[0] = 0xe0000000;
+
+ emitForm_ADD(i);
+
+ code[1] |= neg0 << 26;
+ code[1] |= neg1 << 27;
+}
+
+void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
@@ -1090,6 +1134,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
}
void
+CodeEmitterNV50::emitDMUL(const Instruction *i)
+{
+ const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x80000000;
+ code[0] = 0xe0000000;
+
+ if (neg)
+ code[1] |= 0x08000000;
+
+ roundMode_CVT(i->rnd);
+
+ emitForm_MAD(i);
+}
+
+void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
code[0] = 0x60000000;
@@ -1150,9 +1213,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
code[0] = 0x30000000;
code[1] = 0x60000000;
- emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
switch (i->sType) {
+ case TYPE_F64:
+ code[0] = 0xe0000000;
+ code[1] = 0xe0000000;
+ break;
case TYPE_F32: code[0] |= 0x80000000; break;
case TYPE_S32: code[1] |= 0x0c000000; break;
case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1162,6 +1227,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
assert(0);
break;
}
+
+ emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
if (i->src(0).mod.neg()) code[1] |= 0x04000000;
if (i->src(1).mod.neg()) code[1] |= 0x08000000;
if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1725,7 +1793,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
emitAADD(insn);
@@ -1733,14 +1803,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
@@ -1912,7 +1986,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
- if (info.minEncSize > 4)
+ if (info.minEncSize > 4 || i->dType == TYPE_F64)
return 8;
// check constraints on dst and src operands
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 1ad0860..d5dadc2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -314,6 +314,7 @@ private:
void handleDIV(Instruction *);
void handleMOD(Instruction *);
void handleMUL(Instruction *);
+ void handleDRCPRSQ(Instruction *);
void handleAddrDef(Instruction *);
inline bool isARL(const Instruction *) const;
@@ -552,6 +553,95 @@ NV50LegalizeSSA::handleMOD(Instruction *mod)
mod->setSrc(1, m);
}
+void
+NV50LegalizeSSA::handleDRCPRSQ(Instruction *i)
+{
+ /* We need to replace this instruction with a sequence that computes the
+ * appropriate function. As a first guess, we use the "quake" style
+ * approximation for RSQ:
+ *
+ * 0x5fe6eb50c7b537a9 - num >> 1
+ *
+ * For RCP, we will then square it.
+ */
+ Value *abs, *guess, *parts[2], *input[2], *shr[4], *pred;
+
+ bld.setPosition(i, false);
+
+ abs = bld.mkOp1v(OP_ABS, TYPE_F64, bld.getSSA(8), i->getSrc(0));
+
+ parts[0] = bld.loadImm(NULL, 0xc7b537a9);
+ parts[1] = bld.loadImm(NULL, 0x5fe6eb50);
+ guess = bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), parts[0], parts[1]);
+
+ bld.mkSplit(input, 4, abs);
+ shr[0] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[0], bld.mkImm(1));
+ shr[1] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[1], bld.mkImm(1));
+
+ // If the bottom bit of the high word was set, set the high bit of the
+ // bottom word.
+ pred = bld.getSSA(1, FILE_FLAGS);
+ bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 1))
+ ->setFlagsDef(0, pred);
+ shr[2] = bld.getSSA(4); shr[3] = bld.getSSA(4);
+ bld.mkOp2(OP_OR, TYPE_U32, shr[2], shr[0], bld.loadImm(NULL, 0x80000000))
+ ->setPredicate(CC_S, pred);
+ bld.mkMov(shr[3], shr[0])
+ ->setPredicate(CC_NS, pred);
+ shr[0] = bld.mkOp2v(OP_UNION, TYPE_U32, bld.getSSA(4), shr[2], shr[3]);
+
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), shr[0], shr[1]));
+
+ if (i->op == OP_RCP) {
+ Value *two = bld.getSSA(8), *neg = bld.getSSA(8), *copy = bld.getSSA(8);
+
+ bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
+
+ /* Square the guess first, since it was for RSQ */
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess);
+
+ // RCP: x_{n+1} = 2 * x_n - input * x_n^2
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+
+ // Restore the sign on the output
+ bld.mkSplit(input, 4, i->getSrc(0));
+ bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 0x80000000))
+ ->setFlagsDef(0, (pred = bld.getSSA(1, FILE_FLAGS)));
+ bld.mkOp1(OP_NEG, TYPE_F64, neg, guess)
+ ->setPredicate(CC_S, pred);
+ bld.mkMov(copy, guess)
+ ->setPredicate(CC_NS, pred);
+ guess = bld.mkOp2v(OP_UNION, TYPE_U64, bld.getSSA(8), neg, copy);
+ } else {
+ Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
+ bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
+ bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
+
+ half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, abs);
+ // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ }
+
+ i->op = OP_MOV;
+ i->setSrc(0, guess);
+}
+
+
bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
@@ -578,6 +668,11 @@ NV50LegalizeSSA::visit(BasicBlock *bb)
case OP_MUL:
handleMUL(insn);
break;
+ case OP_RCP:
+ case OP_RSQ:
+ if (insn->dType == TYPE_F64)
+ handleDRCPRSQ(insn);
+ break;
default:
break;
}
@@ -1162,7 +1257,7 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+ Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
bld.getSSA(), i->getSrc(0));
i->op = OP_MUL;
i->setSrc(1, rsq->getDef(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 178a167..f3d8733 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -388,7 +388,7 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const
bool
TargetNV50::isOpSupported(operation op, DataType ty) const
{
- if (ty == TYPE_F64 && chipset < 0xa0)
+ if (ty == TYPE_F64 && chipset != 0xa0)
return false;
switch (op) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index ed07ba4..4532957 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -237,6 +237,8 @@ static int
nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
enum pipe_shader_cap param)
{
+ struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+
switch (shader) {
case PIPE_SHADER_VERTEX:
case PIPE_SHADER_GEOMETRY:
@@ -287,7 +289,9 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
return MIN2(32, PIPE_MAX_SAMPLERS);
case PIPE_SHADER_CAP_DOUBLES:
+ return dev->chipset == 0xa0;
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+ return dev->chipset == 0xa0;
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
default:
--
2.0.5
More information about the Nouveau
mailing list