From c1e4a6bfbf015801c6a8b0ae694482421a22c2d9 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Mar 2016 21:26:21 -0500
Subject: [PATCH] nv50,nvc0: handle SQRT lowering inside the driver

First off, st/mesa lowers DSQRT incorrectly (it uses CMP to attempt to
find out whether the input is less than 0). Secondly the current
approach (x * rsq(x)) behaves poorly for x = inf - a NaN is produced
instead of inf.

Instead we switch to the less accurate rcp(rsq(x)) method - this behaves
nicely for all valid inputs. We still don't do this for DSQRT since the
RSQ/RCP ops are *really* inaccurate, and don't even have Newton-Raphson
steps right now. Eventually we should have a separate library function
for DSQRT that does it more precisely (and perhaps move this lowering to
the post-opt phase).

This fixes a number of dEQP precision tests that were expecting better
behavior for infinite inputs.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 .../drivers/nouveau/codegen/nv50_ir_build_util.cpp |  6 ++++-
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |  2 ++
 .../nouveau/codegen/nv50_ir_lowering_nv50.cpp      |  7 +++--
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 31 +++++++++++-----------
 src/gallium/drivers/nouveau/nv50/nv50_screen.c     |  2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c     |  2 +-
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index f58cf97..84ebfdb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -585,6 +585,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
          return NULL;
       srcNr = 2;
       break;
+   case OP_SELP: srcNr = 3; break;
    default:
       // TODO when needed
       return NULL;
@@ -601,7 +602,10 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
 
    for (int s = 0; s < srcNr; ++s) {
       if (lo->getSrc(s)->reg.size < 8) {
-         hi->setSrc(s, zero);
+         if (s == 2)
+            hi->setSrc(s, lo->getSrc(s));
+         else
+            hi->setSrc(s, zero);
       } else {
          if (lo->getSrc(s)->refCount() > 1)
             lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index b06d86a..d284446 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -616,6 +616,7 @@ static nv50_ir::operation translateOpcode(uint opcode)
 
    NV50_IR_OPCODE_CASE(RCP, RCP);
    NV50_IR_OPCODE_CASE(RSQ, RSQ);
+   NV50_IR_OPCODE_CASE(SQRT, SQRT);
 
    NV50_IR_OPCODE_CASE(MUL, MUL);
    NV50_IR_OPCODE_CASE(ADD, ADD);
@@ -2689,6 +2690,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
    case TGSI_OPCODE_FLR:
    case TGSI_OPCODE_TRUNC:
    case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_SQRT:
    case TGSI_OPCODE_IABS:
    case TGSI_OPCODE_INEG:
    case TGSI_OPCODE_NOT:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 8752b0c..12c5f69 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -1203,10 +1203,9 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
 bool
 NV50LoweringPreSSA::handleSQRT(Instruction *i)
 {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
-                                bld.getSSA(), i->getSrc(0));
-   i->op = OP_MUL;
-   i->setSrc(1, rsq->getDef(0));
+   bld.setPosition(i, true);
+   i->op = OP_RSQ;
+   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
 
    return true;
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index d181f15..d0936d8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1778,22 +1778,21 @@ NVC0LoweringPass::handleMOD(Instruction *i)
 bool
 NVC0LoweringPass::handleSQRT(Instruction *i)
 {
-   Value *pred = bld.getSSA(1, FILE_PREDICATE);
-   Value *zero = bld.getSSA();
-   Instruction *rsq;
-
-   bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
-   if (i->dType == TYPE_F64)
-      zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
-   bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
-   bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred);
-   rsq = bld.mkOp1(OP_RSQ, i->dType,
-                   bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
-   rsq->setPredicate(CC_NOT_P, pred);
-   i->op = OP_MUL;
-   i->setSrc(1, rsq->getDef(0));
-   i->setPredicate(CC_NOT_P, pred);
-
+   if (i->dType == TYPE_F64) {
+      Value *pred = bld.getSSA(1, FILE_PREDICATE);
+      Value *zero = bld.loadImm(NULL, 0.0d);
+      Value *dst = bld.getSSA(8);
+      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
+      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
+      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
+      i->op = OP_MUL;
+      i->setSrc(1, dst);
+      // TODO: Handle this properly with a library function
+   } else {
+      bld.setPosition(i, true);
+      i->op = OP_RSQ;
+      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
+   }
 
    return true;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 28e0ed3..5836bb2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -305,7 +305,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
       return 1;
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-      return 0;
+      return 1;
    case PIPE_SHADER_CAP_SUBROUTINES:
       return 0; /* please inline, or provide function declarations */
    case PIPE_SHADER_CAP_INTEGERS:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 30afdf2f0..3c5b1da 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -328,7 +328,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
       return 1;
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-      return 0;
+      return 1;
    case PIPE_SHADER_CAP_SUBROUTINES:
       return 1;
    case PIPE_SHADER_CAP_INTEGERS:
-- 
2.7.4