From c1e4a6bfbf015801c6a8b0ae694482421a22c2d9 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 12 Mar 2016 21:26:21 -0500 Subject: [PATCH] nv50,nvc0: handle SQRT lowering inside the driver First off, st/mesa lowers DSQRT incorrectly (it uses CMP to attempt to find out whether the input is less than 0). Secondly the current approach (x * rsq(x)) behaves poorly for x = inf - a NaN is produced instead of inf. Instead we switch to the less accurate rcp(rsq(x)) method - this behaves nicely for all valid inputs. We still don't do this for DSQRT since the RSQ/RCP ops are *really* inaccurate, and don't even have Newton-Raphson steps right now. Eventually we should have a separate library function for DSQRT that does it more precisely (and perhaps move this lowering to the post-opt phase). This fixes a number of dEQP precision tests that were expecting better behavior for infinite inputs. Signed-off-by: Ilia Mirkin Reviewed-by: Samuel Pitoiset Tested-by: Samuel Pitoiset --- .../drivers/nouveau/codegen/nv50_ir_build_util.cpp | 6 ++++- .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 2 ++ .../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 7 +++-- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 31 +++++++++++----------- src/gallium/drivers/nouveau/nv50/nv50_screen.c | 2 +- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 +- 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index f58cf97..84ebfdb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -585,6 +585,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, return NULL; srcNr = 2; break; + case OP_SELP: srcNr = 3; break; default: // TODO when needed return NULL; @@ -601,7 +602,10 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, for (int s = 0; s < srcNr; ++s) { if (lo->getSrc(s)->reg.size < 8) { - hi->setSrc(s, zero); + if (s == 2) + hi->setSrc(s, lo->getSrc(s)); + else + hi->setSrc(s, zero); } else { if (lo->getSrc(s)->refCount() > 1) lo->setSrc(s, cloneShallow(fn, lo->getSrc(s))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index b06d86a..d284446 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -616,6 +616,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(RCP, RCP); NV50_IR_OPCODE_CASE(RSQ, RSQ); + NV50_IR_OPCODE_CASE(SQRT, SQRT); NV50_IR_OPCODE_CASE(MUL, MUL); NV50_IR_OPCODE_CASE(ADD, ADD); @@ -2689,6 +2690,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_FLR: case TGSI_OPCODE_TRUNC: case TGSI_OPCODE_RCP: + case TGSI_OPCODE_SQRT: case TGSI_OPCODE_IABS: case TGSI_OPCODE_INEG: case TGSI_OPCODE_NOT: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 8752b0c..12c5f69 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -1203,10 +1203,9 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, - bld.getSSA(), i->getSrc(0)); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index d181f15..d0936d8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1778,22 +1778,21 @@ NVC0LoweringPass::handleMOD(Instruction *i) bool NVC0LoweringPass::handleSQRT(Instruction *i) { - Value *pred = bld.getSSA(1, FILE_PREDICATE); - Value *zero = bld.getSSA(); - Instruction *rsq; - - bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0)); - if (i->dType == TYPE_F64) - zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero); - bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); - bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred); - rsq = bld.mkOp1(OP_RSQ, i->dType, - bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); - rsq->setPredicate(CC_NOT_P, pred); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); - i->setPredicate(CC_NOT_P, pred); - + if (i->dType == TYPE_F64) { + Value *pred = bld.getSSA(1, FILE_PREDICATE); + Value *zero = bld.loadImm(NULL, 0.0d); + Value *dst = bld.getSSA(8); + bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0)); + bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); + bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred); + i->op = OP_MUL; + i->setSrc(1, dst); + // TODO: Handle this properly with a library function + } else { + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); + } return true; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 28e0ed3..5836bb2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -305,7 +305,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 0; /* please inline, or provide function declarations */ case PIPE_SHADER_CAP_INTEGERS: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 30afdf2f0..3c5b1da 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -328,7 +328,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 1; case PIPE_SHADER_CAP_INTEGERS: -- 2.7.4