From 362e17a7123e55d22a188943886a7ead00def6b6 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Sun, 8 May 2016 19:13:15 +0200
Subject: [PATCH] nvc0/ir: add a lowering pass for surfaces on Fermi

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 115 +++++++++++++++++++++
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h        |   2 +
 2 files changed, 117 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 43a6e5f..dc0667f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2012,6 +2012,119 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
 }
 
+void
+NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
+{ // Scales su's coordinate sources and predicates su on validity checks.
+   const int idx = su->tex.r;
+   const int dim = su->tex.target.getDim();
+   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+   const uint16_t base = idx * NVE4_SU_INFO__STRIDE; // info offset for idx
+   int c;
+   Value *zero = bld.mkImm(0);
+   Value *src[3]; // coordinates, zero-filled up to three entries
+   Value *v;
+   Value *ind = NULL; // only set when the surface index is indirect
+
+   if (su->tex.rIndirectSrc >= 0) {
+      // FIXME: out of bounds (NOTE(review): index looks unchecked - confirm)
+      assert(su->tex.r == 0);
+      ind = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                       su->getIndirectR(), bld.mkImm(6)); // index * 64
+   }
+
+   // get surface coordinates; unused slots of src[] are filled with zero
+   for (c = 0; c < arg; ++c)
+      src[c] = su->getSrc(c);
+   for (; c < 3; ++c)
+      src[c] = zero;
+
+   // calculate pixel offset: scale src 0 by the BSIZE info entry
+   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
+      v = loadSuInfo32(ind, base + NVE4_SU_INFO_BSIZE);
+      su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
+   }
+
+   // add array layer offset: scale src 2 by the ARRAY info entry
+   if (su->tex.target.isArray() || su->tex.target.isCube()) {
+      v = loadSuInfo32(ind, base + NVE4_SU_INFO_ARRAY);
+      assert(dim > 1);
+      su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
+   }
+
+   // prevent read fault when the image is not actually bound (ADDR == 0)
+   CmpInstruction *pred =
+      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                TYPE_U32, bld.mkImm(0),
+                loadSuInfo32(ind, base + NVE4_SU_INFO_ADDR));
+   if (su->op != OP_SUSTP && su->tex.format) {
+      const TexInstruction::ImgFormatDesc *format = su->tex.format;
+      int blockwidth = format->bits[0] + format->bits[1] +
+                       format->bits[2] + format->bits[3];
+
+      assert(format->components != 0);
+      // when format is not FMT_NONE, also set pred if its byte size != BSIZE
+      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
+                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
+                loadSuInfo32(ind, base + NVE4_SU_INFO_BSIZE),
+                pred->getDef(0));
+   }
+   su->setPredicate(CC_NOT_P, pred->getDef(0)); // su runs only if !pred
+}
+
+void
+NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
+{ // Lowers a surface op; visit() uses this for chipsets below GK104.
+   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
+      /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
+       * will simplify the lowering pass and the texture constraints. */
+      su->moveSources(1, 1); // presumably frees source slot 1 - confirm
+      su->setSrc(1, bld.loadImm(NULL, 0)); // inserted coordinate is 0
+      su->tex.target = TEX_TARGET_2D_ARRAY;
+   }
+
+   processSurfaceCoordsNVC0(su); // also predicates su with CC_NOT_P
+
+   if (su->op == OP_SULDP)
+      convertSurfaceFormat(su);
+
+   if (su->op == OP_SUREDB || su->op == OP_SUREDP) { // becomes SULEA + ATOM
+      const int dim = su->tex.target.getDim();
+      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+      LValue *addr = bld.getSSA(8); // size 8, matches the TYPE_U64 dType below
+      Value *def = su->getDef(0); // original result, defined by the UNION
+
+      su->op = OP_SULEA;
+
+      // Set the destination to the address; def 1 reuses su's predicate value
+      su->dType = TYPE_U64;
+      su->setDef(0, addr);
+      su->setDef(1, su->getPredicate());
+
+      bld.setPosition(su, true); // NOTE(review): presumably "after su"
+
+      // Perform the atomic op on global memory, addressed through addr
+      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
+      red->subOp = su->subOp;
+      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
+      red->setSrc(1, su->getSrc(arg)); // first source after the coordinates
+      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
+         red->setSrc(2, su->getSrc(arg + 1)); // CAS has a second operand
+      red->setIndirect(0, 0, addr);
+
+      // make sure to initialize dst value when the atomic operation is not
+      // performed: mov (predicated on CC_P) supplies 0 for the UNION below
+      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+
+      assert(su->cc == CC_NOT_P);
+      red->setPredicate(su->cc, su->getPredicate()); // same condition as su
+      mov->setPredicate(CC_P, su->getPredicate()); // opposite condition
+
+      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
+
+      handleCasExch(red, false); // needCctl = false
+   }
+}
+
 bool
 NVC0LoweringPass::handleWRSV(Instruction *i)
 {
@@ -2491,6 +2604,8 @@ NVC0LoweringPass::visit(Instruction *i)
    case OP_SUREDP:
       if (targ->getChipset() >= NVISA_GK104_CHIPSET)
          handleSurfaceOpNVE4(i->asTex());
+      else
+         handleSurfaceOpNVC0(i->asTex());
       break;
    case OP_SUQ:
       handleSUQ(i->asTex());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index c007e09..2321956 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -106,6 +106,7 @@ protected:
    bool handleATOM(Instruction *);
    bool handleCasExch(Instruction *, bool needCctl);
    void handleSurfaceOpNVE4(TexInstruction *);
+   void handleSurfaceOpNVC0(TexInstruction *);
    void handleSharedATOM(Instruction *);
    void handleSharedATOMNVE4(Instruction *);
    void handleLDST(Instruction *);
@@ -138,6 +139,7 @@ private:
 
    void adjustCoordinatesMS(TexInstruction *);
    void processSurfaceCoordsNVE4(TexInstruction *);
+   void processSurfaceCoordsNVC0(TexInstruction *);
    void convertSurfaceFormat(TexInstruction *);
 
 protected:
-- 
2.7.4