nv50: implement basic compute support
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 14 Oct 2015 19:42:41 +0000 (21:42 +0200)
committer Samuel Pitoiset <samuel.pitoiset@gmail.com>
Sat, 14 Nov 2015 22:42:15 +0000 (23:42 +0100)
This adds the ability to launch simple compute kernels like the one I
will use to read out MP performance counters in the upcoming patch.

This compute support is based on the work that Francisco Jerez (aka curro)
did as part of his EVoC project in 2011/2012 to get OpenCL working on
Tesla. His original work can be found here:
https://github.com/curro/mesa/commits/nv50-compute

I made some improvements to the original code, such as fixing the
simultaneous use of 3D and COMPUTE, improving global buffer binding, and
making the code closer to what nvc0 already does. This compute support has
been tested by Pierre Moreau and myself with some compute kernels. This is
a step towards OpenCL.

Related to this, it seems that compute programs overlap fragment programs
when both are in use. To fix this, we need to re-validate fragment
programs when binding compute programs and vice versa.

Note that textures, samplers and surfaces still need to be implemented.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
src/gallium/drivers/nouveau/Makefile.sources
src/gallium/drivers/nouveau/nv50/nv50_compute.c [new file with mode: 0644]
src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h [new file with mode: 0644]
src/gallium/drivers/nouveau/nv50/nv50_context.c
src/gallium/drivers/nouveau/nv50/nv50_context.h
src/gallium/drivers/nouveau/nv50/nv50_program.c
src/gallium/drivers/nouveau/nv50/nv50_program.h
src/gallium/drivers/nouveau/nv50/nv50_screen.c
src/gallium/drivers/nouveau/nv50/nv50_screen.h
src/gallium/drivers/nouveau/nv50/nv50_state.c

index 83f8113..c2ff8e9 100644 (file)
@@ -64,6 +64,7 @@ NV50_C_SOURCES := \
        nv50/nv50_3ddefs.xml.h \
        nv50/nv50_3d.xml.h \
        nv50/nv50_blit.h \
+       nv50/nv50_compute.c \
        nv50/nv50_context.c \
        nv50/nv50_context.h \
        nv50/nv50_defs.xml.h \
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
new file mode 100644 (file)
index 0000000..6d23fd6
--- /dev/null
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2012 Francisco Jerez
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_compute.xml.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+/*
+ * Create the compute engine object for this screen and emit its initial
+ * state into the given push buffer.
+ *
+ * The object class is chosen from the chipset id: NVA3_COMPUTE_CLASS for
+ * chipsets 0xa3, 0xa5 and 0xa8, NV50_COMPUTE_CLASS for every other chipset
+ * of the 0x50, 0x80, 0x90 and 0xa0 families.
+ *
+ * Returns 0 on success, -1 if the chipset family is not handled here, or
+ * the non-zero value returned by nouveau_object_new() when the object
+ * cannot be created.
+ */
+int
+nv50_screen_compute_setup(struct nv50_screen *screen,
+                          struct nouveau_pushbuf *push)
+{
+   struct nouveau_device *dev = screen->base.device;
+   struct nouveau_object *chan = screen->base.channel;
+   struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;
+   unsigned obj_class;
+   int i, ret;
+
+   /* Pick the compute class from the chipset family (high nibble). */
+   switch (dev->chipset & 0xf0) {
+   case 0x50:
+   case 0x80:
+   case 0x90:
+      obj_class = NV50_COMPUTE_CLASS;
+      break;
+   case 0xa0:
+      switch (dev->chipset) {
+      case 0xa3:
+      case 0xa5:
+      case 0xa8:
+         obj_class = NVA3_COMPUTE_CLASS;
+         break;
+      default:
+         obj_class = NV50_COMPUTE_CLASS;
+         break;
+      }
+      break;
+   default:
+      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+      return -1;
+   }
+
+   ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,
+                            &screen->compute);
+   if (ret)
+      return ret;
+
+   /* Attach the newly created object to the compute subchannel. */
+   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->compute->handle);
+
+   /* Stack: backed by screen->stack_bo, accessed through fifo->vram. */
+   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->stack_bo->offset);
+   PUSH_DATA (push, screen->stack_bo->offset);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   PUSH_DATA (push, 4);
+
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   PUSH_DATA (push, 0x100);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   PUSH_DATA (push, fifo->vram);
+
+   /* Global buffers 0..14 start with a zero address and a zero limit. */
+   for (i = 0; i < 15; i++) {
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+   }
+
+   /* Global buffer 15 gets address 0 and limit ~0.
+    * NOTE(review): presumably a window over the whole address space that
+    * kernels use for raw global accesses -- confirm. */
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   PUSH_DATA (push, ~0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   /* No user parameters until a launch programs the real count. */
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   PUSH_DATA (push, 0x54);
+   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   PUSH_DATA (push, 0);
+
+   /* TIC table: located at the start of screen->txc. */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
+
+   /* TSC table: located 65536 bytes into the same screen->txc buffer. */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   PUSH_DATA (push, fifo->vram);
+
+   /* Local memory: starts 65536 bytes into screen->tls_bo; the size is
+    * programmed as log2 of twice (max_tls_space / ONE_TEMP_SIZE). */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->tls_bo->offset + 65536);
+   PUSH_DATA (push, screen->tls_bo->offset + 65536);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
+
+   return 0;
+}
+
+/*
+ * Make sure the bound compute program is translated and uploaded.
+ *
+ * Returns true when the program already has code memory assigned
+ * (prog->mem) or when it was translated (if needed) and uploaded
+ * successfully. Returns false when no compute program is bound, when
+ * translation fails, when the translated program has no code, or when the
+ * upload fails.
+ */
+static bool
+nv50_compute_validate_program(struct nv50_context *nv50)
+{
+   struct nv50_program *prog = nv50->compprog;
+   struct nouveau_pushbuf *push;
+
+   /* Without a bound program there is nothing to validate; fail instead of
+    * dereferencing a NULL pointer below. */
+   if (!prog)
+      return false;
+
+   /* Code memory is already assigned: nothing left to do. */
+   if (prog->mem)
+      return true;
+
+   if (!prog->translated) {
+      prog->translated = nv50_program_translate(
+         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
+      if (!prog->translated)
+         return false;
+   }
+   if (unlikely(!prog->code_size))
+      return false;
+
+   if (!nv50_program_upload_code(nv50, prog))
+      return false;
+
+   /* Emit CODE_CB_FLUSH right after a successful upload. */
+   push = nv50->base.pushbuf;
+   BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
+   PUSH_DATA (push, 0);
+   return true;
+}
+
+/*
+ * Add every non-NULL resource stored in nv50->global_residents to the
+ * compute buffer context, in the NV50_BIND_CP_GLOBAL bin, with read/write
+ * access.
+ */
+static void
+nv50_compute_validate_globals(struct nv50_context *nv50)
+{
+   const unsigned count =
+      nv50->global_residents.size / sizeof(struct pipe_resource *);
+   unsigned idx;
+
+   for (idx = 0; idx < count; ++idx) {
+      struct pipe_resource **slot = util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, idx);
+
+      /* Empty slots are skipped. */
+      if (!*slot)
+         continue;
+
+      nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,
+                               nv04_resource(*slot), NOUVEAU_BO_RDWR);
+   }
+}
+
+/*
+ * Validate all compute state needed before a launch: the compute program,
+ * the global residents (only when NV50_NEW_CP_GLOBALS is set in dirty_cp),
+ * and the compute buffer context against the push buffer.
+ *
+ * Returns false if the program cannot be validated or if push buffer
+ * validation fails, true otherwise.
+ */
+static bool
+nv50_compute_state_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push;
+
+   if (!nv50_compute_validate_program(nv50))
+      return false;
+
+   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS) {
+      nv50_compute_validate_globals(nv50);
+   }
+
+   /* TODO: validate textures, samplers, surfaces */
+
+   nv50_bufctx_fence(nv50->bufctx_cp, false);
+
+   push = nv50->base.pushbuf;
+   nouveau_pushbuf_bufctx(push, nv50->bufctx_cp);
+   if (unlikely(nouveau_pushbuf_validate(push) != 0))
+      return false;
+
+   /* The buffer context is fenced a second time, with the flag set, when
+    * state.flushed is set at this point. */
+   if (unlikely(nv50->state.flushed)) {
+      nv50_bufctx_fence(nv50->bufctx_cp, true);
+   }
+
+   return true;
+}
+
+/*
+ * Upload the kernel input parameters of the bound compute program.
+ *
+ * The upload size is the program's parm_size rounded up to a multiple of
+ * 4 bytes. The data is first copied into a sub-allocation of the screen's
+ * GART allocator, then handed to the push buffer; the sub-allocation is
+ * released through fence work.
+ *
+ * Staging (allocate, map, copy) happens before any command is emitted, so
+ * a failure leaves the command stream untouched.
+ *
+ * Returns true on success, false if there are parameters to upload but no
+ * input pointer, or if the staging buffer cannot be allocated or mapped.
+ */
+static bool
+nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+   unsigned size = align(nv50->compprog->parm_size, 0x4);
+   struct nouveau_mm_allocation *mm = NULL;
+   struct nouveau_bo *bo = NULL;
+   unsigned offset = 0;
+
+   if (size) {
+      if (!input)
+         return false;
+
+      mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);
+      if (!mm || !bo || nouveau_bo_map(bo, 0, screen->base.client)) {
+         /* Undo whatever part of the staging succeeded. */
+         if (mm)
+            nouveau_mm_free_work(mm);
+         nouveau_bo_ref(NULL, &bo);
+         return false;
+      }
+      memcpy(bo->map + offset, input, size);
+   }
+
+   /* The parameter count is programmed in 32-bit words. */
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, (size / 4) << 8);
+
+   if (size) {
+      nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+      nouveau_pushbuf_bufctx(push, nv50->bufctx);
+      nouveau_pushbuf_validate(push);
+
+      BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+      nouveau_pushbuf_data(push, bo, offset, size);
+
+      /* Free the sub-allocation through work attached to the current fence. */
+      nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
+      nouveau_bo_ref(NULL, &bo);
+      nouveau_bufctx_reset(nv50->bufctx, 0);
+   }
+   return true;
+}
+
+/*
+ * Resolve a kernel label to a code address.
+ *
+ * Searches the bound compute program's symbol table for an entry whose
+ * label matches and returns code_base plus that entry's offset. When the
+ * program has no symbols, or none matches, code_base itself is returned.
+ */
+static uint32_t
+nv50_compute_find_symbol(struct nv50_context *nv50, uint32_t label)
+{
+   struct nv50_program *prog = nv50->compprog;
+   const struct nv50_ir_prog_symbol *syms =
+      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
+   unsigned i;
+
+   for (i = 0; i < prog->cp.num_syms; ++i) {
+      if (syms[i].label == label)
+         return prog->code_base + syms[i].offset;
+   }
+   return prog->code_base; /* no symbols or symbol not found */
+}
+
+/*
+ * Launch a compute grid using the currently bound compute program.
+ *
+ * pipe          the context to launch on
+ * block_layout  block dimensions, indexed [0..2] as X, Y, Z
+ * grid_layout   grid dimensions; only [0] (X) and [1] (Y) are programmed
+ * label         kernel label, resolved through the program's symbol table
+ * input         kernel input parameters, parm_size bytes of the program
+ *
+ * If state validation or the input upload fails, an error is logged and
+ * nothing is launched.
+ */
+void
+nv50_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label, const void *input)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   unsigned block_size = block_layout[0] * block_layout[1] * block_layout[2];
+   struct nv50_program *cp = nv50->compprog;
+
+   if (!nv50_compute_state_validate(nv50)) {
+      NOUVEAU_ERR("Failed to launch grid !\n");
+      return;
+   }
+
+   if (!nv50_compute_upload_input(nv50, input)) {
+      NOUVEAU_ERR("Failed to upload compute input !\n");
+      return;
+   }
+
+   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   PUSH_DATA (push, nv50_compute_find_symbol(nv50, label));
+
+   /* Shared size covers the program's shared memory, its parameters and
+    * 0x10 extra bytes, rounded up to a multiple of 0x40. */
+   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
+   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   PUSH_DATA (push, cp->max_gpr);
+
+   /* grid/block setup */
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   PUSH_DATA (push, block_layout[1] << 16 | block_layout[0]);
+   PUSH_DATA (push, block_layout[2]);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   PUSH_DATA (push, 1 << 16 | block_size);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   PUSH_DATA (push, grid_layout[1] << 16 | grid_layout[0]);
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   PUSH_DATA (push, 1);
+
+   /* kernel launching */
+   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+   /* binding a compute shader clobbers fragment shader state */
+   nv50->dirty |= NV50_NEW_FRAGPROG;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
new file mode 100644 (file)
index 0000000..268d112
--- /dev/null
@@ -0,0 +1,444 @@
+#ifndef NV50_COMPUTE_XML
+#define NV50_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://github.com/envytools/envytools/
+git clone https://github.com/envytools/envytools.git
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/graph/g80_compute.xml (  14027 bytes, from 2015-02-14 02:01:36)
+- rnndb/copyright.xml         (   6456 bytes, from 2015-02-14 02:01:36)
+- rnndb/nvchipsets.xml        (   2833 bytes, from 2015-04-28 16:28:33)
+- rnndb/fifo/nv_object.xml    (  15390 bytes, from 2015-04-22 20:36:09)
+- rnndb/g80_defs.xml          (  18210 bytes, from 2015-10-19 20:49:59)
+
+Copyright (C) 2006-2015 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Kościelnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV50_COMPUTE_DMA_NOTIFY                                        0x00000180
+
+#define NV50_COMPUTE_DMA_GLOBAL                                        0x000001a0
+
+#define NV50_COMPUTE_DMA_QUERY                                 0x000001a4
+
+#define NV50_COMPUTE_DMA_LOCAL                                 0x000001b8
+
+#define NV50_COMPUTE_DMA_STACK                                 0x000001bc
+
+#define NV50_COMPUTE_DMA_CODE_CB                                       0x000001c0
+
+#define NV50_COMPUTE_DMA_TSC                                   0x000001c4
+
+#define NV50_COMPUTE_DMA_TIC                                   0x000001c8
+
+#define NV50_COMPUTE_DMA_TEXTURE                                       0x000001cc
+
+#define NV50_COMPUTE_UNK0200                                   0x00000200
+#define NV50_COMPUTE_UNK0200_UNK1__MASK                                0x0000ffff
+#define NV50_COMPUTE_UNK0200_UNK1__SHIFT                               0
+#define NV50_COMPUTE_UNK0200_UNK2__MASK                                0x00ff0000
+#define NV50_COMPUTE_UNK0200_UNK2__SHIFT                               16
+
+#define NV50_COMPUTE_UNK0204                                   0x00000204
+
+#define NV50_COMPUTE_UNK0208                                   0x00000208
+
+#define NV50_COMPUTE_UNK020C                                   0x0000020c
+
+#define NV50_COMPUTE_CP_ADDRESS_HIGH                           0x00000210
+
+#define NV50_COMPUTE_CP_ADDRESS_LOW                            0x00000214
+
+#define NV50_COMPUTE_STACK_ADDRESS_HIGH                                0x00000218
+
+#define NV50_COMPUTE_STACK_ADDRESS_LOW                         0x0000021c
+
+#define NV50_COMPUTE_STACK_SIZE_LOG                            0x00000220
+
+#define NV50_COMPUTE_CALL_LIMIT_LOG                            0x00000224
+
+#define NV50_COMPUTE_UNK0228                                   0x00000228
+#define NV50_COMPUTE_UNK0228_UNK0                              0x00000001
+#define NV50_COMPUTE_UNK0228_UNK4__MASK                                0x00000ff0
+#define NV50_COMPUTE_UNK0228_UNK4__SHIFT                               4
+#define NV50_COMPUTE_UNK0228_UNK12__MASK                               0x000ff000
+#define NV50_COMPUTE_UNK0228_UNK12__SHIFT                      12
+
+#define NV50_COMPUTE_TSC_ADDRESS_HIGH                          0x0000022c
+
+#define NV50_COMPUTE_TSC_ADDRESS_LOW                           0x00000230
+#define NV50_COMPUTE_TSC_ADDRESS_LOW__ALIGN                    0x00000020
+
+#define NV50_COMPUTE_TSC_LIMIT                                 0x00000234
+#define NV50_COMPUTE_TSC_LIMIT__MAX                            0x00001fff
+
+#define NV50_COMPUTE_CB_ADDR                                   0x00000238
+#define NV50_COMPUTE_CB_ADDR_ID__MASK                          0x003fff00
+#define NV50_COMPUTE_CB_ADDR_ID__SHIFT                         8
+#define NV50_COMPUTE_CB_ADDR_BUFFER__MASK                      0x0000007f
+#define NV50_COMPUTE_CB_ADDR_BUFFER__SHIFT                     0
+
+#define NV50_COMPUTE_CB_DATA(i0)                                      (0x0000023c + 0x4*(i0))
+#define NV50_COMPUTE_CB_DATA__ESIZE                            0x00000004
+#define NV50_COMPUTE_CB_DATA__LEN                              0x00000010
+
+#define NV50_COMPUTE_TSC_FLUSH                                 0x0000027c
+#define NV50_COMPUTE_TSC_FLUSH_SPECIFIC                                0x00000001
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__MASK                     0x03fffff0
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__SHIFT                    4
+
+#define NV50_COMPUTE_TIC_FLUSH                                 0x00000280
+#define NV50_COMPUTE_TIC_FLUSH_SPECIFIC                                0x00000001
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__MASK                     0x03fffff0
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__SHIFT                    4
+
+#define NV50_COMPUTE_DELAY1                                    0x00000284
+
+#define NV50_COMPUTE_WATCHDOG_TIMER                            0x00000288
+
+#define NV50_COMPUTE_DELAY2                                    0x0000028c
+
+#define NV50_COMPUTE_UNK0290                                   0x00000290
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_HIGH                                0x00000294
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW                         0x00000298
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW__ALIGN                  0x00000100
+
+#define NV50_COMPUTE_LOCAL_SIZE_LOG                            0x0000029c
+
+#define NV50_COMPUTE_UNK02A0                                   0x000002a0
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_HIGH                               0x000002a4
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_LOW                                0x000002a8
+
+#define NV50_COMPUTE_CB_DEF_SET                                        0x000002ac
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__MASK                     0x0000ffff
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__SHIFT                    0
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__MASK                   0x007f0000
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__SHIFT                  16
+
+#define NV50_COMPUTE_UNK02B0                                   0x000002b0
+
+#define NV50_COMPUTE_BLOCK_ALLOC                                       0x000002b4
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__MASK                 0x0000ffff
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__SHIFT                        0
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__MASK                        0x00ff0000
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__SHIFT                       16
+
+#define NV50_COMPUTE_LANES32_ENABLE                            0x000002b8
+
+#define NV50_COMPUTE_UNK02BC                                   0x000002bc
+#define NV50_COMPUTE_UNK02BC_UNK1__MASK                                0x00000007
+#define NV50_COMPUTE_UNK02BC_UNK1__SHIFT                               0
+#define NV50_COMPUTE_UNK02BC_UNK2__MASK                                0x00000070
+#define NV50_COMPUTE_UNK02BC_UNK2__SHIFT                               4
+
+#define NV50_COMPUTE_CP_REG_ALLOC_TEMP                         0x000002c0
+
+#define NV50_COMPUTE_TIC_ADDRESS_HIGH                          0x000002c4
+
+#define NV50_COMPUTE_TIC_ADDRESS_LOW                           0x000002c8
+
+#define NV50_COMPUTE_TIC_LIMIT                                 0x000002cc
+
+#define NV50_COMPUTE_MP_PM_SET(i0)                            (0x000002d0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_SET__ESIZE                          0x00000004
+#define NV50_COMPUTE_MP_PM_SET__LEN                            0x00000004
+
+#define NV50_COMPUTE_MP_PM_CONTROL(i0)                        (0x000002e0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_CONTROL__ESIZE                      0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL__LEN                                0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__MASK                  0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__SHIFT                 0
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP                  0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP_PULSE            0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__MASK                  0x00000070
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__SHIFT                 4
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK0                   0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK1                   0x00000010
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK2                   0x00000020
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK3                   0x00000030
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK4                   0x00000040
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK5                   0x00000050
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__MASK                  0x00ffff00
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__SHIFT                 8
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__MASK                   0xff000000
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__SHIFT                  24
+
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE                        0x000002f0
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_0              0x00000001
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_1              0x00000002
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_2              0x00000004
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_3              0x00000008
+
+#define NV50_COMPUTE_UNK02F4                                   0x000002f4
+
+#define NV50_COMPUTE_BLOCKDIM_LATCH                            0x000002f8
+
+#define NV50_COMPUTE_LOCAL_WARPS_LOG_ALLOC                     0x000002fc
+
+#define NV50_COMPUTE_LOCAL_WARPS_NO_CLAMP                      0x00000300
+
+#define NV50_COMPUTE_STACK_WARPS_LOG_ALLOC                     0x00000304
+
+#define NV50_COMPUTE_STACK_WARPS_NO_CLAMP                      0x00000308
+
+#define NV50_COMPUTE_UNK030C                                   0x0000030c
+
+#define NV50_COMPUTE_QUERY_ADDRESS_HIGH                                0x00000310
+
+#define NV50_COMPUTE_QUERY_ADDRESS_LOW                         0x00000314
+
+#define NV50_COMPUTE_QUERY_SEQUENCE                            0x00000318
+
+#define NV50_COMPUTE_QUERY_GET                                 0x0000031c
+#define NV50_COMPUTE_QUERY_GET_INTR                            0x00000200
+#define NV50_COMPUTE_QUERY_GET_SHORT                           0x00008000
+
+#define NV50_COMPUTE_COND_ADDRESS_HIGH                         0x00000320
+
+#define NV50_COMPUTE_COND_ADDRESS_LOW                          0x00000324
+
+#define NV50_COMPUTE_COND_MODE                                 0x00000328
+#define NV50_COMPUTE_COND_MODE_NEVER                           0x00000000
+#define NV50_COMPUTE_COND_MODE_ALWAYS                          0x00000001
+#define NV50_COMPUTE_COND_MODE_RES_NON_ZERO                    0x00000002
+#define NV50_COMPUTE_COND_MODE_EQUAL                           0x00000003
+#define NV50_COMPUTE_COND_MODE_NOT_EQUAL                               0x00000004
+
+#define NV50_COMPUTE_UNK032C                                   0x0000032c
+
+#define NV50_COMPUTE_UNK0330                                   0x00000330
+
+#define NV50_COMPUTE_UNK0334(i0)                                      (0x00000334 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0334__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK0334__LEN                              0x00000003
+
+#define NV50_COMPUTE_UNK0340(i0)                                      (0x00000340 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0340__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK0340__LEN                              0x00000002
+
+#define NV50_COMPUTE_UNK0348(i0)                                      (0x00000348 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0348__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK0348__LEN                              0x00000002
+
+#define NV50_COMPUTE_UNK0350(i0)                                      (0x00000350 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0350__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK0350__LEN                              0x00000002
+
+#define NV50_COMPUTE_UNK0358                                   0x00000358
+
+#define NV50_COMPUTE_UNK035C                                   0x0000035c
+
+#define NV50_COMPUTE_UNK0360                                   0x00000360
+#define NV50_COMPUTE_UNK0360_UNK0__MASK                                0x000000f0
+#define NV50_COMPUTE_UNK0360_UNK0__SHIFT                               4
+#define NV50_COMPUTE_UNK0360_UNK1__MASK                                0x00000f00
+#define NV50_COMPUTE_UNK0360_UNK1__SHIFT                               8
+
+#define NV50_COMPUTE_UNK0364                                   0x00000364
+
+#define NV50_COMPUTE_LAUNCH                                    0x00000368
+
+#define NV50_COMPUTE_UNK036C                                   0x0000036c
+
+#define NV50_COMPUTE_UNK0370                                   0x00000370
+
+#define NV50_COMPUTE_USER_PARAM_COUNT                          0x00000374
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__MASK                       0x000000ff
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__SHIFT              0
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MASK              0x0000ff00
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__SHIFT             8
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MAX                       0x00000040
+
+#define NV50_COMPUTE_LINKED_TSC                                        0x00000378
+
+#define NV50_COMPUTE_UNK037C                                   0x0000037c
+#define NV50_COMPUTE_UNK037C_ALWAYS_DERIV                      0x00000001
+#define NV50_COMPUTE_UNK037C_UNK16                             0x00010000
+
+#define NV50_COMPUTE_CODE_CB_FLUSH                             0x00000380
+
+#define NV50_COMPUTE_UNK0384                                   0x00000384
+
+#define NV50_COMPUTE_GRIDID                                    0x00000388
+
+#define NV50_COMPUTE_UNK038C(i0)                                      (0x0000038c + 0x4*(i0))
+#define NV50_COMPUTE_UNK038C__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK038C__LEN                              0x00000003
+
+#define NV50_COMPUTE_WRCACHE_FLUSH                             0x00000398
+
+#define NV50_COMPUTE_UNK039C(i0)                                      (0x0000039c + 0x4*(i0))
+#define NV50_COMPUTE_UNK039C__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK039C__LEN                              0x00000002
+
+#define NV50_COMPUTE_GRIDDIM                                   0x000003a4
+#define NV50_COMPUTE_GRIDDIM_X__MASK                           0x0000ffff
+#define NV50_COMPUTE_GRIDDIM_X__SHIFT                          0
+#define NV50_COMPUTE_GRIDDIM_Y__MASK                           0xffff0000
+#define NV50_COMPUTE_GRIDDIM_Y__SHIFT                          16
+
+#define NV50_COMPUTE_SHARED_SIZE                                       0x000003a8
+#define NV50_COMPUTE_SHARED_SIZE__MAX                          0x00004000
+#define NV50_COMPUTE_SHARED_SIZE__ALIGN                                0x00000040
+
+#define NV50_COMPUTE_BLOCKDIM_XY                                       0x000003ac
+#define NV50_COMPUTE_BLOCKDIM_XY_X__MASK                               0x0000ffff
+#define NV50_COMPUTE_BLOCKDIM_XY_X__SHIFT                      0
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__MASK                               0xffff0000
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__SHIFT                      16
+
+#define NV50_COMPUTE_BLOCKDIM_Z                                        0x000003b0
+#define NV50_COMPUTE_BLOCKDIM_Z__MIN                           0x00000001
+#define NV50_COMPUTE_BLOCKDIM_Z__MAX                           0x00000040
+
+#define NV50_COMPUTE_CP_START_ID                                       0x000003b4
+
+#define NV50_COMPUTE_REG_MODE                                  0x000003b8
+#define NV50_COMPUTE_REG_MODE_PACKED                           0x00000001
+#define NV50_COMPUTE_REG_MODE_STRIPED                          0x00000002
+
+#define NV50_COMPUTE_TEX_LIMITS                                        0x000003bc
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MASK            0x0000000f
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__SHIFT           0
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MIN             0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MAX             0x00000004
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MASK            0x000000f0
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__SHIFT           4
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MIN             0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MAX             0x00000007
+
+#define NV50_COMPUTE_BIND_TSC                                  0x000003c0
+#define NV50_COMPUTE_BIND_TSC_VALID                            0x00000001
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__MASK                    0x000000f0
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__SHIFT                   4
+#define NV50_COMPUTE_BIND_TSC_TSC__MASK                                0x001ff000
+#define NV50_COMPUTE_BIND_TSC_TSC__SHIFT                               12
+
+#define NV50_COMPUTE_BIND_TIC                                  0x000003c4
+#define NV50_COMPUTE_BIND_TIC_VALID                            0x00000001
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__MASK                    0x000001fe
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__SHIFT                   1
+#define NV50_COMPUTE_BIND_TIC_TIC__MASK                                0x7ffffe00
+#define NV50_COMPUTE_BIND_TIC_TIC__SHIFT                               9
+
+#define NV50_COMPUTE_SET_PROGRAM_CB                            0x000003c8
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__MASK                        0x00000f00
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__SHIFT                       8
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__MASK                       0x0007f000
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__SHIFT              12
+#define NV50_COMPUTE_SET_PROGRAM_CB_VALID                      0x000000ff
+
+#define NV50_COMPUTE_UNK03CC                                   0x000003cc
+
+#define NV50_COMPUTE_TEX_CACHE_CTL                             0x000003d0
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__MASK                  0x00000030
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__SHIFT                 4
+
+#define NV50_COMPUTE_UNK03D4                                   0x000003d4
+
+#define NV50_COMPUTE_UNK03D8                                   0x000003d8
+
+#define NV50_COMPUTE_UNK03DC                                   0x000003dc
+
+#define NV50_COMPUTE_UNK03E0                                   0x000003e0
+
+#define NV50_COMPUTE_UNK03E4                                   0x000003e4
+
+#define NVA3_COMPUTE_TEX_MISC                                  0x000003e8
+#define NVA3_COMPUTE_TEX_MISC_UNK1                             0x00000001
+#define NVA3_COMPUTE_TEX_MISC_SEAMLESS_CUBE_MAP                0x00000002
+
+#define NV50_COMPUTE_GLOBAL(i0)                                       (0x00000400 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL__ESIZE                             0x00000020
+#define NV50_COMPUTE_GLOBAL__LEN                                       0x00000010
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_HIGH(i0)                  (0x00000400 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_LOW(i0)                   (0x00000404 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_PITCH(i0)                         (0x00000408 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_PITCH__MAX                         0x00800000
+#define NV50_COMPUTE_GLOBAL_PITCH__ALIGN                               0x00000100
+
+#define NV50_COMPUTE_GLOBAL_LIMIT(i0)                         (0x0000040c + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_MODE(i0)                          (0x00000410 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_MODE_LINEAR                                0x00000001
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__MASK                    0x000000f0
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__SHIFT                   4
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__MASK                       0x00000f00
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__SHIFT              8
+
+#define NV50_COMPUTE_USER_PARAM(i0)                           (0x00000600 + 0x4*(i0))
+#define NV50_COMPUTE_USER_PARAM__ESIZE                         0x00000004
+#define NV50_COMPUTE_USER_PARAM__LEN                           0x00000040
+
+#define NV50_COMPUTE_UNK0700(i0)                                      (0x00000700 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0700__ESIZE                            0x00000004
+#define NV50_COMPUTE_UNK0700__LEN                              0x00000010
+
+
+#endif /* NV50_COMPUTE_XML */
index 7867c2d..f645a4d 100644 (file)
@@ -113,6 +113,7 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
 
    nouveau_bufctx_del(&nv50->bufctx_3d);
    nouveau_bufctx_del(&nv50->bufctx);
+   nouveau_bufctx_del(&nv50->bufctx_cp);
 
    util_unreference_framebuffer_state(&nv50->framebuffer);
 
@@ -131,6 +132,14 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
          if (!nv50->constbuf[s][i].user)
             pipe_resource_reference(&nv50->constbuf[s][i].u.buf, NULL);
    }
+
+   for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource **res = util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, i);
+      pipe_resource_reference(res, NULL);
+   }
+   util_dynarray_fini(&nv50->global_residents);
 }
 
 static void
@@ -263,10 +272,13 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    nv50->base.pushbuf = screen->base.pushbuf;
    nv50->base.client = screen->base.client;
 
-   ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_COUNT,
-                            &nv50->bufctx_3d);
+   ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_3D_COUNT,
+                               &nv50->bufctx_3d);
    if (!ret)
-      ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_CP_COUNT,
+                               &nv50->bufctx_cp);
    if (ret)
       goto out_err;
 
@@ -290,6 +302,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    pipe->draw_vbo = nv50_draw_vbo;
    pipe->clear = nv50_clear;
+   pipe->launch_grid = nv50_launch_grid;
 
    pipe->flush = nv50_flush;
    pipe->texture_barrier = nv50_texture_barrier;
@@ -335,19 +348,30 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   if (screen->compute) {
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->stack_bo);
+   }
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
+   if (screen->compute)
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
 
    nv50->base.scratch.bo_size = 2 << 20;
 
+   util_dynarray_init(&nv50->global_residents);
+
    return pipe;
 
 out_err:
    if (nv50->bufctx_3d)
       nouveau_bufctx_del(&nv50->bufctx_3d);
+   if (nv50->bufctx_cp)
+      nouveau_bufctx_del(&nv50->bufctx_cp);
    if (nv50->bufctx)
       nouveau_bufctx_del(&nv50->bufctx);
    FREE(nv50->blit);
index fb74a97..fbafe02 100644 (file)
 #define NV50_NEW_MIN_SAMPLES  (1 << 22)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
+#define NV50_NEW_CP_PROGRAM   (1 << 0)
+#define NV50_NEW_CP_GLOBALS   (1 << 1)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
 #define NV50_BIND_FB          0
 #define NV50_BIND_VERTEX      1
 #define NV50_BIND_VERTEX_TMP  2
 #define NV50_BIND_SO         53
 #define NV50_BIND_SCREEN     54
 #define NV50_BIND_TLS        55
-#define NV50_BIND_COUNT      56
+#define NV50_BIND_3D_COUNT   56
+
+/* compute bufctx (during launch_grid) */
+#define NV50_BIND_CP_GLOBAL   0
+#define NV50_BIND_CP_SCREEN   1
+#define NV50_BIND_CP_COUNT    2
+
+/* bufctx for other operations */
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -101,8 +112,10 @@ struct nv50_context {
 
    struct nouveau_bufctx *bufctx_3d;
    struct nouveau_bufctx *bufctx;
+   struct nouveau_bufctx *bufctx_cp;
 
    uint32_t dirty;
+   uint32_t dirty_cp; /* dirty flags for compute state */
    bool cb_dirty;
 
    struct nv50_graph_state state;
@@ -115,6 +128,7 @@ struct nv50_context {
    struct nv50_program *vertprog;
    struct nv50_program *gmtyprog;
    struct nv50_program *fragprog;
+   struct nv50_program *compprog;
 
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
@@ -163,6 +177,8 @@ struct nv50_context {
    uint32_t cond_condmode; /* the calculated condition */
 
    struct nv50_blitctx *blit;
+
+   struct util_dynarray global_residents;
 };
 
 static inline struct nv50_context *
@@ -302,4 +318,9 @@ struct pipe_video_buffer *
 nv98_video_buffer_create(struct pipe_context *pipe,
                          const struct pipe_video_buffer *template);
 
+/* nv50_compute.c */
+void
+nv50_launch_grid(struct pipe_context *, const uint *, const uint *,
+                 uint32_t, const void *);
+
 #endif
index 707bf7a..48057d2 100644 (file)
@@ -259,6 +259,8 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
       return nv50_vertprog_assign_slots(info);
    case PIPE_SHADER_FRAGMENT:
       return nv50_fragprog_assign_slots(info);
+   case PIPE_SHADER_COMPUTE:
+      return 0;
    default:
       return -1;
    }
@@ -355,6 +357,9 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    prog->gp.has_layer = 0;
    prog->gp.has_viewport = 0;
 
+   if (prog->type == PIPE_SHADER_COMPUTE)
+      info->prop.cp.inputOffset = 0x10;
+
    info->driverPriv = prog;
 
 #ifdef DEBUG
@@ -401,6 +406,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
          break;
       }
       prog->gp.vert_count = info->prop.gp.maxVertices;
+   } else
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      prog->cp.syms = info->bin.syms;
+      prog->cp.num_syms = info->bin.numSyms;
    }
 
    if (prog->pipe.stream_output.num_outputs)
@@ -423,11 +432,13 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    struct nouveau_heap *heap;
    int ret;
    uint32_t size = align(prog->code_size, 0x40);
+   uint8_t prog_type;
 
    switch (prog->type) {
    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
+   case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
    default:
       assert(!"invalid program type");
       return false;
@@ -450,7 +461,14 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
          return false;
       }
    }
-   prog->code_base = prog->mem->start;
+
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      /* CP code must be uploaded in FP code segment. */
+      prog_type = 1;
+   } else {
+      prog->code_base = prog->mem->start;
+      prog_type = prog->type;
+   }
 
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0) {
@@ -468,7 +486,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
                             false /* flatshade */);
 
    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
-                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+                       (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
 
    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
index 7a33eb1..f001670 100644 (file)
@@ -98,6 +98,13 @@ struct nv50_program {
       ubyte viewportid; /* hw value of viewport index output */
    } gp;
 
+   struct {
+      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
+      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+      void *syms;
+      unsigned num_syms;
+   } cp;
+
    void *fixups; /* relocation records */
    void *interps; /* interpolation records */
 
index f47e998..0142e86 100644 (file)
@@ -41,8 +41,6 @@
 
 #define THREADS_IN_WARP 32
 
-#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
-
 static boolean
 nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
@@ -183,6 +181,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_COMPUTE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,7 +211,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
-   case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
@@ -251,6 +249,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_VERTEX:
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
+   case PIPE_SHADER_COMPUTE:
       break;
    default:
       return 0;
@@ -336,6 +335,52 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
    return 0.0f;
 }
 
+/*
+ * Answer a compute capability query.
+ *
+ * For every handled cap the value (or array of values) is copied into data
+ * when data is non-NULL, and the size of that value in bytes is returned.
+ * Unhandled caps return 0 and leave data untouched.
+ */
+static int
+nv50_screen_get_compute_param(struct pipe_screen *pscreen,
+                              enum pipe_compute_cap param, void *data)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+
+/* Build a temporary array of the given element type, copy it out if the
+ * caller supplied a buffer, and return the array size in bytes. */
+#define REPLY(type, ...) do {                 \
+   const type reply_[] = { __VA_ARGS__ };     \
+   if (data)                                  \
+      memcpy(data, reply_, sizeof(reply_));   \
+   return sizeof(reply_);                     \
+} while (0)
+
+   switch (param) {
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      REPLY(uint64_t, 2);
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      REPLY(uint64_t, 65535, 65535);
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      REPLY(uint64_t, 512, 512, 64);
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      REPLY(uint64_t, 512);
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g0-15[] */
+      REPLY(uint64_t, 1ULL << 32);
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
+      REPLY(uint64_t, 16 << 10);
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
+      REPLY(uint64_t, 16 << 10);
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
+      REPLY(uint64_t, 4096);
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      REPLY(uint32_t, 32);
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+      REPLY(uint64_t, 1ULL << 40);
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      REPLY(uint32_t, 0);
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+      REPLY(uint32_t, screen->mp_count);
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+      REPLY(uint32_t, 512); /* FIXME: arbitrary limit */
+   default:
+      return 0;
+   }
+
+#undef REPLY
+}
+
 static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
@@ -377,6 +422,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
    nouveau_object_del(&screen->tesla);
    nouveau_object_del(&screen->eng2d);
    nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->compute);
    nouveau_object_del(&screen->sync);
 
    nouveau_screen_fini(&screen->base);
@@ -742,6 +788,7 @@ nv50_screen_create(struct nouveau_device *dev)
    pscreen->get_param = nv50_screen_get_param;
    pscreen->get_shader_param = nv50_screen_get_shader_param;
    pscreen->get_paramf = nv50_screen_get_paramf;
+   pscreen->get_compute_param = nv50_screen_get_compute_param;
 
    nv50_screen_init_resource_functions(pscreen);
 
@@ -851,6 +898,8 @@ nv50_screen_create(struct nouveau_device *dev)
    screen->TPs = util_bitcount(value & 0xffff);
    screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
 
+   screen->mp_count = screen->TPs * screen->MPsInTP;
+
    stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
          STACK_WARPS_ALLOC * 64 * 8;
 
@@ -902,6 +951,12 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nv50_screen_init_hwctx(screen);
 
+   ret = nv50_screen_compute_setup(screen, screen->base.pushbuf);
+   if (ret) {
+      NOUVEAU_ERR("Failed to init compute context: %d\n", ret);
+      goto fail;
+   }
+
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
index ce51f0f..153ceea 100644 (file)
@@ -23,6 +23,10 @@ struct nv50_context;
 
 #define NV50_MAX_VIEWPORTS 16
 
+#define NV50_MAX_GLOBALS 16
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
 struct nv50_blitter;
 
 struct nv50_graph_state {
@@ -66,6 +70,7 @@ struct nv50_screen {
    unsigned MPsInTP;
    unsigned max_tls_space;
    unsigned cur_tls_space;
+   unsigned mp_count;
 
    struct nouveau_heap *vp_code_heap;
    struct nouveau_heap *gp_code_heap;
@@ -93,6 +98,7 @@ struct nv50_screen {
    struct nouveau_object *sync;
 
    struct nouveau_object *tesla;
+   struct nouveau_object *compute;
    struct nouveau_object *eng2d;
    struct nouveau_object *m2mf;
 };
@@ -109,6 +115,8 @@ void nv50_blitter_destroy(struct nv50_screen *);
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
 int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
 
+int nv50_screen_compute_setup(struct nv50_screen *, struct nouveau_pushbuf *);
+
 static inline void
 nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
index d27f12c..b4ea08d 100644 (file)
@@ -792,6 +792,35 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nv50->dirty |= NV50_NEW_GMTYPROG;
 }
 
+/*
+ * Create a compute program object from a pipe_compute_state.
+ *
+ * Records the requested local, private and input memory sizes from cso and
+ * keeps a private copy of the TGSI tokens in cso->prog.
+ *
+ * Returns the new program, or NULL if either allocation fails.
+ */
+static void *
+nv50_cp_state_create(struct pipe_context *pipe,
+                     const struct pipe_compute_state *cso)
+{
+   struct nv50_program *prog;
+
+   prog = CALLOC_STRUCT(nv50_program);
+   if (!prog)
+      return NULL;
+   prog->type = PIPE_SHADER_COMPUTE;
+
+   prog->cp.smem_size = cso->req_local_mem;
+   prog->cp.lmem_size = cso->req_private_mem;
+   prog->parm_size = cso->req_input_mem;
+
+   prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog);
+   if (!prog->pipe.tokens) {
+      /* Do not hand back a program that has no shader to translate. */
+      FREE(prog);
+      return NULL;
+   }
+
+   return (void *)prog;
+}
+
+/*
+ * Make hwcso the current compute program of the context and flag the
+ * compute program state as dirty.
+ */
+static void
+nv50_cp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *const ctx = nv50_context(pipe);
+
+   ctx->dirty_cp |= NV50_NEW_CP_PROGRAM;
+   ctx->compprog = hwcso;
+}
+
 static void
 nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_constant_buffer *cb)
@@ -1134,6 +1163,70 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
       nv50->dirty |= NV50_NEW_STRMOUT;
 }
 
+/*
+ * Placeholder for binding compute resources (surfaces).
+ *
+ * Currently a stub: all arguments are ignored and nothing is bound.
+ */
+static void
+nv50_set_compute_resources(struct pipe_context *pipe,
+                           unsigned start, unsigned nr,
+                           struct pipe_surface **resources)
+{
+   /* TODO: bind surfaces */
+}
+
+/*
+ * Store the 32-bit address of a global buffer into *phandle.
+ *
+ * Writes 0 when res is NULL, or when the last byte of the buffer lies at or
+ * beyond 4 GiB (an error is logged in that case).
+ */
+static inline void
+nv50_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{
+   struct nv04_resource *buf = nv04_resource(res);
+   uint64_t last_byte;
+
+   if (!buf) {
+      *phandle = 0;
+      return;
+   }
+
+   last_byte = (buf->address + buf->base.width0) - 1;
+   if (last_byte >= (1ULL << 32)) {
+      NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+                  "resource not contained within 32-bit address space !\n");
+      *phandle = 0;
+      return;
+   }
+
+   *phandle = (uint32_t)buf->address;
+}
+
+/*
+ * Bind or unbind global buffers in the slots [start, start + nr).
+ *
+ * The context holds a reference on every bound resource in
+ * nv50->global_residents, which is grown (and zero-filled) on demand.
+ *
+ * When resources is non-NULL, slot start + i takes a reference on
+ * resources[i] and *handles[i] receives its 32-bit address (0 if the entry
+ * is NULL or the buffer is not reachable with a 32-bit address). When
+ * resources is NULL, the slots are released and handles is not accessed.
+ */
+static void
+nv50_set_global_bindings(struct pipe_context *pipe,
+                         unsigned start, unsigned nr,
+                         struct pipe_resource **resources,
+                         uint32_t **handles)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct pipe_resource **ptr;
+   unsigned i;
+   const unsigned end = start + nr;
+   const unsigned req_size = end * sizeof(struct pipe_resource *);
+
+   /* Grow only when the array is really too small for the requested slots. */
+   if (nv50->global_residents.size < req_size) {
+      const unsigned old_size = nv50->global_residents.size;
+      util_dynarray_resize(&nv50->global_residents, req_size);
+      memset((uint8_t *)nv50->global_residents.data + old_size, 0,
+             req_size - old_size);
+   }
+
+   ptr = util_dynarray_element(
+      &nv50->global_residents, struct pipe_resource *, start);
+
+   if (resources) {
+      for (i = 0; i < nr; ++i) {
+         pipe_resource_reference(&ptr[i], resources[i]);
+         nv50_set_global_handle(handles[i], resources[i]);
+      }
+   } else {
+      for (i = 0; i < nr; ++i)
+         pipe_resource_reference(&ptr[i], NULL);
+   }
+
+   nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL);
+
+   /* Accumulate: other compute dirty bits (e.g. a newly bound program)
+    * must survive until the next validation. */
+   nv50->dirty_cp |= NV50_NEW_CP_GLOBALS;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -1162,12 +1255,15 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->create_vs_state = nv50_vp_state_create;
    pipe->create_fs_state = nv50_fp_state_create;
    pipe->create_gs_state = nv50_gp_state_create;
+   pipe->create_compute_state = nv50_cp_state_create;
    pipe->bind_vs_state = nv50_vp_state_bind;
    pipe->bind_fs_state = nv50_fp_state_bind;
    pipe->bind_gs_state = nv50_gp_state_bind;
+   pipe->bind_compute_state = nv50_cp_state_bind;
    pipe->delete_vs_state = nv50_sp_state_delete;
    pipe->delete_fs_state = nv50_sp_state_delete;
    pipe->delete_gs_state = nv50_sp_state_delete;
+   pipe->delete_compute_state = nv50_sp_state_delete;
 
    pipe->set_blend_color = nv50_set_blend_color;
    pipe->set_stencil_ref = nv50_set_stencil_ref;
@@ -1191,6 +1287,9 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->stream_output_target_destroy = nv50_so_target_destroy;
    pipe->set_stream_output_targets = nv50_set_stream_output_targets;
 
+   pipe->set_global_binding = nv50_set_global_bindings;
+   pipe->set_compute_resources = nv50_set_compute_resources;
+
    nv50->sample_mask = ~0;
    nv50->min_samples = 1;
 }