nvc0: implement compute support for nve4
author: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Sat, 23 Feb 2013 18:40:23 +0000 (19:40 +0100)
committer: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Tue, 12 Mar 2013 11:55:37 +0000 (12:55 +0100)
18 files changed:
src/gallium/drivers/nouveau/nouveau_screen.h
src/gallium/drivers/nouveau/nv_object.xml.h
src/gallium/drivers/nv50/nv50_defs.xml.h
src/gallium/drivers/nvc0/Makefile.sources
src/gallium/drivers/nvc0/nvc0_context.c
src/gallium/drivers/nvc0/nvc0_context.h
src/gallium/drivers/nvc0/nvc0_program.c
src/gallium/drivers/nvc0/nvc0_program.h
src/gallium/drivers/nvc0/nvc0_screen.c
src/gallium/drivers/nvc0/nvc0_screen.h
src/gallium/drivers/nvc0/nvc0_shader_state.c
src/gallium/drivers/nvc0/nvc0_state.c
src/gallium/drivers/nvc0/nvc0_state_validate.c
src/gallium/drivers/nvc0/nvc0_surface.c
src/gallium/drivers/nvc0/nvc0_tex.c
src/gallium/drivers/nvc0/nve4_compute.c [new file with mode: 0644]
src/gallium/drivers/nvc0/nve4_compute.h [new file with mode: 0644]
src/gallium/drivers/nvc0/nve4_compute.xml.h [new file with mode: 0644]

index 1de3fa6..d5bc817 100644 (file)
@@ -5,6 +5,7 @@
 #include "util/u_memory.h"
 
 typedef uint32_t u32;
+typedef uint16_t u16;
 
 extern int nouveau_mesa_debug;
 
index 66ba61b..2fd52ac 100644 (file)
@@ -196,6 +196,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NVC0_COMPUTE_CLASS                                     0x000090c0
 #define NVC8_COMPUTE_CLASS                                     0x000092c0
 #define NVE4_COMPUTE_CLASS                                     0x0000a0c0
+#define NVF0_COMPUTE_CLASS                                     0x0000a1c0
 #define NV84_CRYPT_CLASS                                       0x000074c1
 #define BLOB_NVC0_PCOPY1_CLASS                                 0x000090b8
 #define BLOB_NVC0_PCOPY0_CLASS                                 0x000090b5
index 27046e9..2e42843 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef RNNDB_NV50_DEFS_XML
-#define RNNDB_NV50_DEFS_XML
+#ifndef NV50_DEFS_XML
+#define NV50_DEFS_XML
 
 /* Autogenerated file, DO NOT EDIT manually!
 
@@ -8,11 +8,11 @@ http://0x04.net/cgit/index.cgi/rules-ng-ng
 git clone git://0x04.net/rules-ng-ng
 
 The rules-ng-ng source files this header was generated from are:
-- rnndb/nv50_defs.xml    (   5468 bytes, from 2011-07-09 13:43:58)
-- ./rnndb/copyright.xml  (   6452 bytes, from 2011-07-09 13:43:58)
-- ./rnndb/nvchipsets.xml (   3617 bytes, from 2011-07-09 13:43:58)
+- rnndb/nv50_defs.xml    (   7783 bytes, from 2013-02-14 13:56:25)
+- ./rnndb/copyright.xml  (   6452 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nvchipsets.xml (   3704 bytes, from 2012-08-18 12:48:55)
 
-Copyright (C) 2006-2011 by the following authors:
+Copyright (C) 2006-2013 by the following authors:
 - Artur Huillet <arthur.huillet@free.fr> (ahuillet)
 - Ben Skeggs (darktama, darktama_)
 - B. R. <koala_br@users.sourceforge.net> (koala_br)
@@ -71,6 +71,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
 
+#define NV50_VSTATUS_IDLE                                      0x00000000
+#define NV50_VSTATUS_BUSY                                      0x00000001
+#define NV50_VSTATUS_UNK2                                      0x00000002
+#define NV50_VSTATUS_WAITING                                   0x00000003
+#define NV50_VSTATUS_BLOCKED                                   0x00000005
+#define NV50_VSTATUS_FAULTED                                   0x00000006
+#define NV50_VSTATUS_PAUSED                                    0x00000007
 #define NV50_SURFACE_FORMAT_BITMAP                             0x0000001c
 #define NV50_SURFACE_FORMAT_UNK1D                              0x0000001d
 #define NV50_SURFACE_FORMAT_RGBA32_FLOAT                       0x000000c0
@@ -143,6 +150,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NV50_ZETA_FORMAT_Z24_X8_S8_C8_X16_UNORM                        0x0000001d
 #define NV50_ZETA_FORMAT_Z32_X8_C8_X16_FLOAT                   0x0000001e
 #define NV50_ZETA_FORMAT_Z32_S8_C8_X16_FLOAT                   0x0000001f
+#define NVE4_IMAGE_FORMAT_RGBA32_FLOAT                         0x00000002
+#define NVE4_IMAGE_FORMAT_RGBA32_SINT                          0x00000003
+#define NVE4_IMAGE_FORMAT_RGBA32_UINT                          0x00000004
+#define NVE4_IMAGE_FORMAT_RGBA16_UNORM                         0x00000008
+#define NVE4_IMAGE_FORMAT_RGBA16_SNORM                         0x00000009
+#define NVE4_IMAGE_FORMAT_RGBA16_SINT                          0x0000000a
+#define NVE4_IMAGE_FORMAT_RGBA16_UINT                          0x0000000b
+#define NVE4_IMAGE_FORMAT_RGBA16_FLOAT                         0x0000000c
+#define NVE4_IMAGE_FORMAT_RG32_FLOAT                           0x0000000d
+#define NVE4_IMAGE_FORMAT_RG32_SINT                            0x0000000e
+#define NVE4_IMAGE_FORMAT_RG32_UINT                            0x0000000f
+#define NVE4_IMAGE_FORMAT_RGB10_A2_UNORM                       0x00000013
+#define NVE4_IMAGE_FORMAT_RGB10_A2_UINT                                0x00000015
+#define NVE4_IMAGE_FORMAT_RGBA8_UNORM                          0x00000018
+#define NVE4_IMAGE_FORMAT_RGBA8_SNORM                          0x0000001a
+#define NVE4_IMAGE_FORMAT_RGBA8_SINT                           0x0000001b
+#define NVE4_IMAGE_FORMAT_RGBA8_UINT                           0x0000001c
+#define NVE4_IMAGE_FORMAT_RG16_UNORM                           0x0000001d
+#define NVE4_IMAGE_FORMAT_RG16_SNORM                           0x0000001e
+#define NVE4_IMAGE_FORMAT_RG16_SINT                            0x0000001f
+#define NVE4_IMAGE_FORMAT_RG16_UINT                            0x00000020
+#define NVE4_IMAGE_FORMAT_RG16_FLOAT                           0x00000021
+#define NVE4_IMAGE_FORMAT_R11G11B10_FLOAT                      0x00000024
+#define NVE4_IMAGE_FORMAT_R32_SINT                             0x00000027
+#define NVE4_IMAGE_FORMAT_R32_UINT                             0x00000028
+#define NVE4_IMAGE_FORMAT_R32_FLOAT                            0x00000029
+#define NVE4_IMAGE_FORMAT_RG8_UNORM                            0x0000002e
+#define NVE4_IMAGE_FORMAT_RG8_SNORM                            0x0000002f
+#define NVE4_IMAGE_FORMAT_RG8_SINT                             0x00000030
+#define NVE4_IMAGE_FORMAT_RG8_UINT                             0x00000031
+#define NVE4_IMAGE_FORMAT_R16_UNORM                            0x00000032
+#define NVE4_IMAGE_FORMAT_R16_SNORM                            0x00000033
+#define NVE4_IMAGE_FORMAT_R16_SINT                             0x00000034
+#define NVE4_IMAGE_FORMAT_R16_UINT                             0x00000035
+#define NVE4_IMAGE_FORMAT_R16_FLOAT                            0x00000036
+#define NVE4_IMAGE_FORMAT_R8_UNORM                             0x00000037
+#define NVE4_IMAGE_FORMAT_R8_SNORM                             0x00000038
+#define NVE4_IMAGE_FORMAT_R8_SINT                              0x00000039
+#define NVE4_IMAGE_FORMAT_R8_UINT                              0x0000003a
 #define NV50_QUERY__SIZE                                       0x00000010
 #define NV50_QUERY_COUNTER                                     0x00000000
 
@@ -151,4 +197,4 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NV50_QUERY_TIME                                                0x00000008
 
 
-#endif /* RNNDB_NV50_DEFS_XML */
+#endif /* NV50_DEFS_XML */
index 33b90f2..db8d123 100644 (file)
@@ -14,6 +14,7 @@ C_SOURCES := \
        nvc0_program.c \
        nvc0_shader_state.c \
        nvc0_query.c \
+       nve4_compute.c \
        nvc0_video.c \
        nvc0_video_bsp.c \
        nvc0_video_vp.c \
index 75bd155..dc0c4b9 100644 (file)
@@ -63,6 +63,7 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
 
    nouveau_bufctx_del(&nvc0->bufctx_3d);
    nouveau_bufctx_del(&nvc0->bufctx);
+   nouveau_bufctx_del(&nvc0->bufctx_cp);
 
    util_unreference_framebuffer_state(&nvc0->framebuffer);
 
@@ -71,7 +72,7 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
 
    pipe_resource_reference(&nvc0->idxbuf.buffer, NULL);
 
-   for (s = 0; s < 5; ++s) {
+   for (s = 0; s < 6; ++s) {
       for (i = 0; i < nvc0->num_textures[s]; ++i)
          pipe_sampler_view_reference(&nvc0->textures[s][i], NULL);
 
@@ -80,8 +81,21 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
             pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, NULL);
    }
 
+   for (s = 0; s < 2; ++s) {
+      for (i = 0; i < NVC0_MAX_SURFACE_SLOTS; ++i)
+         pipe_surface_reference(&nvc0->surfaces[s][i], NULL);
+   }
+
    for (i = 0; i < nvc0->num_tfbbufs; ++i)
       pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
+
+   for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource **res = util_dynarray_element(
+         &nvc0->global_residents, struct pipe_resource *, i);
+      pipe_resource_reference(res, NULL);
+   }
+   util_dynarray_fini(&nvc0->global_residents);
 }
 
 static void
@@ -219,10 +233,13 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
    nvc0->base.pushbuf = screen->base.pushbuf;
    nvc0->base.client = screen->base.client;
 
-   ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_COUNT,
-                            &nvc0->bufctx_3d);
+   ret = nouveau_bufctx_new(screen->base.client, 2, &nvc0->bufctx);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_3D_COUNT,
+                               &nvc0->bufctx_3d);
    if (!ret)
-      nouveau_bufctx_new(screen->base.client, 2, &nvc0->bufctx);
+      ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_CP_COUNT,
+                               &nvc0->bufctx_cp);
    if (ret)
       goto out_err;
 
@@ -236,6 +253,8 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
 
    pipe->draw_vbo = nvc0_draw_vbo;
    pipe->clear = nvc0_clear;
+   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+      pipe->launch_grid = nve4_launch_grid;
 
    pipe->flush = nvc0_flush;
    pipe->texture_barrier = nvc0_texture_barrier;
@@ -274,23 +293,39 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text);
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo);
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->txc);
+   if (screen->compute) {
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->text);
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->txc);
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm);
+   }
+
+   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache);
+   if (screen->compute)
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->tls);
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nvc0->bufctx, FENCE, flags, screen->fence.bo);
+   if (screen->compute)
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
 
    nvc0->base.scratch.bo_size = 2 << 20;
 
    memset(nvc0->tex_handles, ~0, sizeof(nvc0->tex_handles));
 
+   util_dynarray_init(&nvc0->global_residents);
+
    return pipe;
 
 out_err:
    if (nvc0) {
       if (nvc0->bufctx_3d)
          nouveau_bufctx_del(&nvc0->bufctx_3d);
+      if (nvc0->bufctx_cp)
+         nouveau_bufctx_del(&nvc0->bufctx_cp);
       if (nvc0->bufctx)
          nouveau_bufctx_del(&nvc0->bufctx);
       if (nvc0->blit)
index f5b0b6b..d9aa378 100644 (file)
 #define NVC0_NEW_SAMPLERS     (1 << 20)
 #define NVC0_NEW_TFB_TARGETS  (1 << 21)
 #define NVC0_NEW_IDXBUF       (1 << 22)
+#define NVC0_NEW_SURFACES     (1 << 23)
 
+#define NVC0_NEW_CP_PROGRAM   (1 << 0)
+#define NVC0_NEW_CP_SURFACES  (1 << 1)
+#define NVC0_NEW_CP_TEXTURES  (1 << 2)
+#define NVC0_NEW_CP_SAMPLERS  (1 << 3)
+#define NVC0_NEW_CP_CONSTBUF  (1 << 4)
+#define NVC0_NEW_CP_GLOBALS   (1 << 5)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
 #define NVC0_BIND_FB            0
 #define NVC0_BIND_VTX           1
 #define NVC0_BIND_VTX_TMP       2
 #define NVC0_BIND_TEX(s, i)  (  4 + 32 * (s) + (i))
 #define NVC0_BIND_CB(s, i)   (164 + 16 * (s) + (i))
 #define NVC0_BIND_TFB         244
-#define NVC0_BIND_SCREEN      245
-#define NVC0_BIND_TLS         246
-#define NVC0_BIND_COUNT       247
-
+#define NVC0_BIND_SUF         245
+#define NVC0_BIND_SCREEN      246
+#define NVC0_BIND_TLS         247
+#define NVC0_BIND_3D_COUNT    248
+
+/* compute bufctx (during launch_grid) */
+#define NVC0_BIND_CP_CB(i)     (  0 + (i))
+#define NVC0_BIND_CP_TEX(i)    ( 16 + (i))
+#define NVC0_BIND_CP_SUF         48
+#define NVC0_BIND_CP_GLOBAL      49
+#define NVC0_BIND_CP_DESC        50
+#define NVC0_BIND_CP_SCREEN      51
+#define NVC0_BIND_CP_COUNT       52
+
+/* bufctx for other operations */
 #define NVC0_BIND_2D            0
 #define NVC0_BIND_M2MF          0
 #define NVC0_BIND_FENCE         1
@@ -81,6 +101,7 @@ struct nvc0_context {
 
    struct nouveau_bufctx *bufctx_3d;
    struct nouveau_bufctx *bufctx;
+   struct nouveau_bufctx *bufctx_cp;
 
    struct nvc0_screen *screen;
 
@@ -90,6 +111,7 @@ struct nvc0_context {
                           uint32_t nblocksx, uint32_t nblocksy);
 
    uint32_t dirty;
+   uint32_t dirty_cp; /* dirty flags for compute state */
 
    struct {
       boolean flushed;
@@ -105,8 +127,8 @@ struct nvc0_context {
       uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
-      uint8_t num_textures[5];
-      uint8_t num_samplers[5];
+      uint8_t num_textures[6];
+      uint8_t num_samplers[6];
       uint8_t tls_required; /* bitmask of shader types using l[] */
       uint8_t c14_bound; /* whether immediate array constbuf is bound */
       uint8_t clip_enable;
@@ -125,9 +147,10 @@ struct nvc0_context {
    struct nvc0_program *tevlprog;
    struct nvc0_program *gmtyprog;
    struct nvc0_program *fragprog;
+   struct nvc0_program *compprog;
 
-   struct nvc0_constbuf constbuf[5][NVC0_MAX_PIPE_CONSTBUFS];
-   uint16_t constbuf_dirty[5];
+   struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
+   uint16_t constbuf_dirty[6];
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
@@ -139,14 +162,14 @@ struct nvc0_context {
    uint32_t instance_off; /* current base vertex for instanced arrays */
    uint32_t instance_max; /* last instance for current draw call */
 
-   struct pipe_sampler_view *textures[5][PIPE_MAX_SAMPLERS];
-   unsigned num_textures[5];
-   uint32_t textures_dirty[5];
-   struct nv50_tsc_entry *samplers[5][PIPE_MAX_SAMPLERS];
-   unsigned num_samplers[5];
-   uint16_t samplers_dirty[5];
+   struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
+   unsigned num_textures[6];
+   uint32_t textures_dirty[6];
+   struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
+   unsigned num_samplers[6];
+   uint16_t samplers_dirty[6];
 
-   uint32_t tex_handles[5][PIPE_MAX_SAMPLERS]; /* for nve4 */
+   uint32_t tex_handles[6][PIPE_MAX_SAMPLERS]; /* for nve4 */
 
    struct pipe_framebuffer_state framebuffer;
    struct pipe_blend_color blend_colour;
@@ -169,6 +192,12 @@ struct nvc0_context {
 
    struct nvc0_blitctx *blit;
 
+   struct pipe_surface *surfaces[2][NVC0_MAX_SURFACE_SLOTS];
+   uint16_t surfaces_dirty[2];
+   uint16_t surfaces_valid[2];
+
+   struct util_dynarray global_residents;
+
 #ifdef NVC0_WITH_DRAW_MODULE
    struct draw_context *draw;
 #endif
@@ -211,6 +240,8 @@ boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
 boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
+uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
+                                    uint32_t label);
 
 /* nvc0_query.c */
 void nvc0_init_query_functions(struct nvc0_context *);
@@ -236,6 +267,8 @@ void nvc0_tfb_validate(struct nvc0_context *);
 extern void nvc0_init_state_functions(struct nvc0_context *);
 
 /* nvc0_state_validate.c */
+void nvc0_validate_global_residents(struct nvc0_context *,
+                                    struct nouveau_bufctx *, int bin);
 extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
                                    unsigned space_words);
 
@@ -246,9 +279,13 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers,
 extern void nvc0_init_surface_functions(struct nvc0_context *);
 
 /* nvc0_tex.c */
+boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s);
 void nvc0_validate_textures(struct nvc0_context *);
 void nvc0_validate_samplers(struct nvc0_context *);
 void nve4_set_tex_handles(struct nvc0_context *);
+void nvc0_validate_surfaces(struct nvc0_context *);
+void nve4_set_surface_info(struct nouveau_pushbuf *, struct pipe_surface *,
+                           struct nvc0_screen *);
 
 struct pipe_sampler_view *
 nvc0_create_texture_view(struct pipe_context *,
@@ -315,4 +352,8 @@ nvc0_screen_get_video_param(struct pipe_screen *pscreen,
 /* nvc0_push.c */
 void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *);
 
+/* nve4_compute.c */
+void nve4_launch_grid(struct pipe_context *,
+                      const uint *, const uint *, uint32_t, const void *);
+
 #endif
index e4ac8ba..592d338 100644 (file)
@@ -25,6 +25,7 @@
 #include "nvc0_context.h"
 
 #include "nv50/codegen/nv50_ir_driver.h"
+#include "nve4_compute.h"
 
 /* If only they told us the actual semantic instead of just GENERIC ... */
 static void
@@ -533,10 +534,11 @@ nvc0_program_dump(struct nvc0_program *prog)
 {
    unsigned pos;
 
-   for (pos = 0; pos < sizeof(prog->hdr) / sizeof(prog->hdr[0]); ++pos)
-      debug_printf("HDR[%02lx] = 0x%08x\n",
-                   pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
-
+   if (prog->type != PIPE_SHADER_COMPUTE) {
+      for (pos = 0; pos < sizeof(prog->hdr) / sizeof(prog->hdr[0]); ++pos)
+         debug_printf("HDR[%02lx] = 0x%08x\n",
+                      pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
+   }
    debug_printf("shader binary code (0x%x bytes):", prog->code_size);
    for (pos = 0; pos < prog->code_size / 4; ++pos) {
       if ((pos % 8) == 0)
@@ -569,11 +571,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
          info->io.resInfoCBSlot = 0;
-         info->io.texBindBase = 0; /* TODO */
-         info->io.suInfoBase = 0; /* TODO */
+         info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
+         info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
       }
       info->io.msInfoCBSlot = 0;
-      info->io.msInfoBase = 0; /* TODO */
+      info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
          info->io.resInfoCBSlot = 15;
@@ -598,14 +600,16 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       NOUVEAU_ERR("shader translation failed: %i\n", ret);
       goto out;
    }
-   FREE(info->bin.syms);
+   if (prog->type != PIPE_SHADER_COMPUTE)
+      FREE(info->bin.syms);
 
    prog->code = info->bin.code;
    prog->code_size = info->bin.codeSize;
    prog->immd_data = info->immd.buf;
    prog->immd_size = info->immd.bufSize;
    prog->relocs = info->bin.relocData;
-   prog->max_gpr = MAX2(4, (info->bin.maxGPR + 1));
+   prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
+   prog->num_barriers = info->numBarriers;
 
    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
 
@@ -633,6 +637,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    case PIPE_SHADER_FRAGMENT:
       ret = nvc0_fp_gen_header(prog, info);
       break;
+   case PIPE_SHADER_COMPUTE:
+      prog->cp.syms = info->bin.syms;
+      prog->cp.num_syms = info->bin.numSyms;
+      break;
    default:
       ret = -1;
       NOUVEAU_ERR("unknown program type: %u\n", prog->type);
@@ -672,8 +680,9 @@ boolean
 nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    struct nvc0_screen *screen = nvc0->screen;
+   const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE;
    int ret;
-   uint32_t size = prog->code_size + NVC0_SHADER_HEADER_SIZE;
+   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    uint32_t lib_pos = screen->lib_code->start;
    uint32_t code_pos;
 
@@ -689,7 +698,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
     * latency information is expected only at certain positions.
     */
    if (screen->base.class_3d >= NVE4_3D_CLASS)
-      size = size + 0x70;
+      size = size + (is_cp ? 0x40 : 0x70);
    size = align(size, 0x40);
 
    ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
@@ -714,18 +723,27 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    assert((prog->immd_size == 0) || (prog->immd_base + prog->immd_size <=
                                      prog->mem->start + prog->mem->size));
 
-   if (screen->base.class_3d >= NVE4_3D_CLASS) {
-      switch (prog->mem->start & 0xff) {
-      case 0x40: prog->code_base += 0x70; break;
-      case 0x80: prog->code_base += 0x30; break;
-      case 0xc0: prog->code_base += 0x70; break;
-      default:
-         prog->code_base += 0x30;
-         assert((prog->mem->start & 0xff) == 0x00);
-         break;
+   if (!is_cp) {
+      if (screen->base.class_3d >= NVE4_3D_CLASS) {
+         switch (prog->mem->start & 0xff) {
+         case 0x40: prog->code_base += 0x70; break;
+         case 0x80: prog->code_base += 0x30; break;
+         case 0xc0: prog->code_base += 0x70; break;
+         default:
+            prog->code_base += 0x30;
+            assert((prog->mem->start & 0xff) == 0x00);
+            break;
+         }
+      }
+      code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE;
+   } else {
+      if (screen->base.class_3d >= NVE4_3D_CLASS) {
+         if (prog->mem->start & 0x40)
+            prog->code_base += 0x40;
+         assert((prog->code_base & 0x7f) == 0x00);
       }
+      code_pos = prog->code_base;
    }
-   code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE;
 
    if (prog->relocs)
       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
@@ -735,10 +753,10 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
       nvc0_program_dump(prog);
 #endif
 
-   nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
-                        NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr);
-   nvc0->base.push_data(&nvc0->base, screen->text,
-                        prog->code_base + NVC0_SHADER_HEADER_SIZE,
+   if (!is_cp)
+      nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
+                           NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr);
+   nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
                         NOUVEAU_BO_VRAM, prog->code_size, prog->code);
    if (prog->immd_size)
       nvc0->base.push_data(&nvc0->base,
@@ -790,6 +808,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
    FREE(prog->code);
    FREE(prog->immd_data);
    FREE(prog->relocs);
+   if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
+      FREE(prog->cp.syms);
    if (prog->tfb) {
       if (nvc0->state.tfb == prog->tfb)
          nvc0->state.tfb = NULL;
@@ -801,3 +821,18 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
    prog->pipe = pipe;
    prog->type = type;
 }
+
+/*
+ * Find the symbol carrying the given label in prog's symbol table
+ * (prog->cp.syms, prog->cp.num_syms entries) and return its position:
+ * prog->code_base plus the symbol's offset, plus NVC0_SHADER_HEADER_SIZE
+ * for every program type other than PIPE_SHADER_COMPUTE.
+ *
+ * Returns ~0 if no symbol with that label exists.
+ */
+uint32_t
+nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
+{
+   const struct nv50_ir_prog_symbol *table =
+      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
+   /* Compute programs are addressed without the shader header in front. */
+   const unsigned hdr =
+      (prog->type == PIPE_SHADER_COMPUTE) ? 0 : NVC0_SHADER_HEADER_SIZE;
+   unsigned n = 0;
+
+   while (n < prog->cp.num_syms) {
+      if (table[n].label == label)
+         return prog->code_base + hdr + table[n].offset;
+      ++n;
+   }
+   return ~0;
+}
index f6d1121..9c184d1 100644 (file)
@@ -22,7 +22,7 @@ struct nvc0_program {
    ubyte type;
    boolean translated;
    boolean need_tls;
-   uint8_t max_gpr;
+   uint8_t num_gprs;
 
    uint32_t *code;
    uint32_t *immd_data;
@@ -50,6 +50,13 @@ struct nvc0_program {
       uint32_t tess_mode; /* ~0 if defined by the other stage */
       uint32_t input_patch_size;
    } tp;
+   struct {
+      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
+      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+      void *syms;
+      unsigned num_syms;
+   } cp;
+   uint8_t num_barriers;
 
    void *relocs;
 
index 077f89e..7d03479 100644 (file)
@@ -88,12 +88,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
    switch (param) {
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-      return 16 * PIPE_SHADER_TYPES; /* NOTE: should not count COMPUTE */
+      return 16 * 5;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 15;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 12;
+      return (class_3d >= NVE4_3D_CLASS) ? 13 : 12;
    case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
       return 2048;
    case PIPE_CAP_MIN_TEXEL_OFFSET:
@@ -176,6 +176,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_TEXTURE_MULTISAMPLE:
       return 0;
+   case PIPE_CAP_COMPUTE:
+      return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    default:
       NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
       return 0;
@@ -186,6 +188,8 @@ static int
 nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                              enum pipe_shader_cap param)
 {
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
    switch (shader) {
    case PIPE_SHADER_VERTEX:
       /*
@@ -195,11 +199,17 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
       break;
+   case PIPE_SHADER_COMPUTE:
+      if (class_3d < NVE4_3D_CLASS)
+         return 0;
+      break;
    default:
       return 0;
    }
    
    switch (param) {
+   case PIPE_SHADER_CAP_PREFERRED_IR:
+      return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
    case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
    case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
@@ -216,6 +226,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_CONSTS:
       return 65536 / 16;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
+         return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
       return NVC0_MAX_PIPE_CONSTBUFS;
    case PIPE_SHADER_CAP_MAX_ADDRS:
       return 1;
@@ -234,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
       return 0;
    case PIPE_SHADER_CAP_SUBROUTINES:
-      return 1; /* but inlining everything, we need function declarations */
+      return 1;
    case PIPE_SHADER_CAP_INTEGERS:
       return 1;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
@@ -270,6 +282,47 @@ nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
    }
 }
 
+/*
+ * pipe_screen::get_compute_param hook: report a compute capability.
+ *
+ * The value for the requested cap is stored as uint64_t elements in data,
+ * and its size in bytes is returned (8 for the scalar caps, 24 for the
+ * three-component grid/block limits).  0 is returned for caps that are not
+ * handled here and when the screen has no compute object.
+ *
+ * data may be NULL, in which case only the size is reported and nothing is
+ * written; Gallium callers use this to size their buffer before the real
+ * query.
+ */
+static int
+nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
+                              enum pipe_compute_cap param, void *data)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   uint64_t *data64 = (uint64_t *)data;
+   uint64_t val[3];
+   unsigned count, i;
+   uint16_t obj_class;
+
+   /* nvc0_screen_init_compute installs this hook before it looks at the
+    * chipset, and nvc0_create treats screen->compute as optional, so the
+    * object cannot be assumed to exist here.
+    */
+   if (!screen->compute)
+      return 0;
+   obj_class = screen->compute->oclass;
+
+   switch (param) {
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      val[0] = 3;
+      count = 1;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      val[0] = (obj_class >= NVE4_COMPUTE_CLASS) ? 0x7fffffff : 65535;
+      val[1] = 65535;
+      val[2] = 65535;
+      count = 3;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      val[0] = 1024;
+      val[1] = 1024;
+      val[2] = 64;
+      count = 3;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      val[0] = 1024;
+      count = 1;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */
+      val[0] = (uint64_t)1 << 40;
+      count = 1;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
+      val[0] = 48 << 10;
+      count = 1;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
+      val[0] = 512 << 10;
+      count = 1;
+      break;
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
+      val[0] = 4096;
+      count = 1;
+      break;
+   default:
+      return 0;
+   }
+
+   /* Size-only query: report the size without touching the NULL buffer. */
+   if (data64) {
+      for (i = 0; i < count; ++i)
+         data64[i] = val[i];
+   }
+   return (int)(count * sizeof(val[0]));
+}
+
 static void
 nvc0_screen_destroy(struct pipe_screen *pscreen)
 {
@@ -291,6 +344,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    nouveau_bo_ref(NULL, &screen->txc);
    nouveau_bo_ref(NULL, &screen->fence.bo);
    nouveau_bo_ref(NULL, &screen->poly_cache);
+   nouveau_bo_ref(NULL, &screen->parm);
 
    nouveau_heap_destroy(&screen->lib_code);
    nouveau_heap_destroy(&screen->text_heap);
@@ -412,6 +466,23 @@ nvc0_screen_fence_update(struct pipe_screen *pscreen)
    return screen->fence.map[0];
 }
 
+/*
+ * Install the compute-parameter query hook on the screen and run the
+ * compute setup matching the chipset family (chipset & 0xf0):
+ *   0xc0, 0xd0: nothing further is done here, returns 0
+ *   0xe0, 0xf0: returns the result of nve4_screen_compute_setup()
+ *   otherwise:  returns -1
+ */
+static int
+nvc0_screen_init_compute(struct nvc0_screen *screen)
+{
+   const unsigned family = screen->base.device->chipset & 0xf0;
+
+   screen->base.base.get_compute_param = nvc0_screen_get_compute_param;
+
+   if (family == 0xc0 || family == 0xd0)
+      return 0;
+
+   if (family == 0xe0 || family == 0xf0)
+      return nve4_screen_compute_setup(screen, screen->base.pushbuf);
+
+   return -1;
+}
+
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
@@ -653,9 +724,9 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    /* max MPs * max warps per MP (TODO: ask kernel) */
    if (screen->eng3d->oclass >= NVE4_3D_CLASS)
-      screen->tls_size = 8 * 64;
+      screen->tls_size = 8 * 64 * 32;
    else
-      screen->tls_size = 16 * 48;
+      screen->tls_size = 16 * 48 * 32;
    screen->tls_size *= NVC0_CAP_MAX_PROGRAM_TEMPS * 16;
    screen->tls_size = align(screen->tls_size, 1 << 17);
 
@@ -775,6 +846,9 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    IMMED_NVC0(push, NVC0_3D(EDGEFLAG), 1);
 
+   if (nvc0_screen_init_compute(screen))
+      goto fail;
+
    PUSH_KICK (push);
 
    screen->tic.entries = CALLOC(4096, sizeof(void *));
index 2adcfea..16f0feb 100644 (file)
 #define NVC0_TSC_MAX_ENTRIES 2048
 
 /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
-#define NVC0_MAX_PIPE_CONSTBUFS 14
+#define NVC0_MAX_PIPE_CONSTBUFS         14
+#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE  7
+
+#define NVC0_MAX_SURFACE_SLOTS 16
 
 struct nvc0_context;
 
@@ -29,7 +32,8 @@ struct nvc0_screen {
    int num_occlusion_queries_active;
 
    struct nouveau_bo *text;
-   struct nouveau_bo *uniform_bo;
+   struct nouveau_bo *parm;       /* for COMPUTE */
+   struct nouveau_bo *uniform_bo; /* for 3D */
    struct nouveau_bo *tls;
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
    struct nouveau_bo *poly_cache;
@@ -63,7 +67,7 @@ struct nvc0_screen {
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
    struct nouveau_object *eng2d;
    struct nouveau_object *m2mf;
-   struct nouveau_object *dijkstra;
+   struct nouveau_object *compute;
 };
 
 static INLINE struct nvc0_screen *
@@ -80,6 +84,8 @@ void nvc0_screen_make_buffers_resident(struct nvc0_screen *);
 int nvc0_screen_tic_alloc(struct nvc0_screen *, void *);
 int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *);
 
+int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
+
 static INLINE void
 nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
index 786889f..5cd6a84 100644 (file)
@@ -95,7 +95,7 @@ nvc0_vertprog_validate(struct nvc0_context *nvc0)
    PUSH_DATA (push, 0x11);
    PUSH_DATA (push, vp->code_base);
    BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(1)), 1);
-   PUSH_DATA (push, vp->max_gpr);
+   PUSH_DATA (push, vp->num_gprs);
 
    // BEGIN_NVC0(push, NVC0_3D_(0x163c), 1);
    // PUSH_DATA (push, 0);
@@ -120,7 +120,7 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0)
    PUSH_DATA (push, 0x51);
    PUSH_DATA (push, fp->code_base);
    BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(5)), 1);
-   PUSH_DATA (push, fp->max_gpr);
+   PUSH_DATA (push, fp->num_gprs);
 
    BEGIN_NVC0(push, SUBC_3D(0x0360), 2);
    PUSH_DATA (push, 0x20164010);
@@ -144,7 +144,7 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
       PUSH_DATA (push, 0x21);
       PUSH_DATA (push, tp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
-      PUSH_DATA (push, tp->max_gpr);
+      PUSH_DATA (push, tp->num_gprs);
 
       if (tp->tp.input_patch_size <= 32)
          IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size);
@@ -171,7 +171,7 @@ nvc0_tevlprog_validate(struct nvc0_context *nvc0)
       BEGIN_NVC0(push, NVC0_3D(SP_START_ID(3)), 1);
       PUSH_DATA (push, tp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(3)), 1);
-      PUSH_DATA (push, tp->max_gpr);
+      PUSH_DATA (push, tp->num_gprs);
    } else {
       BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1);
       PUSH_DATA (push, 0x30);
@@ -197,7 +197,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
       BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1);
       PUSH_DATA (push, gp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1);
-      PUSH_DATA (push, gp->max_gpr);
+      PUSH_DATA (push, gp->num_gprs);
       BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
       PUSH_DATA (push, gp_selects_layer ? NVC0_3D_LAYER_USE_GP : 0);
    } else {
index 30011df..cba076f 100644 (file)
@@ -489,6 +489,57 @@ nvc0_gp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
    nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s);
 }
 
+/* Bind (or, when cso is NULL, unbind) sampler state objects for slots
+ * [start, start + nr) of shader stage s, and keep num_samplers[s] in sync.
+ */
+static void
+nvc0_stage_sampler_states_bind_range(struct nvc0_context *nvc0,
+                                     const unsigned s,
+                                     unsigned start, unsigned nr, void **cso)
+{
+   const unsigned end = start + nr;
+   int highest_bound = -1;
+   unsigned slot;
+
+   for (slot = start; slot < end; ++slot) {
+      /* A NULL array unbinds the whole range, same as an array of NULLs. */
+      void *incoming = cso ? cso[slot - start] : NULL;
+
+      if (incoming)
+         highest_bound = slot;
+      if (incoming == nvc0->samplers[s][slot])
+         continue;
+      nvc0->samplers_dirty[s] |= 1 << slot;
+
+      if (nvc0->samplers[s][slot])
+         nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][slot]);
+      nvc0->samplers[s][slot] = incoming;
+   }
+
+   /* The count already extends past the touched range: leave it alone. */
+   if (nvc0->num_samplers[s] > end)
+      return;
+
+   if (highest_bound >= 0) {
+      nvc0->num_samplers[s] = highest_bound + 1;
+      return;
+   }
+
+   /* Nothing was bound in the range: walk down from start until a slot
+    * that still holds a sampler is found.
+    */
+   slot = start;
+   while (slot && !nvc0->samplers[s][slot - 1])
+      --slot;
+   nvc0->num_samplers[s] = slot;
+}
+
+/* pipe_context hook: bind compute sampler states (stage index 5) and flag
+ * the compute sampler state as dirty.
+ */
+static void
+nvc0_cp_sampler_states_bind(struct pipe_context *pipe,
+                            unsigned start, unsigned nr, void **cso)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   nvc0_stage_sampler_states_bind_range(nvc0, 5, start, nr, cso);
+   nvc0->dirty_cp |= NVC0_NEW_CP_SAMPLERS;
+}
+
 /* NOTE: only called when not referenced anywhere, won't be bound */
 static void
 nvc0_sampler_view_destroy(struct pipe_context *pipe,
@@ -561,6 +612,67 @@ nvc0_gp_set_sampler_views(struct pipe_context *pipe,
    nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views);
 }
 
+/* Set (or, when views is NULL, clear) the sampler views for slots
+ * [start, start + nr) of shader stage s and update num_textures[s].
+ * Stage 5 uses the compute bufctx and bind bins, other stages the 3D ones.
+ */
+static void
+nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
+                                   unsigned start, unsigned nr,
+                                   struct pipe_sampler_view **views)
+{
+   struct nouveau_bufctx *bctx = (s == 5) ? nvc0->bufctx_cp : nvc0->bufctx_3d;
+   const unsigned bin = (s == 5) ? NVC0_BIND_CP_TEX(0) : NVC0_BIND_TEX(s, 0);
+   const unsigned end = start + nr;
+   int highest_bound = -1;
+   unsigned slot;
+
+   for (slot = start; slot < end; ++slot) {
+      if (views) {
+         struct pipe_sampler_view *incoming = views[slot - start];
+
+         if (incoming)
+            highest_bound = slot;
+         if (incoming == nvc0->textures[s][slot])
+            continue;
+         nvc0->textures_dirty[s] |= 1 << slot;
+
+         if (nvc0->textures[s][slot]) {
+            /* release what the outgoing view held before replacing it */
+            nouveau_bufctx_reset(bctx, bin + slot);
+            nvc0_screen_tic_unlock(nvc0->screen,
+                                   nv50_tic_entry(nvc0->textures[s][slot]));
+         }
+         pipe_sampler_view_reference(&nvc0->textures[s][slot], incoming);
+      } else {
+         struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][slot]);
+
+         if (old) {
+            nvc0->textures_dirty[s] |= 1 << slot;
+
+            nvc0_screen_tic_unlock(nvc0->screen, old);
+            pipe_sampler_view_reference(&nvc0->textures[s][slot], NULL);
+            nouveau_bufctx_reset(bctx, bin + slot);
+         }
+      }
+   }
+
+   /* The count already extends past the touched range: leave it alone. */
+   if (nvc0->num_textures[s] > end)
+      return;
+
+   if (highest_bound >= 0) {
+      nvc0->num_textures[s] = highest_bound + 1;
+      return;
+   }
+
+   /* Nothing was bound in the range: walk down from start until a slot
+    * that still holds a view is found.
+    */
+   slot = start;
+   while (slot && !nvc0->textures[s][slot - 1])
+      --slot;
+   nvc0->num_textures[s] = slot;
+}
+
+/* pipe_context hook: set compute sampler views (stage index 5) and flag the
+ * compute texture state as dirty.
+ */
+static void
+nvc0_cp_set_sampler_views(struct pipe_context *pipe,
+                          unsigned start, unsigned nr,
+                          struct pipe_sampler_view **views)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   nvc0_stage_set_sampler_views_range(nvc0, 5, start, nr, views);
+   nvc0->dirty_cp |= NVC0_NEW_CP_TEXTURES;
+}
+
+
 /* ============================= SHADERS =======================================
  */
 
@@ -644,6 +756,35 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nvc0->dirty |= NVC0_NEW_GMTYPROG;
 }
 
+/* pipe_context hook: create a compute program object from cso.
+ * Records the memory requirements given in cso and stores the result of
+ * tgsi_dup_tokens() on cso->prog. Returns NULL if the program struct cannot
+ * be allocated.
+ */
+static void *
+nvc0_cp_state_create(struct pipe_context *pipe,
+                     const struct pipe_compute_state *cso)
+{
+   struct nvc0_program *program = CALLOC_STRUCT(nvc0_program);
+
+   if (program) {
+      program->type = PIPE_SHADER_COMPUTE;
+
+      program->cp.smem_size = cso->req_local_mem;
+      program->cp.lmem_size = cso->req_private_mem;
+      program->parm_size = cso->req_input_mem;
+
+      program->pipe.tokens =
+         tgsi_dup_tokens((const struct tgsi_token *)cso->prog);
+   }
+   return (void *)program;
+}
+
+/* pipe_context hook: make hwcso the current compute program and flag the
+ * compute program state as dirty.
+ */
+static void
+nvc0_cp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nvc0_context *const ctx = nvc0_context(pipe);
+
+   ctx->compprog = hwcso;
+   ctx->dirty_cp |= NVC0_NEW_CP_PROGRAM;
+}
+
 static void
 nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_constant_buffer *cb)
@@ -653,14 +794,22 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    const unsigned s = nvc0_shader_stage(shader);
    const unsigned i = index;
 
-   if (shader == PIPE_SHADER_COMPUTE)
-      return;
+   if (unlikely(shader == PIPE_SHADER_COMPUTE)) {
+      assert(!cb || !cb->user_buffer);
+      if (nvc0->constbuf[s][i].u.buf)
+         nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_CB(i));
 
-   if (nvc0->constbuf[s][i].user)
-      nvc0->constbuf[s][i].u.buf = NULL;
-   else
-   if (nvc0->constbuf[s][i].u.buf)
-      nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i));
+      nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF;
+   } else {
+      if (nvc0->constbuf[s][i].user)
+         nvc0->constbuf[s][i].u.buf = NULL;
+      else
+      if (nvc0->constbuf[s][i].u.buf)
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i));
+
+      nvc0->dirty |= NVC0_NEW_CONSTBUF;
+   }
+   nvc0->constbuf_dirty[s] |= 1 << i;
 
    pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res);
 
@@ -673,10 +822,6 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
       nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100);
    }
-
-   nvc0->constbuf_dirty[s] |= 1 << i;
-
-   nvc0->dirty |= NVC0_NEW_CONSTBUF;
 }
 
 /* =============================================================================
@@ -919,6 +1064,113 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
       nvc0->dirty |= NVC0_NEW_TFB_TARGETS;
 }
 
+/* Bind (or unbind, if psurfaces is NULL) surfaces for slots
+ * [start, start + nr) of surface set t, update the valid/dirty masks and
+ * reset the matching bufctx surface bin (3D for t == 0, compute otherwise).
+ */
+static void
+nvc0_bind_surfaces_range(struct nvc0_context *nvc0, const unsigned t,
+                         unsigned start, unsigned nr,
+                         struct pipe_surface **psurfaces)
+{
+   const unsigned end = start + nr;
+   const unsigned mask = ((1 << nr) - 1) << start;
+   unsigned slot;
+
+   for (slot = start; slot < end; ++slot) {
+      /* A NULL array unbinds the whole range, same as an array of NULLs. */
+      struct pipe_surface *incoming =
+         psurfaces ? psurfaces[slot - start] : NULL;
+
+      if (incoming)
+         nvc0->surfaces_valid[t] |= (1 << slot);
+      else
+         nvc0->surfaces_valid[t] &= ~(1 << slot);
+      pipe_surface_reference(&nvc0->surfaces[t][slot], incoming);
+   }
+   nvc0->surfaces_dirty[t] |= mask;
+
+   if (t != 0)
+      nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
+   else
+      nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_SUF);
+}
+
+/* pipe_context hook: bind compute surfaces (surface set 1) and flag the
+ * compute surface state as dirty.
+ */
+static void
+nvc0_set_compute_resources(struct pipe_context *pipe,
+                           unsigned start, unsigned nr,
+                           struct pipe_surface **resources)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   nvc0_bind_surfaces_range(nvc0, 1, start, nr, resources);
+   nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES;
+}
+
+/* pipe_context hook: bind 3D shader surfaces (surface set 0) and flag the
+ * 3D surface state as dirty.
+ */
+static void
+nvc0_set_shader_resources(struct pipe_context *pipe,
+                          unsigned start, unsigned nr,
+                          struct pipe_surface **resources)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   nvc0_bind_surfaces_range(nvc0, 0, start, nr, resources);
+   nvc0->dirty |= NVC0_NEW_SURFACES;
+}
+
+/* Store the 32-bit address of res in *phandle.
+ * Writes 0 if res is NULL, or (after logging an error) if the resource's
+ * last byte lies at or above the 4 GiB boundary.
+ */
+static INLINE void
+nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{
+   struct nv04_resource *buf = nv04_resource(res);
+   uint64_t last_byte;
+
+   if (!buf) {
+      *phandle = 0;
+      return;
+   }
+
+   last_byte = (buf->address + buf->base.width0) - 1;
+   if (last_byte >= (1ULL << 32)) {
+      NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+                  "resource not contained within 32-bit address space !\n");
+      *phandle = 0;
+      return;
+   }
+
+   *phandle = (uint32_t)buf->address;
+}
+
+/* pipe_context hook: bind nr global buffers starting at slot start.
+ *
+ * Grows the global_residents array (zero-filling the new tail) so that it
+ * covers slot start + nr - 1, then either takes references on resources[i]
+ * and writes each buffer's 32-bit address to *handles[i], or, when resources
+ * is NULL, drops the references held in the range. Finally resets the
+ * compute bufctx bin for globals and flags the global bindings as dirty.
+ */
+static void
+nvc0_set_global_bindings(struct pipe_context *pipe,
+                         unsigned start, unsigned nr,
+                         struct pipe_resource **resources,
+                         uint32_t **handles)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct pipe_resource **ptr;
+   unsigned i;
+   const unsigned end = start + nr;
+
+   if (nvc0->global_residents.size <= (end * sizeof(struct pipe_resource *))) {
+      const unsigned old_size = nvc0->global_residents.size;
+      const unsigned req_size = end * sizeof(struct pipe_resource *);
+      util_dynarray_resize(&nvc0->global_residents, req_size);
+      memset((uint8_t *)nvc0->global_residents.data + old_size, 0,
+             req_size - old_size);
+   }
+
+   /* Look the slot up only after the resize, which may move the storage. */
+   ptr = util_dynarray_element(
+      &nvc0->global_residents, struct pipe_resource *, start);
+
+   if (resources) {
+      for (i = 0; i < nr; ++i) {
+         pipe_resource_reference(&ptr[i], resources[i]);
+         nvc0_set_global_handle(handles[i], resources[i]);
+      }
+   } else {
+      for (i = 0; i < nr; ++i)
+         pipe_resource_reference(&ptr[i], NULL);
+   }
+
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
+
+   /* OR the flag in: a plain assignment would discard every other pending
+    * compute dirty bit (program, samplers, textures, surfaces, constbufs).
+    */
+   nvc0->dirty_cp |= NVC0_NEW_CP_GLOBALS;
+}
+
 void
 nvc0_init_state_functions(struct nvc0_context *nvc0)
 {
@@ -941,12 +1193,14 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->bind_vertex_sampler_states   = nvc0_vp_sampler_states_bind;
    pipe->bind_fragment_sampler_states = nvc0_fp_sampler_states_bind;
    pipe->bind_geometry_sampler_states = nvc0_gp_sampler_states_bind;
+   pipe->bind_compute_sampler_states = nvc0_cp_sampler_states_bind;
 
    pipe->create_sampler_view = nvc0_create_sampler_view;
    pipe->sampler_view_destroy = nvc0_sampler_view_destroy;
    pipe->set_vertex_sampler_views   = nvc0_vp_set_sampler_views;
    pipe->set_fragment_sampler_views = nvc0_fp_set_sampler_views;
    pipe->set_geometry_sampler_views = nvc0_gp_set_sampler_views;
+   pipe->set_compute_sampler_views = nvc0_cp_set_sampler_views;
 
    pipe->create_vs_state = nvc0_vp_state_create;
    pipe->create_fs_state = nvc0_fp_state_create;
@@ -958,6 +1212,10 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->delete_fs_state = nvc0_sp_state_delete;
    pipe->delete_gs_state = nvc0_sp_state_delete;
 
+   pipe->create_compute_state = nvc0_cp_state_create;
+   pipe->bind_compute_state = nvc0_cp_state_bind;
+   pipe->delete_compute_state = nvc0_sp_state_delete;
+
    pipe->set_blend_color = nvc0_set_blend_color;
    pipe->set_stencil_ref = nvc0_set_stencil_ref;
    pipe->set_clip_state = nvc0_set_clip_state;
@@ -978,5 +1236,9 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->create_stream_output_target = nvc0_so_target_create;
    pipe->stream_output_target_destroy = nvc0_so_target_destroy;
    pipe->set_stream_output_targets = nvc0_set_transform_feedback_targets;
+
+   pipe->set_global_binding = nvc0_set_global_bindings;
+   pipe->set_compute_resources = nvc0_set_compute_resources;
+   pipe->set_shader_resources = nvc0_set_shader_resources;
 }
 
index 80a8c01..1f12de6 100644 (file)
@@ -430,6 +430,21 @@ nvc0_validate_sample_mask(struct nvc0_context *nvc0)
    PUSH_DATA (push, 0x01);
 }
 
+/* Add every non-NULL entry of nvc0->global_residents to bin of bctx as a
+ * read/write resident.
+ */
+void
+nvc0_validate_global_residents(struct nvc0_context *nvc0,
+                               struct nouveau_bufctx *bctx, int bin)
+{
+   const unsigned count =
+      nvc0->global_residents.size / sizeof(struct pipe_resource *);
+   unsigned slot;
+
+   for (slot = 0; slot < count; ++slot) {
+      struct pipe_resource **entry = util_dynarray_element(
+         &nvc0->global_residents, struct pipe_resource *, slot);
+
+      if (!*entry)
+         continue;
+      nvc0_add_resident(bctx, bin, nv04_resource(*entry), NOUVEAU_BO_RDWR);
+   }
+}
+
 static void
 nvc0_validate_derived_1(struct nvc0_context *nvc0)
 {
@@ -513,6 +528,7 @@ static struct state_validate {
     { nvc0_validate_samplers,      NVC0_NEW_SAMPLERS },
     { nve4_set_tex_handles,        NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS },
     { nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS },
+    { nvc0_validate_surfaces,      NVC0_NEW_SURFACES },
     { nvc0_idxbuf_validate,        NVC0_NEW_IDXBUF },
     { nvc0_tfb_validate,           NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG }
 };
index 281d740..77330c5 100644 (file)
@@ -515,7 +515,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit)
       blit->vp.code = (uint32_t *)code_nvc0; /* const_cast */
       blit->vp.code_size = sizeof(code_nvc0);
    }
-   blit->vp.max_gpr = 7;
+   blit->vp.num_gprs = 7;
    blit->vp.vp.edgeflag = PIPE_MAX_ATTRIBS;
 
    blit->vp.hdr[0]  = 0x00020461; /* vertprog magic */
index 2bce97b..7fbe1e6 100644 (file)
@@ -23,6 +23,7 @@
 #include "nvc0_context.h"
 #include "nvc0_resource.h"
 #include "nv50/nv50_texture.xml.h"
+#include "nv50/nv50_defs.xml.h"
 
 #include "util/u_format.h"
 
@@ -413,7 +414,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
    return need_flush;
 }
 
-static boolean
+boolean
 nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 {
    struct nouveau_bo *txc = nvc0->screen->txc;
@@ -515,3 +516,295 @@ nve4_set_tex_handles(struct nvc0_context *nvc0)
       nvc0->samplers_dirty[s] = 0;
    }
 }
+
+
+static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT];
+static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT];
+static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT];
+/* Write a 16-dword surface descriptor for psf at the current pushbuf position
+ * and advance push->cur past it.
+ *
+ * push:   written directly through push->cur; no space check is done here,
+ *         so the caller presumably reserved 16 dwords beforehand (TODO confirm).
+ * psf:    surface to describe; may be NULL.
+ * screen: supplies lib_code->start, the base added to the per-format library
+ *         offset stored in info[12].
+ *
+ * A NULL surface, or one whose format has no nve4_su_format_map entry, yields
+ * a zeroed descriptor with placeholder values in info[0], info[1] and
+ * info[12]; the unsupported-format case also logs an error.
+ */
+void
+nve4_set_surface_info(struct nouveau_pushbuf *push,
+                      struct pipe_surface *psf,
+                      struct nvc0_screen *screen)
+{
+   struct nv50_surface *sf = nv50_surface(psf);
+   struct nv04_resource *res;
+   uint64_t address;
+   uint32_t *const info = push->cur; /* descriptor is built in place */
+   uint8_t log2cpp;
+
+   if (psf && !nve4_su_format_map[psf->format])
+      NOUVEAU_ERR("unsupported surface format, try is_format_supported() !\n");
+
+   push->cur += 16; /* claim the 16 descriptor dwords */
+
+   if (!psf || !nve4_su_format_map[psf->format]) {
+      memset(info, 0, 16 * sizeof(*info));
+
+      info[0] = 0xbadf0000;
+      info[1] = 0x80004000;
+      info[12] = nve4_suldp_lib_offset[PIPE_FORMAT_R32G32B32A32_UINT] +
+         screen->lib_code->start;
+      return;
+   }
+   res = nv04_resource(sf->base.texture);
+
+   address = res->address + sf->offset;
+
+   info[8] = sf->width;
+   info[9] = sf->height;
+   info[10] = sf->depth;
+   switch (res->base.target) { /* info[11]: per-target code, 0 if unlisted */
+   case PIPE_TEXTURE_1D_ARRAY:
+      info[11] = 1;
+      break;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+      info[11] = 2;
+      break;
+   case PIPE_TEXTURE_3D:
+      info[11] = 3;
+      break;
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      info[11] = 4;
+      break;
+   default:
+      info[11] = 0;
+      break;
+   }
+   log2cpp = (0xf000 & nve4_su_format_aux_map[sf->base.format]) >> 12;
+
+   info[12] = nve4_suldp_lib_offset[sf->base.format] + screen->lib_code->start;
+
+   /* limit in bytes for raw access */
+   info[13] = (0x06 << 22) | ((sf->width << log2cpp) - 1);
+
+   info[1] = nve4_su_format_map[sf->base.format];
+
+#if 0 /* disabled alternative: derive the size field from the block size */
+   switch (util_format_get_blocksizebits(sf->base.format)) {
+   case  16: info[1] |= 1 << 16; break;
+   case  32: info[1] |= 2 << 16; break;
+   case  64: info[1] |= 3 << 16; break;
+   case 128: info[1] |= 4 << 16; break;
+   default:
+      break;
+   }
+#else
+   info[1] |= log2cpp << 16;
+   info[1] |=  0x4000;
+   info[1] |= (0x0f00 & nve4_su_format_aux_map[sf->base.format]);
+#endif
+
+   if (res->base.target == PIPE_BUFFER) { /* buffers: info[3..7,14,15] = 0 */
+      info[0]  = address >> 8;
+      info[2]  = sf->width - 1;
+      info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22;
+      info[3]  = 0;
+      info[4]  = 0;
+      info[5]  = 0;
+      info[6]  = 0;
+      info[7]  = 0;
+      info[14] = 0;
+      info[15] = 0;
+   } else {
+      struct nv50_miptree *mt = nv50_miptree(&res->base);
+      struct nv50_miptree_level *lvl = &mt->level[sf->base.u.tex.level];
+      const unsigned z = sf->base.u.tex.first_layer;
+
+      if (z) { /* move the base address to the first layer / z slice */
+         if (mt->layout_3d) {
+            address += nvc0_mt_zslice_offset(mt, psf->u.tex.level, z);
+            /* doesn't work if z passes z-tile boundary */
+            assert(sf->depth == 1);
+         } else {
+            address += mt->layer_stride * z;
+         }
+      }
+      info[0]  = address >> 8;
+      info[2]  = sf->width - 1;
+      /* NOTE: this is really important: */
+      info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22;
+      info[3]  = (0x88 << 24) | (lvl->pitch / 64);
+      info[4]  = sf->height - 1;
+      info[4] |= (lvl->tile_mode & 0x0f0) << 25;
+      info[4] |= NVC0_TILE_SHIFT_Y(lvl->tile_mode) << 22;
+      info[5]  = mt->layer_stride >> 8;
+      info[6]  = sf->depth - 1;
+      info[6] |= (lvl->tile_mode & 0xf00) << 21;
+      info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22;
+      info[7]  = 0;
+      info[14] = mt->ms_x;
+      info[15] = mt->ms_y;
+   }
+}
+
+static INLINE void
+nvc0_update_surface_bindings(struct nvc0_context *nvc0)
+{
+   /* TODO: not implemented; reached from nvc0_validate_surfaces() when the
+    * 3D class is below NVE4_3D_CLASS.
+    */
+}
+
+static INLINE void
+nve4_update_surface_bindings(struct nvc0_context *nvc0)
+{
+   /* TODO: not implemented; reached from nvc0_validate_surfaces() when the
+    * 3D class is NVE4_3D_CLASS or above.
+    */
+}
+
+/* Surface state validation: pick the surface binding update routine that
+ * matches the screen's 3D class.
+ */
+void
+nvc0_validate_surfaces(struct nvc0_context *nvc0)
+{
+   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
+      nvc0_update_surface_bindings(nvc0);
+   else
+      nve4_update_surface_bindings(nvc0);
+}
+
+/* Surface format code for each pipe format, stored in the low bits of
+ * info[1] by nve4_set_surface_info(). Formats without an initializer are 0,
+ * which nve4_set_surface_info() treats as unsupported.
+ */
+static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = NVE4_IMAGE_FORMAT_RGBA32_FLOAT,
+   [PIPE_FORMAT_R32G32B32A32_SINT] = NVE4_IMAGE_FORMAT_RGBA32_SINT,
+   [PIPE_FORMAT_R32G32B32A32_UINT] = NVE4_IMAGE_FORMAT_RGBA32_UINT,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = NVE4_IMAGE_FORMAT_RGBA16_FLOAT,
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = NVE4_IMAGE_FORMAT_RGBA16_UNORM,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = NVE4_IMAGE_FORMAT_RGBA16_SNORM,
+   [PIPE_FORMAT_R16G16B16A16_SINT] = NVE4_IMAGE_FORMAT_RGBA16_SINT,
+   [PIPE_FORMAT_R16G16B16A16_UINT] = NVE4_IMAGE_FORMAT_RGBA16_UINT,
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = NVE4_IMAGE_FORMAT_RGBA8_UNORM,
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = NVE4_IMAGE_FORMAT_RGBA8_SNORM,
+   [PIPE_FORMAT_R8G8B8A8_SINT] = NVE4_IMAGE_FORMAT_RGBA8_SINT,
+   [PIPE_FORMAT_R8G8B8A8_UINT] = NVE4_IMAGE_FORMAT_RGBA8_UINT,
+   [PIPE_FORMAT_R11G11B10_FLOAT] = NVE4_IMAGE_FORMAT_R11G11B10_FLOAT,
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = NVE4_IMAGE_FORMAT_RGB10_A2_UNORM,
+/* [PIPE_FORMAT_R10G10B10A2_UINT] = NVE4_IMAGE_FORMAT_RGB10_A2_UINT, */
+   [PIPE_FORMAT_R32G32_FLOAT] = NVE4_IMAGE_FORMAT_RG32_FLOAT,
+   [PIPE_FORMAT_R32G32_SINT] = NVE4_IMAGE_FORMAT_RG32_SINT,
+   [PIPE_FORMAT_R32G32_UINT] = NVE4_IMAGE_FORMAT_RG32_UINT,
+   [PIPE_FORMAT_R16G16_FLOAT] = NVE4_IMAGE_FORMAT_RG16_FLOAT,
+   [PIPE_FORMAT_R16G16_UNORM] = NVE4_IMAGE_FORMAT_RG16_UNORM,
+   [PIPE_FORMAT_R16G16_SNORM] = NVE4_IMAGE_FORMAT_RG16_SNORM,
+   [PIPE_FORMAT_R16G16_SINT] = NVE4_IMAGE_FORMAT_RG16_SINT,
+   [PIPE_FORMAT_R16G16_UINT] = NVE4_IMAGE_FORMAT_RG16_UINT,
+   [PIPE_FORMAT_R8G8_UNORM] = NVE4_IMAGE_FORMAT_RG8_UNORM,
+   [PIPE_FORMAT_R8G8_SNORM] = NVE4_IMAGE_FORMAT_RG8_SNORM,
+   [PIPE_FORMAT_R8G8_SINT] = NVE4_IMAGE_FORMAT_RG8_SINT,
+   [PIPE_FORMAT_R8G8_UINT] = NVE4_IMAGE_FORMAT_RG8_UINT,
+   [PIPE_FORMAT_R32_FLOAT] = NVE4_IMAGE_FORMAT_R32_FLOAT,
+   [PIPE_FORMAT_R32_SINT] = NVE4_IMAGE_FORMAT_R32_SINT,
+   [PIPE_FORMAT_R32_UINT] = NVE4_IMAGE_FORMAT_R32_UINT,
+   [PIPE_FORMAT_R16_FLOAT] = NVE4_IMAGE_FORMAT_R16_FLOAT,
+   [PIPE_FORMAT_R16_UNORM] = NVE4_IMAGE_FORMAT_R16_UNORM,
+   [PIPE_FORMAT_R16_SNORM] = NVE4_IMAGE_FORMAT_R16_SNORM,
+   [PIPE_FORMAT_R16_SINT] = NVE4_IMAGE_FORMAT_R16_SINT,
+   [PIPE_FORMAT_R16_UINT] = NVE4_IMAGE_FORMAT_R16_UINT,
+   [PIPE_FORMAT_R8_UNORM] = NVE4_IMAGE_FORMAT_R8_UNORM,
+   [PIPE_FORMAT_R8_SNORM] = NVE4_IMAGE_FORMAT_R8_SNORM,
+   [PIPE_FORMAT_R8_SINT] = NVE4_IMAGE_FORMAT_R8_SINT,
+   [PIPE_FORMAT_R8_UINT] = NVE4_IMAGE_FORMAT_R8_UINT,
+};
+
+/* Auxiliary format description values for surface instructions.
+ * (log2(bytes per pixel) << 12) | (unk8 << 8) | unk22
+ *
+ * As consumed by nve4_set_surface_info():
+ *   bits 12-15: log2(bytes per pixel); OR'ed into info[1] at bit 16 and
+ *               used to scale the width for the raw byte limit in info[13]
+ *   bits  8-11: "unk8", OR'ed unshifted into info[1]
+ *   bits  0-7:  "unk22", shifted left by 22 into info[2]
+ */
+static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x4842,
+   [PIPE_FORMAT_R32G32B32A32_SINT] = 0x4842,
+   [PIPE_FORMAT_R32G32B32A32_UINT] = 0x4842,
+
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_SINT] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_UINT] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3933,
+
+   [PIPE_FORMAT_R32G32_FLOAT] = 0x3433,
+   [PIPE_FORMAT_R32G32_SINT] = 0x3433,
+   [PIPE_FORMAT_R32G32_UINT] = 0x3433,
+
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x2a24,
+/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x2a24, */
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_SINT] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_UINT] = 0x2a24,
+   [PIPE_FORMAT_R11G11B10_FLOAT] = 0x2a24,
+
+   [PIPE_FORMAT_R16G16_UNORM] = 0x2524,
+   [PIPE_FORMAT_R16G16_SNORM] = 0x2524,
+   [PIPE_FORMAT_R16G16_SINT] = 0x2524,
+   [PIPE_FORMAT_R16G16_UINT] = 0x2524,
+   [PIPE_FORMAT_R16G16_FLOAT] = 0x2524,
+
+   [PIPE_FORMAT_R32_SINT] = 0x2024,
+   [PIPE_FORMAT_R32_UINT] = 0x2024,
+   [PIPE_FORMAT_R32_FLOAT] = 0x2024,
+
+   [PIPE_FORMAT_R8G8_UNORM] = 0x1615,
+   [PIPE_FORMAT_R8G8_SNORM] = 0x1615,
+   [PIPE_FORMAT_R8G8_SINT] = 0x1615,
+   [PIPE_FORMAT_R8G8_UINT] = 0x1615,
+
+   [PIPE_FORMAT_R16_UNORM] = 0x1115,
+   [PIPE_FORMAT_R16_SNORM] = 0x1115,
+   [PIPE_FORMAT_R16_SINT] = 0x1115,
+   [PIPE_FORMAT_R16_UINT] = 0x1115,
+   [PIPE_FORMAT_R16_FLOAT] = 0x1115,
+
+   [PIPE_FORMAT_R8_UNORM] = 0x0206,
+   [PIPE_FORMAT_R8_SNORM] = 0x0206,
+   [PIPE_FORMAT_R8_SINT] = 0x0206,
+   [PIPE_FORMAT_R8_UINT] = 0x0206
+};
+
+/* NOTE: These are hardcoded offsets for the shader library.
+ * TODO: Automate them.
+ *
+ * nve4_set_surface_info() adds screen->lib_code->start to the entry and
+ * stores the sum in info[12]. Formats listed here but absent from
+ * nve4_su_format_map (B5G6R5_UNORM, B5G5R5X1_UNORM) are rejected by that
+ * function before this table is consulted for them.
+ */
+static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x218,
+   [PIPE_FORMAT_R32G32B32A32_SINT]  = 0x218,
+   [PIPE_FORMAT_R32G32B32A32_UINT]  = 0x218,
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x248,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x2b8,
+   [PIPE_FORMAT_R16G16B16A16_SINT]  = 0x330,
+   [PIPE_FORMAT_R16G16B16A16_UINT]  = 0x388,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3d8,
+   [PIPE_FORMAT_R32G32_FLOAT]       = 0x428,
+   [PIPE_FORMAT_R32G32_SINT]        = 0x468,
+   [PIPE_FORMAT_R32G32_UINT]        = 0x468,
+   [PIPE_FORMAT_R10G10B10A2_UNORM]  = 0x4a8,
+/* [PIPE_FORMAT_R10G10B10A2_UINT]   = 0x530, */
+   [PIPE_FORMAT_R8G8B8A8_UNORM]     = 0x588,
+   [PIPE_FORMAT_R8G8B8A8_SNORM]     = 0x5f8,
+   [PIPE_FORMAT_R8G8B8A8_SINT]      = 0x670,
+   [PIPE_FORMAT_R8G8B8A8_UINT]      = 0x6c8,
+   [PIPE_FORMAT_B5G6R5_UNORM]       = 0x718,
+   [PIPE_FORMAT_B5G5R5X1_UNORM]     = 0x7a0,
+   [PIPE_FORMAT_R16G16_UNORM]       = 0x828,
+   [PIPE_FORMAT_R16G16_SNORM]       = 0x890,
+   [PIPE_FORMAT_R16G16_SINT]        = 0x8f0,
+   [PIPE_FORMAT_R16G16_UINT]        = 0x948,
+   [PIPE_FORMAT_R16G16_FLOAT]       = 0x998,
+   [PIPE_FORMAT_R32_FLOAT]          = 0x9e8,
+   [PIPE_FORMAT_R32_SINT]           = 0xa30,
+   [PIPE_FORMAT_R32_UINT]           = 0xa30,
+   [PIPE_FORMAT_R8G8_UNORM]         = 0xa78,
+   [PIPE_FORMAT_R8G8_SNORM]         = 0xae0,
+   [PIPE_FORMAT_R8G8_UINT]          = 0xb48,
+   [PIPE_FORMAT_R8G8_SINT]          = 0xb98,
+   [PIPE_FORMAT_R16_UNORM]          = 0xbe8,
+   [PIPE_FORMAT_R16_SNORM]          = 0xc48,
+   [PIPE_FORMAT_R16_SINT]           = 0xca0,
+   [PIPE_FORMAT_R16_UINT]           = 0xce8,
+   [PIPE_FORMAT_R16_FLOAT]          = 0xd30,
+   [PIPE_FORMAT_R8_UNORM]           = 0xd88,
+   [PIPE_FORMAT_R8_SNORM]           = 0xde0,
+   [PIPE_FORMAT_R8_SINT]            = 0xe38,
+   [PIPE_FORMAT_R8_UINT]            = 0xe88,
+   [PIPE_FORMAT_R11G11B10_FLOAT]    = 0xed0
+};
diff --git a/src/gallium/drivers/nvc0/nve4_compute.c b/src/gallium/drivers/nvc0/nve4_compute.c
new file mode 100644 (file)
index 0000000..e823d21
--- /dev/null
@@ -0,0 +1,607 @@
+/*
+ * Copyright 2012 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Christoph Bumiller
+ */
+
+#include "nvc0_context.h"
+#include "nve4_compute.h"
+
+#include "nv50/codegen/nv50_ir_driver.h"
+
+/* Debug helper that prints a launch descriptor. It is defined at the end of
+ * this file and declared here because nve4_launch_grid() calls it earlier.
+ */
+static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
+
+
+/* Create the compute engine object on the screen's channel and emit its
+ * one-time setup methods into the given push buffer.
+ *
+ * This creates screen->compute, allocates screen->parm (NVE4_CP_PARAM_SIZE
+ * bytes of VRAM), programs the addresses of the screen's tls, text and txc
+ * buffers (which are dereferenced here and therefore must already exist),
+ * and uploads 64 bytes of MS sample coordinate offsets to
+ * NVE4_CP_INPUT_MS_OFFSETS inside screen->parm.
+ *
+ * Returns 0 on success, -1 if the chipset family has no known compute class,
+ * or the error code returned by nouveau_object_new() / nouveau_bo_new().
+ */
+int
+nve4_screen_compute_setup(struct nvc0_screen *screen,
+                          struct nouveau_pushbuf *push)
+{
+   struct nouveau_device *dev = screen->base.device;
+   struct nouveau_object *chan = screen->base.channel;
+   unsigned i;
+   int ret;
+   uint32_t obj_class;
+
+   switch (dev->chipset & 0xf0) {
+   case 0xf0:
+      obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
+      break;
+   case 0xe0:
+      obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
+      break;
+   default:
+      /* obj_class has no value on this path, so it must not reach
+       * nouveau_object_new() or the class comparisons below.
+       */
+      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+      return -1;
+   }
+
+   ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
+                            &screen->compute);
+   if (ret) {
+      NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
+      return ret;
+   }
+
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL,
+                        &screen->parm);
+   if (ret)
+      return ret;
+
+   BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->compute->oclass);
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->tls->offset);
+   PUSH_DATA (push, screen->tls->offset);
+   /* No idea why there are 2. Divide size by 2 to be safe.
+    * Actually this might be per-MP TEMP size and looks like I'm only using
+    * 2 MPs instead of all 8.
+    */
+   BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_SIZE_HIGH(0)), 3);
+   PUSH_DATAh(push, screen->tls_size / 2);
+   PUSH_DATA (push, screen->tls_size / 2);
+   PUSH_DATA (push, 0xff);
+   BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_SIZE_HIGH(1)), 3);
+   PUSH_DATAh(push, screen->tls_size / 2);
+   PUSH_DATA (push, screen->tls_size / 2);
+   PUSH_DATA (push, 0xff);
+
+   /* Unified address space ? Who needs that ? Certainly not OpenCL.
+    *
+    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
+    *  accessible. We cannot prevent that at the moment, so expect failure.
+    */
+   BEGIN_NVC0(push, NVE4_COMPUTE(LOCAL_BASE), 1);
+   PUSH_DATA (push, 1 << 24);
+   BEGIN_NVC0(push, NVE4_COMPUTE(SHARED_BASE), 1);
+   PUSH_DATA (push, 2 << 24);
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(CODE_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->text->offset);
+   PUSH_DATA (push, screen->text->offset);
+
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x0310), 1);
+   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);
+
+   /* NOTE: these do not affect the state used by the 3D object */
+   BEGIN_NVC0(push, NVE4_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
+   /* The TSC table is placed 64 KiB into the same txc buffer. */
+   BEGIN_NVC0(push, NVE4_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
+
+   if (obj_class >= NVF0_COMPUTE_CLASS) {
+      BEGIN_NVC0(push, SUBC_COMPUTE(0x0248), 1);
+      PUSH_DATA (push, 0x100);
+      BEGIN_NIC0(push, SUBC_COMPUTE(0x0248), 63);
+      for (i = 63; i >= 1; --i)
+         PUSH_DATA(push, 0x38000 | i);
+      IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
+      IMMED_NVC0(push, SUBC_COMPUTE(0x518), 0);
+   }
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(TEX_CB_INDEX), 1);
+   PUSH_DATA (push, 0); /* does not interfere with 3D */
+
+   if (obj_class >= NVF0_COMPUTE_CLASS)
+      IMMED_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+
+   /* MS sample coordinate offsets: these do not work with _ALT modes ! */
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+   PUSH_DATA (push, 64);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL);
+   /* 1 control word followed by 8 pairs of offsets (16 words = 64 bytes) */
+   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA);
+   PUSH_DATA (push, 0); /* 0 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1); /* 1 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0); /* 2 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 1); /* 3 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 2); /* 4 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 3); /* 5 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 2); /* 6 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 3); /* 7 */
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+
+   return 0;
+}
+
+
+/* Reference the resource behind a bound compute surface in bufctx_cp,
+ * requesting write access only if the surface is marked writable.
+ */
+static void
+nve4_compute_ref_surface(struct nvc0_context *nvc0, struct nv50_surface *sf)
+{
+   struct nv04_resource *res = nv04_resource(sf->base.texture);
+
+   if (!sf->base.writable) {
+      BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
+   } else {
+      BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
+   }
+}
+
+/* Upload the surface info of every dirty surface slot into the parameter
+ * buffer, re-reference all bound surfaces in bufctx_cp and clear the dirty
+ * mask.
+ */
+static void
+nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
+{
+   const unsigned t = 1;
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint32_t pending;
+
+   for (pending = nvc0->surfaces_dirty[t]; pending; ) {
+      const unsigned slot = ffs(pending) - 1;
+      struct nv50_surface *sf;
+
+      pending &= ~(1 << slot);
+
+      /*
+       * NVE4's surface load/store instructions receive all the information
+       * directly instead of via binding points, so we have to supply them.
+       */
+      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(slot));
+      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(slot));
+      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+      PUSH_DATA (push, 64);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL);
+      BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA);
+
+      nve4_set_surface_info(push, nvc0->surfaces[t][slot], screen);
+
+      /* The slot may be dirty because it was unbound. */
+      sf = nv50_surface(nvc0->surfaces[t][slot]);
+      if (sf)
+         nve4_compute_ref_surface(nvc0, sf);
+   }
+
+   /* One flush covers all uploads done above. */
+   if (nvc0->surfaces_dirty[t]) {
+      BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+      PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+   }
+
+   /* re-reference non-dirty surfaces */
+   for (pending = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t];
+        pending; ) {
+      const unsigned slot = ffs(pending) - 1;
+
+      pending &= ~(1 << slot);
+
+      nve4_compute_ref_surface(nvc0, nv50_surface(nvc0->surfaces[t][slot]));
+   }
+
+   nvc0->surfaces_dirty[t] = 0;
+}
+
+
+/* Thankfully, textures with samplers follow the normal rules: the samplers
+ * of stage 5 go through nve4_validate_tsc(), and a TSC flush is emitted only
+ * if that call asks for one.
+ */
+static void
+nve4_compute_validate_samplers(struct nvc0_context *nvc0)
+{
+   if (!nve4_validate_tsc(nvc0, 5))
+      return;
+
+   BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1);
+   PUSH_DATA (nvc0->base.pushbuf, 0);
+}
+/* Texture validation for compute. The definition is at the bottom of this
+ * file; it is forward-declared because nve4_compute_state_validate() uses it.
+ *
+ * (Code duplicated at bottom for various non-convincing reasons.
+ *  E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
+ *  entries to avoid a subchannel switch.
+ *  Same for texture cache flushes.
+ *  Also, the bufctx differs, and more IFs in the 3D version looks ugly.)
+ */
+static void nve4_compute_validate_textures(struct nvc0_context *);
+
+/* Upload texture/sampler handle words for the compute stage into the
+ * parameter buffer. One upload covers the contiguous slot range from the
+ * lowest to the highest dirty texture or sampler slot; both dirty masks are
+ * cleared afterwards. Does nothing if no slot is dirty.
+ */
+static void
+nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
+{
+   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
+   const uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned first, count;
+   uint64_t address;
+
+   if (dirty == 0)
+      return;
+
+   first = ffs(dirty) - 1;                    /* lowest dirty slot */
+   count = util_logbase2(dirty) + 1 - first;  /* up to the highest one */
+   assert(count);
+
+   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(first);
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address);
+   PUSH_DATA (push, address);
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+   PUSH_DATA (push, count * 4);
+   PUSH_DATA (push, 0x1);
+
+   /* 1 control word followed by one 32-bit handle per slot */
+   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + count);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA);
+   PUSH_DATAp(push, &nvc0->tex_handles[s][first], count);
+
+   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+
+   nvc0->samplers_dirty[s] = 0;
+   nvc0->textures_dirty[s] = 0;
+}
+
+
+/* Make sure the bound compute program is translated and its code uploaded.
+ *
+ * Returns TRUE if the program already has memory assigned (prog->mem) or was
+ * uploaded successfully; FALSE if translation fails, the translated program
+ * has no code, or the upload fails.
+ */
+static boolean
+nve4_compute_validate_program(struct nvc0_context *nvc0)
+{
+   struct nvc0_program *prog = nvc0->compprog;
+   struct nouveau_pushbuf *push;
+
+   if (prog->mem)
+      return TRUE;
+
+   if (!prog->translated) {
+      const unsigned chipset = nvc0->screen->base.device->chipset;
+
+      prog->translated = nvc0_program_translate(prog, chipset);
+      if (!prog->translated)
+         return FALSE;
+   }
+
+   if (unlikely(!prog->code_size))
+      return FALSE;
+
+   if (!nvc0_program_upload_code(nvc0, prog))
+      return FALSE;
+
+   /* New code was written, so flush the code cache. */
+   push = nvc0->base.pushbuf;
+   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CODE);
+
+   return TRUE;
+}
+
+
+/* Validate all compute state flagged in nvc0->dirty_cp, then validate the
+ * push buffer against the compute buffer context.
+ *
+ * Returns FALSE if the program cannot be validated or push buffer validation
+ * fails, TRUE otherwise.
+ */
+static boolean
+nve4_compute_state_validate(struct nvc0_context *nvc0)
+{
+   if (!nve4_compute_validate_program(nvc0)) {
+      return FALSE;
+   }
+
+   if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) {
+      nve4_compute_validate_textures(nvc0);
+   }
+   if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) {
+      nve4_compute_validate_samplers(nvc0);
+   }
+   /* Handles combine texture and sampler state, so either change counts. */
+   if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS)) {
+      nve4_compute_set_tex_handles(nvc0);
+   }
+   if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES) {
+      nve4_compute_validate_surfaces(nvc0);
+   }
+   if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) {
+      nvc0_validate_global_residents(nvc0,
+                                     nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
+   }
+
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+
+   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
+   if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) {
+      return FALSE;
+   }
+
+   /* Fence again if state.flushed is set at this point. */
+   if (unlikely(nvc0->state.flushed)) {
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+   }
+
+   return TRUE;
+}
+
+
+/* Copy the kernel input to the start of the parameter buffer.
+ *
+ * cp->parm_size bytes (parm_size / 4 words) are read from input; nothing is
+ * emitted if the bound compute program has a parm_size of 0.
+ */
+static void
+nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_program *cp = nvc0->compprog;
+
+   if (cp->parm_size) {
+      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, screen->parm->offset);
+      PUSH_DATA (push, screen->parm->offset);
+
+      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+      PUSH_DATA (push, cp->parm_size);
+      PUSH_DATA (push, 0x1);
+
+      /* 1 control word followed by the input data */
+      BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA);
+      PUSH_DATAp(push, input, cp->parm_size / 4);
+
+      BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+      PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+   }
+}
+
+/* Pick the smallest shared-memory/L1 split that still holds shared_size
+ * bytes of shared memory: 16K, 32K or 48K shared. The context argument is
+ * not used.
+ */
+static INLINE uint8_t
+nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
+{
+   if (shared_size <= (16 << 10))
+      return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
+   if (shared_size <= (32 << 10))
+      return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
+   return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
+}
+
+/* Fill in a launch descriptor for the bound compute program.
+ *
+ * desc          descriptor to initialize (fully overwritten)
+ * label         symbol whose offset within the program becomes the entry
+ * block_layout  3 block dimensions (x, y, z)
+ * grid_layout   3 grid dimensions (x, y, z)
+ *
+ * Constant buffer slot 0 is bound to the screen's parameter buffer; the
+ * context's constant buffers 0..6 of stage 5 go to slots 1..7.
+ */
+static void
+nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
+                               struct nve4_cp_launch_desc *desc,
+                               uint32_t label,
+                               const uint *block_layout,
+                               const uint *grid_layout)
+{
+   const unsigned s = 5;
+   const struct nvc0_program *cp = nvc0->compprog;
+   const struct nvc0_screen *screen = nvc0->screen;
+   unsigned slot;
+
+   nve4_cp_launch_desc_init_default(desc);
+
+   desc->entry = nvc0_program_symbol_offset(cp, label);
+
+   desc->blockdim_x = block_layout[0];
+   desc->blockdim_y = block_layout[1];
+   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = grid_layout[0];
+   desc->griddim_y = grid_layout[1];
+   desc->griddim_z = grid_layout[2];
+
+   desc->gpr_alloc = cp->num_gprs;
+   desc->bar_alloc = cp->num_barriers;
+
+   desc->cstack_size = 0x800;
+   desc->local_size_n = 0;
+   desc->local_size_p = align(cp->cp.lmem_size, 0x10);
+   desc->shared_size = align(cp->cp.smem_size, 0x100);
+   desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
+
+   /* Slot 0 is reserved for the input space, so user buffers start at 1. */
+   for (slot = 1; slot < 8; ++slot) {
+      if (nvc0->constbuf[s][slot - 1].u.buf)
+         nve4_cp_launch_desc_set_ctx_cb(desc, slot,
+                                        &nvc0->constbuf[s][slot - 1]);
+   }
+   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+}
+
+/* Get scratch memory for one launch descriptor.
+ *
+ * 512 bytes are requested so that the returned CPU pointer and *pgpuaddr
+ * can both be advanced to the next 256-byte boundary of the GPU address.
+ * Returns NULL if no scratch memory is available.
+ */
+static INLINE struct nve4_cp_launch_desc *
+nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
+                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
+{
+   uint8_t *cpu = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
+   unsigned misalign;
+
+   if (cpu == NULL)
+      return NULL;
+
+   misalign = *pgpuaddr & 255;
+   if (misalign != 0) {
+      const unsigned pad = 256 - misalign;
+
+      cpu += pad;
+      *pgpuaddr += pad;
+   }
+   return (struct nve4_cp_launch_desc *)cpu;
+}
+
+void
+nve4_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label,
+                 const void *input)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nve4_cp_launch_desc *desc;
+   uint64_t desc_gpuaddr;
+   struct nouveau_bo *desc_bo;
+   int ret;
+
+   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
+   if (!desc)
+      goto out;
+   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
+                desc_bo);
+
+   ret = !nve4_compute_state_validate(nvc0);
+   if (ret)
+      goto out;
+
+   nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
+   nve4_compute_dump_launch_desc(desc);
+
+   nve4_compute_upload_input(nvc0, input);
+
+   /* upload descriptor and flush */
+#if 0
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, desc_gpuaddr);
+   PUSH_DATA (push, desc_gpuaddr);
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+   PUSH_DATA (push, 256);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL);
+   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (256 / 4));
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DESC);
+   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
+#endif
+   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH_DESC_ADDRESS), 1);
+   PUSH_DATA (push, desc_gpuaddr >> 8);
+   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0x3);
+   BEGIN_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+out:
+   if (ret)
+      NOUVEAU_ERR("Failed to launch grid !\n");
+   nouveau_scratch_done(&nvc0->base);
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
+}
+
+
+/* Bits set in a texture handle to mark it as having no valid TIC entry. */
+#define NVE4_TIC_ENTRY_INVALID 0x000fffff
+
+/* Validate the textures bound to the compute stage (stage index 5).
+ *
+ * For every bound texture this uploads its TIC entry if it has no id yet,
+ * locks the entry, updates the resource's GPU read/write status and stores
+ * the TIC id in the slot's texture handle. Unbound slots get their handle
+ * marked invalid. TIC flush and texture cache control methods collected on
+ * the way are emitted at the end.
+ */
+static void
+nve4_compute_validate_textures(struct nvc0_context *nvc0)
+{
+   struct nouveau_bo *txc = nvc0->screen->txc;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const unsigned s = 5;
+   unsigned i;
+   /* commands[0]: TIC_FLUSH data, commands[1]: TEX_CACHE_CTL data */
+   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   unsigned n[2] = { 0, 0 };
+
+   for (i = 0; i < nvc0->num_textures[s]; ++i) {
+      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
+      struct nv04_resource *res;
+      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+
+      if (!tic) {
+         /* nothing bound in this slot */
+         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+         continue;
+      }
+      res = nv04_resource(tic->pipe.texture);
+
+      if (tic->id < 0) {
+         /* No TIC slot assigned yet: allocate one and upload the entry. */
+         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
+
+         PUSH_SPACE(push, 16);
+         /* Each TIC entry is 32 bytes (8 words) inside the txc buffer. */
+         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, txc->offset + (tic->id * 32));
+         PUSH_DATA (push, txc->offset + (tic->id * 32));
+         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+         PUSH_DATA (push, 32);
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL);
+         BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 9);
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA);
+         PUSH_DATAp(push, &tic->tic[0], 8);
+
+         /* entry index in bits 4 and up, bit 0 set */
+         commands[0][n[0]++] = (tic->id << 4) | 1;
+      } else
+      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+         /* Entry already uploaded, but the resource was being written. */
+         commands[1][n[1]++] = (tic->id << 4) | 1;
+      }
+      /* Mark the TIC id as locked in the screen's bitmap. */
+      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      res->status |=  NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+      /* Replace the handle's TIC bits with the entry's id. */
+      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
+      nvc0->tex_handles[s][i] |= tic->id;
+      if (dirty)
+         BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
+   }
+   /* Slots that were in use at the previous validation but are not now. */
+   for (; i < nvc0->state.num_textures[s]; ++i)
+      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+
+   if (n[0]) {
+      BEGIN_NIC0(push, NVE4_COMPUTE(TIC_FLUSH), n[0]);
+      PUSH_DATAp(push, commands[0], n[0]);
+   }
+   if (n[1]) {
+      BEGIN_NIC0(push, NVE4_COMPUTE(TEX_CACHE_CTL), n[1]);
+      PUSH_DATAp(push, commands[1], n[1]);
+   }
+
+   nvc0->state.num_textures[s] = nvc0->num_textures[s];
+}
+
+
+/* Return a printable name for a cache split value, or "(invalid)" if the
+ * value is none of the three known splits.
+ */
+static const char *nve4_cache_split_name(unsigned value)
+{
+   if (value == NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1)
+      return "16K_SHARED_48K_L1";
+   if (value == NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1)
+      return "32K_SHARED_32K_L1";
+   if (value == NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1)
+      return "48K_SHARED_16K_L1";
+
+   return "(invalid)";
+}
+
+/* Print a launch descriptor via debug_printf: first the raw non-zero words
+ * (each run of zero words is collapsed into a single "..." line), then the
+ * decoded fields and every constant buffer slot that is valid or non-empty.
+ */
+static void
+nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
+{
+   const uint32_t *words = (const uint32_t *)desc;
+   boolean in_zero_run = FALSE;
+   unsigned offset, slot;
+
+   debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
+
+   for (offset = 0; offset < sizeof(*desc); offset += 4) {
+      const uint32_t word = words[offset / 4];
+
+      if (word != 0) {
+         debug_printf("[%x]: 0x%08x\n", offset, word);
+         in_zero_run = FALSE;
+      } else if (!in_zero_run) {
+         debug_printf("...\n");
+         in_zero_run = TRUE;
+      }
+   }
+
+   debug_printf("entry = 0x%x\n", desc->entry);
+   debug_printf("grid dimensions = %ux%ux%u\n",
+                desc->griddim_x, desc->griddim_y, desc->griddim_z);
+   debug_printf("block dimensions = %ux%ux%u\n",
+                desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
+   debug_printf("s[] size: 0x%x\n", desc->shared_size);
+   debug_printf("l[] size: -0x%x / +0x%x\n",
+                desc->local_size_n, desc->local_size_p);
+   debug_printf("stack size: 0x%x\n", desc->cstack_size);
+   debug_printf("barrier count: %u\n", desc->bar_alloc);
+   debug_printf("$r count: %u\n", desc->gpr_alloc);
+   debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split));
+
+   for (slot = 0; slot < 8; ++slot) {
+      const boolean valid = !!(desc->cb_mask & (1 << slot));
+      const uint32_t size = desc->cb[slot].size;
+      const uint64_t address =
+         ((uint64_t)desc->cb[slot].address_h << 32) | desc->cb[slot].address_l;
+
+      /* Only slots that are enabled or carry stale data are interesting. */
+      if (valid || address || size)
+         debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
+                      slot, address, size, valid ? "" : "  (invalid)");
+   }
+}
+
diff --git a/src/gallium/drivers/nvc0/nve4_compute.h b/src/gallium/drivers/nvc0/nve4_compute.h
new file mode 100644 (file)
index 0000000..82a7748
--- /dev/null
@@ -0,0 +1,110 @@
+
+#ifndef NVE4_COMPUTE_H
+#define NVE4_COMPUTE_H
+
+#include "nv50/nv50_defs.xml.h"
+#include "nve4_compute.xml.h"
+
+/* Input space is implemented as c0[], to which we bind the screen->parm bo.
+ *
+ * Byte offsets within that buffer, as implied by the definitions below:
+ *   0x0000 .. 0x0fff  user input (NVE4_CP_INPUT_USER up to _USER_LIMIT)
+ *   0x1020 .. 0x109f  texture handles: TEX_MAX entries of TEX_STRIDE bytes
+ *   0x10c0 .. 0x10ff  MS sample offsets, up to the first surface entry
+ *   0x1100 .. 0x18ff  surface info: SUF_MAX entries of SUF_STRIDE bytes
+ * NVE4_CP_INPUT_SIZE equals the end of the surface area (0x1900);
+ * NVE4_CP_PARAM_SIZE is larger (0x2000).
+ */
+#define NVE4_CP_INPUT_USER        0x0000
+#define NVE4_CP_INPUT_USER_LIMIT  0x1000
+#define NVE4_CP_INPUT_TEX(i)     (0x1020 + (i) * 4)
+#define NVE4_CP_INPUT_TEX_STRIDE  4
+#define NVE4_CP_INPUT_TEX_MAX     32
+#define NVE4_CP_INPUT_MS_OFFSETS  0x10c0
+#define NVE4_CP_INPUT_SUF_STRIDE  64
+#define NVE4_CP_INPUT_SUF(i)     (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
+#define NVE4_CP_INPUT_SUF_MAX     32
+#define NVE4_CP_INPUT_SIZE        0x1900
+#define NVE4_CP_PARAM_SIZE        0x2000
+
+/* Launch descriptor read by the compute engine when a grid is launched.
+ *
+ * The members add up to 64 32-bit words (0x100 bytes). The numbers in the
+ * unkN names are the index of the 32-bit word the member starts in; for
+ * unkN_B, B is the bit position within that word.
+ * NOTE(review): the byte offsets given below assume the compiler packs the
+ * u16 members and bit-fields without padding and allocates bit-fields from
+ * the least significant bit - confirm for the target ABI.
+ */
+struct nve4_cp_launch_desc
+{
+   u32 unk0[8];
+   u32 entry;             /* 0x20: code offset of the entry point */
+   u32 unk9[3];
+   u32 griddim_x    : 31; /* 0x30 */
+   u32 unk12        : 1;
+   u16 griddim_y;         /* 0x34 */
+   u16 griddim_z;
+   u32 unk14[3];
+   u16 shared_size; /* must be aligned to 0x100 */
+   u16 unk15;
+   u16 unk16;
+   u16 blockdim_x;
+   u16 blockdim_y;
+   u16 blockdim_z;
+   u32 cb_mask      : 8;  /* bit i set: cb[i] is valid */
+   u32 unk20_8      : 21;
+   u32 cache_split  : 2;
+   u32 unk20_31     : 1;
+   u32 unk21[8];
+   struct {
+      u32 address_l;      /* low 32 bits of the buffer address */
+      u32 address_h : 8;  /* address bits 32 and up */
+      u32 reserved  : 7;
+      u32 size      : 17;
+   } cb[8];
+   u32 local_size_p : 20;
+   u32 unk45_20     : 7;
+   u32 bar_alloc    : 5;  /* number of barriers */
+   u32 local_size_n : 20;
+   u32 unk46_20     : 4;
+   u32 gpr_alloc    : 8;  /* number of registers */
+   u32 cstack_size  : 20;
+   u32 unk47_20     : 12;
+   u32 unk48[16];
+};
+
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA 0x41
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DESC 0x11
+#define NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL   0x1
+
+/* Zero the whole descriptor, then set the three fields that carry fixed
+ * non-zero values whose meaning is unknown.
+ */
+static INLINE void
+nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
+{
+   memset(desc, 0, sizeof *desc);
+
+   desc->unk47_20 = 0x300;
+   desc->unk9[2]  = 0x44014000;
+   desc->unk0[7]  = 0xbc000000;
+}
+
+/* Bind a range of a buffer object to a constant buffer slot of the launch
+ * descriptor and mark the slot valid in cb_mask.
+ *
+ * index  constant buffer slot, 0..7
+ * bo     buffer object holding the data
+ * base   byte offset into bo; must be a multiple of 0x100
+ * size   size in bytes, at most 65536
+ *
+ * size is 32 bits wide so that the maximum of 65536 reaches the 17-bit
+ * descriptor field; a 16-bit parameter would wrap that value to 0 and make
+ * the range assertion meaningless.
+ */
+static INLINE void
+nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
+                           unsigned index,
+                           struct nouveau_bo *bo,
+                           uint32_t base, uint32_t size)
+{
+   uint64_t address = bo->offset + base;
+
+   assert(index < 8);
+   assert(!(base & 0xff));
+   assert(size <= 65536);
+
+   /* The address is split into its low 32 bits and the bits above. */
+   desc->cb[index].address_l = address;
+   desc->cb[index].address_h = address >> 32;
+   desc->cb[index].size = size;
+
+   desc->cb_mask |= 1 << index;
+}
+
+/* Bind a context constant buffer to a descriptor slot, or mark the slot
+ * invalid in cb_mask if the constant buffer has no resource attached.
+ * User constant buffers (cb->user) are not expected here.
+ */
+static INLINE void
+nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
+                               unsigned index,
+                               const struct nvc0_constbuf *cb)
+{
+   assert(index < 8);
+
+   if (cb->u.buf) {
+      const struct nv04_resource *res = nv04_resource(cb->u.buf);
+
+      assert(!cb->user);
+      /* The range starts cb->offset bytes into the resource's bo range. */
+      nve4_cp_launch_desc_set_cb(desc, index,
+                                 res->bo, res->offset + cb->offset, cb->size);
+   } else {
+      desc->cb_mask &= ~(1 << index);
+   }
+}
+
+#endif /* NVE4_COMPUTE_H */
diff --git a/src/gallium/drivers/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nvc0/nve4_compute.xml.h
new file mode 100644 (file)
index 0000000..e513ae7
--- /dev/null
@@ -0,0 +1,269 @@
+#ifndef NVE4_COMPUTE_XML
+#define NVE4_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nve4_compute.xml (   6352 bytes, from 2013-03-10 14:59:45)
+- copyright.xml    (   6452 bytes, from 2011-08-11 18:25:12)
+- nvchipsets.xml   (   3870 bytes, from 2013-03-08 12:41:50)
+- nv_object.xml    (  13238 bytes, from 2013-02-07 16:35:34)
+- nv_defs.xml      (   4437 bytes, from 2011-08-11 18:25:12)
+- nv50_defs.xml    (   7783 bytes, from 2013-03-08 12:42:29)
+
+Copyright (C) 2006-2013 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Kościelnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVE4_COMPUTE_UPLOAD_SIZE                               0x00000180
+
+#define NVE4_COMPUTE_UPLOAD_UNK0184                            0x00000184
+
+#define NVE4_COMPUTE_UPLOAD_ADDRESS_HIGH                       0x00000188
+
+#define NVE4_COMPUTE_UPLOAD_ADDRESS_LOW                                0x0000018c
+
+#define NVE4_COMPUTE_UNK01A0                                   0x000001a0
+
+#define NVE4_COMPUTE_UNK01A4                                   0x000001a4
+
+#define NVE4_COMPUTE_UNK01A8                                   0x000001a8
+
+#define NVE4_COMPUTE_UNK01AC                                   0x000001ac
+
+#define NVE4_COMPUTE_UPLOAD_EXEC                               0x000001b0
+
+#define NVE4_COMPUTE_UPLOAD_DATA                               0x000001b4
+
+#define NVE4_COMPUTE_SHARED_BASE                               0x00000214
+
+#define NVE4_COMPUTE_MEM_BARRIER                               0x0000021c
+
+#define NVE4_COMPUTE_UNK0280                                   0x00000280
+
+#define NVE4_COMPUTE_UNK02B0                                   0x000002b0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS                       0x000002b4
+#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS__SHR                  8
+
+#define NVE4_COMPUTE_UNK02B8                                   0x000002b8
+
+#define NVE4_COMPUTE_LAUNCH                                    0x000002bc
+
+#define NVE4_COMPUTE_TEMP_SIZE(i0)                            (0x000002e4 + 0xc*(i0))
+#define NVE4_COMPUTE_TEMP_SIZE__ESIZE                          0x0000000c
+#define NVE4_COMPUTE_TEMP_SIZE__LEN                            0x00000002
+
+#define NVE4_COMPUTE_TEMP_SIZE_HIGH(i0)                               (0x000002e4 + 0xc*(i0))
+
+#define NVE4_COMPUTE_TEMP_SIZE_LOW(i0)                        (0x000002e8 + 0xc*(i0))
+
+#define NVE4_COMPUTE_TEMP_SIZE_MASK(i0)                               (0x000002ec + 0xc*(i0))
+
+#define NVE4_COMPUTE_UNK0310                                   0x00000310
+
+#define NVE4_COMPUTE_LOCAL_BASE                                        0x0000077c
+
+#define NVE4_COMPUTE_TEMP_ADDRESS_HIGH                         0x00000790
+
+#define NVE4_COMPUTE_TEMP_ADDRESS_LOW                          0x00000794
+
+#define NVE4_COMPUTE_WATCHDOG_TIMER                            0x00000de4
+
+#define NVE4_COMPUTE_LINKED_TSC                                        0x00001234
+
+#define NVE4_COMPUTE_TSC_FLUSH                                 0x00001330
+#define NVE4_COMPUTE_TSC_FLUSH_SPECIFIC                                0x00000001
+#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__MASK                     0x03fffff0
+#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__SHIFT                    4
+
+#define NVE4_COMPUTE_TIC_FLUSH                                 0x00001334
+#define NVE4_COMPUTE_TIC_FLUSH_SPECIFIC                                0x00000001
+#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__MASK                     0x03fffff0
+#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__SHIFT                    4
+
+#define NVE4_COMPUTE_TEX_CACHE_CTL                             0x00001338
+#define NVE4_COMPUTE_TEX_CACHE_CTL_UNK0__MASK                  0x00000007
+#define NVE4_COMPUTE_TEX_CACHE_CTL_UNK0__SHIFT                 0
+#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__MASK                 0x03fffff0
+#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__SHIFT                        4
+
+#define NVE4_COMPUTE_COND_ADDRESS_HIGH                         0x00001550
+
+#define NVE4_COMPUTE_COND_ADDRESS_LOW                          0x00001554
+
+#define NVE4_COMPUTE_COND_MODE                                 0x00001558
+#define NVE4_COMPUTE_COND_MODE_NEVER                           0x00000000
+#define NVE4_COMPUTE_COND_MODE_ALWAYS                          0x00000001
+#define NVE4_COMPUTE_COND_MODE_RES_NON_ZERO                    0x00000002
+#define NVE4_COMPUTE_COND_MODE_EQUAL                           0x00000003
+#define NVE4_COMPUTE_COND_MODE_NOT_EQUAL                       0x00000004
+
+#define NVE4_COMPUTE_TSC_ADDRESS_HIGH                          0x0000155c
+
+#define NVE4_COMPUTE_TSC_ADDRESS_LOW                           0x00001560
+
+#define NVE4_COMPUTE_TSC_LIMIT                                 0x00001564
+
+#define NVE4_COMPUTE_TIC_ADDRESS_HIGH                          0x00001574
+
+#define NVE4_COMPUTE_TIC_ADDRESS_LOW                           0x00001578
+
+#define NVE4_COMPUTE_TIC_LIMIT                                 0x0000157c
+
+#define NVE4_COMPUTE_CODE_ADDRESS_HIGH                         0x00001608 /* HIGH/LOW pair in consecutive 32-bit slots; presumably the two halves of the code segment address - confirm */
+
+#define NVE4_COMPUTE_CODE_ADDRESS_LOW                          0x0000160c
+
+#define NVE4_COMPUTE_FLUSH                                     0x00001698 /* the four flags below are distinct single bits and can be OR-ed together */
+#define NVE4_COMPUTE_FLUSH_CODE                                        0x00000001 /* bit 0 */
+#define NVE4_COMPUTE_FLUSH_GLOBAL                              0x00000010 /* bit 4 */
+#define NVE4_COMPUTE_FLUSH_UNK8                                        0x00000100 /* bit 8; the name marks its meaning as unknown */
+#define NVE4_COMPUTE_FLUSH_CB                                  0x00001000 /* bit 12 */
+
+#define NVE4_COMPUTE_QUERY_ADDRESS_HIGH                                0x00001b00 /* ADDRESS_HIGH, ADDRESS_LOW, SEQUENCE and GET fill four consecutive 32-bit slots (0x1b00..0x1b0c) */
+
+#define NVE4_COMPUTE_QUERY_ADDRESS_LOW                         0x00001b04
+
+#define NVE4_COMPUTE_QUERY_SEQUENCE                            0x00001b08
+
+#define NVE4_COMPUTE_QUERY_GET                                 0x00001b0c /* the lines below describe fields and flags of this word */
+#define NVE4_COMPUTE_QUERY_GET_MODE__MASK                      0x00000003 /* MODE field occupies bits 0..1 */
+#define NVE4_COMPUTE_QUERY_GET_MODE__SHIFT                     0
+#define NVE4_COMPUTE_QUERY_GET_MODE_WRITE                      0x00000000 /* MODE value 0 */
+#define NVE4_COMPUTE_QUERY_GET_MODE_WRITE_INTR_NRHOST          0x00000003 /* MODE value 3; values 1 and 2 have no name in this header */
+#define NVE4_COMPUTE_QUERY_GET_INTR                            0x00100000 /* single-bit flag, bit 20 */
+#define NVE4_COMPUTE_QUERY_GET_SHORT                           0x10000000 /* single-bit flag, bit 28 */
+
+#define NVE4_COMPUTE_TEX_CB_INDEX                              0x00002608 /* NOTE(review): no fields are defined for this word; by its name it presumably selects a constant-buffer slot - confirm */
+
+#define NVE4_COMPUTE_UNK260c                                   0x0000260c /* unnamed word at 0x260c; the UNK prefix marks its purpose as unknown */
+
+#define NVE4_COMPUTE_LAUNCH_DESC__SIZE                         0x00000100 /* 0x100 = 256; LAUNCH_DESC_* values are offsets inside this structure, presumably in bytes - confirm */
+#define NVE4_COMPUTE_LAUNCH_DESC_PROG_START                    0x00000020 /* offset 0x20, i.e. 32-bit word index 8 if offsets are bytes */
+
+#define NVE4_COMPUTE_LAUNCH_DESC_12                            0x00000030 /* offset 0x30 = 4 * 12: the numeric suffix is the 32-bit word index */
+#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__MASK            0x7fffffff /* GRIDDIM_X occupies bits 0..30; bit 31 is not part of the field */
+#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__SHIFT           0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ                    0x00000034 /* word immediately after DESC_12; packs two 16-bit fields */
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__MASK            0x0000ffff /* Y occupies bits 0..15 */
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__SHIFT           0
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__MASK            0xffff0000 /* Z occupies bits 16..31 */
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__SHIFT           16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_17                            0x00000044 /* offset 0x44 = 4 * 17: the numeric suffix is the 32-bit word index */
+#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__MASK         0x0000ffff /* SHARED_ALLOC occupies bits 0..15; NOTE(review): unit of the allocation is not stated here */
+#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__SHIFT                0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_18                            0x00000048 /* offset 0x48 = 4 * 18: the numeric suffix is the 32-bit word index */
+#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__MASK           0xffff0000 /* BLOCKDIM_X occupies the upper half, bits 16..31; bits 0..15 have no field defined here */
+#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__SHIFT          16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ                   0x0000004c /* word immediately after DESC_18; packs two 16-bit fields */
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__MASK           0x0000ffff /* Y occupies bits 0..15 */
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__SHIFT          0
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__MASK           0xffff0000 /* Z occupies bits 16..31 */
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__SHIFT          16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_20                            0x00000050 /* offset 0x50 = 4 * 20: the numeric suffix is the 32-bit word index */
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__MASK             0x000000ff /* CB_VALID occupies bits 0..7; presumably one bit per CB_CONFIG slot (8 slots) - confirm */
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__SHIFT            0
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__MASK          0x60000000 /* CACHE_SPLIT is a 2-bit field at bits 29..30 */
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__SHIFT         29
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_16K_SHARED_48K_L1      0x20000000 /* field value 1, already shifted (1 << 29) */
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_32K_SHARED_32K_L1      0x40000000 /* field value 2, already shifted (2 << 29) */
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_48K_SHARED_16K_L1      0x60000000 /* field value 3, already shifted (3 << 29); field value 0 has no name here */
+
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0(i0)              (0x00000074 + 0x8*(i0)) /* first word of array element i0; elements are 8 apart, starting at 0x74 */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__ESIZE            0x00000008 /* element stride, matches the 0x8 multiplier above */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__LEN              0x00000008 /* number of elements: valid i0 is 0..7 */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__MASK 0xffffffff /* ADDRESS_LOW uses the whole 32-bit word */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__SHIFT        0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1(i0)              (0x00000078 + 0x8*(i0)) /* second word of element i0, 4 past CB_CONFIG_0(i0) */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__ESIZE            0x00000008 /* element stride, matches the 0x8 multiplier above */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__LEN              0x00000008 /* number of elements: valid i0 is 0..7 */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__MASK        0x000000ff /* ADDRESS_HIGH occupies bits 0..7 */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__SHIFT       0
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__MASK                0xffff8000 /* SIZE occupies bits 15..31 (17 bits); bits 8..14 have no field defined here */
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__SHIFT       15
+
+#define NVE4_COMPUTE_LAUNCH_DESC_45                            0x000000b4 /* offset 0xb4 = 4 * 45: the numeric suffix is the 32-bit word index */
+#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__MASK      0x000fffff /* LOCAL_POS_ALLOC occupies bits 0..19 */
+#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__SHIFT     0
+#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__MASK                0xf8000000 /* BARRIER_ALLOC occupies bits 27..31 (5 bits) */
+#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__SHIFT       27
+
+#define NVE4_COMPUTE_LAUNCH_DESC_46                            0x000000b8 /* offset 0xb8 = 4 * 46 */
+#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__MASK      0x000fffff /* LOCAL_NEG_ALLOC occupies bits 0..19 */
+#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__SHIFT     0
+#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__MASK            0x3f000000 /* GPR_ALLOC occupies bits 24..29 (6 bits) */
+#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__SHIFT           24
+
+#define NVE4_COMPUTE_LAUNCH_DESC_47                            0x000000bc /* offset 0xbc = 4 * 47 */
+#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__MASK     0x000fffff /* WARP_CSTACK_SIZE occupies bits 0..19; NOTE(review): unit is not stated here */
+#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__SHIFT    0
+
+
+#endif /* NVE4_COMPUTE_XML */