src/gallium/drivers/cell/spu/spu_render.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 #include <stdio.h>
  30 #include <libmisc.h>
  31 #include <spu_mfcio.h>
  32
  33 #include "spu_main.h"
  34 #include "spu_render.h"
  35 #include "spu_shuffle.h"
  36 #include "spu_tri.h"
  37 #include "spu_tile.h"
  38 #include "cell/common.h"
  39 #include "util/u_memory.h"
  40
  41
  42 /**
  43  * Given a rendering command's bounding box (in pixels) compute the
  44  * location of the corresponding screen tile bounding box.
  45  */
  46 static INLINE void
  47 tile_bounding_box(const struct cell_command_render *render,
  48                   uint *txmin, uint *tymin,
  49                   uint *box_num_tiles, uint *box_width_tiles)
  50 {
  51 #if 0
  52    /* Debug: full-window bounding box */
  53    uint txmax = spu.fb.width_tiles - 1;
  54    uint tymax = spu.fb.height_tiles - 1;
  55    *txmin = 0;
  56    *tymin = 0;
  57    *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
  58    *box_width_tiles = spu.fb.width_tiles;
  59    (void) render;
  60    (void) txmax;
  61    (void) tymax;
  62 #else
  63    uint txmax, tymax, box_height_tiles;
  64
  65    *txmin = (uint) render->xmin / TILE_SIZE;
  66    *tymin = (uint) render->ymin / TILE_SIZE;
  67    txmax = (uint) render->xmax / TILE_SIZE;
  68    tymax = (uint) render->ymax / TILE_SIZE;
  69    if (txmax >= spu.fb.width_tiles)
  70       txmax = spu.fb.width_tiles-1;
  71    if (tymax >= spu.fb.height_tiles)
  72       tymax = spu.fb.height_tiles-1;
  73    *box_width_tiles = txmax - *txmin + 1;
  74    box_height_tiles = tymax - *tymin + 1;
  75    *box_num_tiles = *box_width_tiles * box_height_tiles;
  76 #endif
  77 #if 0
  78    printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
  79           render->xmin, render->ymin, render->xmax, render->ymax);
  80    printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
  81            spu.init.id, *txmin, *tymin, txmax, tymax);
  82    ASSERT(render->xmin <= render->xmax);
  83    ASSERT(render->ymin <= render->ymax);
  84 #endif
  85 }
  86
  87
  88 /** Check if the tile at (tx,ty) belongs to this SPU */
  89 static INLINE boolean
  90 my_tile(uint tx, uint ty)
  91 {
  92    return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
  93 }
  94
  95
  96 /**
  97  * Start fetching non-clear color/Z tiles from main memory
  98  */
  99 static INLINE void
 100 get_cz_tiles(uint tx, uint ty)
 101 {
 102    if (spu.read_depth_stencil) {
 103       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
 104          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
 105          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
 106          spu.cur_ztile_status = TILE_STATUS_GETTING;
 107       }
 108    }
 109
 110    if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
 111       //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
 112       get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
 113       spu.cur_ctile_status = TILE_STATUS_GETTING;
 114    }
 115 }
 116
 117
 118 /**
 119  * Start putting dirty color/Z tiles back to main memory
 120  */
 121 static INLINE void
 122 put_cz_tiles(uint tx, uint ty)
 123 {
 124    if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
 125       /* tile was modified and needs to be written back */
 126       //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
 127       put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
 128       spu.cur_ztile_status = TILE_STATUS_DEFINED;
 129    }
 130    else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 131       /* tile was never used */
 132       spu.cur_ztile_status = TILE_STATUS_DEFINED;
 133       //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
 134    }
 135
 136    if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
 137       /* tile was modified and needs to be written back */
 138       //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
 139       put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
 140       spu.cur_ctile_status = TILE_STATUS_DEFINED;
 141    }
 142    else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 143       /* tile was never used */
 144       spu.cur_ctile_status = TILE_STATUS_DEFINED;
 145       //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
 146    }
 147 }
 148
 149
 150 /**
 151  * Wait for 'put' of color/z tiles to complete.
 152  */
 153 static INLINE void
 154 wait_put_cz_tiles(void)
 155 {
 156    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
 157    if (spu.read_depth_stencil) {
 158       wait_on_mask(1 << TAG_WRITE_TILE_Z);
 159    }
 160 }
 161
 162
 163 /**
 164  * Render primitives
 165  * \param pos_incr  returns value indicating how may words to skip after
 166  *                  this command in the batch buffer
 167  */
 168 void
 169 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 170 {
 171    /* we'll DMA into these buffers */
 172    PIPE_ALIGN_VAR(16) ubyte vertex_data[CELL_BUFFER_SIZE];
 173    const uint vertex_size = render->vertex_size; /* in bytes */
 174    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
 175    uint index_bytes;
 176    const ubyte *vertices;
 177    const ushort *indexes;
 178    uint i, j;
 179    uint num_tiles;
 180
 181    D_PRINTF(CELL_DEBUG_CMD,
 182             "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
 183             render->prim_type,
 184             render->num_verts,
 185             render->num_indexes,
 186             render->inline_verts);
 187
 188    ASSERT(sizeof(*render) % 4 == 0);
 189    ASSERT(total_vertex_bytes % 16 == 0);
 190    ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
 191    ASSERT(render->num_indexes % 3 == 0);
 192
 193
 194    /* indexes are right after the render command in the batch buffer */
 195    indexes = (const ushort *) (render + 1);
 196    index_bytes = ROUNDUP8(render->num_indexes * 2);
 197    *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
 198
 199
 200    if (render->inline_verts) {
 201       /* Vertices are after indexes in batch buffer at next 16-byte addr */
 202       vertices = (const ubyte *) render + (*pos_incr * 8);
 203       vertices = (const ubyte *) align_pointer((void *) vertices, 16);
 204       ASSERT_ALIGN16(vertices);
 205       *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
 206    }
 207    else {
 208       /* Begin DMA fetch of vertex buffer */
 209       ubyte *src = spu.init.buffers[render->vertex_buf];
 210       ubyte *dest = vertex_data;
 211
 212       /* skip vertex data we won't use */
 213 #if 01
 214       src += render->min_index * vertex_size;
 215       dest += render->min_index * vertex_size;
 216       total_vertex_bytes -= render->min_index * vertex_size;
 217 #endif
 218       ASSERT(total_vertex_bytes % 16 == 0);
 219       ASSERT_ALIGN16(dest);
 220       ASSERT_ALIGN16(src);
 221
 222       mfc_get(dest,   /* in vertex_data[] array */
 223               (unsigned int) src,  /* src in main memory */
 224               total_vertex_bytes,  /* size */
 225               TAG_VERTEX_BUFFER,
 226               0, /* tid */
 227               0  /* rid */);
 228
 229       vertices = vertex_data;
 230
 231       wait_on_mask(1 << TAG_VERTEX_BUFFER);
 232    }
 233
 234
 235    /**
 236     ** find tiles which intersect the prim bounding box
 237     **/
 238    uint txmin, tymin, box_width_tiles, box_num_tiles;
 239    tile_bounding_box(render, &txmin, &tymin,
 240                      &box_num_tiles, &box_width_tiles);
 241
 242
 243    /* make sure any pending clears have completed */
 244    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 245
 246
 247    num_tiles = 0;
 248
 249    /**
 250     ** loop over tiles, rendering tris
 251     **/
 252    for (i = 0; i < box_num_tiles; i++) {
 253       const uint tx = txmin + i % box_width_tiles;
 254       const uint ty = tymin + i / box_width_tiles;
 255
 256       ASSERT(tx < spu.fb.width_tiles);
 257       ASSERT(ty < spu.fb.height_tiles);
 258
 259       if (!my_tile(tx, ty))
 260          continue;
 261
 262       num_tiles++;
 263
 264       spu.cur_ctile_status = spu.ctile_status[ty][tx];
 265       spu.cur_ztile_status = spu.ztile_status[ty][tx];
 266
 267       get_cz_tiles(tx, ty);
 268
 269       uint drawn = 0;
 270
 271       const qword vertex_sizes = (qword)spu_splats(vertex_size);
 272       const qword verticess = (qword)spu_splats((uint)vertices);
 273
 274       ASSERT_ALIGN16(&indexes[0]);
 275
 276       const uint num_indexes = render->num_indexes;
 277
 278       /* loop over tris
 279            * &indexes[0] will be 16 byte aligned.  This loop is heavily unrolled
 280            * avoiding variable rotates when extracting vertex indices.
 281            */
 282       for (j = 0; j < num_indexes; j += 24) {
 283          /* Load three vectors, containing 24 ushort indices */
 284          const qword* lower_qword = (qword*)&indexes[j];
 285          const qword indices0 = lower_qword[0];
 286          const qword indices1 = lower_qword[1];
 287          const qword indices2 = lower_qword[2];
 288
 289          /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
 290                  /* Straightforward rotates for these */
 291          qword vs0 = indices0;
 292          qword vs1 = si_shlqbyi(indices0, 6);
 293          qword vs3 = si_shlqbyi(indices1, 2);
 294          qword vs4 = si_shlqbyi(indices1, 8);
 295          qword vs6 = si_shlqbyi(indices2, 4);
 296          qword vs7 = si_shlqbyi(indices2, 10);
 297
 298          /* For tri 2 and 5, the three indices are split across two machine
 299                   * words - rotate and combine */
 300          const qword tmp2a = si_shlqbyi(indices0, 12);
 301          const qword tmp2b = si_rotqmbyi(indices1, 12|16);
 302          qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
 303
 304          const qword tmp5a = si_shlqbyi(indices1, 14);
 305          const qword tmp5b = si_rotqmbyi(indices2, 14|16);
 306          qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
 307
 308          /* unpack indices from halfword slots to word slots */
 309          vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
 310          vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
 311          vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
 312          vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
 313          vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
 314          vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
 315          vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
 316          vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
 317
 318          /* Calculate address of vertex in vertices[] */
 319          vs0 = si_mpya(vs0, vertex_sizes, verticess);
 320          vs1 = si_mpya(vs1, vertex_sizes, verticess);
 321          vs2 = si_mpya(vs2, vertex_sizes, verticess);
 322          vs3 = si_mpya(vs3, vertex_sizes, verticess);
 323          vs4 = si_mpya(vs4, vertex_sizes, verticess);
 324          vs5 = si_mpya(vs5, vertex_sizes, verticess);
 325          vs6 = si_mpya(vs6, vertex_sizes, verticess);
 326          vs7 = si_mpya(vs7, vertex_sizes, verticess);
 327
 328          /* Select the appropriate call based on the number of vertices
 329                   * remaining */
 330          switch(num_indexes - j) {
 331             default: drawn += tri_draw(vs7, tx, ty);
 332             case 21: drawn += tri_draw(vs6, tx, ty);
 333             case 18: drawn += tri_draw(vs5, tx, ty);
 334             case 15: drawn += tri_draw(vs4, tx, ty);
 335             case 12: drawn += tri_draw(vs3, tx, ty);
 336             case 9:  drawn += tri_draw(vs2, tx, ty);
 337             case 6:  drawn += tri_draw(vs1, tx, ty);
 338             case 3:  drawn += tri_draw(vs0, tx, ty);
 339          }
 340       }
 341
 342       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
 343
 344       /* write color/z tiles back to main framebuffer, if dirtied */
 345       put_cz_tiles(tx, ty);
 346
 347       wait_put_cz_tiles(); /* XXX seems unnecessary... */
 348
 349       spu.ctile_status[ty][tx] = spu.cur_ctile_status;
 350       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
 351    }
 352
 353    D_PRINTF(CELL_DEBUG_CMD,
 354             "RENDER done (%u tiles hit)\n",
 355             num_tiles);
 356 }