src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include "pipe/p_compiler.h"
  33 #include "pipe/p_format.h"
  34 #include "util/u_math.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_main.h"
  37 #include "spu_shuffle.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
/**
 * Masks are uint[4] vectors with each element being 0 or 0xffffffff.
 * In this file a mask carries one element per fragment of a 2x2 quad
 * (see flush_spans(), which builds it, and emit_quad(), which consumes it).
 */
typedef vector unsigned int mask_t;
  45
  46
  47
/**
 * Simplified types taken from other parts of Gallium
 *
 * A vertex is addressed as an array of 4-float attribute vectors.
 * data[] is declared with a single element but is indexed by attribute
 * slot throughout this file; data[0] is read as the position
 * (elements 0..3 are used as x, y, z and w).
 */
struct vertex_header {
   vector float data[1];
};
  54
  55
  56
/* XXX fix this */
/* Cheap ceiling approximation: add just under 1.0, then truncate through
 * an int cast.  Because the cast truncates toward zero this is only a
 * ceiling for non-negative inputs that fit in an int, and inputs less
 * than 0.00001 above an integer are rounded down to that integer.
 * X is evaluated exactly once.
 */
#undef CEILF
#define CEILF(X) ((float) (int) ((X) + 0.99999f))
  60
  61
/* Index of each fragment within a 2x2 quad. */
#define QUAD_TOP_LEFT     0
#define QUAD_TOP_RIGHT    1
#define QUAD_BOTTOM_LEFT  2
#define QUAD_BOTTOM_RIGHT 3
/* One bit per quad fragment, derived from the indices above. */
#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
/* All four fragment bits set. */
#define MASK_ALL          0xf


/* Vector element indices used by the splat helpers and eval_inputs(). */
#define CHAN0 0
#define CHAN1 1
#define CHAN2 2
#define CHAN3 3


/* Set to 1 to compile in print_vertex() and the per-triangle vertex dump. */
#define DEBUG_VERTS 0
  80
/**
 * Triangle edge info
 *
 * dx/dy overlay the first two elements of the vector ds, so a single
 * vector subtract of two positions fills in both at once
 * (see setup_sort_vertices()).
 */
struct edge {
   union {
      struct {
         float dx;      /**< X(v1) - X(v0), used only during setup */
         float dy;      /**< Y(v1) - Y(v0), used only during setup */
      };
      vec_float4 ds;    /**< vector accessor for dx and dy */
   };
   float dxdy;          /**< dx/dy */
   float sx, sy;        /**< first sample point coord */
   int lines;           /**< number of lines on this edge */
};
  96
  97
/**
 * Plane-equation coefficients for one 4-component vertex attribute.
 * eval_inputs() evaluates each component as a0 + x * dadx + y * dady.
 */
struct interp_coef
{
   vector float a0;     /**< constant term */
   vector float dadx;   /**< change of the attribute per unit step in x */
   vector float dady;   /**< change of the attribute per unit step in y */
};
 104
 105
/**
 * Triangle setup info (derived from draw_stage).
 * Also used for line drawing (taking some liberties).
 */
struct setup_stage {

   /* Vertices are just an array of floats making up each attribute in
    * turn.  Currently fixed at 4 floats, but should change in time.
    * Codegen will help cope with this.
    */
   /* The four vertex pointers are overlaid with one qword so that
    * setup_sort_vertices() can store all of them with a single shuffle.
    * NOTE(review): this relies on four pointers fitting in one qword,
    * i.e. presumably 32-bit SPU pointers -- confirm.
    */
   union {
      struct {
         const struct vertex_header *vmin;      /* first vertex after sorting by y */
         const struct vertex_header *vmid;      /* second vertex after sorting by y */
         const struct vertex_header *vmax;      /* third vertex after sorting by y */
         const struct vertex_header *vprovoke;  /* source of constant-interpolated attributes */
      };
      qword vertex_headers;
   };

   struct edge ebot;    /* vmin -> vmid */
   struct edge etop;    /* vmid -> vmax */
   struct edge emaj;    /* vmin -> vmax */

   float oneOverArea;  /* XXX maybe make into vector? */

   uint facing;         /* index into spu.fragment_ops[] (front = 0, back = 1) */

   uint tx, ty;  /**< position of current tile (x, y) */

   /* Clip rectangle in pixels; tri_draw() sets it to the bounds of the
    * current tile.  The qword view lets it be written and compared as a
    * single vector.
    */
   union {
      struct {
         int cliprect_minx;
         int cliprect_miny;
         int cliprect_maxx;
         int cliprect_maxy;
      };
      qword cliprect;
   };

   /* Per-attribute plane equations, indexed by attribute slot. */
   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];

   /* Pending scanlines for the current two-line-high row of quads. */
   struct {
      vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
      int y;             /* y of the quad row (always even, see block()) */
      unsigned y_flags;  /* bit 0: even line written, bit 1: odd line written */
      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
   } span;
};
 155
 156
/* The single, file-scope setup state read and written by the triangle
 * setup and rasterization functions below. */
static struct setup_stage setup;
 158
 159
 160 static INLINE vector float
 161 splatx(vector float v)
 162 {
 163    return spu_splats(spu_extract(v, CHAN0));
 164 }
 165
 166 static INLINE vector float
 167 splaty(vector float v)
 168 {
 169    return spu_splats(spu_extract(v, CHAN1));
 170 }
 171
 172 static INLINE vector float
 173 splatz(vector float v)
 174 {
 175    return spu_splats(spu_extract(v, CHAN2));
 176 }
 177
 178 static INLINE vector float
 179 splatw(vector float v)
 180 {
 181    return spu_splats(spu_extract(v, CHAN3));
 182 }
 183
 184
 185 /**
 186  * Setup fragment shader inputs by evaluating triangle's vertex
 187  * attribute coefficient info.
 188  * \param x  quad x pos
 189  * \param y  quad y pos
 190  * \param fragZ  returns quad Z values
 191  * \param fragInputs  returns fragment program inputs
 192  * Note: this code could be incorporated into the fragment program
 193  * itself to avoid the loop and switch.
 194  */
 195 static void
 196 eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
 197 {
 198    static const vector float deltaX = (const vector float) {0, 1, 0, 1};
 199    static const vector float deltaY = (const vector float) {0, 0, 1, 1};
 200
 201    const uint posSlot = 0;
 202    const vector float pos = setup.coef[posSlot].a0;
 203    const vector float dposdx = setup.coef[posSlot].dadx;
 204    const vector float dposdy = setup.coef[posSlot].dady;
 205    const vector float fragX = spu_splats(x) + deltaX;
 206    const vector float fragY = spu_splats(y) + deltaY;
 207    vector float fragW, wInv;
 208    uint i;
 209
 210    *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
 211    fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
 212    wInv = spu_re(fragW);  /* 1 / w */
 213
 214    /* loop over fragment program inputs */
 215    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 216       uint attr = i + 1;
 217       enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
 218
 219       /* constant term */
 220       vector float a0 = setup.coef[attr].a0;
 221       vector float r0 = splatx(a0);
 222       vector float r1 = splaty(a0);
 223       vector float r2 = splatz(a0);
 224       vector float r3 = splatw(a0);
 225
 226       if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
 227          /* linear term */
 228          vector float dadx = setup.coef[attr].dadx;
 229          vector float dady = setup.coef[attr].dady;
 230          /* Use SPU intrinsics here to get slightly better code.
 231           * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
 232           */
 233          r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
 234          r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
 235          r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
 236          r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
 237          if (interp == INTERP_PERSPECTIVE) {
 238             /* perspective term */
 239             r0 *= wInv;
 240             r1 *= wInv;
 241             r2 *= wInv;
 242             r3 *= wInv;
 243          }
 244       }
 245       fragInputs[CHAN0] = r0;
 246       fragInputs[CHAN1] = r1;
 247       fragInputs[CHAN2] = r2;
 248       fragInputs[CHAN3] = r3;
 249       fragInputs += 4;
 250    }
 251 }
 252
 253
 254 /**
 255  * Emit a quad (pass to next stage).  No clipping is done.
 256  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 257  * should be skipped.  But adding the test for that slows things down
 258  * overall.
 259  */
 260 static INLINE void
 261 emit_quad( int x, int y, mask_t mask)
 262 {
 263    /* If any bits in mask are set... */
 264    if (spu_extract(spu_orx(mask), 0)) {
 265       const int ix = x - setup.cliprect_minx;
 266       const int iy = y - setup.cliprect_miny;
 267
 268       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 269       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 270
 271       {
 272          /*
 273           * Run fragment shader, execute per-fragment ops, update fb/tile.
 274           */
 275          vector float inputs[4*4], outputs[2*4];
 276          vector unsigned int kill_mask;
 277          vector float fragZ;
 278
 279          eval_inputs((float) x, (float) y, &fragZ, inputs);
 280
 281          ASSERT(spu.fragment_program);
 282          ASSERT(spu.fragment_ops);
 283
 284          /* Execute the current fragment program */
 285          kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
 286
 287          mask = spu_andc(mask, kill_mask);
 288
 289          /* Execute per-fragment/quad operations, including:
 290           * alpha test, z test, stencil test, blend and framebuffer writing.
 291           * Note that there are two different fragment operations functions
 292           * that can be called, one for front-facing fragments, and one
 293           * for back-facing fragments.  (Often the two are the same;
 294           * but in some cases, like two-sided stenciling, they can be
 295           * very different.)  So choose the correct function depending
 296           * on the calculated facing.
 297           */
 298          spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
 299                           fragZ,
 300                           outputs[0*4+0],
 301                           outputs[0*4+1],
 302                           outputs[0*4+2],
 303                           outputs[0*4+3],
 304                           mask);
 305       }
 306    }
 307 }
 308
 309
 310 /**
 311  * Given an X or Y coordinate, return the block/quad coordinate that it
 312  * belongs to.
 313  */
 314 static INLINE int
 315 block(int x)
 316 {
 317    return x & ~1;
 318 }
 319
 320
 321 /**
 322  * Render a horizontal span of quads
 323  */
 324 static void
 325 flush_spans(void)
 326 {
 327    int minleft, maxright;
 328
 329    const int l0 = spu_extract(setup.span.quad, 0);
 330    const int l1 = spu_extract(setup.span.quad, 1);
 331    const int r0 = spu_extract(setup.span.quad, 2);
 332    const int r1 = spu_extract(setup.span.quad, 3);
 333
 334    switch (setup.span.y_flags) {
 335    case 0x3:
 336       /* both odd and even lines written (both quad rows) */
 337       minleft = MIN2(l0, l1);
 338       maxright = MAX2(r0, r1);
 339       break;
 340
 341    case 0x1:
 342       /* only even line written (quad top row) */
 343       minleft = l0;
 344       maxright = r0;
 345       break;
 346
 347    case 0x2:
 348       /* only odd line written (quad bottom row) */
 349       minleft = l1;
 350       maxright = r1;
 351       break;
 352
 353    default:
 354       return;
 355    }
 356
 357    /* OK, we're very likely to need the tile data now.
 358     * clear or finish waiting if needed.
 359     */
 360    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 361       /* wait for mfc_get() to complete */
 362       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 363       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 364       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 365    }
 366    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 367       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 368       clear_c_tile(&spu.ctile);
 369       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 370    }
 371    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 372
 373    if (spu.read_depth_stencil) {
 374       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 375          /* wait for mfc_get() to complete */
 376          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 377          wait_on_mask(1 << TAG_READ_TILE_Z);
 378          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 379       }
 380       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 381          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 382          clear_z_tile(&spu.ztile);
 383          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 384       }
 385       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 386    }
 387
 388    /* XXX this loop could be moved into the above switch cases... */
 389
 390    /* Setup for mask calculation */
 391    const vec_int4 quad_LlRr = setup.span.quad;
 392    const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
 393    const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
 394    const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
 395
 396    const vec_int4 twos = spu_splats(2);
 397
 398    const int x = block(minleft);
 399    vec_int4 xs = {x, x+1, x, x+1};
 400
 401    for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
 402       /**
 403        * Computes mask to indicate which pixels in the 2x2 quad are actually
 404        * inside the triangle's bounds.
 405        */
 406
 407       /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
 408       const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
 409       const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);
 410
 411       /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
 412       const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
 413
 414       /* Combine results to create mask */
 415       const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
 416
 417       emit_quad(spu_extract(xs, 0), setup.span.y, mask);
 418    }
 419
 420    setup.span.y = 0;
 421    setup.span.y_flags = 0;
 422    /* Zero right elements */
 423    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 424 }
 425
 426
 427 #if DEBUG_VERTS
 428 static void
 429 print_vertex(const struct vertex_header *v)
 430 {
 431    uint i;
 432    fprintf(stderr, "  Vertex: (%p)\n", v);
 433    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 434       fprintf(stderr, "    %d: %f %f %f %f\n",  i,
 435               spu_extract(v->data[i], 0),
 436               spu_extract(v->data[i], 1),
 437               spu_extract(v->data[i], 2),
 438               spu_extract(v->data[i], 3));
 439    }
 440 }
 441 #endif
 442
 443 /* Returns the minimum of each slot of two vec_float4s as qwords.
 444  * i.e. return[n] = min(q0[n],q1[n]);
 445  */
 446 static qword
 447 minfq(qword q0, qword q1)
 448 {
 449    const qword q0q1m = si_fcgt(q0, q1);
 450    return si_selb(q0, q1, q0q1m);
 451 }
 452
 453 /* Returns the minimum of each slot of three vec_float4s as qwords.
 454  * i.e. return[n] = min(q0[n],q1[n],q2[n]);
 455  */
 456 static qword
 457 min3fq(qword q0, qword q1, qword q2)
 458 {
 459    return minfq(minfq(q0, q1), q2);
 460 }
 461
 462 /* Returns the maximum of each slot of two vec_float4s as qwords.
 463  * i.e. return[n] = min(q0[n],q1[n],q2[n]);
 464  */
 465 static qword
 466 maxfq(qword q0, qword q1) {
 467    const qword q0q1m = si_fcgt(q0, q1);
 468    return si_selb(q1, q0, q0q1m);
 469 }
 470
 471 /* Returns the maximum of each slot of three vec_float4s as qwords.
 472  * i.e. return[n] = min(q0[n],q1[n],q2[n]);
 473  */
 474 static qword
 475 max3fq(qword q0, qword q1, qword q2) {
 476    return maxfq(maxfq(q0, q1), q2);
 477 }
 478
 479 /**
 480  * Sort vertices from top to bottom.
 481  * Compute area and determine front vs. back facing.
 482  * Do coarse clip test against tile bounds
 483  * \return  FALSE if tri is totally outside tile, TRUE otherwise
 484  */
 485 static boolean
 486 setup_sort_vertices(const qword vs)
 487 {
 488    float area, sign;
 489
 490 #if DEBUG_VERTS
 491    if (spu.init.id==0) {
 492       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
 493       print_vertex(v0);
 494       print_vertex(v1);
 495       print_vertex(v2);
 496    }
 497 #endif
 498
 499    {
 500       /* Load the float values for various processing... */
 501       const qword f0 = (qword)(((const struct vertex_header*)si_to_ptr(vs))->data[0]);
 502       const qword f1 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]);
 503       const qword f2 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]);
 504
 505       /* Check if triangle is completely outside the tile bounds
 506        * Find the min and max x and y positions of the three poits */
 507       const qword minf = min3fq(f0, f1, f2);
 508       const qword maxf = max3fq(f0, f1, f2);
 509
 510       /* Compare min and max against cliprect vals */
 511       const qword maxsmins = si_shufb(maxf, minf, SHUFB4(A,B,a,b));
 512       const qword outside = si_fcgt(maxsmins, si_csflt(setup.cliprect, 0));
 513
 514       /* Use a little magic to work out of the tri is visible or not */
 515       if(si_to_uint(si_xori(si_gb(outside), 0xc))) return FALSE;
 516
 517       /* determine bottom to top order of vertices */
 518       /* A table of shuffle patterns for putting vertex_header pointers into
 519          correct order.  Quite magical. */
 520       const qword sort_order_patterns[] = {
 521          SHUFB4(A,B,C,C),
 522          SHUFB4(C,A,B,C),
 523          SHUFB4(A,C,B,C),
 524          SHUFB4(B,C,A,C),
 525          SHUFB4(B,A,C,C),
 526          SHUFB4(C,B,A,C) };
 527
 528       /* Collate y values into two vectors for comparison.
 529          Using only one shuffle constant! ;) */
 530       const qword y_02_ = si_shufb(f0, f2, SHUFB4(0,B,b,C));
 531       const qword y_10_ = si_shufb(f1, f0, SHUFB4(0,B,b,C));
 532       const qword y_012 = si_shufb(y_02_, f1, SHUFB4(0,B,b,C));
 533       const qword y_120 = si_shufb(y_10_, f2, SHUFB4(0,B,b,C));
 534
 535       /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
 536       const qword compare = si_fcgt(y_012, y_120);
 537       /* Compress the result of the comparison into 4 bits */
 538       const qword gather = si_gb(compare);
 539       /* Subtract one to attain the index into the LUT.  Magical. */
 540       const unsigned int index = si_to_uint(gather) - 1;
 541
 542       /* Load the appropriate pattern and construct the desired vector. */
 543       setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
 544
 545       /* Using the result of the comparison, set sign.
 546          Very magical. */
 547       sign = ((si_to_uint(si_cntb(gather)) == 2) ? 1.0f : -1.0f);
 548    }
 549
 550    setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
 551    setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
 552    setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
 553
 554    /*
 555     * Compute triangle's area.  Use 1/area to compute partial
 556     * derivatives of attributes later.
 557     */
 558    area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 559
 560    setup.oneOverArea = 1.0f / area;
 561
 562    /* The product of area * sign indicates front/back orientation (0/1).
 563     * Just in case someone gets the bright idea of switching the front
 564     * and back constants without noticing that we're assuming their
 565     * values in this operation, also assert that the values are
 566     * what we think they are.
 567     */
 568    ASSERT(CELL_FACING_FRONT == 0);
 569    ASSERT(CELL_FACING_BACK == 1);
 570    setup.facing = (area * sign > 0.0f)
 571       ^ (!spu.rasterizer.front_ccw);
 572
 573    return TRUE;
 574 }
 575
 576
 577 /**
 578  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 579  * The value value comes from vertex->data[slot].
 580  * The result will be put into setup.coef[slot].a0.
 581  * \param slot  which attribute slot
 582  */
 583 static INLINE void
 584 const_coeff4(uint slot)
 585 {
 586    setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
 587    setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
 588    setup.coef[slot].a0 = setup.vprovoke->data[slot];
 589 }
 590
 591
 592 /**
 593  * As above, but interp setup all four vector components.
 594  */
 595 static INLINE void
 596 tri_linear_coeff4(uint slot)
 597 {
 598    const vector float vmin_d = setup.vmin->data[slot];
 599    const vector float vmid_d = setup.vmid->data[slot];
 600    const vector float vmax_d = setup.vmax->data[slot];
 601    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 602    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 603
 604    vector float botda = vmid_d - vmin_d;
 605    vector float majda = vmax_d - vmin_d;
 606
 607    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 608                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 609    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 610                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 611
 612    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 613    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 614
 615    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 616    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 617
 618    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 619 }
 620
 621
 622 /**
 623  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 624  * for a triangle.
 625  * We basically multiply the vertex value by 1/w before computing
 626  * the plane coefficients (a0, dadx, dady).
 627  * Later, when we compute the value at a particular fragment position we'll
 628  * divide the interpolated value by the interpolated W at that fragment.
 629  */
 630 static void
 631 tri_persp_coeff4(uint slot)
 632 {
 633    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 634    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 635
 636    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 637    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 638    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 639
 640    vector float vmin_d = setup.vmin->data[slot];
 641    vector float vmid_d = setup.vmid->data[slot];
 642    vector float vmax_d = setup.vmax->data[slot];
 643
 644    vmin_d = spu_mul(vmin_d, vmin_w);
 645    vmid_d = spu_mul(vmid_d, vmid_w);
 646    vmax_d = spu_mul(vmax_d, vmax_w);
 647
 648    vector float botda = vmid_d - vmin_d;
 649    vector float majda = vmax_d - vmin_d;
 650
 651    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 652                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 653    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 654                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 655
 656    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 657    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 658
 659    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 660    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 661
 662    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 663 }
 664
 665
 666
 667 /**
 668  * Compute the setup.coef[] array dadx, dady, a0 values.
 669  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 670  */
 671 static void
 672 setup_tri_coefficients(void)
 673 {
 674    uint i;
 675
 676    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 677       switch (spu.vertex_info.attrib[i].interp_mode) {
 678       case INTERP_NONE:
 679          break;
 680       case INTERP_CONSTANT:
 681          const_coeff4(i);
 682          break;
 683       case INTERP_POS:
 684          /* fall-through */
 685       case INTERP_LINEAR:
 686          tri_linear_coeff4(i);
 687          break;
 688       case INTERP_PERSPECTIVE:
 689          tri_persp_coeff4(i);
 690          break;
 691       default:
 692          ASSERT(0);
 693       }
 694    }
 695 }
 696
 697
 698 static void
 699 setup_tri_edges(void)
 700 {
 701    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 702    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 703
 704    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 705    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 706    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 707
 708    setup.emaj.sy = CEILF(vmin_y);
 709    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 710    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 711    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 712
 713    setup.etop.sy = CEILF(vmid_y);
 714    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 715    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 716    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 717
 718    setup.ebot.sy = CEILF(vmin_y);
 719    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 720    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 721    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 722 }
 723
 724
 725 /**
 726  * Render the upper or lower half of a triangle.
 727  * Scissoring/cliprect is applied here too.
 728  */
 729 static void
 730 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 731 {
 732    const int minx = setup.cliprect_minx;
 733    const int maxx = setup.cliprect_maxx;
 734    const int miny = setup.cliprect_miny;
 735    const int maxy = setup.cliprect_maxy;
 736    int y, start_y, finish_y;
 737    int sy = (int)eleft->sy;
 738
 739    ASSERT((int)eleft->sy == (int) eright->sy);
 740
 741    /* clip top/bottom */
 742    start_y = sy;
 743    finish_y = sy + lines;
 744
 745    if (start_y < miny)
 746       start_y = miny;
 747
 748    if (finish_y > maxy)
 749       finish_y = maxy;
 750
 751    start_y -= sy;
 752    finish_y -= sy;
 753
 754    /*
 755    printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 756    */
 757
 758    for (y = start_y; y < finish_y; y++) {
 759
 760       /* avoid accumulating adds as floats don't have the precision to
 761        * accurately iterate large triangle edges that way.  luckily we
 762        * can just multiply these days.
 763        *
 764        * this is all drowned out by the attribute interpolation anyway.
 765        */
 766       int left = (int)(eleft->sx + y * eleft->dxdy);
 767       int right = (int)(eright->sx + y * eright->dxdy);
 768
 769       /* clip left/right */
 770       if (left < minx)
 771          left = minx;
 772       if (right > maxx)
 773          right = maxx;
 774
 775       if (left < right) {
 776          int _y = sy + y;
 777          if (block(_y) != setup.span.y) {
 778             flush_spans();
 779             setup.span.y = block(_y);
 780          }
 781
 782          int offset = _y&1;
 783          vec_int4 quad_LlRr = {left, left, right, right};
 784          /* Store left and right in 0 or 1 row of quad based on offset */
 785          setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
 786          setup.span.y_flags |= 1<<offset;
 787       }
 788    }
 789
 790
 791    /* save the values so that emaj can be restarted:
 792     */
 793    eleft->sx += lines * eleft->dxdy;
 794    eright->sx += lines * eright->dxdy;
 795    eleft->sy += lines;
 796    eright->sy += lines;
 797 }
 798
 799
 800 /**
 801  * Draw triangle into tile at (tx, ty) (tile coords)
 802  * The tile data should have already been fetched.
 803  */
 804 boolean
 805 tri_draw(const qword vs,
 806          uint tx, uint ty)
 807 {
 808    setup.tx = tx;
 809    setup.ty = ty;
 810
 811    /* set clipping bounds to tile bounds */
 812    const qword clipbase = (qword)((vec_uint4){tx, ty});
 813    const qword clipmin = si_mpyui(clipbase, TILE_SIZE);
 814    const qword clipmax = si_ai(clipmin, TILE_SIZE);
 815    setup.cliprect = si_shufb(clipmin, clipmax, SHUFB4(A,B,a,b));
 816
 817    if(!setup_sort_vertices(vs)) {
 818       return FALSE; /* totally clipped */
 819    }
 820
 821    setup_tri_coefficients();
 822    setup_tri_edges();
 823
 824    setup.span.y = 0;
 825    setup.span.y_flags = 0;
 826    /* Zero right elements */
 827    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 828
 829    if (setup.oneOverArea < 0.0) {
 830       /* emaj on left */
 831       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 832       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 833    }
 834    else {
 835       /* emaj on right */
 836       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 837       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 838    }
 839
 840    flush_spans();
 841
 842    return TRUE;
 843 }