src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_SWZ] = 1,
  87       [OPCODE_XPD] = 2,
  88    };
  89
  90    /* These opcodes get broken down in a way that allow two
  91     * args to be immediates.
  92     */
  93    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  94       if (arg == 1 || arg == 2)
  95          return GL_TRUE;
  96    }
  97
  98    if (opcode > ARRAY_SIZE(opcode_array))
  99       return GL_FALSE;
 100
 101    return arg == opcode_array[opcode] - 1;
 102 }
 103
 104 /**
 105  * Computes the screen-space x,y position of the pixels.
 106  *
 107  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 108  * interpolation of attributes..
 109  *
 110  * Payload R0:
 111  *
 112  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 113  *         corresponding to each of the 16 execution channels.
 114  * R0.1..8 -- ?
 115  * R1.0 -- triangle vertex 0.X
 116  * R1.1 -- triangle vertex 0.Y
 117  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 118  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 119  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 120  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 121  * R1.6 -- ?
 122  * R1.7 -- ?
 123  * R1.8 -- ?
 124  */
 125 void emit_pixel_xy(struct brw_wm_compile *c,
 126                    const struct brw_reg *dst,
 127                    GLuint mask)
 128 {
 129    struct brw_compile *p = &c->func;
 130    struct brw_reg r1 = brw_vec1_grf(1, 0);
 131    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 132    struct brw_reg dst0_uw, dst1_uw;
 133
 134    brw_push_insn_state(p);
 135    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 136
 137    if (c->dispatch_width == 16) {
 138       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 139       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 140    } else {
 141       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 142       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 143    }
 144
 145    /* Calculate pixel centers by adding 1 or 0 to each of the
 146     * micro-tile coordinates passed in r1.
 147     */
 148    if (mask & WRITEMASK_X) {
 149       brw_ADD(p,
 150               dst0_uw,
 151               stride(suboffset(r1_uw, 4), 2, 4, 0),
 152               brw_imm_v(0x10101010));
 153    }
 154
 155    if (mask & WRITEMASK_Y) {
 156       brw_ADD(p,
 157               dst1_uw,
 158               stride(suboffset(r1_uw,5), 2, 4, 0),
 159               brw_imm_v(0x11001100));
 160    }
 161    brw_pop_insn_state(p);
 162 }
 163
 164 /**
 165  * Computes the screen-space x,y distance of the pixels from the start
 166  * vertex.
 167  *
 168  * This will be used in linterp or pinterp with the start vertex value
 169  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 170  * to produce interpolated attribute values.
 171  */
 172 void emit_delta_xy(struct brw_compile *p,
 173                    const struct brw_reg *dst,
 174                    GLuint mask,
 175                    const struct brw_reg *arg0)
 176 {
 177    struct intel_context *intel = &p->brw->intel;
 178    struct brw_reg r1 = brw_vec1_grf(1, 0);
 179
 180    if (mask == 0)
 181       return;
 182
 183    assert(mask == WRITEMASK_XY);
 184
 185    if (intel->gen >= 6) {
 186        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 187           Just add them with 0.0 for dst reg.. */
 188        r1 = brw_imm_v(0x00000000);
 189        brw_ADD(p,
 190                dst[0],
 191                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 192                r1);
 193        brw_ADD(p,
 194                dst[1],
 195                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 196                r1);
 197        return;
 198    }
 199
 200    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 201     * centers produced by emit_pixel_xy().
 202     */
 203    brw_ADD(p,
 204            dst[0],
 205            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 206            negate(r1));
 207    brw_ADD(p,
 208            dst[1],
 209            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 210            negate(suboffset(r1,1)));
 211 }
 212
 213 /**
 214  * Computes the pixel offset from the window origin for gl_FragCoord().
 215  */
 216 void emit_wpos_xy(struct brw_wm_compile *c,
 217                   const struct brw_reg *dst,
 218                   GLuint mask,
 219                   const struct brw_reg *arg0)
 220 {
 221    struct brw_compile *p = &c->func;
 222    struct intel_context *intel = &p->brw->intel;
 223    struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
 224    struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
 225
 226    if (mask & WRITEMASK_X) {
 227       if (intel->gen >= 6) {
 228          struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
 229          brw_MOV(p, delta_x_f, delta_x);
 230          delta_x = delta_x_f;
 231       }
 232
 233       if (c->fp->program.PixelCenterInteger) {
 234          /* X' = X */
 235          brw_MOV(p, dst[0], delta_x);
 236       } else {
 237          /* X' = X + 0.5 */
 238          brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
 239       }
 240    }
 241
 242    if (mask & WRITEMASK_Y) {
 243       if (intel->gen >= 6) {
 244          struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
 245          brw_MOV(p, delta_y_f, delta_y);
 246          delta_y = delta_y_f;
 247       }
 248
 249       if (c->fp->program.OriginUpperLeft) {
 250          if (c->fp->program.PixelCenterInteger) {
 251             /* Y' = Y */
 252             brw_MOV(p, dst[1], delta_y);
 253          } else {
 254             brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
 255          }
 256       } else {
 257          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 258
 259          /* Y' = (height - 1) - Y + center */
 260          brw_ADD(p, dst[1], negate(delta_y),
 261                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 262       }
 263    }
 264 }
 265
 266
 267 void emit_pixel_w(struct brw_wm_compile *c,
 268                   const struct brw_reg *dst,
 269                   GLuint mask,
 270                   const struct brw_reg *arg0,
 271                   const struct brw_reg *deltas)
 272 {
 273    struct brw_compile *p = &c->func;
 274    struct intel_context *intel = &p->brw->intel;
 275    struct brw_reg src;
 276    struct brw_reg temp_dst;
 277
 278    if (intel->gen >= 6)
 279         temp_dst = dst[3];
 280    else
 281         temp_dst = brw_message_reg(2);
 282
 283    assert(intel->gen < 6);
 284
 285    /* Don't need this if all you are doing is interpolating color, for
 286     * instance.
 287     */
 288    if (mask & WRITEMASK_W) {
 289       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 290
 291       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 292        * result straight into a message reg.
 293        */
 294       if (can_do_pln(intel, deltas)) {
 295          brw_PLN(p, temp_dst, interp3, deltas[0]);
 296       } else {
 297          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 298          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 299       }
 300
 301       /* Calc w */
 302       if (intel->gen >= 6)
 303          src = temp_dst;
 304       else
 305          src = brw_null_reg();
 306
 307       if (c->dispatch_width == 16) {
 308          brw_math_16(p, dst[3],
 309                      BRW_MATH_FUNCTION_INV,
 310                      BRW_MATH_SATURATE_NONE,
 311                      2, src,
 312                      BRW_MATH_PRECISION_FULL);
 313       } else {
 314          brw_math(p, dst[3],
 315                   BRW_MATH_FUNCTION_INV,
 316                   BRW_MATH_SATURATE_NONE,
 317                   2, src,
 318                   BRW_MATH_DATA_VECTOR,
 319                   BRW_MATH_PRECISION_FULL);
 320       }
 321    }
 322 }
 323
 324 void emit_linterp(struct brw_compile *p,
 325                   const struct brw_reg *dst,
 326                   GLuint mask,
 327                   const struct brw_reg *arg0,
 328                   const struct brw_reg *deltas)
 329 {
 330    struct intel_context *intel = &p->brw->intel;
 331    struct brw_reg interp[4];
 332    GLuint nr = arg0[0].nr;
 333    GLuint i;
 334
 335    interp[0] = brw_vec1_grf(nr, 0);
 336    interp[1] = brw_vec1_grf(nr, 4);
 337    interp[2] = brw_vec1_grf(nr+1, 0);
 338    interp[3] = brw_vec1_grf(nr+1, 4);
 339
 340    for (i = 0; i < 4; i++) {
 341       if (mask & (1<<i)) {
 342          if (intel->gen >= 6) {
 343             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 344          } else if (can_do_pln(intel, deltas)) {
 345             brw_PLN(p, dst[i], interp[i], deltas[0]);
 346          } else {
 347             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 348             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 349          }
 350       }
 351    }
 352 }
 353
 354
 355 void emit_pinterp(struct brw_compile *p,
 356                   const struct brw_reg *dst,
 357                   GLuint mask,
 358                   const struct brw_reg *arg0,
 359                   const struct brw_reg *deltas,
 360                   const struct brw_reg *w)
 361 {
 362    struct intel_context *intel = &p->brw->intel;
 363    struct brw_reg interp[4];
 364    GLuint nr = arg0[0].nr;
 365    GLuint i;
 366
 367    if (intel->gen >= 6) {
 368       emit_linterp(p, dst, mask, arg0, interp);
 369       return;
 370    }
 371
 372    interp[0] = brw_vec1_grf(nr, 0);
 373    interp[1] = brw_vec1_grf(nr, 4);
 374    interp[2] = brw_vec1_grf(nr+1, 0);
 375    interp[3] = brw_vec1_grf(nr+1, 4);
 376
 377    for (i = 0; i < 4; i++) {
 378       if (mask & (1<<i)) {
 379          if (can_do_pln(intel, deltas)) {
 380             brw_PLN(p, dst[i], interp[i], deltas[0]);
 381          } else {
 382             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 383             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 384          }
 385       }
 386    }
 387    for (i = 0; i < 4; i++) {
 388       if (mask & (1<<i)) {
 389          brw_MUL(p, dst[i], dst[i], w[3]);
 390       }
 391    }
 392 }
 393
 394
 395 void emit_cinterp(struct brw_compile *p,
 396                   const struct brw_reg *dst,
 397                   GLuint mask,
 398                   const struct brw_reg *arg0)
 399 {
 400    struct brw_reg interp[4];
 401    GLuint nr = arg0[0].nr;
 402    GLuint i;
 403
 404    interp[0] = brw_vec1_grf(nr, 0);
 405    interp[1] = brw_vec1_grf(nr, 4);
 406    interp[2] = brw_vec1_grf(nr+1, 0);
 407    interp[3] = brw_vec1_grf(nr+1, 4);
 408
 409    for (i = 0; i < 4; i++) {
 410       if (mask & (1<<i)) {
 411          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 412       }
 413    }
 414 }
 415
 416 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 417 void emit_frontfacing(struct brw_compile *p,
 418                       const struct brw_reg *dst,
 419                       GLuint mask)
 420 {
 421    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 422    GLuint i;
 423
 424    if (!(mask & WRITEMASK_XYZW))
 425       return;
 426
 427    for (i = 0; i < 4; i++) {
 428       if (mask & (1<<i)) {
 429          brw_MOV(p, dst[i], brw_imm_f(0.0));
 430       }
 431    }
 432
 433    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 434     * us front face
 435     */
 436    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 437    for (i = 0; i < 4; i++) {
 438       if (mask & (1<<i)) {
 439          brw_MOV(p, dst[i], brw_imm_f(1.0));
 440       }
 441    }
 442    brw_set_predicate_control_flag_value(p, 0xff);
 443 }
 444
 445 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 446  * looking like:
 447  *
 448  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 449  *
 450  * and we're trying to produce:
 451  *
 452  *           DDX                     DDY
 453  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 454  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 455  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 456  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 457  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 458  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 459  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 460  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 461  *
 462  * and add another set of two more subspans if in 16-pixel dispatch mode.
 463  *
 464  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 465  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 466  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 467  * between each other.  We could probably do it like ddx and swizzle the right
 468  * order later, but bail for now and just produce
 469  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 470  */
 471 void emit_ddxy(struct brw_compile *p,
 472                const struct brw_reg *dst,
 473                GLuint mask,
 474                GLboolean is_ddx,
 475                const struct brw_reg *arg0)
 476 {
 477    int i;
 478    struct brw_reg src0, src1;
 479
 480    if (mask & SATURATE)
 481       brw_set_saturate(p, 1);
 482    for (i = 0; i < 4; i++ ) {
 483       if (mask & (1<<i)) {
 484          if (is_ddx) {
 485             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 486                            BRW_REGISTER_TYPE_F,
 487                            BRW_VERTICAL_STRIDE_2,
 488                            BRW_WIDTH_2,
 489                            BRW_HORIZONTAL_STRIDE_0,
 490                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 491             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 492                            BRW_REGISTER_TYPE_F,
 493                            BRW_VERTICAL_STRIDE_2,
 494                            BRW_WIDTH_2,
 495                            BRW_HORIZONTAL_STRIDE_0,
 496                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 497          } else {
 498             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 499                            BRW_REGISTER_TYPE_F,
 500                            BRW_VERTICAL_STRIDE_4,
 501                            BRW_WIDTH_4,
 502                            BRW_HORIZONTAL_STRIDE_0,
 503                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 504             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 505                            BRW_REGISTER_TYPE_F,
 506                            BRW_VERTICAL_STRIDE_4,
 507                            BRW_WIDTH_4,
 508                            BRW_HORIZONTAL_STRIDE_0,
 509                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 510          }
 511          brw_ADD(p, dst[i], src0, negate(src1));
 512       }
 513    }
 514    if (mask & SATURATE)
 515       brw_set_saturate(p, 0);
 516 }
 517
 518 void emit_alu1(struct brw_compile *p,
 519                struct brw_instruction *(*func)(struct brw_compile *,
 520                                                struct brw_reg,
 521                                                struct brw_reg),
 522                const struct brw_reg *dst,
 523                GLuint mask,
 524                const struct brw_reg *arg0)
 525 {
 526    GLuint i;
 527
 528    if (mask & SATURATE)
 529       brw_set_saturate(p, 1);
 530
 531    for (i = 0; i < 4; i++) {
 532       if (mask & (1<<i)) {
 533          func(p, dst[i], arg0[i]);
 534       }
 535    }
 536
 537    if (mask & SATURATE)
 538       brw_set_saturate(p, 0);
 539 }
 540
 541
 542 void emit_alu2(struct brw_compile *p,
 543                struct brw_instruction *(*func)(struct brw_compile *,
 544                                                struct brw_reg,
 545                                                struct brw_reg,
 546                                                struct brw_reg),
 547                const struct brw_reg *dst,
 548                GLuint mask,
 549                const struct brw_reg *arg0,
 550                const struct brw_reg *arg1)
 551 {
 552    GLuint i;
 553
 554    if (mask & SATURATE)
 555       brw_set_saturate(p, 1);
 556
 557    for (i = 0; i < 4; i++) {
 558       if (mask & (1<<i)) {
 559          func(p, dst[i], arg0[i], arg1[i]);
 560       }
 561    }
 562
 563    if (mask & SATURATE)
 564       brw_set_saturate(p, 0);
 565 }
 566
 567
 568 void emit_mad(struct brw_compile *p,
 569               const struct brw_reg *dst,
 570               GLuint mask,
 571               const struct brw_reg *arg0,
 572               const struct brw_reg *arg1,
 573               const struct brw_reg *arg2)
 574 {
 575    GLuint i;
 576
 577    for (i = 0; i < 4; i++) {
 578       if (mask & (1<<i)) {
 579          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 580
 581          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 582          brw_ADD(p, dst[i], dst[i], arg2[i]);
 583          brw_set_saturate(p, 0);
 584       }
 585    }
 586 }
 587
 588 void emit_lrp(struct brw_compile *p,
 589               const struct brw_reg *dst,
 590               GLuint mask,
 591               const struct brw_reg *arg0,
 592               const struct brw_reg *arg1,
 593               const struct brw_reg *arg2)
 594 {
 595    GLuint i;
 596
 597    /* Uses dst as a temporary:
 598     */
 599    for (i = 0; i < 4; i++) {
 600       if (mask & (1<<i)) {
 601          /* Can I use the LINE instruction for this?
 602           */
 603          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 604          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 605
 606          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 607          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 608          brw_set_saturate(p, 0);
 609       }
 610    }
 611 }
 612
 613 void emit_sop(struct brw_compile *p,
 614               const struct brw_reg *dst,
 615               GLuint mask,
 616               GLuint cond,
 617               const struct brw_reg *arg0,
 618               const struct brw_reg *arg1)
 619 {
 620    GLuint i;
 621
 622    for (i = 0; i < 4; i++) {
 623       if (mask & (1<<i)) {
 624          brw_push_insn_state(p);
 625          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 626          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 627          brw_MOV(p, dst[i], brw_imm_f(0));
 628          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 629          brw_MOV(p, dst[i], brw_imm_f(1.0));
 630          brw_pop_insn_state(p);
 631       }
 632    }
 633 }
 634
 635 static void emit_slt( struct brw_compile *p,
 636                       const struct brw_reg *dst,
 637                       GLuint mask,
 638                       const struct brw_reg *arg0,
 639                       const struct brw_reg *arg1 )
 640 {
 641    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 642 }
 643
 644 static void emit_sle( struct brw_compile *p,
 645                       const struct brw_reg *dst,
 646                       GLuint mask,
 647                       const struct brw_reg *arg0,
 648                       const struct brw_reg *arg1 )
 649 {
 650    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 651 }
 652
 653 static void emit_sgt( struct brw_compile *p,
 654                       const struct brw_reg *dst,
 655                       GLuint mask,
 656                       const struct brw_reg *arg0,
 657                       const struct brw_reg *arg1 )
 658 {
 659    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 660 }
 661
 662 static void emit_sge( struct brw_compile *p,
 663                       const struct brw_reg *dst,
 664                       GLuint mask,
 665                       const struct brw_reg *arg0,
 666                       const struct brw_reg *arg1 )
 667 {
 668    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 669 }
 670
 671 static void emit_seq( struct brw_compile *p,
 672                       const struct brw_reg *dst,
 673                       GLuint mask,
 674                       const struct brw_reg *arg0,
 675                       const struct brw_reg *arg1 )
 676 {
 677    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 678 }
 679
 680 static void emit_sne( struct brw_compile *p,
 681                       const struct brw_reg *dst,
 682                       GLuint mask,
 683                       const struct brw_reg *arg0,
 684                       const struct brw_reg *arg1 )
 685 {
 686    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 687 }
 688
 689 void emit_cmp(struct brw_compile *p,
 690               const struct brw_reg *dst,
 691               GLuint mask,
 692               const struct brw_reg *arg0,
 693               const struct brw_reg *arg1,
 694               const struct brw_reg *arg2)
 695 {
 696    GLuint i;
 697
 698    for (i = 0; i < 4; i++) {
 699       if (mask & (1<<i)) {
 700          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 701
 702          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 703          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 704          brw_set_saturate(p, 0);
 705          brw_set_predicate_control_flag_value(p, 0xff);
 706       }
 707    }
 708 }
 709
 710 void emit_sign(struct brw_compile *p,
 711                const struct brw_reg *dst,
 712                GLuint mask,
 713                const struct brw_reg *arg0)
 714 {
 715    GLuint i;
 716
 717    for (i = 0; i < 4; i++) {
 718       if (mask & (1<<i)) {
 719          brw_MOV(p, dst[i], brw_imm_f(0.0));
 720
 721          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 722          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 723          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 724
 725          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 726          brw_MOV(p, dst[i], brw_imm_f(1.0));
 727          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 728       }
 729    }
 730 }
 731
 732 void emit_max(struct brw_compile *p,
 733               const struct brw_reg *dst,
 734               GLuint mask,
 735               const struct brw_reg *arg0,
 736               const struct brw_reg *arg1)
 737 {
 738    GLuint i;
 739
 740    for (i = 0; i < 4; i++) {
 741       if (mask & (1<<i)) {
 742          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 743
 744          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 745          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 746          brw_set_saturate(p, 0);
 747          brw_set_predicate_control_flag_value(p, 0xff);
 748       }
 749    }
 750 }
 751
 752 void emit_min(struct brw_compile *p,
 753               const struct brw_reg *dst,
 754               GLuint mask,
 755               const struct brw_reg *arg0,
 756               const struct brw_reg *arg1)
 757 {
 758    GLuint i;
 759
 760    for (i = 0; i < 4; i++) {
 761       if (mask & (1<<i)) {
 762          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 763
 764          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 765          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 766          brw_set_saturate(p, 0);
 767          brw_set_predicate_control_flag_value(p, 0xff);
 768       }
 769    }
 770 }
 771
 772
 773 void emit_dp2(struct brw_compile *p,
 774               const struct brw_reg *dst,
 775               GLuint mask,
 776               const struct brw_reg *arg0,
 777               const struct brw_reg *arg1)
 778 {
 779    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 780
 781    if (!(mask & WRITEMASK_XYZW))
 782       return; /* Do not emit dead code */
 783
 784    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 785
 786    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 787
 788    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 789    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 790    brw_set_saturate(p, 0);
 791 }
 792
 793
 794 void emit_dp3(struct brw_compile *p,
 795               const struct brw_reg *dst,
 796               GLuint mask,
 797               const struct brw_reg *arg0,
 798               const struct brw_reg *arg1)
 799 {
 800    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 801
 802    if (!(mask & WRITEMASK_XYZW))
 803       return; /* Do not emit dead code */
 804
 805    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 806
 807    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 808    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 809
 810    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 811    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 812    brw_set_saturate(p, 0);
 813 }
 814
 815
 816 void emit_dp4(struct brw_compile *p,
 817               const struct brw_reg *dst,
 818               GLuint mask,
 819               const struct brw_reg *arg0,
 820               const struct brw_reg *arg1)
 821 {
 822    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 823
 824    if (!(mask & WRITEMASK_XYZW))
 825       return; /* Do not emit dead code */
 826
 827    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 828
 829    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 830    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 831    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 832
 833    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 834    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 835    brw_set_saturate(p, 0);
 836 }
 837
 838
 839 void emit_dph(struct brw_compile *p,
 840               const struct brw_reg *dst,
 841               GLuint mask,
 842               const struct brw_reg *arg0,
 843               const struct brw_reg *arg1)
 844 {
 845    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 846
 847    if (!(mask & WRITEMASK_XYZW))
 848       return; /* Do not emit dead code */
 849
 850    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 851
 852    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 853    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 854    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 855
 856    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 857    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 858    brw_set_saturate(p, 0);
 859 }
 860
 861
 862 void emit_xpd(struct brw_compile *p,
 863               const struct brw_reg *dst,
 864               GLuint mask,
 865               const struct brw_reg *arg0,
 866               const struct brw_reg *arg1)
 867 {
 868    GLuint i;
 869
 870    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 871
 872    for (i = 0 ; i < 3; i++) {
 873       if (mask & (1<<i)) {
 874          GLuint i2 = (i+2)%3;
 875          GLuint i1 = (i+1)%3;
 876
 877          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 878
 879          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 880          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 881          brw_set_saturate(p, 0);
 882       }
 883    }
 884 }
 885
 886
 887 void emit_math1(struct brw_wm_compile *c,
 888                 GLuint function,
 889                 const struct brw_reg *dst,
 890                 GLuint mask,
 891                 const struct brw_reg *arg0)
 892 {
 893    struct brw_compile *p = &c->func;
 894    struct intel_context *intel = &p->brw->intel;
 895    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 896    GLuint saturate = ((mask & SATURATE) ?
 897                       BRW_MATH_SATURATE_SATURATE :
 898                       BRW_MATH_SATURATE_NONE);
 899    struct brw_reg src;
 900
 901    if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
 902                             arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
 903                            arg0[0].negate || arg0[0].abs)) {
 904       /* Gen6 math requires that source and dst horizontal stride be 1,
 905        * and that the argument be in the GRF.
 906        *
 907        * The hardware ignores source modifiers (negate and abs) on math
 908        * instructions, so we also move to a temp to set those up.
 909        */
 910       src = dst[dst_chan];
 911       brw_MOV(p, src, arg0[0]);
 912    } else {
 913       src = arg0[0];
 914    }
 915
 916    if (!(mask & WRITEMASK_XYZW))
 917       return; /* Do not emit dead code */
 918
 919    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 920
 921    /* Send two messages to perform all 16 operations:
 922     */
 923    brw_push_insn_state(p);
 924    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 925    brw_math(p,
 926             dst[dst_chan],
 927             function,
 928             saturate,
 929             2,
 930             src,
 931             BRW_MATH_DATA_VECTOR,
 932             BRW_MATH_PRECISION_FULL);
 933
 934    if (c->dispatch_width == 16) {
 935       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 936       brw_math(p,
 937                offset(dst[dst_chan],1),
 938                function,
 939                saturate,
 940                3,
 941                sechalf(src),
 942                BRW_MATH_DATA_VECTOR,
 943                BRW_MATH_PRECISION_FULL);
 944    }
 945    brw_pop_insn_state(p);
 946 }
 947
 948
 949 void emit_math2(struct brw_wm_compile *c,
 950                 GLuint function,
 951                 const struct brw_reg *dst,
 952                 GLuint mask,
 953                 const struct brw_reg *arg0,
 954                 const struct brw_reg *arg1)
 955 {
 956    struct brw_compile *p = &c->func;
 957    struct intel_context *intel = &p->brw->intel;
 958    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 959
 960    if (!(mask & WRITEMASK_XYZW))
 961       return; /* Do not emit dead code */
 962
 963    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 964
 965    brw_push_insn_state(p);
 966
 967    /* math can only operate on up to a vec8 at a time, so in
 968     * dispatch_width==16 we have to do the second half manually.
 969     */
 970    if (intel->gen >= 6) {
 971       struct brw_reg src0 = arg0[0];
 972       struct brw_reg src1 = arg1[0];
 973       struct brw_reg temp_dst = dst[dst_chan];
 974
 975       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 976          brw_MOV(p, temp_dst, src0);
 977          src0 = temp_dst;
 978       }
 979
 980       if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 981          /* This is a heinous hack to get a temporary register for use
 982           * in case both arg0 and arg1 are constants.  Why you're
 983           * doing exponentiation on constant values in the shader, we
 984           * don't know.
 985           *
 986           * max_wm_grf is almost surely less than the maximum GRF, and
 987           * gen6 doesn't care about the number of GRFs used in a
 988           * shader like pre-gen6 did.
 989           */
 990          struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
 991          brw_MOV(p, temp, src1);
 992          src1 = temp;
 993       }
 994
 995       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 996       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 997       brw_math2(p,
 998                 temp_dst,
 999                 function,
1000                 src0,
1001                 src1);
1002       if (c->dispatch_width == 16) {
1003          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1004          brw_math2(p,
1005                    sechalf(temp_dst),
1006                    function,
1007                    sechalf(src0),
1008                    sechalf(src1));
1009       }
1010    } else {
1011       GLuint saturate = ((mask & SATURATE) ?
1012                          BRW_MATH_SATURATE_SATURATE :
1013                          BRW_MATH_SATURATE_NONE);
1014
1015       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1016       brw_MOV(p, brw_message_reg(3), arg1[0]);
1017       if (c->dispatch_width == 16) {
1018          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1019          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1020       }
1021
1022       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1023       brw_math(p,
1024                dst[dst_chan],
1025                function,
1026                saturate,
1027                2,
1028                arg0[0],
1029                BRW_MATH_DATA_VECTOR,
1030                BRW_MATH_PRECISION_FULL);
1031
1032       /* Send two messages to perform all 16 operations:
1033        */
1034       if (c->dispatch_width == 16) {
1035          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1036          brw_math(p,
1037                   offset(dst[dst_chan],1),
1038                   function,
1039                   saturate,
1040                   4,
1041                   sechalf(arg0[0]),
1042                   BRW_MATH_DATA_VECTOR,
1043                   BRW_MATH_PRECISION_FULL);
1044       }
1045    }
1046    brw_pop_insn_state(p);
1047 }
1048
1049
1050 void emit_tex(struct brw_wm_compile *c,
1051               struct brw_reg *dst,
1052               GLuint dst_flags,
1053               struct brw_reg *arg,
1054               struct brw_reg depth_payload,
1055               GLuint tex_idx,
1056               GLuint sampler,
1057               GLboolean shadow)
1058 {
1059    struct brw_compile *p = &c->func;
1060    struct intel_context *intel = &p->brw->intel;
1061    struct brw_reg dst_retyped;
1062    GLuint cur_mrf = 2, response_length;
1063    GLuint i, nr_texcoords;
1064    GLuint emit;
1065    GLuint msg_type;
1066    GLuint mrf_per_channel;
1067    GLuint simd_mode;
1068
1069    if (c->dispatch_width == 16) {
1070       mrf_per_channel = 2;
1071       response_length = 8;
1072       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1073       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1074    } else {
1075       mrf_per_channel = 1;
1076       response_length = 4;
1077       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1078       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1079    }
1080
1081    /* How many input regs are there?
1082     */
1083    switch (tex_idx) {
1084    case TEXTURE_1D_INDEX:
1085       emit = WRITEMASK_X;
1086       nr_texcoords = 1;
1087       break;
1088    case TEXTURE_2D_INDEX:
1089    case TEXTURE_RECT_INDEX:
1090       emit = WRITEMASK_XY;
1091       nr_texcoords = 2;
1092       break;
1093    case TEXTURE_3D_INDEX:
1094    case TEXTURE_CUBE_INDEX:
1095       emit = WRITEMASK_XYZ;
1096       nr_texcoords = 3;
1097       break;
1098    default:
1099       /* unexpected target */
1100       abort();
1101    }
1102
1103    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1104    if (intel->gen < 5 && c->dispatch_width == 8)
1105       nr_texcoords = 3;
1106
1107    /* For shadow comparisons, we have to supply u,v,r. */
1108    if (shadow)
1109       nr_texcoords = 3;
1110
1111    /* Emit the texcoords. */
1112    for (i = 0; i < nr_texcoords; i++) {
1113       if (emit & (1<<i))
1114          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1115       else
1116          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1117       cur_mrf += mrf_per_channel;
1118    }
1119
1120    /* Fill in the shadow comparison reference value. */
1121    if (shadow) {
1122       if (intel->gen >= 5) {
1123          /* Fill in the cube map array index value. */
1124          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1125          cur_mrf += mrf_per_channel;
1126       } else if (c->dispatch_width == 8) {
1127          /* Fill in the LOD bias value. */
1128          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1129          cur_mrf += mrf_per_channel;
1130       }
1131       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1132       cur_mrf += mrf_per_channel;
1133    }
1134
1135    if (intel->gen >= 5) {
1136       if (shadow)
1137          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1138       else
1139          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1140    } else {
1141       /* Note that G45 and older determines shadow compare and dispatch width
1142        * from message length for most messages.
1143        */
1144       if (c->dispatch_width == 16 && shadow)
1145          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1146       else
1147          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1148    }
1149
1150    brw_SAMPLE(p,
1151               dst_retyped,
1152               1,
1153               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1154               SURF_INDEX_TEXTURE(sampler),
1155               sampler,
1156               dst_flags & WRITEMASK_XYZW,
1157               msg_type,
1158               response_length,
1159               cur_mrf - 1,
1160               0,
1161               1,
1162               simd_mode);
1163 }
1164
1165
1166 void emit_txb(struct brw_wm_compile *c,
1167               struct brw_reg *dst,
1168               GLuint dst_flags,
1169               struct brw_reg *arg,
1170               struct brw_reg depth_payload,
1171               GLuint tex_idx,
1172               GLuint sampler)
1173 {
1174    struct brw_compile *p = &c->func;
1175    struct intel_context *intel = &p->brw->intel;
1176    GLuint msgLength;
1177    GLuint msg_type;
1178    GLuint mrf_per_channel;
1179    GLuint response_length;
1180    struct brw_reg dst_retyped;
1181
1182    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1183     * samples, so we'll use the 16-wide instruction, leave the second halves
1184     * undefined, and trust the execution mask to keep the undefined pixels
1185     * from mattering.
1186     */
1187    if (c->dispatch_width == 16 || intel->gen < 5) {
1188       if (intel->gen >= 5)
1189          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1190       else
1191          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1192       mrf_per_channel = 2;
1193       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1194       response_length = 8;
1195    } else {
1196       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1197       mrf_per_channel = 1;
1198       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1199       response_length = 4;
1200    }
1201
1202    /* Shadow ignored for txb. */
1203    switch (tex_idx) {
1204    case TEXTURE_1D_INDEX:
1205       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1206       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1207       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1208       break;
1209    case TEXTURE_2D_INDEX:
1210    case TEXTURE_RECT_INDEX:
1211       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1212       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1213       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1214       break;
1215    case TEXTURE_3D_INDEX:
1216    case TEXTURE_CUBE_INDEX:
1217       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1218       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1219       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1220       break;
1221    default:
1222       /* unexpected target */
1223       abort();
1224    }
1225
1226    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1227    msgLength = 2 + 4 * mrf_per_channel - 1;
1228
1229    brw_SAMPLE(p,
1230               dst_retyped,
1231               1,
1232               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1233               SURF_INDEX_TEXTURE(sampler),
1234               sampler,
1235               dst_flags & WRITEMASK_XYZW,
1236               msg_type,
1237               response_length,
1238               msgLength,
1239               0,
1240               1,
1241               BRW_SAMPLER_SIMD_MODE_SIMD16);
1242 }
1243
1244
1245 static void emit_lit(struct brw_wm_compile *c,
1246                      const struct brw_reg *dst,
1247                      GLuint mask,
1248                      const struct brw_reg *arg0)
1249 {
1250    struct brw_compile *p = &c->func;
1251
1252    assert((mask & WRITEMASK_XW) == 0);
1253
1254    if (mask & WRITEMASK_Y) {
1255       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1256       brw_MOV(p, dst[1], arg0[0]);
1257       brw_set_saturate(p, 0);
1258    }
1259
1260    if (mask & WRITEMASK_Z) {
1261       emit_math2(c, BRW_MATH_FUNCTION_POW,
1262                  &dst[2],
1263                  WRITEMASK_X | (mask & SATURATE),
1264                  &arg0[1],
1265                  &arg0[3]);
1266    }
1267
1268    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1269     * some of the POW calculations above, but 16-wide iff statements
1270     * seem to lock c1 hardware, so this is a nasty workaround:
1271     */
1272    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1273    {
1274       if (mask & WRITEMASK_Y)
1275          brw_MOV(p, dst[1], brw_imm_f(0));
1276
1277       if (mask & WRITEMASK_Z)
1278          brw_MOV(p, dst[2], brw_imm_f(0));
1279    }
1280    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1281 }
1282
1283
1284 /* Kill pixel - set execution mask to zero for those pixels which
1285  * fail.
1286  */
1287 static void emit_kil( struct brw_wm_compile *c,
1288                       struct brw_reg *arg0)
1289 {
1290    struct brw_compile *p = &c->func;
1291    struct intel_context *intel = &p->brw->intel;
1292    struct brw_reg pixelmask;
1293    GLuint i, j;
1294
1295    if (intel->gen >= 6)
1296       pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1297    else
1298       pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1299
1300    for (i = 0; i < 4; i++) {
1301       /* Check if we've already done the comparison for this reg
1302        * -- common when someone does KIL TEMP.wwww.
1303        */
1304       for (j = 0; j < i; j++) {
1305          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1306             break;
1307       }
1308       if (j != i)
1309          continue;
1310
1311       brw_push_insn_state(p);
1312       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1313       brw_set_predicate_control_flag_value(p, 0xff);
1314       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1315       brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1316       brw_pop_insn_state(p);
1317    }
1318 }
1319
1320 static void fire_fb_write( struct brw_wm_compile *c,
1321                            GLuint base_reg,
1322                            GLuint nr,
1323                            GLuint target,
1324                            GLuint eot )
1325 {
1326    struct brw_compile *p = &c->func;
1327    struct intel_context *intel = &p->brw->intel;
1328
1329    /* Pass through control information:
1330     *
1331     * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1332     */
1333 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1334    if (intel->gen < 6)
1335    {
1336       brw_push_insn_state(p);
1337       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1338       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1339       brw_MOV(p,
1340                brw_message_reg(base_reg + 1),
1341                brw_vec8_grf(1, 0));
1342       brw_pop_insn_state(p);
1343    }
1344
1345    /* Send framebuffer write message: */
1346 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1347    brw_fb_WRITE(p,
1348                 c->dispatch_width,
1349                 base_reg,
1350                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1351                 target,
1352                 nr,
1353                 0,
1354                 eot,
1355                 GL_TRUE);
1356 }
1357
1358
1359 static void emit_aa( struct brw_wm_compile *c,
1360                      struct brw_reg *arg1,
1361                      GLuint reg )
1362 {
1363    struct brw_compile *p = &c->func;
1364    GLuint comp = c->aa_dest_stencil_reg / 2;
1365    GLuint off = c->aa_dest_stencil_reg % 2;
1366    struct brw_reg aa = offset(arg1[comp], off);
1367
1368    brw_push_insn_state(p);
1369    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1370    brw_MOV(p, brw_message_reg(reg), aa);
1371    brw_pop_insn_state(p);
1372 }
1373
1374
1375 /* Post-fragment-program processing.  Send the results to the
1376  * framebuffer.
1377  * \param arg0  the fragment color
1378  * \param arg1  the pass-through depth value
1379  * \param arg2  the shader-computed depth value
1380  */
1381 void emit_fb_write(struct brw_wm_compile *c,
1382                    struct brw_reg *arg0,
1383                    struct brw_reg *arg1,
1384                    struct brw_reg *arg2,
1385                    GLuint target,
1386                    GLuint eot)
1387 {
1388    struct brw_compile *p = &c->func;
1389    struct brw_context *brw = p->brw;
1390    struct intel_context *intel = &brw->intel;
1391    GLuint nr = 2;
1392    GLuint channel;
1393
1394    /* Reserve a space for AA - may not be needed:
1395     */
1396    if (c->aa_dest_stencil_reg)
1397       nr += 1;
1398
1399    /* I don't really understand how this achieves the color interleave
1400     * (ie RGBARGBA) in the result:  [Do the saturation here]
1401     */
1402    brw_push_insn_state(p);
1403
1404    if (c->key.clamp_fragment_color)
1405       brw_set_saturate(p, 1);
1406
1407    for (channel = 0; channel < 4; channel++) {
1408       if (intel->gen >= 6) {
1409          /* gen6 SIMD16 single source DP write looks like:
1410           * m + 0: r0
1411           * m + 1: r1
1412           * m + 2: g0
1413           * m + 3: g1
1414           * m + 4: b0
1415           * m + 5: b1
1416           * m + 6: a0
1417           * m + 7: a1
1418           */
1419          if (c->dispatch_width == 16) {
1420             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1421          } else {
1422             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1423          }
1424       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1425          /* pre-gen6 SIMD16 single source DP write looks like:
1426           * m + 0: r0
1427           * m + 1: g0
1428           * m + 2: b0
1429           * m + 3: a0
1430           * m + 4: r1
1431           * m + 5: g1
1432           * m + 6: b1
1433           * m + 7: a1
1434           *
1435           * By setting the high bit of the MRF register number, we indicate
1436           * that we want COMPR4 mode - instead of doing the usual destination
1437           * + 1 for the second half we get destination + 4.
1438           */
1439          brw_MOV(p,
1440                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1441                  arg0[channel]);
1442       } else {
1443          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1444          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1445          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1446          brw_MOV(p,
1447                  brw_message_reg(nr + channel),
1448                  arg0[channel]);
1449
1450          if (c->dispatch_width == 16) {
1451             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1452             brw_MOV(p,
1453                     brw_message_reg(nr + channel + 4),
1454                     sechalf(arg0[channel]));
1455          }
1456       }
1457    }
1458
1459    brw_set_saturate(p, 0);
1460
1461    /* skip over the regs populated above:
1462     */
1463    if (c->dispatch_width == 16)
1464       nr += 8;
1465    else
1466       nr += 4;
1467
1468    brw_pop_insn_state(p);
1469
1470    if (c->source_depth_to_render_target)
1471    {
1472       if (c->computes_depth)
1473          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1474       else
1475          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1476
1477       nr += 2;
1478    }
1479
1480    if (c->dest_depth_reg)
1481    {
1482       GLuint comp = c->dest_depth_reg / 2;
1483       GLuint off = c->dest_depth_reg % 2;
1484
1485       if (off != 0) {
1486          brw_push_insn_state(p);
1487          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1488
1489          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1490          /* 2nd half? */
1491          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1492          brw_pop_insn_state(p);
1493       }
1494       else {
1495          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1496       }
1497       nr += 2;
1498    }
1499
1500    if (intel->gen >= 6) {
1501       /* Load the message header.  There's no implied move from src0
1502        * to the base mrf on gen6.
1503        */
1504       brw_push_insn_state(p);
1505       brw_set_mask_control(p, BRW_MASK_DISABLE);
1506       brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1507               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1508       brw_pop_insn_state(p);
1509
1510       if (target != 0) {
1511          brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1512                                         0,
1513                                         2), BRW_REGISTER_TYPE_UD),
1514                  brw_imm_ud(target));
1515       }
1516    }
1517
1518    if (!c->runtime_check_aads_emit) {
1519       if (c->aa_dest_stencil_reg)
1520          emit_aa(c, arg1, 2);
1521
1522       fire_fb_write(c, 0, nr, target, eot);
1523    }
1524    else {
1525       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1526       struct brw_reg ip = brw_ip_reg();
1527       struct brw_instruction *jmp;
1528
1529       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1530       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1531       brw_AND(p,
1532               v1_null_ud,
1533               get_element_ud(brw_vec8_grf(1,0), 6),
1534               brw_imm_ud(1<<26));
1535
1536       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1537       {
1538          emit_aa(c, arg1, 2);
1539          fire_fb_write(c, 0, nr, target, eot);
1540          /* note - thread killed in subroutine */
1541       }
1542       brw_land_fwd_jump(p, jmp);
1543
1544       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1545        */
1546       fire_fb_write(c, 1, nr-1, target, eot);
1547    }
1548 }
1549
1550 /**
1551  * Move a GPR to scratch memory.
1552  */
1553 static void emit_spill( struct brw_wm_compile *c,
1554                         struct brw_reg reg,
1555                         GLuint slot )
1556 {
1557    struct brw_compile *p = &c->func;
1558
1559    /*
1560      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1561    */
1562    brw_MOV(p, brw_message_reg(2), reg);
1563
1564    /*
1565      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1566      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1567    */
1568    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1569 }
1570
1571
1572 /**
1573  * Load a GPR from scratch memory.
1574  */
1575 static void emit_unspill( struct brw_wm_compile *c,
1576                           struct brw_reg reg,
1577                           GLuint slot )
1578 {
1579    struct brw_compile *p = &c->func;
1580
1581    /* Slot 0 is the undef value.
1582     */
1583    if (slot == 0) {
1584       brw_MOV(p, reg, brw_imm_f(0));
1585       return;
1586    }
1587
1588    /*
1589      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1590      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1591    */
1592
1593    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1594 }
1595
1596
1597 /**
1598  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1599  * Args with unspill_reg != 0 will be loaded from scratch memory.
1600  */
1601 static void get_argument_regs( struct brw_wm_compile *c,
1602                                struct brw_wm_ref *arg[],
1603                                struct brw_reg *regs )
1604 {
1605    GLuint i;
1606
1607    for (i = 0; i < 4; i++) {
1608       if (arg[i]) {
1609          if (arg[i]->unspill_reg)
1610             emit_unspill(c,
1611                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1612                          arg[i]->value->spill_slot);
1613
1614          regs[i] = arg[i]->hw_reg;
1615       }
1616       else {
1617          regs[i] = brw_null_reg();
1618       }
1619    }
1620 }
1621
1622
1623 /**
1624  * For values that have a spill_slot!=0, write those regs to scratch memory.
1625  */
1626 static void spill_values( struct brw_wm_compile *c,
1627                           struct brw_wm_value *values,
1628                           GLuint nr )
1629 {
1630    GLuint i;
1631
1632    for (i = 0; i < nr; i++)
1633       if (values[i].spill_slot)
1634          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1635 }
1636
1637
1638 /* Emit the fragment program instructions here.
1639  */
1640 void brw_wm_emit( struct brw_wm_compile *c )
1641 {
1642    struct brw_compile *p = &c->func;
1643    struct intel_context *intel = &p->brw->intel;
1644    GLuint insn;
1645
1646    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1647    if (intel->gen >= 6)
1648         brw_set_acc_write_control(p, 1);
1649
1650    /* Check if any of the payload regs need to be spilled:
1651     */
1652    spill_values(c, c->payload.depth, 4);
1653    spill_values(c, c->creg, c->nr_creg);
1654    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1655
1656
1657    for (insn = 0; insn < c->nr_insns; insn++) {
1658
1659       struct brw_wm_instruction *inst = &c->instruction[insn];
1660       struct brw_reg args[3][4], dst[4];
1661       GLuint i, dst_flags;
1662
1663       /* Get argument regs:
1664        */
1665       for (i = 0; i < 3; i++)
1666          get_argument_regs(c, inst->src[i], args[i]);
1667
1668       /* Get dest regs:
1669        */
1670       for (i = 0; i < 4; i++)
1671          if (inst->dst[i])
1672             dst[i] = inst->dst[i]->hw_reg;
1673          else
1674             dst[i] = brw_null_reg();
1675
1676       /* Flags
1677        */
1678       dst_flags = inst->writemask;
1679       if (inst->saturate)
1680          dst_flags |= SATURATE;
1681
1682       switch (inst->opcode) {
1683          /* Generated instructions for calculating triangle interpolants:
1684           */
1685       case WM_PIXELXY:
1686          emit_pixel_xy(c, dst, dst_flags);
1687          break;
1688
1689       case WM_DELTAXY:
1690          emit_delta_xy(p, dst, dst_flags, args[0]);
1691          break;
1692
1693       case WM_WPOSXY:
1694          emit_wpos_xy(c, dst, dst_flags, args[0]);
1695          break;
1696
1697       case WM_PIXELW:
1698          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1699          break;
1700
1701       case WM_LINTERP:
1702          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1703          break;
1704
1705       case WM_PINTERP:
1706          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1707          break;
1708
1709       case WM_CINTERP:
1710          emit_cinterp(p, dst, dst_flags, args[0]);
1711          break;
1712
1713       case WM_FB_WRITE:
1714          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1715          break;
1716
1717       case WM_FRONTFACING:
1718          emit_frontfacing(p, dst, dst_flags);
1719          break;
1720
1721          /* Straightforward arithmetic:
1722           */
1723       case OPCODE_ADD:
1724          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1725          break;
1726
1727       case OPCODE_FRC:
1728          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1729          break;
1730
1731       case OPCODE_FLR:
1732          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1733          break;
1734
1735       case OPCODE_DDX:
1736          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1737          break;
1738
1739       case OPCODE_DDY:
1740          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1741          break;
1742
1743       case OPCODE_DP2:
1744          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1745          break;
1746
1747       case OPCODE_DP3:
1748          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1749          break;
1750
1751       case OPCODE_DP4:
1752          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1753          break;
1754
1755       case OPCODE_DPH:
1756          emit_dph(p, dst, dst_flags, args[0], args[1]);
1757          break;
1758
1759       case OPCODE_TRUNC:
1760          for (i = 0; i < 4; i++) {
1761             if (dst_flags & (1<<i)) {
1762                brw_RNDZ(p, dst[i], args[0][i]);
1763             }
1764          }
1765          break;
1766
1767       case OPCODE_LRP:
1768          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1769          break;
1770
1771       case OPCODE_MAD:
1772          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1773          break;
1774
1775       case OPCODE_MOV:
1776       case OPCODE_SWZ:
1777          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1778          break;
1779
1780       case OPCODE_MUL:
1781          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1782          break;
1783
1784       case OPCODE_XPD:
1785          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1786          break;
1787
1788          /* Higher math functions:
1789           */
1790       case OPCODE_RCP:
1791          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1792          break;
1793
1794       case OPCODE_RSQ:
1795          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1796          break;
1797
1798       case OPCODE_SIN:
1799          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1800          break;
1801
1802       case OPCODE_COS:
1803          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1804          break;
1805
1806       case OPCODE_EX2:
1807          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1808          break;
1809
1810       case OPCODE_LG2:
1811          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1812          break;
1813
1814       case OPCODE_SCS:
1815          /* There is an scs math function, but it would need some
1816           * fixup for 16-element execution.
1817           */
1818          if (dst_flags & WRITEMASK_X)
1819             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1820          if (dst_flags & WRITEMASK_Y)
1821             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1822          break;
1823
1824       case OPCODE_POW:
1825          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1826          break;
1827
1828          /* Comparisons:
1829           */
1830       case OPCODE_CMP:
1831          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1832          break;
1833
1834       case OPCODE_MAX:
1835          emit_max(p, dst, dst_flags, args[0], args[1]);
1836          break;
1837
1838       case OPCODE_MIN:
1839          emit_min(p, dst, dst_flags, args[0], args[1]);
1840          break;
1841
1842       case OPCODE_SLT:
1843          emit_slt(p, dst, dst_flags, args[0], args[1]);
1844          break;
1845
1846       case OPCODE_SLE:
1847          emit_sle(p, dst, dst_flags, args[0], args[1]);
1848         break;
1849       case OPCODE_SGT:
1850          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1851         break;
1852       case OPCODE_SGE:
1853          emit_sge(p, dst, dst_flags, args[0], args[1]);
1854          break;
1855       case OPCODE_SEQ:
1856          emit_seq(p, dst, dst_flags, args[0], args[1]);
1857         break;
1858       case OPCODE_SNE:
1859          emit_sne(p, dst, dst_flags, args[0], args[1]);
1860         break;
1861
1862       case OPCODE_SSG:
1863          emit_sign(p, dst, dst_flags, args[0]);
1864          break;
1865
1866       case OPCODE_LIT:
1867          emit_lit(c, dst, dst_flags, args[0]);
1868          break;
1869
1870          /* Texturing operations:
1871           */
1872       case OPCODE_TEX:
1873          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1874                   inst->tex_idx, inst->tex_unit,
1875                   inst->tex_shadow);
1876          break;
1877
1878       case OPCODE_TXB:
1879          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1880                   inst->tex_idx, inst->tex_unit);
1881          break;
1882
1883       case OPCODE_KIL:
1884          emit_kil(c, args[0]);
1885          break;
1886
1887       default:
1888          printf("Unsupported opcode %i (%s) in fragment shader\n",
1889                 inst->opcode, inst->opcode < MAX_OPCODE ?
1890                 _mesa_opcode_string(inst->opcode) :
1891                 "unknown");
1892       }
1893
1894       for (i = 0; i < 4; i++)
1895         if (inst->dst[i] && inst->dst[i]->spill_slot)
1896            emit_spill(c,
1897                       inst->dst[i]->hw_reg,
1898                       inst->dst[i]->spill_slot);
1899    }
1900
1901    /* Only properly tested on ILK */
1902    if (p->brw->intel.gen == 5) {
1903      brw_remove_duplicate_mrf_moves(p);
1904      if (c->dispatch_width == 16)
1905         brw_remove_grf_to_mrf_moves(p);
1906    }
1907
1908    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1909       int i;
1910
1911      printf("wm-native:\n");
1912      for (i = 0; i < p->nr_insn; i++)
1913          brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1914       printf("\n");
1915    }
1916 }
1917