assembler/brw_eu_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32 #include <string.h>
  33
  34 #include "brw_context.h"
  35 #include "brw_defines.h"
  36 #include "brw_eu.h"
  37
  38 #include "ralloc.h"
  39
  40 /***********************************************************************
  41  * Internal helper for constructing instructions
  42  */
  43
  44 static void guess_execution_size(struct brw_compile *p,
  45                                  struct brw_instruction *insn,
  46                                  struct brw_reg reg)
  47 {
  48    if (reg.width == BRW_WIDTH_8 && p->compressed)
  49       insn->header.execution_size = BRW_EXECUTE_16;
  50    else
  51       insn->header.execution_size = reg.width;  /* note - definitions are compatible */
  52 }
  53
  54
  55 /**
  56  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
  57  * registers, implicitly moving the operand to a message register.
  58  *
  59  * On Sandybridge, this is no longer the case.  This function performs the
  60  * explicit move; it should be called before emitting a SEND instruction.
  61  */
  62 void
  63 gen6_resolve_implied_move(struct brw_compile *p,
  64                           struct brw_reg *src,
  65                           unsigned msg_reg_nr)
  66 {
  67    struct intel_context *intel = &p->brw->intel;
  68    if (intel->gen < 6)
  69       return;
  70
  71    if (src->file == BRW_MESSAGE_REGISTER_FILE)
  72       return;
  73
  74    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
  75       brw_push_insn_state(p);
  76       brw_set_mask_control(p, BRW_MASK_DISABLE);
  77       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  78       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
  79               retype(*src, BRW_REGISTER_TYPE_UD));
  80       brw_pop_insn_state(p);
  81    }
  82    *src = brw_message_reg(msg_reg_nr);
  83 }
  84
  85 static void
  86 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
  87 {
  88    /* From the BSpec / ISA Reference / send - [DevIVB+]:
  89     * "The send with EOT should use register space R112-R127 for <src>. This is
  90     *  to enable loading of a new thread into the same slot while the message
  91     *  with EOT for current thread is pending dispatch."
  92     *
  93     * Since we're pretending to have 16 MRFs anyway, we may as well use the
  94     * registers required for messages with EOT.
  95     */
  96    struct intel_context *intel = &p->brw->intel;
  97    if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
  98       reg->file = BRW_GENERAL_REGISTER_FILE;
  99       reg->nr += GEN7_MRF_HACK_START;
 100    }
 101 }
 102
 103
 104 void
 105 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
 106              struct brw_reg dest)
 107 {
 108    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
 109        dest.file != BRW_MESSAGE_REGISTER_FILE)
 110       assert(dest.nr < 128);
 111
 112    gen7_convert_mrf_to_grf(p, &dest);
 113
 114    insn->bits1.da1.dest_reg_file = dest.file;
 115    insn->bits1.da1.dest_reg_type = dest.type;
 116    insn->bits1.da1.dest_address_mode = dest.address_mode;
 117
 118    if (dest.address_mode == BRW_ADDRESS_DIRECT) {
 119       insn->bits1.da1.dest_reg_nr = dest.nr;
 120
 121       if (insn->header.access_mode == BRW_ALIGN_1) {
 122          insn->bits1.da1.dest_subreg_nr = dest.subnr;
 123          if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
 124             dest.hstride = BRW_HORIZONTAL_STRIDE_1;
 125          insn->bits1.da1.dest_horiz_stride = dest.hstride;
 126       }
 127       else {
 128          insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
 129          insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
 130          /* even ignored in da16, still need to set as '01' */
 131          insn->bits1.da16.dest_horiz_stride = 1;
 132       }
 133    }
 134    else {
 135       insn->bits1.ia1.dest_subreg_nr = dest.subnr;
 136
 137       /* These are different sizes in align1 vs align16:
 138        */
 139       if (insn->header.access_mode == BRW_ALIGN_1) {
 140          insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
 141          if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
 142             dest.hstride = BRW_HORIZONTAL_STRIDE_1;
 143          insn->bits1.ia1.dest_horiz_stride = dest.hstride;
 144       }
 145       else {
 146          insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
 147          /* even ignored in da16, still need to set as '01' */
 148          insn->bits1.ia16.dest_horiz_stride = 1;
 149       }
 150    }
 151
 152    /* NEW: Set the execution size based on dest.width and
 153     * insn->compression_control:
 154     */
 155    guess_execution_size(p, insn, dest);
 156 }
 157
 158 extern int reg_type_size[];
 159
 160 static void
 161 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
 162 {
 163    int hstride_for_reg[] = {0, 1, 2, 4};
 164    int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
 165    int width_for_reg[] = {1, 2, 4, 8, 16};
 166    int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
 167    int width, hstride, vstride, execsize;
 168
 169    if (reg.file == BRW_IMMEDIATE_VALUE) {
 170       /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
 171        * mean the destination has to be 128-bit aligned and the
 172        * destination horiz stride has to be a word.
 173        */
 174       if (reg.type == BRW_REGISTER_TYPE_V) {
 175          assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
 176                 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
 177       }
 178
 179       return;
 180    }
 181
 182    if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 183        reg.file == BRW_ARF_NULL)
 184       return;
 185
 186    assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
 187    hstride = hstride_for_reg[reg.hstride];
 188
 189    if (reg.vstride == 0xf) {
 190       vstride = -1;
 191    } else {
 192       assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
 193       vstride = vstride_for_reg[reg.vstride];
 194    }
 195
 196    assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
 197    width = width_for_reg[reg.width];
 198
 199    assert(insn->header.execution_size >= 0 &&
 200           insn->header.execution_size < Elements(execsize_for_reg));
 201    execsize = execsize_for_reg[insn->header.execution_size];
 202
 203    /* Restrictions from 3.3.10: Register Region Restrictions. */
 204    /* 3. */
 205    assert(execsize >= width);
 206
 207    /* FIXME: the assembler has a lot of code written that triggers the
 208     * assertions commented it below. Let's paper over it (for now!) until we
 209     * can re-validate the shaders with those little inconsistencies fixed. */
 210
 211    /* 4. */
 212 #if 0
 213    if (execsize == width && hstride != 0) {
 214       assert(vstride == -1 || vstride == width * hstride);
 215    }
 216 #endif
 217
 218    /* 5. */
 219    if (execsize == width && hstride == 0) {
 220       /* no restriction on vstride. */
 221    }
 222
 223    /* 6. */
 224 #if 0
 225    if (width == 1) {
 226       assert(hstride == 0);
 227    }
 228 #endif
 229
 230    /* 7. */
 231 #if 0
 232    if (execsize == 1 && width == 1) {
 233       assert(hstride == 0);
 234       assert(vstride == 0);
 235    }
 236 #endif
 237
 238    /* 8. */
 239    if (vstride == 0 && hstride == 0) {
 240       assert(width == 1);
 241    }
 242
 243    /* 10. Check destination issues. */
 244 }
 245
 246 void
 247 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
 248              struct brw_reg reg)
 249 {
 250    struct brw_context *brw = p->brw;
 251    struct intel_context *intel = &brw->intel;
 252
 253    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
 254       assert(reg.nr < 128);
 255
 256    gen7_convert_mrf_to_grf(p, &reg);
 257
 258    if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
 259                            insn->header.opcode == BRW_OPCODE_SENDC)) {
 260       /* Any source modifiers or regions will be ignored, since this just
 261        * identifies the MRF/GRF to start reading the message contents from.
 262        * Check for some likely failures.
 263        */
 264       assert(!reg.negate);
 265       assert(!reg.abs);
 266       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
 267    }
 268
 269    validate_reg(insn, reg);
 270
 271    insn->bits1.da1.src0_reg_file = reg.file;
 272    insn->bits1.da1.src0_reg_type = reg.type;
 273    insn->bits2.da1.src0_abs = reg.abs;
 274    insn->bits2.da1.src0_negate = reg.negate;
 275    insn->bits2.da1.src0_address_mode = reg.address_mode;
 276
 277    if (reg.file == BRW_IMMEDIATE_VALUE) {
 278       insn->bits3.ud = reg.dw1.ud;
 279
 280       /* Required to set some fields in src1 as well:
 281        */
 282
 283       /* FIXME: This looks quite wrong, tempering with src1. I did not find
 284        * anything in the bspec that was hinting it woud be needed when setting
 285        * src0. before removing this one needs to run piglit.
 286
 287       insn->bits1.da1.src1_reg_file = 0;
 288       insn->bits1.da1.src1_reg_type = reg.type;
 289        */
 290    }
 291    else
 292    {
 293       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
 294          if (insn->header.access_mode == BRW_ALIGN_1) {
 295             insn->bits2.da1.src0_subreg_nr = reg.subnr;
 296             insn->bits2.da1.src0_reg_nr = reg.nr;
 297          }
 298          else {
 299             insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
 300             insn->bits2.da16.src0_reg_nr = reg.nr;
 301          }
 302       }
 303       else {
 304          insn->bits2.ia1.src0_subreg_nr = reg.subnr;
 305
 306          if (insn->header.access_mode == BRW_ALIGN_1) {
 307             insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
 308          }
 309          else {
 310             insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
 311          }
 312       }
 313
 314       if (insn->header.access_mode == BRW_ALIGN_1) {
 315
 316          /* FIXME: While this is correct, if the assembler uses that code path
 317           * the opcode generated are different and thus needs a validation
 318           * pass.
 319          if (reg.width == BRW_WIDTH_1 &&
 320              insn->header.execution_size == BRW_EXECUTE_1) {
 321             insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
 322             insn->bits2.da1.src0_width = BRW_WIDTH_1;
 323             insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
 324          }
 325          else {
 326          */
 327             insn->bits2.da1.src0_horiz_stride = reg.hstride;
 328             insn->bits2.da1.src0_width = reg.width;
 329             insn->bits2.da1.src0_vert_stride = reg.vstride;
 330      /* } */
 331       }
 332       else {
 333          insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
 334          insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
 335          insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
 336          insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
 337
 338          /* This is an oddity of the fact we're using the same
 339           * descriptions for registers in align_16 as align_1:
 340           */
 341          if (reg.vstride == BRW_VERTICAL_STRIDE_8)
 342             insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
 343          else
 344             insn->bits2.da16.src0_vert_stride = reg.vstride;
 345       }
 346    }
 347 }
 348
 349
 350 void brw_set_src1(struct brw_compile *p,
 351                   struct brw_instruction *insn,
 352                   struct brw_reg reg)
 353 {
 354    struct brw_context *brw = p->brw;
 355    struct intel_context *intel = &brw->intel;
 356
 357    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 358
 359    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
 360       assert(reg.nr < 128);
 361
 362    gen7_convert_mrf_to_grf(p, &reg);
 363
 364    validate_reg(insn, reg);
 365
 366    insn->bits1.da1.src1_reg_file = reg.file;
 367    insn->bits1.da1.src1_reg_type = reg.type;
 368    insn->bits3.da1.src1_abs = reg.abs;
 369    insn->bits3.da1.src1_negate = reg.negate;
 370    insn->bits3.da1.src1_address_mode = reg.address_mode;
 371
 372    /* Only src1 can be immediate in two-argument instructions.
 373     */
 374    assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
 375
 376    if (reg.file == BRW_IMMEDIATE_VALUE) {
 377       insn->bits3.ud = reg.dw1.ud;
 378    }
 379    else {
 380       /* It's only BRW that does not support register-indirect addressing on
 381        * src1 */
 382       assert (intel->gen >= 4 || reg.address_mode == BRW_ADDRESS_DIRECT);
 383
 384       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
 385          if (insn->header.access_mode == BRW_ALIGN_1) {
 386             insn->bits3.da1.src1_subreg_nr = reg.subnr;
 387             insn->bits3.da1.src1_reg_nr = reg.nr;
 388          }
 389          else {
 390             insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
 391             insn->bits3.da16.src1_reg_nr = reg.nr;
 392          }
 393       }
 394       else {
 395          insn->bits3.ia1.src1_subreg_nr = reg.subnr;
 396
 397          if (insn->header.access_mode == BRW_ALIGN_1)
 398             insn->bits3.ia1.src1_indirect_offset = reg.dw1.bits.indirect_offset;
 399          else
 400             insn->bits3.ia16.src1_indirect_offset = reg.dw1.bits.indirect_offset / 16;
 401       }
 402
 403       if (insn->header.access_mode == BRW_ALIGN_1) {
 404          /* FIXME: While this is correct, if the assembler uses that code path
 405           * the opcode generated are different and thus needs a validation
 406           * pass.
 407          if (reg.width == BRW_WIDTH_1 &&
 408              insn->header.execution_size == BRW_EXECUTE_1) {
 409             insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
 410             insn->bits3.da1.src1_width = BRW_WIDTH_1;
 411             insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
 412          }
 413          else { */
 414             insn->bits3.da1.src1_horiz_stride = reg.hstride;
 415             insn->bits3.da1.src1_width = reg.width;
 416             insn->bits3.da1.src1_vert_stride = reg.vstride;
 417      /* } */
 418       }
 419       else {
 420          insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
 421          insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
 422          insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
 423          insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
 424
 425          /* This is an oddity of the fact we're using the same
 426           * descriptions for registers in align_16 as align_1:
 427           */
 428          if (reg.vstride == BRW_VERTICAL_STRIDE_8)
 429             insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
 430          else
 431             insn->bits3.da16.src1_vert_stride = reg.vstride;
 432       }
 433    }
 434 }
 435
 436 /**
 437  * Set the Message Descriptor and Extended Message Descriptor fields
 438  * for SEND messages.
 439  *
 440  * \note This zeroes out the Function Control bits, so it must be called
 441  *       \b before filling out any message-specific data.  Callers can
 442  *       choose not to fill in irrelevant bits; they will be zero.
 443  */
 444 static void
 445 brw_set_message_descriptor(struct brw_compile *p,
 446                            struct brw_instruction *inst,
 447                            enum brw_message_target sfid,
 448                            unsigned msg_length,
 449                            unsigned response_length,
 450                            bool header_present,
 451                            bool end_of_thread)
 452 {
 453    struct intel_context *intel = &p->brw->intel;
 454
 455    brw_set_src1(p, inst, brw_imm_d(0));
 456
 457    if (intel->gen >= 5) {
 458       inst->bits3.generic_gen5.header_present = header_present;
 459       inst->bits3.generic_gen5.response_length = response_length;
 460       inst->bits3.generic_gen5.msg_length = msg_length;
 461       inst->bits3.generic_gen5.end_of_thread = end_of_thread;
 462
 463       if (intel->gen >= 6) {
 464          /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
 465          inst->header.destreg__conditionalmod = sfid;
 466       } else {
 467          /* Set Extended Message Descriptor (ex_desc) */
 468          inst->bits2.send_gen5.sfid = sfid;
 469          inst->bits2.send_gen5.end_of_thread = end_of_thread;
 470       }
 471    } else {
 472       inst->bits3.generic.response_length = response_length;
 473       inst->bits3.generic.msg_length = msg_length;
 474       inst->bits3.generic.msg_target = sfid;
 475       inst->bits3.generic.end_of_thread = end_of_thread;
 476    }
 477 }
 478
 479 static void brw_set_math_message( struct brw_compile *p,
 480                                   struct brw_instruction *insn,
 481                                   unsigned function,
 482                                   unsigned integer_type,
 483                                   bool low_precision,
 484                                   unsigned dataType )
 485 {
 486    struct brw_context *brw = p->brw;
 487    struct intel_context *intel = &brw->intel;
 488    unsigned msg_length;
 489    unsigned response_length;
 490
 491    /* Infer message length from the function */
 492    switch (function) {
 493    case BRW_MATH_FUNCTION_POW:
 494    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
 495    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
 496    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
 497       msg_length = 2;
 498       break;
 499    default:
 500       msg_length = 1;
 501       break;
 502    }
 503
 504    /* Infer response length from the function */
 505    switch (function) {
 506    case BRW_MATH_FUNCTION_SINCOS:
 507    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
 508       response_length = 2;
 509       break;
 510    default:
 511       response_length = 1;
 512       break;
 513    }
 514
 515
 516    brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
 517                               msg_length, response_length, false, false);
 518    if (intel->gen == 5) {
 519       insn->bits3.math_gen5.function = function;
 520       insn->bits3.math_gen5.int_type = integer_type;
 521       insn->bits3.math_gen5.precision = low_precision;
 522       insn->bits3.math_gen5.saturate = insn->header.saturate;
 523       insn->bits3.math_gen5.data_type = dataType;
 524       insn->bits3.math_gen5.snapshot = 0;
 525    } else {
 526       insn->bits3.math.function = function;
 527       insn->bits3.math.int_type = integer_type;
 528       insn->bits3.math.precision = low_precision;
 529       insn->bits3.math.saturate = insn->header.saturate;
 530       insn->bits3.math.data_type = dataType;
 531    }
 532    insn->header.saturate = 0;
 533 }
 534
 535
 536 static void brw_set_ff_sync_message(struct brw_compile *p,
 537                                     struct brw_instruction *insn,
 538                                     bool allocate,
 539                                     unsigned response_length,
 540                                     bool end_of_thread)
 541 {
 542    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 543                               1, response_length, true, end_of_thread);
 544    insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
 545    insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
 546    insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
 547    insn->bits3.urb_gen5.allocate = allocate;
 548    insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
 549    insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
 550 }
 551
 552 static void brw_set_urb_message( struct brw_compile *p,
 553                                  struct brw_instruction *insn,
 554                                  bool allocate,
 555                                  bool used,
 556                                  unsigned msg_length,
 557                                  unsigned response_length,
 558                                  bool end_of_thread,
 559                                  bool complete,
 560                                  unsigned offset,
 561                                  unsigned swizzle_control )
 562 {
 563    struct brw_context *brw = p->brw;
 564    struct intel_context *intel = &brw->intel;
 565
 566    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 567                               msg_length, response_length, true, end_of_thread);
 568    if (intel->gen == 7) {
 569       insn->bits3.urb_gen7.opcode = 0;  /* URB_WRITE_HWORD */
 570       insn->bits3.urb_gen7.offset = offset;
 571       assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
 572       insn->bits3.urb_gen7.swizzle_control = swizzle_control;
 573       /* per_slot_offset = 0 makes it ignore offsets in message header */
 574       insn->bits3.urb_gen7.per_slot_offset = 0;
 575       insn->bits3.urb_gen7.complete = complete;
 576    } else if (intel->gen >= 5) {
 577       insn->bits3.urb_gen5.opcode = 0;  /* URB_WRITE */
 578       insn->bits3.urb_gen5.offset = offset;
 579       insn->bits3.urb_gen5.swizzle_control = swizzle_control;
 580       insn->bits3.urb_gen5.allocate = allocate;
 581       insn->bits3.urb_gen5.used = used; /* ? */
 582       insn->bits3.urb_gen5.complete = complete;
 583    } else {
 584       insn->bits3.urb.opcode = 0;       /* ? */
 585       insn->bits3.urb.offset = offset;
 586       insn->bits3.urb.swizzle_control = swizzle_control;
 587       insn->bits3.urb.allocate = allocate;
 588       insn->bits3.urb.used = used;      /* ? */
 589       insn->bits3.urb.complete = complete;
 590    }
 591 }
 592
 593 void
 594 brw_set_dp_write_message(struct brw_compile *p,
 595                          struct brw_instruction *insn,
 596                          unsigned binding_table_index,
 597                          unsigned msg_control,
 598                          unsigned msg_type,
 599                          unsigned msg_length,
 600                          bool header_present,
 601                          unsigned last_render_target,
 602                          unsigned response_length,
 603                          unsigned end_of_thread,
 604                          unsigned send_commit_msg)
 605 {
 606    struct brw_context *brw = p->brw;
 607    struct intel_context *intel = &brw->intel;
 608    unsigned sfid;
 609
 610    if (intel->gen >= 7) {
 611       /* Use the Render Cache for RT writes; otherwise use the Data Cache */
 612       if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
 613          sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
 614       else
 615          sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
 616    } else if (intel->gen == 6) {
 617       /* Use the render cache for all write messages. */
 618       sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
 619    } else {
 620       sfid = BRW_SFID_DATAPORT_WRITE;
 621    }
 622
 623    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
 624                               header_present, end_of_thread);
 625
 626    if (intel->gen >= 7) {
 627       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
 628       insn->bits3.gen7_dp.msg_control = msg_control |
 629                                         last_render_target << 6;
 630       insn->bits3.gen7_dp.msg_type = msg_type;
 631    } else if (intel->gen == 6) {
 632       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
 633       insn->bits3.gen6_dp.msg_control = msg_control |
 634                                         last_render_target << 5;
 635       insn->bits3.gen6_dp.msg_type = msg_type;
 636       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
 637    } else if (intel->gen == 5) {
 638       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
 639       insn->bits3.dp_write_gen5.msg_control = msg_control;
 640       insn->bits3.dp_write_gen5.last_render_target = last_render_target;
 641       insn->bits3.dp_write_gen5.msg_type = msg_type;
 642       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
 643    } else {
 644       insn->bits3.dp_write.binding_table_index = binding_table_index;
 645       insn->bits3.dp_write.msg_control = msg_control;
 646       insn->bits3.dp_write.last_render_target = last_render_target;
 647       insn->bits3.dp_write.msg_type = msg_type;
 648       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
 649    }
 650 }
 651
 652 void
 653 brw_set_dp_read_message(struct brw_compile *p,
 654                         struct brw_instruction *insn,
 655                         unsigned binding_table_index,
 656                         unsigned msg_control,
 657                         unsigned msg_type,
 658                         unsigned target_cache,
 659                         unsigned msg_length,
 660                         bool header_present,
 661                         unsigned response_length)
 662 {
 663    struct brw_context *brw = p->brw;
 664    struct intel_context *intel = &brw->intel;
 665    unsigned sfid;
 666
 667    if (intel->gen >= 7) {
 668       sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
 669    } else if (intel->gen == 6) {
 670       if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
 671          sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
 672       else
 673          sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
 674    } else {
 675       sfid = BRW_SFID_DATAPORT_READ;
 676    }
 677
 678    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
 679                               header_present, false);
 680
 681    if (intel->gen >= 7) {
 682       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
 683       insn->bits3.gen7_dp.msg_control = msg_control;
 684       insn->bits3.gen7_dp.msg_type = msg_type;
 685    } else if (intel->gen == 6) {
 686       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
 687       insn->bits3.gen6_dp.msg_control = msg_control;
 688       insn->bits3.gen6_dp.msg_type = msg_type;
 689       insn->bits3.gen6_dp.send_commit_msg = 0;
 690    } else if (intel->gen == 5) {
 691       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
 692       insn->bits3.dp_read_gen5.msg_control = msg_control;
 693       insn->bits3.dp_read_gen5.msg_type = msg_type;
 694       insn->bits3.dp_read_gen5.target_cache = target_cache;
 695    } else if (intel->is_g4x) {
 696       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
 697       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
 698       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
 699       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
 700    } else {
 701       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
 702       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
 703       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
 704       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
 705    }
 706 }
 707
 708 void
 709 brw_set_sampler_message(struct brw_compile *p,
 710                         struct brw_instruction *insn,
 711                         unsigned binding_table_index,
 712                         unsigned sampler,
 713                         unsigned msg_type,
 714                         unsigned response_length,
 715                         unsigned msg_length,
 716                         unsigned header_present,
 717                         unsigned simd_mode,
 718                         unsigned return_format)
 719 {
 720    struct brw_context *brw = p->brw;
 721    struct intel_context *intel = &brw->intel;
 722
 723    brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
 724                               response_length, header_present, false);
 725
 726    if (intel->gen >= 7) {
 727       insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
 728       insn->bits3.sampler_gen7.sampler = sampler;
 729       insn->bits3.sampler_gen7.msg_type = msg_type;
 730       insn->bits3.sampler_gen7.simd_mode = simd_mode;
 731    } else if (intel->gen >= 5) {
 732       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
 733       insn->bits3.sampler_gen5.sampler = sampler;
 734       insn->bits3.sampler_gen5.msg_type = msg_type;
 735       insn->bits3.sampler_gen5.simd_mode = simd_mode;
 736    } else if (intel->is_g4x) {
 737       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
 738       insn->bits3.sampler_g4x.sampler = sampler;
 739       insn->bits3.sampler_g4x.msg_type = msg_type;
 740    } else {
 741       insn->bits3.sampler.binding_table_index = binding_table_index;
 742       insn->bits3.sampler.sampler = sampler;
 743       insn->bits3.sampler.msg_type = msg_type;
 744       insn->bits3.sampler.return_format = return_format;
 745    }
 746 }
 747
 748
 749 #define next_insn brw_next_insn
 750 struct brw_instruction *
 751 brw_next_insn(struct brw_compile *p, unsigned opcode)
 752 {
 753    struct brw_instruction *insn;
 754
 755    if (p->nr_insn + 1 > p->store_size) {
 756       if (0)
 757          printf("incresing the store size to %d\n", p->store_size << 1);
 758       p->store_size <<= 1;
 759       p->store = reralloc(p->mem_ctx, p->store,
 760                           struct brw_instruction, p->store_size);
 761       if (!p->store)
 762          assert(!"realloc eu store memeory failed");
 763    }
 764
 765    p->next_insn_offset += 16;
 766    insn = &p->store[p->nr_insn++];
 767    memcpy(insn, p->current, sizeof(*insn));
 768
 769    /* Reset this one-shot flag:
 770     */
 771
 772    if (p->current->header.destreg__conditionalmod) {
 773       p->current->header.destreg__conditionalmod = 0;
 774       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
 775    }
 776
 777    insn->header.opcode = opcode;
 778    return insn;
 779 }
 780
 781 static struct brw_instruction *brw_alu1( struct brw_compile *p,
 782                                          unsigned opcode,
 783                                          struct brw_reg dest,
 784                                          struct brw_reg src )
 785 {
 786    struct brw_instruction *insn = next_insn(p, opcode);
 787    brw_set_dest(p, insn, dest);
 788    brw_set_src0(p, insn, src);
 789    return insn;
 790 }
 791
 792 static struct brw_instruction *brw_alu2(struct brw_compile *p,
 793                                         unsigned opcode,
 794                                         struct brw_reg dest,
 795                                         struct brw_reg src0,
 796                                         struct brw_reg src1 )
 797 {
 798    struct brw_instruction *insn = next_insn(p, opcode);
 799    brw_set_dest(p, insn, dest);
 800    brw_set_src0(p, insn, src0);
 801    brw_set_src1(p, insn, src1);
 802    return insn;
 803 }
 804
 805 static int
 806 get_3src_subreg_nr(struct brw_reg reg)
 807 {
 808    if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
 809       assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
 810       return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
 811    } else {
 812       return reg.subnr / 4;
 813    }
 814 }
 815
 816 static int get_3src_type(int type)
 817 {
 818    assert(type == BRW_REGISTER_TYPE_F ||
 819           type == BRW_REGISTER_TYPE_D ||
 820           type == BRW_REGISTER_TYPE_UD);
 821
 822    switch(type) {
 823    case BRW_REGISTER_TYPE_F: return BRW_REGISTER_3SRC_TYPE_F;
 824    case BRW_REGISTER_TYPE_D: return BRW_REGISTER_3SRC_TYPE_D;
 825    case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_3SRC_TYPE_UD;
 826    }
 827
 828    return BRW_REGISTER_3SRC_TYPE_F;
 829 }
 830
 831 void
 832 brw_set_3src_dest(struct brw_compile *p,
 833                   struct brw_instruction *insn,
 834                   struct brw_reg dest)
 835 {
 836    gen7_convert_mrf_to_grf(p, &dest);
 837
 838    assert(insn->header.access_mode == BRW_ALIGN_16);
 839
 840    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
 841           dest.file == BRW_MESSAGE_REGISTER_FILE);
 842    assert(dest.nr < 128);
 843    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
 844    insn->bits1.da3src.dest_reg_type = get_3src_type(dest.type);
 845    insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
 846    insn->bits1.da3src.dest_reg_nr = dest.nr;
 847    insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
 848    insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
 849    guess_execution_size(p, insn, dest);
 850 }
 851
 852 void
 853 brw_set_3src_src0(struct brw_compile *p,
 854                   struct brw_instruction *insn,
 855                   struct brw_reg src0)
 856 {
 857    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
 858    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
 859    assert(src0.nr < 128);
 860    insn->bits1.da3src.src_reg_type = get_3src_type(src0.type);
 861    insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
 862    insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
 863    insn->bits2.da3src.src0_reg_nr = src0.nr;
 864    insn->bits1.da3src.src0_abs = src0.abs;
 865    insn->bits1.da3src.src0_negate = src0.negate;
 866    insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
 867 }
 868
 869 void
 870 brw_set_3src_src1(struct brw_compile *p,
 871                   struct brw_instruction *insn,
 872                   struct brw_reg src1)
 873 {
 874    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
 875    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
 876    assert(src1.nr < 128);
 877    assert(src1.type == insn->bits1.da3src.src_reg_type);
 878    insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
 879    insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
 880    insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
 881    insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
 882    insn->bits3.da3src.src1_reg_nr = src1.nr;
 883    insn->bits1.da3src.src1_abs = src1.abs;
 884    insn->bits1.da3src.src1_negate = src1.negate;
 885 }
 886
 887 void
 888 brw_set_3src_src2(struct brw_compile *p,
 889                   struct brw_instruction *insn,
 890                   struct brw_reg src2)
 891 {
 892    assert(src2.file == BRW_GENERAL_REGISTER_FILE);
 893    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
 894    assert(src2.nr < 128);
 895    assert(src2.type == insn->bits1.da3src.src_reg_type);
 896    insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
 897    insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
 898    insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
 899    insn->bits3.da3src.src2_reg_nr = src2.nr;
 900    insn->bits1.da3src.src2_abs = src2.abs;
 901    insn->bits1.da3src.src2_negate = src2.negate;
 902 }
 903
 904 static struct brw_instruction *brw_alu3(struct brw_compile *p,
 905                                         unsigned opcode,
 906                                         struct brw_reg dest,
 907                                         struct brw_reg src0,
 908                                         struct brw_reg src1,
 909                                         struct brw_reg src2)
 910 {
 911    struct brw_instruction *insn = next_insn(p, opcode);
 912    brw_set_3src_dest(p, insn, dest);
 913    brw_set_3src_src0(p, insn, src0);
 914    brw_set_3src_src1(p, insn, src1);
 915    brw_set_3src_src2(p, insn, src2);
 916    return insn;
 917 }
 918
 919
 920 /***********************************************************************
 921  * Convenience routines.
 922  */
 923 #define ALU1(OP)                                        \
 924 struct brw_instruction *brw_##OP(struct brw_compile *p, \
 925               struct brw_reg dest,                      \
 926               struct brw_reg src0)                      \
 927 {                                                       \
 928    return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);     \
 929 }
 930
 931 #define ALU2(OP)                                        \
 932 struct brw_instruction *brw_##OP(struct brw_compile *p, \
 933               struct brw_reg dest,                      \
 934               struct brw_reg src0,                      \
 935               struct brw_reg src1)                      \
 936 {                                                       \
 937    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
 938 }
 939
 940 #define ALU3(OP)                                        \
 941 struct brw_instruction *brw_##OP(struct brw_compile *p, \
 942               struct brw_reg dest,                      \
 943               struct brw_reg src0,                      \
 944               struct brw_reg src1,                      \
 945               struct brw_reg src2)                      \
 946 {                                                       \
 947    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
 948 }
 949
 950 /* Rounding operations (other than RNDD) require two instructions - the first
 951  * stores a rounded value (possibly the wrong way) in the dest register, but
 952  * also sets a per-channel "increment bit" in the flag register.  A predicated
 953  * add of 1.0 fixes dest to contain the desired result.
 954  *
 955  * Sandybridge and later appear to round correctly without an ADD.
 956  */
 957 #define ROUND(OP)                                                             \
 958 void brw_##OP(struct brw_compile *p,                                          \
 959               struct brw_reg dest,                                            \
 960               struct brw_reg src)                                             \
 961 {                                                                             \
 962    struct brw_instruction *rnd, *add;                                         \
 963    rnd = next_insn(p, BRW_OPCODE_##OP);                                       \
 964    brw_set_dest(p, rnd, dest);                                                \
 965    brw_set_src0(p, rnd, src);                                                 \
 966                                                                               \
 967    if (p->brw->intel.gen < 6) {                                               \
 968       /* turn on round-increments */                                          \
 969       rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;                \
 970       add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                          \
 971       add->header.predicate_control = BRW_PREDICATE_NORMAL;                   \
 972    }                                                                          \
 973 }
 974
 975
 976 ALU1(MOV)
 977 ALU2(SEL)
 978 ALU1(NOT)
 979 ALU2(AND)
 980 ALU2(OR)
 981 ALU2(XOR)
 982 ALU2(SHR)
 983 ALU2(SHL)
 984 ALU2(RSR)
 985 ALU2(RSL)
 986 ALU2(ASR)
 987 ALU1(FRC)
 988 ALU1(RNDD)
 989 ALU2(MAC)
 990 ALU2(MACH)
 991 ALU1(LZD)
 992 ALU2(DP4)
 993 ALU2(DPH)
 994 ALU2(DP3)
 995 ALU2(DP2)
 996 ALU2(LINE)
 997 ALU2(PLN)
 998 ALU3(MAD)
 999
1000 ROUND(RNDZ)
1001 ROUND(RNDE)
1002
1003
1004 struct brw_instruction *brw_ADD(struct brw_compile *p,
1005                                 struct brw_reg dest,
1006                                 struct brw_reg src0,
1007                                 struct brw_reg src1)
1008 {
1009    /* 6.2.2: add */
1010    if (src0.type == BRW_REGISTER_TYPE_F ||
1011        (src0.file == BRW_IMMEDIATE_VALUE &&
1012         src0.type == BRW_REGISTER_TYPE_VF)) {
1013       assert(src1.type != BRW_REGISTER_TYPE_UD);
1014       assert(src1.type != BRW_REGISTER_TYPE_D);
1015    }
1016
1017    if (src1.type == BRW_REGISTER_TYPE_F ||
1018        (src1.file == BRW_IMMEDIATE_VALUE &&
1019         src1.type == BRW_REGISTER_TYPE_VF)) {
1020       assert(src0.type != BRW_REGISTER_TYPE_UD);
1021       assert(src0.type != BRW_REGISTER_TYPE_D);
1022    }
1023
1024    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1025 }
1026
1027 struct brw_instruction *brw_AVG(struct brw_compile *p,
1028                                 struct brw_reg dest,
1029                                 struct brw_reg src0,
1030                                 struct brw_reg src1)
1031 {
1032    assert(dest.type == src0.type);
1033    assert(src0.type == src1.type);
1034    switch (src0.type) {
1035    case BRW_REGISTER_TYPE_B:
1036    case BRW_REGISTER_TYPE_UB:
1037    case BRW_REGISTER_TYPE_W:
1038    case BRW_REGISTER_TYPE_UW:
1039    case BRW_REGISTER_TYPE_D:
1040    case BRW_REGISTER_TYPE_UD:
1041       break;
1042    default:
1043       assert(!"Bad type for brw_AVG");
1044    }
1045
1046    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1047 }
1048
1049 struct brw_instruction *brw_MUL(struct brw_compile *p,
1050                                 struct brw_reg dest,
1051                                 struct brw_reg src0,
1052                                 struct brw_reg src1)
1053 {
1054    /* 6.32.38: mul */
1055    if (src0.type == BRW_REGISTER_TYPE_D ||
1056        src0.type == BRW_REGISTER_TYPE_UD ||
1057        src1.type == BRW_REGISTER_TYPE_D ||
1058        src1.type == BRW_REGISTER_TYPE_UD) {
1059       assert(dest.type != BRW_REGISTER_TYPE_F);
1060    }
1061
1062    if (src0.type == BRW_REGISTER_TYPE_F ||
1063        (src0.file == BRW_IMMEDIATE_VALUE &&
1064         src0.type == BRW_REGISTER_TYPE_VF)) {
1065       assert(src1.type != BRW_REGISTER_TYPE_UD);
1066       assert(src1.type != BRW_REGISTER_TYPE_D);
1067    }
1068
1069    if (src1.type == BRW_REGISTER_TYPE_F ||
1070        (src1.file == BRW_IMMEDIATE_VALUE &&
1071         src1.type == BRW_REGISTER_TYPE_VF)) {
1072       assert(src0.type != BRW_REGISTER_TYPE_UD);
1073       assert(src0.type != BRW_REGISTER_TYPE_D);
1074    }
1075
1076    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1077           src0.nr != BRW_ARF_ACCUMULATOR);
1078    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1079           src1.nr != BRW_ARF_ACCUMULATOR);
1080
1081    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1082 }
1083
1084
1085 void brw_NOP(struct brw_compile *p)
1086 {
1087    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1088    brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1089    brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1090    brw_set_src1(p, insn, brw_imm_ud(0x0));
1091 }
1092
1093
1094
1095
1096
1097 /***********************************************************************
1098  * Comparisons, if/else/endif
1099  */
1100
1101 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1102                                  struct brw_reg dest,
1103                                  struct brw_reg src0,
1104                                  struct brw_reg src1)
1105 {
1106    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1107
1108    insn->header.execution_size = 1;
1109    insn->header.compression_control = BRW_COMPRESSION_NONE;
1110    insn->header.mask_control = BRW_MASK_DISABLE;
1111
1112    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1113
1114    return insn;
1115 }
1116
1117 static void
1118 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1119 {
1120    p->if_stack[p->if_stack_depth] = inst - p->store;
1121
1122    p->if_stack_depth++;
1123    if (p->if_stack_array_size <= p->if_stack_depth) {
1124       p->if_stack_array_size *= 2;
1125       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1126                              p->if_stack_array_size);
1127    }
1128 }
1129
1130 static struct brw_instruction *
1131 pop_if_stack(struct brw_compile *p)
1132 {
1133    p->if_stack_depth--;
1134    return &p->store[p->if_stack[p->if_stack_depth]];
1135 }
1136
1137 static void
1138 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1139 {
1140    if (p->loop_stack_array_size < p->loop_stack_depth) {
1141       p->loop_stack_array_size *= 2;
1142       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1143                                p->loop_stack_array_size);
1144       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1145                                      p->loop_stack_array_size);
1146    }
1147
1148    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1149    p->loop_stack_depth++;
1150    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1151 }
1152
1153 static struct brw_instruction *
1154 get_inner_do_insn(struct brw_compile *p)
1155 {
1156    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1157 }
1158
1159 /* EU takes the value from the flag register and pushes it onto some
1160  * sort of a stack (presumably merging with any flag value already on
1161  * the stack).  Within an if block, the flags at the top of the stack
1162  * control execution on each channel of the unit, eg. on each of the
1163  * 16 pixel values in our wm programs.
1164  *
1165  * When the matching 'else' instruction is reached (presumably by
1166  * countdown of the instruction count patched in by our ELSE/ENDIF
1167  * functions), the relevent flags are inverted.
1168  *
1169  * When the matching 'endif' instruction is reached, the flags are
1170  * popped off.  If the stack is now empty, normal execution resumes.
1171  */
1172 struct brw_instruction *
1173 brw_IF(struct brw_compile *p, unsigned execute_size)
1174 {
1175    struct intel_context *intel = &p->brw->intel;
1176    struct brw_instruction *insn;
1177
1178    insn = next_insn(p, BRW_OPCODE_IF);
1179
1180    /* Override the defaults for this instruction:
1181     */
1182    if (intel->gen < 6) {
1183       brw_set_dest(p, insn, brw_ip_reg());
1184       brw_set_src0(p, insn, brw_ip_reg());
1185       brw_set_src1(p, insn, brw_imm_d(0x0));
1186    } else if (intel->gen == 6) {
1187       brw_set_dest(p, insn, brw_imm_w(0));
1188       insn->bits1.branch_gen6.jump_count = 0;
1189       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1190       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1191    } else {
1192       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1193       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1194       brw_set_src1(p, insn, brw_imm_ud(0));
1195       insn->bits3.break_cont.jip = 0;
1196       insn->bits3.break_cont.uip = 0;
1197    }
1198
1199    insn->header.execution_size = execute_size;
1200    insn->header.compression_control = BRW_COMPRESSION_NONE;
1201    insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1202    insn->header.mask_control = BRW_MASK_ENABLE;
1203    if (!p->single_program_flow)
1204       insn->header.thread_control = BRW_THREAD_SWITCH;
1205
1206    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1207
1208    push_if_stack(p, insn);
1209    p->if_depth_in_loop[p->loop_stack_depth]++;
1210    return insn;
1211 }
1212
1213 /* This function is only used for gen6-style IF instructions with an
1214  * embedded comparison (conditional modifier).  It is not used on gen7.
1215  */
1216 struct brw_instruction *
1217 gen6_IF(struct brw_compile *p, uint32_t conditional,
1218         struct brw_reg src0, struct brw_reg src1)
1219 {
1220    struct brw_instruction *insn;
1221
1222    insn = next_insn(p, BRW_OPCODE_IF);
1223
1224    brw_set_dest(p, insn, brw_imm_w(0));
1225    if (p->compressed) {
1226       insn->header.execution_size = BRW_EXECUTE_16;
1227    } else {
1228       insn->header.execution_size = BRW_EXECUTE_8;
1229    }
1230    insn->bits1.branch_gen6.jump_count = 0;
1231    brw_set_src0(p, insn, src0);
1232    brw_set_src1(p, insn, src1);
1233
1234    assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1235    assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1236    insn->header.destreg__conditionalmod = conditional;
1237
1238    if (!p->single_program_flow)
1239       insn->header.thread_control = BRW_THREAD_SWITCH;
1240
1241    push_if_stack(p, insn);
1242    return insn;
1243 }
1244
1245 /**
1246  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1247  */
1248 static void
1249 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1250                        struct brw_instruction *if_inst,
1251                        struct brw_instruction *else_inst)
1252 {
1253    /* The next instruction (where the ENDIF would be, if it existed) */
1254    struct brw_instruction *next_inst = &p->store[p->nr_insn];
1255
1256    assert(p->single_program_flow);
1257    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1258    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1259    assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1260
1261    /* Convert IF to an ADD instruction that moves the instruction pointer
1262     * to the first instruction of the ELSE block.  If there is no ELSE
1263     * block, point to where ENDIF would be.  Reverse the predicate.
1264     *
1265     * There's no need to execute an ENDIF since we don't need to do any
1266     * stack operations, and if we're currently executing, we just want to
1267     * continue normally.
1268     */
1269    if_inst->header.opcode = BRW_OPCODE_ADD;
1270    if_inst->header.predicate_inverse = 1;
1271
1272    if (else_inst != NULL) {
1273       /* Convert ELSE to an ADD instruction that points where the ENDIF
1274        * would be.
1275        */
1276       else_inst->header.opcode = BRW_OPCODE_ADD;
1277
1278       if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1279       else_inst->bits3.ud = (next_inst - else_inst) * 16;
1280    } else {
1281       if_inst->bits3.ud = (next_inst - if_inst) * 16;
1282    }
1283 }
1284
1285 /**
1286  * Patch IF and ELSE instructions with appropriate jump targets.
1287  */
1288 static void
1289 patch_IF_ELSE(struct brw_compile *p,
1290               struct brw_instruction *if_inst,
1291               struct brw_instruction *else_inst,
1292               struct brw_instruction *endif_inst)
1293 {
1294    struct intel_context *intel = &p->brw->intel;
1295
1296    /* We shouldn't be patching IF and ELSE instructions in single program flow
1297     * mode when gen < 6, because in single program flow mode on those
1298     * platforms, we convert flow control instructions to conditional ADDs that
1299     * operate on IP (see brw_ENDIF).
1300     *
1301     * However, on Gen6, writing to IP doesn't work in single program flow mode
1302     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1303     * not be updated by non-flow control instructions.").  And on later
1304     * platforms, there is no significant benefit to converting control flow
1305     * instructions to conditional ADDs.  So we do patch IF and ELSE
1306     * instructions in single program flow mode on those platforms.
1307     */
1308    if (intel->gen < 6)
1309       assert(!p->single_program_flow);
1310
1311    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1312    assert(endif_inst != NULL);
1313    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1314
1315    unsigned br = 1;
1316    /* Jump count is for 64bit data chunk each, so one 128bit instruction
1317     * requires 2 chunks.
1318     */
1319    if (intel->gen >= 5)
1320       br = 2;
1321
1322    assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1323    endif_inst->header.execution_size = if_inst->header.execution_size;
1324
1325    if (else_inst == NULL) {
1326       /* Patch IF -> ENDIF */
1327       if (intel->gen < 6) {
1328          /* Turn it into an IFF, which means no mask stack operations for
1329           * all-false and jumping past the ENDIF.
1330           */
1331          if_inst->header.opcode = BRW_OPCODE_IFF;
1332          if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1333          if_inst->bits3.if_else.pop_count = 0;
1334          if_inst->bits3.if_else.pad0 = 0;
1335       } else if (intel->gen == 6) {
1336          /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1337          if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1338       } else {
1339          if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1340          if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1341       }
1342    } else {
1343       else_inst->header.execution_size = if_inst->header.execution_size;
1344
1345       /* Patch IF -> ELSE */
1346       if (intel->gen < 6) {
1347          if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1348          if_inst->bits3.if_else.pop_count = 0;
1349          if_inst->bits3.if_else.pad0 = 0;
1350       } else if (intel->gen == 6) {
1351          if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1352       }
1353
1354       /* Patch ELSE -> ENDIF */
1355       if (intel->gen < 6) {
1356          /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1357           * matching ENDIF.
1358           */
1359          else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1360          else_inst->bits3.if_else.pop_count = 1;
1361          else_inst->bits3.if_else.pad0 = 0;
1362       } else if (intel->gen == 6) {
1363          /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1364          else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1365       } else {
1366          /* The IF instruction's JIP should point just past the ELSE */
1367          if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1368          /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1369          if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1370          else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1371       }
1372    }
1373 }
1374
1375 void
1376 brw_ELSE(struct brw_compile *p)
1377 {
1378    struct intel_context *intel = &p->brw->intel;
1379    struct brw_instruction *insn;
1380
1381    insn = next_insn(p, BRW_OPCODE_ELSE);
1382
1383    if (intel->gen < 6) {
1384       brw_set_dest(p, insn, brw_ip_reg());
1385       brw_set_src0(p, insn, brw_ip_reg());
1386       brw_set_src1(p, insn, brw_imm_d(0x0));
1387    } else if (intel->gen == 6) {
1388       brw_set_dest(p, insn, brw_imm_w(0));
1389       insn->bits1.branch_gen6.jump_count = 0;
1390       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1391       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1392    } else {
1393       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1394       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1395       brw_set_src1(p, insn, brw_imm_ud(0));
1396       insn->bits3.break_cont.jip = 0;
1397       insn->bits3.break_cont.uip = 0;
1398    }
1399
1400    insn->header.compression_control = BRW_COMPRESSION_NONE;
1401    insn->header.mask_control = BRW_MASK_ENABLE;
1402    if (!p->single_program_flow)
1403       insn->header.thread_control = BRW_THREAD_SWITCH;
1404
1405    push_if_stack(p, insn);
1406 }
1407
1408 void
1409 brw_ENDIF(struct brw_compile *p)
1410 {
1411    struct intel_context *intel = &p->brw->intel;
1412    struct brw_instruction *insn = NULL;
1413    struct brw_instruction *else_inst = NULL;
1414    struct brw_instruction *if_inst = NULL;
1415    struct brw_instruction *tmp;
1416    bool emit_endif = true;
1417
1418    /* In single program flow mode, we can express IF and ELSE instructions
1419     * equivalently as ADD instructions that operate on IP.  On platforms prior
1420     * to Gen6, flow control instructions cause an implied thread switch, so
1421     * this is a significant savings.
1422     *
1423     * However, on Gen6, writing to IP doesn't work in single program flow mode
1424     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1425     * not be updated by non-flow control instructions.").  And on later
1426     * platforms, there is no significant benefit to converting control flow
1427     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1428     * Gen5.
1429     */
1430    if (intel->gen < 6 && p->single_program_flow)
1431       emit_endif = false;
1432
1433    /*
1434     * A single next_insn() may change the base adress of instruction store
1435     * memory(p->store), so call it first before referencing the instruction
1436     * store pointer from an index
1437     */
1438    if (emit_endif)
1439       insn = next_insn(p, BRW_OPCODE_ENDIF);
1440
1441    /* Pop the IF and (optional) ELSE instructions from the stack */
1442    p->if_depth_in_loop[p->loop_stack_depth]--;
1443    tmp = pop_if_stack(p);
1444    if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1445       else_inst = tmp;
1446       tmp = pop_if_stack(p);
1447    }
1448    if_inst = tmp;
1449
1450    if (!emit_endif) {
1451       /* ENDIF is useless; don't bother emitting it. */
1452       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1453       return;
1454    }
1455
1456    if (intel->gen < 6) {
1457       brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1458       brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1459       brw_set_src1(p, insn, brw_imm_d(0x0));
1460    } else if (intel->gen == 6) {
1461       brw_set_dest(p, insn, brw_imm_w(0));
1462       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1463       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1464    } else {
1465       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1466       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1467       brw_set_src1(p, insn, brw_imm_ud(0));
1468    }
1469
1470    insn->header.compression_control = BRW_COMPRESSION_NONE;
1471    insn->header.mask_control = BRW_MASK_ENABLE;
1472    insn->header.thread_control = BRW_THREAD_SWITCH;
1473
1474    /* Also pop item off the stack in the endif instruction: */
1475    if (intel->gen < 6) {
1476       insn->bits3.if_else.jump_count = 0;
1477       insn->bits3.if_else.pop_count = 1;
1478       insn->bits3.if_else.pad0 = 0;
1479    } else if (intel->gen == 6) {
1480       insn->bits1.branch_gen6.jump_count = 2;
1481    } else {
1482       insn->bits3.break_cont.jip = 2;
1483    }
1484    patch_IF_ELSE(p, if_inst, else_inst, insn);
1485 }
1486
1487 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1488 {
1489    struct intel_context *intel = &p->brw->intel;
1490    struct brw_instruction *insn;
1491
1492    insn = next_insn(p, BRW_OPCODE_BREAK);
1493    if (intel->gen >= 6) {
1494       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1495       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1496       brw_set_src1(p, insn, brw_imm_d(0x0));
1497    } else {
1498       brw_set_dest(p, insn, brw_ip_reg());
1499       brw_set_src0(p, insn, brw_ip_reg());
1500       brw_set_src1(p, insn, brw_imm_d(0x0));
1501       insn->bits3.if_else.pad0 = 0;
1502       insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1503    }
1504    insn->header.compression_control = BRW_COMPRESSION_NONE;
1505    insn->header.execution_size = BRW_EXECUTE_8;
1506
1507    return insn;
1508 }
1509
1510 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1511 {
1512    struct brw_instruction *insn;
1513
1514    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1515    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1516    brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1517    brw_set_dest(p, insn, brw_ip_reg());
1518    brw_set_src0(p, insn, brw_ip_reg());
1519    brw_set_src1(p, insn, brw_imm_d(0x0));
1520
1521    insn->header.compression_control = BRW_COMPRESSION_NONE;
1522    insn->header.execution_size = BRW_EXECUTE_8;
1523    return insn;
1524 }
1525
1526 struct brw_instruction *brw_CONT(struct brw_compile *p)
1527 {
1528    struct brw_instruction *insn;
1529    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1530    brw_set_dest(p, insn, brw_ip_reg());
1531    brw_set_src0(p, insn, brw_ip_reg());
1532    brw_set_src1(p, insn, brw_imm_d(0x0));
1533    insn->header.compression_control = BRW_COMPRESSION_NONE;
1534    insn->header.execution_size = BRW_EXECUTE_8;
1535    /* insn->header.mask_control = BRW_MASK_DISABLE; */
1536    insn->bits3.if_else.pad0 = 0;
1537    insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1538    return insn;
1539 }
1540
1541 struct brw_instruction *gen6_HALT(struct brw_compile *p)
1542 {
1543    struct brw_instruction *insn;
1544
1545    insn = next_insn(p, BRW_OPCODE_HALT);
1546    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547    brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548    brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1549
1550    if (p->compressed) {
1551       insn->header.execution_size = BRW_EXECUTE_16;
1552    } else {
1553       insn->header.compression_control = BRW_COMPRESSION_NONE;
1554       insn->header.execution_size = BRW_EXECUTE_8;
1555    }
1556    return insn;
1557 }
1558
1559 /* DO/WHILE loop:
1560  *
1561  * The DO/WHILE is just an unterminated loop -- break or continue are
1562  * used for control within the loop.  We have a few ways they can be
1563  * done.
1564  *
1565  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1566  * jip and no DO instruction.
1567  *
1568  * For non-uniform control flow pre-gen6, there's a DO instruction to
1569  * push the mask, and a WHILE to jump back, and BREAK to get out and
1570  * pop the mask.
1571  *
1572  * For gen6, there's no more mask stack, so no need for DO.  WHILE
1573  * just points back to the first instruction of the loop.
1574  */
1575 struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1576 {
1577    struct intel_context *intel = &p->brw->intel;
1578
1579    if (intel->gen >= 6 || p->single_program_flow) {
1580       push_loop_stack(p, &p->store[p->nr_insn]);
1581       return &p->store[p->nr_insn];
1582    } else {
1583       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1584
1585       push_loop_stack(p, insn);
1586
1587       /* Override the defaults for this instruction:
1588        */
1589       brw_set_dest(p, insn, brw_null_reg());
1590       brw_set_src0(p, insn, brw_null_reg());
1591       brw_set_src1(p, insn, brw_null_reg());
1592
1593       insn->header.compression_control = BRW_COMPRESSION_NONE;
1594       insn->header.execution_size = execute_size;
1595       insn->header.predicate_control = BRW_PREDICATE_NONE;
1596       /* insn->header.mask_control = BRW_MASK_ENABLE; */
1597       /* insn->header.mask_control = BRW_MASK_DISABLE; */
1598
1599       return insn;
1600    }
1601 }
1602
1603 /**
1604  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1605  * instruction here.
1606  *
1607  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1608  * nesting, since it can always just point to the end of the block/current loop.
1609  */
1610 static void
1611 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1612 {
1613    struct intel_context *intel = &p->brw->intel;
1614    struct brw_instruction *do_inst = get_inner_do_insn(p);
1615    struct brw_instruction *inst;
1616    int br = (intel->gen == 5) ? 2 : 1;
1617
1618    for (inst = while_inst - 1; inst != do_inst; inst--) {
1619       /* If the jump count is != 0, that means that this instruction has already
1620        * been patched because it's part of a loop inside of the one we're
1621        * patching.
1622        */
1623       if (inst->header.opcode == BRW_OPCODE_BREAK &&
1624           inst->bits3.if_else.jump_count == 0) {
1625          inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1626       } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1627                  inst->bits3.if_else.jump_count == 0) {
1628          inst->bits3.if_else.jump_count = br * (while_inst - inst);
1629       }
1630    }
1631 }
1632
1633 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1634 {
1635    struct intel_context *intel = &p->brw->intel;
1636    struct brw_instruction *insn, *do_insn;
1637    unsigned br = 1;
1638
1639    if (intel->gen >= 5)
1640       br = 2;
1641
1642    if (intel->gen >= 7) {
1643       insn = next_insn(p, BRW_OPCODE_WHILE);
1644       do_insn = get_inner_do_insn(p);
1645
1646       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648       brw_set_src1(p, insn, brw_imm_ud(0));
1649       insn->bits3.break_cont.jip = br * (do_insn - insn);
1650
1651       insn->header.execution_size = BRW_EXECUTE_8;
1652    } else if (intel->gen == 6) {
1653       insn = next_insn(p, BRW_OPCODE_WHILE);
1654       do_insn = get_inner_do_insn(p);
1655
1656       brw_set_dest(p, insn, brw_imm_w(0));
1657       insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1658       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1659       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1660
1661       insn->header.execution_size = BRW_EXECUTE_8;
1662    } else {
1663       if (p->single_program_flow) {
1664          insn = next_insn(p, BRW_OPCODE_ADD);
1665          do_insn = get_inner_do_insn(p);
1666
1667          brw_set_dest(p, insn, brw_ip_reg());
1668          brw_set_src0(p, insn, brw_ip_reg());
1669          brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1670          insn->header.execution_size = BRW_EXECUTE_1;
1671       } else {
1672          insn = next_insn(p, BRW_OPCODE_WHILE);
1673          do_insn = get_inner_do_insn(p);
1674
1675          assert(do_insn->header.opcode == BRW_OPCODE_DO);
1676
1677          brw_set_dest(p, insn, brw_ip_reg());
1678          brw_set_src0(p, insn, brw_ip_reg());
1679          brw_set_src1(p, insn, brw_imm_d(0));
1680
1681          insn->header.execution_size = do_insn->header.execution_size;
1682          insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1683          insn->bits3.if_else.pop_count = 0;
1684          insn->bits3.if_else.pad0 = 0;
1685
1686          brw_patch_break_cont(p, insn);
1687       }
1688    }
1689    insn->header.compression_control = BRW_COMPRESSION_NONE;
1690    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1691
1692    p->loop_stack_depth--;
1693
1694    return insn;
1695 }
1696
1697
1698 /* FORWARD JUMPS:
1699  */
1700 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1701 {
1702    struct intel_context *intel = &p->brw->intel;
1703    struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1704    unsigned jmpi = 1;
1705
1706    if (intel->gen >= 5)
1707       jmpi = 2;
1708
1709    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1710    assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1711
1712    jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1713 }
1714
1715
1716
1717 /* To integrate with the above, it makes sense that the comparison
1718  * instruction should populate the flag register.  It might be simpler
1719  * just to use the flag reg for most WM tasks?
1720  */
1721 void brw_CMP(struct brw_compile *p,
1722              struct brw_reg dest,
1723              unsigned conditional,
1724              struct brw_reg src0,
1725              struct brw_reg src1)
1726 {
1727    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1728
1729    insn->header.destreg__conditionalmod = conditional;
1730    brw_set_dest(p, insn, dest);
1731    brw_set_src0(p, insn, src0);
1732    brw_set_src1(p, insn, src1);
1733
1734 /*    guess_execution_size(insn, src0); */
1735
1736
1737    /* Make it so that future instructions will use the computed flag
1738     * value until brw_set_predicate_control_flag_value() is called
1739     * again.
1740     */
1741    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1742        dest.nr == 0) {
1743       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1744       p->flag_value = 0xff;
1745    }
1746 }
1747
1748 /* Issue 'wait' instruction for n1, host could program MMIO
1749    to wake up thread. */
1750 void brw_WAIT (struct brw_compile *p)
1751 {
1752    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1753    struct brw_reg src = brw_notification_1_reg();
1754
1755    brw_set_dest(p, insn, src);
1756    brw_set_src0(p, insn, src);
1757    brw_set_src1(p, insn, brw_null_reg());
1758    insn->header.execution_size = 0; /* must */
1759    insn->header.predicate_control = 0;
1760    insn->header.compression_control = 0;
1761 }
1762
1763
1764 /***********************************************************************
1765  * Helpers for the various SEND message types:
1766  */
1767
1768 /** Extended math function, float[8].
1769  */
1770 void brw_math( struct brw_compile *p,
1771                struct brw_reg dest,
1772                unsigned function,
1773                unsigned msg_reg_nr,
1774                struct brw_reg src,
1775                unsigned data_type,
1776                unsigned precision )
1777 {
1778    struct intel_context *intel = &p->brw->intel;
1779
1780    if (intel->gen >= 6) {
1781       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1782
1783       assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1784       assert(src.file == BRW_GENERAL_REGISTER_FILE);
1785
1786       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1787       if (intel->gen == 6)
1788          assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1789
1790       /* Source modifiers are ignored for extended math instructions on Gen6. */
1791       if (intel->gen == 6) {
1792          assert(!src.negate);
1793          assert(!src.abs);
1794       }
1795
1796       if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1797           function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1798           function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1799          assert(src.type != BRW_REGISTER_TYPE_F);
1800       } else {
1801          assert(src.type == BRW_REGISTER_TYPE_F);
1802       }
1803
1804       /* Math is the same ISA format as other opcodes, except that CondModifier
1805        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1806        */
1807       insn->header.destreg__conditionalmod = function;
1808
1809       brw_set_dest(p, insn, dest);
1810       brw_set_src0(p, insn, src);
1811       brw_set_src1(p, insn, brw_null_reg());
1812    } else {
1813       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1814
1815       /* Example code doesn't set predicate_control for send
1816        * instructions.
1817        */
1818       insn->header.predicate_control = 0;
1819       insn->header.destreg__conditionalmod = msg_reg_nr;
1820
1821       brw_set_dest(p, insn, dest);
1822       brw_set_src0(p, insn, src);
1823       brw_set_math_message(p,
1824                            insn,
1825                            function,
1826                            src.type == BRW_REGISTER_TYPE_D,
1827                            precision,
1828                            data_type);
1829    }
1830 }
1831
1832 /** Extended math function, float[8].
1833  */
1834 void brw_math2(struct brw_compile *p,
1835                struct brw_reg dest,
1836                unsigned function,
1837                struct brw_reg src0,
1838                struct brw_reg src1)
1839 {
1840    struct intel_context *intel = &p->brw->intel;
1841    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1842
1843    assert(intel->gen >= 6);
1844    (void) intel;
1845
1846
1847    assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1848    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1849    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1850
1851    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1852    if (intel->gen == 6) {
1853       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1854       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1855    }
1856
1857    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1858        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1859        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1860       assert(src0.type != BRW_REGISTER_TYPE_F);
1861       assert(src1.type != BRW_REGISTER_TYPE_F);
1862    } else {
1863       assert(src0.type == BRW_REGISTER_TYPE_F);
1864       assert(src1.type == BRW_REGISTER_TYPE_F);
1865    }
1866
1867    /* Source modifiers are ignored for extended math instructions on Gen6. */
1868    if (intel->gen == 6) {
1869       assert(!src0.negate);
1870       assert(!src0.abs);
1871       assert(!src1.negate);
1872       assert(!src1.abs);
1873    }
1874
1875    /* Math is the same ISA format as other opcodes, except that CondModifier
1876     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1877     */
1878    insn->header.destreg__conditionalmod = function;
1879
1880    brw_set_dest(p, insn, dest);
1881    brw_set_src0(p, insn, src0);
1882    brw_set_src1(p, insn, src1);
1883 }
1884
1885
1886 /**
1887  * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1888  * using a constant offset per channel.
1889  *
1890  * The offset must be aligned to oword size (16 bytes).  Used for
1891  * register spilling.
1892  */
1893 void brw_oword_block_write_scratch(struct brw_compile *p,
1894                                    struct brw_reg mrf,
1895                                    int num_regs,
1896                                    unsigned offset)
1897 {
1898    struct intel_context *intel = &p->brw->intel;
1899    uint32_t msg_control, msg_type;
1900    int mlen;
1901
1902    if (intel->gen >= 6)
1903       offset /= 16;
1904
1905    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1906
1907    if (num_regs == 1) {
1908       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1909       mlen = 2;
1910    } else {
1911       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1912       mlen = 3;
1913    }
1914
1915    /* Set up the message header.  This is g0, with g0.2 filled with
1916     * the offset.  We don't want to leave our offset around in g0 or
1917     * it'll screw up texture samples, so set it up inside the message
1918     * reg.
1919     */
1920    {
1921       brw_push_insn_state(p);
1922       brw_set_mask_control(p, BRW_MASK_DISABLE);
1923       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1924
1925       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1926
1927       /* set message header global offset field (reg 0, element 2) */
1928       brw_MOV(p,
1929               retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1930                                   mrf.nr,
1931                                   2), BRW_REGISTER_TYPE_UD),
1932               brw_imm_ud(offset));
1933
1934       brw_pop_insn_state(p);
1935    }
1936
1937    {
1938       struct brw_reg dest;
1939       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1940       int send_commit_msg;
1941       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1942                                          BRW_REGISTER_TYPE_UW);
1943
1944       if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1945          insn->header.compression_control = BRW_COMPRESSION_NONE;
1946          src_header = vec16(src_header);
1947       }
1948       assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1949       insn->header.destreg__conditionalmod = mrf.nr;
1950
1951       /* Until gen6, writes followed by reads from the same location
1952        * are not guaranteed to be ordered unless write_commit is set.
1953        * If set, then a no-op write is issued to the destination
1954        * register to set a dependency, and a read from the destination
1955        * can be used to ensure the ordering.
1956        *
1957        * For gen6, only writes between different threads need ordering
1958        * protection.  Our use of DP writes is all about register
1959        * spilling within a thread.
1960        */
1961       if (intel->gen >= 6) {
1962          dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1963          send_commit_msg = 0;
1964       } else {
1965          dest = src_header;
1966          send_commit_msg = 1;
1967       }
1968
1969       brw_set_dest(p, insn, dest);
1970       if (intel->gen >= 6) {
1971          brw_set_src0(p, insn, mrf);
1972       } else {
1973          brw_set_src0(p, insn, brw_null_reg());
1974       }
1975
1976       if (intel->gen >= 6)
1977          msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1978       else
1979          msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1980
1981       brw_set_dp_write_message(p,
1982                                insn,
1983                                255, /* binding table index (255=stateless) */
1984                                msg_control,
1985                                msg_type,
1986                                mlen,
1987                                true, /* header_present */
1988                                0, /* not a render target */
1989                                send_commit_msg, /* response_length */
1990                                0, /* eot */
1991                                send_commit_msg);
1992    }
1993 }
1994
1995
1996 /**
1997  * Read a block of owords (half a GRF each) from the scratch buffer
1998  * using a constant index per channel.
1999  *
2000  * Offset must be aligned to oword size (16 bytes).  Used for register
2001  * spilling.
2002  */
2003 void
2004 brw_oword_block_read_scratch(struct brw_compile *p,
2005                              struct brw_reg dest,
2006                              struct brw_reg mrf,
2007                              int num_regs,
2008                              unsigned offset)
2009 {
2010    struct intel_context *intel = &p->brw->intel;
2011    uint32_t msg_control;
2012    int rlen;
2013
2014    if (intel->gen >= 6)
2015       offset /= 16;
2016
2017    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018    dest = retype(dest, BRW_REGISTER_TYPE_UW);
2019
2020    if (num_regs == 1) {
2021       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2022       rlen = 1;
2023    } else {
2024       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2025       rlen = 2;
2026    }
2027
2028    {
2029       brw_push_insn_state(p);
2030       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2031       brw_set_mask_control(p, BRW_MASK_DISABLE);
2032
2033       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2034
2035       /* set message header global offset field (reg 0, element 2) */
2036       brw_MOV(p,
2037               retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2038                                   mrf.nr,
2039                                   2), BRW_REGISTER_TYPE_UD),
2040               brw_imm_ud(offset));
2041
2042       brw_pop_insn_state(p);
2043    }
2044
2045    {
2046       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2047
2048       assert(insn->header.predicate_control == 0);
2049       insn->header.compression_control = BRW_COMPRESSION_NONE;
2050       insn->header.destreg__conditionalmod = mrf.nr;
2051
2052       brw_set_dest(p, insn, dest);      /* UW? */
2053       if (intel->gen >= 6) {
2054          brw_set_src0(p, insn, mrf);
2055       } else {
2056          brw_set_src0(p, insn, brw_null_reg());
2057       }
2058
2059       brw_set_dp_read_message(p,
2060                               insn,
2061                               255, /* binding table index (255=stateless) */
2062                               msg_control,
2063                               BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2064                               BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2065                               1, /* msg_length */
2066                               true, /* header_present */
2067                               rlen);
2068    }
2069 }
2070
2071 /**
2072  * Read a float[4] vector from the data port Data Cache (const buffer).
2073  * Location (in buffer) should be a multiple of 16.
2074  * Used for fetching shader constants.
2075  */
2076 void brw_oword_block_read(struct brw_compile *p,
2077                           struct brw_reg dest,
2078                           struct brw_reg mrf,
2079                           uint32_t offset,
2080                           uint32_t bind_table_index)
2081 {
2082    struct intel_context *intel = &p->brw->intel;
2083
2084    /* On newer hardware, offset is in units of owords. */
2085    if (intel->gen >= 6)
2086       offset /= 16;
2087
2088    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2089
2090    brw_push_insn_state(p);
2091    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2092    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093    brw_set_mask_control(p, BRW_MASK_DISABLE);
2094
2095    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2096
2097    /* set message header global offset field (reg 0, element 2) */
2098    brw_MOV(p,
2099            retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2100                                mrf.nr,
2101                                2), BRW_REGISTER_TYPE_UD),
2102            brw_imm_ud(offset));
2103
2104    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105    insn->header.destreg__conditionalmod = mrf.nr;
2106
2107    /* cast dest to a uword[8] vector */
2108    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2109
2110    brw_set_dest(p, insn, dest);
2111    if (intel->gen >= 6) {
2112       brw_set_src0(p, insn, mrf);
2113    } else {
2114       brw_set_src0(p, insn, brw_null_reg());
2115    }
2116
2117    brw_set_dp_read_message(p,
2118                            insn,
2119                            bind_table_index,
2120                            BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2121                            BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2122                            BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2123                            1, /* msg_length */
2124                            true, /* header_present */
2125                            1); /* response_length (1 reg, 2 owords!) */
2126
2127    brw_pop_insn_state(p);
2128 }
2129
2130
2131 void brw_fb_WRITE(struct brw_compile *p,
2132                   int dispatch_width,
2133                   unsigned msg_reg_nr,
2134                   struct brw_reg src0,
2135                   unsigned msg_control,
2136                   unsigned binding_table_index,
2137                   unsigned msg_length,
2138                   unsigned response_length,
2139                   bool eot,
2140                   bool header_present)
2141 {
2142    struct intel_context *intel = &p->brw->intel;
2143    struct brw_instruction *insn;
2144    unsigned msg_type;
2145    struct brw_reg dest;
2146
2147    if (dispatch_width == 16)
2148       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2149    else
2150       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2151
2152    if (intel->gen >= 6) {
2153       insn = next_insn(p, BRW_OPCODE_SENDC);
2154    } else {
2155       insn = next_insn(p, BRW_OPCODE_SEND);
2156    }
2157    /* The execution mask is ignored for render target writes. */
2158    insn->header.predicate_control = 0;
2159    insn->header.compression_control = BRW_COMPRESSION_NONE;
2160
2161    if (intel->gen >= 6) {
2162       /* headerless version, just submit color payload */
2163       src0 = brw_message_reg(msg_reg_nr);
2164
2165       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2166    } else {
2167       insn->header.destreg__conditionalmod = msg_reg_nr;
2168
2169       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2170    }
2171
2172    brw_set_dest(p, insn, dest);
2173    brw_set_src0(p, insn, src0);
2174    brw_set_dp_write_message(p,
2175                             insn,
2176                             binding_table_index,
2177                             msg_control,
2178                             msg_type,
2179                             msg_length,
2180                             header_present,
2181                             eot, /* last render target write */
2182                             response_length,
2183                             eot,
2184                             0 /* send_commit_msg */);
2185 }
2186
2187
2188 /**
2189  * Texture sample instruction.
2190  * Note: the msg_type plus msg_length values determine exactly what kind
2191  * of sampling operation is performed.  See volume 4, page 161 of docs.
2192  */
2193 void brw_SAMPLE(struct brw_compile *p,
2194                 struct brw_reg dest,
2195                 unsigned msg_reg_nr,
2196                 struct brw_reg src0,
2197                 unsigned binding_table_index,
2198                 unsigned sampler,
2199                 unsigned writemask,
2200                 unsigned msg_type,
2201                 unsigned response_length,
2202                 unsigned msg_length,
2203                 unsigned header_present,
2204                 unsigned simd_mode,
2205                 unsigned return_format)
2206 {
2207    struct intel_context *intel = &p->brw->intel;
2208    bool need_stall = 0;
2209
2210    if (writemask == 0) {
2211       /*printf("%s: zero writemask??\n", __FUNCTION__); */
2212       return;
2213    }
2214
2215    /* Hardware doesn't do destination dependency checking on send
2216     * instructions properly.  Add a workaround which generates the
2217     * dependency by other means.  In practice it seems like this bug
2218     * only crops up for texture samples, and only where registers are
2219     * written by the send and then written again later without being
2220     * read in between.  Luckily for us, we already track that
2221     * information and use it to modify the writemask for the
2222     * instruction, so that is a guide for whether a workaround is
2223     * needed.
2224     */
2225    if (writemask != BRW_WRITEMASK_XYZW) {
2226       unsigned dst_offset = 0;
2227       unsigned i, newmask = 0, len = 0;
2228
2229       for (i = 0; i < 4; i++) {
2230          if (writemask & (1<<i))
2231             break;
2232          dst_offset += 2;
2233       }
2234       for (; i < 4; i++) {
2235          if (!(writemask & (1<<i)))
2236             break;
2237          newmask |= 1<<i;
2238          len++;
2239       }
2240
2241       if (newmask != writemask) {
2242          need_stall = 1;
2243          /* printf("need stall %x %x\n", newmask , writemask); */
2244       }
2245       else {
2246          bool dispatch_16 = false;
2247
2248          struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2249
2250          guess_execution_size(p, p->current, dest);
2251          if (p->current->header.execution_size == BRW_EXECUTE_16)
2252             dispatch_16 = true;
2253
2254          newmask = ~newmask & BRW_WRITEMASK_XYZW;
2255
2256          brw_push_insn_state(p);
2257
2258          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2259          brw_set_mask_control(p, BRW_MASK_DISABLE);
2260
2261          brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2262                  retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2263          brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2264
2265          brw_pop_insn_state(p);
2266
2267          src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2268          dest = offset(dest, dst_offset);
2269
2270          /* For 16-wide dispatch, masked channels are skipped in the
2271           * response.  For 8-wide, masked channels still take up slots,
2272           * and are just not written to.
2273           */
2274          if (dispatch_16)
2275             response_length = len * 2;
2276       }
2277    }
2278
2279    {
2280       struct brw_instruction *insn;
2281
2282       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2283
2284       insn = next_insn(p, BRW_OPCODE_SEND);
2285       insn->header.predicate_control = 0; /* XXX */
2286       insn->header.compression_control = BRW_COMPRESSION_NONE;
2287       if (intel->gen < 6)
2288           insn->header.destreg__conditionalmod = msg_reg_nr;
2289
2290       brw_set_dest(p, insn, dest);
2291       brw_set_src0(p, insn, src0);
2292       brw_set_sampler_message(p, insn,
2293                               binding_table_index,
2294                               sampler,
2295                               msg_type,
2296                               response_length,
2297                               msg_length,
2298                               header_present,
2299                               simd_mode,
2300                               return_format);
2301    }
2302
2303    if (need_stall) {
2304       struct brw_reg reg = vec8(offset(dest, response_length-1));
2305
2306       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2307        */
2308       brw_push_insn_state(p);
2309       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2310       brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2311               retype(reg, BRW_REGISTER_TYPE_UD));
2312       brw_pop_insn_state(p);
2313    }
2314
2315 }
2316
2317 /* All these variables are pretty confusing - we might be better off
2318  * using bitmasks and macros for this, in the old style.  Or perhaps
2319  * just having the caller instantiate the fields in dword3 itself.
2320  */
2321 void brw_urb_WRITE(struct brw_compile *p,
2322                    struct brw_reg dest,
2323                    unsigned msg_reg_nr,
2324                    struct brw_reg src0,
2325                    bool allocate,
2326                    bool used,
2327                    unsigned msg_length,
2328                    unsigned response_length,
2329                    bool eot,
2330                    bool writes_complete,
2331                    unsigned offset,
2332                    unsigned swizzle)
2333 {
2334    struct intel_context *intel = &p->brw->intel;
2335    struct brw_instruction *insn;
2336
2337    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2338
2339    if (intel->gen == 7) {
2340       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2341       brw_push_insn_state(p);
2342       brw_set_access_mode(p, BRW_ALIGN_1);
2343       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2344                        BRW_REGISTER_TYPE_UD),
2345                 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2346                 brw_imm_ud(0xff00));
2347       brw_pop_insn_state(p);
2348    }
2349
2350    insn = next_insn(p, BRW_OPCODE_SEND);
2351
2352    assert(msg_length < BRW_MAX_MRF);
2353
2354    brw_set_dest(p, insn, dest);
2355    brw_set_src0(p, insn, src0);
2356    brw_set_src1(p, insn, brw_imm_d(0));
2357
2358    if (intel->gen < 6)
2359       insn->header.destreg__conditionalmod = msg_reg_nr;
2360
2361    brw_set_urb_message(p,
2362                        insn,
2363                        allocate,
2364                        used,
2365                        msg_length,
2366                        response_length,
2367                        eot,
2368                        writes_complete,
2369                        offset,
2370                        swizzle);
2371 }
2372
2373 static int
2374 next_ip(struct brw_compile *p, int ip)
2375 {
2376    struct brw_instruction *insn = (void *)p->store + ip;
2377
2378    if (insn->header.cmpt_control)
2379       return ip + 8;
2380    else
2381       return ip + 16;
2382 }
2383
2384 static int
2385 brw_find_next_block_end(struct brw_compile *p, int start)
2386 {
2387    int ip;
2388    void *store = p->store;
2389
2390    for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2391       struct brw_instruction *insn = store + ip;
2392
2393       switch (insn->header.opcode) {
2394       case BRW_OPCODE_ENDIF:
2395       case BRW_OPCODE_ELSE:
2396       case BRW_OPCODE_WHILE:
2397       case BRW_OPCODE_HALT:
2398          return ip;
2399       }
2400    }
2401
2402    return 0;
2403 }
2404
2405 /* There is no DO instruction on gen6, so to find the end of the loop
2406  * we have to see if the loop is jumping back before our start
2407  * instruction.
2408  */
2409 static int
2410 brw_find_loop_end(struct brw_compile *p, int start)
2411 {
2412    struct intel_context *intel = &p->brw->intel;
2413    int ip;
2414    int scale = 8;
2415    void *store = p->store;
2416
2417    /* Always start after the instruction (such as a WHILE) we're trying to fix
2418     * up.
2419     */
2420    for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2421       struct brw_instruction *insn = store + ip;
2422
2423       if (insn->header.opcode == BRW_OPCODE_WHILE) {
2424          int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2425                                    : insn->bits3.break_cont.jip;
2426          if (ip + jip * scale <= start)
2427             return ip;
2428       }
2429    }
2430    assert(!"not reached");
2431    return start;
2432 }
2433
2434 /* After program generation, go back and update the UIP and JIP of
2435  * BREAK, CONT, and HALT instructions to their correct locations.
2436  */
2437 void
2438 brw_set_uip_jip(struct brw_compile *p)
2439 {
2440    struct intel_context *intel = &p->brw->intel;
2441    int ip;
2442    int scale = 8;
2443    void *store = p->store;
2444
2445    if (intel->gen < 6)
2446       return;
2447
2448    for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2449       struct brw_instruction *insn = store + ip;
2450
2451       if (insn->header.cmpt_control) {
2452          /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2453          assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2454                 insn->header.opcode != BRW_OPCODE_CONTINUE &&
2455                 insn->header.opcode != BRW_OPCODE_HALT);
2456          continue;
2457       }
2458
2459       int block_end_ip = brw_find_next_block_end(p, ip);
2460       switch (insn->header.opcode) {
2461       case BRW_OPCODE_BREAK:
2462          assert(block_end_ip != 0);
2463          insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2464          /* Gen7 UIP points to WHILE; Gen6 points just after it */
2465          insn->bits3.break_cont.uip =
2466             (brw_find_loop_end(p, ip) - ip +
2467              (intel->gen == 6 ? 16 : 0)) / scale;
2468          break;
2469       case BRW_OPCODE_CONTINUE:
2470          assert(block_end_ip != 0);
2471          insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2472          insn->bits3.break_cont.uip =
2473             (brw_find_loop_end(p, ip) - ip) / scale;
2474
2475          assert(insn->bits3.break_cont.uip != 0);
2476          assert(insn->bits3.break_cont.jip != 0);
2477          break;
2478
2479       case BRW_OPCODE_ENDIF:
2480          if (block_end_ip == 0)
2481             insn->bits3.break_cont.jip = 2;
2482          else
2483             insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2484          break;
2485
2486       case BRW_OPCODE_HALT:
2487          /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2488           *
2489           *    "In case of the halt instruction not inside any conditional
2490           *     code block, the value of <JIP> and <UIP> should be the
2491           *     same. In case of the halt instruction inside conditional code
2492           *     block, the <UIP> should be the end of the program, and the
2493           *     <JIP> should be end of the most inner conditional code block."
2494           *
2495           * The uip will have already been set by whoever set up the
2496           * instruction.
2497           */
2498          if (block_end_ip == 0) {
2499             insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2500          } else {
2501             insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2502          }
2503          assert(insn->bits3.break_cont.uip != 0);
2504          assert(insn->bits3.break_cont.jip != 0);
2505          break;
2506       }
2507    }
2508 }
2509
2510 void brw_ff_sync(struct brw_compile *p,
2511                    struct brw_reg dest,
2512                    unsigned msg_reg_nr,
2513                    struct brw_reg src0,
2514                    bool allocate,
2515                    unsigned response_length,
2516                    bool eot)
2517 {
2518    struct intel_context *intel = &p->brw->intel;
2519    struct brw_instruction *insn;
2520
2521    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2522
2523    insn = next_insn(p, BRW_OPCODE_SEND);
2524    brw_set_dest(p, insn, dest);
2525    brw_set_src0(p, insn, src0);
2526    brw_set_src1(p, insn, brw_imm_d(0));
2527
2528    if (intel->gen < 6)
2529       insn->header.destreg__conditionalmod = msg_reg_nr;
2530
2531    brw_set_ff_sync_message(p,
2532                            insn,
2533                            allocate,
2534                            response_length,
2535                            eot);
2536 }
2537
2538 /**
2539  * Emit the SEND instruction necessary to generate stream output data on Gen6
2540  * (for transform feedback).
2541  *
2542  * If send_commit_msg is true, this is the last piece of stream output data
2543  * from this thread, so send the data as a committed write.  According to the
2544  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2545  *
2546  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547  *   writes are complete by sending the final write as a committed write."
2548  */
2549 void
2550 brw_svb_write(struct brw_compile *p,
2551               struct brw_reg dest,
2552               unsigned msg_reg_nr,
2553               struct brw_reg src0,
2554               unsigned binding_table_index,
2555               bool   send_commit_msg)
2556 {
2557    struct brw_instruction *insn;
2558
2559    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560
2561    insn = next_insn(p, BRW_OPCODE_SEND);
2562    brw_set_dest(p, insn, dest);
2563    brw_set_src0(p, insn, src0);
2564    brw_set_src1(p, insn, brw_imm_d(0));
2565    brw_set_dp_write_message(p, insn,
2566                             binding_table_index,
2567                             0, /* msg_control: ignored */
2568                             GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2569                             1, /* msg_length */
2570                             true, /* header_present */
2571                             0, /* last_render_target: ignored */
2572                             send_commit_msg, /* response_length */
2573                             0, /* end_of_thread */
2574                             send_commit_msg); /* send_commit_msg */
2575 }
2576
2577 /**
2578  * This instruction is generated as a single-channel align1 instruction by
2579  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2580  *
2581  * We can't use the typed atomic op in the FS because that has the execution
2582  * mask ANDed with the pixel mask, but we just want to write the one dword for
2583  * all the pixels.
2584  *
2585  * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2586  * one u32.  So we use the same untyped atomic write message as the pixel
2587  * shader.
2588  *
2589  * The untyped atomic operation requires a BUFFER surface type with RAW
2590  * format, and is only accessible through the legacy DATA_CACHE dataport
2591  * messages.
2592  */
2593 void brw_shader_time_add(struct brw_compile *p,
2594                          int base_mrf,
2595                          uint32_t surf_index)
2596 {
2597    struct intel_context *intel = &p->brw->intel;
2598    assert(intel->gen >= 7);
2599
2600    brw_push_insn_state(p);
2601    brw_set_access_mode(p, BRW_ALIGN_1);
2602    brw_set_mask_control(p, BRW_MASK_DISABLE);
2603    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2604    brw_pop_insn_state(p);
2605
2606    /* We use brw_vec1_reg and unmasked because we want to increment the given
2607     * offset only once.
2608     */
2609    brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2610                                       BRW_ARF_NULL, 0));
2611    brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2612                                       base_mrf, 0));
2613
2614    bool header_present = false;
2615    bool eot = false;
2616    uint32_t mlen = 2; /* offset, value */
2617    uint32_t rlen = 0;
2618    brw_set_message_descriptor(p, send,
2619                               GEN7_SFID_DATAPORT_DATA_CACHE,
2620                               mlen, rlen, header_present, eot);
2621
2622    send->bits3.ud |= 6 << 14; /* untyped atomic op */
2623    send->bits3.ud |= 0 << 13; /* no return data */
2624    send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2625    send->bits3.ud |= BRW_AOP_ADD << 8;
2626    send->bits3.ud |= surf_index << 0;
2627 }