src/mesa/drivers/dri/i965/brw_optimize.c

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include "main/macros.h"
  29 #include "program/program.h"
  30 #include "program/prog_print.h"
  31 #include "brw_context.h"
  32 #include "brw_defines.h"
  33 #include "brw_eu.h"
  34
  35 const struct brw_instruction_info brw_opcodes[128] = {
  36     [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  37     [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  38     [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  39     [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  40     [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  41     [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  42     [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
  43     [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
  44
  45     [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  46     [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  47     [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  48     [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  49     [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
  50     [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
  51     [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
  52     [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
  53     [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
  54     [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
  55     [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
  56     [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
  57
  58     [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  59     [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  60     [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  61     [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  62     [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  63     [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  64     [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  65     [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
  66     [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
  67     [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
  68     [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
  69
  70     [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
  71     [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
  72     [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
  73     [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
  74     [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
  75     [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
  76     [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
  77     [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
  78     [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
  79     [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
  80     [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
  81     [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
  82     [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
  83     [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
  84     [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
  85     [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
  86     [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
  87 };
  88
  89 static INLINE
  90 GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
  91 {
  92    return brw_opcodes[inst->header.opcode].is_arith;
  93 }
  94
  95 static const GLuint inst_stride[7] = {
  96     [0] = 0,
  97     [1] = 1,
  98     [2] = 2,
  99     [3] = 4,
 100     [4] = 8,
 101     [5] = 16,
 102     [6] = 32
 103 };
 104
 105 static const GLuint inst_type_size[8] = {
 106     [BRW_REGISTER_TYPE_UD] = 4,
 107     [BRW_REGISTER_TYPE_D] = 4,
 108     [BRW_REGISTER_TYPE_UW] = 2,
 109     [BRW_REGISTER_TYPE_W] = 2,
 110     [BRW_REGISTER_TYPE_UB] = 1,
 111     [BRW_REGISTER_TYPE_B] = 1,
 112     [BRW_REGISTER_TYPE_F] = 4
 113 };
 114
 115 static INLINE GLboolean
 116 brw_is_grf_written(const struct brw_instruction *inst,
 117                    int reg_index, int size,
 118                    int gen)
 119 {
 120    if (brw_opcodes[inst->header.opcode].ndst == 0)
 121       return GL_FALSE;
 122
 123    if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
 124       if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
 125          return GL_TRUE;
 126
 127    if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
 128       return GL_FALSE;
 129
 130    const int reg_start = reg_index * REG_SIZE;
 131    const int reg_end = reg_start + size;
 132
 133    const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
 134    const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
 135                          + inst->bits1.da1.dest_subreg_nr;
 136    int length, write_end;
 137
 138    /* SEND is specific */
 139    if (inst->header.opcode == BRW_OPCODE_SEND) {
 140       if (gen >= 5)
 141          length = inst->bits3.generic_gen5.response_length*REG_SIZE;
 142       else
 143          length = inst->bits3.generic.response_length*REG_SIZE;
 144    }
 145    else {
 146       length = 1 << inst->header.execution_size;
 147       length *= type_size;
 148       length *= inst->bits1.da1.dest_horiz_stride;
 149    }
 150
 151    /* If the two intervals intersect, we overwrite the register */
 152    write_end = write_start + length;
 153    const int left = MAX2(write_start, reg_start);
 154    const int right = MIN2(write_end, reg_end);
 155
 156    return left < right;
 157 }
 158
 159 static GLboolean
 160 brw_is_mrf_written_alu(const struct brw_instruction *inst,
 161                        int reg_index, int size)
 162 {
 163    if (brw_opcodes[inst->header.opcode].ndst == 0)
 164       return GL_FALSE;
 165
 166    if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
 167       return GL_FALSE;
 168
 169    if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
 170       return GL_TRUE;
 171
 172    const int reg_start = reg_index * REG_SIZE;
 173    const int reg_end = reg_start + size;
 174
 175    const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
 176    const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
 177    const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
 178
 179    /* We use compr4 with a size != 16 elements. Strange, we conservatively
 180     * consider that we are writing the register.
 181     */
 182    if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
 183       return GL_TRUE;
 184
 185    /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
 186    if (is_compr4) {
 187       const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
 188
 189       /* First 8-way register */
 190       const int write_start0 = mrf_index*REG_SIZE
 191                              + inst->bits1.da1.dest_subreg_nr;
 192       const int write_end0 = write_start0 + length;
 193
 194       /* Second 8-way register */
 195       const int write_start1 = (mrf_index+4)*REG_SIZE
 196                              + inst->bits1.da1.dest_subreg_nr;
 197       const int write_end1 = write_start1 + length;
 198
 199       /* If the two intervals intersect, we overwrite the register */
 200       const int left0 = MAX2(write_start0, reg_start);
 201       const int right0 = MIN2(write_end0, reg_end);
 202       const int left1 = MAX2(write_start1, reg_start);
 203       const int right1 = MIN2(write_end1, reg_end);
 204
 205       if (left0 < right0 || left1 < right1)
 206          return GL_TRUE;
 207    }
 208    else {
 209       int length;
 210       length = 1 << inst->header.execution_size;
 211       length *= type_size;
 212       length *= inst->bits1.da1.dest_horiz_stride;
 213
 214       /* If the two intervals intersect, we write into the register */
 215       const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
 216                             + inst->bits1.da1.dest_subreg_nr;
 217       const int write_end = write_start + length;
 218       const int left = MAX2(write_start, reg_start);
 219       const int right = MIN2(write_end, reg_end);
 220
 221       if (left < right)
 222          return GL_TRUE;
 223    }
 224
 225    return GL_FALSE;
 226 }
 227
 228 /* SEND may perform an implicit mov to a mrf register */
 229 static GLboolean brw_is_mrf_written_send(const struct brw_instruction *inst,
 230                                          int reg_index, int size)
 231 {
 232
 233    const int reg_start = reg_index * REG_SIZE;
 234    const int reg_end = reg_start + size;
 235    const int mrf_start = inst->header.destreg__conditionalmod;
 236    const int write_start = mrf_start * REG_SIZE;
 237    const int write_end = write_start + REG_SIZE;
 238    const int left = MAX2(write_start, reg_start);
 239    const int right = MIN2(write_end, reg_end);
 240
 241    if (inst->header.opcode != BRW_OPCODE_SEND ||
 242        inst->bits1.da1.src0_reg_file == 0)
 243       return GL_FALSE;
 244
 245    return left < right;
 246 }
 247
 248 /* Specific path for message register since we need to handle the compr4 case */
 249 static INLINE GLboolean
 250 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
 251 {
 252    return (brw_is_mrf_written_alu(inst, reg_index, size) ||
 253            brw_is_mrf_written_send(inst, reg_index, size));
 254 }
 255
 256 static INLINE GLboolean
 257 brw_is_mrf_read(const struct brw_instruction *inst,
 258                 int reg_index, int size, int gen)
 259 {
 260    if (inst->header.opcode != BRW_OPCODE_SEND)
 261       return GL_FALSE;
 262    if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
 263       return GL_TRUE;
 264
 265    const int reg_start = reg_index*REG_SIZE;
 266    const int reg_end = reg_start + size;
 267
 268    int length, read_start, read_end;
 269    if (gen >= 5)
 270       length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
 271    else
 272       length = inst->bits3.generic.msg_length*REG_SIZE;
 273
 274    /* Look if SEND uses an implicit mov. In that case, we read one less register
 275     * (but we write it)
 276     */
 277    if (inst->bits1.da1.src0_reg_file != 0)
 278       read_start = inst->header.destreg__conditionalmod;
 279    else {
 280       length--;
 281       read_start = inst->header.destreg__conditionalmod + 1;
 282    }
 283    read_start *= REG_SIZE;
 284    read_end = read_start + length;
 285
 286    const int left = MAX2(read_start, reg_start);
 287    const int right = MIN2(read_end, reg_end);
 288
 289    return left < right;
 290 }
 291
 292 static INLINE GLboolean
 293 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
 294 {
 295    int i, j;
 296    if (brw_opcodes[inst->header.opcode].nsrc == 0)
 297       return GL_FALSE;
 298
 299    /* Look at first source. We must take into account register regions to
 300     * monitor carefully the read. Note that we are a bit too conservative here
 301     * since we do not take into account the fact that some complete registers
 302     * may be skipped
 303     */
 304    if (brw_opcodes[inst->header.opcode].nsrc >= 1) {
 305
 306       if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
 307          if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
 308             return GL_TRUE;
 309       if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
 310          return GL_FALSE;
 311
 312       const int reg_start = reg_index*REG_SIZE;
 313       const int reg_end = reg_start + size;
 314
 315       /* See if at least one of this element intersects the interval */
 316       const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
 317       const int elem_num = 1 << inst->header.execution_size;
 318       const int width = 1 << inst->bits2.da1.src0_width;
 319       const int row_num = elem_num >> inst->bits2.da1.src0_width;
 320       const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
 321       const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
 322       int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE
 323                     + inst->bits2.da1.src0_subreg_nr;
 324       for (j = 0; j < row_num; ++j) {
 325          int write_start = row_start;
 326          for (i = 0; i < width; ++i) {
 327             const int write_end = write_start + type_size;
 328             const int left = write_start > reg_start ? write_start : reg_start;
 329             const int right = write_end < reg_end ? write_end : reg_end;
 330             if (left < right)
 331                return GL_TRUE;
 332             write_start += hs;
 333          }
 334          row_start += vs;
 335       }
 336    }
 337
 338    /* Second src register */
 339    if (brw_opcodes[inst->header.opcode].nsrc >= 2) {
 340
 341       if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
 342          if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
 343             return GL_TRUE;
 344       if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
 345          return GL_FALSE;
 346
 347       const int reg_start = reg_index*REG_SIZE;
 348       const int reg_end = reg_start + size;
 349
 350       /* See if at least one of this element intersects the interval */
 351       const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
 352       const int elem_num = 1 << inst->header.execution_size;
 353       const int width = 1 << inst->bits3.da1.src1_width;
 354       const int row_num = elem_num >> inst->bits3.da1.src1_width;
 355       const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
 356       const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
 357       int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE
 358                     + inst->bits3.da1.src1_subreg_nr;
 359       for (j = 0; j < row_num; ++j) {
 360          int write_start = row_start;
 361          for (i = 0; i < width; ++i) {
 362             const int write_end = write_start + type_size;
 363             const int left = write_start > reg_start ? write_start : reg_start;
 364             const int right = write_end < reg_end ? write_end : reg_end;
 365             if (left < right)
 366                return GL_TRUE;
 367             write_start += hs;
 368          }
 369          row_start += vs;
 370       }
 371    }
 372
 373    return GL_FALSE;
 374 }
 375
 376 static INLINE GLboolean
 377 brw_is_control_done(const struct brw_instruction *mov) {
 378    return
 379        mov->header.dependency_control != 0 ||
 380        mov->header.thread_control != 0 ||
 381        mov->header.mask_control != 0 ||
 382        mov->header.saturate != 0 ||
 383        mov->header.debug_control != 0;
 384 }
 385
 386 static INLINE GLboolean
 387 brw_is_predicated(const struct brw_instruction *mov) {
 388    return mov->header.predicate_control != 0;
 389 }
 390
 391 static INLINE GLboolean
 392 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
 393                       int *mrf_index,
 394                       int *grf_index,
 395                       GLboolean *is_compr4)
 396 {
 397    if (brw_is_predicated(mov) ||
 398        brw_is_control_done(mov) ||
 399        mov->header.debug_control != 0)
 400       return GL_FALSE;
 401
 402    if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
 403        mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
 404        mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
 405        mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
 406        mov->bits1.da1.dest_subreg_nr != 0)
 407       return GL_FALSE;
 408
 409    if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
 410        mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
 411        mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
 412        mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
 413        mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
 414        mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
 415        mov->bits2.da1.src0_subreg_nr != 0 ||
 416        mov->bits2.da1.src0_abs != 0 ||
 417        mov->bits2.da1.src0_negate != 0)
 418       return GL_FALSE;
 419
 420    *grf_index = mov->bits2.da1.src0_reg_nr;
 421    *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
 422    *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
 423    return GL_TRUE;
 424 }
 425
 426 static INLINE GLboolean
 427 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
 428 {
 429    /* remark: no problem to predicate a SEL instruction */
 430    if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
 431        brw_is_control_done(inst) == GL_FALSE &&
 432        inst->header.execution_size == 4 &&
 433        inst->header.access_mode == BRW_ALIGN_1 &&
 434        inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
 435        inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
 436        inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
 437        inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
 438        inst->bits1.da1.dest_reg_nr == grf_index &&
 439        inst->bits1.da1.dest_subreg_nr == 0 &&
 440        brw_is_arithmetic_inst(inst))
 441       return GL_TRUE;
 442
 443    return GL_FALSE;
 444 }
 445
 446 static INLINE GLboolean
 447 brw_inst_are_equal(const struct brw_instruction *src0,
 448                    const struct brw_instruction *src1)
 449 {
 450    const GLuint *field0 = (GLuint *) src0;
 451    const GLuint *field1 = (GLuint *) src1;
 452    return field0[0] == field1[0] &&
 453           field0[1] == field1[1] &&
 454           field0[2] == field1[2] &&
 455           field0[3] == field1[3];
 456 }
 457
 458 static INLINE void
 459 brw_inst_copy(struct brw_instruction *dst,
 460               const struct brw_instruction *src)
 461 {
 462    GLuint *field_dst = (GLuint *) dst;
 463    const GLuint *field_src = (GLuint *) src;
 464    field_dst[0] = field_src[0];
 465    field_dst[1] = field_src[1];
 466    field_dst[2] = field_src[2];
 467    field_dst[3] = field_src[3];
 468 }
 469
 470 static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
 471 {
 472    int i, nr_insn = 0, to = 0, from = 0;
 473
 474    for (from = 0; from < p->nr_insn; ++from) {
 475       if (removeInst[from])
 476          continue;
 477       if(to != from)
 478          brw_inst_copy(p->store + to, p->store + from);
 479       to++;
 480    }
 481
 482    for (i = 0; i < p->nr_insn; ++i)
 483       if (removeInst[i] == GL_FALSE)
 484          nr_insn++;
 485    p->nr_insn = nr_insn;
 486 }
 487
 488 /* The gen code emitter generates a lot of duplications in the
 489  * grf-to-mrf moves, for example when texture sampling with the same
 490  * coordinates from multiple textures..  Here, we monitor same mov
 491  * grf-to-mrf instrutions and remove repeated ones where the operands
 492  * and dst ahven't changed in between.
 493  */
 494 void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
 495 {
 496    const int gen = p->brw->intel.gen;
 497    int i, j;
 498
 499    GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
 500    for (i = 0; i < p->nr_insn; i++) {
 501       if (removeInst[i])
 502          continue;
 503
 504       const struct brw_instruction *mov = p->store + i;
 505       int mrf_index, grf_index;
 506       GLboolean is_compr4;
 507
 508       /* Only consider _straight_ grf-to-mrf moves */
 509       if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
 510          continue;
 511
 512       const int mrf_index0 = mrf_index;
 513       const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
 514       const int simd16_size = 2 * REG_SIZE;
 515
 516       for (j = i + 1; j < p->nr_insn; j++) {
 517          const struct brw_instruction *inst = p->store + j;
 518
 519          if (brw_inst_are_equal(mov, inst)) {
 520             removeInst[j] = GL_TRUE;
 521             continue;
 522          }
 523
 524          if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
 525              brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
 526              brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
 527             break;
 528       }
 529    }
 530
 531    brw_remove_inst(p, removeInst);
 532    free(removeInst);
 533 }
 534
 535 /* Replace moves to MRFs where the value moved is the result of a
 536  * normal arithmetic operation with computation right into the MRF.
 537  */
 538 void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
 539 {
 540    int i, j, prev;
 541    struct brw_context *brw = p->brw;
 542    const int gen = brw->intel.gen;
 543    const int simd16_size = 2*REG_SIZE;
 544
 545    GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
 546    assert(removeInst);
 547
 548    for (i = 0; i < p->nr_insn; i++) {
 549       if (removeInst[i])
 550          continue;
 551
 552       struct brw_instruction *grf_inst = NULL;
 553       const struct brw_instruction *mov = p->store + i;
 554       int mrf_index, grf_index;
 555       GLboolean is_compr4;
 556
 557       /* Only consider _straight_ grf-to-mrf moves */
 558       if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
 559          continue;
 560
 561       /* Using comp4 enables a stride of 4 for this instruction */
 562       const int mrf_index0 = mrf_index;
 563       const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
 564
 565       /* Look where the register has been set */
 566       prev = i;
 567       GLboolean potential_remove = GL_FALSE;
 568       while (prev--) {
 569
 570          /* If _one_ instruction writes the grf, we try to remove the mov */
 571          struct brw_instruction *inst = p->store + prev;
 572          if (brw_is_grf_straight_write(inst, grf_index)) {
 573             potential_remove = GL_TRUE;
 574             grf_inst = inst;
 575             break;
 576          }
 577
 578       }
 579
 580       if (potential_remove == GL_FALSE)
 581          continue;
 582       removeInst[i] = GL_TRUE;
 583
 584       /* Monitor first the section of code between the grf computation and the
 585        * mov. Here we cannot read or write both mrf and grf register
 586        */
 587       for (j = prev + 1; j < i; ++j) {
 588          struct brw_instruction *inst = p->store + j;
 589          if (removeInst[j])
 590             continue;
 591          if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
 592              brw_is_grf_read(inst, grf_index, simd16_size)           ||
 593              brw_is_mrf_written(inst, mrf_index0, REG_SIZE)   ||
 594              brw_is_mrf_written(inst, mrf_index1, REG_SIZE)   ||
 595              brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
 596              brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
 597             removeInst[i] = GL_FALSE;
 598             break;
 599          }
 600       }
 601
 602       /* After the mov, we can read or write the mrf. If the grf is overwritten,
 603        * we are done
 604        */
 605       for (j = i + 1; j < p->nr_insn; ++j) {
 606          struct brw_instruction *inst = p->store + j;
 607          if (removeInst[j])
 608             continue;
 609
 610          if (brw_is_grf_read(inst, grf_index, simd16_size)) {
 611             removeInst[i] = GL_FALSE;
 612             break;
 613          }
 614
 615          if (brw_is_grf_straight_write(inst, grf_index))
 616             break;
 617       }
 618
 619       /* Note that with the top down traversal, we can safely pacth the mov
 620        * instruction
 621        */
 622       if (removeInst[i]) {
 623          grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
 624          grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
 625       }
 626    }
 627
 628    brw_remove_inst(p, removeInst);
 629    free(removeInst);
 630 }
 631
 632 static GLboolean
 633 is_single_channel_dp4(struct brw_instruction *insn)
 634 {
 635    if (insn->header.opcode != BRW_OPCODE_DP4 ||
 636        insn->header.execution_size != BRW_EXECUTE_8 ||
 637        insn->header.access_mode != BRW_ALIGN_16 ||
 638        insn->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
 639       return GL_FALSE;
 640
 641    if (!is_power_of_two(insn->bits1.da16.dest_writemask))
 642       return GL_FALSE;
 643
 644    return GL_TRUE;
 645 }
 646
 647 /**
 648  * Sets the dependency control fields on DP4 instructions.
 649  *
 650  * The hardware only tracks dependencies on a register basis, so when
 651  * you do:
 652  *
 653  * DP4 dst.x src1 src2
 654  * DP4 dst.y src1 src3
 655  * DP4 dst.z src1 src4
 656  * DP4 dst.w src1 src5
 657  *
 658  * It will wait to do the DP4 dst.y until the dst.x is resolved, etc.
 659  * We can examine our instruction stream and set the dependency
 660  * control fields to tell the hardware when to do it.
 661  *
 662  * We may want to extend this to other instructions that are used to
 663  * fill in a channel at a time of the destination register.
 664  */
 665 static void
 666 brw_set_dp4_dependency_control(struct brw_compile *p)
 667 {
 668    int i;
 669
 670    for (i = 1; i < p->nr_insn; i++) {
 671       struct brw_instruction *insn = &p->store[i];
 672       struct brw_instruction *prev = &p->store[i - 1];
 673
 674       if (!is_single_channel_dp4(prev))
 675          continue;
 676
 677       if (!is_single_channel_dp4(insn)) {
 678          i++;
 679          continue;
 680       }
 681
 682       /* Only avoid hw dep control if the write masks are different
 683        * channels of one reg.
 684        */
 685       if (insn->bits1.da16.dest_writemask == prev->bits1.da16.dest_writemask)
 686          continue;
 687       if (insn->bits1.da16.dest_reg_nr != prev->bits1.da16.dest_reg_nr)
 688          continue;
 689
 690       /* Check if the second instruction depends on the previous one
 691        * for a src.
 692        */
 693       if (insn->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE &&
 694           (insn->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
 695            insn->bits2.da1.src0_reg_nr == insn->bits1.da16.dest_reg_nr))
 696           continue;
 697       if (insn->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE &&
 698           (insn->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT ||
 699            insn->bits3.da1.src1_reg_nr == insn->bits1.da16.dest_reg_nr))
 700           continue;
 701
 702       prev->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
 703       insn->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
 704    }
 705 }
 706
 707 void
 708 brw_optimize(struct brw_compile *p)
 709 {
 710    brw_set_dp4_dependency_control(p);
 711 }