src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 #include "program/sampler.h"
  29 }
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(vec4_visitor *v,
  34                                    enum opcode opcode, dst_reg dst,
  35                                    src_reg src0, src_reg src1, src_reg src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->ir = v->base_ir;
  43    this->annotation = v->current_annotation;
  44 }
  45
  46 vec4_instruction *
  47 vec4_visitor::emit(vec4_instruction *inst)
  48 {
  49    this->instructions.push_tail(inst);
  50
  51    return inst;
  52 }
  53
  54 vec4_instruction *
  55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  56 {
  57    new_inst->ir = inst->ir;
  58    new_inst->annotation = inst->annotation;
  59
  60    inst->insert_before(new_inst);
  61
  62    return inst;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  67                    src_reg src0, src_reg src1, src_reg src2)
  68 {
  69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  70                                              src0, src1, src2));
  71 }
  72
  73
  74 vec4_instruction *
  75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  76 {
  77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  78 }
  79
  80 vec4_instruction *
  81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  90 }
  91
  92 #define ALU1(op)                                                        \
  93    vec4_instruction *                                                   \
  94    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  95    {                                                                    \
  96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  97                                            src0);                       \
  98    }
  99
 100 #define ALU2(op)                                                        \
 101    vec4_instruction *                                                   \
 102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 103    {                                                                    \
 104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 105                                            src0, src1);                 \
 106    }
 107
 108 ALU1(NOT)
 109 ALU1(MOV)
 110 ALU1(FRC)
 111 ALU1(RNDD)
 112 ALU1(RNDE)
 113 ALU1(RNDZ)
 114 ALU2(ADD)
 115 ALU2(MUL)
 116 ALU2(MACH)
 117 ALU2(AND)
 118 ALU2(OR)
 119 ALU2(XOR)
 120 ALU2(DP3)
 121 ALU2(DP4)
 122
 123 /** Gen4 predicated IF. */
 124 vec4_instruction *
 125 vec4_visitor::IF(uint32_t predicate)
 126 {
 127    vec4_instruction *inst;
 128
 129    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 130    inst->predicate = predicate;
 131
 132    return inst;
 133 }
 134
 135 /** Gen6+ IF with embedded comparison. */
 136 vec4_instruction *
 137 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 138 {
 139    assert(intel->gen >= 6);
 140
 141    vec4_instruction *inst;
 142
 143    resolve_ud_negate(&src0);
 144    resolve_ud_negate(&src1);
 145
 146    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 147                                         src0, src1);
 148    inst->conditional_mod = condition;
 149
 150    return inst;
 151 }
 152
 153 /**
 154  * CMP: Sets the low bit of the destination channels with the result
 155  * of the comparison, while the upper bits are undefined, and updates
 156  * the flag register with the packed 16 bits of the result.
 157  */
 158 vec4_instruction *
 159 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 160 {
 161    vec4_instruction *inst;
 162
 163    /* original gen4 does type conversion to the destination type
 164     * before before comparison, producing garbage results for floating
 165     * point comparisons.
 166     */
 167    if (intel->gen == 4) {
 168       dst.type = src0.type;
 169       if (dst.file == HW_REG)
 170          dst.fixed_hw_reg.type = dst.type;
 171    }
 172
 173    resolve_ud_negate(&src0);
 174    resolve_ud_negate(&src1);
 175
 176    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 177    inst->conditional_mod = condition;
 178
 179    return inst;
 180 }
 181
 182 vec4_instruction *
 183 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 184 {
 185    vec4_instruction *inst;
 186
 187    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 188                                         dst, index);
 189    inst->base_mrf = 14;
 190    inst->mlen = 1;
 191
 192    return inst;
 193 }
 194
 195 vec4_instruction *
 196 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 197 {
 198    vec4_instruction *inst;
 199
 200    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 201                                         dst, src, index);
 202    inst->base_mrf = 13;
 203    inst->mlen = 2;
 204
 205    return inst;
 206 }
 207
 208 void
 209 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 210 {
 211    static enum opcode dot_opcodes[] = {
 212       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 213    };
 214
 215    emit(dot_opcodes[elements - 2], dst, src0, src1);
 216 }
 217
 218 void
 219 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 220 {
 221    /* The gen6 math instruction ignores the source modifiers --
 222     * swizzle, abs, negate, and at least some parts of the register
 223     * region description.
 224     *
 225     * While it would seem that this MOV could be avoided at this point
 226     * in the case that the swizzle is matched up with the destination
 227     * writemask, note that uniform packing and register allocation
 228     * could rearrange our swizzle, so let's leave this matter up to
 229     * copy propagation later.
 230     */
 231    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 232    emit(MOV(dst_reg(temp_src), src));
 233
 234    if (dst.writemask != WRITEMASK_XYZW) {
 235       /* The gen6 math instruction must be align1, so we can't do
 236        * writemasks.
 237        */
 238       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 239
 240       emit(opcode, temp_dst, temp_src);
 241
 242       emit(MOV(dst, src_reg(temp_dst)));
 243    } else {
 244       emit(opcode, dst, temp_src);
 245    }
 246 }
 247
 248 void
 249 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 250 {
 251    vec4_instruction *inst = emit(opcode, dst, src);
 252    inst->base_mrf = 1;
 253    inst->mlen = 1;
 254 }
 255
 256 void
 257 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 258 {
 259    switch (opcode) {
 260    case SHADER_OPCODE_RCP:
 261    case SHADER_OPCODE_RSQ:
 262    case SHADER_OPCODE_SQRT:
 263    case SHADER_OPCODE_EXP2:
 264    case SHADER_OPCODE_LOG2:
 265    case SHADER_OPCODE_SIN:
 266    case SHADER_OPCODE_COS:
 267       break;
 268    default:
 269       assert(!"not reached: bad math opcode");
 270       return;
 271    }
 272
 273    if (intel->gen >= 7) {
 274       emit(opcode, dst, src);
 275    } else if (intel->gen == 6) {
 276       return emit_math1_gen6(opcode, dst, src);
 277    } else {
 278       return emit_math1_gen4(opcode, dst, src);
 279    }
 280 }
 281
 282 void
 283 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 284                               dst_reg dst, src_reg src0, src_reg src1)
 285 {
 286    src_reg expanded;
 287
 288    /* The gen6 math instruction ignores the source modifiers --
 289     * swizzle, abs, negate, and at least some parts of the register
 290     * region description.  Move the sources to temporaries to make it
 291     * generally work.
 292     */
 293
 294    expanded = src_reg(this, glsl_type::vec4_type);
 295    expanded.type = src0.type;
 296    emit(MOV(dst_reg(expanded), src0));
 297    src0 = expanded;
 298
 299    expanded = src_reg(this, glsl_type::vec4_type);
 300    expanded.type = src1.type;
 301    emit(MOV(dst_reg(expanded), src1));
 302    src1 = expanded;
 303
 304    if (dst.writemask != WRITEMASK_XYZW) {
 305       /* The gen6 math instruction must be align1, so we can't do
 306        * writemasks.
 307        */
 308       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 309       temp_dst.type = dst.type;
 310
 311       emit(opcode, temp_dst, src0, src1);
 312
 313       emit(MOV(dst, src_reg(temp_dst)));
 314    } else {
 315       emit(opcode, dst, src0, src1);
 316    }
 317 }
 318
 319 void
 320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 321                               dst_reg dst, src_reg src0, src_reg src1)
 322 {
 323    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 324    inst->base_mrf = 1;
 325    inst->mlen = 2;
 326 }
 327
 328 void
 329 vec4_visitor::emit_math(enum opcode opcode,
 330                         dst_reg dst, src_reg src0, src_reg src1)
 331 {
 332    switch (opcode) {
 333    case SHADER_OPCODE_POW:
 334    case SHADER_OPCODE_INT_QUOTIENT:
 335    case SHADER_OPCODE_INT_REMAINDER:
 336       break;
 337    default:
 338       assert(!"not reached: unsupported binary math opcode");
 339       return;
 340    }
 341
 342    if (intel->gen >= 7) {
 343       emit(opcode, dst, src0, src1);
 344    } else if (intel->gen == 6) {
 345       return emit_math2_gen6(opcode, dst, src0, src1);
 346    } else {
 347       return emit_math2_gen4(opcode, dst, src0, src1);
 348    }
 349 }
 350
 351 void
 352 vec4_visitor::visit_instructions(const exec_list *list)
 353 {
 354    foreach_list(node, list) {
 355       ir_instruction *ir = (ir_instruction *)node;
 356
 357       base_ir = ir;
 358       ir->accept(this);
 359    }
 360 }
 361
 362
 363 static int
 364 type_size(const struct glsl_type *type)
 365 {
 366    unsigned int i;
 367    int size;
 368
 369    switch (type->base_type) {
 370    case GLSL_TYPE_UINT:
 371    case GLSL_TYPE_INT:
 372    case GLSL_TYPE_FLOAT:
 373    case GLSL_TYPE_BOOL:
 374       if (type->is_matrix()) {
 375          return type->matrix_columns;
 376       } else {
 377          /* Regardless of size of vector, it gets a vec4. This is bad
 378           * packing for things like floats, but otherwise arrays become a
 379           * mess.  Hopefully a later pass over the code can pack scalars
 380           * down if appropriate.
 381           */
 382          return 1;
 383       }
 384    case GLSL_TYPE_ARRAY:
 385       assert(type->length > 0);
 386       return type_size(type->fields.array) * type->length;
 387    case GLSL_TYPE_STRUCT:
 388       size = 0;
 389       for (i = 0; i < type->length; i++) {
 390          size += type_size(type->fields.structure[i].type);
 391       }
 392       return size;
 393    case GLSL_TYPE_SAMPLER:
 394       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 395        * at link time.
 396        */
 397       return 1;
 398    default:
 399       assert(0);
 400       return 0;
 401    }
 402 }
 403
 404 int
 405 vec4_visitor::virtual_grf_alloc(int size)
 406 {
 407    if (virtual_grf_array_size <= virtual_grf_count) {
 408       if (virtual_grf_array_size == 0)
 409          virtual_grf_array_size = 16;
 410       else
 411          virtual_grf_array_size *= 2;
 412       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 413                                    virtual_grf_array_size);
 414       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 415                                      virtual_grf_array_size);
 416    }
 417    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 418    virtual_grf_reg_count += size;
 419    virtual_grf_sizes[virtual_grf_count] = size;
 420    return virtual_grf_count++;
 421 }
 422
 423 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 424 {
 425    init();
 426
 427    this->file = GRF;
 428    this->reg = v->virtual_grf_alloc(type_size(type));
 429
 430    if (type->is_array() || type->is_record()) {
 431       this->swizzle = BRW_SWIZZLE_NOOP;
 432    } else {
 433       this->swizzle = swizzle_for_size(type->vector_elements);
 434    }
 435
 436    this->type = brw_type_for_base_type(type);
 437 }
 438
 439 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 440 {
 441    init();
 442
 443    this->file = GRF;
 444    this->reg = v->virtual_grf_alloc(type_size(type));
 445
 446    if (type->is_array() || type->is_record()) {
 447       this->writemask = WRITEMASK_XYZW;
 448    } else {
 449       this->writemask = (1 << type->vector_elements) - 1;
 450    }
 451
 452    this->type = brw_type_for_base_type(type);
 453 }
 454
 455 /* Our support for uniforms is piggy-backed on the struct
 456  * gl_fragment_program, because that's where the values actually
 457  * get stored, rather than in some global gl_shader_program uniform
 458  * store.
 459  */
 460 int
 461 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 462 {
 463    unsigned int offset = 0;
 464    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 465
 466    if (type->is_matrix()) {
 467       const glsl_type *column = type->column_type();
 468
 469       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 470          offset += setup_uniform_values(loc + offset, column);
 471       }
 472
 473       return offset;
 474    }
 475
 476    switch (type->base_type) {
 477    case GLSL_TYPE_FLOAT:
 478    case GLSL_TYPE_UINT:
 479    case GLSL_TYPE_INT:
 480    case GLSL_TYPE_BOOL:
 481       for (unsigned int i = 0; i < type->vector_elements; i++) {
 482          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 483       }
 484
 485       /* Set up pad elements to get things aligned to a vec4 boundary. */
 486       for (unsigned int i = type->vector_elements; i < 4; i++) {
 487          static float zero = 0;
 488
 489          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 490       }
 491
 492       /* Track the size of this uniform vector, for future packing of
 493        * uniforms.
 494        */
 495       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 496       this->uniforms++;
 497
 498       return 1;
 499
 500    case GLSL_TYPE_STRUCT:
 501       for (unsigned int i = 0; i < type->length; i++) {
 502          offset += setup_uniform_values(loc + offset,
 503                                         type->fields.structure[i].type);
 504       }
 505       return offset;
 506
 507    case GLSL_TYPE_ARRAY:
 508       for (unsigned int i = 0; i < type->length; i++) {
 509          offset += setup_uniform_values(loc + offset, type->fields.array);
 510       }
 511       return offset;
 512
 513    case GLSL_TYPE_SAMPLER:
 514       /* The sampler takes up a slot, but we don't use any values from it. */
 515       return 1;
 516
 517    default:
 518       assert(!"not reached");
 519       return 0;
 520    }
 521 }
 522
 523 void
 524 vec4_visitor::setup_uniform_clipplane_values()
 525 {
 526    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 527
 528    /* Pre-Gen6, we compact clip planes.  For example, if the user
 529     * enables just clip planes 0, 1, and 3, we will enable clip planes
 530     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 531     * plane 2.  This simplifies the implementation of the Gen6 clip
 532     * thread.
 533     *
 534     * In Gen6 and later, we don't compact clip planes, because this
 535     * simplifies the implementation of gl_ClipDistance.
 536     */
 537    int compacted_clipplane_index = 0;
 538    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 539       if (intel->gen < 6 &&
 540           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
 541          continue;
 542       }
 543       this->uniform_vector_size[this->uniforms] = 4;
 544       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 545       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 546       for (int j = 0; j < 4; ++j) {
 547          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 548       }
 549       ++compacted_clipplane_index;
 550       ++this->uniforms;
 551    }
 552 }
 553
 554 /* Our support for builtin uniforms is even scarier than non-builtin.
 555  * It sits on top of the PROG_STATE_VAR parameters that are
 556  * automatically updated from GL context state.
 557  */
 558 void
 559 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 560 {
 561    const ir_state_slot *const slots = ir->state_slots;
 562    assert(ir->state_slots != NULL);
 563
 564    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 565       /* This state reference has already been setup by ir_to_mesa,
 566        * but we'll get the same index back here.  We can reference
 567        * ParameterValues directly, since unlike brw_fs.cpp, we never
 568        * add new state references during compile.
 569        */
 570       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 571                                             (gl_state_index *)slots[i].tokens);
 572       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 573
 574       this->uniform_vector_size[this->uniforms] = 0;
 575       /* Add each of the unique swizzled channels of the element.
 576        * This will end up matching the size of the glsl_type of this field.
 577        */
 578       int last_swiz = -1;
 579       for (unsigned int j = 0; j < 4; j++) {
 580          int swiz = GET_SWZ(slots[i].swizzle, j);
 581          last_swiz = swiz;
 582
 583          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 584          if (swiz <= last_swiz)
 585             this->uniform_vector_size[this->uniforms]++;
 586       }
 587       this->uniforms++;
 588    }
 589 }
 590
 591 dst_reg *
 592 vec4_visitor::variable_storage(ir_variable *var)
 593 {
 594    return (dst_reg *)hash_table_find(this->variable_ht, var);
 595 }
 596
 597 void
 598 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 599 {
 600    ir_expression *expr = ir->as_expression();
 601
 602    *predicate = BRW_PREDICATE_NORMAL;
 603
 604    if (expr) {
 605       src_reg op[2];
 606       vec4_instruction *inst;
 607
 608       assert(expr->get_num_operands() <= 2);
 609       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 610          expr->operands[i]->accept(this);
 611          op[i] = this->result;
 612
 613          resolve_ud_negate(&op[i]);
 614       }
 615
 616       switch (expr->operation) {
 617       case ir_unop_logic_not:
 618          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 619          inst->conditional_mod = BRW_CONDITIONAL_Z;
 620          break;
 621
 622       case ir_binop_logic_xor:
 623          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 624          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 625          break;
 626
 627       case ir_binop_logic_or:
 628          inst = emit(OR(dst_null_d(), op[0], op[1]));
 629          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 630          break;
 631
 632       case ir_binop_logic_and:
 633          inst = emit(AND(dst_null_d(), op[0], op[1]));
 634          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 635          break;
 636
 637       case ir_unop_f2b:
 638          if (intel->gen >= 6) {
 639             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 640          } else {
 641             inst = emit(MOV(dst_null_f(), op[0]));
 642             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 643          }
 644          break;
 645
 646       case ir_unop_i2b:
 647          if (intel->gen >= 6) {
 648             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 649          } else {
 650             inst = emit(MOV(dst_null_d(), op[0]));
 651             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 652          }
 653          break;
 654
 655       case ir_binop_all_equal:
 656          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 657          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 658          break;
 659
 660       case ir_binop_any_nequal:
 661          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 662          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 663          break;
 664
 665       case ir_unop_any:
 666          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 667          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 668          break;
 669
 670       case ir_binop_greater:
 671       case ir_binop_gequal:
 672       case ir_binop_less:
 673       case ir_binop_lequal:
 674       case ir_binop_equal:
 675       case ir_binop_nequal:
 676          emit(CMP(dst_null_d(), op[0], op[1],
 677                   brw_conditional_for_comparison(expr->operation)));
 678          break;
 679
 680       default:
 681          assert(!"not reached");
 682          break;
 683       }
 684       return;
 685    }
 686
 687    ir->accept(this);
 688
 689    resolve_ud_negate(&this->result);
 690
 691    if (intel->gen >= 6) {
 692       vec4_instruction *inst = emit(AND(dst_null_d(),
 693                                         this->result, src_reg(1)));
 694       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 695    } else {
 696       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 697       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 698    }
 699 }
 700
 701 /**
 702  * Emit a gen6 IF statement with the comparison folded into the IF
 703  * instruction.
 704  */
 705 void
 706 vec4_visitor::emit_if_gen6(ir_if *ir)
 707 {
 708    ir_expression *expr = ir->condition->as_expression();
 709
 710    if (expr) {
 711       src_reg op[2];
 712       dst_reg temp;
 713
 714       assert(expr->get_num_operands() <= 2);
 715       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 716          expr->operands[i]->accept(this);
 717          op[i] = this->result;
 718       }
 719
 720       switch (expr->operation) {
 721       case ir_unop_logic_not:
 722          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 723          return;
 724
 725       case ir_binop_logic_xor:
 726          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 727          return;
 728
 729       case ir_binop_logic_or:
 730          temp = dst_reg(this, glsl_type::bool_type);
 731          emit(OR(temp, op[0], op[1]));
 732          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 733          return;
 734
 735       case ir_binop_logic_and:
 736          temp = dst_reg(this, glsl_type::bool_type);
 737          emit(AND(temp, op[0], op[1]));
 738          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 739          return;
 740
 741       case ir_unop_f2b:
 742          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 743          return;
 744
 745       case ir_unop_i2b:
 746          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 747          return;
 748
 749       case ir_binop_greater:
 750       case ir_binop_gequal:
 751       case ir_binop_less:
 752       case ir_binop_lequal:
 753       case ir_binop_equal:
 754       case ir_binop_nequal:
 755          emit(IF(op[0], op[1],
 756                  brw_conditional_for_comparison(expr->operation)));
 757          return;
 758
 759       case ir_binop_all_equal:
 760          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 761          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 762          return;
 763
 764       case ir_binop_any_nequal:
 765          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 766          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 767          return;
 768
 769       case ir_unop_any:
 770          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 771          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 772          return;
 773
 774       default:
 775          assert(!"not reached");
 776          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 777          return;
 778       }
 779       return;
 780    }
 781
 782    ir->condition->accept(this);
 783
 784    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 785 }
 786
 787 void
 788 vec4_visitor::visit(ir_variable *ir)
 789 {
 790    dst_reg *reg = NULL;
 791
 792    if (variable_storage(ir))
 793       return;
 794
 795    switch (ir->mode) {
 796    case ir_var_in:
 797       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 798
 799       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 800        * come in as floating point conversions of the integer values.
 801        */
 802       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 803          if (!c->key.gl_fixed_input_size[i])
 804             continue;
 805
 806          dst_reg dst = *reg;
 807          dst.type = brw_type_for_base_type(ir->type);
 808          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 809          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 810       }
 811       break;
 812
 813    case ir_var_out:
 814       reg = new(mem_ctx) dst_reg(this, ir->type);
 815
 816       for (int i = 0; i < type_size(ir->type); i++) {
 817          output_reg[ir->location + i] = *reg;
 818          output_reg[ir->location + i].reg_offset = i;
 819          output_reg[ir->location + i].type =
 820             brw_type_for_base_type(ir->type->get_scalar_type());
 821          output_reg_annotation[ir->location + i] = ir->name;
 822       }
 823       break;
 824
 825    case ir_var_auto:
 826    case ir_var_temporary:
 827       reg = new(mem_ctx) dst_reg(this, ir->type);
 828       break;
 829
 830    case ir_var_uniform:
 831       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 832
 833       /* Thanks to the lower_ubo_reference pass, we will see only
 834        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 835        * variables, so no need for them to be in variable_ht.
 836        */
 837       if (ir->uniform_block != -1)
 838          return;
 839
 840       /* Track how big the whole uniform variable is, in case we need to put a
 841        * copy of its data into pull constants for array access.
 842        */
 843       this->uniform_size[this->uniforms] = type_size(ir->type);
 844
 845       if (!strncmp(ir->name, "gl_", 3)) {
 846          setup_builtin_uniform_values(ir);
 847       } else {
 848          setup_uniform_values(ir->location, ir->type);
 849       }
 850       break;
 851
 852    case ir_var_system_value:
 853       /* VertexID is stored by the VF as the last vertex element, but
 854        * we don't represent it with a flag in inputs_read, so we call
 855        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 856        */
 857       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 858       prog_data->uses_vertexid = true;
 859
 860       switch (ir->location) {
 861       case SYSTEM_VALUE_VERTEX_ID:
 862          reg->writemask = WRITEMASK_X;
 863          break;
 864       case SYSTEM_VALUE_INSTANCE_ID:
 865          reg->writemask = WRITEMASK_Y;
 866          break;
 867       default:
 868          assert(!"not reached");
 869          break;
 870       }
 871       break;
 872
 873    default:
 874       assert(!"not reached");
 875    }
 876
 877    reg->type = brw_type_for_base_type(ir->type);
 878    hash_table_insert(this->variable_ht, reg, ir);
 879 }
 880
 881 void
 882 vec4_visitor::visit(ir_loop *ir)
 883 {
 884    dst_reg counter;
 885
 886    /* We don't want debugging output to print the whole body of the
 887     * loop as the annotation.
 888     */
 889    this->base_ir = NULL;
 890
 891    if (ir->counter != NULL) {
 892       this->base_ir = ir->counter;
 893       ir->counter->accept(this);
 894       counter = *(variable_storage(ir->counter));
 895
 896       if (ir->from != NULL) {
 897          this->base_ir = ir->from;
 898          ir->from->accept(this);
 899
 900          emit(MOV(counter, this->result));
 901       }
 902    }
 903
 904    emit(BRW_OPCODE_DO);
 905
 906    if (ir->to) {
 907       this->base_ir = ir->to;
 908       ir->to->accept(this);
 909
 910       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 911                brw_conditional_for_comparison(ir->cmp)));
 912
 913       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 914       inst->predicate = BRW_PREDICATE_NORMAL;
 915    }
 916
 917    visit_instructions(&ir->body_instructions);
 918
 919
 920    if (ir->increment) {
 921       this->base_ir = ir->increment;
 922       ir->increment->accept(this);
 923       emit(ADD(counter, src_reg(counter), this->result));
 924    }
 925
 926    emit(BRW_OPCODE_WHILE);
 927 }
 928
 929 void
 930 vec4_visitor::visit(ir_loop_jump *ir)
 931 {
 932    switch (ir->mode) {
 933    case ir_loop_jump::jump_break:
 934       emit(BRW_OPCODE_BREAK);
 935       break;
 936    case ir_loop_jump::jump_continue:
 937       emit(BRW_OPCODE_CONTINUE);
 938       break;
 939    }
 940 }
 941
 942
 943 void
 944 vec4_visitor::visit(ir_function_signature *ir)
 945 {
 946    assert(0);
 947    (void)ir;
 948 }
 949
 950 void
 951 vec4_visitor::visit(ir_function *ir)
 952 {
 953    /* Ignore function bodies other than main() -- we shouldn't see calls to
 954     * them since they should all be inlined.
 955     */
 956    if (strcmp(ir->name, "main") == 0) {
 957       const ir_function_signature *sig;
 958       exec_list empty;
 959
 960       sig = ir->matching_signature(&empty);
 961
 962       assert(sig);
 963
 964       visit_instructions(&sig->body);
 965    }
 966 }
 967
 968 bool
 969 vec4_visitor::try_emit_sat(ir_expression *ir)
 970 {
 971    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 972    if (!sat_src)
 973       return false;
 974
 975    sat_src->accept(this);
 976    src_reg src = this->result;
 977
 978    this->result = src_reg(this, ir->type);
 979    vec4_instruction *inst;
 980    inst = emit(MOV(dst_reg(this->result), src));
 981    inst->saturate = true;
 982
 983    return true;
 984 }
 985
 986 void
 987 vec4_visitor::emit_bool_comparison(unsigned int op,
 988                                  dst_reg dst, src_reg src0, src_reg src1)
 989 {
 990    /* original gen4 does destination conversion before comparison. */
 991    if (intel->gen < 5)
 992       dst.type = src0.type;
 993
 994    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 995
 996    dst.type = BRW_REGISTER_TYPE_D;
 997    emit(AND(dst, src_reg(dst), src_reg(0x1)));
 998 }
 999
1000 void
1001 vec4_visitor::visit(ir_expression *ir)
1002 {
1003    unsigned int operand;
1004    src_reg op[Elements(ir->operands)];
1005    src_reg result_src;
1006    dst_reg result_dst;
1007    vec4_instruction *inst;
1008
1009    if (try_emit_sat(ir))
1010       return;
1011
1012    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1013       this->result.file = BAD_FILE;
1014       ir->operands[operand]->accept(this);
1015       if (this->result.file == BAD_FILE) {
1016          printf("Failed to get tree for expression operand:\n");
1017          ir->operands[operand]->print();
1018          exit(1);
1019       }
1020       op[operand] = this->result;
1021
1022       /* Matrix expression operands should have been broken down to vector
1023        * operations already.
1024        */
1025       assert(!ir->operands[operand]->type->is_matrix());
1026    }
1027
1028    int vector_elements = ir->operands[0]->type->vector_elements;
1029    if (ir->operands[1]) {
1030       vector_elements = MAX2(vector_elements,
1031                              ir->operands[1]->type->vector_elements);
1032    }
1033
1034    this->result.file = BAD_FILE;
1035
1036    /* Storage for our result.  Ideally for an assignment we'd be using
1037     * the actual storage for the result here, instead.
1038     */
1039    result_src = src_reg(this, ir->type);
1040    /* convenience for the emit functions below. */
1041    result_dst = dst_reg(result_src);
1042    /* If nothing special happens, this is the result. */
1043    this->result = result_src;
1044    /* Limit writes to the channels that will be used by result_src later.
1045     * This does limit this temp's use as a temporary for multi-instruction
1046     * sequences.
1047     */
1048    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1049
1050    switch (ir->operation) {
1051    case ir_unop_logic_not:
1052       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1053        * ones complement of the whole register, not just bit 0.
1054        */
1055       emit(XOR(result_dst, op[0], src_reg(1)));
1056       break;
1057    case ir_unop_neg:
1058       op[0].negate = !op[0].negate;
1059       this->result = op[0];
1060       break;
1061    case ir_unop_abs:
1062       op[0].abs = true;
1063       op[0].negate = false;
1064       this->result = op[0];
1065       break;
1066
1067    case ir_unop_sign:
1068       emit(MOV(result_dst, src_reg(0.0f)));
1069
1070       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1071       inst = emit(MOV(result_dst, src_reg(1.0f)));
1072       inst->predicate = BRW_PREDICATE_NORMAL;
1073
1074       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1075       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1076       inst->predicate = BRW_PREDICATE_NORMAL;
1077
1078       break;
1079
1080    case ir_unop_rcp:
1081       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1082       break;
1083
1084    case ir_unop_exp2:
1085       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1086       break;
1087    case ir_unop_log2:
1088       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1089       break;
1090    case ir_unop_exp:
1091    case ir_unop_log:
1092       assert(!"not reached: should be handled by ir_explog_to_explog2");
1093       break;
1094    case ir_unop_sin:
1095    case ir_unop_sin_reduced:
1096       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1097       break;
1098    case ir_unop_cos:
1099    case ir_unop_cos_reduced:
1100       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1101       break;
1102
1103    case ir_unop_dFdx:
1104    case ir_unop_dFdy:
1105       assert(!"derivatives not valid in vertex shader");
1106       break;
1107
1108    case ir_unop_noise:
1109       assert(!"not reached: should be handled by lower_noise");
1110       break;
1111
1112    case ir_binop_add:
1113       emit(ADD(result_dst, op[0], op[1]));
1114       break;
1115    case ir_binop_sub:
1116       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1117       break;
1118
1119    case ir_binop_mul:
1120       if (ir->type->is_integer()) {
1121          /* For integer multiplication, the MUL uses the low 16 bits
1122           * of one of the operands (src0 on gen6, src1 on gen7).  The
1123           * MACH accumulates in the contribution of the upper 16 bits
1124           * of that operand.
1125           *
1126           * FINISHME: Emit just the MUL if we know an operand is small
1127           * enough.
1128           */
1129          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1130
1131          emit(MUL(acc, op[0], op[1]));
1132          emit(MACH(dst_null_d(), op[0], op[1]));
1133          emit(MOV(result_dst, src_reg(acc)));
1134       } else {
1135          emit(MUL(result_dst, op[0], op[1]));
1136       }
1137       break;
1138    case ir_binop_div:
1139       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1140       assert(ir->type->is_integer());
1141       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1142       break;
1143    case ir_binop_mod:
1144       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1145       assert(ir->type->is_integer());
1146       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1147       break;
1148
1149    case ir_binop_less:
1150    case ir_binop_greater:
1151    case ir_binop_lequal:
1152    case ir_binop_gequal:
1153    case ir_binop_equal:
1154    case ir_binop_nequal: {
1155       emit(CMP(result_dst, op[0], op[1],
1156                brw_conditional_for_comparison(ir->operation)));
1157       emit(AND(result_dst, result_src, src_reg(0x1)));
1158       break;
1159    }
1160
1161    case ir_binop_all_equal:
1162       /* "==" operator producing a scalar boolean. */
1163       if (ir->operands[0]->type->is_vector() ||
1164           ir->operands[1]->type->is_vector()) {
1165          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1166          emit(MOV(result_dst, src_reg(0)));
1167          inst = emit(MOV(result_dst, src_reg(1)));
1168          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1169       } else {
1170          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1171          emit(AND(result_dst, result_src, src_reg(0x1)));
1172       }
1173       break;
1174    case ir_binop_any_nequal:
1175       /* "!=" operator producing a scalar boolean. */
1176       if (ir->operands[0]->type->is_vector() ||
1177           ir->operands[1]->type->is_vector()) {
1178          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1179
1180          emit(MOV(result_dst, src_reg(0)));
1181          inst = emit(MOV(result_dst, src_reg(1)));
1182          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1183       } else {
1184          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1185          emit(AND(result_dst, result_src, src_reg(0x1)));
1186       }
1187       break;
1188
1189    case ir_unop_any:
1190       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1191       emit(MOV(result_dst, src_reg(0)));
1192
1193       inst = emit(MOV(result_dst, src_reg(1)));
1194       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1195       break;
1196
1197    case ir_binop_logic_xor:
1198       emit(XOR(result_dst, op[0], op[1]));
1199       break;
1200
1201    case ir_binop_logic_or:
1202       emit(OR(result_dst, op[0], op[1]));
1203       break;
1204
1205    case ir_binop_logic_and:
1206       emit(AND(result_dst, op[0], op[1]));
1207       break;
1208
1209    case ir_binop_dot:
1210       assert(ir->operands[0]->type->is_vector());
1211       assert(ir->operands[0]->type == ir->operands[1]->type);
1212       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1213       break;
1214
1215    case ir_unop_sqrt:
1216       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1217       break;
1218    case ir_unop_rsq:
1219       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1220       break;
1221
1222    case ir_unop_bitcast_i2f:
1223    case ir_unop_bitcast_u2f:
1224       this->result = op[0];
1225       this->result.type = BRW_REGISTER_TYPE_F;
1226       break;
1227
1228    case ir_unop_bitcast_f2i:
1229       this->result = op[0];
1230       this->result.type = BRW_REGISTER_TYPE_D;
1231       break;
1232
1233    case ir_unop_bitcast_f2u:
1234       this->result = op[0];
1235       this->result.type = BRW_REGISTER_TYPE_UD;
1236       break;
1237
1238    case ir_unop_i2f:
1239    case ir_unop_i2u:
1240    case ir_unop_u2i:
1241    case ir_unop_u2f:
1242    case ir_unop_b2f:
1243    case ir_unop_b2i:
1244    case ir_unop_f2i:
1245    case ir_unop_f2u:
1246       emit(MOV(result_dst, op[0]));
1247       break;
1248    case ir_unop_f2b:
1249    case ir_unop_i2b: {
1250       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1251       emit(AND(result_dst, result_src, src_reg(1)));
1252       break;
1253    }
1254
1255    case ir_unop_trunc:
1256       emit(RNDZ(result_dst, op[0]));
1257       break;
1258    case ir_unop_ceil:
1259       op[0].negate = !op[0].negate;
1260       inst = emit(RNDD(result_dst, op[0]));
1261       this->result.negate = true;
1262       break;
1263    case ir_unop_floor:
1264       inst = emit(RNDD(result_dst, op[0]));
1265       break;
1266    case ir_unop_fract:
1267       inst = emit(FRC(result_dst, op[0]));
1268       break;
1269    case ir_unop_round_even:
1270       emit(RNDE(result_dst, op[0]));
1271       break;
1272
1273    case ir_binop_min:
1274       if (intel->gen >= 6) {
1275          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1276          inst->conditional_mod = BRW_CONDITIONAL_L;
1277       } else {
1278          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1279
1280          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1281          inst->predicate = BRW_PREDICATE_NORMAL;
1282       }
1283       break;
1284    case ir_binop_max:
1285       if (intel->gen >= 6) {
1286          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1287          inst->conditional_mod = BRW_CONDITIONAL_G;
1288       } else {
1289          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1290
1291          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1292          inst->predicate = BRW_PREDICATE_NORMAL;
1293       }
1294       break;
1295
1296    case ir_binop_pow:
1297       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1298       break;
1299
1300    case ir_unop_bit_not:
1301       inst = emit(NOT(result_dst, op[0]));
1302       break;
1303    case ir_binop_bit_and:
1304       inst = emit(AND(result_dst, op[0], op[1]));
1305       break;
1306    case ir_binop_bit_xor:
1307       inst = emit(XOR(result_dst, op[0], op[1]));
1308       break;
1309    case ir_binop_bit_or:
1310       inst = emit(OR(result_dst, op[0], op[1]));
1311       break;
1312
1313    case ir_binop_lshift:
1314       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1315       break;
1316
1317    case ir_binop_rshift:
1318       if (ir->type->base_type == GLSL_TYPE_INT)
1319          inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1320       else
1321          inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1322       break;
1323
1324    case ir_binop_ubo_load: {
1325       ir_constant *uniform_block = ir->operands[0]->as_constant();
1326       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1327       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1328       src_reg offset = op[1];
1329
1330       /* Now, load the vector from that offset. */
1331       assert(ir->type->is_vector() || ir->type->is_scalar());
1332
1333       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1334       packed_consts.type = result.type;
1335       src_reg surf_index =
1336          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1337       if (const_offset_ir) {
1338          offset = src_reg(const_offset / 16);
1339       } else {
1340          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1341       }
1342
1343       vec4_instruction *pull =
1344          emit(new(mem_ctx) vec4_instruction(this,
1345                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1346                                             dst_reg(packed_consts),
1347                                             surf_index,
1348                                             offset));
1349       pull->base_mrf = 14;
1350       pull->mlen = 1;
1351
1352       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1353       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1354                                             const_offset % 16 / 4,
1355                                             const_offset % 16 / 4,
1356                                             const_offset % 16 / 4);
1357
1358       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1359       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1360          emit(CMP(result_dst, packed_consts, src_reg(0u),
1361                   BRW_CONDITIONAL_NZ));
1362          emit(AND(result_dst, result, src_reg(0x1)));
1363       } else {
1364          emit(MOV(result_dst, packed_consts));
1365       }
1366       break;
1367    }
1368
1369    case ir_quadop_vector:
1370       assert(!"not reached: should be handled by lower_quadop_vector");
1371       break;
1372    }
1373 }
1374
1375
1376 void
1377 vec4_visitor::visit(ir_swizzle *ir)
1378 {
1379    src_reg src;
1380    int i = 0;
1381    int swizzle[4];
1382
1383    /* Note that this is only swizzles in expressions, not those on the left
1384     * hand side of an assignment, which do write masking.  See ir_assignment
1385     * for that.
1386     */
1387
1388    ir->val->accept(this);
1389    src = this->result;
1390    assert(src.file != BAD_FILE);
1391
1392    for (i = 0; i < ir->type->vector_elements; i++) {
1393       switch (i) {
1394       case 0:
1395          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1396          break;
1397       case 1:
1398          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1399          break;
1400       case 2:
1401          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1402          break;
1403       case 3:
1404          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1405             break;
1406       }
1407    }
1408    for (; i < 4; i++) {
1409       /* Replicate the last channel out. */
1410       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1411    }
1412
1413    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1414
1415    this->result = src;
1416 }
1417
1418 void
1419 vec4_visitor::visit(ir_dereference_variable *ir)
1420 {
1421    const struct glsl_type *type = ir->type;
1422    dst_reg *reg = variable_storage(ir->var);
1423
1424    if (!reg) {
1425       fail("Failed to find variable storage for %s\n", ir->var->name);
1426       this->result = src_reg(brw_null_reg());
1427       return;
1428    }
1429
1430    this->result = src_reg(*reg);
1431
1432    /* System values get their swizzle from the dst_reg writemask */
1433    if (ir->var->mode == ir_var_system_value)
1434       return;
1435
1436    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1437       this->result.swizzle = swizzle_for_size(type->vector_elements);
1438 }
1439
1440 void
1441 vec4_visitor::visit(ir_dereference_array *ir)
1442 {
1443    ir_constant *constant_index;
1444    src_reg src;
1445    int element_size = type_size(ir->type);
1446
1447    constant_index = ir->array_index->constant_expression_value();
1448
1449    ir->array->accept(this);
1450    src = this->result;
1451
1452    if (constant_index) {
1453       src.reg_offset += constant_index->value.i[0] * element_size;
1454    } else {
1455       /* Variable index array dereference.  It eats the "vec4" of the
1456        * base of the array and an index that offsets the Mesa register
1457        * index.
1458        */
1459       ir->array_index->accept(this);
1460
1461       src_reg index_reg;
1462
1463       if (element_size == 1) {
1464          index_reg = this->result;
1465       } else {
1466          index_reg = src_reg(this, glsl_type::int_type);
1467
1468          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1469       }
1470
1471       if (src.reladdr) {
1472          src_reg temp = src_reg(this, glsl_type::int_type);
1473
1474          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1475
1476          index_reg = temp;
1477       }
1478
1479       src.reladdr = ralloc(mem_ctx, src_reg);
1480       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1481    }
1482
1483    /* If the type is smaller than a vec4, replicate the last channel out. */
1484    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1485       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1486    else
1487       src.swizzle = BRW_SWIZZLE_NOOP;
1488    src.type = brw_type_for_base_type(ir->type);
1489
1490    this->result = src;
1491 }
1492
1493 void
1494 vec4_visitor::visit(ir_dereference_record *ir)
1495 {
1496    unsigned int i;
1497    const glsl_type *struct_type = ir->record->type;
1498    int offset = 0;
1499
1500    ir->record->accept(this);
1501
1502    for (i = 0; i < struct_type->length; i++) {
1503       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1504          break;
1505       offset += type_size(struct_type->fields.structure[i].type);
1506    }
1507
1508    /* If the type is smaller than a vec4, replicate the last channel out. */
1509    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1510       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1511    else
1512       this->result.swizzle = BRW_SWIZZLE_NOOP;
1513    this->result.type = brw_type_for_base_type(ir->type);
1514
1515    this->result.reg_offset += offset;
1516 }
1517
1518 /**
1519  * We want to be careful in assignment setup to hit the actual storage
1520  * instead of potentially using a temporary like we might with the
1521  * ir_dereference handler.
1522  */
1523 static dst_reg
1524 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1525 {
1526    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1527     * access of a vector, it must be separated into a series conditional moves
1528     * before reaching this point (see ir_vec_index_to_cond_assign).
1529     */
1530    assert(ir->as_dereference());
1531    ir_dereference_array *deref_array = ir->as_dereference_array();
1532    if (deref_array) {
1533       assert(!deref_array->array->type->is_vector());
1534    }
1535
1536    /* Use the rvalue deref handler for the most part.  We'll ignore
1537     * swizzles in it and write swizzles using writemask, though.
1538     */
1539    ir->accept(v);
1540    return dst_reg(v->result);
1541 }
1542
1543 void
1544 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1545                               const struct glsl_type *type, uint32_t predicate)
1546 {
1547    if (type->base_type == GLSL_TYPE_STRUCT) {
1548       for (unsigned int i = 0; i < type->length; i++) {
1549          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1550       }
1551       return;
1552    }
1553
1554    if (type->is_array()) {
1555       for (unsigned int i = 0; i < type->length; i++) {
1556          emit_block_move(dst, src, type->fields.array, predicate);
1557       }
1558       return;
1559    }
1560
1561    if (type->is_matrix()) {
1562       const struct glsl_type *vec_type;
1563
1564       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1565                                          type->vector_elements, 1);
1566
1567       for (int i = 0; i < type->matrix_columns; i++) {
1568          emit_block_move(dst, src, vec_type, predicate);
1569       }
1570       return;
1571    }
1572
1573    assert(type->is_scalar() || type->is_vector());
1574
1575    dst->type = brw_type_for_base_type(type);
1576    src->type = dst->type;
1577
1578    dst->writemask = (1 << type->vector_elements) - 1;
1579
1580    src->swizzle = swizzle_for_size(type->vector_elements);
1581
1582    vec4_instruction *inst = emit(MOV(*dst, *src));
1583    inst->predicate = predicate;
1584
1585    dst->reg_offset++;
1586    src->reg_offset++;
1587 }
1588
1589
1590 /* If the RHS processing resulted in an instruction generating a
1591  * temporary value, and it would be easy to rewrite the instruction to
1592  * generate its result right into the LHS instead, do so.  This ends
1593  * up reliably removing instructions where it can be tricky to do so
1594  * later without real UD chain information.
1595  */
1596 bool
1597 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1598                                      dst_reg dst,
1599                                      src_reg src,
1600                                      vec4_instruction *pre_rhs_inst,
1601                                      vec4_instruction *last_rhs_inst)
1602 {
1603    /* This could be supported, but it would take more smarts. */
1604    if (ir->condition)
1605       return false;
1606
1607    if (pre_rhs_inst == last_rhs_inst)
1608       return false; /* No instructions generated to work with. */
1609
1610    /* Make sure the last instruction generated our source reg. */
1611    if (src.file != GRF ||
1612        src.file != last_rhs_inst->dst.file ||
1613        src.reg != last_rhs_inst->dst.reg ||
1614        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1615        src.reladdr ||
1616        src.abs ||
1617        src.negate ||
1618        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1619       return false;
1620
1621    /* Check that that last instruction fully initialized the channels
1622     * we want to use, in the order we want to use them.  We could
1623     * potentially reswizzle the operands of many instructions so that
1624     * we could handle out of order channels, but don't yet.
1625     */
1626
1627    for (unsigned i = 0; i < 4; i++) {
1628       if (dst.writemask & (1 << i)) {
1629          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1630             return false;
1631
1632          if (BRW_GET_SWZ(src.swizzle, i) != i)
1633             return false;
1634       }
1635    }
1636
1637    /* Success!  Rewrite the instruction. */
1638    last_rhs_inst->dst.file = dst.file;
1639    last_rhs_inst->dst.reg = dst.reg;
1640    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1641    last_rhs_inst->dst.reladdr = dst.reladdr;
1642    last_rhs_inst->dst.writemask &= dst.writemask;
1643
1644    return true;
1645 }
1646
1647 void
1648 vec4_visitor::visit(ir_assignment *ir)
1649 {
1650    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1651    uint32_t predicate = BRW_PREDICATE_NONE;
1652
1653    if (!ir->lhs->type->is_scalar() &&
1654        !ir->lhs->type->is_vector()) {
1655       ir->rhs->accept(this);
1656       src_reg src = this->result;
1657
1658       if (ir->condition) {
1659          emit_bool_to_cond_code(ir->condition, &predicate);
1660       }
1661
1662       /* emit_block_move doesn't account for swizzles in the source register.
1663        * This should be ok, since the source register is a structure or an
1664        * array, and those can't be swizzled.  But double-check to be sure.
1665        */
1666       assert(src.swizzle ==
1667              (ir->rhs->type->is_matrix()
1668               ? swizzle_for_size(ir->rhs->type->vector_elements)
1669               : BRW_SWIZZLE_NOOP));
1670
1671       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1672       return;
1673    }
1674
1675    /* Now we're down to just a scalar/vector with writemasks. */
1676    int i;
1677
1678    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1679    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1680
1681    ir->rhs->accept(this);
1682
1683    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1684
1685    src_reg src = this->result;
1686
1687    int swizzles[4];
1688    int first_enabled_chan = 0;
1689    int src_chan = 0;
1690
1691    assert(ir->lhs->type->is_vector() ||
1692           ir->lhs->type->is_scalar());
1693    dst.writemask = ir->write_mask;
1694
1695    for (int i = 0; i < 4; i++) {
1696       if (dst.writemask & (1 << i)) {
1697          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1698          break;
1699       }
1700    }
1701
1702    /* Swizzle a small RHS vector into the channels being written.
1703     *
1704     * glsl ir treats write_mask as dictating how many channels are
1705     * present on the RHS while in our instructions we need to make
1706     * those channels appear in the slots of the vec4 they're written to.
1707     */
1708    for (int i = 0; i < 4; i++) {
1709       if (dst.writemask & (1 << i))
1710          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1711       else
1712          swizzles[i] = first_enabled_chan;
1713    }
1714    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1715                               swizzles[2], swizzles[3]);
1716
1717    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1718       return;
1719    }
1720
1721    if (ir->condition) {
1722       emit_bool_to_cond_code(ir->condition, &predicate);
1723    }
1724
1725    for (i = 0; i < type_size(ir->lhs->type); i++) {
1726       vec4_instruction *inst = emit(MOV(dst, src));
1727       inst->predicate = predicate;
1728
1729       dst.reg_offset++;
1730       src.reg_offset++;
1731    }
1732 }
1733
1734 void
1735 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1736 {
1737    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1738       foreach_list(node, &ir->components) {
1739          ir_constant *field_value = (ir_constant *)node;
1740
1741          emit_constant_values(dst, field_value);
1742       }
1743       return;
1744    }
1745
1746    if (ir->type->is_array()) {
1747       for (unsigned int i = 0; i < ir->type->length; i++) {
1748          emit_constant_values(dst, ir->array_elements[i]);
1749       }
1750       return;
1751    }
1752
1753    if (ir->type->is_matrix()) {
1754       for (int i = 0; i < ir->type->matrix_columns; i++) {
1755          float *vec = &ir->value.f[i * ir->type->vector_elements];
1756
1757          for (int j = 0; j < ir->type->vector_elements; j++) {
1758             dst->writemask = 1 << j;
1759             dst->type = BRW_REGISTER_TYPE_F;
1760
1761             emit(MOV(*dst, src_reg(vec[j])));
1762          }
1763          dst->reg_offset++;
1764       }
1765       return;
1766    }
1767
1768    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1769
1770    for (int i = 0; i < ir->type->vector_elements; i++) {
1771       if (!(remaining_writemask & (1 << i)))
1772          continue;
1773
1774       dst->writemask = 1 << i;
1775       dst->type = brw_type_for_base_type(ir->type);
1776
1777       /* Find other components that match the one we're about to
1778        * write.  Emits fewer instructions for things like vec4(0.5,
1779        * 1.5, 1.5, 1.5).
1780        */
1781       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1782          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1783             if (ir->value.b[i] == ir->value.b[j])
1784                dst->writemask |= (1 << j);
1785          } else {
1786             /* u, i, and f storage all line up, so no need for a
1787              * switch case for comparing each type.
1788              */
1789             if (ir->value.u[i] == ir->value.u[j])
1790                dst->writemask |= (1 << j);
1791          }
1792       }
1793
1794       switch (ir->type->base_type) {
1795       case GLSL_TYPE_FLOAT:
1796          emit(MOV(*dst, src_reg(ir->value.f[i])));
1797          break;
1798       case GLSL_TYPE_INT:
1799          emit(MOV(*dst, src_reg(ir->value.i[i])));
1800          break;
1801       case GLSL_TYPE_UINT:
1802          emit(MOV(*dst, src_reg(ir->value.u[i])));
1803          break;
1804       case GLSL_TYPE_BOOL:
1805          emit(MOV(*dst, src_reg(ir->value.b[i])));
1806          break;
1807       default:
1808          assert(!"Non-float/uint/int/bool constant");
1809          break;
1810       }
1811
1812       remaining_writemask &= ~dst->writemask;
1813    }
1814    dst->reg_offset++;
1815 }
1816
1817 void
1818 vec4_visitor::visit(ir_constant *ir)
1819 {
1820    dst_reg dst = dst_reg(this, ir->type);
1821    this->result = src_reg(dst);
1822
1823    emit_constant_values(&dst, ir);
1824 }
1825
1826 void
1827 vec4_visitor::visit(ir_call *ir)
1828 {
1829    assert(!"not reached");
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_texture *ir)
1834 {
1835    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1836
1837    /* Should be lowered by do_lower_texture_projection */
1838    assert(!ir->projector);
1839
1840    /* Generate code to compute all the subexpression trees.  This has to be
1841     * done before loading any values into MRFs for the sampler message since
1842     * generating these values may involve SEND messages that need the MRFs.
1843     */
1844    src_reg coordinate;
1845    if (ir->coordinate) {
1846       ir->coordinate->accept(this);
1847       coordinate = this->result;
1848    }
1849
1850    src_reg shadow_comparitor;
1851    if (ir->shadow_comparitor) {
1852       ir->shadow_comparitor->accept(this);
1853       shadow_comparitor = this->result;
1854    }
1855
1856    const glsl_type *lod_type;
1857    src_reg lod, dPdx, dPdy;
1858    switch (ir->op) {
1859    case ir_txf:
1860    case ir_txl:
1861    case ir_txs:
1862       ir->lod_info.lod->accept(this);
1863       lod = this->result;
1864       lod_type = ir->lod_info.lod->type;
1865       break;
1866    case ir_txd:
1867       ir->lod_info.grad.dPdx->accept(this);
1868       dPdx = this->result;
1869
1870       ir->lod_info.grad.dPdy->accept(this);
1871       dPdy = this->result;
1872
1873       lod_type = ir->lod_info.grad.dPdx->type;
1874       break;
1875    case ir_tex:
1876    case ir_txb:
1877       break;
1878    }
1879
1880    vec4_instruction *inst = NULL;
1881    switch (ir->op) {
1882    case ir_tex:
1883    case ir_txl:
1884       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1885       break;
1886    case ir_txd:
1887       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1888       break;
1889    case ir_txf:
1890       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1891       break;
1892    case ir_txs:
1893       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1894       break;
1895    case ir_txb:
1896       assert(!"TXB is not valid for vertex shaders.");
1897    }
1898
1899    /* Texel offsets go in the message header; Gen4 also requires headers. */
1900    inst->header_present = ir->offset || intel->gen < 5;
1901    inst->base_mrf = 2;
1902    inst->mlen = inst->header_present + 1; /* always at least one */
1903    inst->sampler = sampler;
1904    inst->dst = dst_reg(this, ir->type);
1905    inst->shadow_compare = ir->shadow_comparitor != NULL;
1906
1907    if (ir->offset != NULL && ir->op != ir_txf)
1908       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1909
1910    /* MRF for the first parameter */
1911    int param_base = inst->base_mrf + inst->header_present;
1912
1913    if (ir->op == ir_txs) {
1914       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1915       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
1916    } else {
1917       int i, coord_mask = 0, zero_mask = 0;
1918       /* Load the coordinate */
1919       /* FINISHME: gl_clamp_mask and saturate */
1920       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1921          coord_mask |= (1 << i);
1922       for (; i < 4; i++)
1923          zero_mask |= (1 << i);
1924
1925       if (ir->offset && ir->op == ir_txf) {
1926          /* It appears that the ld instruction used for txf does its
1927           * address bounds check before adding in the offset.  To work
1928           * around this, just add the integer offset to the integer
1929           * texel coordinate, and don't put the offset in the header.
1930           */
1931          ir_constant *offset = ir->offset->as_constant();
1932          assert(offset);
1933
1934          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1935             src_reg src = coordinate;
1936             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1937                                        BRW_GET_SWZ(src.swizzle, j),
1938                                        BRW_GET_SWZ(src.swizzle, j),
1939                                        BRW_GET_SWZ(src.swizzle, j));
1940             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1941                      src, offset->value.i[j]));
1942          }
1943       } else {
1944          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1945                   coordinate));
1946       }
1947       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1948                src_reg(0)));
1949       /* Load the shadow comparitor */
1950       if (ir->shadow_comparitor) {
1951          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1952                           WRITEMASK_X),
1953                   shadow_comparitor));
1954          inst->mlen++;
1955       }
1956
1957       /* Load the LOD info */
1958       if (ir->op == ir_txl) {
1959          int mrf, writemask;
1960          if (intel->gen >= 5) {
1961             mrf = param_base + 1;
1962             if (ir->shadow_comparitor) {
1963                writemask = WRITEMASK_Y;
1964                /* mlen already incremented */
1965             } else {
1966                writemask = WRITEMASK_X;
1967                inst->mlen++;
1968             }
1969          } else /* intel->gen == 4 */ {
1970             mrf = param_base;
1971             writemask = WRITEMASK_Z;
1972          }
1973          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
1974       } else if (ir->op == ir_txf) {
1975          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
1976                   lod));
1977       } else if (ir->op == ir_txd) {
1978          const glsl_type *type = lod_type;
1979
1980          if (intel->gen >= 5) {
1981             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1982             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1983             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1984             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1985             inst->mlen++;
1986
1987             if (ir->type->vector_elements == 3) {
1988                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1989                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1990                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1991                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1992                inst->mlen++;
1993             }
1994          } else /* intel->gen == 4 */ {
1995             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1996             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1997             inst->mlen += 2;
1998          }
1999       }
2000    }
2001
2002    emit(inst);
2003
2004    swizzle_result(ir, src_reg(inst->dst), sampler);
2005 }
2006
2007 void
2008 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2009 {
2010    this->result = orig_val;
2011
2012    int s = c->key.tex.swizzles[sampler];
2013
2014    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2015                         || s == SWIZZLE_NOOP)
2016       return;
2017
2018    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2019    int swizzle[4];
2020
2021    for (int i = 0; i < 4; i++) {
2022       switch (GET_SWZ(s, i)) {
2023       case SWIZZLE_ZERO:
2024          zero_mask |= (1 << i);
2025          break;
2026       case SWIZZLE_ONE:
2027          one_mask |= (1 << i);
2028          break;
2029       default:
2030          copy_mask |= (1 << i);
2031          swizzle[i] = GET_SWZ(s, i);
2032          break;
2033       }
2034    }
2035
2036    this->result = src_reg(this, ir->type);
2037    dst_reg swizzled_result(this->result);
2038
2039    if (copy_mask) {
2040       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2041       swizzled_result.writemask = copy_mask;
2042       emit(MOV(swizzled_result, orig_val));
2043    }
2044
2045    if (zero_mask) {
2046       swizzled_result.writemask = zero_mask;
2047       emit(MOV(swizzled_result, src_reg(0.0f)));
2048    }
2049
2050    if (one_mask) {
2051       swizzled_result.writemask = one_mask;
2052       emit(MOV(swizzled_result, src_reg(1.0f)));
2053    }
2054 }
2055
2056 void
2057 vec4_visitor::visit(ir_return *ir)
2058 {
2059    assert(!"not reached");
2060 }
2061
2062 void
2063 vec4_visitor::visit(ir_discard *ir)
2064 {
2065    assert(!"not reached");
2066 }
2067
2068 void
2069 vec4_visitor::visit(ir_if *ir)
2070 {
2071    /* Don't point the annotation at the if statement, because then it plus
2072     * the then and else blocks get printed.
2073     */
2074    this->base_ir = ir->condition;
2075
2076    if (intel->gen == 6) {
2077       emit_if_gen6(ir);
2078    } else {
2079       uint32_t predicate;
2080       emit_bool_to_cond_code(ir->condition, &predicate);
2081       emit(IF(predicate));
2082    }
2083
2084    visit_instructions(&ir->then_instructions);
2085
2086    if (!ir->else_instructions.is_empty()) {
2087       this->base_ir = ir->condition;
2088       emit(BRW_OPCODE_ELSE);
2089
2090       visit_instructions(&ir->else_instructions);
2091    }
2092
2093    this->base_ir = ir->condition;
2094    emit(BRW_OPCODE_ENDIF);
2095 }
2096
2097 void
2098 vec4_visitor::emit_ndc_computation()
2099 {
2100    /* Get the position */
2101    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2102
2103    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2104    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2105    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2106
2107    current_annotation = "NDC";
2108    dst_reg ndc_w = ndc;
2109    ndc_w.writemask = WRITEMASK_W;
2110    src_reg pos_w = pos;
2111    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2112    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2113
2114    dst_reg ndc_xyz = ndc;
2115    ndc_xyz.writemask = WRITEMASK_XYZ;
2116
2117    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2118 }
2119
2120 void
2121 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2122 {
2123    if (intel->gen < 6 &&
2124        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2125         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2126       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2127       dst_reg header1_w = header1;
2128       header1_w.writemask = WRITEMASK_W;
2129       GLuint i;
2130
2131       emit(MOV(header1, 0u));
2132
2133       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2134          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2135
2136          current_annotation = "Point size";
2137          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2138          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2139       }
2140
2141       current_annotation = "Clipping flags";
2142       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2143          vec4_instruction *inst;
2144
2145          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2146                          src_reg(this->userplane[i])));
2147          inst->conditional_mod = BRW_CONDITIONAL_L;
2148
2149          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2150          inst->predicate = BRW_PREDICATE_NORMAL;
2151       }
2152
2153       /* i965 clipping workaround:
2154        * 1) Test for -ve rhw
2155        * 2) If set,
2156        *      set ndc = (0,0,0,0)
2157        *      set ucp[6] = 1
2158        *
2159        * Later, clipping will detect ucp[6] and ensure the primitive is
2160        * clipped against all fixed planes.
2161        */
2162       if (brw->has_negative_rhw_bug) {
2163 #if 0
2164          /* FINISHME */
2165          brw_CMP(p,
2166                  vec8(brw_null_reg()),
2167                  BRW_CONDITIONAL_L,
2168                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2169                  brw_imm_f(0));
2170
2171          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2172          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2173          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2174 #endif
2175       }
2176
2177       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2178    } else if (intel->gen < 6) {
2179       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2180    } else {
2181       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2182       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2183          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2184                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2185       }
2186    }
2187 }
2188
2189 void
2190 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2191 {
2192    if (intel->gen < 6) {
2193       /* Clip distance slots are set aside in gen5, but they are not used.  It
2194        * is not clear whether we actually need to set aside space for them,
2195        * but the performance cost is negligible.
2196        */
2197       return;
2198    }
2199
2200    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2201     *
2202     *     "If a linked set of shaders forming the vertex stage contains no
2203     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2204     *     application has requested clipping against user clip planes through
2205     *     the API, then the coordinate written to gl_Position is used for
2206     *     comparison against the user clip planes."
2207     *
2208     * This function is only called if the shader didn't write to
2209     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2210     * if the user wrote to it; otherwise we use gl_Position.
2211     */
2212    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2213    if (!(c->prog_data.outputs_written
2214          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2215       clip_vertex = VERT_RESULT_HPOS;
2216    }
2217
2218    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2219         ++i) {
2220       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2221                src_reg(output_reg[clip_vertex]),
2222                src_reg(this->userplane[i + offset])));
2223    }
2224 }
2225
2226 void
2227 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2228 {
2229    assert (vert_result < VERT_RESULT_MAX);
2230    reg.type = output_reg[vert_result].type;
2231    current_annotation = output_reg_annotation[vert_result];
2232    /* Copy the register, saturating if necessary */
2233    vec4_instruction *inst = emit(MOV(reg,
2234                                      src_reg(output_reg[vert_result])));
2235    if ((vert_result == VERT_RESULT_COL0 ||
2236         vert_result == VERT_RESULT_COL1 ||
2237         vert_result == VERT_RESULT_BFC0 ||
2238         vert_result == VERT_RESULT_BFC1) &&
2239        c->key.clamp_vertex_color) {
2240       inst->saturate = true;
2241    }
2242 }
2243
2244 void
2245 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2246 {
2247    struct brw_reg hw_reg = brw_message_reg(mrf);
2248    dst_reg reg = dst_reg(MRF, mrf);
2249    reg.type = BRW_REGISTER_TYPE_F;
2250
2251    switch (vert_result) {
2252    case VERT_RESULT_PSIZ:
2253       /* PSIZ is always in slot 0, and is coupled with other flags. */
2254       current_annotation = "indices, point width, clip flags";
2255       emit_psiz_and_flags(hw_reg);
2256       break;
2257    case BRW_VERT_RESULT_NDC:
2258       current_annotation = "NDC";
2259       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2260       break;
2261    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2262    case VERT_RESULT_HPOS:
2263       current_annotation = "gl_Position";
2264       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2265       break;
2266    case VERT_RESULT_CLIP_DIST0:
2267    case VERT_RESULT_CLIP_DIST1:
2268       if (this->c->key.uses_clip_distance) {
2269          emit_generic_urb_slot(reg, vert_result);
2270       } else {
2271          current_annotation = "user clip distances";
2272          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2273       }
2274       break;
2275    case VERT_RESULT_EDGE:
2276       /* This is present when doing unfilled polygons.  We're supposed to copy
2277        * the edge flag from the user-provided vertex array
2278        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2279        * of that attribute (starts as 1.0f).  This is then used in clipping to
2280        * determine which edges should be drawn as wireframe.
2281        */
2282       current_annotation = "edge flag";
2283       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2284                                     glsl_type::float_type, WRITEMASK_XYZW))));
2285       break;
2286    case BRW_VERT_RESULT_PAD:
2287       /* No need to write to this slot */
2288       break;
2289    default:
2290       emit_generic_urb_slot(reg, vert_result);
2291       break;
2292    }
2293 }
2294
2295 static int
2296 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2297 {
2298    struct intel_context *intel = &brw->intel;
2299
2300    if (intel->gen >= 6) {
2301       /* URB data written (does not include the message header reg) must
2302        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2303        * section 5.4.3.2.2: URB_INTERLEAVED.
2304        *
2305        * URB entries are allocated on a multiple of 1024 bits, so an
2306        * extra 128 bits written here to make the end align to 256 is
2307        * no problem.
2308        */
2309       if ((mlen % 2) != 1)
2310          mlen++;
2311    }
2312
2313    return mlen;
2314 }
2315
2316 /**
2317  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2318  * complete the VS thread.
2319  *
2320  * The VUE layout is documented in Volume 2a.
2321  */
2322 void
2323 vec4_visitor::emit_urb_writes()
2324 {
2325    /* MRF 0 is reserved for the debugger, so start with message header
2326     * in MRF 1.
2327     */
2328    int base_mrf = 1;
2329    int mrf = base_mrf;
2330    /* In the process of generating our URB write message contents, we
2331     * may need to unspill a register or load from an array.  Those
2332     * reads would use MRFs 14-15.
2333     */
2334    int max_usable_mrf = 13;
2335
2336    /* The following assertion verifies that max_usable_mrf causes an
2337     * even-numbered amount of URB write data, which will meet gen6's
2338     * requirements for length alignment.
2339     */
2340    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2341
2342    /* First mrf is the g0-based message header containing URB handles and such,
2343     * which is implied in VS_OPCODE_URB_WRITE.
2344     */
2345    mrf++;
2346
2347    if (intel->gen < 6) {
2348       emit_ndc_computation();
2349    }
2350
2351    /* Set up the VUE data for the first URB write */
2352    int slot;
2353    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2354       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2355
2356       /* If this was max_usable_mrf, we can't fit anything more into this URB
2357        * WRITE.
2358        */
2359       if (mrf > max_usable_mrf) {
2360          slot++;
2361          break;
2362       }
2363    }
2364
2365    current_annotation = "URB write";
2366    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2367    inst->base_mrf = base_mrf;
2368    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2369    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2370
2371    /* Optional second URB write */
2372    if (!inst->eot) {
2373       mrf = base_mrf + 1;
2374
2375       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2376          assert(mrf < max_usable_mrf);
2377
2378          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2379       }
2380
2381       current_annotation = "URB write";
2382       inst = emit(VS_OPCODE_URB_WRITE);
2383       inst->base_mrf = base_mrf;
2384       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2385       inst->eot = true;
2386       /* URB destination offset.  In the previous write, we got MRFs
2387        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2388        * URB row increments, and each of our MRFs is half of one of
2389        * those, since we're doing interleaved writes.
2390        */
2391       inst->offset = (max_usable_mrf - base_mrf) / 2;
2392    }
2393 }
2394
2395 src_reg
2396 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2397                                  src_reg *reladdr, int reg_offset)
2398 {
2399    /* Because we store the values to scratch interleaved like our
2400     * vertex data, we need to scale the vec4 index by 2.
2401     */
2402    int message_header_scale = 2;
2403
2404    /* Pre-gen6, the message header uses byte offsets instead of vec4
2405     * (16-byte) offset units.
2406     */
2407    if (intel->gen < 6)
2408       message_header_scale *= 16;
2409
2410    if (reladdr) {
2411       src_reg index = src_reg(this, glsl_type::int_type);
2412
2413       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2414       emit_before(inst, MUL(dst_reg(index),
2415                             index, src_reg(message_header_scale)));
2416
2417       return index;
2418    } else {
2419       return src_reg(reg_offset * message_header_scale);
2420    }
2421 }
2422
2423 src_reg
2424 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2425                                        src_reg *reladdr, int reg_offset)
2426 {
2427    if (reladdr) {
2428       src_reg index = src_reg(this, glsl_type::int_type);
2429
2430       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2431
2432       /* Pre-gen6, the message header uses byte offsets instead of vec4
2433        * (16-byte) offset units.
2434        */
2435       if (intel->gen < 6) {
2436          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2437       }
2438
2439       return index;
2440    } else {
2441       int message_header_scale = intel->gen < 6 ? 16 : 1;
2442       return src_reg(reg_offset * message_header_scale);
2443    }
2444 }
2445
2446 /**
2447  * Emits an instruction before @inst to load the value named by @orig_src
2448  * from scratch space at @base_offset to @temp.
2449  *
2450  * @base_offset is measured in 32-byte units (the size of a register).
2451  */
2452 void
2453 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2454                                 dst_reg temp, src_reg orig_src,
2455                                 int base_offset)
2456 {
2457    int reg_offset = base_offset + orig_src.reg_offset;
2458    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2459
2460    emit_before(inst, SCRATCH_READ(temp, index));
2461 }
2462
2463 /**
2464  * Emits an instruction after @inst to store the value to be written
2465  * to @orig_dst to scratch space at @base_offset, from @temp.
2466  *
2467  * @base_offset is measured in 32-byte units (the size of a register).
2468  */
2469 void
2470 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2471                                  src_reg temp, dst_reg orig_dst,
2472                                  int base_offset)
2473 {
2474    int reg_offset = base_offset + orig_dst.reg_offset;
2475    src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2476
2477    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2478                                        orig_dst.writemask));
2479    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2480    write->predicate = inst->predicate;
2481    write->ir = inst->ir;
2482    write->annotation = inst->annotation;
2483    inst->insert_after(write);
2484 }
2485
2486 /**
2487  * We can't generally support array access in GRF space, because a
2488  * single instruction's destination can only span 2 contiguous
2489  * registers.  So, we send all GRF arrays that get variable index
2490  * access to scratch space.
2491  */
2492 void
2493 vec4_visitor::move_grf_array_access_to_scratch()
2494 {
2495    int scratch_loc[this->virtual_grf_count];
2496
2497    for (int i = 0; i < this->virtual_grf_count; i++) {
2498       scratch_loc[i] = -1;
2499    }
2500
2501    /* First, calculate the set of virtual GRFs that need to be punted
2502     * to scratch due to having any array access on them, and where in
2503     * scratch.
2504     */
2505    foreach_list(node, &this->instructions) {
2506       vec4_instruction *inst = (vec4_instruction *)node;
2507
2508       if (inst->dst.file == GRF && inst->dst.reladdr &&
2509           scratch_loc[inst->dst.reg] == -1) {
2510          scratch_loc[inst->dst.reg] = c->last_scratch;
2511          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2512       }
2513
2514       for (int i = 0 ; i < 3; i++) {
2515          src_reg *src = &inst->src[i];
2516
2517          if (src->file == GRF && src->reladdr &&
2518              scratch_loc[src->reg] == -1) {
2519             scratch_loc[src->reg] = c->last_scratch;
2520             c->last_scratch += this->virtual_grf_sizes[src->reg];
2521          }
2522       }
2523    }
2524
2525    /* Now, for anything that will be accessed through scratch, rewrite
2526     * it to load/store.  Note that this is a _safe list walk, because
2527     * we may generate a new scratch_write instruction after the one
2528     * we're processing.
2529     */
2530    foreach_list_safe(node, &this->instructions) {
2531       vec4_instruction *inst = (vec4_instruction *)node;
2532
2533       /* Set up the annotation tracking for new generated instructions. */
2534       base_ir = inst->ir;
2535       current_annotation = inst->annotation;
2536
2537       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2538          src_reg temp = src_reg(this, glsl_type::vec4_type);
2539
2540          emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2541
2542          inst->dst.file = temp.file;
2543          inst->dst.reg = temp.reg;
2544          inst->dst.reg_offset = temp.reg_offset;
2545          inst->dst.reladdr = NULL;
2546       }
2547
2548       for (int i = 0 ; i < 3; i++) {
2549          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2550             continue;
2551
2552          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2553
2554          emit_scratch_read(inst, temp, inst->src[i],
2555                            scratch_loc[inst->src[i].reg]);
2556
2557          inst->src[i].file = temp.file;
2558          inst->src[i].reg = temp.reg;
2559          inst->src[i].reg_offset = temp.reg_offset;
2560          inst->src[i].reladdr = NULL;
2561       }
2562    }
2563 }
2564
2565 /**
2566  * Emits an instruction before @inst to load the value named by @orig_src
2567  * from the pull constant buffer (surface) at @base_offset to @temp.
2568  */
2569 void
2570 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2571                                       dst_reg temp, src_reg orig_src,
2572                                       int base_offset)
2573 {
2574    int reg_offset = base_offset + orig_src.reg_offset;
2575    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2576    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2577    vec4_instruction *load;
2578
2579    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2580                                         temp, index, offset);
2581    load->base_mrf = 14;
2582    load->mlen = 1;
2583    emit_before(inst, load);
2584 }
2585
2586 /**
2587  * Implements array access of uniforms by inserting a
2588  * PULL_CONSTANT_LOAD instruction.
2589  *
2590  * Unlike temporary GRF array access (where we don't support it due to
2591  * the difficulty of doing relative addressing on instruction
2592  * destinations), we could potentially do array access of uniforms
2593  * that were loaded in GRF space as push constants.  In real-world
2594  * usage we've seen, though, the arrays being used are always larger
2595  * than we could load as push constants, so just always move all
2596  * uniform array access out to a pull constant buffer.
2597  */
2598 void
2599 vec4_visitor::move_uniform_array_access_to_pull_constants()
2600 {
2601    int pull_constant_loc[this->uniforms];
2602
2603    for (int i = 0; i < this->uniforms; i++) {
2604       pull_constant_loc[i] = -1;
2605    }
2606
2607    /* Walk through and find array access of uniforms.  Put a copy of that
2608     * uniform in the pull constant buffer.
2609     *
2610     * Note that we don't move constant-indexed accesses to arrays.  No
2611     * testing has been done of the performance impact of this choice.
2612     */
2613    foreach_list_safe(node, &this->instructions) {
2614       vec4_instruction *inst = (vec4_instruction *)node;
2615
2616       for (int i = 0 ; i < 3; i++) {
2617          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2618             continue;
2619
2620          int uniform = inst->src[i].reg;
2621
2622          /* If this array isn't already present in the pull constant buffer,
2623           * add it.
2624           */
2625          if (pull_constant_loc[uniform] == -1) {
2626             const float **values = &prog_data->param[uniform * 4];
2627
2628             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2629
2630             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2631                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2632             }
2633          }
2634
2635          /* Set up the annotation tracking for new generated instructions. */
2636          base_ir = inst->ir;
2637          current_annotation = inst->annotation;
2638
2639          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2640
2641          emit_pull_constant_load(inst, temp, inst->src[i],
2642                                  pull_constant_loc[uniform]);
2643
2644          inst->src[i].file = temp.file;
2645          inst->src[i].reg = temp.reg;
2646          inst->src[i].reg_offset = temp.reg_offset;
2647          inst->src[i].reladdr = NULL;
2648       }
2649    }
2650
2651    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2652     * no need to track them as larger-than-vec4 objects.  This will be
2653     * relied on in cutting out unused uniform vectors from push
2654     * constants.
2655     */
2656    split_uniform_registers();
2657 }
2658
2659 void
2660 vec4_visitor::resolve_ud_negate(src_reg *reg)
2661 {
2662    if (reg->type != BRW_REGISTER_TYPE_UD ||
2663        !reg->negate)
2664       return;
2665
2666    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2667    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2668    *reg = temp;
2669 }
2670
2671 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2672                            struct gl_shader_program *prog,
2673                            struct brw_shader *shader)
2674 {
2675    this->c = c;
2676    this->p = &c->func;
2677    this->brw = p->brw;
2678    this->intel = &brw->intel;
2679    this->ctx = &intel->ctx;
2680    this->prog = prog;
2681    this->shader = shader;
2682
2683    this->mem_ctx = ralloc_context(NULL);
2684    this->failed = false;
2685
2686    this->base_ir = NULL;
2687    this->current_annotation = NULL;
2688
2689    this->c = c;
2690    this->vp = (struct gl_vertex_program *)
2691      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2692    this->prog_data = &c->prog_data;
2693
2694    this->variable_ht = hash_table_ctor(0,
2695                                        hash_table_pointer_hash,
2696                                        hash_table_pointer_compare);
2697
2698    this->virtual_grf_def = NULL;
2699    this->virtual_grf_use = NULL;
2700    this->virtual_grf_sizes = NULL;
2701    this->virtual_grf_count = 0;
2702    this->virtual_grf_reg_map = NULL;
2703    this->virtual_grf_reg_count = 0;
2704    this->virtual_grf_array_size = 0;
2705    this->live_intervals_valid = false;
2706
2707    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2708
2709    this->uniforms = 0;
2710 }
2711
2712 vec4_visitor::~vec4_visitor()
2713 {
2714    ralloc_free(this->mem_ctx);
2715    hash_table_dtor(this->variable_ht);
2716 }
2717
2718
2719 void
2720 vec4_visitor::fail(const char *format, ...)
2721 {
2722    va_list va;
2723    char *msg;
2724
2725    if (failed)
2726       return;
2727
2728    failed = true;
2729
2730    va_start(va, format);
2731    msg = ralloc_vasprintf(mem_ctx, format, va);
2732    va_end(va);
2733    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2734
2735    this->fail_msg = msg;
2736
2737    if (INTEL_DEBUG & DEBUG_VS) {
2738       fprintf(stderr, "%s",  msg);
2739    }
2740 }
2741
2742 } /* namespace brw */