1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
24 /** @file brw_fs_visitor.cpp
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
32 #include <sys/types.h>
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/uniforms.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "program/prog_optimize.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
47 #include "brw_shader.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_optimization.h"
51 #include "glsl/ir_print_visitor.h"
54 fs_visitor::visit(ir_variable *ir)
58 if (variable_storage(ir))
61 if (ir->mode == ir_var_in) {
62 if (!strcmp(ir->name, "gl_FragCoord")) {
63 reg = emit_fragcoord_interpolation(ir);
64 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
65 reg = emit_frontfacing_interpolation(ir);
67 reg = emit_general_interpolation(ir);
70 hash_table_insert(this->variable_ht, reg, ir);
72 } else if (ir->mode == ir_var_out) {
73 reg = new(this->mem_ctx) fs_reg(this, ir->type);
76 assert(ir->location == FRAG_RESULT_DATA0);
77 assert(ir->index == 1);
78 this->dual_src_output = *reg;
79 } else if (ir->location == FRAG_RESULT_COLOR) {
80 /* Writing gl_FragColor outputs to all color regions. */
81 for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
82 this->outputs[i] = *reg;
83 this->output_components[i] = 4;
85 } else if (ir->location == FRAG_RESULT_DEPTH) {
86 this->frag_depth = ir;
88 /* gl_FragData or a user-defined FS output */
89 assert(ir->location >= FRAG_RESULT_DATA0 &&
90 ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
93 ir->type->is_array() ? ir->type->fields.array->vector_elements
94 : ir->type->vector_elements;
96 /* General color output. */
97 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
98 int output = ir->location - FRAG_RESULT_DATA0 + i;
99 this->outputs[output] = *reg;
100 this->outputs[output].reg_offset += vector_elements * i;
101 this->output_components[output] = vector_elements;
104 } else if (ir->mode == ir_var_uniform) {
105 int param_index = c->prog_data.nr_params;
107 /* Thanks to the lower_ubo_reference pass, we will see only
108 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
109 * variables, so no need for them to be in variable_ht.
111 if (ir->uniform_block != -1)
114 if (c->dispatch_width == 16) {
115 if (!variable_storage(ir)) {
116 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
121 if (!strncmp(ir->name, "gl_", 3)) {
122 setup_builtin_uniform_values(ir);
124 setup_uniform_values(ir->location, ir->type);
127 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
128 reg->type = brw_type_for_base_type(ir->type);
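/* Note (summary of the bookkeeping above): param_index was captured before the
 * setup_*_uniform_values() calls, so it marks where this uniform's components
 * begin in c->prog_data.params; later dereferences of the variable then become
 * UNIFORM-file reads relative to that base register.
 */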
132 reg = new(this->mem_ctx) fs_reg(this, ir->type);
134 hash_table_insert(this->variable_ht, reg, ir);
138 fs_visitor::visit(ir_dereference_variable *ir)
140 fs_reg *reg = variable_storage(ir->var);
145 fs_visitor::visit(ir_dereference_record *ir)
147 const glsl_type *struct_type = ir->record->type;
149 ir->record->accept(this);
151 unsigned int offset = 0;
152 for (unsigned int i = 0; i < struct_type->length; i++) {
153 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
155 offset += type_size(struct_type->fields.structure[i].type);
157 this->result.reg_offset += offset;
158 this->result.type = brw_type_for_base_type(ir->type);
162 fs_visitor::visit(ir_dereference_array *ir)
167 ir->array->accept(this);
168 index = ir->array_index->as_constant();
170 element_size = type_size(ir->type);
171 this->result.type = brw_type_for_base_type(ir->type);
174 assert(this->result.file == UNIFORM || this->result.file == GRF);
175 this->result.reg_offset += index->value.i[0] * element_size;
177 assert(!"FINISHME: non-constant array element");
181 /* Instruction selection: Produce a MOV.sat instead of
182 * MIN(MAX(val, 0), 1) when possible.
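/* As a rough illustration (not from the original comments): GLSL such as
 *
 *    vec4 c = clamp(texel, 0.0, 1.0);
 *
 * usually reaches this backend as min(max(texel, 0.0), 1.0), and when the
 * instruction that produced the inner value is still the last one emitted we
 * can just set its saturate flag, or emit a single saturated MOV, instead of
 * a separate MIN and MAX.
 */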
185 fs_visitor::try_emit_saturate(ir_expression *ir)
187 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
192 fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
194 sat_val->accept(this);
195 fs_reg src = this->result;
197 fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
199 /* If the last instruction from our accept() didn't generate our
200 * src, generate a saturated MOV
202 fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
203 if (!modify || modify->regs_written() != 1) {
204 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
205 inst->saturate = true;
207 modify->saturate = true;
216 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
218 /* 3-src instructions were introduced in gen6. */
222 /* MAD can only handle floating-point data. */
223 if (ir->type != glsl_type::float_type)
226 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
227 ir_expression *mul = ir->operands[mul_arg]->as_expression();
229 if (!mul || mul->operation != ir_binop_mul)
232 if (nonmul->as_constant() ||
233 mul->operands[0]->as_constant() ||
234 mul->operands[1]->as_constant())
237 nonmul->accept(this);
238 fs_reg src0 = this->result;
240 mul->operands[0]->accept(this);
241 fs_reg src1 = this->result;
243 mul->operands[1]->accept(this);
244 fs_reg src2 = this->result;
246 this->result = fs_reg(this, ir->type);
247 emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
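/* A rough example of the selection above: for float code like d = a + b * c,
 * the add-of-a-multiply collapses into a single
 *
 *    mad  d, a, b, c
 *
 * rather than separate MUL and ADD instructions.  The constant-operand
 * bail-out earlier is, I assume, because the 3-source encoding cannot take
 * immediate operands; the original comments do not state the reason.
 */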
253 fs_visitor::visit(ir_expression *ir)
255 unsigned int operand;
259 assert(ir->get_num_operands() <= 2);
261 if (try_emit_saturate(ir))
263 if (ir->operation == ir_binop_add) {
264 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
268 for (operand = 0; operand < ir->get_num_operands(); operand++) {
269 ir->operands[operand]->accept(this);
270 if (this->result.file == BAD_FILE) {
272 fail("Failed to get tree for expression operand:\n");
273 ir->operands[operand]->accept(&v);
275 op[operand] = this->result;
277 /* Matrix expression operands should have been broken down to vector
278 * operations already.
280 assert(!ir->operands[operand]->type->is_matrix());
281 /* And then those vector operands should have been broken down to scalar.
283 assert(!ir->operands[operand]->type->is_vector());
286 /* Storage for our result. If our result goes into an assignment, it will
287 * just get copy-propagated out, so no worries.
289 this->result = fs_reg(this, ir->type);
291 switch (ir->operation) {
292 case ir_unop_logic_not:
293 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
294 * ones complement of the whole register, not just bit 0.
296 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
299 op[0].negate = !op[0].negate;
300 this->result = op[0];
304 op[0].negate = false;
305 this->result = op[0];
308 temp = fs_reg(this, ir->type);
310 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
312 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
313 inst->conditional_mod = BRW_CONDITIONAL_G;
314 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
315 inst->predicated = true;
317 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
318 inst->conditional_mod = BRW_CONDITIONAL_L;
319 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
320 inst->predicated = true;
324 emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
328 emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
331 emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
335 assert(!"not reached: should be handled by ir_explog_to_explog2");
338 case ir_unop_sin_reduced:
339 emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
342 case ir_unop_cos_reduced:
343 emit_math(SHADER_OPCODE_COS, this->result, op[0]);
347 emit(FS_OPCODE_DDX, this->result, op[0]);
350 emit(FS_OPCODE_DDY, this->result, op[0]);
354 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
357 assert(!"not reached: should be handled by ir_sub_to_add_neg");
361 if (ir->type->is_integer()) {
362 /* For integer multiplication, the MUL uses the low 16 bits
363 * of one of the operands (src0 on gen6, src1 on gen7). The
364 * MACH then accumulates the contribution of the upper 16 bits of that operand.
367 * FINISHME: Emit just the MUL if we know an operand is small enough.
370 if (intel->gen >= 7 && c->dispatch_width == 16)
371 fail("16-wide explicit accumulator operands unsupported\n");
373 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
375 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
376 emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
377 emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
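/* So on the integer path a 32-bit multiply is emitted roughly as the three
 * instructions above:
 *
 *    mul   acc0, op0, op1    // partial product from one operand's low 16 bits
 *    mach  null, op0, op1    // fold in the contribution of the high 16 bits
 *    mov   dst,  acc0        // copy the completed low 32 bits out
 *
 * (Sketch of the sequence; the accumulator naming here is illustrative.)
 */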
379 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
383 if (intel->gen >= 7 && c->dispatch_width == 16)
384 fail("16-wide INTDIV unsupported\n");
386 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
387 assert(ir->type->is_integer());
388 emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
391 if (intel->gen >= 7 && c->dispatch_width == 16)
392 fail("16-wide INTDIV unsupported\n");
394 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
395 assert(ir->type->is_integer());
396 emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
400 case ir_binop_greater:
401 case ir_binop_lequal:
402 case ir_binop_gequal:
404 case ir_binop_all_equal:
405 case ir_binop_nequal:
406 case ir_binop_any_nequal:
408 /* original gen4 does implicit conversion before comparison. */
410 temp.type = op[0].type;
412 resolve_ud_negate(&op[0]);
413 resolve_ud_negate(&op[1]);
415 resolve_bool_comparison(ir->operands[0], &op[0]);
416 resolve_bool_comparison(ir->operands[1], &op[1]);
418 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
419 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
422 case ir_binop_logic_xor:
423 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
426 case ir_binop_logic_or:
427 emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
430 case ir_binop_logic_and:
431 emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
436 assert(!"not reached: should be handled by brw_fs_channel_expressions");
440 assert(!"not reached: should be handled by lower_noise");
443 case ir_quadop_vector:
444 assert(!"not reached: should be handled by lower_quadop_vector");
448 emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
452 emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
455 case ir_unop_bitcast_i2f:
456 case ir_unop_bitcast_u2f:
457 op[0].type = BRW_REGISTER_TYPE_F;
458 this->result = op[0];
461 case ir_unop_bitcast_f2u:
462 op[0].type = BRW_REGISTER_TYPE_UD;
463 this->result = op[0];
466 case ir_unop_bitcast_f2i:
467 op[0].type = BRW_REGISTER_TYPE_D;
468 this->result = op[0];
474 emit(BRW_OPCODE_MOV, this->result, op[0]);
478 inst = emit(BRW_OPCODE_AND, this->result, op[0], fs_reg(1));
481 temp = fs_reg(this, glsl_type::int_type);
482 emit(BRW_OPCODE_AND, temp, op[0], fs_reg(1));
483 emit(BRW_OPCODE_MOV, this->result, temp);
487 inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f));
488 inst->conditional_mod = BRW_CONDITIONAL_NZ;
489 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
492 assert(op[0].type == BRW_REGISTER_TYPE_D);
494 inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0));
495 inst->conditional_mod = BRW_CONDITIONAL_NZ;
496 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
500 emit(BRW_OPCODE_RNDZ, this->result, op[0]);
503 op[0].negate = !op[0].negate;
504 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
505 this->result.negate = true;
508 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
511 inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
513 case ir_unop_round_even:
514 emit(BRW_OPCODE_RNDE, this->result, op[0]);
518 resolve_ud_negate(&op[0]);
519 resolve_ud_negate(&op[1]);
521 if (intel->gen >= 6) {
522 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
523 inst->conditional_mod = BRW_CONDITIONAL_L;
525 /* Unalias the destination */
526 this->result = fs_reg(this, ir->type);
528 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
529 inst->conditional_mod = BRW_CONDITIONAL_L;
531 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
532 inst->predicated = true;
536 resolve_ud_negate(&op[0]);
537 resolve_ud_negate(&op[1]);
539 if (intel->gen >= 6) {
540 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
541 inst->conditional_mod = BRW_CONDITIONAL_GE;
543 /* Unalias the destination */
544 this->result = fs_reg(this, ir->type);
546 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
547 inst->conditional_mod = BRW_CONDITIONAL_G;
549 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
550 inst->predicated = true;
555 emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
558 case ir_unop_bit_not:
559 inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
561 case ir_binop_bit_and:
562 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
564 case ir_binop_bit_xor:
565 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
567 case ir_binop_bit_or:
568 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
571 case ir_binop_lshift:
572 inst = emit(BRW_OPCODE_SHL, this->result, op[0], op[1]);
575 case ir_binop_rshift:
576 if (ir->type->base_type == GLSL_TYPE_INT)
577 inst = emit(BRW_OPCODE_ASR, this->result, op[0], op[1]);
579 inst = emit(BRW_OPCODE_SHR, this->result, op[0], op[1]);
582 case ir_binop_ubo_load:
583 ir_constant *uniform_block = ir->operands[0]->as_constant();
584 ir_constant *offset = ir->operands[1]->as_constant();
586 fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
587 packed_consts.type = result.type;
588 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
589 fs_inst *pull = emit(fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
592 fs_reg(offset->value.u[0])));
596 packed_consts.smear = offset->value.u[0] % 16 / 4;
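/* Roughly what the math above does (my reading of it): the offset is a byte
 * offset into the uniform block, and .smear selects a single dword of the
 * pulled constant register to broadcast to every channel.  (offset % 16) / 4
 * is the starting dword within the 16-byte std140 slot, and because std140
 * never lets a vector straddle a 16-byte boundary, stepping .smear once per
 * component below stays inside the fetched register (see the assert further
 * down).
 */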
597 for (int i = 0; i < ir->type->vector_elements; i++) {
598 /* UBO bools are any nonzero value. We consider bools to be
599 * values with the low bit set to 1. Convert them using CMP.
601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
602 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, result,
603 packed_consts, fs_reg(0u)));
604 inst->conditional_mod = BRW_CONDITIONAL_NZ;
606 emit(fs_inst(BRW_OPCODE_MOV, result, packed_consts));
609 packed_consts.smear++;
612 /* The std140 packing rules don't allow vectors to cross 16-byte
613 * boundaries, and a reg is 32 bytes.
615 assert(packed_consts.smear < 8);
617 result.reg_offset = 0;
623 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
624 const glsl_type *type, bool predicated)
626 switch (type->base_type) {
627 case GLSL_TYPE_FLOAT:
631 for (unsigned int i = 0; i < type->components(); i++) {
632 l.type = brw_type_for_base_type(type);
633 r.type = brw_type_for_base_type(type);
635 if (predicated || !l.equals(r)) {
636 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
637 inst->predicated = predicated;
644 case GLSL_TYPE_ARRAY:
645 for (unsigned int i = 0; i < type->length; i++) {
646 emit_assignment_writes(l, r, type->fields.array, predicated);
650 case GLSL_TYPE_STRUCT:
651 for (unsigned int i = 0; i < type->length; i++) {
652 emit_assignment_writes(l, r, type->fields.structure[i].type,
657 case GLSL_TYPE_SAMPLER:
661 assert(!"not reached");
666 /* If the RHS processing resulted in an instruction generating a
667 * temporary value, and it would be easy to rewrite the instruction to
668 * generate its result right into the LHS instead, do so. This ends
669 * up reliably removing instructions where it can be tricky to do so
670 * later without real UD chain information.
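/* For example, for a direct assignment like v = a + b, the ADD emitted while
 * visiting the RHS writes a fresh temporary; instead of following it with a
 * MOV into v's storage, the ADD's destination is rewritten to be v directly.
 * (Illustrative case only; the checks below spell out when this is actually
 * safe.)
 */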
673 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
676 fs_inst *pre_rhs_inst,
677 fs_inst *last_rhs_inst)
679 /* Only attempt if we're doing a direct assignment. */
681 !(ir->lhs->type->is_scalar() ||
682 (ir->lhs->type->is_vector() &&
683 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
686 /* Make sure the last instruction generated our source reg. */
687 fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
693 /* If last_rhs_inst wrote a different number of components than our LHS,
694 * we can't safely rewrite it.
696 if (ir->lhs->type->vector_elements != modify->regs_written())
699 /* Success! Rewrite the instruction. */
706 fs_visitor::visit(ir_assignment *ir)
711 /* FINISHME: arrays on the lhs */
712 ir->lhs->accept(this);
715 fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
717 ir->rhs->accept(this);
720 fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
722 assert(l.file != BAD_FILE);
723 assert(r.file != BAD_FILE);
725 if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
729 emit_bool_to_cond_code(ir->condition);
732 if (ir->lhs->type->is_scalar() ||
733 ir->lhs->type->is_vector()) {
734 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
735 if (ir->write_mask & (1 << i)) {
736 inst = emit(BRW_OPCODE_MOV, l, r);
738 inst->predicated = true;
744 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
749 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
750 fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
760 if (ir->shadow_comparitor) {
761 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
762 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
763 coordinate.reg_offset++;
765 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
768 if (ir->op == ir_tex) {
769 /* There's no plain shadow compare message, so we use shadow
770 * compare with a bias of 0.0.
772 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
774 } else if (ir->op == ir_txb || ir->op == ir_txl) {
775 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
778 assert(!"Should not get here.");
781 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
783 } else if (ir->op == ir_tex) {
784 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
785 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
786 coordinate.reg_offset++;
788 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
790 } else if (ir->op == ir_txd) {
793 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
794 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
795 coordinate.reg_offset++;
797 /* the slots for u and v are always present, but r is optional */
798 mlen += MAX2(ir->coordinate->type->vector_elements, 2);
801 * dPdx = dudx, dvdx, drdx
802 * dPdy = dudy, dvdy, drdy
804 * 1-arg: Does not exist.
806 * 2-arg: dudx dvdx dudy dvdy
807 * dPdx.x dPdx.y dPdy.x dPdy.y
810 * 3-arg: dudx dvdx drdx dudy dvdy drdy
811 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
814 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
815 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
818 mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
820 for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
821 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
824 mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
825 } else if (ir->op == ir_txs) {
826 /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
828 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
831 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
832 * instructions. We'll need to do SIMD16 here.
835 assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
837 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
838 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
840 coordinate.reg_offset++;
843 /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
844 * be necessary for TXF (ld), but seems wise to do for all messages.
846 for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
847 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
850 /* lod/bias appears after u/v/r. */
853 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, lod.type), lod);
856 /* The unused upper half. */
861 /* Now, since we're doing simd16, the return is 2 interleaved
862 * vec4s where the odd-indexed ones are junk. We'll need to move
863 * this weirdness around to the expected layout.
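/* In other words, only every other vec4-sized chunk of the SIMD16 return
 * carries this thread's live data; the copy loop at the end of this function
 * repacks those halves into orig_dst.  (Summary of the surrounding code, not
 * an exact register map.)
 */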
866 const glsl_type *vec_type =
867 glsl_type::get_instance(ir->type->base_type, 4, 1);
868 dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
869 dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
870 : BRW_REGISTER_TYPE_F;
873 fs_inst *inst = NULL;
876 inst = emit(SHADER_OPCODE_TEX, dst);
879 inst = emit(FS_OPCODE_TXB, dst);
882 inst = emit(SHADER_OPCODE_TXL, dst);
885 inst = emit(SHADER_OPCODE_TXD, dst);
888 inst = emit(SHADER_OPCODE_TXS, dst);
891 inst = emit(SHADER_OPCODE_TXF, dst);
894 inst->base_mrf = base_mrf;
896 inst->header_present = true;
899 for (int i = 0; i < 4; i++) {
900 emit(BRW_OPCODE_MOV, orig_dst, dst);
901 orig_dst.reg_offset++;
909 /* gen5's sampler has slots for u, v, r, array index, then optional
910 * parameters like shadow comparator or LOD bias. If optional
911 * parameters aren't present, those base slots are optional and don't
912 * need to be included in the message.
914 * We don't fill in the unnecessary slots regardless, which may look
915 * surprising in the disassembly.
918 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
919 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
923 int reg_width = c->dispatch_width / 8;
924 bool header_present = false;
925 const int vector_elements =
926 ir->coordinate ? ir->coordinate->type->vector_elements : 0;
928 if (ir->offset != NULL && ir->op == ir_txf) {
929 /* It appears that the ld instruction used for txf does its
930 * address bounds check before adding in the offset. To work
931 * around this, just add the integer offset to the integer texel
932 * coordinate, and don't put the offset in the header.
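/* For example, a texelFetch() with a constant offset of (1, 2) ends up as
 * integer adds of +1 and +2 into the u and v coordinates in the loop below,
 * rather than using the offset field of the message header.  (Illustrative;
 * it simply restates what the workaround described above amounts to.)
 */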
934 ir_constant *offset = ir->offset->as_constant();
935 for (int i = 0; i < vector_elements; i++) {
937 fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
940 coordinate.reg_offset++;
944 /* The offsets set up by the ir_texture visitor are in the
945 * m1 header, so we can't go headerless.
947 header_present = true;
952 for (int i = 0; i < vector_elements; i++) {
954 fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
956 coordinate.reg_offset++;
959 mlen += vector_elements * reg_width;
961 if (ir->shadow_comparitor) {
962 mlen = MAX2(mlen, header_present + 4 * reg_width);
964 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
968 fs_inst *inst = NULL;
971 inst = emit(SHADER_OPCODE_TEX, dst);
974 mlen = MAX2(mlen, header_present + 4 * reg_width);
975 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
978 inst = emit(FS_OPCODE_TXB, dst);
981 mlen = MAX2(mlen, header_present + 4 * reg_width);
982 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
985 inst = emit(SHADER_OPCODE_TXL, dst);
988 mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
992 * dPdx = dudx, dvdx, drdx
993 * dPdy = dudy, dvdy, drdy
995 * Load up these values:
996 * - dudx dudy dvdx dvdy drdx drdy
997 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
999 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
1000 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1004 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
1009 inst = emit(SHADER_OPCODE_TXD, dst);
1013 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1015 inst = emit(SHADER_OPCODE_TXS, dst);
1018 mlen = header_present + 4 * reg_width;
1020 emit(BRW_OPCODE_MOV,
1021 fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
1023 inst = emit(SHADER_OPCODE_TXF, dst);
1026 inst->base_mrf = base_mrf;
1028 inst->header_present = header_present;
1031 fail("Message length >11 disallowed by hardware\n");
1038 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1039 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
1043 int reg_width = c->dispatch_width / 8;
1044 bool header_present = false;
1047 if (ir->offset && ir->op != ir_txf) {
1048 /* The offsets set up by the ir_texture visitor are in the
1049 * m1 header, so we can't go headerless.
1051 header_present = true;
1056 if (ir->shadow_comparitor) {
1057 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
1061 /* Set up the LOD info */
1066 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1070 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1074 if (c->dispatch_width == 16)
1075 fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1077 /* Load dPdx and the coordinate together:
1078 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1080 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1081 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1082 coordinate.reg_offset++;
1085 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1089 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
1096 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1100 /* It appears that the ld instruction used for txf does its
1101 * address bounds check before adding in the offset. To work
1102 * around this, just add the integer offset to the integer texel
1103 * coordinate, and don't put the offset in the header.
1106 ir_constant *offset = ir->offset->as_constant();
1107 offsets[0] = offset->value.i[0];
1108 offsets[1] = offset->value.i[1];
1109 offsets[2] = offset->value.i[2];
1111 memset(offsets, 0, sizeof(offsets));
1114 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1115 emit(BRW_OPCODE_ADD,
1116 fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[0]);
1117 coordinate.reg_offset++;
1120 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod);
1123 for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1124 emit(BRW_OPCODE_ADD,
1125 fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[i]);
1126 coordinate.reg_offset++;
1132 /* Set up the coordinate (except for cases where it was done above) */
1133 if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
1134 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1135 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1136 coordinate.reg_offset++;
1141 /* Generate the SEND */
1142 fs_inst *inst = NULL;
1144 case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
1145 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1146 case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
1147 case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
1148 case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
1149 case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
1151 inst->base_mrf = base_mrf;
1153 inst->header_present = header_present;
1156 fail("Message length >11 disallowed by hardware\n");
1163 * Emit code to produce the coordinates for a texture lookup.
1165 * Returns the fs_reg containing the texture coordinate (as opposed to
1166 * setting this->result).
1169 fs_visitor::emit_texcoord(ir_texture *ir, int sampler, int texunit)
1171 fs_inst *inst = NULL;
1173 if (!ir->coordinate)
1174 return fs_reg(); /* Return the default BAD_FILE register. */
1176 ir->coordinate->accept(this);
1177 fs_reg coordinate = this->result;
1179 bool needs_gl_clamp = true;
1181 fs_reg scale_x, scale_y;
1183 /* The 965 requires the EU to do the normalization of GL rectangle
1184 * texture coordinates. We use the program parameter state
1185 * tracking to get the scaling factor.
1187 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT &&
1189 (intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
1190 c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
1191 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1192 int tokens[STATE_LENGTH] = {
1194 STATE_TEXRECT_SCALE,
1200 if (c->dispatch_width == 16) {
1201 fail("rectangle scale uniform setup not supported on 16-wide\n");
1202 return fs_reg(this, ir->type);
1205 scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1206 scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1208 GLuint index = _mesa_add_state_reference(params,
1209 (gl_state_index *)tokens);
1211 this->param_index[c->prog_data.nr_params] = index;
1212 this->param_offset[c->prog_data.nr_params] = 0;
1213 c->prog_data.nr_params++;
1214 this->param_index[c->prog_data.nr_params] = index;
1215 this->param_offset[c->prog_data.nr_params] = 1;
1216 c->prog_data.nr_params++;
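/* The two params added above track the STATE_TEXRECT_SCALE state entry for
 * this texture unit, i.e. at draw time they resolve to roughly 1.0/width and
 * 1.0/height of the bound rectangle texture; the "invert back" note further
 * down relies on exactly that.
 */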
1219 /* The 965 requires the EU to do the normalization of GL rectangle
1220 * texture coordinates. We use the program parameter state
1221 * tracking to get the scaling factor.
1223 if (intel->gen < 6 &&
1224 ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1225 fs_reg dst = fs_reg(this, ir->coordinate->type);
1226 fs_reg src = coordinate;
1229 emit(BRW_OPCODE_MUL, dst, src, scale_x);
1232 emit(BRW_OPCODE_MUL, dst, src, scale_y);
1233 } else if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1234 /* On gen6+, the sampler handles the rectangle coordinates
1235 * natively, without needing rescaling. But that means we have
1236 * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1237 * not [0, 1] like the default case below.
1239 needs_gl_clamp = false;
1241 for (int i = 0; i < 2; i++) {
1242 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1243 fs_reg chan = coordinate;
1244 chan.reg_offset += i;
1246 inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
1247 inst->conditional_mod = BRW_CONDITIONAL_G;
1249 /* Our parameter comes in as 1.0/width or 1.0/height,
1250 * because that's what people normally want for doing
1251 * texture rectangle handling. We need width or height
1252 * for clamping, but we don't care enough to make a new
1253 * parameter type, so just invert back.
1255 fs_reg limit = fs_reg(this, glsl_type::float_type);
1256 emit(BRW_OPCODE_MOV, limit, i == 0 ? scale_x : scale_y);
1257 emit(SHADER_OPCODE_RCP, limit, limit);
1259 inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1260 inst->conditional_mod = BRW_CONDITIONAL_L;
1265 if (ir->coordinate && needs_gl_clamp) {
1266 for (unsigned int i = 0;
1267 i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1268 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1269 fs_reg chan = coordinate;
1270 chan.reg_offset += i;
1272 fs_inst *inst = emit(BRW_OPCODE_MOV, chan, chan);
1273 inst->saturate = true;
1281 fs_visitor::visit(ir_texture *ir)
1283 fs_inst *inst = NULL;
1285 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
1286 int texunit = fp->Base.SamplerUnits[sampler];
1288 /* Should be lowered by do_lower_texture_projection */
1289 assert(!ir->projector);
1291 /* Generate code to compute all the subexpression trees. This has to be
1292 * done before loading any values into MRFs for the sampler message since
1293 * generating these values may involve SEND messages that need the MRFs.
1295 fs_reg coordinate = emit_texcoord(ir, sampler, texunit);
1297 fs_reg shadow_comparitor;
1298 if (ir->shadow_comparitor) {
1299 ir->shadow_comparitor->accept(this);
1300 shadow_comparitor = this->result;
1308 ir->lod_info.bias->accept(this);
1312 ir->lod_info.grad.dPdx->accept(this);
1315 ir->lod_info.grad.dPdy->accept(this);
1316 lod2 = this->result;
1321 ir->lod_info.lod->accept(this);
1326 /* Writemasking doesn't eliminate channels on SIMD8 texture
1327 * samples, so don't worry about them.
1329 fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1331 if (intel->gen >= 7) {
1332 inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1334 } else if (intel->gen >= 5) {
1335 inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1338 inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1342 /* The header is set up by generate_tex() when necessary. */
1343 inst->src[0] = reg_undef;
1345 if (ir->offset != NULL && ir->op != ir_txf)
1346 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1348 inst->sampler = sampler;
1350 if (ir->shadow_comparitor)
1351 inst->shadow_compare = true;
1353 swizzle_result(ir, dst, sampler);
1357 * Swizzle the result of a texture lookup. This is necessary for
1358 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
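/* For instance, a texture swizzle of (R, R, R, ONE) -- a common way to fake a
 * legacy luminance format -- becomes three MOVs from the sampled red channel
 * plus a MOV of 1.0f, following the per-channel loop below.  (Example only;
 * any combination of the GL swizzle selectors can appear.)
 */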
1361 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1363 this->result = orig_val;
1365 if (ir->op == ir_txs)
1368 if (ir->type == glsl_type::float_type) {
1369 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1370 assert(ir->sampler->type->sampler_shadow);
1371 } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1372 fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1374 for (int i = 0; i < 4; i++) {
1375 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1376 fs_reg l = swizzled_result;
1379 if (swiz == SWIZZLE_ZERO) {
1380 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1381 } else if (swiz == SWIZZLE_ONE) {
1382 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1384 fs_reg r = orig_val;
1385 r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1386 emit(BRW_OPCODE_MOV, l, r);
1389 this->result = swizzled_result;
1394 fs_visitor::visit(ir_swizzle *ir)
1396 ir->val->accept(this);
1397 fs_reg val = this->result;
1399 if (ir->type->vector_elements == 1) {
1400 this->result.reg_offset += ir->mask.x;
1404 fs_reg result = fs_reg(this, ir->type);
1405 this->result = result;
1407 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1408 fs_reg channel = val;
1426 channel.reg_offset += swiz;
1427 emit(BRW_OPCODE_MOV, result, channel);
1428 result.reg_offset++;
1433 fs_visitor::visit(ir_discard *ir)
1435 assert(ir->condition == NULL); /* FINISHME */
1437 emit(FS_OPCODE_DISCARD);
1441 fs_visitor::visit(ir_constant *ir)
1443 /* Set this->result to reg at the bottom of the function because some code
1444 * paths will cause this visitor to be applied to other fields. This will
1445 * cause the value stored in this->result to be modified.
1447 * Make reg constant so that it doesn't get accidentally modified along the
1448 * way. Yes, I actually had this problem. :(
1450 const fs_reg reg(this, ir->type);
1451 fs_reg dst_reg = reg;
1453 if (ir->type->is_array()) {
1454 const unsigned size = type_size(ir->type->fields.array);
1456 for (unsigned i = 0; i < ir->type->length; i++) {
1457 ir->array_elements[i]->accept(this);
1458 fs_reg src_reg = this->result;
1460 dst_reg.type = src_reg.type;
1461 for (unsigned j = 0; j < size; j++) {
1462 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1463 src_reg.reg_offset++;
1464 dst_reg.reg_offset++;
1467 } else if (ir->type->is_record()) {
1468 foreach_list(node, &ir->components) {
1469 ir_constant *const field = (ir_constant *) node;
1470 const unsigned size = type_size(field->type);
1472 field->accept(this);
1473 fs_reg src_reg = this->result;
1475 dst_reg.type = src_reg.type;
1476 for (unsigned j = 0; j < size; j++) {
1477 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1478 src_reg.reg_offset++;
1479 dst_reg.reg_offset++;
1483 const unsigned size = type_size(ir->type);
1485 for (unsigned i = 0; i < size; i++) {
1486 switch (ir->type->base_type) {
1487 case GLSL_TYPE_FLOAT:
1488 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1490 case GLSL_TYPE_UINT:
1491 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1494 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1496 case GLSL_TYPE_BOOL:
1497 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1500 assert(!"Non-float/uint/int/bool constant");
1502 dst_reg.reg_offset++;
1510 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1512 ir_expression *expr = ir->as_expression();
1518 assert(expr->get_num_operands() <= 2);
1519 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1520 assert(expr->operands[i]->type->is_scalar());
1522 expr->operands[i]->accept(this);
1523 op[i] = this->result;
1525 resolve_ud_negate(&op[i]);
1528 switch (expr->operation) {
1529 case ir_unop_logic_not:
1530 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1531 inst->conditional_mod = BRW_CONDITIONAL_Z;
1534 case ir_binop_logic_xor:
1535 case ir_binop_logic_or:
1536 case ir_binop_logic_and:
1540 if (intel->gen >= 6) {
1541 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1543 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1545 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1549 if (intel->gen >= 6) {
1550 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1552 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1554 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1557 case ir_binop_greater:
1558 case ir_binop_gequal:
1560 case ir_binop_lequal:
1561 case ir_binop_equal:
1562 case ir_binop_all_equal:
1563 case ir_binop_nequal:
1564 case ir_binop_any_nequal:
1565 resolve_bool_comparison(expr->operands[0], &op[0]);
1566 resolve_bool_comparison(expr->operands[1], &op[1]);
1568 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1569 inst->conditional_mod =
1570 brw_conditional_for_comparison(expr->operation);
1574 assert(!"not reached");
1575 fail("bad cond code\n");
1584 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1585 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1589 * Emit a gen6 IF statement with the comparison folded into the IF
1593 fs_visitor::emit_if_gen6(ir_if *ir)
1595 ir_expression *expr = ir->condition->as_expression();
1602 assert(expr->get_num_operands() <= 2);
1603 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1604 assert(expr->operands[i]->type->is_scalar());
1606 expr->operands[i]->accept(this);
1607 op[i] = this->result;
1610 switch (expr->operation) {
1611 case ir_unop_logic_not:
1612 case ir_binop_logic_xor:
1613 case ir_binop_logic_or:
1614 case ir_binop_logic_and:
1615 /* For operations on bool arguments, only the low bit of the bool is
1616 * valid, and the others are undefined. Fall back to the condition-code path.
1622 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1623 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1627 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1628 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1631 case ir_binop_greater:
1632 case ir_binop_gequal:
1634 case ir_binop_lequal:
1635 case ir_binop_equal:
1636 case ir_binop_all_equal:
1637 case ir_binop_nequal:
1638 case ir_binop_any_nequal:
1639 resolve_bool_comparison(expr->operands[0], &op[0]);
1640 resolve_bool_comparison(expr->operands[1], &op[1]);
1642 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1643 inst->conditional_mod =
1644 brw_conditional_for_comparison(expr->operation);
1647 assert(!"not reached");
1648 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1649 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1650 fail("bad condition\n");
1655 emit_bool_to_cond_code(ir->condition);
1656 fs_inst *inst = emit(BRW_OPCODE_IF);
1657 inst->predicated = true;
1661 fs_visitor::visit(ir_if *ir)
1665 if (intel->gen < 6 && c->dispatch_width == 16) {
1666 fail("Can't support (non-uniform) control flow on 16-wide\n");
1669 /* Don't point the annotation at the if statement, because then it plus
1670 * the then and else blocks get printed.
1672 this->base_ir = ir->condition;
1674 if (intel->gen == 6) {
1677 emit_bool_to_cond_code(ir->condition);
1679 inst = emit(BRW_OPCODE_IF);
1680 inst->predicated = true;
1683 foreach_list(node, &ir->then_instructions) {
1684 ir_instruction *ir = (ir_instruction *)node;
1690 if (!ir->else_instructions.is_empty()) {
1691 emit(BRW_OPCODE_ELSE);
1693 foreach_list(node, &ir->else_instructions) {
1694 ir_instruction *ir = (ir_instruction *)node;
1701 emit(BRW_OPCODE_ENDIF);
1705 fs_visitor::visit(ir_loop *ir)
1707 fs_reg counter = reg_undef;
1709 if (intel->gen < 6 && c->dispatch_width == 16) {
1710 fail("Can't support (non-uniform) control flow on 16-wide\n");
1714 this->base_ir = ir->counter;
1715 ir->counter->accept(this);
1716 counter = *(variable_storage(ir->counter));
1719 this->base_ir = ir->from;
1720 ir->from->accept(this);
1722 emit(BRW_OPCODE_MOV, counter, this->result);
1726 this->base_ir = NULL;
1727 emit(BRW_OPCODE_DO);
1730 this->base_ir = ir->to;
1731 ir->to->accept(this);
1733 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1734 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1736 inst = emit(BRW_OPCODE_BREAK);
1737 inst->predicated = true;
1740 foreach_list(node, &ir->body_instructions) {
1741 ir_instruction *ir = (ir_instruction *)node;
1747 if (ir->increment) {
1748 this->base_ir = ir->increment;
1749 ir->increment->accept(this);
1750 emit(BRW_OPCODE_ADD, counter, counter, this->result);
1753 this->base_ir = NULL;
1754 emit(BRW_OPCODE_WHILE);
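/* Putting the pieces of this visitor together, a counted loop of the form
 * for (i = from; i CMP to; i += increment) { body } is lowered roughly to:
 *
 *    mov      counter, from
 *    do
 *       cmp.CMP null, counter, to
 *       (+f0) break
 *       ...body...
 *       add      counter, counter, increment
 *    while
 *
 * (Sketch of the structure emitted above; loops without an analyzed counter
 * simply skip the counter pieces.)
 */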
1758 fs_visitor::visit(ir_loop_jump *ir)
1761 case ir_loop_jump::jump_break:
1762 emit(BRW_OPCODE_BREAK);
1764 case ir_loop_jump::jump_continue:
1765 emit(BRW_OPCODE_CONTINUE);
1771 fs_visitor::visit(ir_call *ir)
1773 assert(!"FINISHME");
1777 fs_visitor::visit(ir_return *ir)
1779 assert(!"FINISHME");
1783 fs_visitor::visit(ir_function *ir)
1785 /* Ignore function bodies other than main() -- we shouldn't see calls to
1786 * them since they should all be inlined before we get to ir_to_mesa.
1788 if (strcmp(ir->name, "main") == 0) {
1789 const ir_function_signature *sig;
1792 sig = ir->matching_signature(&empty);
1796 foreach_list(node, &sig->body) {
1797 ir_instruction *ir = (ir_instruction *)node;
1806 fs_visitor::visit(ir_function_signature *ir)
1808 assert(!"not reached");
1813 fs_visitor::emit(fs_inst inst)
1815 fs_inst *list_inst = new(mem_ctx) fs_inst;
1818 if (force_uncompressed_stack > 0)
1819 list_inst->force_uncompressed = true;
1820 else if (force_sechalf_stack > 0)
1821 list_inst->force_sechalf = true;
1823 list_inst->annotation = this->current_annotation;
1824 list_inst->ir = this->base_ir;
1826 this->instructions.push_tail(list_inst);
1831 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1833 fs_visitor::emit_dummy_fs()
1835 int reg_width = c->dispatch_width / 8;
1837 /* Everyone's favorite color. */
1838 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f));
1839 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f));
1840 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f));
1841 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f));
1844 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1845 write->base_mrf = 2;
1846 write->mlen = 4 * reg_width;
1850 /* The register location here is relative to the start of the URB
1851 * data. It will get adjusted to be a real location before
1852 * generate_code() time.
1855 fs_visitor::interp_reg(int location, int channel)
1857 int regnr = urb_setup[location] * 2 + channel / 2;
1858 int stride = (channel & 1) * 4;
1860 assert(urb_setup[location] != -1);
1862 return brw_vec1_grf(regnr, stride);
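/* Rough interpretation of the arithmetic above: each varying gets two
 * registers of interpolation setup data, with channels 0-1 of the attribute
 * in the first register and channels 2-3 in the second, each starting at
 * dword 0 or 4.  That is what the regnr and stride expressions encode; this
 * is a reading of the math, not an authoritative layout description.
 */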
1865 /** Emits the interpolation for the varying inputs. */
1867 fs_visitor::emit_interpolation_setup_gen4()
1869 this->current_annotation = "compute pixel centers";
1870 this->pixel_x = fs_reg(this, glsl_type::uint_type);
1871 this->pixel_y = fs_reg(this, glsl_type::uint_type);
1872 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1873 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1875 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1876 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1878 this->current_annotation = "compute pixel deltas from v0";
1880 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1881 fs_reg(this, glsl_type::vec2_type);
1882 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1884 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
1886 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1887 fs_reg(this, glsl_type::float_type);
1888 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1889 fs_reg(this, glsl_type::float_type);
1891 emit(BRW_OPCODE_ADD, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1892 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1893 emit(BRW_OPCODE_ADD, this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1894 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1896 this->current_annotation = "compute pos.w and 1/pos.w";
1897 /* Compute wpos.w. It's always in our setup, since it's needed to
1898 * interpolate the other attributes.
1900 this->wpos_w = fs_reg(this, glsl_type::float_type);
1901 emit(FS_OPCODE_LINTERP, wpos_w,
1902 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1903 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1904 interp_reg(FRAG_ATTRIB_WPOS, 3));
1905 /* Compute the pixel 1/W value from wpos.w. */
1906 this->pixel_w = fs_reg(this, glsl_type::float_type);
1907 emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1908 this->current_annotation = NULL;
1911 /** Emits the interpolation for the varying inputs. */
1913 fs_visitor::emit_interpolation_setup_gen6()
1915 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1917 /* If the pixel centers end up used, the setup is the same as for gen4. */
1918 this->current_annotation = "compute pixel centers";
1919 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1920 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1921 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1922 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1923 emit(BRW_OPCODE_ADD,
1925 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1926 fs_reg(brw_imm_v(0x10101010)));
1927 emit(BRW_OPCODE_ADD,
1929 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1930 fs_reg(brw_imm_v(0x11001100)));
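/* The brw_imm_v() values above are packed vectors of eight 4-bit immediates,
 * read from the low nibble up: 0x10101010 supplies the per-pixel X offsets
 * 0,1,0,1,... and 0x11001100 the Y offsets 0,0,1,1,... so the 2x2 subspan
 * origins delivered in g1 are fanned out into per-pixel integer centers.
 * (Explanatory note; nibble ordering as I understand the V immediate format.)
 */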
1932 /* As of gen6, we can no longer mix float and int sources. We have
1933 * to turn the integer pixel centers into floats for their actual use.
1936 this->pixel_x = fs_reg(this, glsl_type::float_type);
1937 this->pixel_y = fs_reg(this, glsl_type::float_type);
1938 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
1939 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
1941 this->current_annotation = "compute pos.w";
1942 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1943 this->wpos_w = fs_reg(this, glsl_type::float_type);
1944 emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1946 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1947 uint8_t reg = c->barycentric_coord_reg[i];
1948 this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
1949 this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
1952 this->current_annotation = NULL;
1956 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
1958 int reg_width = c->dispatch_width / 8;
1960 fs_reg color = outputs[target];
1963 /* If there's no color data to be written, skip it. */
1964 if (color.file == BAD_FILE)
1967 color.reg_offset += index;
1969 if (c->dispatch_width == 8 || intel->gen >= 6) {
1970 /* SIMD8 write looks like:
1976 * gen6 SIMD16 DP write looks like:
1986 inst = emit(BRW_OPCODE_MOV,
1987 fs_reg(MRF, first_color_mrf + index * reg_width, color.type),
1989 inst->saturate = c->key.clamp_fragment_color;
1991 /* pre-gen6 SIMD16 single source DP write looks like:
2001 if (brw->has_compr4) {
2002 /* By setting the high bit of the MRF register number, we
2003 * indicate that we want COMPR4 mode - instead of doing the
2004 * usual destination + 1 for the second half, we get the second half at destination + 4.
2007 inst = emit(BRW_OPCODE_MOV,
2008 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2011 inst->saturate = c->key.clamp_fragment_color;
2013 push_force_uncompressed();
2014 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index,
2017 inst->saturate = c->key.clamp_fragment_color;
2018 pop_force_uncompressed();
2020 push_force_sechalf();
2021 color.sechalf = true;
2022 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4,
2025 inst->saturate = c->key.clamp_fragment_color;
2026 pop_force_sechalf();
2027 color.sechalf = false;
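/* To summarize the pre-gen6 SIMD16 path above: each color channel is split
 * into two SIMD8 MOVs whose second half lands 4 MRFs above the first, either
 * implicitly via the COMPR4 register-number trick or explicitly with the
 * force_uncompressed/force_sechalf pair.  (Summary of this block of code.)
 */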
2033 fs_visitor::emit_fb_writes()
2035 this->current_annotation = "FB write header";
2036 bool header_present = true;
2037 /* We can potentially have a message length of up to 15, so we have to set
2038 * base_mrf to either 0 or 1 in order to fit in m0..m15.
2042 int reg_width = c->dispatch_width / 8;
2043 bool do_dual_src = this->dual_src_output.file != BAD_FILE;
2044 bool src0_alpha_to_render_target = false;
2046 if (c->dispatch_width == 16 && do_dual_src) {
2047 fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
2048 do_dual_src = false;
2051 /* From the Sandy Bridge PRM, volume 4, page 198:
2053 * "Dispatched Pixel Enables. One bit per pixel indicating
2054 * which pixels were originally enabled when the thread was
2055 * dispatched. This field is only required for the end-of-
2056 * thread message and on all dual-source messages."
2058 if (intel->gen >= 6 &&
2059 !this->fp->UsesKill &&
2061 c->key.nr_color_regions == 1) {
2062 header_present = false;
2065 if (header_present) {
2066 src0_alpha_to_render_target = intel->gen >= 6 &&
2068 c->key.nr_color_regions > 1 &&
2069 c->key.sample_alpha_to_coverage;
2074 if (c->aa_dest_stencil_reg) {
2075 push_force_uncompressed();
2076 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2077 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2078 pop_force_uncompressed();
2081 /* Reserve space for color. It'll be filled in per MRT below. */
2083 nr += 4 * reg_width;
2086 if (src0_alpha_to_render_target)
2089 if (c->source_depth_to_render_target) {
2090 if (intel->gen == 6 && c->dispatch_width == 16) {
2091 /* For outputting oDepth on gen6, SIMD8 writes have to be
2092 * used. This would require 8-wide moves of each half to
2093 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2094 * Just bail on doing so for now.
2096 fail("Missing support for simd16 depth writes on gen6\n");
2099 if (c->computes_depth) {
2100 /* Hand over gl_FragDepth. */
2101 assert(this->frag_depth);
2102 fs_reg depth = *(variable_storage(this->frag_depth));
2104 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2106 /* Pass through the payload depth. */
2107 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2108 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2113 if (c->dest_depth_reg) {
2114 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2115 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2120 fs_reg src0 = this->outputs[0];
2121 fs_reg src1 = this->dual_src_output;
2123 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2125 for (int i = 0; i < 4; i++) {
2126 fs_inst *inst = emit(BRW_OPCODE_MOV,
2127 fs_reg(MRF, color_mrf + i, src0.type),
2130 inst->saturate = c->key.clamp_fragment_color;
2133 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2135 for (int i = 0; i < 4; i++) {
2136 fs_inst *inst = emit(BRW_OPCODE_MOV,
2137 fs_reg(MRF, color_mrf + 4 + i, src1.type),
2140 inst->saturate = c->key.clamp_fragment_color;
2143 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2145 inst->base_mrf = base_mrf;
2146 inst->mlen = nr - base_mrf;
2148 inst->header_present = header_present;
2150 c->prog_data.dual_src_blend = true;
2151 this->current_annotation = NULL;
2155 for (int target = 0; target < c->key.nr_color_regions; target++) {
2156 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2157 "FB write target %d",
2159 /* If src0_alpha_to_render_target is true, include source zero alpha
2160 * data in RenderTargetWrite message for targets > 0.
2162 int write_color_mrf = color_mrf;
2163 if (src0_alpha_to_render_target && target != 0) {
2165 fs_reg color = outputs[0];
2166 color.reg_offset += 3;
2168 inst = emit(BRW_OPCODE_MOV,
2169 fs_reg(MRF, write_color_mrf, color.type),
2171 inst->saturate = c->key.clamp_fragment_color;
2172 write_color_mrf = color_mrf + reg_width;
2175 for (unsigned i = 0; i < this->output_components[target]; i++)
2176 emit_color_write(target, i, write_color_mrf);
2178 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2179 inst->target = target;
2180 inst->base_mrf = base_mrf;
2181 if (src0_alpha_to_render_target && target == 0)
2182 inst->mlen = nr - base_mrf - reg_width;
2184 inst->mlen = nr - base_mrf;
2185 if (target == c->key.nr_color_regions - 1)
2187 inst->header_present = header_present;
2190 if (c->key.nr_color_regions == 0) {
2191 /* Even if there's no color buffers enabled, we still need to send
2192 * alpha out the pipeline to our null renderbuffer to support
2193 * alpha-testing, alpha-to-coverage, and so on.
2195 emit_color_write(0, 3, color_mrf);
2197 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2198 inst->base_mrf = base_mrf;
2199 inst->mlen = nr - base_mrf;
2201 inst->header_present = header_present;
2204 this->current_annotation = NULL;
2208 fs_visitor::resolve_ud_negate(fs_reg *reg)
2210 if (reg->type != BRW_REGISTER_TYPE_UD ||
2214 fs_reg temp = fs_reg(this, glsl_type::uint_type);
2215 emit(BRW_OPCODE_MOV, temp, *reg);
2220 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2222 if (rvalue->type != glsl_type::bool_type)
2225 fs_reg temp = fs_reg(this, glsl_type::bool_type);
2226 emit(BRW_OPCODE_AND, temp, *reg, fs_reg(1));
2230 fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
2231 struct brw_shader *shader)
2236 this->fp = (struct gl_fragment_program *)
2237 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2239 this->intel = &brw->intel;
2240 this->ctx = &intel->ctx;
2241 this->mem_ctx = ralloc_context(NULL);
2242 this->shader = shader;
2243 this->failed = false;
2244 this->variable_ht = hash_table_ctor(0,
2245 hash_table_pointer_hash,
2246 hash_table_pointer_compare);
2248 /* There's a question that appears to be left open in the spec:
2249 * How do implicit dst conversions interact with the CMP
2250 * instruction or conditional mods? On gen6, the instruction:
2252 * CMP null<d> src0<f> src1<f>
2254 * will do src1 - src0 and compare that result as if it was an
2255 * integer. On gen4, it will do src1 - src0 as float, convert
2256 * the result to int, and compare as int. In between, it
2257 * appears that it does src1 - src0 and does the compare in the
2258 * execution type so dst type doesn't matter.
2260 if (this->intel->gen > 4)
2261 this->reg_null_cmp = reg_null_d;
2263 this->reg_null_cmp = reg_null_f;
2265 this->frag_depth = NULL;
2266 memset(this->outputs, 0, sizeof(this->outputs));
2267 memset(this->output_components, 0, sizeof(this->output_components));
2268 this->first_non_payload_grf = 0;
2269 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2271 this->current_annotation = NULL;
2272 this->base_ir = NULL;
2274 this->virtual_grf_sizes = NULL;
2275 this->virtual_grf_count = 0;
2276 this->virtual_grf_array_size = 0;
2277 this->virtual_grf_def = NULL;
2278 this->virtual_grf_use = NULL;
2279 this->live_intervals_valid = false;
2281 this->force_uncompressed_stack = 0;
2282 this->force_sechalf_stack = 0;
2285 fs_visitor::~fs_visitor()
2287 ralloc_free(this->mem_ctx);
2288 hash_table_dtor(this->variable_ht);