1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
24 /** @file brw_fs_visitor.cpp
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
32 #include <sys/types.h>
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/uniforms.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "program/prog_optimize.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
47 #include "brw_shader.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_optimization.h"
51 #include "glsl/ir_print_visitor.h"
54 fs_visitor::visit(ir_variable *ir)
58 if (variable_storage(ir))
61 if (ir->mode == ir_var_in) {
62 if (!strcmp(ir->name, "gl_FragCoord")) {
63 reg = emit_fragcoord_interpolation(ir);
64 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
65 reg = emit_frontfacing_interpolation(ir);
67 reg = emit_general_interpolation(ir);
70 hash_table_insert(this->variable_ht, reg, ir);
72 } else if (ir->mode == ir_var_out) {
73 reg = new(this->mem_ctx) fs_reg(this, ir->type);
76 assert(ir->location == FRAG_RESULT_DATA0);
77 assert(ir->index == 1);
78 this->dual_src_output = *reg;
79 } else if (ir->location == FRAG_RESULT_COLOR) {
80 /* Writing gl_FragColor outputs to all color regions. */
81 for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
82 this->outputs[i] = *reg;
83 this->output_components[i] = 4;
85 } else if (ir->location == FRAG_RESULT_DEPTH) {
86 this->frag_depth = ir;
88 /* gl_FragData or a user-defined FS output */
89 assert(ir->location >= FRAG_RESULT_DATA0 &&
90 ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
93 ir->type->is_array() ? ir->type->fields.array->vector_elements
94 : ir->type->vector_elements;
96 /* General color output. */
97 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
98 int output = ir->location - FRAG_RESULT_DATA0 + i;
99 this->outputs[output] = *reg;
100 this->outputs[output].reg_offset += vector_elements * i;
101 this->output_components[output] = vector_elements;
104 } else if (ir->mode == ir_var_uniform) {
105 int param_index = c->prog_data.nr_params;
107 /* Thanks to the lower_ubo_reference pass, we will see only
108 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
109 * variables, so no need for them to be in variable_ht.
111 if (ir->uniform_block != -1)
114 if (c->dispatch_width == 16) {
115 if (!variable_storage(ir)) {
116 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
121 if (!strncmp(ir->name, "gl_", 3)) {
122 setup_builtin_uniform_values(ir);
124 setup_uniform_values(ir->location, ir->type);
127 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
128 reg->type = brw_type_for_base_type(ir->type);
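/* Note (summary of the bookkeeping above): param_index was captured before the
 * setup_*_uniform_values() calls, so it marks where this uniform's components
 * begin in c->prog_data.params; later dereferences of the variable then become
 * UNIFORM-file reads relative to that base register.
 */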
132 reg = new(this->mem_ctx) fs_reg(this, ir->type);
134 hash_table_insert(this->variable_ht, reg, ir);
138 fs_visitor::visit(ir_dereference_variable *ir)
140 fs_reg *reg = variable_storage(ir->var);
145 fs_visitor::visit(ir_dereference_record *ir)
147 const glsl_type *struct_type = ir->record->type;
149 ir->record->accept(this);
151 unsigned int offset = 0;
152 for (unsigned int i = 0; i < struct_type->length; i++) {
153 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
155 offset += type_size(struct_type->fields.structure[i].type);
157 this->result.reg_offset += offset;
158 this->result.type = brw_type_for_base_type(ir->type);
162 fs_visitor::visit(ir_dereference_array *ir)
167 ir->array->accept(this);
168 index = ir->array_index->as_constant();
170 element_size = type_size(ir->type);
171 this->result.type = brw_type_for_base_type(ir->type);
174 assert(this->result.file == UNIFORM || this->result.file == GRF);
175 this->result.reg_offset += index->value.i[0] * element_size;
177 assert(!"FINISHME: non-constant array element");
181 /* Instruction selection: Produce a MOV.sat instead of
182 * MIN(MAX(val, 0), 1) when possible.
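/* As a rough illustration (not from the original comments): GLSL such as
 *
 *    vec4 c = clamp(texel, 0.0, 1.0);
 *
 * usually reaches this backend as min(max(texel, 0.0), 1.0), and when the
 * instruction that produced the inner value is still the last one emitted we
 * can just set its saturate flag, or emit a single saturated MOV, instead of
 * a separate MIN and MAX.
 */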
185 fs_visitor::try_emit_saturate(ir_expression *ir)
187 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
192 fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
194 sat_val->accept(this);
195 fs_reg src = this->result;
197 fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
199 /* If the last instruction from our accept() didn't generate our
200 * src, generate a saturated MOV
202 fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
203 if (!modify || modify->regs_written() != 1) {
204 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
205 inst->saturate = true;
207 modify->saturate = true;
216 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
218 /* 3-src instructions were introduced in gen6. */
222 /* MAD can only handle floating-point data. */
223 if (ir->type != glsl_type::float_type)
226 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
227 ir_expression *mul = ir->operands[mul_arg]->as_expression();
229 if (!mul || mul->operation != ir_binop_mul)
232 if (nonmul->as_constant() ||
233 mul->operands[0]->as_constant() ||
234 mul->operands[1]->as_constant())
237 nonmul->accept(this);
238 fs_reg src0 = this->result;
240 mul->operands[0]->accept(this);
241 fs_reg src1 = this->result;
243 mul->operands[1]->accept(this);
244 fs_reg src2 = this->result;
246 this->result = fs_reg(this, ir->type);
247 emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
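/* A rough example of the selection above: for float code like d = a + b * c,
 * the add-of-a-multiply collapses into a single
 *
 *    mad  d, a, b, c
 *
 * rather than separate MUL and ADD instructions.  The constant-operand
 * bail-out earlier is, I assume, because the 3-source encoding cannot take
 * immediate operands; the original comments do not state the reason.
 */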
253 fs_visitor::visit(ir_expression *ir)
255 unsigned int operand;
259 assert(ir->get_num_operands() <= 2);
261 if (try_emit_saturate(ir))
263 if (ir->operation == ir_binop_add) {
264 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
268 for (operand = 0; operand < ir->get_num_operands(); operand++) {
269 ir->operands[operand]->accept(this);
270 if (this->result.file == BAD_FILE) {
272 fail("Failed to get tree for expression operand:\n");
273 ir->operands[operand]->accept(&v);
275 op[operand] = this->result;
277 /* Matrix expression operands should have been broken down to vector
278 * operations already.
280 assert(!ir->operands[operand]->type->is_matrix());
281 /* And then those vector operands should have been broken down to scalar.
283 assert(!ir->operands[operand]->type->is_vector());
286 /* Storage for our result. If our result goes into an assignment, it will
287 * just get copy-propagated out, so no worries.
289 this->result = fs_reg(this, ir->type);
291 switch (ir->operation) {
292 case ir_unop_logic_not:
293 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
294 * ones complement of the whole register, not just bit 0.
296 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
299 op[0].negate = !op[0].negate;
300 this->result = op[0];
304 op[0].negate = false;
305 this->result = op[0];
308 temp = fs_reg(this, ir->type);
310 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
312 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
313 inst->conditional_mod = BRW_CONDITIONAL_G;
314 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
315 inst->predicated = true;
317 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
318 inst->conditional_mod = BRW_CONDITIONAL_L;
319 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
320 inst->predicated = true;
324 emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
328 emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
331 emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
335 assert(!"not reached: should be handled by ir_explog_to_explog2");
338 case ir_unop_sin_reduced:
339 emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
342 case ir_unop_cos_reduced:
343 emit_math(SHADER_OPCODE_COS, this->result, op[0]);
347 emit(FS_OPCODE_DDX, this->result, op[0]);
350 emit(FS_OPCODE_DDY, this->result, op[0]);
354 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
357 assert(!"not reached: should be handled by ir_sub_to_add_neg");
361 if (ir->type->is_integer()) {
362 /* For integer multiplication, the MUL uses the low 16 bits
363 * of one of the operands (src0 on gen6, src1 on gen7). The
364 * MACH then accumulates the contribution of the upper 16 bits of that operand.
367 * FINISHME: Emit just the MUL if we know an operand is small enough.
370 if (intel->gen >= 7 && c->dispatch_width == 16)
371 fail("16-wide explicit accumulator operands unsupported\n");
373 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
375 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
376 emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
377 emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
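/* So on the integer path a 32-bit multiply is emitted roughly as the three
 * instructions above:
 *
 *    mul   acc0, op0, op1    // partial product from one operand's low 16 bits
 *    mach  null, op0, op1    // fold in the contribution of the high 16 bits
 *    mov   dst,  acc0        // copy the completed low 32 bits out
 *
 * (Sketch of the sequence; the accumulator naming here is illustrative.)
 */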
379 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
383 if (intel->gen >= 7 && c->dispatch_width == 16)
384 fail("16-wide INTDIV unsupported\n");
386 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
387 assert(ir->type->is_integer());
388 emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
391 if (intel->gen >= 7 && c->dispatch_width == 16)
392 fail("16-wide INTDIV unsupported\n");
394 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
395 assert(ir->type->is_integer());
396 emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
400 case ir_binop_greater:
401 case ir_binop_lequal:
402 case ir_binop_gequal:
404 case ir_binop_all_equal:
405 case ir_binop_nequal:
406 case ir_binop_any_nequal:
408 /* original gen4 does implicit conversion before comparison. */
410 temp.type = op[0].type;
412 resolve_ud_negate(&op[0]);
413 resolve_ud_negate(&op[1]);
415 resolve_bool_comparison(ir->operands[0], &op[0]);
416 resolve_bool_comparison(ir->operands[1], &op[1]);
418 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
419 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
422 case ir_binop_logic_xor:
423 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
426 case ir_binop_logic_or:
427 emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
430 case ir_binop_logic_and:
431 emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
436 assert(!"not reached: should be handled by brw_fs_channel_expressions");
440 assert(!"not reached: should be handled by lower_noise");
443 case ir_quadop_vector:
444 assert(!"not reached: should be handled by lower_quadop_vector");
448 emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
452 emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
455 case ir_unop_bitcast_i2f:
456 case ir_unop_bitcast_u2f:
457 op[0].type = BRW_REGISTER_TYPE_F;
458 this->result = op[0];
461 case ir_unop_bitcast_f2u:
462 op[0].type = BRW_REGISTER_TYPE_UD;
463 this->result = op[0];
466 case ir_unop_bitcast_f2i:
467 op[0].type = BRW_REGISTER_TYPE_D;
468 this->result = op[0];
474 emit(BRW_OPCODE_MOV, this->result, op[0]);
478 inst = emit(BRW_OPCODE_AND, this->result, op[0], fs_reg(1));
481 temp = fs_reg(this, glsl_type::int_type);
482 emit(BRW_OPCODE_AND, temp, op[0], fs_reg(1));
483 emit(BRW_OPCODE_MOV, this->result, temp);
487 inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f));
488 inst->conditional_mod = BRW_CONDITIONAL_NZ;
489 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
492 assert(op[0].type == BRW_REGISTER_TYPE_D);
494 inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0));
495 inst->conditional_mod = BRW_CONDITIONAL_NZ;
496 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
500 emit(BRW_OPCODE_RNDZ, this->result, op[0]);
503 op[0].negate = !op[0].negate;
504 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
505 this->result.negate = true;
508 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
511 inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
513 case ir_unop_round_even:
514 emit(BRW_OPCODE_RNDE, this->result, op[0]);
518 resolve_ud_negate(&op[0]);
519 resolve_ud_negate(&op[1]);
521 if (intel->gen >= 6) {
522 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
523 inst->conditional_mod = BRW_CONDITIONAL_L;
525 /* Unalias the destination */
526 this->result = fs_reg(this, ir->type);
528 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
529 inst->conditional_mod = BRW_CONDITIONAL_L;
531 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
532 inst->predicated = true;
536 resolve_ud_negate(&op[0]);
537 resolve_ud_negate(&op[1]);
539 if (intel->gen >= 6) {
540 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
541 inst->conditional_mod = BRW_CONDITIONAL_GE;
543 /* Unalias the destination */
544 this->result = fs_reg(this, ir->type);
546 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
547 inst->conditional_mod = BRW_CONDITIONAL_G;
549 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
550 inst->predicated = true;
555 emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
558 case ir_unop_bit_not:
559 inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
561 case ir_binop_bit_and:
562 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
564 case ir_binop_bit_xor:
565 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
567 case ir_binop_bit_or:
568 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
571 case ir_binop_lshift:
572 inst = emit(BRW_OPCODE_SHL, this->result, op[0], op[1]);
575 case ir_binop_rshift:
576 if (ir->type->base_type == GLSL_TYPE_INT)
577 inst = emit(BRW_OPCODE_ASR, this->result, op[0], op[1]);
579 inst = emit(BRW_OPCODE_SHR, this->result, op[0], op[1]);
582 case ir_binop_ubo_load:
583 ir_constant *uniform_block = ir->operands[0]->as_constant();
584 ir_constant *offset = ir->operands[1]->as_constant();
586 fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
587 packed_consts.type = result.type;
588 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
589 fs_inst *pull = emit(fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
592 fs_reg(offset->value.u[0])));
596 packed_consts.smear = offset->value.u[0] % 16 / 4;
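/* Roughly what the math above does (my reading of it): the offset is a byte
 * offset into the uniform block, and .smear selects a single dword of the
 * pulled constant register to broadcast to every channel.  (offset % 16) / 4
 * is the starting dword within the 16-byte std140 slot, and because std140
 * never lets a vector straddle a 16-byte boundary, stepping .smear once per
 * component below stays inside the fetched register (see the assert further
 * down).
 */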
597 for (int i = 0; i < ir->type->vector_elements; i++) {
598 /* UBO bools are any nonzero value. We consider bools to be
599 * values with the low bit set to 1. Convert them using CMP.
601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
602 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, result,
603 packed_consts, fs_reg(0u)));
604 inst->conditional_mod = BRW_CONDITIONAL_NZ;
606 emit(fs_inst(BRW_OPCODE_MOV, result, packed_consts));
609 packed_consts.smear++;
612 /* The std140 packing rules don't allow vectors to cross 16-byte
613 * boundaries, and a reg is 32 bytes.
615 assert(packed_consts.smear < 8);
617 result.reg_offset = 0;
623 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
624 const glsl_type *type, bool predicated)
626 switch (type->base_type) {
627 case GLSL_TYPE_FLOAT:
631 for (unsigned int i = 0; i < type->components(); i++) {
632 l.type = brw_type_for_base_type(type);
633 r.type = brw_type_for_base_type(type);
635 if (predicated || !l.equals(r)) {
636 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
637 inst->predicated = predicated;
644 case GLSL_TYPE_ARRAY:
645 for (unsigned int i = 0; i < type->length; i++) {
646 emit_assignment_writes(l, r, type->fields.array, predicated);
650 case GLSL_TYPE_STRUCT:
651 for (unsigned int i = 0; i < type->length; i++) {
652 emit_assignment_writes(l, r, type->fields.structure[i].type,
657 case GLSL_TYPE_SAMPLER:
661 assert(!"not reached");
666 /* If the RHS processing resulted in an instruction generating a
667 * temporary value, and it would be easy to rewrite the instruction to
668 * generate its result right into the LHS instead, do so. This ends
669 * up reliably removing instructions where it can be tricky to do so
670 * later without real UD chain information.
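/* For example, for a direct assignment like v = a + b, the ADD emitted while
 * visiting the RHS writes a fresh temporary; instead of following it with a
 * MOV into v's storage, the ADD's destination is rewritten to be v directly.
 * (Illustrative case only; the checks below spell out when this is actually
 * safe.)
 */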
673 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
676 fs_inst *pre_rhs_inst,
677 fs_inst *last_rhs_inst)
679 /* Only attempt if we're doing a direct assignment. */
681 !(ir->lhs->type->is_scalar() ||
682 (ir->lhs->type->is_vector() &&
683 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
686 /* Make sure the last instruction generated our source reg. */
687 fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
693 /* If last_rhs_inst wrote a different number of components than our LHS,
694 * we can't safely rewrite it.
696 if (ir->lhs->type->vector_elements != modify->regs_written())
699 /* Success! Rewrite the instruction. */
706 fs_visitor::visit(ir_assignment *ir)
711 /* FINISHME: arrays on the lhs */
712 ir->lhs->accept(this);
715 fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
717 ir->rhs->accept(this);
720 fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
722 assert(l.file != BAD_FILE);
723 assert(r.file != BAD_FILE);
725 if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
729 emit_bool_to_cond_code(ir->condition);
732 if (ir->lhs->type->is_scalar() ||
733 ir->lhs->type->is_vector()) {
734 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
735 if (ir->write_mask & (1 << i)) {
736 inst = emit(BRW_OPCODE_MOV, l, r);
738 inst->predicated = true;
744 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
749 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
750 fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
760 if (ir->shadow_comparitor) {
761 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
762 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
763 coordinate.reg_offset++;
765 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
768 if (ir->op == ir_tex) {
769 /* There's no plain shadow compare message, so we use shadow
770 * compare with a bias of 0.0.
772 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
774 } else if (ir->op == ir_txb || ir->op == ir_txl) {
775 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
778 assert(!"Should not get here.");
781 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
783 } else if (ir->op == ir_tex) {
784 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
785 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
786 coordinate.reg_offset++;
788 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
790 } else if (ir->op == ir_txd) {
793 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
794 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
795 coordinate.reg_offset++;
797 /* the slots for u and v are always present, but r is optional */
798 mlen += MAX2(ir->coordinate->type->vector_elements, 2);
801 * dPdx = dudx, dvdx, drdx
802 * dPdy = dudy, dvdy, drdy
804 * 1-arg: Does not exist.
806 * 2-arg: dudx dvdx dudy dvdy
807 * dPdx.x dPdx.y dPdy.x dPdy.y
810 * 3-arg: dudx dvdx drdx dudy dvdy drdy
811 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
814 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
815 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
818 mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
820 for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
821 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
824 mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
825 } else if (ir->op == ir_txs) {
826 /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
828 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
831 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
832 * instructions. We'll need to do SIMD16 here.
835 assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
837 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
838 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
840 coordinate.reg_offset++;
843 /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
844 * be necessary for TXF (ld), but seems wise to do for all messages.
846 for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
847 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
850 /* lod/bias appears after u/v/r. */
853 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, lod.type), lod);
856 /* The unused upper half. */
861 /* Now, since we're doing simd16, the return is 2 interleaved
862 * vec4s where the odd-indexed ones are junk. We'll need to move
863 * this weirdness around to the expected layout.
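/* In other words, only every other vec4-sized chunk of the SIMD16 return
 * carries this thread's live data; the copy loop at the end of this function
 * repacks those halves into orig_dst.  (Summary of the surrounding code, not
 * an exact register map.)
 */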
866 const glsl_type *vec_type =
867 glsl_type::get_instance(ir->type->base_type, 4, 1);
868 dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
869 dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
870 : BRW_REGISTER_TYPE_F;
873 fs_inst *inst = NULL;
876 inst = emit(SHADER_OPCODE_TEX, dst);
879 inst = emit(FS_OPCODE_TXB, dst);
882 inst = emit(SHADER_OPCODE_TXL, dst);
885 inst = emit(SHADER_OPCODE_TXD, dst);
888 inst = emit(SHADER_OPCODE_TXS, dst);
891 inst = emit(SHADER_OPCODE_TXF, dst);
894 inst->base_mrf = base_mrf;
896 inst->header_present = true;
899 for (int i = 0; i < 4; i++) {
900 emit(BRW_OPCODE_MOV, orig_dst, dst);
901 orig_dst.reg_offset++;
909 /* gen5's sampler has slots for u, v, r, array index, then optional
910 * parameters like shadow comparator or LOD bias. If optional
911 * parameters aren't present, those base slots are optional and don't
912 * need to be included in the message.
914 * We don't fill in the unnecessary slots regardless, which may look
915 * surprising in the disassembly.
918 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
919 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
923 int reg_width = c->dispatch_width / 8;
924 bool header_present = false;
925 const int vector_elements =
926 ir->coordinate ? ir->coordinate->type->vector_elements : 0;
928 if (ir->offset != NULL && ir->op == ir_txf) {
929 /* It appears that the ld instruction used for txf does its
930 * address bounds check before adding in the offset. To work
931 * around this, just add the integer offset to the integer texel
932 * coordinate, and don't put the offset in the header.
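/* For example, a texelFetch() with a constant offset of (1, 2) ends up as
 * integer adds of +1 and +2 into the u and v coordinates in the loop below,
 * rather than using the offset field of the message header.  (Illustrative;
 * it simply restates what the workaround described above amounts to.)
 */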
934 ir_constant *offset = ir->offset->as_constant();
935 for (int i = 0; i < vector_elements; i++) {
937 fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
940 coordinate.reg_offset++;
944 /* The offsets set up by the ir_texture visitor are in the
945 * m1 header, so we can't go headerless.
947 header_present = true;
952 for (int i = 0; i < vector_elements; i++) {
954 fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
956 coordinate.reg_offset++;
959 mlen += vector_elements * reg_width;
961 if (ir->shadow_comparitor) {
962 mlen = MAX2(mlen, header_present + 4 * reg_width);
964 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
968 fs_inst *inst = NULL;
971 inst = emit(SHADER_OPCODE_TEX, dst);
974 mlen = MAX2(mlen, header_present + 4 * reg_width);
975 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
978 inst = emit(FS_OPCODE_TXB, dst);
981 mlen = MAX2(mlen, header_present + 4 * reg_width);
982 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
985 inst = emit(SHADER_OPCODE_TXL, dst);
988 mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
992 * dPdx = dudx, dvdx, drdx
993 * dPdy = dudy, dvdy, drdy
995 * Load up these values:
996 * - dudx dudy dvdx dvdy drdx drdy
997 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
999 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
1000 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1004 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
1009 inst = emit(SHADER_OPCODE_TXD, dst);
1013 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1015 inst = emit(SHADER_OPCODE_TXS, dst);
1018 mlen = header_present + 4 * reg_width;
1020 emit(BRW_OPCODE_MOV,
1021 fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
1023 inst = emit(SHADER_OPCODE_TXF, dst);
1026 inst->base_mrf = base_mrf;
1028 inst->header_present = header_present;
1031 fail("Message length >11 disallowed by hardware\n");
1038 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1039 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
1043 int reg_width = c->dispatch_width / 8;
1044 bool header_present = false;
1047 if (ir->offset && ir->op != ir_txf) {
1048 /* The offsets set up by the ir_texture visitor are in the
1049 * m1 header, so we can't go headerless.
1051 header_present = true;
1056 if (ir->shadow_comparitor) {
1057 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
1061 /* Set up the LOD info */
1066 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1070 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1074 if (c->dispatch_width == 16)
1075 fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1077 /* Load dPdx and the coordinate together:
1078 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1080 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1081 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1082 coordinate.reg_offset++;
1085 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1089 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
1096 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1100 /* It appears that the ld instruction used for txf does its
1101 * address bounds check before adding in the offset. To work
1102 * around this, just add the integer offset to the integer texel
1103 * coordinate, and don't put the offset in the header.
1106 ir_constant *offset = ir->offset->as_constant();
1107 offsets[0] = offset->value.i[0];
1108 offsets[1] = offset->value.i[1];
1109 offsets[2] = offset->value.i[2];
1111 memset(offsets, 0, sizeof(offsets));
1114 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1115 emit(BRW_OPCODE_ADD,
1116 fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[0]);
1117 coordinate.reg_offset++;
1120 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod);
1123 for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1124 emit(BRW_OPCODE_ADD,
1125 fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[i]);
1126 coordinate.reg_offset++;
1132 /* Set up the coordinate (except for cases where it was done above) */
1133 if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
1134 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1135 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1136 coordinate.reg_offset++;
1141 /* Generate the SEND */
1142 fs_inst *inst = NULL;
1144 case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
1145 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1146 case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
1147 case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
1148 case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
1149 case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
1151 inst->base_mrf = base_mrf;
1153 inst->header_present = header_present;
1156 fail("Message length >11 disallowed by hardware\n");
1163 * Emit code to produce the coordinates for a texture lookup.
1165 * Returns the fs_reg containing the texture coordinate (as opposed to
1166 * setting this->result).
1169 fs_visitor::emit_texcoord(ir_texture *ir, int sampler, int texunit)
1171 fs_inst *inst = NULL;
1173 if (!ir->coordinate)
1174 return fs_reg(); /* Return the default BAD_FILE register. */
1176 ir->coordinate->accept(this);
1177 fs_reg coordinate = this->result;
1179 bool needs_gl_clamp = true;
1181 fs_reg scale_x, scale_y;
1183 /* The 965 requires the EU to do the normalization of GL rectangle
1184 * texture coordinates. We use the program parameter state
1185 * tracking to get the scaling factor.
1187 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT &&
1189 (intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
1190 c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
1191 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1192 int tokens[STATE_LENGTH] = {
1194 STATE_TEXRECT_SCALE,
1200 if (c->dispatch_width == 16) {
1201 fail("rectangle scale uniform setup not supported on 16-wide\n");
1202 return fs_reg(this, ir->type);
1205 scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1206 scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1208 GLuint index = _mesa_add_state_reference(params,
1209 (gl_state_index *)tokens);
1211 this->param_index[c->prog_data.nr_params] = index;
1212 this->param_offset[c->prog_data.nr_params] = 0;
1213 c->prog_data.nr_params++;
1214 this->param_index[c->prog_data.nr_params] = index;
1215 this->param_offset[c->prog_data.nr_params] = 1;
1216 c->prog_data.nr_params++;
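/* The two params added above track the STATE_TEXRECT_SCALE state entry for
 * this texture unit, i.e. at draw time they resolve to roughly 1.0/width and
 * 1.0/height of the bound rectangle texture; the "invert back" note further
 * down relies on exactly that.
 */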
1219 /* The 965 requires the EU to do the normalization of GL rectangle
1220 * texture coordinates. We use the program parameter state
1221 * tracking to get the scaling factor.
1223 if (intel->gen < 6 &&
1224 ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1225 fs_reg dst = fs_reg(this, ir->coordinate->type);
1226 fs_reg src = coordinate;
1229 emit(BRW_OPCODE_MUL, dst, src, scale_x);
1232 emit(BRW_OPCODE_MUL, dst, src, scale_y);
1233 } else if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1234 /* On gen6+, the sampler handles the rectangle coordinates
1235 * natively, without needing rescaling. But that means we have
1236 * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1237 * not [0, 1] like the default case below.
1239 needs_gl_clamp = false;
1241 for (int i = 0; i < 2; i++) {
1242 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1243 fs_reg chan = coordinate;
1244 chan.reg_offset += i;
1246 inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
1247 inst->conditional_mod = BRW_CONDITIONAL_G;
1249 /* Our parameter comes in as 1.0/width or 1.0/height,
1250 * because that's what people normally want for doing
1251 * texture rectangle handling. We need width or height
1252 * for clamping, but we don't care enough to make a new
1253 * parameter type, so just invert back.
1255 fs_reg limit = fs_reg(this, glsl_type::float_type);
1256 emit(BRW_OPCODE_MOV, limit, i == 0 ? scale_x : scale_y);
1257 emit(SHADER_OPCODE_RCP, limit, limit);
1259 inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1260 inst->conditional_mod = BRW_CONDITIONAL_L;
1265 if (ir->coordinate && needs_gl_clamp) {
1266 for (unsigned int i = 0;
1267 i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1268 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1269 fs_reg chan = coordinate;
1270 chan.reg_offset += i;
1272 fs_inst *inst = emit(BRW_OPCODE_MOV, chan, chan);
1273 inst->saturate = true;
1281 fs_visitor::visit(ir_texture *ir)
1283 fs_inst *inst = NULL;
1285 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
1286 int texunit = fp->Base.SamplerUnits[sampler];
1288 /* Should be lowered by do_lower_texture_projection */
1289 assert(!ir->projector);
1291 /* Generate code to compute all the subexpression trees. This has to be
1292 * done before loading any values into MRFs for the sampler message since
1293 * generating these values may involve SEND messages that need the MRFs.
1295 fs_reg coordinate = emit_texcoord(ir, sampler, texunit);
1297 fs_reg shadow_comparitor;
1298 if (ir->shadow_comparitor) {
1299 ir->shadow_comparitor->accept(this);
1300 shadow_comparitor = this->result;
1308 ir->lod_info.bias->accept(this);
1312 ir->lod_info.grad.dPdx->accept(this);
1315 ir->lod_info.grad.dPdy->accept(this);
1316 lod2 = this->result;
1321 ir->lod_info.lod->accept(this);
1326 /* Writemasking doesn't eliminate channels on SIMD8 texture
1327 * samples, so don't worry about them.
1329 fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1331 if (intel->gen >= 7) {
1332 inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1334 } else if (intel->gen >= 5) {
1335 inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1338 inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1342 /* The header is set up by generate_tex() when necessary. */
1343 inst->src[0] = reg_undef;
1345 if (ir->offset != NULL && ir->op != ir_txf)
1346 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1348 inst->sampler = sampler;
1350 if (ir->shadow_comparitor)
1351 inst->shadow_compare = true;
1353 swizzle_result(ir, dst, sampler);
1357 * Swizzle the result of a texture lookup. This is necessary for
1358 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
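/* For instance, a texture swizzle of (R, R, R, ONE) -- a common way to fake a
 * legacy luminance format -- becomes three MOVs from the sampled red channel
 * plus a MOV of 1.0f, following the per-channel loop below.  (Example only;
 * any combination of the GL swizzle selectors can appear.)
 */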
1361 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1363 this->result = orig_val;
1365 if (ir->op == ir_txs)
1368 if (ir->type == glsl_type::float_type) {
1369 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1370 assert(ir->sampler->type->sampler_shadow);
1371 } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1372 fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1374 for (int i = 0; i < 4; i++) {
1375 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1376 fs_reg l = swizzled_result;
1379 if (swiz == SWIZZLE_ZERO) {
1380 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1381 } else if (swiz == SWIZZLE_ONE) {
1382 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1384 fs_reg r = orig_val;
1385 r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1386 emit(BRW_OPCODE_MOV, l, r);
1389 this->result = swizzled_result;
1394 fs_visitor::visit(ir_swizzle *ir)
1396 ir->val->accept(this);
1397 fs_reg val = this->result;
1399 if (ir->type->vector_elements == 1) {
1400 this->result.reg_offset += ir->mask.x;
1404 fs_reg result = fs_reg(this, ir->type);
1405 this->result = result;
1407 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1408 fs_reg channel = val;
1426 channel.reg_offset += swiz;
1427 emit(BRW_OPCODE_MOV, result, channel);
1428 result.reg_offset++;
1433 fs_visitor::visit(ir_discard *ir)
1435 assert(ir->condition == NULL); /* FINISHME */
1437 emit(FS_OPCODE_DISCARD);
1441 fs_visitor::visit(ir_constant *ir)
1443 /* Set this->result to reg at the bottom of the function because some code
1444 * paths will cause this visitor to be applied to other fields. This will
1445 * cause the value stored in this->result to be modified.
1447 * Make reg constant so that it doesn't get accidentally modified along the
1448 * way. Yes, I actually had this problem. :(
1450 const fs_reg reg(this, ir->type);
1451 fs_reg dst_reg = reg;
1453 if (ir->type->is_array()) {
1454 const unsigned size = type_size(ir->type->fields.array);
1456 for (unsigned i = 0; i < ir->type->length; i++) {
1457 ir->array_elements[i]->accept(this);
1458 fs_reg src_reg = this->result;
1460 dst_reg.type = src_reg.type;
1461 for (unsigned j = 0; j < size; j++) {
1462 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1463 src_reg.reg_offset++;
1464 dst_reg.reg_offset++;
1467 } else if (ir->type->is_record()) {
1468 foreach_list(node, &ir->components) {
1469 ir_constant *const field = (ir_constant *) node;
1470 const unsigned size = type_size(field->type);
1472 field->accept(this);
1473 fs_reg src_reg = this->result;
1475 dst_reg.type = src_reg.type;
1476 for (unsigned j = 0; j < size; j++) {
1477 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1478 src_reg.reg_offset++;
1479 dst_reg.reg_offset++;
1483 const unsigned size = type_size(ir->type);
1485 for (unsigned i = 0; i < size; i++) {
1486 switch (ir->type->base_type) {
1487 case GLSL_TYPE_FLOAT:
1488 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1490 case GLSL_TYPE_UINT:
1491 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1494 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1496 case GLSL_TYPE_BOOL:
1497 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1500 assert(!"Non-float/uint/int/bool constant");
1502 dst_reg.reg_offset++;
1510 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1512 ir_expression *expr = ir->as_expression();
1518 assert(expr->get_num_operands() <= 2);
1519 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1520 assert(expr->operands[i]->type->is_scalar());
1522 expr->operands[i]->accept(this);
1523 op[i] = this->result;
1525 resolve_ud_negate(&op[i]);
1528 switch (expr->operation) {
1529 case ir_unop_logic_not:
1530 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1531 inst->conditional_mod = BRW_CONDITIONAL_Z;
1534 case ir_binop_logic_xor:
1535 case ir_binop_logic_or:
1536 case ir_binop_logic_and:
1540 if (intel->gen >= 6) {
1541 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1543 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1545 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1549 if (intel->gen >= 6) {
1550 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1552 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1554 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1557 case ir_binop_greater:
1558 case ir_binop_gequal:
1560 case ir_binop_lequal:
1561 case ir_binop_equal:
1562 case ir_binop_all_equal:
1563 case ir_binop_nequal:
1564 case ir_binop_any_nequal:
1565 resolve_bool_comparison(expr->operands[0], &op[0]);
1566 resolve_bool_comparison(expr->operands[1], &op[1]);
1568 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1569 inst->conditional_mod =
1570 brw_conditional_for_comparison(expr->operation);
1574 assert(!"not reached");
1575 fail("bad cond code\n");
1584 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1585 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1589 * Emit a gen6 IF statement with the comparison folded into the IF
1593 fs_visitor::emit_if_gen6(ir_if *ir)
1595 ir_expression *expr = ir->condition->as_expression();
1602 assert(expr->get_num_operands() <= 2);
1603 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1604 assert(expr->operands[i]->type->is_scalar());
1606 expr->operands[i]->accept(this);
1607 op[i] = this->result;
1610 switch (expr->operation) {
1611 case ir_unop_logic_not:
1612 case ir_binop_logic_xor:
1613 case ir_binop_logic_or:
1614 case ir_binop_logic_and:
1615 /* For operations on bool arguments, only the low bit of the bool is
1616 * valid, and the others are undefined. Fall back to the condition-code path.
1622 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1623 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1627 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1628 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1631 case ir_binop_greater:
1632 case ir_binop_gequal:
1634 case ir_binop_lequal:
1635 case ir_binop_equal:
1636 case ir_binop_all_equal:
1637 case ir_binop_nequal:
1638 case ir_binop_any_nequal:
1639 resolve_bool_comparison(expr->operands[0], &op[0]);
1640 resolve_bool_comparison(expr->operands[1], &op[1]);
1642 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1643 inst->conditional_mod =
1644 brw_conditional_for_comparison(expr->operation);
1647 assert(!"not reached");
1648 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1649 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1650 fail("bad condition\n");
1655 emit_bool_to_cond_code(ir->condition);
1656 fs_inst *inst = emit(BRW_OPCODE_IF);
1657 inst->predicated = true;
1661 fs_visitor::visit(ir_if *ir)
1665 if (intel->gen < 6 && c->dispatch_width == 16) {
1666 fail("Can't support (non-uniform) control flow on 16-wide\n");
1669 /* Don't point the annotation at the if statement, because then it plus
1670 * the then and else blocks get printed.
1672 this->base_ir = ir->condition;
1674 if (intel->gen == 6) {
1677 emit_bool_to_cond_code(ir->condition);
1679 inst = emit(BRW_OPCODE_IF);
1680 inst->predicated = true;
1683 foreach_list(node, &ir->then_instructions) {
1684 ir_instruction *ir = (ir_instruction *)node;
1690 if (!ir->else_instructions.is_empty()) {
1691 emit(BRW_OPCODE_ELSE);
1693 foreach_list(node, &ir->else_instructions) {
1694 ir_instruction *ir = (ir_instruction *)node;
1701 emit(BRW_OPCODE_ENDIF);
1705 fs_visitor::visit(ir_loop *ir)
1707 fs_reg counter = reg_undef;
1709 if (intel->gen < 6 && c->dispatch_width == 16) {
1710 fail("Can't support (non-uniform) control flow on 16-wide\n");
1714 this->base_ir = ir->counter;
1715 ir->counter->accept(this);
1716 counter = *(variable_storage(ir->counter));
1719 this->base_ir = ir->from;
1720 ir->from->accept(this);
1722 emit(BRW_OPCODE_MOV, counter, this->result);
1726 this->base_ir = NULL;
1727 emit(BRW_OPCODE_DO);
1730 this->base_ir = ir->to;
1731 ir->to->accept(this);
1733 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1734 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1736 inst = emit(BRW_OPCODE_BREAK);
1737 inst->predicated = true;
1740 foreach_list(node, &ir->body_instructions) {
1741 ir_instruction *ir = (ir_instruction *)node;
1747 if (ir->increment) {
1748 this->base_ir = ir->increment;
1749 ir->increment->accept(this);
1750 emit(BRW_OPCODE_ADD, counter, counter, this->result);
1753 this->base_ir = NULL;
1754 emit(BRW_OPCODE_WHILE);
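/* Putting the pieces of this visitor together, a counted loop of the form
 * for (i = from; i CMP to; i += increment) { body } is lowered roughly to:
 *
 *    mov      counter, from
 *    do
 *       cmp.CMP null, counter, to
 *       (+f0) break
 *       ...body...
 *       add      counter, counter, increment
 *    while
 *
 * (Sketch of the structure emitted above; loops without an analyzed counter
 * simply skip the counter pieces.)
 */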
1758 fs_visitor::visit(ir_loop_jump *ir)
1761 case ir_loop_jump::jump_break:
1762 emit(BRW_OPCODE_BREAK);
1764 case ir_loop_jump::jump_continue:
1765 emit(BRW_OPCODE_CONTINUE);
1771 fs_visitor::visit(ir_call *ir)
1773 assert(!"FINISHME");
1777 fs_visitor::visit(ir_return *ir)
1779 assert(!"FINISHME");
1783 fs_visitor::visit(ir_function *ir)
1785 /* Ignore function bodies other than main() -- we shouldn't see calls to
1786 * them since they should all be inlined before we get to ir_to_mesa.
1788 if (strcmp(ir->name, "main") == 0) {
1789 const ir_function_signature *sig;
1792 sig = ir->matching_signature(&empty);
1796 foreach_list(node, &sig->body) {
1797 ir_instruction *ir = (ir_instruction *)node;
1806 fs_visitor::visit(ir_function_signature *ir)
1808 assert(!"not reached");
1813 fs_visitor::emit(fs_inst inst)
1815 fs_inst *list_inst = new(mem_ctx) fs_inst;
1818 if (force_uncompressed_stack > 0)
1819 list_inst->force_uncompressed = true;
1820 else if (force_sechalf_stack > 0)
1821 list_inst->force_sechalf = true;
1823 list_inst->annotation = this->current_annotation;
1824 list_inst->ir = this->base_ir;
1826 this->instructions.push_tail(list_inst);
1831 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1833 fs_visitor::emit_dummy_fs()
1835 int reg_width = c->dispatch_width / 8;
1837 /* Everyone's favorite color. */
1838 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f));
1839 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f));
1840 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f));
1841 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f));
1844 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1845 write->base_mrf = 2;
1846 write->mlen = 4 * reg_width;
1850 /* The register location here is relative to the start of the URB
1851 * data. It will get adjusted to be a real location before
1852 * generate_code() time.
1855 fs_visitor::interp_reg(int location, int channel)
1857 int regnr = urb_setup[location] * 2 + channel / 2;
1858 int stride = (channel & 1) * 4;
1860 assert(urb_setup[location] != -1);
1862 return brw_vec1_grf(regnr, stride);
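/* Rough interpretation of the arithmetic above: each varying gets two
 * registers of interpolation setup data, with channels 0-1 of the attribute
 * in the first register and channels 2-3 in the second, each starting at
 * dword 0 or 4.  That is what the regnr and stride expressions encode; this
 * is a reading of the math, not an authoritative layout description.
 */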
1865 /** Emits the interpolation for the varying inputs. */
1867 fs_visitor::emit_interpolation_setup_gen4()
1869 this->current_annotation = "compute pixel centers";
1870 this->pixel_x = fs_reg(this, glsl_type::uint_type);
1871 this->pixel_y = fs_reg(this, glsl_type::uint_type);
1872 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1873 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1875 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1876 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1878 this->current_annotation = "compute pixel deltas from v0";
1880 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1881 fs_reg(this, glsl_type::vec2_type);
1882 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1884 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
1886 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1887 fs_reg(this, glsl_type::float_type);
1888 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1889 fs_reg(this, glsl_type::float_type);
1891 emit(BRW_OPCODE_ADD, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1892 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1893 emit(BRW_OPCODE_ADD, this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1894 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1896 this->current_annotation = "compute pos.w and 1/pos.w";
1897 /* Compute wpos.w. It's always in our setup, since it's needed to
1898 * interpolate the other attributes.
1900 this->wpos_w = fs_reg(this, glsl_type::float_type);
1901 emit(FS_OPCODE_LINTERP, wpos_w,
1902 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1903 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1904 interp_reg(FRAG_ATTRIB_WPOS, 3));
1905 /* Compute the pixel 1/W value from wpos.w. */
1906 this->pixel_w = fs_reg(this, glsl_type::float_type);
1907 emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1908 this->current_annotation = NULL;
1911 /** Emits the interpolation for the varying inputs. */
1913 fs_visitor::emit_interpolation_setup_gen6()
1915 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1917 /* If the pixel centers end up used, the setup is the same as for gen4. */
1918 this->current_annotation = "compute pixel centers";
1919 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1920 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1921 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1922 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1923 emit(BRW_OPCODE_ADD,
1925 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1926 fs_reg(brw_imm_v(0x10101010)));
1927 emit(BRW_OPCODE_ADD,
1929 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1930 fs_reg(brw_imm_v(0x11001100)));
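/* The brw_imm_v() values above are packed vectors of eight 4-bit immediates,
 * read from the low nibble up: 0x10101010 supplies the per-pixel X offsets
 * 0,1,0,1,... and 0x11001100 the Y offsets 0,0,1,1,... so the 2x2 subspan
 * origins delivered in g1 are fanned out into per-pixel integer centers.
 * (Explanatory note; nibble ordering as I understand the V immediate format.)
 */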
1932 /* As of gen6, we can no longer mix float and int sources. We have
1933 * to turn the integer pixel centers into floats for their actual use.
1936 this->pixel_x = fs_reg(this, glsl_type::float_type);
1937 this->pixel_y = fs_reg(this, glsl_type::float_type);
1938 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
1939 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
1941 this->current_annotation = "compute pos.w";
1942 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1943 this->wpos_w = fs_reg(this, glsl_type::float_type);
1944 emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1946 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1947 uint8_t reg = c->barycentric_coord_reg[i];
1948 this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
1949 this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
1952 this->current_annotation = NULL;
1956 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
1958 int reg_width = c->dispatch_width / 8;
1960 fs_reg color = outputs[target];
1963 /* If there's no color data to be written, skip it. */
1964 if (color.file == BAD_FILE)
1967 color.reg_offset += index;
1969 if (c->dispatch_width == 8 || intel->gen >= 6) {
1970 /* SIMD8 write looks like:
1976 * gen6 SIMD16 DP write looks like:
1986 inst = emit(BRW_OPCODE_MOV,
1987 fs_reg(MRF, first_color_mrf + index * reg_width, color.type),
1989 inst->saturate = c->key.clamp_fragment_color;
1991 /* pre-gen6 SIMD16 single source DP write looks like:
2001 if (brw->has_compr4) {
2002 /* By setting the high bit of the MRF register number, we
2003 * indicate that we want COMPR4 mode - instead of doing the
2004 * usual destination + 1 for the second half, we get the second half at destination + 4.
2007 inst = emit(BRW_OPCODE_MOV,
2008 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2011 inst->saturate = c->key.clamp_fragment_color;
2013 push_force_uncompressed();
2014 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index,
2017 inst->saturate = c->key.clamp_fragment_color;
2018 pop_force_uncompressed();
2020 push_force_sechalf();
2021 color.sechalf = true;
2022 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4,
2025 inst->saturate = c->key.clamp_fragment_color;
2026 pop_force_sechalf();
2027 color.sechalf = false;
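/* To summarize the pre-gen6 SIMD16 path above: each color channel is split
 * into two SIMD8 MOVs whose second half lands 4 MRFs above the first, either
 * implicitly via the COMPR4 register-number trick or explicitly with the
 * force_uncompressed/force_sechalf pair.  (Summary of this block of code.)
 */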
2033 fs_visitor::emit_fb_writes()
2035 this->current_annotation = "FB write header";
2036 bool header_present = true;
2037 /* We can potentially have a message length of up to 15, so we have to set
2038 * base_mrf to either 0 or 1 in order to fit in m0..m15.
2042 int reg_width = c->dispatch_width / 8;
2043 bool do_dual_src = this->dual_src_output.file != BAD_FILE;
2044 bool src0_alpha_to_render_target = false;
2046 if (c->dispatch_width == 16 && do_dual_src) {
2047 fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
2048 do_dual_src = false;
2051 /* From the Sandy Bridge PRM, volume 4, page 198:
2053 * "Dispatched Pixel Enables. One bit per pixel indicating
2054 * which pixels were originally enabled when the thread was
2055 * dispatched. This field is only required for the end-of-
2056 * thread message and on all dual-source messages."
2058 if (intel->gen >= 6 &&
2059 !this->fp->UsesKill &&
2061 c->key.nr_color_regions == 1) {
2062 header_present = false;
2065 if (header_present) {
2066 src0_alpha_to_render_target = intel->gen >= 6 &&
2068 c->key.nr_color_regions > 1 &&
2069 c->key.sample_alpha_to_coverage;
2074 if (c->aa_dest_stencil_reg) {
2075 push_force_uncompressed();
2076 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2077 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2078 pop_force_uncompressed();
2081 /* Reserve space for color. It'll be filled in per MRT below. */
2083 nr += 4 * reg_width;
2086 if (src0_alpha_to_render_target)
2089 if (c->source_depth_to_render_target) {
2090 if (intel->gen == 6 && c->dispatch_width == 16) {
2091 /* For outputting oDepth on gen6, SIMD8 writes have to be
2092 * used. This would require 8-wide moves of each half to
2093 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2094 * Just bail on doing so for now.
2096 fail("Missing support for simd16 depth writes on gen6\n");
2099 if (c->computes_depth) {
2100 /* Hand over gl_FragDepth. */
2101 assert(this->frag_depth);
2102 fs_reg depth = *(variable_storage(this->frag_depth));
2104 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2106 /* Pass through the payload depth. */
2107 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2108 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2113 if (c->dest_depth_reg) {
2114 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2115 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2120 fs_reg src0 = this->outputs[0];
2121 fs_reg src1 = this->dual_src_output;
2123 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2125 for (int i = 0; i < 4; i++) {
2126 fs_inst *inst = emit(BRW_OPCODE_MOV,
2127 fs_reg(MRF, color_mrf + i, src0.type),
2130 inst->saturate = c->key.clamp_fragment_color;
2133 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2135 for (int i = 0; i < 4; i++) {
2136 fs_inst *inst = emit(BRW_OPCODE_MOV,
2137 fs_reg(MRF, color_mrf + 4 + i, src1.type),
2140 inst->saturate = c->key.clamp_fragment_color;
2143 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2145 inst->base_mrf = base_mrf;
2146 inst->mlen = nr - base_mrf;
2148 inst->header_present = header_present;
2150 c->prog_data.dual_src_blend = true;
2151 this->current_annotation = NULL;
2155 for (int target = 0; target < c->key.nr_color_regions; target++) {
2156 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2157 "FB write target %d",
2159 /* If src0_alpha_to_render_target is true, include source zero alpha
2160 * data in RenderTargetWrite message for targets > 0.
2162 int write_color_mrf = color_mrf;
2163 if (src0_alpha_to_render_target && target != 0) {
2165 fs_reg color = outputs[0];
2166 color.reg_offset += 3;
2168 inst = emit(BRW_OPCODE_MOV,
2169 fs_reg(MRF, write_color_mrf, color.type),
2171 inst->saturate = c->key.clamp_fragment_color;
2172 write_color_mrf = color_mrf + reg_width;
2175 for (unsigned i = 0; i < this->output_components[target]; i++)
2176 emit_color_write(target, i, write_color_mrf);
2178 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2179 inst->target = target;
2180 inst->base_mrf = base_mrf;
2181 if (src0_alpha_to_render_target && target == 0)
2182 inst->mlen = nr - base_mrf - reg_width;
2184 inst->mlen = nr - base_mrf;
2185 if (target == c->key.nr_color_regions - 1)
2187 inst->header_present = header_present;
2190 if (c->key.nr_color_regions == 0) {
2191 /* Even if there's no color buffers enabled, we still need to send
2192 * alpha out the pipeline to our null renderbuffer to support
2193 * alpha-testing, alpha-to-coverage, and so on.
2195 emit_color_write(0, 3, color_mrf);
2197 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2198 inst->base_mrf = base_mrf;
2199 inst->mlen = nr - base_mrf;
2201 inst->header_present = header_present;
2204 this->current_annotation = NULL;
2208 fs_visitor::resolve_ud_negate(fs_reg *reg)
2210 if (reg->type != BRW_REGISTER_TYPE_UD ||
2214 fs_reg temp = fs_reg(this, glsl_type::uint_type);
2215 emit(BRW_OPCODE_MOV, temp, *reg);
2220 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2222 if (rvalue->type != glsl_type::bool_type)
2225 fs_reg temp = fs_reg(this, glsl_type::bool_type);
2226 emit(BRW_OPCODE_AND, temp, *reg, fs_reg(1));
2230 fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
2231 struct brw_shader *shader)
2236 this->fp = (struct gl_fragment_program *)
2237 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2239 this->intel = &brw->intel;
2240 this->ctx = &intel->ctx;
2241 this->mem_ctx = ralloc_context(NULL);
2242 this->shader = shader;
2243 this->failed = false;
2244 this->variable_ht = hash_table_ctor(0,
2245 hash_table_pointer_hash,
2246 hash_table_pointer_compare);
2248 /* There's a question that appears to be left open in the spec:
2249 * How do implicit dst conversions interact with the CMP
2250 * instruction or conditional mods? On gen6, the instruction:
2252 * CMP null<d> src0<f> src1<f>
2254 * will do src1 - src0 and compare that result as if it was an
2255 * integer. On gen4, it will do src1 - src0 as float, convert
2256 * the result to int, and compare as int. In between, it
2257 * appears that it does src1 - src0 and does the compare in the
2258 * execution type so dst type doesn't matter.
2260 if (this->intel->gen > 4)
2261 this->reg_null_cmp = reg_null_d;
2263 this->reg_null_cmp = reg_null_f;
2265 this->frag_depth = NULL;
2266 memset(this->outputs, 0, sizeof(this->outputs));
2267 memset(this->output_components, 0, sizeof(this->output_components));
2268 this->first_non_payload_grf = 0;
2269 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2271 this->current_annotation = NULL;
2272 this->base_ir = NULL;
2274 this->virtual_grf_sizes = NULL;
2275 this->virtual_grf_count = 0;
2276 this->virtual_grf_array_size = 0;
2277 this->virtual_grf_def = NULL;
2278 this->virtual_grf_use = NULL;
2279 this->live_intervals_valid = false;
2281 this->force_uncompressed_stack = 0;
2282 this->force_sechalf_stack = 0;
2285 fs_visitor::~fs_visitor()
2287 ralloc_free(this->mem_ctx);
2288 hash_table_dtor(this->variable_ht);