2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "brw_context.h"
/* Decide whether the two delta registers are laid out so that a single PLN
 * instruction can consume them.
 *
 * The visible tests look for deltas[1] NOT being the register immediately
 * after deltas[0] and, on hardware older than gen6, for deltas[0] having an
 * odd register number; presumably either one rules PLN out.
 *
 * NOTE(review): the statements these conditions guard, and the use made of
 * `brw`, are not visible in this excerpt -- confirm against the full source.
 */
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
40 struct brw_context *brw = brw_context(&intel->ctx);
45 if (deltas[1].nr != deltas[0].nr + 1)
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
/* Takes a register descriptor by value and returns a register descriptor.
 * The body is not visible in this excerpt.  Callers in this file apply it to
 * a source operand right after selecting BRW_COMPRESSION_2NDHALF, so it
 * presumably yields the second half of a 16-wide operand -- TODO confirm.
 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
64 /* Return whether the given SrcReg index of an opcode can be an immediate
65 * float operand instead of a PROGRAM_CONSTANT value fetched through push/pull.
/* Report whether source argument `arg` of `opcode` may be an immediate.
 *
 * MAD and LRP are special-cased for arg 1 and arg 2.  Every other opcode is
 * looked up in opcode_array, and the answer is whether `arg` equals the
 * table entry minus one.
 *
 * The range check uses >= rather than >: with the previous test an opcode
 * equal to ARRAY_SIZE(opcode_array) passed the check and then indexed one
 * element past the end of the table.
 *
 * NOTE(review): the table contents and the statements guarded by the
 * conditions are not visible in this excerpt.
 */
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
70 int opcode_array[] = {
90 /* These opcodes get broken down in a way that allow two
91 * args to be immediates.
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
98 if (opcode >= ARRAY_SIZE(opcode_array))
101 return arg == opcode_array[opcode] - 1;
105 * Computes the screen-space x,y position of the pixels.
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes.
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
/* Emit instructions that fill dst[0] (X) and dst[1] (Y), viewed as unsigned
 * words, from the packed tile coordinates held in r1.
 *
 * The destinations are taken as vec16 when c->dispatch_width is 16 and as
 * vec8 on the other visible path.  Each half is emitted only when its
 * WRITEMASK bit is set in `mask`: X reads r1 from uword 4 and Y from uword 5,
 * both through stride(..., 2, 4, 0), together with a brw_imm_v() immediate
 * (0x10101010 for X, 0x11001100 for Y).  The instruction state is pushed on
 * entry, compression is disabled, and the state is popped on exit.
 *
 * NOTE(review): the opcode issued for each half is not visible in this
 * excerpt; the comment inside the body describes it as an add.
 */
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
148 if (mask & WRITEMASK_X) {
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
155 if (mask & WRITEMASK_Y) {
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
161 brw_pop_insn_state(p);
165 * Computes the screen-space x,y distance of the pixels from the start
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
/* Emit the per-pixel X/Y deltas relative to the origin held in r1.
 *
 * `mask` must be exactly WRITEMASK_XY (asserted).  arg0[0] and arg0[1] are
 * read as unsigned words.  On gen6 and later r1 is replaced by an all-zero
 * brw_imm_v() immediate; on the other path the visible Y operand is the
 * negated r1 at sub-offset 1.
 *
 * NOTE(review): the emitted opcodes and their destination operands are not
 * visible in this excerpt.
 */
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
175 const struct brw_reg *arg0)
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
183 assert(mask == WRITEMASK_XY);
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187 Just add them with 0.0 for dst reg. */
188 r1 = brw_imm_v(0x00000000);
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
214 * Computes the pixel offset from the window origin for gl_FragCoord().
/* Emit the window-relative X/Y position used for gl_FragCoord.
 *
 * arg0[0] and arg0[1] are read as signed words.  On gen6 and later each one
 * is first converted by a MOV into the same register retyped as float.
 *   X: copied to dst[0] when the program uses integer pixel centres; the
 *      other visible instruction adds 0.5.
 *   Y: with an upper-left origin it is handled like X into dst[1]; the
 *      remaining path emits dst[1] = -Y + (drawable_height - 1 + offset),
 *      where offset is 0.0 for integer pixel centres and 0.5 otherwise.
 *
 * NOTE(review): the else keywords and some statements between the visible
 * lines are missing from this excerpt.
 */
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
219 const struct brw_reg *arg0)
221 struct brw_compile *p = &c->func;
222 struct intel_context *intel = &p->brw->intel;
223 struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
224 struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
226 if (mask & WRITEMASK_X) {
227 if (intel->gen >= 6) {
/* Word-to-float conversion done in place: same register, float view. */
228 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
229 brw_MOV(p, delta_x_f, delta_x);
233 if (c->fp->program.PixelCenterInteger) {
235 brw_MOV(p, dst[0], delta_x);
238 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
242 if (mask & WRITEMASK_Y) {
243 if (intel->gen >= 6) {
/* Same in-place conversion as for X. */
244 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
245 brw_MOV(p, delta_y_f, delta_y);
249 if (c->fp->program.OriginUpperLeft) {
250 if (c->fp->program.PixelCenterInteger) {
252 brw_MOV(p, dst[1], delta_y);
254 brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
257 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
259 /* Y' = (height - 1) - Y + center */
260 brw_ADD(p, dst[1], negate(delta_y),
261 brw_imm_f(c->key.drawable_height - 1 + center_offset));
/* Emit the W component of the pixel position.  Pre-gen6 only (asserted).
 *
 * When WRITEMASK_W is set, the coefficient at sub-register 4 of GRF
 * arg0[0].nr + 1 is interpolated over the deltas into temp_dst (assigned
 * message register 2 in the visible code), using PLN when can_do_pln()
 * accepts the deltas; the other visible sequence is LINE into the null
 * register followed by MAC.  A full-precision INV math operation follows; in
 * 16-wide dispatch it is issued through brw_math_16() with dst[3] as the
 * destination.
 *
 * NOTE(review): the declaration of `src`, the condition under which temp_dst
 * is assigned, and parts of the math argument lists are not visible in this
 * excerpt.
 */
267 void emit_pixel_w(struct brw_wm_compile *c,
268 const struct brw_reg *dst,
270 const struct brw_reg *arg0,
271 const struct brw_reg *deltas)
273 struct brw_compile *p = &c->func;
274 struct intel_context *intel = &p->brw->intel;
276 struct brw_reg temp_dst;
281 temp_dst = brw_message_reg(2);
283 assert(intel->gen < 6);
285 /* Don't need this if all you are doing is interpolating color, for
288 if (mask & WRITEMASK_W) {
289 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
291 /* Calc 1/w - just linterp wpos[3] optimized by putting the
292 * result straight into a message reg.
294 if (can_do_pln(intel, deltas)) {
295 brw_PLN(p, temp_dst, interp3, deltas[0]);
297 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
298 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
305 src = brw_null_reg();
307 if (c->dispatch_width == 16) {
308 brw_math_16(p, dst[3],
309 BRW_MATH_FUNCTION_INV,
310 BRW_MATH_SATURATE_NONE,
312 BRW_MATH_PRECISION_FULL);
315 BRW_MATH_FUNCTION_INV,
316 BRW_MATH_SATURATE_NONE,
318 BRW_MATH_DATA_VECTOR,
319 BRW_MATH_PRECISION_FULL);
/* Emit interpolation of one attribute over up to four channels.
 *
 * The per-channel coefficient sets are addressed as vec1 registers at
 * sub-registers 0 and 4 of GRF arg0[0].nr and of the GRF after it.  For each
 * channel i:
 *   - gen6 and later: PLN of interp[i] against brw_vec8_grf(2, 0); `deltas`
 *     is not read on this path;
 *   - otherwise, when can_do_pln() accepts the deltas: PLN against deltas[0];
 *   - remaining visible sequence: LINE into the null register with
 *     deltas[0], then MAC of interp[i] at sub-offset 1 with deltas[1].
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
324 void emit_linterp(struct brw_compile *p,
325 const struct brw_reg *dst,
327 const struct brw_reg *arg0,
328 const struct brw_reg *deltas)
330 struct intel_context *intel = &p->brw->intel;
331 struct brw_reg interp[4];
332 GLuint nr = arg0[0].nr;
/* Two coefficient sets per GRF: sub-registers 0 and 4. */
335 interp[0] = brw_vec1_grf(nr, 0);
336 interp[1] = brw_vec1_grf(nr, 4);
337 interp[2] = brw_vec1_grf(nr+1, 0);
338 interp[3] = brw_vec1_grf(nr+1, 4);
340 for (i = 0; i < 4; i++) {
342 if (intel->gen >= 6) {
343 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
344 } else if (can_do_pln(intel, deltas)) {
345 brw_PLN(p, dst[i], interp[i], deltas[0]);
347 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
348 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
/* Emit interpolation of one attribute followed by a multiply by w[3].
 *
 * On gen6 and later the work is handed to emit_linterp().  On the other
 * path each channel is interpolated with PLN when can_do_pln() accepts the
 * deltas (the other visible sequence is LINE then MAC), and a second loop
 * multiplies each dst[i] by w[3].
 *
 * NOTE(review): the gen6 call passes the local `interp` array, which has not
 * been initialised at that point, as the deltas argument.  The visible gen6
 * branch of emit_linterp() does not read its deltas, so this looks harmless,
 * but passing `deltas` would be clearer -- confirm.
 * NOTE(review): any per-channel test of `mask` and the statement following
 * the gen6 call are not visible in this excerpt.
 */
355 void emit_pinterp(struct brw_compile *p,
356 const struct brw_reg *dst,
358 const struct brw_reg *arg0,
359 const struct brw_reg *deltas,
360 const struct brw_reg *w)
362 struct intel_context *intel = &p->brw->intel;
363 struct brw_reg interp[4];
364 GLuint nr = arg0[0].nr;
367 if (intel->gen >= 6) {
368 emit_linterp(p, dst, mask, arg0, interp);
/* Two coefficient sets per GRF: sub-registers 0 and 4. */
372 interp[0] = brw_vec1_grf(nr, 0);
373 interp[1] = brw_vec1_grf(nr, 4);
374 interp[2] = brw_vec1_grf(nr+1, 0);
375 interp[3] = brw_vec1_grf(nr+1, 4);
377 for (i = 0; i < 4; i++) {
379 if (can_do_pln(intel, deltas)) {
380 brw_PLN(p, dst[i], interp[i], deltas[0]);
382 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
383 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
/* Second pass: scale every interpolated channel by w[3]. */
387 for (i = 0; i < 4; i++) {
389 brw_MUL(p, dst[i], dst[i], w[3]);
/* Emit a plain copy of one attribute, with no use of pixel deltas.
 *
 * The per-channel coefficient sets are addressed exactly as in the other
 * interpolators (sub-registers 0 and 4 of GRF arg0[0].nr and of the next
 * GRF); for each channel the element at sub-offset 3 of its set is moved
 * into dst[i].
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
395 void emit_cinterp(struct brw_compile *p,
396 const struct brw_reg *dst,
398 const struct brw_reg *arg0)
400 struct brw_reg interp[4];
401 GLuint nr = arg0[0].nr;
404 interp[0] = brw_vec1_grf(nr, 0);
405 interp[1] = brw_vec1_grf(nr, 4);
406 interp[2] = brw_vec1_grf(nr+1, 0);
407 interp[3] = brw_vec1_grf(nr+1, 4);
409 for (i = 0; i < 4; i++) {
411 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
416 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
/* Visible sequence: test `mask` for any XYZW channel, write 0.0 to the
 * channels, compare r1.6 (read as an unsigned dword) against bit 31 with the
 * "less than" conditional, write 1.0 to the channels, and finally set the
 * predicate control flag value to 0xff.
 *
 * The bit-31 constant is written as 1u << 31: shifting a signed 1 into the
 * sign bit of an int is undefined behaviour in C, while the unsigned form
 * yields the same 0x80000000 immediate with defined semantics.
 *
 * NOTE(review): the statement guarded by the mask test, any per-channel
 * mask test, and the predication applied to the second set of MOVs are not
 * visible in this excerpt.
 */
417 void emit_frontfacing(struct brw_compile *p,
418 const struct brw_reg *dst,
421 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
424 if (!(mask & WRITEMASK_XYZW))
427 for (i = 0; i < 4; i++) {
429 brw_MOV(p, dst[i], brw_imm_f(0.0));
433 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
436 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1u << 31));
437 for (i = 0; i < 4; i++) {
439 brw_MOV(p, dst[i], brw_imm_f(1.0));
442 brw_set_predicate_control_flag_value(p, 0xff);
445 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
448 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
450 * and we're trying to produce:
453 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
454 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
455 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
456 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
457 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
458 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
459 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
460 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
462 * and add another set of two more subspans if in 16-pixel dispatch mode.
464 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
465 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
466 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
467 * between each other. We could probably do it like ddx and swizzle the right
468 * order later, but bail for now and just produce
469 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
/* Emit screen-space derivatives for up to four channels.
 *
 * For each channel two views of arg0[i] are built with horizontal stride 0
 * and the difference src0 - src1 is written to dst[i]:
 *   - first pair:  src0 starts at element 1, src1 at element 0, vertical
 *     stride 2 (matches the DDX description in the comment preceding this
 *     function);
 *   - second pair: src0 starts at element 0, src1 at element 2, vertical
 *     stride 4 (matches the simplified DDY described there).
 * Saturation is switched on before the loop and off after it.
 *
 * NOTE(review): the test that selects between the two pairs, the conditions
 * on the saturate calls, and some brw_reg() arguments are not visible in
 * this excerpt.
 */
471 void emit_ddxy(struct brw_compile *p,
472 const struct brw_reg *dst,
475 const struct brw_reg *arg0)
478 struct brw_reg src0, src1;
481 brw_set_saturate(p, 1);
482 for (i = 0; i < 4; i++ ) {
485 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
487 BRW_VERTICAL_STRIDE_2,
489 BRW_HORIZONTAL_STRIDE_0,
490 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
491 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
493 BRW_VERTICAL_STRIDE_2,
495 BRW_HORIZONTAL_STRIDE_0,
496 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
498 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
500 BRW_VERTICAL_STRIDE_4,
502 BRW_HORIZONTAL_STRIDE_0,
503 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
504 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
506 BRW_VERTICAL_STRIDE_4,
508 BRW_HORIZONTAL_STRIDE_0,
509 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
511 brw_ADD(p, dst[i], src0, negate(src1));
515 brw_set_saturate(p, 0);
/* Apply the one-source instruction emitter `func` to each of the four
 * channels: func(p, dst[i], arg0[i]).  Saturation is switched on before the
 * loop and off after it.
 *
 * NOTE(review): the conditions guarding the saturate calls and any
 * per-channel test of `mask` are not visible in this excerpt.
 */
518 void emit_alu1(struct brw_compile *p,
519 struct brw_instruction *(*func)(struct brw_compile *,
522 const struct brw_reg *dst,
524 const struct brw_reg *arg0)
529 brw_set_saturate(p, 1);
531 for (i = 0; i < 4; i++) {
533 func(p, dst[i], arg0[i]);
538 brw_set_saturate(p, 0);
/* Apply the two-source instruction emitter `func` to each of the four
 * channels: func(p, dst[i], arg0[i], arg1[i]).  Saturation is switched on
 * before the loop and off after it.
 *
 * NOTE(review): the conditions guarding the saturate calls and any
 * per-channel test of `mask` are not visible in this excerpt.
 */
542 void emit_alu2(struct brw_compile *p,
543 struct brw_instruction *(*func)(struct brw_compile *,
547 const struct brw_reg *dst,
549 const struct brw_reg *arg0,
550 const struct brw_reg *arg1)
555 brw_set_saturate(p, 1);
557 for (i = 0; i < 4; i++) {
559 func(p, dst[i], arg0[i], arg1[i]);
564 brw_set_saturate(p, 0);
/* Emit a multiply-add per channel: dst[i] = arg0[i] * arg1[i] + arg2[i].
 *
 * Implemented as a MUL into dst[i] followed by an ADD of arg2[i].  Only the
 * ADD is saturated, and only when the SATURATE bit is set in `mask`;
 * saturation is cleared again straight after it.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
568 void emit_mad(struct brw_compile *p,
569 const struct brw_reg *dst,
571 const struct brw_reg *arg0,
572 const struct brw_reg *arg1,
573 const struct brw_reg *arg2)
577 for (i = 0; i < 4; i++) {
579 brw_MUL(p, dst[i], arg0[i], arg1[i]);
581 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
582 brw_ADD(p, dst[i], dst[i], arg2[i]);
583 brw_set_saturate(p, 0);
/* Emit a linear blend per channel.
 *
 * Visible sequence for channel i: dst[i] = 1.0 - arg0[i]; MUL of dst[i] and
 * arg2[i] into the null register; MAC of arg0[i] and arg1[i] into dst[i].
 * Presumably the MUL result is carried in the accumulator, which would give
 * dst[i] = arg0[i] * arg1[i] + (1 - arg0[i]) * arg2[i] -- TODO confirm.
 * Only the final MAC is saturated, when the SATURATE bit is set in `mask`.
 * dst[i] is used as scratch, as the comment in the body notes.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
588 void emit_lrp(struct brw_compile *p,
589 const struct brw_reg *dst,
591 const struct brw_reg *arg0,
592 const struct brw_reg *arg1,
593 const struct brw_reg *arg2)
597 /* Uses dst as a temporary:
599 for (i = 0; i < 4; i++) {
601 /* Can I use the LINE instruction for this?
603 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
604 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
606 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
607 brw_MAC(p, dst[i], arg0[i], arg1[i]);
608 brw_set_saturate(p, 0);
/* Emit a "set on comparison" per channel.
 *
 * For each channel the instruction state is pushed, arg0[i] is compared with
 * arg1[i] under `cond`, 0 is written to dst[i] with predication off, 1.0 is
 * then written with normal predication, and the state is popped.
 * Presumably the net result is 1.0 where the comparison held and 0
 * elsewhere -- TODO confirm.
 *
 * NOTE(review): the `cond` parameter declaration and any per-channel test
 * of `mask` are not visible in this excerpt.
 */
613 void emit_sop(struct brw_compile *p,
614 const struct brw_reg *dst,
617 const struct brw_reg *arg0,
618 const struct brw_reg *arg1)
622 for (i = 0; i < 4; i++) {
624 brw_push_insn_state(p);
625 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
/* Unpredicated default value first ... */
626 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
627 brw_MOV(p, dst[i], brw_imm_f(0));
/* ... then the predicated overwrite. */
628 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
629 brw_MOV(p, dst[i], brw_imm_f(1.0));
630 brw_pop_insn_state(p);
/* Set-on-less-than: forwards to emit_sop() with BRW_CONDITIONAL_L. */
635 static void emit_slt( struct brw_compile *p,
636 const struct brw_reg *dst,
638 const struct brw_reg *arg0,
639 const struct brw_reg *arg1 )
641 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
/* Set-on-less-or-equal: forwards to emit_sop() with BRW_CONDITIONAL_LE. */
644 static void emit_sle( struct brw_compile *p,
645 const struct brw_reg *dst,
647 const struct brw_reg *arg0,
648 const struct brw_reg *arg1 )
650 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
/* Set-on-greater-than: forwards to emit_sop() with BRW_CONDITIONAL_G. */
653 static void emit_sgt( struct brw_compile *p,
654 const struct brw_reg *dst,
656 const struct brw_reg *arg0,
657 const struct brw_reg *arg1 )
659 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
/* Set-on-greater-or-equal: forwards to emit_sop() with BRW_CONDITIONAL_GE. */
662 static void emit_sge( struct brw_compile *p,
663 const struct brw_reg *dst,
665 const struct brw_reg *arg0,
666 const struct brw_reg *arg1 )
668 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
/* Set-on-equal: forwards to emit_sop() with BRW_CONDITIONAL_EQ. */
671 static void emit_seq( struct brw_compile *p,
672 const struct brw_reg *dst,
674 const struct brw_reg *arg0,
675 const struct brw_reg *arg1 )
677 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
/* Set-on-not-equal: forwards to emit_sop() with BRW_CONDITIONAL_NEQ. */
680 static void emit_sne( struct brw_compile *p,
681 const struct brw_reg *dst,
683 const struct brw_reg *arg0,
684 const struct brw_reg *arg1 )
686 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
/* Emit a per-channel select driven by the sign of arg0.
 *
 * For each channel arg0[i] is compared against 0 with the "less than"
 * conditional, then SEL writes dst[i] from arg1[i] / arg2[i].  Presumably
 * arg1 is chosen where the comparison held -- TODO confirm.  Only the SEL is
 * saturated, when the SATURATE bit is set in `mask`; afterwards the
 * predicate control flag value is set to 0xff.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
689 void emit_cmp(struct brw_compile *p,
690 const struct brw_reg *dst,
692 const struct brw_reg *arg0,
693 const struct brw_reg *arg1,
694 const struct brw_reg *arg2)
698 for (i = 0; i < 4; i++) {
700 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
702 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
703 brw_SEL(p, dst[i], arg1[i], arg2[i]);
704 brw_set_saturate(p, 0);
705 brw_set_predicate_control_flag_value(p, 0xff);
/* Emit the sign of arg0 per channel.
 *
 * For each channel dst[i] is first set to 0.0.  A compare of arg0[i] < 0 is
 * followed by a MOV of -1.0, and a compare of arg0[i] > 0 by a MOV of 1.0;
 * predication is switched off after each MOV.  Presumably each MOV is
 * predicated by the compare before it -- TODO confirm.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
710 void emit_sign(struct brw_compile *p,
711 const struct brw_reg *dst,
713 const struct brw_reg *arg0)
717 for (i = 0; i < 4; i++) {
719 brw_MOV(p, dst[i], brw_imm_f(0.0));
721 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
722 brw_MOV(p, dst[i], brw_imm_f(-1.0));
723 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
725 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
726 brw_MOV(p, dst[i], brw_imm_f(1.0));
727 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* Emit a per-channel maximum of arg0 and arg1.
 *
 * For each channel arg0[i] is compared with arg1[i] under the
 * "greater or equal" conditional and SEL then writes dst[i] from
 * arg0[i] / arg1[i].  Only the SEL is saturated, when the SATURATE bit is
 * set in `mask`; afterwards the predicate control flag value is set to 0xff.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
732 void emit_max(struct brw_compile *p,
733 const struct brw_reg *dst,
735 const struct brw_reg *arg0,
736 const struct brw_reg *arg1)
740 for (i = 0; i < 4; i++) {
742 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
744 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
745 brw_SEL(p, dst[i], arg0[i], arg1[i]);
746 brw_set_saturate(p, 0);
747 brw_set_predicate_control_flag_value(p, 0xff);
/* Emit a per-channel minimum of arg0 and arg1.
 *
 * For each channel arg0[i] is compared with arg1[i] under the "less than"
 * conditional and SEL then writes dst[i] from arg0[i] / arg1[i].  Only the
 * SEL is saturated, when the SATURATE bit is set in `mask`; afterwards the
 * predicate control flag value is set to 0xff.
 *
 * NOTE(review): any per-channel test of `mask` is not visible in this
 * excerpt.
 */
752 void emit_min(struct brw_compile *p,
753 const struct brw_reg *dst,
755 const struct brw_reg *arg0,
756 const struct brw_reg *arg1)
760 for (i = 0; i < 4; i++) {
762 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
764 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
765 brw_SEL(p, dst[i], arg0[i], arg1[i]);
766 brw_set_saturate(p, 0);
767 brw_set_predicate_control_flag_value(p, 0xff);
/* Emit a two-component dot product of arg0 and arg1 into one channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; nothing is
 * emitted when no such bit is set, and exactly one is expected (asserted).
 * Component 0 is multiplied into the null register and component 1 is added
 * by a MAC that writes dst[dst_chan]; presumably the running sum lives in
 * the accumulator -- TODO confirm.  Only the final MAC is saturated, when
 * the SATURATE bit is set in `mask`.
 */
773 void emit_dp2(struct brw_compile *p,
774 const struct brw_reg *dst,
776 const struct brw_reg *arg0,
777 const struct brw_reg *arg1)
779 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
781 if (!(mask & WRITEMASK_XYZW))
782 return; /* Do not emit dead code */
784 assert(is_power_of_two(mask & WRITEMASK_XYZW));
786 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
788 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
789 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
790 brw_set_saturate(p, 0);
/* Emit a three-component dot product of arg0 and arg1 into one channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; nothing is
 * emitted when no such bit is set, and exactly one is expected (asserted).
 * Components 0 and 1 go through MUL / MAC into the null register and
 * component 2 through a MAC that writes dst[dst_chan]; presumably the
 * running sum lives in the accumulator -- TODO confirm.  Only the final MAC
 * is saturated, when the SATURATE bit is set in `mask`.
 */
794 void emit_dp3(struct brw_compile *p,
795 const struct brw_reg *dst,
797 const struct brw_reg *arg0,
798 const struct brw_reg *arg1)
800 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
802 if (!(mask & WRITEMASK_XYZW))
803 return; /* Do not emit dead code */
805 assert(is_power_of_two(mask & WRITEMASK_XYZW));
807 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
808 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
810 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
811 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
812 brw_set_saturate(p, 0);
/* Emit a four-component dot product of arg0 and arg1 into one channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; nothing is
 * emitted when no such bit is set, and exactly one is expected (asserted).
 * Components 0 to 2 go through MUL / MAC / MAC into the null register and
 * component 3 through a MAC that writes dst[dst_chan]; presumably the
 * running sum lives in the accumulator -- TODO confirm.  Only the final MAC
 * is saturated, when the SATURATE bit is set in `mask`.
 */
816 void emit_dp4(struct brw_compile *p,
817 const struct brw_reg *dst,
819 const struct brw_reg *arg0,
820 const struct brw_reg *arg1)
822 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
824 if (!(mask & WRITEMASK_XYZW))
825 return; /* Do not emit dead code */
827 assert(is_power_of_two(mask & WRITEMASK_XYZW));
829 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
830 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
831 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
833 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
834 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
835 brw_set_saturate(p, 0);
/* Emit a three-component dot product of arg0 and arg1 plus arg1[3], into
 * one channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; nothing is
 * emitted when no such bit is set, and exactly one is expected (asserted).
 * Components 0 and 1 go through MUL / MAC into the null register, component
 * 2 through a MAC that writes dst[dst_chan], and a final ADD adds arg1[3] to
 * dst[dst_chan].  Only that ADD is saturated, when the SATURATE bit is set
 * in `mask`.
 */
839 void emit_dph(struct brw_compile *p,
840 const struct brw_reg *dst,
842 const struct brw_reg *arg0,
843 const struct brw_reg *arg1)
845 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
847 if (!(mask & WRITEMASK_XYZW))
848 return; /* Do not emit dead code */
850 assert(is_power_of_two(mask & WRITEMASK_XYZW));
852 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
853 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
854 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
856 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
857 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
858 brw_set_saturate(p, 0);
/* Emit a cross product of arg0 and arg1 into channels 0..2 of dst.
 *
 * `mask` must not select the W channel (asserted).  For each of the three
 * channels, -arg0[i2] * arg1[i1] is multiplied into the null register and a
 * MAC of arg0[i1] * arg1[i2] writes dst[i]; only the MAC is saturated, when
 * the SATURATE bit is set in `mask`.
 *
 * NOTE(review): the definitions of i1 and i2 and any per-channel test of
 * `mask` are not visible in this excerpt.
 */
862 void emit_xpd(struct brw_compile *p,
863 const struct brw_reg *dst,
865 const struct brw_reg *arg0,
866 const struct brw_reg *arg1)
870 assert((mask & WRITEMASK_W) != WRITEMASK_W);
872 for (i = 0 ; i < 3; i++) {
877 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
879 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
880 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
881 brw_set_saturate(p, 0);
/* Emit a one-operand math-unit operation into a single destination channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; exactly one
 * is expected (asserted) and nothing further is emitted when none is set.
 * The saturate mode passed to the math unit follows the SATURATE bit of
 * `mask`.  On gen6 and later, an operand that has horizontal stride 0, is
 * not in the general register file, or carries negate/abs is first copied
 * with a MOV into `src`.  The math operation is emitted with compression
 * disabled, and in 16-wide dispatch a second one is emitted under
 * BRW_COMPRESSION_2NDHALF targeting offset(dst[dst_chan], 1).  The
 * instruction state is pushed before and popped after the math emission.
 *
 * NOTE(review): in the visible order the gen6 MOV comes before the
 * "no channel selected" early return, so it would be emitted even for an
 * empty mask -- confirm whether that is intended.
 * NOTE(review): the declaration of `src`, the function-selector parameter
 * and most of the math argument lists are not visible in this excerpt.
 */
887 void emit_math1(struct brw_wm_compile *c,
889 const struct brw_reg *dst,
891 const struct brw_reg *arg0)
893 struct brw_compile *p = &c->func;
894 struct intel_context *intel = &p->brw->intel;
895 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
896 GLuint saturate = ((mask & SATURATE) ?
897 BRW_MATH_SATURATE_SATURATE :
898 BRW_MATH_SATURATE_NONE);
901 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
902 arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
903 arg0[0].negate || arg0[0].abs)) {
904 /* Gen6 math requires that source and dst horizontal stride be 1,
905 * and that the argument be in the GRF.
907 * The hardware ignores source modifiers (negate and abs) on math
908 * instructions, so we also move to a temp to set those up.
911 brw_MOV(p, src, arg0[0]);
916 if (!(mask & WRITEMASK_XYZW))
917 return; /* Do not emit dead code */
919 assert(is_power_of_two(mask & WRITEMASK_XYZW));
921 /* Send two messages to perform all 16 operations:
923 brw_push_insn_state(p);
924 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
931 BRW_MATH_DATA_VECTOR,
932 BRW_MATH_PRECISION_FULL);
934 if (c->dispatch_width == 16) {
935 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
937 offset(dst[dst_chan],1),
942 BRW_MATH_DATA_VECTOR,
943 BRW_MATH_PRECISION_FULL);
945 brw_pop_insn_state(p);
/* Emit a two-operand math-unit operation into a single destination channel.
 *
 * The destination channel is the lowest XYZW bit set in `mask`; nothing is
 * emitted when none is set and exactly one is expected (asserted).  The
 * instruction state is pushed on entry and popped on exit.
 *
 * gen6 and later: an arg0 with horizontal stride 0 is first copied into the
 * destination register, and an arg1 with horizontal stride 0 into a scratch
 * vec8 at GRF c->max_wm_grf.  Saturation follows the SATURATE bit of `mask`;
 * emission starts with compression disabled and switches to
 * BRW_COMPRESSION_2NDHALF for 16-wide dispatch.
 *
 * Other visible path: arg1[0] is copied to message register 3 (and its
 * second half, via sechalf(), to message register 5 in 16-wide dispatch);
 * a full-precision vector math operation is emitted, followed in 16-wide
 * dispatch by a second one under BRW_COMPRESSION_2NDHALF targeting
 * offset(dst[dst_chan], 1).
 *
 * NOTE(review): the function-selector parameter, the math calls themselves
 * and the else separating the two paths are not visible in this excerpt.
 */
949 void emit_math2(struct brw_wm_compile *c,
951 const struct brw_reg *dst,
953 const struct brw_reg *arg0,
954 const struct brw_reg *arg1)
956 struct brw_compile *p = &c->func;
957 struct intel_context *intel = &p->brw->intel;
958 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
960 if (!(mask & WRITEMASK_XYZW))
961 return; /* Do not emit dead code */
963 assert(is_power_of_two(mask & WRITEMASK_XYZW));
965 brw_push_insn_state(p);
967 /* math can only operate on up to a vec8 at a time, so in
968 * dispatch_width==16 we have to do the second half manually.
970 if (intel->gen >= 6) {
971 struct brw_reg src0 = arg0[0];
972 struct brw_reg src1 = arg1[0];
973 struct brw_reg temp_dst = dst[dst_chan];
975 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
976 brw_MOV(p, temp_dst, src0);
980 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
981 /* This is a heinous hack to get a temporary register for use
982 * in case both arg0 and arg1 are constants. Why you're
983 * doing exponentiation on constant values in the shader, we
986 * max_wm_grf is almost surely less than the maximum GRF, and
987 * gen6 doesn't care about the number of GRFs used in a
988 * shader like pre-gen6 did.
990 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
991 brw_MOV(p, temp, src1);
995 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
996 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1002 if (c->dispatch_width == 16) {
1003 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1011 GLuint saturate = ((mask & SATURATE) ?
1012 BRW_MATH_SATURATE_SATURATE :
1013 BRW_MATH_SATURATE_NONE);
1015 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1016 brw_MOV(p, brw_message_reg(3), arg1[0]);
1017 if (c->dispatch_width == 16) {
1018 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1019 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1022 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1029 BRW_MATH_DATA_VECTOR,
1030 BRW_MATH_PRECISION_FULL);
1032 /* Send two messages to perform all 16 operations:
1034 if (c->dispatch_width == 16) {
1035 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1037 offset(dst[dst_chan],1),
1042 BRW_MATH_DATA_VECTOR,
1043 BRW_MATH_PRECISION_FULL);
1046 brw_pop_insn_state(p);
/* Emit a texture sample.
 *
 * Dispatch width selects the layout: 16-wide uses 2 message registers per
 * channel, a response length of 8, a vec16 destination and SIMD16 sampler
 * mode; the other visible setup uses 1, 4, vec8 and SIMD8.  The destination
 * is viewed as unsigned words.
 *
 * The texture target decides which coordinates are supplied (XY for 2D and
 * RECT, XYZ for 3D and CUBE).  Coordinates are written to message registers
 * starting at 2, advancing by mrf_per_channel; the visible code writes
 * either arg[i] or 0.  For shadow comparisons a zero slot is added first
 * (commented as the cube map array index on gen5+, and as the LOD bias for
 * 8-wide dispatch on older parts), then arg[2] as the reference value.
 * gen5+ picks GEN5_SAMPLER_MESSAGE_SAMPLE or ..._SAMPLE_COMPARE; older parts
 * pick between the BRW SIMD16 sample message types.
 *
 * NOTE(review): several parameters, the switch header, the handling of the
 * 1D and default cases, the shadow tests and the final sample call are only
 * partly visible in this excerpt.
 */
1050 void emit_tex(struct brw_wm_compile *c,
1051 struct brw_reg *dst,
1053 struct brw_reg *arg,
1054 struct brw_reg depth_payload,
1059 struct brw_compile *p = &c->func;
1060 struct intel_context *intel = &p->brw->intel;
1061 struct brw_reg dst_retyped;
1062 GLuint cur_mrf = 2, response_length;
1063 GLuint i, nr_texcoords;
1066 GLuint mrf_per_channel;
1069 if (c->dispatch_width == 16) {
1070 mrf_per_channel = 2;
1071 response_length = 8;
1072 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1073 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1075 mrf_per_channel = 1;
1076 response_length = 4;
1077 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1078 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1081 /* How many input regs are there?
1084 case TEXTURE_1D_INDEX:
1088 case TEXTURE_2D_INDEX:
1089 case TEXTURE_RECT_INDEX:
1090 emit = WRITEMASK_XY;
1093 case TEXTURE_3D_INDEX:
1094 case TEXTURE_CUBE_INDEX:
1095 emit = WRITEMASK_XYZ;
1099 /* unexpected target */
1103 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1104 if (intel->gen < 5 && c->dispatch_width == 8)
1107 /* For shadow comparisons, we have to supply u,v,r. */
1111 /* Emit the texcoords. */
1112 for (i = 0; i < nr_texcoords; i++) {
1114 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1116 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1117 cur_mrf += mrf_per_channel;
1120 /* Fill in the shadow comparison reference value. */
1122 if (intel->gen >= 5) {
1123 /* Fill in the cube map array index value. */
1124 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1125 cur_mrf += mrf_per_channel;
1126 } else if (c->dispatch_width == 8) {
1127 /* Fill in the LOD bias value. */
1128 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1129 cur_mrf += mrf_per_channel;
1131 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1132 cur_mrf += mrf_per_channel;
1135 if (intel->gen >= 5) {
1137 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1139 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1141 /* Note that G45 and older determines shadow compare and dispatch width
1142 * from message length for most messages.
1144 if (c->dispatch_width == 16 && shadow)
1145 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1147 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1153 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1154 SURF_INDEX_TEXTURE(sampler),
1156 dst_flags & WRITEMASK_XYZW,
1166 void emit_txb(struct brw_wm_compile *c,
1166 void emit_txb(struct brw_wm_compile *c,
/* (emit_txb, continued from its opening line above.)
 * Emit a LOD-biased texture sample.
 *
 * When dispatch is 16-wide or the hardware is older than gen5, the 16-wide
 * layout is used: 2 message registers per channel, a vec16 destination and a
 * response length of 8, with the SAMPLE_BIAS message type chosen per
 * generation.  The other visible setup uses the gen5 SAMPLE_BIAS message
 * with 1 register per channel, a vec8 destination and a response length
 * of 4.  Three coordinate slots are written starting at message register 2
 * (unused ones are zero-filled according to the texture target), arg[3]
 * goes into the fourth slot, and msgLength is 2 + 4 * mrf_per_channel - 1.
 *
 * NOTE(review): the last visible argument of the sample call is
 * BRW_SAMPLER_SIMD_MODE_SIMD16 even on the 8-wide path -- confirm that this
 * is intended.
 * NOTE(review): several parameters, the switch header and most of the
 * sample call are not visible in this excerpt.
 */
1167 struct brw_reg *dst,
1169 struct brw_reg *arg,
1170 struct brw_reg depth_payload,
1174 struct brw_compile *p = &c->func;
1175 struct intel_context *intel = &p->brw->intel;
1178 GLuint mrf_per_channel;
1179 GLuint response_length;
1180 struct brw_reg dst_retyped;
1182 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1183 * samples, so we'll use the 16-wide instruction, leave the second halves
1184 * undefined, and trust the execution mask to keep the undefined pixels
1187 if (c->dispatch_width == 16 || intel->gen < 5) {
1188 if (intel->gen >= 5)
1189 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1191 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1192 mrf_per_channel = 2;
1193 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1194 response_length = 8;
1196 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1197 mrf_per_channel = 1;
1198 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1199 response_length = 4;
1202 /* Shadow ignored for txb. */
1204 case TEXTURE_1D_INDEX:
1205 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1206 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1207 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1209 case TEXTURE_2D_INDEX:
1210 case TEXTURE_RECT_INDEX:
1211 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1212 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1213 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1215 case TEXTURE_3D_INDEX:
1216 case TEXTURE_CUBE_INDEX:
1217 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1218 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1219 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1222 /* unexpected target */
1226 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1227 msgLength = 2 + 4 * mrf_per_channel - 1;
1232 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1233 SURF_INDEX_TEXTURE(sampler),
1235 dst_flags & WRITEMASK_XYZW,
1241 BRW_SAMPLER_SIMD_MODE_SIMD16);
/* Emit the Y and Z results of a LIT operation.
 *
 * `mask` must not select X or W here (asserted).  When Y is selected,
 * arg0[0] is copied to dst[1], saturated if the SATURATE bit is set.  When Z
 * is selected, a POW is emitted through emit_math2() with a channel mask of
 * X plus the SATURATE bit of `mask`.  Afterwards arg0[0] is compared against
 * 0 with the "less or equal" conditional, the selected Y / Z destinations
 * are written with 0, and predication is switched off; presumably those
 * zero writes are predicated on the comparison -- TODO confirm.
 *
 * NOTE(review): the remaining arguments of the emit_math2() call are not
 * visible in this excerpt.
 */
1245 static void emit_lit(struct brw_wm_compile *c,
1246 const struct brw_reg *dst,
1248 const struct brw_reg *arg0)
1250 struct brw_compile *p = &c->func;
1252 assert((mask & WRITEMASK_XW) == 0);
1254 if (mask & WRITEMASK_Y) {
1255 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1256 brw_MOV(p, dst[1], arg0[0]);
1257 brw_set_saturate(p, 0);
1260 if (mask & WRITEMASK_Z) {
1261 emit_math2(c, BRW_MATH_FUNCTION_POW,
1263 WRITEMASK_X | (mask & SATURATE),
1268 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1269 * some of the POW calculations above, but 16-wide iff statements
1270 * seem to lock c1 hardware, so this is a nasty workaround:
1272 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1274 if (mask & WRITEMASK_Y)
1275 brw_MOV(p, dst[1], brw_imm_f(0));
1277 if (mask & WRITEMASK_Z)
1278 brw_MOV(p, dst[2], brw_imm_f(0));
1280 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1284 /* Kill pixel - set execution mask to zero for those pixels which
/* Emit the pixel-kill sequence for the four components of arg0.
 *
 * The pixel mask is addressed as an unsigned word: r1.7 on gen6 and later,
 * and r0.0 on the other visible path.  For each component, an inner loop
 * compares it byte-for-byte with every earlier component (a repeat needs no
 * second comparison, per the comment in the body).  The visible per-component
 * sequence, wrapped in a push/pop of the instruction state, is: compare
 * arg0[i] >= 0, set the predicate control flag value to 0xff, disable
 * compression, and AND the flag register into the pixel mask.
 *
 * NOTE(review): the statement executed when a duplicate is found is not
 * visible in this excerpt.
 */
1287 static void emit_kil( struct brw_wm_compile *c,
1288 struct brw_reg *arg0)
1290 struct brw_compile *p = &c->func;
1291 struct intel_context *intel = &p->brw->intel;
1292 struct brw_reg pixelmask;
1295 if (intel->gen >= 6)
1296 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1298 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1300 for (i = 0; i < 4; i++) {
1301 /* Check if we've already done the comparison for this reg
1302 * -- common when someone does KIL TEMP.wwww.
1304 for (j = 0; j < i; j++) {
1305 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1311 brw_push_insn_state(p);
1312 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1313 brw_set_predicate_control_flag_value(p, 0xff);
1314 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1315 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1316 brw_pop_insn_state(p);
/* Emit the final framebuffer-write send.
 *
 * Visible steps: with the instruction state pushed, execution masking
 * disabled and compression off, r1 (as a vec8) is copied into message
 * register base_reg + 1, and the state is popped; the framebuffer write
 * message is then sent with r0, viewed as unsigned words, as one operand.
 *
 * NOTE(review): the parameter list after `c`, the condition under which the
 * r1 copy is made (the comment mentions gen6 doing it elsewhere) and most of
 * the send call are not visible in this excerpt.
 */
1320 static void fire_fb_write( struct brw_wm_compile *c,
1326 struct brw_compile *p = &c->func;
1327 struct intel_context *intel = &p->brw->intel;
1329 /* Pass through control information:
1331 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1333 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1336 brw_push_insn_state(p);
1337 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1338 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1340 brw_message_reg(base_reg + 1),
1341 brw_vec8_grf(1, 0));
1342 brw_pop_insn_state(p);
1345 /* Send framebuffer write message: */
1346 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1350 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
/* Copy the antialias / destination-stencil payload into message register
 * `reg`.
 *
 * c->aa_dest_stencil_reg is split into an index into arg1 (value / 2) and a
 * register offset (value % 2); the register selected that way is moved into
 * the message register with compression disabled, inside a push/pop of the
 * instruction state.
 *
 * NOTE(review): the declaration of the `reg` parameter is not visible in
 * this excerpt.
 */
1359 static void emit_aa( struct brw_wm_compile *c,
1360 struct brw_reg *arg1,
1363 struct brw_compile *p = &c->func;
1364 GLuint comp = c->aa_dest_stencil_reg / 2;
1365 GLuint off = c->aa_dest_stencil_reg % 2;
1366 struct brw_reg aa = offset(arg1[comp], off);
1368 brw_push_insn_state(p);
1369 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1370 brw_MOV(p, brw_message_reg(reg), aa);
1371 brw_pop_insn_state(p);
1375 /* Post-fragment-program processing. Send the results to the
1377 * \param arg0 the fragment color
1378 * \param arg1 the pass-through depth value
1379 * \param arg2 the shader-computed depth value
// Copies the fragment-program results into message registers and sends
// them with fire_fb_write.
// NOTE(review): several original lines (braces, else keywords, short
// statements and the end of the parameter list) are missing from this
// listing. target, eot, nr and channel are used below without a visible
// declaration - confirm against the complete file.
1381 void emit_fb_write(struct brw_wm_compile *c,
1382 struct brw_reg *arg0,
1383 struct brw_reg *arg1,
1384 struct brw_reg *arg2,
1388 struct brw_compile *p = &c->func;
1389 struct brw_context *brw = p->brw;
1390 struct intel_context *intel = &brw->intel;
1394 /* Reserve a space for AA - may not be needed:
1396 if (c->aa_dest_stencil_reg)
1399 /* I don't really understand how this achieves the color interleave
1400 * (ie RGBARGBA) in the result: [Do the saturation here]
1402 brw_push_insn_state(p);
// Saturation is enabled for the color moves only when the program key
// requests clamped fragment color; it is cleared again after the moves.
1404 if (c->key.clamp_fragment_color)
1405 brw_set_saturate(p, 1);
// One pass per color channel of arg0.
1407 for (channel = 0; channel < 4; channel++) {
1408 if (intel->gen >= 6) {
1409 /* gen6 SIMD16 single source DP write looks like:
// gen6 and later: a 16-wide dispatch writes channel to message register
// nr + channel * 2; the second MOV targets nr + channel (presumably the
// else branch for narrower dispatch - the else line is not shown).
1419 if (c->dispatch_width == 16) {
1420 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1422 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1424 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1425 /* pre-gen6 SIMD16 single source DP write looks like:
1435 * By setting the high bit of the MRF register number, we indicate
1436 * that we want COMPR4 mode - instead of doing the usual destination
1437 * + 1 for the second half we get destination + 4.
// The destination is nr + channel with BRW_MRF_COMPR4 added; the
// instruction name and its source operand are on lines not shown.
1440 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1443 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1444 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
// Remaining case: compression is turned off and the first half goes to
// nr + channel. For a 16-wide dispatch the second half of the source
// (via sechalf) goes to nr + channel + 4 under 2NDHALF compression control.
1445 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1447 brw_message_reg(nr + channel),
1450 if (c->dispatch_width == 16) {
1451 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1453 brw_message_reg(nr + channel + 4),
1454 sechalf(arg0[channel]));
1459 brw_set_saturate(p, 0);
1461 /* skip over the regs populated above:
// The statements that advance nr are on lines not shown.
1463 if (c->dispatch_width == 16)
1468 brw_pop_insn_state(p);
// Source depth: arg2[2] is used when the shader computes depth; the
// second MOV uses arg1[1] (presumably the else branch - not shown).
1470 if (c->source_depth_to_render_target)
1472 if (c->computes_depth)
1473 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1475 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
// Destination depth: dest_depth_reg is split into an index into arg1
// (comp) and an odd or even flag (off). The test that picks between the
// two-MOV form and the single-MOV form is on lines not shown.
1480 if (c->dest_depth_reg)
1482 GLuint comp = c->dest_depth_reg / 2;
1483 GLuint off = c->dest_depth_reg % 2;
1486 brw_push_insn_state(p);
1487 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1489 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1491 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1492 brw_pop_insn_state(p);
1495 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
// gen6 and later: the header is copied explicitly from GRF 0 into
// message register 0 with mask control disabled.
1500 if (intel->gen >= 6) {
1501 /* Load the message header. There's no implied move from src0
1502 * to the base mrf on gen6.
1504 brw_push_insn_state(p);
1505 brw_set_mask_control(p, BRW_MASK_DISABLE);
1506 brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1507 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1508 brw_pop_insn_state(p);
// target is stored as an unsigned immediate into a one-element message
// register operand; the register number argument is on a line not shown.
1511 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1513 2), BRW_REGISTER_TYPE_UD),
1514 brw_imm_ud(target));
// Without the runtime AA check: emit_aa runs only when
// aa_dest_stencil_reg is set, followed by a single fire_fb_write.
1518 if (!c->runtime_check_aads_emit) {
1519 if (c->aa_dest_stencil_reg)
1520 emit_aa(c, arg1, 2);
1522 fire_fb_write(c, 0, nr, target, eot);
// With the runtime check: an instruction on a line not shown uses the
// Z conditional modifier and element 6 of GRF 1; a forward JMPI is then
// recorded in jmp and landed after the first emit_aa and fire_fb_write
// pair.
// NOTE(review): whether the jump is predicated on that test is not
// visible here - confirm.
1525 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1526 struct brw_reg ip = brw_ip_reg();
1527 struct brw_instruction *jmp;
1529 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1530 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1533 get_element_ud(brw_vec8_grf(1,0), 6),
1536 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1538 emit_aa(c, arg1, 2);
1539 fire_fb_write(c, 0, nr, target, eot);
1540 /* note - thread killed in subroutine */
1542 brw_land_fwd_jump(p, jmp);
// The second write passes 1 and nr-1 where the first passed 0 and nr.
1544 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1546 fire_fb_write(c, 1, nr-1, target, eot);
1551 * Move a GPR to scratch memory.
// Writes the register reg to scratch memory for the given slot.
// NOTE(review): reg and slot are used below, but the parameter lines that
// declare them are missing from this listing, as are the comment
// delimiters around the two assembly sketches.
1553 static void emit_spill( struct brw_wm_compile *c,
1557 struct brw_compile *p = &c->func;
1560 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
// Stage the value in message register 2.
1562 brw_MOV(p, brw_message_reg(2), reg);
1565 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1566 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
// Scratch block write starting at message register 1, with 2 and slot as
// the remaining arguments (slot presumably selects the scratch offset -
// TODO confirm against brw_oword_block_write_scratch).
1568 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1573 * Load a GPR from scratch memory.
// Loads the register reg back from scratch memory for the given slot.
// NOTE(review): reg and slot are used below, but the parameter lines that
// declare them are missing from this listing.
1575 static void emit_unspill( struct brw_wm_compile *c,
1579 struct brw_compile *p = &c->func;
1581 /* Slot 0 is the undef value.
// For the undef slot the register is set to 0.0 instead of being read
// back; the test that selects this path (and any early return) is on
// lines not shown.
1584 brw_MOV(p, reg, brw_imm_f(0));
1589 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1590 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
// Block read into reg widened with vec16, using message register 1, with
// 2 and slot as the remaining arguments.
1593 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1598 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1599 * Args with unspill_reg != 0 will be loaded from scratch memory.
// Fills regs[0..3] with a hardware register for each of the four source
// references in arg.
// NOTE(review): the declaration of i, the per-entry test that guards the
// dereference, the else keyword and the name of the reload call are on
// lines missing from this listing.
1601 static void get_argument_regs( struct brw_wm_compile *c,
1602 struct brw_wm_ref *arg[],
1603 struct brw_reg *regs )
1607 for (i = 0; i < 4; i++) {
// When the reference has a nonzero unspill_reg, a call whose name is not
// visible receives that GRF and the spill slot of the referenced value -
// presumably emit_unspill; confirm against the full file.
1609 if (arg[i]->unspill_reg)
1611 brw_vec8_grf(arg[i]->unspill_reg, 0),
1612 arg[i]->value->spill_slot);
1614 regs[i] = arg[i]->hw_reg;
// Fallback to the null register; presumably taken when arg[i] is NULL -
// TODO confirm.
1617 regs[i] = brw_null_reg();
1624 * For values that have a spill_slot!=0, write those regs to scratch memory.
// Walks values[0..nr-1] and calls emit_spill for every entry whose
// spill_slot is nonzero, passing the hardware register and slot of the
// entry.
// NOTE(review): the nr parameter and the declaration of i are on lines
// missing from this listing.
1626 static void spill_values( struct brw_wm_compile *c,
1627 struct brw_wm_value *values,
1632 for (i = 0; i < nr; i++)
1633 if (values[i].spill_slot)
1634 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1638 /* Emit the fragment program instructions here.
// Walks c->instruction and calls the matching emit helper for each entry,
// then runs optional cleanup passes and a debug disassembly dump.
// NOTE(review): in this listing nearly all case labels, break statements,
// braces and several short lines are missing, so the opcode each call
// belongs to cannot be read from here - confirm against the complete file.
1640 void brw_wm_emit( struct brw_wm_compile *c )
1642 struct brw_compile *p = &c->func;
1643 struct intel_context *intel = &p->brw->intel;
// Compressed mode is set before any instruction is emitted; accumulator
// write control is additionally enabled on gen6 and later.
1646 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1647 if (intel->gen >= 6)
1648 brw_set_acc_write_control(p, 1);
1650 /* Check if any of the payload regs need to be spilled:
// Three groups are checked: the 4 depth payload values, the nr_creg
// entries of creg, and FRAG_ATTRIB_MAX interpolation inputs.
1652 spill_values(c, c->payload.depth, 4);
1653 spill_values(c, c->creg, c->nr_creg);
1654 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1657 for (insn = 0; insn < c->nr_insns; insn++) {
1659 struct brw_wm_instruction *inst = &c->instruction[insn];
// args holds 3 sources of 4 channels each; dst holds 4 channels.
1660 struct brw_reg args[3][4], dst[4];
1661 GLuint i, dst_flags;
1663 /* Get argument regs:
1665 for (i = 0; i < 3; i++)
1666 get_argument_regs(c, inst->src[i], args[i]);
// Each dst channel takes the hardware register of inst->dst[i] or the
// null register; the test that chooses between them is not shown.
1670 for (i = 0; i < 4; i++)
1672 dst[i] = inst->dst[i]->hw_reg;
1674 dst[i] = brw_null_reg();
// dst_flags starts as the writemask; SATURATE is OR-ed in under a
// condition that is on a line not shown.
1678 dst_flags = inst->writemask;
1680 dst_flags |= SATURATE;
1682 switch (inst->opcode) {
1683 /* Generated instructions for calculating triangle interpolants:
1686 emit_pixel_xy(c, dst, dst_flags);
1690 emit_delta_xy(p, dst, dst_flags, args[0]);
1694 emit_wpos_xy(c, dst, dst_flags, args[0]);
1698 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1702 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1706 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1710 emit_cinterp(p, dst, dst_flags, args[0]);
// The framebuffer write takes all three sources plus the target and eot
// fields of the instruction.
1714 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1717 case WM_FRONTFACING:
1718 emit_frontfacing(p, dst, dst_flags);
1721 /* Straightforward arithmetic:
1724 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1728 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1732 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
// The two emit_ddxy calls differ only in the boolean argument.
1736 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1740 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1744 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1748 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1752 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1756 emit_dph(p, dst, dst_flags, args[0], args[1]);
// RNDZ is emitted directly, once per channel whose bit is set in
// dst_flags.
1760 for (i = 0; i < 4; i++) {
1761 if (dst_flags & (1<<i)) {
1762 brw_RNDZ(p, dst[i], args[0][i]);
1768 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1772 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1777 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1781 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1785 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1788 /* Higher math functions:
1791 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1795 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1799 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1803 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1807 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1811 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1815 /* There is an scs math function, but it would need some
1816 * fixup for 16-element execution.
// Two separate math calls replace it: COS when X is written and SIN when
// Y is written. The SIN call passes dst+1 with WRITEMASK_X so its result
// lands in the second destination channel; only the SATURATE bit of
// dst_flags is carried over.
1818 if (dst_flags & WRITEMASK_X)
1819 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1820 if (dst_flags & WRITEMASK_Y)
1821 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1825 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1831 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1835 emit_max(p, dst, dst_flags, args[0], args[1]);
1839 emit_min(p, dst, dst_flags, args[0], args[1]);
1843 emit_slt(p, dst, dst_flags, args[0], args[1]);
1847 emit_sle(p, dst, dst_flags, args[0], args[1]);
1850 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1853 emit_sge(p, dst, dst_flags, args[0], args[1]);
1856 emit_seq(p, dst, dst_flags, args[0], args[1]);
1859 emit_sne(p, dst, dst_flags, args[0], args[1]);
1863 emit_sign(p, dst, dst_flags, args[0]);
1867 emit_lit(c, dst, dst_flags, args[0]);
1870 /* Texturing operations:
// Both texture helpers also receive the hardware register of the first
// depth payload value; the last argument of emit_tex is on a line not
// shown.
1873 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1874 inst->tex_idx, inst->tex_unit,
1879 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1880 inst->tex_idx, inst->tex_unit);
1884 emit_kil(c, args[0]);
// Unknown opcodes are reported on stdout; the name lookup is used only
// when the opcode is below MAX_OPCODE (the alternative string is on a
// line not shown).
1888 printf("Unsupported opcode %i (%s) in fragment shader\n",
1889 inst->opcode, inst->opcode < MAX_OPCODE ?
1890 _mesa_opcode_string(inst->opcode) :
// After the instruction, each non-NULL destination with a nonzero
// spill_slot is passed to a call on a line not shown - presumably
// emit_spill; confirm.
1894 for (i = 0; i < 4; i++)
1895 if (inst->dst[i] && inst->dst[i]->spill_slot)
1897 inst->dst[i]->hw_reg,
1898 inst->dst[i]->spill_slot);
1901 /* Only properly tested on ILK */
// The MRF cleanup passes run only when gen equals 5; the GRF-to-MRF pass
// additionally requires a 16-wide dispatch.
1902 if (p->brw->intel.gen == 5) {
1903 brw_remove_duplicate_mrf_moves(p);
1904 if (c->dispatch_width == 16)
1905 brw_remove_grf_to_mrf_moves(p);
// With DEBUG_WM set in INTEL_DEBUG, every stored instruction is
// disassembled to stdout.
1908 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1911 printf("wm-native:\n");
1912 for (i = 0; i < p->nr_insn; i++)
1913 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);