 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "brw_context.h"
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
40 struct brw_context *brw = brw_context(&intel->ctx);
45 if (deltas[1].nr != deltas[0].nr + 1)
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
70 int opcode_array[] = {
90 /* These opcodes get broken down in a way that allow two
91 * args to be immediates.
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
98 if (opcode > ARRAY_SIZE(opcode_array))
101 return arg == opcode_array[opcode] - 1;
105 * Computes the screen-space x,y position of the pixels.
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes..
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
148 if (mask & WRITEMASK_X) {
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
155 if (mask & WRITEMASK_Y) {
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
161 brw_pop_insn_state(p);
165 * Computes the screen-space x,y distance of the pixels from the start
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
175 const struct brw_reg *arg0)
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
183 assert(mask == WRITEMASK_XY);
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187 Just add them with 0.0 for dst reg.. */
188 r1 = brw_imm_v(0x00000000);
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
214 * Computes the pixel offset from the window origin for gl_FragCoord().
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
219 const struct brw_reg *arg0)
221 struct brw_compile *p = &c->func;
223 if (mask & WRITEMASK_X) {
224 if (c->fp->program.PixelCenterInteger) {
228 retype(arg0[0], BRW_REGISTER_TYPE_W));
233 retype(arg0[0], BRW_REGISTER_TYPE_W),
238 if (mask & WRITEMASK_Y) {
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
244 retype(arg0[1], BRW_REGISTER_TYPE_W));
249 retype(arg0[1], BRW_REGISTER_TYPE_W),
253 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
255 /* Y' = (height - 1) - Y + center */
258 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
259 brw_imm_f(c->key.drawable_height - 1 + center_offset));
265 void emit_pixel_w(struct brw_wm_compile *c,
266 const struct brw_reg *dst,
268 const struct brw_reg *arg0,
269 const struct brw_reg *deltas)
271 struct brw_compile *p = &c->func;
272 struct intel_context *intel = &p->brw->intel;
274 struct brw_reg temp_dst;
279 temp_dst = brw_message_reg(2);
281 assert(intel->gen < 6);
283 /* Don't need this if all you are doing is interpolating color, for
286 if (mask & WRITEMASK_W) {
287 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
289 /* Calc 1/w - just linterp wpos[3] optimized by putting the
290 * result straight into a message reg.
292 if (can_do_pln(intel, deltas)) {
293 brw_PLN(p, temp_dst, interp3, deltas[0]);
295 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
296 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
303 src = brw_null_reg();
305 if (c->dispatch_width == 16) {
306 brw_math_16(p, dst[3],
307 BRW_MATH_FUNCTION_INV,
308 BRW_MATH_SATURATE_NONE,
310 BRW_MATH_PRECISION_FULL);
313 BRW_MATH_FUNCTION_INV,
314 BRW_MATH_SATURATE_NONE,
316 BRW_MATH_DATA_VECTOR,
317 BRW_MATH_PRECISION_FULL);
322 void emit_linterp(struct brw_compile *p,
323 const struct brw_reg *dst,
325 const struct brw_reg *arg0,
326 const struct brw_reg *deltas)
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
338 for (i = 0; i < 4; i++) {
340 if (intel->gen >= 6) {
341 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
342 } else if (can_do_pln(intel, deltas)) {
343 brw_PLN(p, dst[i], interp[i], deltas[0]);
345 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
346 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
353 void emit_pinterp(struct brw_compile *p,
354 const struct brw_reg *dst,
356 const struct brw_reg *arg0,
357 const struct brw_reg *deltas,
358 const struct brw_reg *w)
360 struct intel_context *intel = &p->brw->intel;
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
365 if (intel->gen >= 6) {
366 emit_linterp(p, dst, mask, arg0, interp);
370 interp[0] = brw_vec1_grf(nr, 0);
371 interp[1] = brw_vec1_grf(nr, 4);
372 interp[2] = brw_vec1_grf(nr+1, 0);
373 interp[3] = brw_vec1_grf(nr+1, 4);
375 for (i = 0; i < 4; i++) {
377 if (can_do_pln(intel, deltas)) {
378 brw_PLN(p, dst[i], interp[i], deltas[0]);
380 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
381 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
385 for (i = 0; i < 4; i++) {
387 brw_MUL(p, dst[i], dst[i], w[3]);
393 void emit_cinterp(struct brw_compile *p,
394 const struct brw_reg *dst,
396 const struct brw_reg *arg0)
398 struct brw_reg interp[4];
399 GLuint nr = arg0[0].nr;
402 interp[0] = brw_vec1_grf(nr, 0);
403 interp[1] = brw_vec1_grf(nr, 4);
404 interp[2] = brw_vec1_grf(nr+1, 0);
405 interp[3] = brw_vec1_grf(nr+1, 4);
407 for (i = 0; i < 4; i++) {
409 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
414 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
415 void emit_frontfacing(struct brw_compile *p,
416 const struct brw_reg *dst,
419 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
422 if (!(mask & WRITEMASK_XYZW))
425 for (i = 0; i < 4; i++) {
427 brw_MOV(p, dst[i], brw_imm_f(0.0));
431 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
434 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
435 for (i = 0; i < 4; i++) {
437 brw_MOV(p, dst[i], brw_imm_f(1.0));
440 brw_set_predicate_control_flag_value(p, 0xff);
443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
446 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
448 * and we're trying to produce:
451 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
452 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
453 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
454 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
455 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
456 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
457 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
458 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
460 * and add another set of two more subspans if in 16-pixel dispatch mode.
462 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
463 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
464 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
465 * between each other. We could probably do it like ddx and swizzle the right
466 * order later, but bail for now and just produce
467 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
469 void emit_ddxy(struct brw_compile *p,
470 const struct brw_reg *dst,
473 const struct brw_reg *arg0)
476 struct brw_reg src0, src1;
479 brw_set_saturate(p, 1);
480 for (i = 0; i < 4; i++ ) {
483 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
485 BRW_VERTICAL_STRIDE_2,
487 BRW_HORIZONTAL_STRIDE_0,
488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
491 BRW_VERTICAL_STRIDE_2,
493 BRW_HORIZONTAL_STRIDE_0,
494 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
496 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
498 BRW_VERTICAL_STRIDE_4,
500 BRW_HORIZONTAL_STRIDE_0,
501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
504 BRW_VERTICAL_STRIDE_4,
506 BRW_HORIZONTAL_STRIDE_0,
507 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
509 brw_ADD(p, dst[i], src0, negate(src1));
513 brw_set_saturate(p, 0);
516 void emit_alu1(struct brw_compile *p,
517 struct brw_instruction *(*func)(struct brw_compile *,
520 const struct brw_reg *dst,
522 const struct brw_reg *arg0)
527 brw_set_saturate(p, 1);
529 for (i = 0; i < 4; i++) {
531 func(p, dst[i], arg0[i]);
536 brw_set_saturate(p, 0);
540 void emit_alu2(struct brw_compile *p,
541 struct brw_instruction *(*func)(struct brw_compile *,
545 const struct brw_reg *dst,
547 const struct brw_reg *arg0,
548 const struct brw_reg *arg1)
553 brw_set_saturate(p, 1);
555 for (i = 0; i < 4; i++) {
557 func(p, dst[i], arg0[i], arg1[i]);
562 brw_set_saturate(p, 0);
566 void emit_mad(struct brw_compile *p,
567 const struct brw_reg *dst,
569 const struct brw_reg *arg0,
570 const struct brw_reg *arg1,
571 const struct brw_reg *arg2)
575 for (i = 0; i < 4; i++) {
577 brw_MUL(p, dst[i], arg0[i], arg1[i]);
579 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
580 brw_ADD(p, dst[i], dst[i], arg2[i]);
581 brw_set_saturate(p, 0);
586 void emit_lrp(struct brw_compile *p,
587 const struct brw_reg *dst,
589 const struct brw_reg *arg0,
590 const struct brw_reg *arg1,
591 const struct brw_reg *arg2)
595 /* Uses dst as a temporary:
597 for (i = 0; i < 4; i++) {
599 /* Can I use the LINE instruction for this?
601 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
602 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
604 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
605 brw_MAC(p, dst[i], arg0[i], arg1[i]);
606 brw_set_saturate(p, 0);
611 void emit_sop(struct brw_compile *p,
612 const struct brw_reg *dst,
615 const struct brw_reg *arg0,
616 const struct brw_reg *arg1)
620 for (i = 0; i < 4; i++) {
622 brw_push_insn_state(p);
623 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
624 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
625 brw_MOV(p, dst[i], brw_imm_f(0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
627 brw_MOV(p, dst[i], brw_imm_f(1.0));
628 brw_pop_insn_state(p);
633 static void emit_slt( struct brw_compile *p,
634 const struct brw_reg *dst,
636 const struct brw_reg *arg0,
637 const struct brw_reg *arg1 )
639 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
642 static void emit_sle( struct brw_compile *p,
643 const struct brw_reg *dst,
645 const struct brw_reg *arg0,
646 const struct brw_reg *arg1 )
648 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
651 static void emit_sgt( struct brw_compile *p,
652 const struct brw_reg *dst,
654 const struct brw_reg *arg0,
655 const struct brw_reg *arg1 )
657 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
660 static void emit_sge( struct brw_compile *p,
661 const struct brw_reg *dst,
663 const struct brw_reg *arg0,
664 const struct brw_reg *arg1 )
666 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
669 static void emit_seq( struct brw_compile *p,
670 const struct brw_reg *dst,
672 const struct brw_reg *arg0,
673 const struct brw_reg *arg1 )
675 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
678 static void emit_sne( struct brw_compile *p,
679 const struct brw_reg *dst,
681 const struct brw_reg *arg0,
682 const struct brw_reg *arg1 )
684 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
687 void emit_cmp(struct brw_compile *p,
688 const struct brw_reg *dst,
690 const struct brw_reg *arg0,
691 const struct brw_reg *arg1,
692 const struct brw_reg *arg2)
696 for (i = 0; i < 4; i++) {
698 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
700 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
701 brw_SEL(p, dst[i], arg1[i], arg2[i]);
702 brw_set_saturate(p, 0);
703 brw_set_predicate_control_flag_value(p, 0xff);
708 void emit_sign(struct brw_compile *p,
709 const struct brw_reg *dst,
711 const struct brw_reg *arg0)
715 for (i = 0; i < 4; i++) {
717 brw_MOV(p, dst[i], brw_imm_f(0.0));
719 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
720 brw_MOV(p, dst[i], brw_imm_f(-1.0));
721 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
724 brw_MOV(p, dst[i], brw_imm_f(1.0));
725 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
730 void emit_max(struct brw_compile *p,
731 const struct brw_reg *dst,
733 const struct brw_reg *arg0,
734 const struct brw_reg *arg1)
738 for (i = 0; i < 4; i++) {
740 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
742 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
743 brw_SEL(p, dst[i], arg0[i], arg1[i]);
744 brw_set_saturate(p, 0);
745 brw_set_predicate_control_flag_value(p, 0xff);
750 void emit_min(struct brw_compile *p,
751 const struct brw_reg *dst,
753 const struct brw_reg *arg0,
754 const struct brw_reg *arg1)
758 for (i = 0; i < 4; i++) {
760 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
762 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
763 brw_SEL(p, dst[i], arg0[i], arg1[i]);
764 brw_set_saturate(p, 0);
765 brw_set_predicate_control_flag_value(p, 0xff);
771 void emit_dp2(struct brw_compile *p,
772 const struct brw_reg *dst,
774 const struct brw_reg *arg0,
775 const struct brw_reg *arg1)
777 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
779 if (!(mask & WRITEMASK_XYZW))
780 return; /* Do not emit dead code */
782 assert(is_power_of_two(mask & WRITEMASK_XYZW));
784 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
786 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
787 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
788 brw_set_saturate(p, 0);
792 void emit_dp3(struct brw_compile *p,
793 const struct brw_reg *dst,
795 const struct brw_reg *arg0,
796 const struct brw_reg *arg1)
798 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
800 if (!(mask & WRITEMASK_XYZW))
801 return; /* Do not emit dead code */
803 assert(is_power_of_two(mask & WRITEMASK_XYZW));
805 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
806 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
808 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
809 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
810 brw_set_saturate(p, 0);
814 void emit_dp4(struct brw_compile *p,
815 const struct brw_reg *dst,
817 const struct brw_reg *arg0,
818 const struct brw_reg *arg1)
820 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
822 if (!(mask & WRITEMASK_XYZW))
823 return; /* Do not emit dead code */
825 assert(is_power_of_two(mask & WRITEMASK_XYZW));
827 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
828 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
829 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
831 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
832 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
833 brw_set_saturate(p, 0);
837 void emit_dph(struct brw_compile *p,
838 const struct brw_reg *dst,
840 const struct brw_reg *arg0,
841 const struct brw_reg *arg1)
843 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
845 if (!(mask & WRITEMASK_XYZW))
846 return; /* Do not emit dead code */
848 assert(is_power_of_two(mask & WRITEMASK_XYZW));
850 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
851 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
852 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
854 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
855 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
856 brw_set_saturate(p, 0);
860 void emit_xpd(struct brw_compile *p,
861 const struct brw_reg *dst,
863 const struct brw_reg *arg0,
864 const struct brw_reg *arg1)
868 assert((mask & WRITEMASK_W) != WRITEMASK_W);
870 for (i = 0 ; i < 3; i++) {
875 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
877 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
878 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
879 brw_set_saturate(p, 0);
885 void emit_math1(struct brw_wm_compile *c,
887 const struct brw_reg *dst,
889 const struct brw_reg *arg0)
891 struct brw_compile *p = &c->func;
892 struct intel_context *intel = &p->brw->intel;
893 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
894 GLuint saturate = ((mask & SATURATE) ?
895 BRW_MATH_SATURATE_SATURATE :
896 BRW_MATH_SATURATE_NONE);
899 if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
900 arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
901 /* Gen6 math requires that source and dst horizontal stride be 1,
902 * and that the argument be in the GRF.
905 brw_MOV(p, src, arg0[0]);
910 if (!(mask & WRITEMASK_XYZW))
911 return; /* Do not emit dead code */
913 assert(is_power_of_two(mask & WRITEMASK_XYZW));
915 /* Send two messages to perform all 16 operations:
917 brw_push_insn_state(p);
918 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
925 BRW_MATH_DATA_VECTOR,
926 BRW_MATH_PRECISION_FULL);
928 if (c->dispatch_width == 16) {
929 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
931 offset(dst[dst_chan],1),
936 BRW_MATH_DATA_VECTOR,
937 BRW_MATH_PRECISION_FULL);
939 brw_pop_insn_state(p);
943 void emit_math2(struct brw_wm_compile *c,
945 const struct brw_reg *dst,
947 const struct brw_reg *arg0,
948 const struct brw_reg *arg1)
950 struct brw_compile *p = &c->func;
951 struct intel_context *intel = &p->brw->intel;
952 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
954 if (!(mask & WRITEMASK_XYZW))
955 return; /* Do not emit dead code */
957 assert(is_power_of_two(mask & WRITEMASK_XYZW));
959 brw_push_insn_state(p);
961 /* math can only operate on up to a vec8 at a time, so in
962 * dispatch_width==16 we have to do the second half manually.
964 if (intel->gen >= 6) {
965 struct brw_reg src0 = arg0[0];
966 struct brw_reg src1 = arg1[0];
967 struct brw_reg temp_dst = dst[dst_chan];
969 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
970 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
971 /* Both scalar arguments. Do scalar calc. */
972 src0.hstride = BRW_HORIZONTAL_STRIDE_1;
973 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
974 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
975 temp_dst.width = BRW_WIDTH_1;
977 if (arg0[0].subnr != 0) {
978 brw_MOV(p, temp_dst, src0);
981 /* Ouch. We've used the temp as a dst, and we still
982 * need a temp to store arg1 in, because src and dst
983 * offsets have to be equal. Leaving this up to
984 * glsl2-965 to handle correctly.
986 assert(arg1[0].subnr == 0);
987 } else if (arg1[0].subnr != 0) {
988 brw_MOV(p, temp_dst, src1);
992 brw_MOV(p, temp_dst, src0);
995 } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
996 brw_MOV(p, temp_dst, src1);
1000 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1001 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1007 if (c->dispatch_width == 16) {
1008 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1016 /* Splat a scalar result into all the channels. */
1017 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1018 arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1019 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1020 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1021 brw_MOV(p, dst[dst_chan], temp_dst);
1024 GLuint saturate = ((mask & SATURATE) ?
1025 BRW_MATH_SATURATE_SATURATE :
1026 BRW_MATH_SATURATE_NONE);
1028 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1029 brw_MOV(p, brw_message_reg(3), arg1[0]);
1030 if (c->dispatch_width == 16) {
1031 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1032 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1035 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1042 BRW_MATH_DATA_VECTOR,
1043 BRW_MATH_PRECISION_FULL);
1045 /* Send two messages to perform all 16 operations:
1047 if (c->dispatch_width == 16) {
1048 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1050 offset(dst[dst_chan],1),
1055 BRW_MATH_DATA_VECTOR,
1056 BRW_MATH_PRECISION_FULL);
1059 brw_pop_insn_state(p);
1063 void emit_tex(struct brw_wm_compile *c,
1064 struct brw_reg *dst,
1066 struct brw_reg *arg,
1067 struct brw_reg depth_payload,
1072 struct brw_compile *p = &c->func;
1073 struct intel_context *intel = &p->brw->intel;
1074 struct brw_reg dst_retyped;
1075 GLuint cur_mrf = 2, response_length;
1076 GLuint i, nr_texcoords;
1079 GLuint mrf_per_channel;
1082 if (c->dispatch_width == 16) {
1083 mrf_per_channel = 2;
1084 response_length = 8;
1085 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1086 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1088 mrf_per_channel = 1;
1089 response_length = 4;
1090 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1091 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1094 /* How many input regs are there?
1097 case TEXTURE_1D_INDEX:
1101 case TEXTURE_2D_INDEX:
1102 case TEXTURE_RECT_INDEX:
1103 emit = WRITEMASK_XY;
1106 case TEXTURE_3D_INDEX:
1107 case TEXTURE_CUBE_INDEX:
1108 emit = WRITEMASK_XYZ;
1112 /* unexpected target */
1116 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1117 if (intel->gen < 5 && c->dispatch_width == 8)
1120 /* For shadow comparisons, we have to supply u,v,r. */
1124 /* Emit the texcoords. */
1125 for (i = 0; i < nr_texcoords; i++) {
1127 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1129 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130 cur_mrf += mrf_per_channel;
1133 /* Fill in the shadow comparison reference value. */
1135 if (intel->gen >= 5) {
1136 /* Fill in the cube map array index value. */
1137 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1138 cur_mrf += mrf_per_channel;
1139 } else if (c->dispatch_width == 8) {
1140 /* Fill in the LOD bias value. */
1141 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1142 cur_mrf += mrf_per_channel;
1144 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1145 cur_mrf += mrf_per_channel;
1148 if (intel->gen >= 5) {
1150 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1152 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1154 /* Note that G45 and older determines shadow compare and dispatch width
1155 * from message length for most messages.
1157 if (c->dispatch_width == 16 && shadow)
1158 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1160 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1166 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1167 SURF_INDEX_TEXTURE(sampler),
1169 dst_flags & WRITEMASK_XYZW,
1179 void emit_txb(struct brw_wm_compile *c,
1180 struct brw_reg *dst,
1182 struct brw_reg *arg,
1183 struct brw_reg depth_payload,
1187 struct brw_compile *p = &c->func;
1188 struct intel_context *intel = &p->brw->intel;
1191 GLuint mrf_per_channel;
1192 GLuint response_length;
1193 struct brw_reg dst_retyped;
1195 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1196 * samples, so we'll use the 16-wide instruction, leave the second halves
1197 * undefined, and trust the execution mask to keep the undefined pixels
1200 if (c->dispatch_width == 16 || intel->gen < 5) {
1201 if (intel->gen >= 5)
1202 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1204 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1205 mrf_per_channel = 2;
1206 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1207 response_length = 8;
1209 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1210 mrf_per_channel = 1;
1211 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1212 response_length = 4;
1215 /* Shadow ignored for txb. */
1217 case TEXTURE_1D_INDEX:
1218 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1219 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1220 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1222 case TEXTURE_2D_INDEX:
1223 case TEXTURE_RECT_INDEX:
1224 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1225 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1226 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1228 case TEXTURE_3D_INDEX:
1229 case TEXTURE_CUBE_INDEX:
1230 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1231 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1232 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1235 /* unexpected target */
1239 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1240 msgLength = 2 + 4 * mrf_per_channel - 1;
1245 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1246 SURF_INDEX_TEXTURE(sampler),
1248 dst_flags & WRITEMASK_XYZW,
1254 BRW_SAMPLER_SIMD_MODE_SIMD16);
1258 static void emit_lit(struct brw_wm_compile *c,
1259 const struct brw_reg *dst,
1261 const struct brw_reg *arg0)
1263 struct brw_compile *p = &c->func;
1265 assert((mask & WRITEMASK_XW) == 0);
1267 if (mask & WRITEMASK_Y) {
1268 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1269 brw_MOV(p, dst[1], arg0[0]);
1270 brw_set_saturate(p, 0);
1273 if (mask & WRITEMASK_Z) {
1274 emit_math2(c, BRW_MATH_FUNCTION_POW,
1276 WRITEMASK_X | (mask & SATURATE),
1281 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1282 * some of the POW calculations above, but 16-wide iff statements
1283 * seem to lock c1 hardware, so this is a nasty workaround:
1285 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1287 if (mask & WRITEMASK_Y)
1288 brw_MOV(p, dst[1], brw_imm_f(0));
1290 if (mask & WRITEMASK_Z)
1291 brw_MOV(p, dst[2], brw_imm_f(0));
1293 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1297 /* Kill pixel - set execution mask to zero for those pixels which
1300 static void emit_kil( struct brw_wm_compile *c,
1301 struct brw_reg *arg0)
1303 struct brw_compile *p = &c->func;
1304 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1307 for (i = 0; i < 4; i++) {
1308 /* Check if we've already done the comparison for this reg
1309 * -- common when someone does KIL TEMP.wwww.
1311 for (j = 0; j < i; j++) {
1312 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1318 brw_push_insn_state(p);
1319 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1320 brw_set_predicate_control_flag_value(p, 0xff);
1321 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1322 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1323 brw_pop_insn_state(p);
1327 /* KIL_NV kills the pixels that are currently executing, not based on a test
1330 void emit_kil_nv( struct brw_wm_compile *c )
1332 struct brw_compile *p = &c->func;
1333 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1335 brw_push_insn_state(p);
1336 brw_set_mask_control(p, BRW_MASK_DISABLE);
1337 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1338 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1339 brw_pop_insn_state(p);
1342 static void fire_fb_write( struct brw_wm_compile *c,
1348 struct brw_compile *p = &c->func;
1349 struct intel_context *intel = &p->brw->intel;
1352 if (c->dispatch_width == 16)
1353 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1355 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1357 /* Pass through control information:
1359 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1360 if (intel->gen < 6) /* gen6, use headerless for fb write */
1362 brw_push_insn_state(p);
1363 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1364 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1366 brw_message_reg(base_reg + 1),
1367 brw_vec8_grf(1, 0));
1368 brw_pop_insn_state(p);
1371 /* Send framebuffer write message: */
1372 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1377 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1385 static void emit_aa( struct brw_wm_compile *c,
1386 struct brw_reg *arg1,
1389 struct brw_compile *p = &c->func;
1390 GLuint comp = c->aa_dest_stencil_reg / 2;
1391 GLuint off = c->aa_dest_stencil_reg % 2;
1392 struct brw_reg aa = offset(arg1[comp], off);
1394 brw_push_insn_state(p);
1395 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1396 brw_MOV(p, brw_message_reg(reg), aa);
1397 brw_pop_insn_state(p);
1401 /* Post-fragment-program processing. Send the results to the
1403 * \param arg0 the fragment color
1404 * \param arg1 the pass-through depth value
1405 * \param arg2 the shader-computed depth value
/* Assemble the framebuffer-write message payload (color, optional source
 * depth, optional destination depth/stencil) into the MRF file and emit the
 * send, either unconditionally or behind a runtime check for the
 * antialias/alpha-to-coverage path.
 *
 * NOTE(review): elided listing — several parameter lines, braces, and
 * statements (e.g. the `nr`/`target`/`eot` parameters and some brw_MOV
 * sources) are missing from this view; comments only have been added.
 */
1407 void emit_fb_write(struct brw_wm_compile *c,
1408 struct brw_reg *arg0,
1409 struct brw_reg *arg1,
1410 struct brw_reg *arg2,
1414 struct brw_compile *p = &c->func;
1415 struct brw_context *brw = p->brw;
1416 struct intel_context *intel = &brw->intel;
/* Leave an MRF slot free for the AA value when the shader needs it; it is
 * filled in later by emit_aa().
 */
1420 /* Reserve a space for AA - may not be needed:
1422 if (c->aa_dest_stencil_reg)
1425 /* I don't really understand how this achieves the color interleave
1426 * (ie RGBARGBA) in the result: [Do the saturation here]
1428 brw_push_insn_state(p);
/* One MRF move per color channel; layout depends on generation and
 * dispatch width (see the three branches below).
 */
1430 for (channel = 0; channel < 4; channel++) {
1431 if (intel->gen >= 6) {
1432 /* gen6 SIMD16 single source DP write looks like:
/* gen6 SIMD16: each channel occupies two consecutive message regs. */
1442 if (c->dispatch_width == 16) {
1443 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1445 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1447 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1448 /* pre-gen6 SIMD16 single source DP write looks like:
1458 * By setting the high bit of the MRF register number, we indicate
1459 * that we want COMPR4 mode - instead of doing the usual destination
1460 * + 1 for the second half we get destination + 4.
1463 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
/* Fallback path without COMPR4: move each half separately, the second
 * half (SecHalf) going to MRF nr + channel + 4.
 */
1466 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1467 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1468 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1470 brw_message_reg(nr + channel),
1473 if (c->dispatch_width == 16) {
1474 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1476 brw_message_reg(nr + channel + 4),
1477 sechalf(arg0[channel]));
/* Advance `nr` past the color regs just written (count depends on
 * dispatch width).
 */
1481 /* skip over the regs populated above:
1483 if (c->dispatch_width == 16)
1488 brw_pop_insn_state(p);
/* Optional source-depth output: shader-computed depth (arg2) when the
 * program writes gl_FragDepth, otherwise the pass-through payload depth
 * (arg1 — the "?" is the original author's note).
 */
1490 if (c->source_depth_to_render_target)
1492 if (c->computes_depth)
1493 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1495 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
/* Optional destination depth/stencil output; dest_depth_reg is a
 * half-register index like aa_dest_stencil_reg above.
 */
1500 if (c->dest_depth_reg)
1502 GLuint comp = c->dest_depth_reg / 2;
1503 GLuint off = c->dest_depth_reg % 2;
1506 brw_push_insn_state(p);
1507 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1509 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1511 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1512 brw_pop_insn_state(p);
1515 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
/* gen6 has no implied header move from src0, so copy r0 into m0 with
 * masking disabled before sending.
 */
1520 if (intel->gen >= 6) {
1521 /* Load the message header. There's no implied move from src0
1522 * to the base mrf on gen6.
1524 brw_push_insn_state(p);
1525 brw_set_mask_control(p, BRW_MASK_DISABLE);
1526 brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0));
1527 brw_pop_insn_state(p);
/* Static case: AA emission is known at compile time. */
1530 if (!c->runtime_check_aads_emit) {
1531 if (c->aa_dest_stencil_reg)
1532 emit_aa(c, arg1, 2);
1534 fire_fb_write(c, 0, nr, target, eot);
/* Runtime case: test a payload bit (r1.6 ud) and branch over the AA
 * write when it is clear; the taken path issues the send with the AA
 * slot, the fall-through path fires a shuffled write without it.
 */
1537 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1538 struct brw_reg ip = brw_ip_reg();
1539 struct brw_instruction *jmp;
1541 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1542 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1545 get_element_ud(brw_vec8_grf(1,0), 6),
1548 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1550 emit_aa(c, arg1, 2);
1551 fire_fb_write(c, 0, nr, target, eot);
1552 /* note - thread killed in subroutine */
1554 brw_land_fwd_jump(p, jmp);
1556 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1558 fire_fb_write(c, 1, nr-1, target, eot);
1563 * Move a GPR to scratch memory.
/* Spill a GPR to scratch memory: stage the value in MRF m2, then issue an
 * OWord block write of 2 OWords (one SIMD16 reg pair) to scratch `slot`.
 *
 * NOTE(review): elided listing — the `reg`/`slot` parameters, braces, and
 * the comment delimiters around the assembly mnemonics are not visible
 * here; comments only have been added.
 */
1565 static void emit_spill( struct brw_wm_compile *c,
1569 struct brw_compile *p = &c->func;
1572 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1574 brw_MOV(p, brw_message_reg(2), reg);
/* The scratch write message: header setup in r0.2, then the send. */
1577 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1578 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1580 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1585 * Load a GPR from scratch memory.
/* Reload a spilled GPR from scratch memory via an OWord block read of
 * 2 OWords into `reg`.  Slot 0 is reserved as the "undef" value: instead
 * of reading it, the register is simply cleared to 0.0f.
 *
 * NOTE(review): elided listing — the `reg`/`slot` parameters, braces, and
 * the slot-0 early-out are not all visible here; comments only added.
 */
1587 static void emit_unspill( struct brw_wm_compile *c,
1591 struct brw_compile *p = &c->func;
1593 /* Slot 0 is the undef value.
1596 brw_MOV(p, reg, brw_imm_f(0));
/* The scratch read message: header setup in r0.2, then the send. */
1601 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1602 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1605 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1610 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1611 * Args with unspill_reg != 0 will be loaded from scratch memory.
/* Fill regs[0..3] with the hardware registers for one instruction source:
 * args that were spilled (unspill_reg != 0) are first reloaded from their
 * scratch slot into the unspill register; absent args become the null reg.
 *
 * NOTE(review): elided listing — the emit_unspill() call line, braces, and
 * the NULL-arg test are not all visible here; comments only added.
 */
1613 static void get_argument_regs( struct brw_wm_compile *c,
1614 struct brw_wm_ref *arg[],
1615 struct brw_reg *regs )
1619 for (i = 0; i < 4; i++) {
/* Spilled value: reload it into its designated unspill GRF first. */
1621 if (arg[i]->unspill_reg)
1623 brw_vec8_grf(arg[i]->unspill_reg, 0),
1624 arg[i]->value->spill_slot);
1626 regs[i] = arg[i]->hw_reg;
/* No such argument channel — hand back the null register. */
1629 regs[i] = brw_null_reg();
1636 * For values that have a spill_slot!=0, write those regs to scratch memory.
/* Walk an array of `nr` wm values and spill to scratch every one that was
 * assigned a spill_slot (slot 0 means "not spilled").
 *
 * NOTE(review): elided listing — the `GLuint nr` parameter, the loop
 * index declaration, and braces are not visible here; comments only added.
 */
1638 static void spill_values( struct brw_wm_compile *c,
1639 struct brw_wm_value *values,
1644 for (i = 0; i < nr; i++)
1645 if (values[i].spill_slot)
1646 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1650 /* Emit the fragment program instructions here.
1652 void brw_wm_emit( struct brw_wm_compile *c )
1654 struct brw_compile *p = &c->func;
1655 struct intel_context *intel = &p->brw->intel;
1658 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1659 if (intel->gen >= 6)
1660 brw_set_acc_write_control(p, 1);
1662 /* Check if any of the payload regs need to be spilled:
1664 spill_values(c, c->payload.depth, 4);
1665 spill_values(c, c->creg, c->nr_creg);
1666 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1669 for (insn = 0; insn < c->nr_insns; insn++) {
1671 struct brw_wm_instruction *inst = &c->instruction[insn];
1672 struct brw_reg args[3][4], dst[4];
1673 GLuint i, dst_flags;
1675 /* Get argument regs:
1677 for (i = 0; i < 3; i++)
1678 get_argument_regs(c, inst->src[i], args[i]);
1682 for (i = 0; i < 4; i++)
1684 dst[i] = inst->dst[i]->hw_reg;
1686 dst[i] = brw_null_reg();
1690 dst_flags = inst->writemask;
1692 dst_flags |= SATURATE;
1694 switch (inst->opcode) {
1695 /* Generated instructions for calculating triangle interpolants:
1698 emit_pixel_xy(c, dst, dst_flags);
1702 emit_delta_xy(p, dst, dst_flags, args[0]);
1706 emit_wpos_xy(c, dst, dst_flags, args[0]);
1710 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1714 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1718 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1722 emit_cinterp(p, dst, dst_flags, args[0]);
1726 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1729 case WM_FRONTFACING:
1730 emit_frontfacing(p, dst, dst_flags);
1733 /* Straightforward arithmetic:
1736 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1740 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1744 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1748 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1752 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1756 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1760 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1764 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1768 emit_dph(p, dst, dst_flags, args[0], args[1]);
1772 for (i = 0; i < 4; i++) {
1773 if (dst_flags & (1<<i)) {
1774 brw_RNDZ(p, dst[i], args[0][i]);
1780 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1784 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1789 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1793 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1797 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1800 /* Higher math functions:
1803 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1807 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1811 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1815 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1819 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1823 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1827 /* There is an scs math function, but it would need some
1828 * fixup for 16-element execution.
1830 if (dst_flags & WRITEMASK_X)
1831 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1832 if (dst_flags & WRITEMASK_Y)
1833 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1837 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1843 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1847 emit_max(p, dst, dst_flags, args[0], args[1]);
1851 emit_min(p, dst, dst_flags, args[0], args[1]);
1855 emit_slt(p, dst, dst_flags, args[0], args[1]);
1859 emit_sle(p, dst, dst_flags, args[0], args[1]);
1862 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1865 emit_sge(p, dst, dst_flags, args[0], args[1]);
1868 emit_seq(p, dst, dst_flags, args[0], args[1]);
1871 emit_sne(p, dst, dst_flags, args[0], args[1]);
1875 emit_sign(p, dst, dst_flags, args[0]);
1879 emit_lit(c, dst, dst_flags, args[0]);
1882 /* Texturing operations:
1885 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1886 inst->tex_idx, inst->tex_unit,
1891 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1892 inst->tex_idx, inst->tex_unit);
1896 emit_kil(c, args[0]);
1904 printf("Unsupported opcode %i (%s) in fragment shader\n",
1905 inst->opcode, inst->opcode < MAX_OPCODE ?
1906 _mesa_opcode_string(inst->opcode) :
1910 for (i = 0; i < 4; i++)
1911 if (inst->dst[i] && inst->dst[i]->spill_slot)
1913 inst->dst[i]->hw_reg,
1914 inst->dst[i]->spill_slot);
1917 /* Only properly tested on ILK */
1918 if (p->brw->intel.gen == 5) {
1919 brw_remove_duplicate_mrf_moves(p);
1920 if (c->dispatch_width == 16)
1921 brw_remove_grf_to_mrf_moves(p);
1924 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1927 printf("wm-native:\n");
1928 for (i = 0; i < p->nr_insn; i++)
1929 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);