2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
34 #include "brw_context.h"
35 #include "brw_defines.h"
40 /***********************************************************************
41 * Internal helper for constructing instructions
44 static void guess_execution_size(struct brw_compile *p,
45 struct brw_instruction *insn,
48 if (reg.width == BRW_WIDTH_8 && p->compressed)
49 insn->header.execution_size = BRW_EXECUTE_16;
51 insn->header.execution_size = reg.width; /* note - definitions are compatible */
56 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57 * registers, implicitly moving the operand to a message register.
59 * On Sandybridge, this is no longer the case. This function performs the
60 * explicit move; it should be called before emitting a SEND instruction.
63 gen6_resolve_implied_move(struct brw_compile *p,
67 struct intel_context *intel = &p->brw->intel;
71 if (src->file == BRW_MESSAGE_REGISTER_FILE)
74 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
75 brw_push_insn_state(p);
76 brw_set_mask_control(p, BRW_MASK_DISABLE);
77 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
78 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
79 retype(*src, BRW_REGISTER_TYPE_UD));
80 brw_pop_insn_state(p);
82 *src = brw_message_reg(msg_reg_nr);
86 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
88 /* From the BSpec / ISA Reference / send - [DevIVB+]:
89 * "The send with EOT should use register space R112-R127 for <src>. This is
90 * to enable loading of a new thread into the same slot while the message
91 * with EOT for current thread is pending dispatch."
93 * Since we're pretending to have 16 MRFs anyway, we may as well use the
94 * registers required for messages with EOT.
96 struct intel_context *intel = &p->brw->intel;
97 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
98 reg->file = BRW_GENERAL_REGISTER_FILE;
99 reg->nr += GEN7_MRF_HACK_START;
105 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
108 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
109 dest.file != BRW_MESSAGE_REGISTER_FILE)
110 assert(dest.nr < 128);
112 gen7_convert_mrf_to_grf(p, &dest);
114 insn->bits1.da1.dest_reg_file = dest.file;
115 insn->bits1.da1.dest_reg_type = dest.type;
116 insn->bits1.da1.dest_address_mode = dest.address_mode;
118 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
119 insn->bits1.da1.dest_reg_nr = dest.nr;
121 if (insn->header.access_mode == BRW_ALIGN_1) {
122 insn->bits1.da1.dest_subreg_nr = dest.subnr;
123 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
124 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
125 insn->bits1.da1.dest_horiz_stride = dest.hstride;
128 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
129 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
130 /* even ignored in da16, still need to set as '01' */
131 insn->bits1.da16.dest_horiz_stride = 1;
135 insn->bits1.ia1.dest_subreg_nr = dest.subnr;
137 /* These are different sizes in align1 vs align16:
139 if (insn->header.access_mode == BRW_ALIGN_1) {
140 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
141 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
142 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
143 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
146 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
147 /* even ignored in da16, still need to set as '01' */
148 insn->bits1.ia16.dest_horiz_stride = 1;
152 /* NEW: Set the execution size based on dest.width and
153 * insn->compression_control:
155 guess_execution_size(p, insn, dest);
158 extern int reg_type_size[];
161 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
163 int hstride_for_reg[] = {0, 1, 2, 4};
164 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
165 int width_for_reg[] = {1, 2, 4, 8, 16};
166 int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
167 int width, hstride, vstride, execsize;
169 if (reg.file == BRW_IMMEDIATE_VALUE) {
170 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
171 * mean the destination has to be 128-bit aligned and the
172 * destination horiz stride has to be a word.
174 if (reg.type == BRW_REGISTER_TYPE_V) {
175 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
176 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
182 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
183 reg.file == BRW_ARF_NULL)
186 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
187 hstride = hstride_for_reg[reg.hstride];
189 if (reg.vstride == 0xf) {
192 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
193 vstride = vstride_for_reg[reg.vstride];
196 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
197 width = width_for_reg[reg.width];
199 assert(insn->header.execution_size >= 0 &&
200 insn->header.execution_size < Elements(execsize_for_reg));
201 execsize = execsize_for_reg[insn->header.execution_size];
203 /* Restrictions from 3.3.10: Register Region Restrictions. */
205 assert(execsize >= width);
207 /* FIXME: the assembler has a lot of code written that triggers the
208 * assertions commented it below. Let's paper over it (for now!) until we
209 * can re-validate the shaders with those little inconsistencies fixed. */
213 if (execsize == width && hstride != 0) {
214 assert(vstride == -1 || vstride == width * hstride);
219 if (execsize == width && hstride == 0) {
220 /* no restriction on vstride. */
226 assert(hstride == 0);
232 if (execsize == 1 && width == 1) {
233 assert(hstride == 0);
234 assert(vstride == 0);
239 if (vstride == 0 && hstride == 0) {
243 /* 10. Check destination issues. */
247 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
250 struct brw_context *brw = p->brw;
251 struct intel_context *intel = &brw->intel;
253 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
254 assert(reg.nr < 128);
256 gen7_convert_mrf_to_grf(p, ®);
258 if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
259 insn->header.opcode == BRW_OPCODE_SENDC)) {
260 /* Any source modifiers or regions will be ignored, since this just
261 * identifies the MRF/GRF to start reading the message contents from.
262 * Check for some likely failures.
266 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
269 validate_reg(insn, reg);
271 insn->bits1.da1.src0_reg_file = reg.file;
272 insn->bits1.da1.src0_reg_type = reg.type;
273 insn->bits2.da1.src0_abs = reg.abs;
274 insn->bits2.da1.src0_negate = reg.negate;
275 insn->bits2.da1.src0_address_mode = reg.address_mode;
277 if (reg.file == BRW_IMMEDIATE_VALUE) {
278 insn->bits3.ud = reg.dw1.ud;
280 /* Required to set some fields in src1 as well:
283 /* FIXME: This looks quite wrong, tempering with src1. I did not find
284 * anything in the bspec that was hinting it woud be needed when setting
285 * src0. before removing this one needs to run piglit.
287 insn->bits1.da1.src1_reg_file = 0;
288 insn->bits1.da1.src1_reg_type = reg.type;
293 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
294 if (insn->header.access_mode == BRW_ALIGN_1) {
295 insn->bits2.da1.src0_subreg_nr = reg.subnr;
296 insn->bits2.da1.src0_reg_nr = reg.nr;
299 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
300 insn->bits2.da16.src0_reg_nr = reg.nr;
304 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
306 if (insn->header.access_mode == BRW_ALIGN_1) {
307 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
310 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
314 if (insn->header.access_mode == BRW_ALIGN_1) {
316 /* FIXME: While this is correct, if the assembler uses that code path
317 * the opcode generated are different and thus needs a validation
319 if (reg.width == BRW_WIDTH_1 &&
320 insn->header.execution_size == BRW_EXECUTE_1) {
321 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
322 insn->bits2.da1.src0_width = BRW_WIDTH_1;
323 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
327 insn->bits2.da1.src0_horiz_stride = reg.hstride;
328 insn->bits2.da1.src0_width = reg.width;
329 insn->bits2.da1.src0_vert_stride = reg.vstride;
333 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
334 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
335 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
336 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
338 /* This is an oddity of the fact we're using the same
339 * descriptions for registers in align_16 as align_1:
341 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
342 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
344 insn->bits2.da16.src0_vert_stride = reg.vstride;
350 void brw_set_src1(struct brw_compile *p,
351 struct brw_instruction *insn,
354 struct brw_context *brw = p->brw;
355 struct intel_context *intel = &brw->intel;
357 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
359 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
360 assert(reg.nr < 128);
362 gen7_convert_mrf_to_grf(p, ®);
364 validate_reg(insn, reg);
366 insn->bits1.da1.src1_reg_file = reg.file;
367 insn->bits1.da1.src1_reg_type = reg.type;
368 insn->bits3.da1.src1_abs = reg.abs;
369 insn->bits3.da1.src1_negate = reg.negate;
370 insn->bits3.da1.src1_address_mode = reg.address_mode;
372 /* Only src1 can be immediate in two-argument instructions.
374 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
376 if (reg.file == BRW_IMMEDIATE_VALUE) {
377 insn->bits3.ud = reg.dw1.ud;
380 /* It's only BRW that does not support register-indirect addressing on
382 assert (intel->gen >= 4 || reg.address_mode == BRW_ADDRESS_DIRECT);
384 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
385 if (insn->header.access_mode == BRW_ALIGN_1) {
386 insn->bits3.da1.src1_subreg_nr = reg.subnr;
387 insn->bits3.da1.src1_reg_nr = reg.nr;
390 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
391 insn->bits3.da16.src1_reg_nr = reg.nr;
395 insn->bits3.ia1.src1_subreg_nr = reg.subnr;
397 if (insn->header.access_mode == BRW_ALIGN_1)
398 insn->bits3.ia1.src1_indirect_offset = reg.dw1.bits.indirect_offset;
400 insn->bits3.ia16.src1_indirect_offset = reg.dw1.bits.indirect_offset / 16;
403 if (insn->header.access_mode == BRW_ALIGN_1) {
404 /* FIXME: While this is correct, if the assembler uses that code path
405 * the opcode generated are different and thus needs a validation
407 if (reg.width == BRW_WIDTH_1 &&
408 insn->header.execution_size == BRW_EXECUTE_1) {
409 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
410 insn->bits3.da1.src1_width = BRW_WIDTH_1;
411 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
414 insn->bits3.da1.src1_horiz_stride = reg.hstride;
415 insn->bits3.da1.src1_width = reg.width;
416 insn->bits3.da1.src1_vert_stride = reg.vstride;
420 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
421 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
422 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
423 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
425 /* This is an oddity of the fact we're using the same
426 * descriptions for registers in align_16 as align_1:
428 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
429 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
431 insn->bits3.da16.src1_vert_stride = reg.vstride;
437 * Set the Message Descriptor and Extended Message Descriptor fields
440 * \note This zeroes out the Function Control bits, so it must be called
441 * \b before filling out any message-specific data. Callers can
442 * choose not to fill in irrelevant bits; they will be zero.
445 brw_set_message_descriptor(struct brw_compile *p,
446 struct brw_instruction *inst,
447 enum brw_message_target sfid,
449 unsigned response_length,
453 struct intel_context *intel = &p->brw->intel;
455 brw_set_src1(p, inst, brw_imm_d(0));
457 if (intel->gen >= 5) {
458 inst->bits3.generic_gen5.header_present = header_present;
459 inst->bits3.generic_gen5.response_length = response_length;
460 inst->bits3.generic_gen5.msg_length = msg_length;
461 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
463 if (intel->gen >= 6) {
464 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
465 inst->header.destreg__conditionalmod = sfid;
467 /* Set Extended Message Descriptor (ex_desc) */
468 inst->bits2.send_gen5.sfid = sfid;
469 inst->bits2.send_gen5.end_of_thread = end_of_thread;
472 inst->bits3.generic.response_length = response_length;
473 inst->bits3.generic.msg_length = msg_length;
474 inst->bits3.generic.msg_target = sfid;
475 inst->bits3.generic.end_of_thread = end_of_thread;
479 static void brw_set_math_message( struct brw_compile *p,
480 struct brw_instruction *insn,
482 unsigned integer_type,
486 struct brw_context *brw = p->brw;
487 struct intel_context *intel = &brw->intel;
489 unsigned response_length;
491 /* Infer message length from the function */
493 case BRW_MATH_FUNCTION_POW:
494 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
495 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
496 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
504 /* Infer response length from the function */
506 case BRW_MATH_FUNCTION_SINCOS:
507 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
516 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
517 msg_length, response_length, false, false);
518 if (intel->gen == 5) {
519 insn->bits3.math_gen5.function = function;
520 insn->bits3.math_gen5.int_type = integer_type;
521 insn->bits3.math_gen5.precision = low_precision;
522 insn->bits3.math_gen5.saturate = insn->header.saturate;
523 insn->bits3.math_gen5.data_type = dataType;
524 insn->bits3.math_gen5.snapshot = 0;
526 insn->bits3.math.function = function;
527 insn->bits3.math.int_type = integer_type;
528 insn->bits3.math.precision = low_precision;
529 insn->bits3.math.saturate = insn->header.saturate;
530 insn->bits3.math.data_type = dataType;
532 insn->header.saturate = 0;
536 static void brw_set_ff_sync_message(struct brw_compile *p,
537 struct brw_instruction *insn,
539 unsigned response_length,
542 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
543 1, response_length, true, end_of_thread);
544 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
545 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
546 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
547 insn->bits3.urb_gen5.allocate = allocate;
548 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
549 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
552 static void brw_set_urb_message( struct brw_compile *p,
553 struct brw_instruction *insn,
557 unsigned response_length,
561 unsigned swizzle_control )
563 struct brw_context *brw = p->brw;
564 struct intel_context *intel = &brw->intel;
566 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
567 msg_length, response_length, true, end_of_thread);
568 if (intel->gen == 7) {
569 insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
570 insn->bits3.urb_gen7.offset = offset;
571 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
572 insn->bits3.urb_gen7.swizzle_control = swizzle_control;
573 /* per_slot_offset = 0 makes it ignore offsets in message header */
574 insn->bits3.urb_gen7.per_slot_offset = 0;
575 insn->bits3.urb_gen7.complete = complete;
576 } else if (intel->gen >= 5) {
577 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
578 insn->bits3.urb_gen5.offset = offset;
579 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
580 insn->bits3.urb_gen5.allocate = allocate;
581 insn->bits3.urb_gen5.used = used; /* ? */
582 insn->bits3.urb_gen5.complete = complete;
584 insn->bits3.urb.opcode = 0; /* ? */
585 insn->bits3.urb.offset = offset;
586 insn->bits3.urb.swizzle_control = swizzle_control;
587 insn->bits3.urb.allocate = allocate;
588 insn->bits3.urb.used = used; /* ? */
589 insn->bits3.urb.complete = complete;
594 brw_set_dp_write_message(struct brw_compile *p,
595 struct brw_instruction *insn,
596 unsigned binding_table_index,
597 unsigned msg_control,
601 unsigned last_render_target,
602 unsigned response_length,
603 unsigned end_of_thread,
604 unsigned send_commit_msg)
606 struct brw_context *brw = p->brw;
607 struct intel_context *intel = &brw->intel;
610 if (intel->gen >= 7) {
611 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
612 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
613 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
615 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
616 } else if (intel->gen == 6) {
617 /* Use the render cache for all write messages. */
618 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
620 sfid = BRW_SFID_DATAPORT_WRITE;
623 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
624 header_present, end_of_thread);
626 if (intel->gen >= 7) {
627 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
628 insn->bits3.gen7_dp.msg_control = msg_control |
629 last_render_target << 6;
630 insn->bits3.gen7_dp.msg_type = msg_type;
631 } else if (intel->gen == 6) {
632 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
633 insn->bits3.gen6_dp.msg_control = msg_control |
634 last_render_target << 5;
635 insn->bits3.gen6_dp.msg_type = msg_type;
636 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
637 } else if (intel->gen == 5) {
638 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
639 insn->bits3.dp_write_gen5.msg_control = msg_control;
640 insn->bits3.dp_write_gen5.last_render_target = last_render_target;
641 insn->bits3.dp_write_gen5.msg_type = msg_type;
642 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
644 insn->bits3.dp_write.binding_table_index = binding_table_index;
645 insn->bits3.dp_write.msg_control = msg_control;
646 insn->bits3.dp_write.last_render_target = last_render_target;
647 insn->bits3.dp_write.msg_type = msg_type;
648 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
653 brw_set_dp_read_message(struct brw_compile *p,
654 struct brw_instruction *insn,
655 unsigned binding_table_index,
656 unsigned msg_control,
658 unsigned target_cache,
661 unsigned response_length)
663 struct brw_context *brw = p->brw;
664 struct intel_context *intel = &brw->intel;
667 if (intel->gen >= 7) {
668 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
669 } else if (intel->gen == 6) {
670 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
671 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
673 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
675 sfid = BRW_SFID_DATAPORT_READ;
678 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
679 header_present, false);
681 if (intel->gen >= 7) {
682 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
683 insn->bits3.gen7_dp.msg_control = msg_control;
684 insn->bits3.gen7_dp.msg_type = msg_type;
685 } else if (intel->gen == 6) {
686 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
687 insn->bits3.gen6_dp.msg_control = msg_control;
688 insn->bits3.gen6_dp.msg_type = msg_type;
689 insn->bits3.gen6_dp.send_commit_msg = 0;
690 } else if (intel->gen == 5) {
691 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
692 insn->bits3.dp_read_gen5.msg_control = msg_control;
693 insn->bits3.dp_read_gen5.msg_type = msg_type;
694 insn->bits3.dp_read_gen5.target_cache = target_cache;
695 } else if (intel->is_g4x) {
696 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
697 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
698 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
699 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
701 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
702 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
703 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
704 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
709 brw_set_sampler_message(struct brw_compile *p,
710 struct brw_instruction *insn,
711 unsigned binding_table_index,
714 unsigned response_length,
716 unsigned header_present,
718 unsigned return_format)
720 struct brw_context *brw = p->brw;
721 struct intel_context *intel = &brw->intel;
723 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
724 response_length, header_present, false);
726 if (intel->gen >= 7) {
727 insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
728 insn->bits3.sampler_gen7.sampler = sampler;
729 insn->bits3.sampler_gen7.msg_type = msg_type;
730 insn->bits3.sampler_gen7.simd_mode = simd_mode;
731 } else if (intel->gen >= 5) {
732 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
733 insn->bits3.sampler_gen5.sampler = sampler;
734 insn->bits3.sampler_gen5.msg_type = msg_type;
735 insn->bits3.sampler_gen5.simd_mode = simd_mode;
736 } else if (intel->is_g4x) {
737 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
738 insn->bits3.sampler_g4x.sampler = sampler;
739 insn->bits3.sampler_g4x.msg_type = msg_type;
741 insn->bits3.sampler.binding_table_index = binding_table_index;
742 insn->bits3.sampler.sampler = sampler;
743 insn->bits3.sampler.msg_type = msg_type;
744 insn->bits3.sampler.return_format = return_format;
749 #define next_insn brw_next_insn
750 struct brw_instruction *
751 brw_next_insn(struct brw_compile *p, unsigned opcode)
753 struct brw_instruction *insn;
755 if (p->nr_insn + 1 > p->store_size) {
757 printf("incresing the store size to %d\n", p->store_size << 1);
759 p->store = reralloc(p->mem_ctx, p->store,
760 struct brw_instruction, p->store_size);
762 assert(!"realloc eu store memeory failed");
765 p->next_insn_offset += 16;
766 insn = &p->store[p->nr_insn++];
767 memcpy(insn, p->current, sizeof(*insn));
769 /* Reset this one-shot flag:
772 if (p->current->header.destreg__conditionalmod) {
773 p->current->header.destreg__conditionalmod = 0;
774 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
777 insn->header.opcode = opcode;
781 static struct brw_instruction *brw_alu1( struct brw_compile *p,
786 struct brw_instruction *insn = next_insn(p, opcode);
787 brw_set_dest(p, insn, dest);
788 brw_set_src0(p, insn, src);
792 static struct brw_instruction *brw_alu2(struct brw_compile *p,
796 struct brw_reg src1 )
798 struct brw_instruction *insn = next_insn(p, opcode);
799 brw_set_dest(p, insn, dest);
800 brw_set_src0(p, insn, src0);
801 brw_set_src1(p, insn, src1);
806 get_3src_subreg_nr(struct brw_reg reg)
808 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
809 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
810 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
812 return reg.subnr / 4;
816 static int get_3src_type(int type)
818 assert(type == BRW_REGISTER_TYPE_F ||
819 type == BRW_REGISTER_TYPE_D ||
820 type == BRW_REGISTER_TYPE_UD);
823 case BRW_REGISTER_TYPE_F: return BRW_REGISTER_3SRC_TYPE_F;
824 case BRW_REGISTER_TYPE_D: return BRW_REGISTER_3SRC_TYPE_D;
825 case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_3SRC_TYPE_UD;
828 return BRW_REGISTER_3SRC_TYPE_F;
832 brw_set_3src_dest(struct brw_compile *p,
833 struct brw_instruction *insn,
836 gen7_convert_mrf_to_grf(p, &dest);
838 assert(insn->header.access_mode == BRW_ALIGN_16);
840 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
841 dest.file == BRW_MESSAGE_REGISTER_FILE);
842 assert(dest.nr < 128);
843 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
844 insn->bits1.da3src.dest_reg_type = get_3src_type(dest.type);
845 insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
846 insn->bits1.da3src.dest_reg_nr = dest.nr;
847 insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
848 insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
849 guess_execution_size(p, insn, dest);
853 brw_set_3src_src0(struct brw_compile *p,
854 struct brw_instruction *insn,
857 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
858 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
859 assert(src0.nr < 128);
860 insn->bits1.da3src.src_reg_type = get_3src_type(src0.type);
861 insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
862 insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
863 insn->bits2.da3src.src0_reg_nr = src0.nr;
864 insn->bits1.da3src.src0_abs = src0.abs;
865 insn->bits1.da3src.src0_negate = src0.negate;
866 insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
870 brw_set_3src_src1(struct brw_compile *p,
871 struct brw_instruction *insn,
874 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
875 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
876 assert(src1.nr < 128);
877 assert(src1.type == insn->bits1.da3src.src_reg_type);
878 insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
879 insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
880 insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
881 insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
882 insn->bits3.da3src.src1_reg_nr = src1.nr;
883 insn->bits1.da3src.src1_abs = src1.abs;
884 insn->bits1.da3src.src1_negate = src1.negate;
888 brw_set_3src_src2(struct brw_compile *p,
889 struct brw_instruction *insn,
892 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
893 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
894 assert(src2.nr < 128);
895 assert(src2.type == insn->bits1.da3src.src_reg_type);
896 insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
897 insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
898 insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
899 insn->bits3.da3src.src2_reg_nr = src2.nr;
900 insn->bits1.da3src.src2_abs = src2.abs;
901 insn->bits1.da3src.src2_negate = src2.negate;
904 static struct brw_instruction *brw_alu3(struct brw_compile *p,
911 struct brw_instruction *insn = next_insn(p, opcode);
912 brw_set_3src_dest(p, insn, dest);
913 brw_set_3src_src0(p, insn, src0);
914 brw_set_3src_src1(p, insn, src1);
915 brw_set_3src_src2(p, insn, src2);
920 /***********************************************************************
921 * Convenience routines.
924 struct brw_instruction *brw_##OP(struct brw_compile *p, \
925 struct brw_reg dest, \
926 struct brw_reg src0) \
928 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
932 struct brw_instruction *brw_##OP(struct brw_compile *p, \
933 struct brw_reg dest, \
934 struct brw_reg src0, \
935 struct brw_reg src1) \
937 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
941 struct brw_instruction *brw_##OP(struct brw_compile *p, \
942 struct brw_reg dest, \
943 struct brw_reg src0, \
944 struct brw_reg src1, \
945 struct brw_reg src2) \
947 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
950 /* Rounding operations (other than RNDD) require two instructions - the first
951 * stores a rounded value (possibly the wrong way) in the dest register, but
952 * also sets a per-channel "increment bit" in the flag register. A predicated
953 * add of 1.0 fixes dest to contain the desired result.
955 * Sandybridge and later appear to round correctly without an ADD.
958 void brw_##OP(struct brw_compile *p, \
959 struct brw_reg dest, \
960 struct brw_reg src) \
962 struct brw_instruction *rnd, *add; \
963 rnd = next_insn(p, BRW_OPCODE_##OP); \
964 brw_set_dest(p, rnd, dest); \
965 brw_set_src0(p, rnd, src); \
967 if (p->brw->intel.gen < 6) { \
968 /* turn on round-increments */ \
969 rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
970 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
971 add->header.predicate_control = BRW_PREDICATE_NORMAL; \
1004 struct brw_instruction *brw_ADD(struct brw_compile *p,
1005 struct brw_reg dest,
1006 struct brw_reg src0,
1007 struct brw_reg src1)
1010 if (src0.type == BRW_REGISTER_TYPE_F ||
1011 (src0.file == BRW_IMMEDIATE_VALUE &&
1012 src0.type == BRW_REGISTER_TYPE_VF)) {
1013 assert(src1.type != BRW_REGISTER_TYPE_UD);
1014 assert(src1.type != BRW_REGISTER_TYPE_D);
1017 if (src1.type == BRW_REGISTER_TYPE_F ||
1018 (src1.file == BRW_IMMEDIATE_VALUE &&
1019 src1.type == BRW_REGISTER_TYPE_VF)) {
1020 assert(src0.type != BRW_REGISTER_TYPE_UD);
1021 assert(src0.type != BRW_REGISTER_TYPE_D);
1024 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1027 struct brw_instruction *brw_AVG(struct brw_compile *p,
1028 struct brw_reg dest,
1029 struct brw_reg src0,
1030 struct brw_reg src1)
1032 assert(dest.type == src0.type);
1033 assert(src0.type == src1.type);
1034 switch (src0.type) {
1035 case BRW_REGISTER_TYPE_B:
1036 case BRW_REGISTER_TYPE_UB:
1037 case BRW_REGISTER_TYPE_W:
1038 case BRW_REGISTER_TYPE_UW:
1039 case BRW_REGISTER_TYPE_D:
1040 case BRW_REGISTER_TYPE_UD:
1043 assert(!"Bad type for brw_AVG");
1046 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1049 struct brw_instruction *brw_MUL(struct brw_compile *p,
1050 struct brw_reg dest,
1051 struct brw_reg src0,
1052 struct brw_reg src1)
1055 if (src0.type == BRW_REGISTER_TYPE_D ||
1056 src0.type == BRW_REGISTER_TYPE_UD ||
1057 src1.type == BRW_REGISTER_TYPE_D ||
1058 src1.type == BRW_REGISTER_TYPE_UD) {
1059 assert(dest.type != BRW_REGISTER_TYPE_F);
1062 if (src0.type == BRW_REGISTER_TYPE_F ||
1063 (src0.file == BRW_IMMEDIATE_VALUE &&
1064 src0.type == BRW_REGISTER_TYPE_VF)) {
1065 assert(src1.type != BRW_REGISTER_TYPE_UD);
1066 assert(src1.type != BRW_REGISTER_TYPE_D);
1069 if (src1.type == BRW_REGISTER_TYPE_F ||
1070 (src1.file == BRW_IMMEDIATE_VALUE &&
1071 src1.type == BRW_REGISTER_TYPE_VF)) {
1072 assert(src0.type != BRW_REGISTER_TYPE_UD);
1073 assert(src0.type != BRW_REGISTER_TYPE_D);
1076 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1077 src0.nr != BRW_ARF_ACCUMULATOR);
1078 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1079 src1.nr != BRW_ARF_ACCUMULATOR);
1081 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1085 void brw_NOP(struct brw_compile *p)
1087 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1088 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1089 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1090 brw_set_src1(p, insn, brw_imm_ud(0x0));
1097 /***********************************************************************
1098 * Comparisons, if/else/endif
1101 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1102 struct brw_reg dest,
1103 struct brw_reg src0,
1104 struct brw_reg src1)
1106 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1108 insn->header.execution_size = 1;
1109 insn->header.compression_control = BRW_COMPRESSION_NONE;
1110 insn->header.mask_control = BRW_MASK_DISABLE;
1112 p->current->header.predicate_control = BRW_PREDICATE_NONE;
/* Push an IF or ELSE instruction onto the if-stack.
 *
 * The entry is stored as an index relative to p->store, not as a pointer.
 * After the push, the backing array is doubled with reralloc() as soon as
 * the depth has reached the array size, so the next push always has a free
 * slot.
 */
1118 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1120 p->if_stack[p->if_stack_depth] = inst - p->store;
1122 p->if_stack_depth++;
1123 if (p->if_stack_array_size <= p->if_stack_depth) {
1124 p->if_stack_array_size *= 2;
1125 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1126 p->if_stack_array_size);
/* Pop the most recently pushed entry from the if-stack.
 *
 * Decrements the stack depth and converts the stored index back into a
 * pointer into the current p->store. No underflow check is performed here.
 */
1130 static struct brw_instruction *
1131 pop_if_stack(struct brw_compile *p)
1133 p->if_stack_depth--;
1134 return &p->store[p->if_stack[p->if_stack_depth]];
1138 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1140 if (p->loop_stack_array_size < p->loop_stack_depth) {
1141 p->loop_stack_array_size *= 2;
1142 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1143 p->loop_stack_array_size);
1144 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1145 p->loop_stack_array_size);
1148 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1149 p->loop_stack_depth++;
1150 p->if_depth_in_loop[p->loop_stack_depth] = 0;
/* Return the instruction recorded for the innermost open loop, i.e. the
 * top entry of the loop stack translated back into a pointer into p->store.
 * The stack is not modified, and the caller must ensure a loop is open.
 */
1153 static struct brw_instruction *
1154 get_inner_do_insn(struct brw_compile *p)
1156 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1159 /* EU takes the value from the flag register and pushes it onto some
1160 * sort of a stack (presumably merging with any flag value already on
1161 * the stack). Within an if block, the flags at the top of the stack
1162 * control execution on each channel of the unit, eg. on each of the
1163 * 16 pixel values in our wm programs.
1165 * When the matching 'else' instruction is reached (presumably by
1166 * countdown of the instruction count patched in by our ELSE/ENDIF
1167 * functions), the relevant flags are inverted.
1169 * When the matching 'endif' instruction is reached, the flags are
1170 * popped off. If the stack is now empty, normal execution resumes.
/* Emit an IF instruction whose jump targets are patched in later.
 *
 * p             compile context
 * execute_size  value written to the instruction's execution_size field
 *
 * Operand setup depends on the hardware generation: before gen6 the IP
 * register is used as destination and src0 with an immediate 0 as src1;
 * on gen6 the destination is an immediate word 0, jump_count is cleared and
 * both sources are the null register. The remaining operand lines (null
 * destination/src0, immediate UD 0, JIP/UIP cleared) presumably belong to
 * the gen7+ branch; the line introducing that branch is not visible here.
 *
 * The instruction is then marked uncompressed, normally predicated and
 * mask-enabled, with a thread switch requested unless single program flow
 * is active. Finally the default predicate in p->current is reset, the IF
 * is pushed onto the if-stack and the IF-nesting counter of the current
 * loop level is incremented.
 */
1172 struct brw_instruction *
1173 brw_IF(struct brw_compile *p, unsigned execute_size)
1175 struct intel_context *intel = &p->brw->intel;
1176 struct brw_instruction *insn;
1178 insn = next_insn(p, BRW_OPCODE_IF);
1180 /* Override the defaults for this instruction:
1182 if (intel->gen < 6) {
1183 brw_set_dest(p, insn, brw_ip_reg());
1184 brw_set_src0(p, insn, brw_ip_reg());
1185 brw_set_src1(p, insn, brw_imm_d(0x0));
1186 } else if (intel->gen == 6) {
1187 brw_set_dest(p, insn, brw_imm_w(0));
1188 insn->bits1.branch_gen6.jump_count = 0;
1189 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1190 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1192 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1193 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1194 brw_set_src1(p, insn, brw_imm_ud(0));
1195 insn->bits3.break_cont.jip = 0;
1196 insn->bits3.break_cont.uip = 0;
1199 insn->header.execution_size = execute_size;
1200 insn->header.compression_control = BRW_COMPRESSION_NONE;
1201 insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1202 insn->header.mask_control = BRW_MASK_ENABLE;
1203 if (!p->single_program_flow)
1204 insn->header.thread_control = BRW_THREAD_SWITCH;
1206 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1208 push_if_stack(p, insn);
1209 p->if_depth_in_loop[p->loop_stack_depth]++;
1213 /* This function is only used for gen6-style IF instructions with an
1214 * embedded comparison (conditional modifier). It is not used on gen7.
/* Emit a gen6-style IF that carries its own comparison.
 *
 * p            compile context
 * conditional  conditional modifier, stored in destreg__conditionalmod
 * src0, src1   operands of the embedded comparison
 *
 * The destination is an immediate word 0 and jump_count starts at 0.
 * Execution size is BRW_EXECUTE_16 when p->compressed is set; the
 * BRW_EXECUTE_8 assignment is presumably the other branch (the `else` line
 * is not visible here). The instruction must already be uncompressed and
 * unpredicated, which is asserted rather than forced. A thread switch is
 * requested unless single program flow is active, and the IF is pushed
 * onto the if-stack.
 *
 * NOTE(review): unlike brw_IF(), no visible line here increments
 * p->if_depth_in_loop, although brw_ENDIF() decrements it - confirm this
 * is intentional.
 */
1216 struct brw_instruction *
1217 gen6_IF(struct brw_compile *p, uint32_t conditional,
1218 struct brw_reg src0, struct brw_reg src1)
1220 struct brw_instruction *insn;
1222 insn = next_insn(p, BRW_OPCODE_IF);
1224 brw_set_dest(p, insn, brw_imm_w(0));
1225 if (p->compressed) {
1226 insn->header.execution_size = BRW_EXECUTE_16;
1228 insn->header.execution_size = BRW_EXECUTE_8;
1230 insn->bits1.branch_gen6.jump_count = 0;
1231 brw_set_src0(p, insn, src0);
1232 brw_set_src1(p, insn, src1);
1234 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1235 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1236 insn->header.destreg__conditionalmod = conditional;
1238 if (!p->single_program_flow)
1239 insn->header.thread_control = BRW_THREAD_SWITCH;
1241 push_if_stack(p, insn);
1246 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
/* Rewrite an already-emitted IF (and optional ELSE) as ADD instructions.
 *
 * p          compile context; must be in single program flow mode
 * if_inst    the IF instruction; must be non-NULL with execution size 1
 * else_inst  the matching ELSE instruction, or NULL if there is none
 *
 * The opcode of each instruction is replaced by BRW_OPCODE_ADD and its
 * bits3.ud field is set to a distance in instructions multiplied by 16.
 * The IF gets its predicate inverted. With an ELSE present, the IF targets
 * the instruction after the ELSE and the ELSE targets the next instruction
 * to be emitted; without one, the IF targets the next instruction to be
 * emitted.
 */
1249 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1250 struct brw_instruction *if_inst,
1251 struct brw_instruction *else_inst)
1253 /* The next instruction (where the ENDIF would be, if it existed) */
1254 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1256 assert(p->single_program_flow);
1257 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1258 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1259 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1261 /* Convert IF to an ADD instruction that moves the instruction pointer
1262 * to the first instruction of the ELSE block. If there is no ELSE
1263 * block, point to where ENDIF would be. Reverse the predicate.
1265 * There's no need to execute an ENDIF since we don't need to do any
1266 * stack operations, and if we're currently executing, we just want to
1267 * continue normally.
1269 if_inst->header.opcode = BRW_OPCODE_ADD;
1270 if_inst->header.predicate_inverse = 1;
1272 if (else_inst != NULL) {
1273 /* Convert ELSE to an ADD instruction that points where the ENDIF
1276 else_inst->header.opcode = BRW_OPCODE_ADD;
1278 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1279 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1281 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1286 * Patch IF and ELSE instructions with appropriate jump targets.
/* Fill in the jump fields of an IF and optional ELSE once the ENDIF exists.
 *
 * p           compile context
 * if_inst     the IF instruction (non-NULL, opcode asserted)
 * else_inst   the matching ELSE instruction, or NULL
 * endif_inst  the ENDIF instruction (non-NULL, opcode asserted)
 *
 * All distances are instruction counts scaled by `br`. The declaration of
 * `br` and the value assigned for gen >= 5 are not visible in this view;
 * presumably 1 by default and 2 on gen5+, per the "64bit data chunk"
 * comment below.
 *
 * The ENDIF and the ELSE (if any) inherit the IF's execution size. Which
 * field is written depends on the generation: bits3.if_else before gen6,
 * bits1.branch_gen6.jump_count on gen6, and bits3.break_cont JIP/UIP in
 * the remaining case. Before gen6 an IF without ELSE is turned into an IFF
 * that jumps one instruction past the ENDIF.
 */
1289 patch_IF_ELSE(struct brw_compile *p,
1290 struct brw_instruction *if_inst,
1291 struct brw_instruction *else_inst,
1292 struct brw_instruction *endif_inst)
1294 struct intel_context *intel = &p->brw->intel;
1296 /* We shouldn't be patching IF and ELSE instructions in single program flow
1297 * mode when gen < 6, because in single program flow mode on those
1298 * platforms, we convert flow control instructions to conditional ADDs that
1299 * operate on IP (see brw_ENDIF).
1301 * However, on Gen6, writing to IP doesn't work in single program flow mode
1302 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1303 * not be updated by non-flow control instructions."). And on later
1304 * platforms, there is no significant benefit to converting control flow
1305 * instructions to conditional ADDs. So we do patch IF and ELSE
1306 * instructions in single program flow mode on those platforms.
1309 assert(!p->single_program_flow);
1311 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1312 assert(endif_inst != NULL);
1313 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1316 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1317 * requires 2 chunks.
1319 if (intel->gen >= 5)
1322 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1323 endif_inst->header.execution_size = if_inst->header.execution_size;
1325 if (else_inst == NULL) {
1326 /* Patch IF -> ENDIF */
1327 if (intel->gen < 6) {
1328 /* Turn it into an IFF, which means no mask stack operations for
1329 * all-false and jumping past the ENDIF.
1331 if_inst->header.opcode = BRW_OPCODE_IFF;
1332 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1333 if_inst->bits3.if_else.pop_count = 0;
1334 if_inst->bits3.if_else.pad0 = 0;
1335 } else if (intel->gen == 6) {
1336 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1337 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1339 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1340 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1343 else_inst->header.execution_size = if_inst->header.execution_size;
1345 /* Patch IF -> ELSE */
1346 if (intel->gen < 6) {
1347 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1348 if_inst->bits3.if_else.pop_count = 0;
1349 if_inst->bits3.if_else.pad0 = 0;
1350 } else if (intel->gen == 6) {
1351 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1354 /* Patch ELSE -> ENDIF */
1355 if (intel->gen < 6) {
1356 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1359 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1360 else_inst->bits3.if_else.pop_count = 1;
1361 else_inst->bits3.if_else.pad0 = 0;
1362 } else if (intel->gen == 6) {
1363 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1364 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1366 /* The IF instruction's JIP should point just past the ELSE */
1367 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1368 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1369 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1370 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
/* Emit an ELSE instruction and push it onto the if-stack.
 *
 * Operand setup mirrors the generation split used for IF: the IP register
 * with an immediate 0 before gen6; an immediate word 0 destination with
 * null sources and a cleared jump_count on gen6; and null destination and
 * src0 with an immediate UD 0 and cleared JIP/UIP in the remaining case
 * (presumably gen7+; the `else` line is not visible here). Jump targets
 * are left at zero here.
 *
 * The instruction is uncompressed and mask-enabled, with a thread switch
 * requested unless single program flow is active.
 */
1376 brw_ELSE(struct brw_compile *p)
1378 struct intel_context *intel = &p->brw->intel;
1379 struct brw_instruction *insn;
1381 insn = next_insn(p, BRW_OPCODE_ELSE);
1383 if (intel->gen < 6) {
1384 brw_set_dest(p, insn, brw_ip_reg());
1385 brw_set_src0(p, insn, brw_ip_reg());
1386 brw_set_src1(p, insn, brw_imm_d(0x0));
1387 } else if (intel->gen == 6) {
1388 brw_set_dest(p, insn, brw_imm_w(0));
1389 insn->bits1.branch_gen6.jump_count = 0;
1390 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1391 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1393 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1394 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1395 brw_set_src1(p, insn, brw_imm_ud(0));
1396 insn->bits3.break_cont.jip = 0;
1397 insn->bits3.break_cont.uip = 0;
1400 insn->header.compression_control = BRW_COMPRESSION_NONE;
1401 insn->header.mask_control = BRW_MASK_ENABLE;
1402 if (!p->single_program_flow)
1403 insn->header.thread_control = BRW_THREAD_SWITCH;
1405 push_if_stack(p, insn);
/* Close the innermost IF/ELSE construct.
 *
 * The IF and optional ELSE are popped from the if-stack and the IF-nesting
 * counter of the current loop level is decremented. Two outcomes are
 * visible below:
 *
 *  - convert_IF_ELSE_to_ADD() rewrites the IF/ELSE as ADDs and no ENDIF is
 *    needed. Presumably this is taken when emit_endif was cleared for
 *    gen < 6 in single program flow mode; the assignment and the guarding
 *    conditions are not visible in this view.
 *  - Otherwise an ENDIF instruction is set up with generation-specific
 *    operands and jump/pop fields, and patch_IF_ELSE() fills in the jump
 *    targets of the IF and ELSE.
 *
 * next_insn() is called before the stack entries are turned back into
 * pointers because it may move p->store.
 */
1409 brw_ENDIF(struct brw_compile *p)
1411 struct intel_context *intel = &p->brw->intel;
1412 struct brw_instruction *insn = NULL;
1413 struct brw_instruction *else_inst = NULL;
1414 struct brw_instruction *if_inst = NULL;
1415 struct brw_instruction *tmp;
1416 bool emit_endif = true;
1418 /* In single program flow mode, we can express IF and ELSE instructions
1419 * equivalently as ADD instructions that operate on IP. On platforms prior
1420 * to Gen6, flow control instructions cause an implied thread switch, so
1421 * this is a significant savings.
1423 * However, on Gen6, writing to IP doesn't work in single program flow mode
1424 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1425 * not be updated by non-flow control instructions."). And on later
1426 * platforms, there is no significant benefit to converting control flow
1427 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1430 if (intel->gen < 6 && p->single_program_flow)
1434 * A single next_insn() may change the base address of instruction store
1435 * memory(p->store), so call it first before referencing the instruction
1436 * store pointer from an index
1439 insn = next_insn(p, BRW_OPCODE_ENDIF);
1441 /* Pop the IF and (optional) ELSE instructions from the stack */
1442 p->if_depth_in_loop[p->loop_stack_depth]--;
1443 tmp = pop_if_stack(p);
1444 if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1446 tmp = pop_if_stack(p);
1451 /* ENDIF is useless; don't bother emitting it. */
1452 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1456 if (intel->gen < 6) {
1457 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1458 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1459 brw_set_src1(p, insn, brw_imm_d(0x0));
1460 } else if (intel->gen == 6) {
1461 brw_set_dest(p, insn, brw_imm_w(0));
1462 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1463 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1465 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1466 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1467 brw_set_src1(p, insn, brw_imm_ud(0));
1470 insn->header.compression_control = BRW_COMPRESSION_NONE;
1471 insn->header.mask_control = BRW_MASK_ENABLE;
1472 insn->header.thread_control = BRW_THREAD_SWITCH;
1474 /* Also pop item off the stack in the endif instruction: */
1475 if (intel->gen < 6) {
1476 insn->bits3.if_else.jump_count = 0;
1477 insn->bits3.if_else.pop_count = 1;
1478 insn->bits3.if_else.pad0 = 0;
1479 } else if (intel->gen == 6) {
1480 insn->bits1.branch_gen6.jump_count = 2;
1482 insn->bits3.break_cont.jip = 2;
1484 patch_IF_ELSE(p, if_inst, else_inst, insn);
/* Emit a BREAK instruction for the innermost loop.
 *
 * On gen6+ the operands are the null register and an immediate 0. The IP
 * register operands and the pop_count assignment that follow presumably
 * form the pre-gen6 branch (the `else` line is not visible here). In that
 * branch pop_count is taken from the number of IFs currently open inside
 * the loop, as tracked in p->if_depth_in_loop.
 *
 * The instruction is uncompressed with execution size 8. No jump distance
 * is written here.
 */
1487 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1489 struct intel_context *intel = &p->brw->intel;
1490 struct brw_instruction *insn;
1492 insn = next_insn(p, BRW_OPCODE_BREAK);
1493 if (intel->gen >= 6) {
1494 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1495 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1496 brw_set_src1(p, insn, brw_imm_d(0x0));
1498 brw_set_dest(p, insn, brw_ip_reg());
1499 brw_set_src0(p, insn, brw_ip_reg());
1500 brw_set_src1(p, insn, brw_imm_d(0x0));
1501 insn->bits3.if_else.pad0 = 0;
1502 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1504 insn->header.compression_control = BRW_COMPRESSION_NONE;
1505 insn->header.execution_size = BRW_EXECUTE_8;
/* Emit a CONTINUE instruction in the gen6 form.
 *
 * The instruction is uncompressed with execution size 8 and src1 is an
 * immediate 0. No jump distance or pop count is written here.
 *
 * NOTE(review): dest and src0 are set twice, first to the null register
 * and then to the IP register, so the second pair of calls determines the
 * final operands. Confirm whether the first pair is intentional before
 * removing either.
 */
1510 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1512 struct brw_instruction *insn;
1514 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1515 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1516 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1517 brw_set_dest(p, insn, brw_ip_reg());
1518 brw_set_src0(p, insn, brw_ip_reg());
1519 brw_set_src1(p, insn, brw_imm_d(0x0));
1521 insn->header.compression_control = BRW_COMPRESSION_NONE;
1522 insn->header.execution_size = BRW_EXECUTE_8;
/* Emit a CONTINUE instruction using the IP register as destination and
 * src0, with an immediate 0 as src1.
 *
 * The instruction is uncompressed with execution size 8. Its pop_count is
 * set to the number of IFs currently open inside the innermost loop
 * (p->if_depth_in_loop). The jump distance is not written here.
 */
1526 struct brw_instruction *brw_CONT(struct brw_compile *p)
1528 struct brw_instruction *insn;
1529 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1530 brw_set_dest(p, insn, brw_ip_reg());
1531 brw_set_src0(p, insn, brw_ip_reg());
1532 brw_set_src1(p, insn, brw_imm_d(0x0));
1533 insn->header.compression_control = BRW_COMPRESSION_NONE;
1534 insn->header.execution_size = BRW_EXECUTE_8;
1535 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1536 insn->bits3.if_else.pad0 = 0;
1537 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
/* Emit a HALT instruction with null destination and src0.
 *
 * src1 is an immediate 0 that holds UIP and JIP, which are filled in later.
 * Execution size is BRW_EXECUTE_16 when p->compressed is set. The
 * uncompressed BRW_EXECUTE_8 setup is presumably the other branch; the
 * `else` line is not visible here.
 */
1541 struct brw_instruction *gen6_HALT(struct brw_compile *p)
1543 struct brw_instruction *insn;
1545 insn = next_insn(p, BRW_OPCODE_HALT);
1546 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1550 if (p->compressed) {
1551 insn->header.execution_size = BRW_EXECUTE_16;
1553 insn->header.compression_control = BRW_COMPRESSION_NONE;
1554 insn->header.execution_size = BRW_EXECUTE_8;
1561 * The DO/WHILE is just an unterminated loop -- break or continue are
1562 * used for control within the loop. We have a few ways they can be
1565 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1566 * jip and no DO instruction.
1568 * For non-uniform control flow pre-gen6, there's a DO instruction to
1569 * push the mask, and a WHILE to jump back, and BREAK to get out and
1572 * For gen6, there's no more mask stack, so no need for DO. WHILE
1573 * just points back to the first instruction of the loop.
/* Open a loop.
 *
 * p             compile context
 * execute_size  execution size written to the DO instruction when one is
 *               emitted
 *
 * On gen6+ or in single program flow mode no instruction is emitted: the
 * position of the next instruction to be emitted is pushed onto the loop
 * stack and returned. Otherwise a DO instruction with null operands, no
 * compression and no predication is emitted and pushed onto the loop
 * stack.
 */
1575 struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1577 struct intel_context *intel = &p->brw->intel;
1579 if (intel->gen >= 6 || p->single_program_flow) {
1580 push_loop_stack(p, &p->store[p->nr_insn]);
1581 return &p->store[p->nr_insn];
1583 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1585 push_loop_stack(p, insn);
1587 /* Override the defaults for this instruction:
1589 brw_set_dest(p, insn, brw_null_reg());
1590 brw_set_src0(p, insn, brw_null_reg());
1591 brw_set_src1(p, insn, brw_null_reg());
1593 insn->header.compression_control = BRW_COMPRESSION_NONE;
1594 insn->header.execution_size = execute_size;
1595 insn->header.predicate_control = BRW_PREDICATE_NONE;
1596 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1597 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1604 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1607 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1608 * nesting, since it can always just point to the end of the block/current loop.
/* Patch the jump counts of BREAK and CONTINUE instructions in the innermost
 * loop once its WHILE has been emitted.
 *
 * p           compile context
 * while_inst  the WHILE instruction that closes the loop
 *
 * Walks backwards from the instruction before while_inst down to, but not
 * including, the loop-start instruction. Only instructions whose jump_count
 * is still 0 are touched. A BREAK is pointed one instruction past the
 * WHILE, a CONTINUE at the WHILE itself. Distances are scaled by 2 on gen5
 * and by 1 otherwise.
 */
1611 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1613 struct intel_context *intel = &p->brw->intel;
1614 struct brw_instruction *do_inst = get_inner_do_insn(p);
1615 struct brw_instruction *inst;
1616 int br = (intel->gen == 5) ? 2 : 1;
1618 for (inst = while_inst - 1; inst != do_inst; inst--) {
1619 /* If the jump count is != 0, that means that this instruction has already
1620 * been patched because it's part of a loop inside of the one we're
1623 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1624 inst->bits3.if_else.jump_count == 0) {
1625 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1626 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1627 inst->bits3.if_else.jump_count == 0) {
1628 inst->bits3.if_else.jump_count = br * (while_inst - inst);
/* Close the innermost loop by emitting its backward jump.
 *
 * Four forms are visible below:
 *  - gen7+: WHILE with null operands; JIP holds the scaled distance back to
 *    the loop start.
 *  - gen6: WHILE with an immediate word 0 destination; jump_count holds the
 *    scaled distance back to the loop start.
 *  - single program flow: a scalar ADD on the IP register whose immediate
 *    is the distance in instructions times 16.
 *  - otherwise: WHILE on the IP register that inherits the DO's execution
 *    size and jumps to the instruction after the DO; pending BREAK/CONTINUE
 *    instructions are then patched via brw_patch_break_cont().
 *
 * The last two forms presumably sit in the pre-gen6 branch; the `else`
 * lines are not visible here. The scale factor `br` is declared and set on
 * lines not visible here, presumably to 2 on gen5+.
 *
 * Afterwards the instruction is marked uncompressed, the default predicate
 * in p->current is cleared and the loop stack is popped.
 */
1633 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1635 struct intel_context *intel = &p->brw->intel;
1636 struct brw_instruction *insn, *do_insn;
1639 if (intel->gen >= 5)
1642 if (intel->gen >= 7) {
1643 insn = next_insn(p, BRW_OPCODE_WHILE);
1644 do_insn = get_inner_do_insn(p);
1646 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648 brw_set_src1(p, insn, brw_imm_ud(0));
1649 insn->bits3.break_cont.jip = br * (do_insn - insn);
1651 insn->header.execution_size = BRW_EXECUTE_8;
1652 } else if (intel->gen == 6) {
1653 insn = next_insn(p, BRW_OPCODE_WHILE);
1654 do_insn = get_inner_do_insn(p);
1656 brw_set_dest(p, insn, brw_imm_w(0));
1657 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1658 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1659 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1661 insn->header.execution_size = BRW_EXECUTE_8;
1663 if (p->single_program_flow) {
1664 insn = next_insn(p, BRW_OPCODE_ADD);
1665 do_insn = get_inner_do_insn(p);
1667 brw_set_dest(p, insn, brw_ip_reg());
1668 brw_set_src0(p, insn, brw_ip_reg());
1669 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1670 insn->header.execution_size = BRW_EXECUTE_1;
1672 insn = next_insn(p, BRW_OPCODE_WHILE);
1673 do_insn = get_inner_do_insn(p);
1675 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1677 brw_set_dest(p, insn, brw_ip_reg());
1678 brw_set_src0(p, insn, brw_ip_reg());
1679 brw_set_src1(p, insn, brw_imm_d(0));
1681 insn->header.execution_size = do_insn->header.execution_size;
1682 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1683 insn->bits3.if_else.pop_count = 0;
1684 insn->bits3.if_else.pad0 = 0;
1686 brw_patch_break_cont(p, insn);
1689 insn->header.compression_control = BRW_COMPRESSION_NONE;
1690 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1692 p->loop_stack_depth--;
/* Resolve a previously emitted forward JMPI so that it lands on the next
 * instruction to be emitted.
 *
 * p             compile context
 * jmp_insn_idx  index into p->store of the JMPI to patch; it must have an
 *               immediate src1
 *
 * The immediate is set to the number of instructions between the JMPI and
 * the current end of the program, scaled by `jmpi`. That factor is
 * declared and set on lines not visible here, presumably 1 by default and
 * 2 on gen5+.
 */
1700 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1702 struct intel_context *intel = &p->brw->intel;
1703 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1706 if (intel->gen >= 5)
1709 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1710 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1712 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1717 /* To integrate with the above, it makes sense that the comparison
1718 * instruction should populate the flag register. It might be simpler
1719 * just to use the flag reg for most WM tasks?
/* Emit a CMP instruction.
 *
 * p            compile context
 * dest         destination register
 * conditional  conditional modifier, stored in destreg__conditionalmod
 * src0, src1   operands to compare
 *
 * When the destination is in the architecture register file (the rest of
 * that condition is on a line not visible here), the default state in
 * p->current is switched to normal predication and p->flag_value is set to
 * 0xff, so instructions emitted afterwards are predicated on the result.
 */
1721 void brw_CMP(struct brw_compile *p,
1722 struct brw_reg dest,
1723 unsigned conditional,
1724 struct brw_reg src0,
1725 struct brw_reg src1)
1727 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1729 insn->header.destreg__conditionalmod = conditional;
1730 brw_set_dest(p, insn, dest);
1731 brw_set_src0(p, insn, src0);
1732 brw_set_src1(p, insn, src1);
1734 /* guess_execution_size(insn, src0); */
1737 /* Make it so that future instructions will use the computed flag
1738 * value until brw_set_predicate_control_flag_value() is called
1741 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1743 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1744 p->flag_value = 0xff;
1748 /* Issue 'wait' instruction for n1, host could program MMIO
1749 to wake up thread. */
/* Emit a WAIT instruction on notification register 1.
 *
 * The notification register is used as both destination and src0, and src1
 * is the null register. Execution size, predicate control and compression
 * control are all forced to 0.
 */
1750 void brw_WAIT (struct brw_compile *p)
1752 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1753 struct brw_reg src = brw_notification_1_reg();
1755 brw_set_dest(p, insn, src);
1756 brw_set_src0(p, insn, src);
1757 brw_set_src1(p, insn, brw_null_reg());
1758 insn->header.execution_size = 0; /* must */
1759 insn->header.predicate_control = 0;
1760 insn->header.compression_control = 0;
1764 /***********************************************************************
1765 * Helpers for the various SEND message types:
1768 /** Extended math function, float[8].
/* Emit a one-operand extended math operation.
 *
 * Part of the parameter list is on lines not visible here; the body uses
 * `function` (the math function code) and `src` (the operand) in addition
 * to the parameters shown.
 *
 * On gen6+ a MATH instruction is emitted with the function code stored in
 * destreg__conditionalmod and src1 set to the null register. Assertions
 * require GRF operands, a unit horizontal stride on the destination (and
 * on the source on gen6), no source negation on gen6, a non-float source
 * for the integer division functions and a float source otherwise.
 *
 * Before gen6 a SEND is emitted instead: predication is cleared,
 * msg_reg_nr is stored in destreg__conditionalmod and the message
 * descriptor is filled in by brw_set_math_message().
 */
1770 void brw_math( struct brw_compile *p,
1771 struct brw_reg dest,
1773 unsigned msg_reg_nr,
1776 unsigned precision )
1778 struct intel_context *intel = &p->brw->intel;
1780 if (intel->gen >= 6) {
1781 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1783 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1784 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1786 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1787 if (intel->gen == 6)
1788 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1790 /* Source modifiers are ignored for extended math instructions on Gen6. */
1791 if (intel->gen == 6) {
1792 assert(!src.negate);
1796 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1797 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1798 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1799 assert(src.type != BRW_REGISTER_TYPE_F);
1801 assert(src.type == BRW_REGISTER_TYPE_F);
1804 /* Math is the same ISA format as other opcodes, except that CondModifier
1805 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1807 insn->header.destreg__conditionalmod = function;
1809 brw_set_dest(p, insn, dest);
1810 brw_set_src0(p, insn, src);
1811 brw_set_src1(p, insn, brw_null_reg());
1813 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1815 /* Example code doesn't set predicate_control for send
1818 insn->header.predicate_control = 0;
1819 insn->header.destreg__conditionalmod = msg_reg_nr;
1821 brw_set_dest(p, insn, dest);
1822 brw_set_src0(p, insn, src);
1823 brw_set_math_message(p,
1826 src.type == BRW_REGISTER_TYPE_D,
1832 /** Extended math function, float[8].
/* Emit a two-operand extended math operation as a MATH instruction.
 *
 * Only valid on gen6+ (asserted). One parameter line is not visible here;
 * the body uses `function` as the math function code, which is stored in
 * destreg__conditionalmod.
 *
 * Assertions require GRF operands, a unit horizontal stride on the
 * destination (and on both sources on gen6), no source negation on gen6,
 * non-float sources for the integer division functions and float sources
 * otherwise.
 */
1834 void brw_math2(struct brw_compile *p,
1835 struct brw_reg dest,
1837 struct brw_reg src0,
1838 struct brw_reg src1)
1840 struct intel_context *intel = &p->brw->intel;
1841 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1843 assert(intel->gen >= 6);
1847 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1848 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1849 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1851 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1852 if (intel->gen == 6) {
1853 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1854 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1857 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1858 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1859 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1860 assert(src0.type != BRW_REGISTER_TYPE_F);
1861 assert(src1.type != BRW_REGISTER_TYPE_F);
1863 assert(src0.type == BRW_REGISTER_TYPE_F);
1864 assert(src1.type == BRW_REGISTER_TYPE_F);
1867 /* Source modifiers are ignored for extended math instructions on Gen6. */
1868 if (intel->gen == 6) {
1869 assert(!src0.negate);
1871 assert(!src1.negate);
1875 /* Math is the same ISA format as other opcodes, except that CondModifier
1876 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1878 insn->header.destreg__conditionalmod = function;
1880 brw_set_dest(p, insn, dest);
1881 brw_set_src0(p, insn, src0);
1882 brw_set_src1(p, insn, src1);
1887 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1888 * using a constant offset per channel.
1890 * The offset must be aligned to oword size (16 bytes). Used for
1891 * register spilling.
/* Emit the instruction sequence for an OWORD block write with binding
 * table index 255 (stateless).
 *
 * Most of the parameter list is on lines not visible here; the body uses
 * `mrf` (message register holding the header), `num_regs` and `offset`.
 *
 * Steps visible below:
 *  1. Choose a 2- or 4-OWORD message control from num_regs.
 *  2. With mask disabled and compression off, copy g0 into the message
 *     register and write `offset` into element 2 of that register.
 *  3. Emit an uncompressed, unpredicated SEND with a data port write
 *     message that carries a header.
 *
 * On gen6+ the destination is the null register and no commit message is
 * requested; in the other branch send_commit_msg is 1 (the destination
 * assignment for that branch is not visible here).
 */
1893 void brw_oword_block_write_scratch(struct brw_compile *p,
1898 struct intel_context *intel = &p->brw->intel;
1899 uint32_t msg_control, msg_type;
1902 if (intel->gen >= 6)
1905 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1907 if (num_regs == 1) {
1908 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1911 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1915 /* Set up the message header. This is g0, with g0.2 filled with
1916 * the offset. We don't want to leave our offset around in g0 or
1917 * it'll screw up texture samples, so set it up inside the message
1921 brw_push_insn_state(p);
1922 brw_set_mask_control(p, BRW_MASK_DISABLE);
1923 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1925 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1927 /* set message header global offset field (reg 0, element 2) */
1929 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1931 2), BRW_REGISTER_TYPE_UD),
1932 brw_imm_ud(offset));
1934 brw_pop_insn_state(p);
1938 struct brw_reg dest;
1939 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1940 int send_commit_msg;
1941 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1942 BRW_REGISTER_TYPE_UW);
1944 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1945 insn->header.compression_control = BRW_COMPRESSION_NONE;
1946 src_header = vec16(src_header);
1948 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1949 insn->header.destreg__conditionalmod = mrf.nr;
1951 /* Until gen6, writes followed by reads from the same location
1952 * are not guaranteed to be ordered unless write_commit is set.
1953 * If set, then a no-op write is issued to the destination
1954 * register to set a dependency, and a read from the destination
1955 * can be used to ensure the ordering.
1957 * For gen6, only writes between different threads need ordering
1958 * protection. Our use of DP writes is all about register
1959 * spilling within a thread.
1961 if (intel->gen >= 6) {
1962 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1963 send_commit_msg = 0;
1966 send_commit_msg = 1;
1969 brw_set_dest(p, insn, dest);
1970 if (intel->gen >= 6) {
1971 brw_set_src0(p, insn, mrf);
1973 brw_set_src0(p, insn, brw_null_reg());
1976 if (intel->gen >= 6)
1977 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1979 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1981 brw_set_dp_write_message(p,
1983 255, /* binding table index (255=stateless) */
1987 true, /* header_present */
1988 0, /* not a render target */
1989 send_commit_msg, /* response_length */
1997 * Read a block of owords (half a GRF each) from the scratch buffer
1998 * using a constant index per channel.
2000 * Offset must be aligned to oword size (16 bytes). Used for register
/* Emit the instruction sequence for an OWORD block read with binding table
 * index 255 (stateless), targeting the render cache.
 *
 * Part of the parameter list is on lines not visible here; the body uses
 * `mrf` (message register holding the header), `num_regs` and `offset` in
 * addition to `dest`.
 *
 * With compression off and mask disabled, g0 is copied into the message
 * register and `offset` is written into element 2 of it. An uncompressed
 * SEND is then emitted; it must be unpredicated (asserted). The
 * destination is retyped to UW. src0 is the message register on gen6+ and
 * the null register in the other branch.
 */
2004 brw_oword_block_read_scratch(struct brw_compile *p,
2005 struct brw_reg dest,
2010 struct intel_context *intel = &p->brw->intel;
2011 uint32_t msg_control;
2014 if (intel->gen >= 6)
2017 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2020 if (num_regs == 1) {
2021 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2024 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2029 brw_push_insn_state(p);
2030 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2031 brw_set_mask_control(p, BRW_MASK_DISABLE);
2033 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2035 /* set message header global offset field (reg 0, element 2) */
2037 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2039 2), BRW_REGISTER_TYPE_UD),
2040 brw_imm_ud(offset));
2042 brw_pop_insn_state(p);
2046 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2048 assert(insn->header.predicate_control == 0);
2049 insn->header.compression_control = BRW_COMPRESSION_NONE;
2050 insn->header.destreg__conditionalmod = mrf.nr;
2052 brw_set_dest(p, insn, dest); /* UW? */
2053 if (intel->gen >= 6) {
2054 brw_set_src0(p, insn, mrf);
2056 brw_set_src0(p, insn, brw_null_reg());
2059 brw_set_dp_read_message(p,
2061 255, /* binding table index (255=stateless) */
2063 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2064 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2066 true, /* header_present */
2072 * Read a float[4] vector from the data port Data Cache (const buffer).
2073 * Location (in buffer) should be a multiple of 16.
2074 * Used for fetching shader constants.
/* Emit the instruction sequence for a one-OWORD block read through the
 * data cache, using the given binding table entry.
 *
 * Part of the parameter list is on lines not visible here; the body uses
 * `mrf` (message register holding the header) and `offset` in addition to
 * `dest` and `bind_table_index`.
 *
 * The whole sequence runs with predication off, compression off and mask
 * disabled; the previous instruction state is restored at the end. g0 is
 * copied into the message register, `offset` is written into element 2 of
 * it, and a SEND is emitted with the destination viewed as a vec8 of UW
 * and a response length of one register.
 */
2076 void brw_oword_block_read(struct brw_compile *p,
2077 struct brw_reg dest,
2080 uint32_t bind_table_index)
2082 struct intel_context *intel = &p->brw->intel;
2084 /* On newer hardware, offset is in units of owords. */
2085 if (intel->gen >= 6)
2088 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2090 brw_push_insn_state(p);
2091 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2092 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093 brw_set_mask_control(p, BRW_MASK_DISABLE);
2095 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2097 /* set message header global offset field (reg 0, element 2) */
2099 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2101 2), BRW_REGISTER_TYPE_UD),
2102 brw_imm_ud(offset));
2104 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105 insn->header.destreg__conditionalmod = mrf.nr;
2107 /* cast dest to a uword[8] vector */
2108 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2110 brw_set_dest(p, insn, dest);
2111 if (intel->gen >= 6) {
2112 brw_set_src0(p, insn, mrf);
2114 brw_set_src0(p, insn, brw_null_reg());
2117 brw_set_dp_read_message(p,
2120 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2121 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2122 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2124 true, /* header_present */
2125 1); /* response_length (1 reg, 2 owords!) */
2127 brw_pop_insn_state(p);
/* Emit a render target write message.
 *
 * Part of the parameter list and the msg_type declaration are on lines not
 * visible here; the body also uses `dispatch_width` and `eot`.
 *
 * The destination is the null register, vec16 when dispatch_width is 16 and
 * vec8 in the other branch. gen6+ uses SENDC, takes src0 from the message
 * register msg_reg_nr and uses the GEN6 render target write message type.
 * The other branch uses SEND, stores msg_reg_nr in
 * destreg__conditionalmod and uses the BRW message type. The instruction
 * is unpredicated and uncompressed, and no commit message is requested.
 */
2131 void brw_fb_WRITE(struct brw_compile *p,
2133 unsigned msg_reg_nr,
2134 struct brw_reg src0,
2135 unsigned msg_control,
2136 unsigned binding_table_index,
2137 unsigned msg_length,
2138 unsigned response_length,
2140 bool header_present)
2142 struct intel_context *intel = &p->brw->intel;
2143 struct brw_instruction *insn;
2145 struct brw_reg dest;
2147 if (dispatch_width == 16)
2148 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2150 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2152 if (intel->gen >= 6) {
2153 insn = next_insn(p, BRW_OPCODE_SENDC);
2155 insn = next_insn(p, BRW_OPCODE_SEND);
2157 /* The execution mask is ignored for render target writes. */
2158 insn->header.predicate_control = 0;
2159 insn->header.compression_control = BRW_COMPRESSION_NONE;
2161 if (intel->gen >= 6) {
2162 /* headerless version, just submit color payload */
2163 src0 = brw_message_reg(msg_reg_nr);
2165 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2167 insn->header.destreg__conditionalmod = msg_reg_nr;
2169 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2172 brw_set_dest(p, insn, dest);
2173 brw_set_src0(p, insn, src0);
2174 brw_set_dp_write_message(p,
2176 binding_table_index,
2181 eot, /* last render target write */
2184 0 /* send_commit_msg */);
2189 * Texture sample instruction.
2190 * Note: the msg_type plus msg_length values determine exactly what kind
2191 * of sampling operation is performed. See volume 4, page 161 of docs.
// Emits a sampler SEND instruction, plus extra instructions around it that
// the visible code associates with a writemask other than XYZW:
//  - before the SEND, a message header is built in message register
//    msg_reg_nr (a copy of g0 with a mask value written to element 2),
//    src0 is replaced by the null register, dest is advanced by dst_offset
//    and response_length is recomputed from len;
//  - after the SEND, a MOV of the last response register onto itself.
// NOTE(review): this listing omits several lines of the function (the
// writemask parameter, both loop bodies, branch keywords and the tests
// guarding some statements), so the exact conditions under which each
// statement runs should be confirmed against the complete file.
2193 void brw_SAMPLE(struct brw_compile *p,
2194 struct brw_reg dest,
2195 unsigned msg_reg_nr,
2196 struct brw_reg src0,
2197 unsigned binding_table_index,
2201 unsigned response_length,
2202 unsigned msg_length,
2203 unsigned header_present,
2205 unsigned return_format)
2207 struct intel_context *intel = &p->brw->intel;
2208 bool need_stall = 0;
2210 if (writemask == 0) {
2211 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2215 /* Hardware doesn't do destination dependency checking on send
2216 * instructions properly. Add a workaround which generates the
2217 * dependency by other means. In practice it seems like this bug
2218 * only crops up for texture samples, and only where registers are
2219 * written by the send and then written again later without being
2220 * read in between. Luckily for us, we already track that
2221 * information and use it to modify the writemask for the
2222 * instruction, so that is a guide for whether a workaround is
2225 if (writemask != BRW_WRITEMASK_XYZW) {
2226 unsigned dst_offset = 0;
2227 unsigned i, newmask = 0, len = 0;
// Two scans over the four writemask bits; the second continues from the
// index where the first stopped. The loop bodies are not visible here.
2229 for (i = 0; i < 4; i++) {
2230 if (writemask & (1<<i))
2234 for (; i < 4; i++) {
2235 if (!(writemask & (1<<i)))
2241 if (newmask != writemask) {
2243 /* printf("need stall %x %x\n", newmask , writemask); */
2246 bool dispatch_16 = false;
2248 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
// guess_execution_size writes the execution size into p->current, which
// is then read back to detect 16-wide execution.
2250 guess_execution_size(p, p->current, dest);
2251 if (p->current->header.execution_size == BRW_EXECUTE_16)
// Invert newmask, keeping only the four XYZW bits.
2254 newmask = ~newmask & BRW_WRITEMASK_XYZW;
2256 brw_push_insn_state(p);
2258 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2259 brw_set_mask_control(p, BRW_MASK_DISABLE);
// Build the header: m1 receives g0 (both viewed as UD), then element 2 of
// m1 receives the inverted mask shifted left by 12.
2261 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2262 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2263 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2265 brw_pop_insn_state(p);
2267 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2268 dest = offset(dest, dst_offset);
2270 /* For 16-wide dispatch, masked channels are skipped in the
2271 * response. For 8-wide, masked channels still take up slots,
2272 * and are just not written to.
2275 response_length = len * 2;
2280 struct brw_instruction *insn;
2282 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2284 insn = next_insn(p, BRW_OPCODE_SEND);
2285 insn->header.predicate_control = 0; /* XXX */
2286 insn->header.compression_control = BRW_COMPRESSION_NONE;
2288 insn->header.destreg__conditionalmod = msg_reg_nr;
2290 brw_set_dest(p, insn, dest);
2291 brw_set_src0(p, insn, src0);
2292 brw_set_sampler_message(p, insn,
2293 binding_table_index,
// reg is the last register of the response: dest advanced by
// response_length - 1, viewed as a vec8. It is moved onto itself below.
2304 struct brw_reg reg = vec8(offset(dest, response_length-1));
2306 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2308 brw_push_insn_state(p);
2309 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2310 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2311 retype(reg, BRW_REGISTER_TYPE_UD));
2312 brw_pop_insn_state(p);
2317 /* All these variables are pretty confusing - we might be better off
2318 * using bitmasks and macros for this, in the old style. Or perhaps
2319 * just having the caller instantiate the fields in dword3 itself.
// Emits a URB write as a SEND instruction with an immediate 0 as src1.
// On gen7 an OR instruction is emitted first, in align1 mode, to set bits
// in the message header (see the comment inside the gen7 branch).
// NOTE(review): some parameters and the remaining arguments of the final
// brw_set_urb_message call are not visible in this listing.
2321 void brw_urb_WRITE(struct brw_compile *p,
2322 struct brw_reg dest,
2323 unsigned msg_reg_nr,
2324 struct brw_reg src0,
2327 unsigned msg_length,
2328 unsigned response_length,
2330 bool writes_complete,
2334 struct intel_context *intel = &p->brw->intel;
2335 struct brw_instruction *insn;
2337 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2339 if (intel->gen == 7) {
2340 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2341 brw_push_insn_state(p);
2342 brw_set_access_mode(p, BRW_ALIGN_1);
// Operands: element 5 of message register msg_reg_nr, element 5 of GRF 0,
// and the immediate 0xff00, all typed UD (presumably destination first,
// then the two sources - TODO confirm against brw_OR).
2343 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2344 BRW_REGISTER_TYPE_UD),
2345 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2346 brw_imm_ud(0xff00));
2347 brw_pop_insn_state(p);
2350 insn = next_insn(p, BRW_OPCODE_SEND);
// Sanity check on the message length against the BRW_MAX_MRF limit.
2352 assert(msg_length < BRW_MAX_MRF);
2354 brw_set_dest(p, insn, dest);
2355 brw_set_src0(p, insn, src0);
2356 brw_set_src1(p, insn, brw_imm_d(0));
// The message register number is stored in the destreg__conditionalmod
// header field; any generation test guarding this line is not visible.
2359 insn->header.destreg__conditionalmod = msg_reg_nr;
2361 brw_set_urb_message(p,
// Iterator step used by the instruction walks in this file: given the
// offset ip of an instruction inside p->store, yields the offset of the
// one after it. The instruction is located with byte-wise arithmetic on a
// void pointer (a GNU C extension) and its header.cmpt_control bit is
// tested.
// NOTE(review): the return statements are not visible in this listing, so
// the step sizes for each case should be confirmed in the complete file.
2374 next_ip(struct brw_compile *p, int ip)
2376 struct brw_instruction *insn = (void *)p->store + ip;
2378 if (insn->header.cmpt_control)
// Walks the instructions that follow offset start, stepping with next_ip
// until p->next_insn_offset, and looks for the first ENDIF, ELSE, WHILE or
// HALT opcode.
// NOTE(review): the return statements are not visible in this listing.
// The caller brw_set_uip_jip compares the result against 0, which
// suggests 0 means nothing was found - TODO confirm.
2385 brw_find_next_block_end(struct brw_compile *p, int start)
2388 void *store = p->store;
2390 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2391 struct brw_instruction *insn = store + ip;
2393 switch (insn->header.opcode) {
2394 case BRW_OPCODE_ENDIF:
2395 case BRW_OPCODE_ELSE:
2396 case BRW_OPCODE_WHILE:
2397 case BRW_OPCODE_HALT:
2405 /* There is no DO instruction on gen6, so to find the end of the loop
2406 * we have to see if the loop is jumping back before our start
// Searches forward from the instruction after start for a WHILE whose
// jump lands at or before start, that is ip + jip * scale <= start.
// The jump distance is read from bits1.branch_gen6.jump_count on gen6 and
// from bits3.break_cont.jip on other generations.
// Falling out of the loop without a match trips the assert at the end.
// NOTE(review): the declarations of ip and scale and the return statement
// are not visible in this listing.
2410 brw_find_loop_end(struct brw_compile *p, int start)
2412 struct intel_context *intel = &p->brw->intel;
2415 void *store = p->store;
2417 /* Always start after the instruction (such as a WHILE) we're trying to fix
2420 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2421 struct brw_instruction *insn = store + ip;
2423 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2424 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2425 : insn->bits3.break_cont.jip;
2426 if (ip + jip * scale <= start)
2430 assert(!"not reached");
2434 /* After program generation, go back and update the UIP and JIP of
2435 * BREAK, CONT, and HALT instructions to their correct locations.
// Walks every instruction in p->store (stepping with next_ip) and fills in
// the bits3.break_cont jump fields of BREAK, CONTINUE, ENDIF and HALT
// instructions. Distances are computed as offset differences divided by
// scale.
// NOTE(review): the declarations of ip and scale, any early exit, and the
// break statements between cases are not visible in this listing.
2438 brw_set_uip_jip(struct brw_compile *p)
2440 struct intel_context *intel = &p->brw->intel;
2443 void *store = p->store;
2448 for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2449 struct brw_instruction *insn = store + ip;
2451 if (insn->header.cmpt_control) {
2452 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2453 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2454 insn->header.opcode != BRW_OPCODE_CONTINUE &&
2455 insn->header.opcode != BRW_OPCODE_HALT);
// Offset of the next ENDIF, ELSE, WHILE or HALT after this instruction.
// The cases below treat 0 as a special value: BREAK and CONTINUE assert
// against it, ENDIF and HALT take a fallback path.
2459 int block_end_ip = brw_find_next_block_end(p, ip);
2460 switch (insn->header.opcode) {
2461 case BRW_OPCODE_BREAK:
2462 assert(block_end_ip != 0);
2463 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2464 /* Gen7 UIP points to WHILE; Gen6 points just after it */
// The extra 16 on gen6 is added to the offset difference before the
// division by scale.
2465 insn->bits3.break_cont.uip =
2466 (brw_find_loop_end(p, ip) - ip +
2467 (intel->gen == 6 ? 16 : 0)) / scale;
2469 case BRW_OPCODE_CONTINUE:
2470 assert(block_end_ip != 0);
2471 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
// Unlike BREAK, no gen6 adjustment is applied to the CONTINUE uip.
2472 insn->bits3.break_cont.uip =
2473 (brw_find_loop_end(p, ip) - ip) / scale;
2475 assert(insn->bits3.break_cont.uip != 0);
2476 assert(insn->bits3.break_cont.jip != 0);
2479 case BRW_OPCODE_ENDIF:
// With no block end found, jip is set to the constant 2.
2480 if (block_end_ip == 0)
2481 insn->bits3.break_cont.jip = 2;
2483 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2486 case BRW_OPCODE_HALT:
2487 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2489 * "In case of the halt instruction not inside any conditional
2490 * code block, the value of <JIP> and <UIP> should be the
2491 * same. In case of the halt instruction inside conditional code
2492 * block, the <UIP> should be the end of the program, and the
2493 * <JIP> should be end of the most inner conditional code block."
2495 * The uip will have already been set by whoever set up the
2498 if (block_end_ip == 0) {
2499 insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2501 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2503 assert(insn->bits3.break_cont.uip != 0);
2504 assert(insn->bits3.break_cont.jip != 0);
// Emits a SEND instruction carrying an FF_SYNC message.
// Steps visible here: resolve the implied move of src0 into message
// register msg_reg_nr, allocate the SEND, set dest and src0, set src1 to
// an immediate 0, record msg_reg_nr in the destreg__conditionalmod header
// field, then fill the descriptor through brw_set_ff_sync_message.
// NOTE(review): some parameters, any generation test guarding the header
// field write, and the remaining arguments of the final call are not
// visible in this listing.
2510 void brw_ff_sync(struct brw_compile *p,
2511 struct brw_reg dest,
2512 unsigned msg_reg_nr,
2513 struct brw_reg src0,
2515 unsigned response_length,
2518 struct intel_context *intel = &p->brw->intel;
2519 struct brw_instruction *insn;
2521 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2523 insn = next_insn(p, BRW_OPCODE_SEND);
2524 brw_set_dest(p, insn, dest);
2525 brw_set_src0(p, insn, src0);
2526 brw_set_src1(p, insn, brw_imm_d(0));
2529 insn->header.destreg__conditionalmod = msg_reg_nr;
2531 brw_set_ff_sync_message(p,
2539 * Emit the SEND instruction necessary to generate stream output data on Gen6
2540 * (for transform feedback).
2542 * If send_commit_msg is true, this is the last piece of stream output data
2543 * from this thread, so send the data as a committed write. According to the
2544 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2546 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547 * writes are complete by sending the final write as a committed write."
// Emits a SEND with a GEN6 streamed vertex buffer write message.
// Parameters visible here: dest and src0 are the SEND operands,
// msg_reg_nr is the message register used to resolve the implied move of
// src0, binding_table_index selects the surface, and send_commit_msg
// requests a committed write.
2550 brw_svb_write(struct brw_compile *p,
2551 struct brw_reg dest,
2552 unsigned msg_reg_nr,
2553 struct brw_reg src0,
2554 unsigned binding_table_index,
2555 bool send_commit_msg)
2557 struct brw_instruction *insn;
2559 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2561 insn = next_insn(p, BRW_OPCODE_SEND);
2562 brw_set_dest(p, insn, dest);
2563 brw_set_src0(p, insn, src0);
2564 brw_set_src1(p, insn, brw_imm_d(0));
// send_commit_msg is passed twice below: once in the response_length
// position (the bool converts to 1 for a committed write and 0 otherwise)
// and once as the commit flag itself.
2565 brw_set_dp_write_message(p, insn,
2566 binding_table_index,
2567 0, /* msg_control: ignored */
2568 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2570 true, /* header_present */
2571 0, /* last_render_target: ignored */
2572 send_commit_msg, /* response_length */
2573 0, /* end_of_thread */
2574 send_commit_msg); /* send_commit_msg */
2578 * This instruction is generated as a single-channel align1 instruction by
2579 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2581 * We can't use the typed atomic op in the FS because that has the execution
2582 * mask ANDed with the pixel mask, but we just want to write the one dword for
2585 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
2586 * one u32. So we use the same untyped atomic write message as the pixel
2589 * The untyped atomic operation requires a BUFFER surface type with RAW
2590 * format, and is only accessible through the legacy DATA_CACHE dataport
// Emits a headerless SEND to the gen7 data cache dataport whose descriptor
// encodes an untyped atomic ADD on surface surf_index.
// Asserts gen >= 7. The SEND is allocated in align1 mode with the
// execution mask disabled; that state is pushed before and popped right
// after the allocation.
// NOTE(review): this listing omits some lines (a parameter between p and
// surf_index, the continuation lines of the dest and src0 register
// operands, and the declarations of rlen and eot), and the end of the
// function is not visible.
2593 void brw_shader_time_add(struct brw_compile *p,
2595 uint32_t surf_index)
2597 struct intel_context *intel = &p->brw->intel;
2598 assert(intel->gen >= 7);
2600 brw_push_insn_state(p);
2601 brw_set_access_mode(p, BRW_ALIGN_1);
2602 brw_set_mask_control(p, BRW_MASK_DISABLE);
2603 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2604 brw_pop_insn_state(p);
2606 /* We use brw_vec1_reg and unmasked because we want to increment the given
2609 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2611 brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2614 bool header_present = false;
2616 uint32_t mlen = 2; /* offset, value */
2618 brw_set_message_descriptor(p, send,
2619 GEN7_SFID_DATAPORT_DATA_CACHE,
2620 mlen, rlen, header_present, eot);
// The message-specific descriptor fields are ORed directly into bits3.ud
// on top of what brw_set_message_descriptor wrote. The line ORing in
// 0 << 13 changes nothing and serves to document that field.
2622 send->bits3.ud |= 6 << 14; /* untyped atomic op */
2623 send->bits3.ud |= 0 << 13; /* no return data */
2624 send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2625 send->bits3.ud |= BRW_AOP_ADD << 8;
2626 send->bits3.ud |= surf_index << 0;