2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_eu_compact.c
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
42 #include "brw_compat.h"
43 #include "brw_context.h"
/**
 * Gen6 control-index lookup table (32 entries).
 *
 * When this table is the active control_index_table, set_control_index()
 * scans it for an exact match and stores the matching position in the
 * compact instruction; set_uncompacted_control() reads an entry back by that
 * position.
 *
 * NOTE(review): the initializer rows are not visible in this extract.
 */
46 static const uint32_t gen6_control_index_table[32] = {
/**
 * Gen6 datatype lookup table (declared with 32 entries).
 *
 * Each entry is compared against the key built by set_datatype_index():
 * bits 0..14 of bits1.ud in key bits 0..14, and bits1.ud >> 29 in key bits
 * 15 and up.  Entries are written as 0b binary literals.
 *
 * NOTE(review): only some of the 32 rows are visible in this extract.
 */
81 static const uint32_t gen6_datatype_table[32] = {
100 0b001000000110100101,
101 0b001100011000101001,
102 0b001011011000101100,
103 0b001011010110100101,
104 0b001011110110100101,
105 0b001111011110111101,
106 0b001111011110111100,
107 0b001111011110111101,
108 0b001111011110011101,
109 0b001111011110111110,
110 0b001000000000100001,
111 0b001000000000100010,
112 0b001001111111011101,
113 0b001000001110111110,
/**
 * Gen6 sub-register lookup table (32 entries).
 *
 * Entries are compared against the key built by set_subreg_index():
 * dest_subreg_nr at bit 0, src0_subreg_nr at bit 5, src1_subreg_nr at bit 10.
 *
 * NOTE(review): the initializer rows are not visible in this extract.
 */
116 static const uint32_t gen6_subreg_table[32] = {
/**
 * Gen6 source-operand lookup table (32 entries), shared by src0 and src1.
 *
 * Entries are compared against the 12-bit key that set_src0_index() and
 * set_src1_index() extract with (bitsN.ud >> 13) & 0xfff.
 *
 * NOTE(review): the initializer rows are not visible in this extract.
 */
151 static const uint32_t gen6_src_index_table[32] = {
/**
 * Gen7 control-index lookup table (32 entries).
 *
 * Each entry is a 19-bit value in the layout built by set_control_index():
 * key bits 0..15 come from bits 8..23 of instruction dword 0, key bit 16 from
 * bit 31 of dword 0, and key bits 17..18 from bits 25..26 of dword 2.
 * Entries are written as 0b binary literals.
 */
186 static const uint32_t gen7_control_index_table[32] = {
187 0b0000000000000000010,
188 0b0000100000000000000,
189 0b0000100000000000001,
190 0b0000100000000000010,
191 0b0000100000000000011,
192 0b0000100000000000100,
193 0b0000100000000000101,
194 0b0000100000000000111,
195 0b0000100000000001000,
196 0b0000100000000001001,
197 0b0000100000000001101,
198 0b0000110000000000000,
199 0b0000110000000000001,
200 0b0000110000000000010,
201 0b0000110000000000011,
202 0b0000110000000000100,
203 0b0000110000000000101,
204 0b0000110000000000111,
205 0b0000110000000001001,
206 0b0000110000000001101,
207 0b0000110000000010000,
208 0b0000110000100000000,
209 0b0001000000000000000,
210 0b0001000000000000010,
211 0b0001000000000000100,
212 0b0001000000100000000,
213 0b0010110000000000000,
214 0b0010110000000010000,
215 0b0011000000000000000,
216 0b0011000000100000000,
217 0b0101000000000000000,
218 0b0101000000100000000
/**
 * Gen7 datatype lookup table (declared with 32 entries).
 *
 * Each entry is compared against the key built by set_datatype_index():
 * bits 0..14 of bits1.ud in key bits 0..14, and bits1.ud >> 29 in key bits
 * 15 and up.  Entries are written as 0b binary literals.
 *
 * NOTE(review): 31 rows are visible in this extract; the remaining row and
 * the closing brace are not shown.
 */
221 static const uint32_t gen7_datatype_table[32] = {
222 0b001000000000000001,
223 0b001000000000100000,
224 0b001000000000100001,
225 0b001000000001100001,
226 0b001000000010111101,
227 0b001000001011111101,
228 0b001000001110100001,
229 0b001000001110100101,
230 0b001000001110111101,
231 0b001000010000100001,
232 0b001000110000100000,
233 0b001000110000100001,
234 0b001001010010100101,
235 0b001001110010100100,
236 0b001001110010100101,
237 0b001111001110111101,
238 0b001111011110011101,
239 0b001111011110111100,
240 0b001111011110111101,
241 0b001111111110111100,
242 0b000000001000001100,
243 0b001000000000111101,
244 0b001000000010100101,
245 0b001000010000100000,
246 0b001001010010100100,
247 0b001001110010000100,
248 0b001010010100001001,
249 0b001101111110111101,
250 0b001111111110111101,
251 0b001011110110101100,
252 0b001010010100101000,
/**
 * Gen7 sub-register lookup table (32 entries).
 *
 * Entries are compared against the key built by set_subreg_index():
 * dest_subreg_nr at bit 0, src0_subreg_nr at bit 5, src1_subreg_nr at bit 10.
 *
 * NOTE(review): the initializer rows are not visible in this extract.
 */
256 static const uint32_t gen7_subreg_table[32] = {
/**
 * Gen7 source-operand lookup table (32 entries), shared by src0 and src1.
 *
 * Entries are compared against the 12-bit key that set_src0_index() and
 * set_src1_index() extract with (bitsN.ud >> 13) & 0xfff.
 *
 * NOTE(review): the initializer rows are not visible in this extract.
 */
291 static const uint32_t gen7_src_index_table[32] = {
/*
 * Currently selected lookup tables.  brw_init_compaction_tables() points
 * these at the gen6 or gen7 tables depending on intel->gen; every
 * set_*_index() and set_uncompacted_*() helper reads through them.
 *
 * As file-scope statics without initializers they are null until
 * brw_init_compaction_tables() has assigned them.
 */
326 static const uint32_t *control_index_table;
327 static const uint32_t *datatype_table;
328 static const uint32_t *subreg_table;
329 static const uint32_t *src_index_table;
/**
 * Finds the control-index table entry matching src and records its index.
 *
 * The lookup key is assembled from the raw instruction dwords: bits 8..23 of
 * dword 0 become key bits 0..15, bit 31 of dword 0 becomes key bit 16, and
 * bits 25..26 of dword 2 become key bits 17..18.  The 32 entries of
 * control_index_table are then scanned for an exact match, and the matching
 * position is stored in dst->dw0.control_index.
 *
 * NOTE(review): the return statements and whatever condition guards the
 * dword-2 bits (the comment below ties them to gen7) are not visible in this
 * extract.  The caller tests the result with `!`, so presumably a nonzero
 * value means "match found" -- confirm against the full file.
 */
332 set_control_index(struct intel_context *intel,
333 struct brw_compact_instruction *dst,
334 struct brw_instruction *src)
336 uint32_t *src_u32 = (uint32_t *)src;
337 uint32_t uncompacted = 0;
339 uncompacted |= ((src_u32[0] >> 8) & 0xffff) << 0;
340 uncompacted |= ((src_u32[0] >> 31) & 0x1) << 16;
341 /* On gen7, the flag register number gets integrated into the control
345 uncompacted |= ((src_u32[2] >> 25) & 0x3) << 17;
347 for (int i = 0; i < 32; i++) {
348 if (control_index_table[i] == uncompacted) {
349 dst->dw0.control_index = i;
/**
 * Finds the datatype table entry matching src and records its index in
 * dst->dw0.data_type_index.
 *
 * NOTE(review): the return statements are not visible in this extract; the
 * caller tests the result with `!`, so presumably nonzero means "match
 * found".
 */
358 set_datatype_index(struct brw_compact_instruction *dst,
359 struct brw_instruction *src)
361 uint32_t uncompacted = 0;
/* Key bits 0..14: the low 15 bits of bits1.ud. */
363 uncompacted |= src->bits1.ud & 0x7fff;
/* Key bits 15 and up: bits1.ud >> 29 (its top three bits if ud is 32 bits
 * wide -- TODO confirm the field width). */
364 uncompacted |= (src->bits1.ud >> 29) << 15;
/* Exact-match scan over the 32 table entries. */
366 for (int i = 0; i < 32; i++) {
367 if (datatype_table[i] == uncompacted) {
368 dst->dw0.data_type_index = i;
/**
 * Finds the sub-register table entry matching src and records its index in
 * dst->dw0.sub_reg_index.
 *
 * NOTE(review): the return statements are not visible in this extract; the
 * caller tests the result with `!`, so presumably nonzero means "match
 * found".
 */
377 set_subreg_index(struct brw_compact_instruction *dst,
378 struct brw_instruction *src)
380 uint32_t uncompacted = 0;
/* Pack the three sub-register numbers at bit offsets 0, 5 and 10. */
382 uncompacted |= src->bits1.da1.dest_subreg_nr << 0;
383 uncompacted |= src->bits2.da1.src0_subreg_nr << 5;
384 uncompacted |= src->bits3.da1.src1_subreg_nr << 10;
/* Exact-match scan over the 32 table entries. */
386 for (int i = 0; i < 32; i++) {
387 if (subreg_table[i] == uncompacted) {
388 dst->dw0.sub_reg_index = i;
/**
 * Scans the 32 entries of src_index_table for an exact match with
 * uncompacted.
 *
 * NOTE(review): the second parameter and the loop body's result handling are
 * not visible in this extract.  Both callers pass the address of a uint32_t
 * to receive the index and test the return value with `!`, so presumably
 * nonzero means "match found".
 */
397 get_src_index(uint32_t uncompacted,
400 for (int i = 0; i < 32; i++) {
401 if (src_index_table[i] == uncompacted) {
/**
 * Looks up the compact index for source operand 0 and stores it in dst.
 *
 * The index is split across the two compact dwords: its low two bits go to
 * dst->dw0.src0_index and the remaining bits to dst->dw1.src0_index.
 *
 * NOTE(review): the statement taken when get_src_index() fails and the final
 * return are not visible in this extract.
 */
411 set_src0_index(struct brw_compact_instruction *dst,
412 struct brw_instruction *src)
414 uint32_t compacted, uncompacted = 0;
/* Lookup key: the 12 bits of bits2.ud starting at bit 13. */
416 uncompacted |= (src->bits2.ud >> 13) & 0xfff;
418 if (!get_src_index(uncompacted, &compacted))
421 dst->dw0.src0_index = compacted & 0x3;
422 dst->dw1.src0_index = compacted >> 2;
/**
 * Looks up the compact index for source operand 1 and stores it, whole, in
 * dst->dw1.src1_index.
 *
 * NOTE(review): the statement taken when get_src_index() fails and the final
 * return are not visible in this extract.
 */
428 set_src1_index(struct brw_compact_instruction *dst,
429 struct brw_instruction *src)
431 uint32_t compacted, uncompacted = 0;
/* Lookup key: the 12 bits of bits3.ud starting at bit 13. */
433 uncompacted |= (src->bits3.ud >> 13) & 0xfff;
435 if (!get_src_index(uncompacted, &compacted))
438 dst->dw1.src1_index = compacted;
444 * Tries to compact instruction src into dst.
446 * It doesn't modify dst unless src is compactable, which is relied on by
447 * brw_compact_instructions().
/*
 * Review notes for brw_try_compact_instruction():
 *
 * - Flow-control opcodes (IF, ELSE, ENDIF, HALT, DO, WHILE) and instructions
 *   whose src0 or src1 register file is BRW_IMMEDIATE_VALUE are singled out
 *   before any encoding work; the FINISHME comments mark both as not yet
 *   supported.
 * - The compact encoding is assembled in the zeroed local `temp`: opcode,
 *   debug control, the control/datatype/subreg table indices, accumulator
 *   write control, conditional modifier, flag sub-register, the compaction
 *   bit (cmpt_ctrl = 1), both source indices and the three register numbers.
 * - NOTE(review): the return statements and the final copy of `temp` into
 *   dst are not visible in this extract; the caller uses the result as a
 *   boolean "was compacted" flag.
 */
450 brw_try_compact_instruction(struct brw_compile *p,
451 struct brw_compact_instruction *dst,
452 struct brw_instruction *src)
454 struct brw_context *brw = p->brw;
455 struct intel_context *intel = &brw->intel;
456 struct brw_compact_instruction temp;
458 if (src->header.opcode == BRW_OPCODE_IF ||
459 src->header.opcode == BRW_OPCODE_ELSE ||
460 src->header.opcode == BRW_OPCODE_ENDIF ||
461 src->header.opcode == BRW_OPCODE_HALT ||
462 src->header.opcode == BRW_OPCODE_DO ||
463 src->header.opcode == BRW_OPCODE_WHILE) {
464 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
465 * to be able to handle compacted flow control instructions.
470 /* FINISHME: immediates */
471 if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
472 src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
475 memset(&temp, 0, sizeof(temp));
477 temp.dw0.opcode = src->header.opcode;
478 temp.dw0.debug_control = src->header.debug_control;
479 if (!set_control_index(intel, &temp, src))
481 if (!set_datatype_index(&temp, src))
483 if (!set_subreg_index(&temp, src))
485 temp.dw0.acc_wr_control = src->header.acc_wr_control;
486 temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
488 temp.dw0.flag_subreg_nr = src->bits2.da1.flag_subreg_nr;
489 temp.dw0.cmpt_ctrl = 1;
490 if (!set_src0_index(&temp, src))
492 if (!set_src1_index(&temp, src))
494 temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
495 temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
496 temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
/**
 * Expands the control index of a compact instruction back into the raw
 * dwords of dst; the inverse of the key packing in set_control_index().
 *
 * The table entry's bits 0..15 go to bits 8..23 of dword 0, bit 16 goes to
 * bit 31 of dword 0, and bits 17..18 go to bits 25..26 of dword 2.
 *
 * The fields are OR-ed in rather than assigned, so the affected bits of dst
 * must already be clear; brw_uncompact_instruction() zeroes dst before
 * calling this.
 *
 * NOTE(review): any condition guarding the dword-2 update is not visible in
 * this extract.
 */
504 set_uncompacted_control(struct intel_context *intel,
505 struct brw_instruction *dst,
506 struct brw_compact_instruction *src)
508 uint32_t *dst_u32 = (uint32_t *)dst;
509 uint32_t uncompacted = control_index_table[src->dw0.control_index];
511 dst_u32[0] |= ((uncompacted >> 0) & 0xffff) << 8;
512 dst_u32[0] |= ((uncompacted >> 16) & 0x1) << 31;
515 dst_u32[2] |= ((uncompacted >> 17) & 0x3) << 25;
/**
 * Expands the datatype index of a compact instruction back into
 * dst->bits1.ud; the inverse of the key packing in set_datatype_index().
 *
 * Both target ranges are cleared before being written, so unlike the other
 * set_uncompacted_*() helpers this one does not depend on dst being zeroed.
 *
 * NOTE(review): `0x7 << 29` shifts a signed int into the sign bit, which is
 * undefined behavior where int is 32 bits; `0x7u << 29` would avoid it.
 */
519 set_uncompacted_datatype(struct brw_instruction *dst,
520 struct brw_compact_instruction *src)
522 uint32_t uncompacted = datatype_table[src->dw0.data_type_index];
/* Table bits 15..17 -> bits 29..31 of bits1.ud. */
524 dst->bits1.ud &= ~(0x7 << 29);
525 dst->bits1.ud |= ((uncompacted >> 15) & 0x7) << 29;
/* Table bits 0..14 -> bits 0..14 of bits1.ud. */
526 dst->bits1.ud &= ~0x7fff;
527 dst->bits1.ud |= uncompacted & 0x7fff;
/**
 * Expands the sub-register index of a compact instruction into the three
 * sub-register fields of dst; the inverse of set_subreg_index().
 */
531 set_uncompacted_subreg(struct brw_instruction *dst,
532 struct brw_compact_instruction *src)
534 uint32_t uncompacted = subreg_table[src->dw0.sub_reg_index];
/* Each field is a 5-bit slice of the table entry, at offsets 0, 5 and 10. */
536 dst->bits1.da1.dest_subreg_nr = (uncompacted >> 0) & 0x1f;
537 dst->bits2.da1.src0_subreg_nr = (uncompacted >> 5) & 0x1f;
538 dst->bits3.da1.src1_subreg_nr = (uncompacted >> 10) & 0x1f;
/**
 * Expands the source-0 index of a compact instruction into dst->bits2.ud.
 *
 * The value is OR-ed in at bit 13, so those bits of dst must already be
 * clear; brw_uncompact_instruction() zeroes dst before calling this.
 *
 * NOTE(review): the table is indexed without a range check; this presumably
 * relies on the bitfield widths keeping the index below 32 -- confirm
 * against the struct definition.
 */
542 set_uncompacted_src0(struct brw_instruction *dst,
543 struct brw_compact_instruction *src)
/* Reassemble the index: low two bits from dw0, the rest from dw1. */
545 uint32_t compacted = src->dw0.src0_index | src->dw1.src0_index << 2;
546 uint32_t uncompacted = src_index_table[compacted];
548 dst->bits2.ud |= uncompacted << 13;
/**
 * Expands the source-1 index of a compact instruction into dst->bits3.ud.
 *
 * The value is OR-ed in at bit 13, so those bits of dst must already be
 * clear; brw_uncompact_instruction() zeroes dst before calling this.
 */
552 set_uncompacted_src1(struct brw_instruction *dst,
553 struct brw_compact_instruction *src)
555 uint32_t uncompacted = src_index_table[src->dw1.src1_index];
557 dst->bits3.ud |= uncompacted << 13;
/**
 * Rebuilds a full-size instruction in dst from the compact instruction src.
 *
 * dst is cleared first, then every field that brw_try_compact_instruction()
 * encodes is restored: opcode, debug control, the table-driven control,
 * datatype and sub-register fields, accumulator write control, conditional
 * modifier, flag sub-register, both source descriptors and the three
 * register numbers.
 */
561 brw_uncompact_instruction(struct intel_context *intel,
562 struct brw_instruction *dst,
563 struct brw_compact_instruction *src)
/* Several helpers below OR their bits in, so dst must start zeroed. */
565 memset(dst, 0, sizeof(*dst));
567 dst->header.opcode = src->dw0.opcode;
568 dst->header.debug_control = src->dw0.debug_control;
570 set_uncompacted_control(intel, dst, src);
571 set_uncompacted_datatype(dst, src);
572 set_uncompacted_subreg(dst, src);
573 dst->header.acc_wr_control = src->dw0.acc_wr_control;
574 dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
576 dst->bits2.da1.flag_subreg_nr = src->dw0.flag_subreg_nr;
577 set_uncompacted_src0(dst, src);
578 set_uncompacted_src1(dst, src);
579 dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
580 dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
581 dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
/**
 * Reports a compact/uncompact round-trip mismatch.
 *
 * Disassembles the original and the round-tripped instruction, then walks
 * all 128 bits of both and lists each bit whose value differs.
 *
 * NOTE(review): the header and the disassembly go to stderr, but the
 * "changed bits" listing uses printf() and so goes to stdout; the two parts
 * can be separated or interleaved when the streams are redirected.
 * NOTE(review): `1 << (i & 31)` shifts a signed int; for bit 31 that is
 * undefined behavior where int is 32 bits -- `1u <<` would avoid it.
 */
584 void brw_debug_compact_uncompact(struct intel_context *intel,
585 struct brw_instruction *orig,
586 struct brw_instruction *uncompacted)
588 fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
591 fprintf(stderr, " before: ");
592 brw_disasm(stderr, orig, intel->gen);
594 fprintf(stderr, " after: ");
595 brw_disasm(stderr, uncompacted, intel->gen);
/* View both instructions as four raw dwords for the bitwise comparison. */
597 uint32_t *before_bits = (uint32_t *)orig;
598 uint32_t *after_bits = (uint32_t *)uncompacted;
599 printf(" changed bits:\n");
600 for (int i = 0; i < 128; i++) {
/* i / 32 selects the dword, i & 31 the bit inside it. */
601 uint32_t before = before_bits[i / 32] & (1 << (i & 31));
602 uint32_t after = after_bits[i / 32] & (1 << (i & 31));
604 if (before != after) {
605 printf(" bit %d, %s to %s\n", i,
606 before ? "set" : "unset",
607 after ? "set" : "unset");
/**
 * Returns compacted_counts[old_target_ip] - compacted_counts[old_ip].
 *
 * compacted_counts is filled by brw_compact_instructions() with a running
 * count per pre-compaction 8-byte slot, so the difference is the number of
 * instructions compacted between the two positions; it is negative when the
 * target lies before old_ip.  The indices are not range-checked here.
 */
613 compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
615 int this_compacted_count = compacted_counts[old_ip];
616 int target_compacted_count = compacted_counts[old_target_ip];
617 return target_compacted_count - this_compacted_count;
/**
 * Adjusts the JIP and UIP jump fields of insn after compaction.
 *
 * For each field, the pre-compaction target slot is this_old_ip plus the
 * field's current value; the field is then reduced by the number of
 * instructions compacted between this instruction and that target, as
 * computed by compacted_between().
 *
 * NOTE(review): the remaining arguments of both compacted_between() calls
 * and the declaration of target_old_ip are not visible in this extract.
 */
621 update_uip_jip(struct brw_instruction *insn, int this_old_ip,
622 int *compacted_counts)
626 target_old_ip = this_old_ip + insn->bits3.break_cont.jip;
627 insn->bits3.break_cont.jip -= compacted_between(this_old_ip,
631 target_old_ip = this_old_ip + insn->bits3.break_cont.uip;
632 insn->bits3.break_cont.uip -= compacted_between(this_old_ip,
/**
 * Selects the lookup tables used by the compaction helpers.
 *
 * First asserts that the last entry of every gen6 and gen7 table is nonzero
 * (a sanity check on the initializers), then points the four file-scope
 * table pointers at either the gen7 or the gen6 set based on intel->gen.
 *
 * NOTE(review): the gen7 asserts compute their index with ARRAY_SIZE() of
 * the gen6 tables.  This works only because both sets are declared with 32
 * entries; using the gen7 arrays would be more robust.
 * NOTE(review): the case labels of the switch are not visible in this
 * extract, so which gen values select which set cannot be confirmed here.
 */
638 brw_init_compaction_tables(struct intel_context *intel)
640 assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
641 assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
642 assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
643 assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
644 assert(gen7_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
645 assert(gen7_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
646 assert(gen7_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
647 assert(gen7_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
649 switch (intel->gen) {
651 control_index_table = gen7_control_index_table;
652 datatype_table = gen7_datatype_table;
653 subreg_table = gen7_subreg_table;
654 src_index_table = gen7_src_index_table;
657 control_index_table = gen6_control_index_table;
658 datatype_table = gen6_datatype_table;
659 subreg_table = gen6_subreg_table;
660 src_index_table = gen6_src_index_table;
/**
 * Compacts the program held in p->store in place.
 *
 * Pass 1 walks the instructions from byte offset 0 up to p->nr_insn * 16.
 * For each one it records old_ip[] and compacted_counts[], and tries
 * brw_try_compact_instruction() on instructions that are not already
 * compacted.  A successfully compacted instruction is expanded again with
 * brw_uncompact_instruction() and compared against a saved copy; any
 * difference is reported via brw_debug_compact_uncompact().  A compacted NOP
 * is written as padding in front of an end-of-thread SEND/SENDC, and
 * instructions are moved down with memmove() once the write offset has
 * fallen behind the read offset.
 *
 * Pass 2 stores the new size in p->next_insn_offset and fixes up jump
 * distances: BREAK/CONTINUE/HALT go through update_uip_jip(); the
 * ELSE/ENDIF/WHILE group adjusts branch_gen6.jump_count on gen6 and
 * presumably uses update_uip_jip() otherwise.
 *
 * Finally the program is padded to a 16-byte multiple with a compacted NOP,
 * p->nr_insn is recomputed as p->next_insn_offset / 16, and the compacted
 * program and the bytes saved are printed.
 *
 * NOTE(review): many lines of this function (braces, loop increments, part
 * of the SEND alignment condition, some case labels and the conditions
 * around the debug output) are not visible in this extract, so the summary
 * covers only the visible statements.
 * NOTE(review): compacted_counts and old_ip are variable-length arrays sized
 * by p->next_insn_offset / 8, so stack use grows with program size.
 * NOTE(review): `store + offset` is arithmetic on void *, a GNU C extension.
 */
668 brw_compact_instructions(struct brw_compile *p)
670 struct brw_context *brw = p->brw;
671 struct intel_context *intel = &brw->intel;
672 void *store = p->store;
673 /* For an instruction at byte offset 8*i before compaction, this is the number
674 * of compacted instructions that preceded it.
676 int compacted_counts[p->next_insn_offset / 8];
677 /* For an instruction at byte offset 8*i after compaction, this is the
678 * 8-byte offset it was at before compaction.
680 int old_ip[p->next_insn_offset / 8];
687 int compacted_count = 0;
688 for (src_offset = 0; src_offset < p->nr_insn * 16;) {
689 struct brw_instruction *src = store + src_offset;
690 void *dst = store + offset;
692 old_ip[offset / 8] = src_offset / 8;
693 compacted_counts[src_offset / 8] = compacted_count;
695 struct brw_instruction saved = *src;
697 if (!src->header.cmpt_control &&
698 brw_try_compact_instruction(p, dst, src)) {
702 struct brw_instruction uncompacted;
703 brw_uncompact_instruction(intel, &uncompacted, dst);
704 if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
705 brw_debug_compact_uncompact(intel, &saved, &uncompacted);
712 int size = src->header.cmpt_control ? 8 : 16;
714 /* It appears that the end of thread SEND instruction needs to be
715 * aligned, or the GPU hangs.
717 if ((src->header.opcode == BRW_OPCODE_SEND ||
718 src->header.opcode == BRW_OPCODE_SENDC) &&
719 src->bits3.generic.end_of_thread &&
721 struct brw_compact_instruction *align = store + offset;
722 memset(align, 0, sizeof(*align));
723 align->dw0.opcode = BRW_OPCODE_NOP;
724 align->dw0.cmpt_ctrl = 1;
726 old_ip[offset / 8] = src_offset / 8;
727 dst = store + offset;
730 /* If we didn't compact this instruction, we need to move it down into
733 if (offset != src_offset) {
734 memmove(dst, src, size);
741 /* Fix up control flow offsets. */
742 p->next_insn_offset = offset;
743 for (offset = 0; offset < p->next_insn_offset;) {
744 struct brw_instruction *insn = store + offset;
745 int this_old_ip = old_ip[offset / 8];
746 int this_compacted_count = compacted_counts[this_old_ip];
747 int target_old_ip, target_compacted_count;
749 switch (insn->header.opcode) {
750 case BRW_OPCODE_BREAK:
751 case BRW_OPCODE_CONTINUE:
752 case BRW_OPCODE_HALT:
753 update_uip_jip(insn, this_old_ip, compacted_counts);
757 case BRW_OPCODE_ELSE:
758 case BRW_OPCODE_ENDIF:
759 case BRW_OPCODE_WHILE:
760 if (intel->gen == 6) {
761 target_old_ip = this_old_ip + insn->bits1.branch_gen6.jump_count;
762 target_compacted_count = compacted_counts[target_old_ip];
763 insn->bits1.branch_gen6.jump_count -= (target_compacted_count -
764 this_compacted_count);
766 update_uip_jip(insn, this_old_ip, compacted_counts);
771 if (insn->header.cmpt_control) {
778 /* p->nr_insn is counting the number of uncompacted instructions still, so
779 * divide. We do want to be sure there's a valid instruction in any
780 * alignment padding, so that the next compression pass (for the FS 8/16
781 * compile passes) parses correctly.
783 if (p->next_insn_offset & 8) {
784 struct brw_compact_instruction *align = store + offset;
785 memset(align, 0, sizeof(*align));
786 align->dw0.opcode = BRW_OPCODE_NOP;
787 align->dw0.cmpt_ctrl = 1;
788 p->next_insn_offset += 8;
790 p->nr_insn = p->next_insn_offset / 16;
793 fprintf(stdout, "dumping compacted program\n");
794 brw_dump_compile(p, stdout, 0, p->next_insn_offset);
797 for (offset = 0; offset < p->next_insn_offset;) {
798 struct brw_instruction *insn = store + offset;
800 if (insn->header.cmpt_control) {
807 fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
808 cmp * 8 * 100 / (offset + cmp * 8));