assembler: Import brw_eu_compact.c
[platform/upstream/intel-gpu-tools.git] / assembler / brw_eu_compact.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 /** @file brw_eu_compact.c
25  *
26  * Instruction compaction is a feature of gm45 and newer hardware that allows
27  * for a smaller instruction encoding.
28  *
29  * The instruction cache is on the order of 32KB, and many programs generate
30  * far more instructions than that.  The instruction cache is built to barely
31  * keep up with instruction dispatch ability in cache hit cases -- L1
32  * instruction cache misses that still hit in the next level could limit
33  * throughput by around 50%.
34  *
35  * The idea of instruction compaction is that most instructions use a tiny
36  * subset of the GPU functionality, so we can encode what would be a 16 byte
37  * instruction in 8 bytes using some lookup tables for various fields.
38  */
39
40 #include <string.h>
41
42 #include "brw_compat.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45
/* Gen6 control-index lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 6.
 *
 * Each entry is a bit pattern that set_control_index() compares against a
 * key built from the uncompacted instruction: key bits 0..15 hold bits 8..23
 * of instruction dword 0, and key bit 16 holds bit 31 of dword 0.  The array
 * position of the matching entry is what gets stored as the compacted
 * control_index, and set_uncompacted_control() reverses the mapping.
 */
46 static const uint32_t gen6_control_index_table[32] = {
47    0b00000000000000000,
48    0b01000000000000000,
49    0b00110000000000000,
50    0b00000000100000000,
51    0b00010000000000000,
52    0b00001000100000000,
53    0b00000000100000010,
54    0b00000000000000010,
55    0b01000000100000000,
56    0b01010000000000000,
57    0b10110000000000000,
58    0b00100000000000000,
59    0b11010000000000000,
60    0b11000000000000000,
61    0b01001000100000000,
62    0b01000000000001000,
63    0b01000000000000100,
64    0b00000000000001000,
65    0b00000000000000100,
66    0b00111000100000000,
67    0b00001000100000010,
68    0b00110000100000000,
69    0b00110000000000001,
70    0b00100000000000001,
71    0b00110000000000010,
72    0b00110000000000101,
73    0b00110000000001001,
74    0b00110000000010000,
75    0b00110000000000011,
76    0b00110000000000100,
77    0b00110000100001000,
78    0b00100000000001001
79 };
80
/* Gen6 datatype lookup table; brw_init_compaction_tables() selects it when
 * intel->gen == 6.
 *
 * set_datatype_index() compares each entry against a key whose bits 0..14
 * are the low 15 bits of bits1.ud and whose bits 15 and up are bits1.ud
 * shifted right by 29 (presumably the top three bits of a 32-bit dword --
 * TODO confirm the width of bits1.ud).  The array position of the matching
 * entry becomes the compacted data_type_index.
 *
 * NOTE(review): entries 23 and 25 hold the same pattern.  The lookup returns
 * the first match, so compaction never produces index 25; confirm against
 * the hardware documentation whether this duplicate is intentional.
 */
81 static const uint32_t gen6_datatype_table[32] = {
82    0b001001110000000000,
83    0b001000110000100000,
84    0b001001110000000001,
85    0b001000000001100000,
86    0b001010110100101001,
87    0b001000000110101101,
88    0b001100011000101100,
89    0b001011110110101101,
90    0b001000000111101100,
91    0b001000000001100001,
92    0b001000110010100101,
93    0b001000000001000001,
94    0b001000001000110001,
95    0b001000001000101001,
96    0b001000000000100000,
97    0b001000001000110010,
98    0b001010010100101001,
99    0b001011010010100101,
100    0b001000000110100101,
101    0b001100011000101001,
102    0b001011011000101100,
103    0b001011010110100101,
104    0b001011110110100101,
105    0b001111011110111101,
106    0b001111011110111100,
107    0b001111011110111101,
108    0b001111011110011101,
109    0b001111011110111110,
110    0b001000000000100001,
111    0b001000000000100010,
112    0b001001111111011101,
113    0b001000001110111110,
114 };
115
/* Gen6 sub-register lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 6.
 *
 * set_subreg_index() compares each entry against a key that packs the
 * destination sub-register number at bit 0, the src0 sub-register number at
 * bit 5 and the src1 sub-register number at bit 10.
 * set_uncompacted_subreg() unpacks the same three 5-bit fields.
 */
116 static const uint32_t gen6_subreg_table[32] = {
117    0b000000000000000,
118    0b000000000000100,
119    0b000000110000000,
120    0b111000000000000,
121    0b011110000001000,
122    0b000010000000000,
123    0b000000000010000,
124    0b000110000001100,
125    0b001000000000000,
126    0b000001000000000,
127    0b000001010010100,
128    0b000000001010110,
129    0b010000000000000,
130    0b110000000000000,
131    0b000100000000000,
132    0b000000010000000,
133    0b000000000001000,
134    0b100000000000000,
135    0b000001010000000,
136    0b001010000000000,
137    0b001100000000000,
138    0b000000001010100,
139    0b101101010010100,
140    0b010100000000000,
141    0b000000010001111,
142    0b011000000000000,
143    0b111110000000000,
144    0b101000000000000,
145    0b000000000001111,
146    0b000100010001111,
147    0b001000010001111,
148    0b000110000000000,
149 };
150
/* Gen6 source-operand lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 6.
 *
 * get_src_index() compares each entry against a 12-bit key which
 * set_src0_index() takes from bits 13..24 of bits2.ud and set_src1_index()
 * takes from bits 13..24 of bits3.ud.  Both sources share this one table.
 */
151 static const uint32_t gen6_src_index_table[32] = {
152    0b000000000000,
153    0b010110001000,
154    0b010001101000,
155    0b001000101000,
156    0b011010010000,
157    0b000100100000,
158    0b010001101100,
159    0b010101110000,
160    0b011001111000,
161    0b001100101000,
162    0b010110001100,
163    0b001000100000,
164    0b010110001010,
165    0b000000000010,
166    0b010101010000,
167    0b010101101000,
168    0b111101001100,
169    0b111100101100,
170    0b011001110000,
171    0b010110001001,
172    0b010101011000,
173    0b001101001000,
174    0b010000101100,
175    0b010000000000,
176    0b001101110000,
177    0b001100010000,
178    0b001100000000,
179    0b010001101010,
180    0b001101111000,
181    0b000001110000,
182    0b001100100000,
183    0b001101010000,
184 };
185
/* Gen7 control-index lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 7.
 *
 * The key layout matches the gen6 table (key bits 0..15 from bits 8..23 of
 * instruction dword 0, key bit 16 from bit 31 of dword 0) with two extra
 * bits: set_control_index() places bits 25..26 of dword 2 at key bits
 * 17..18 on gen7, which its comment describes as the flag register number.
 */
186 static const uint32_t gen7_control_index_table[32] = {
187    0b0000000000000000010,
188    0b0000100000000000000,
189    0b0000100000000000001,
190    0b0000100000000000010,
191    0b0000100000000000011,
192    0b0000100000000000100,
193    0b0000100000000000101,
194    0b0000100000000000111,
195    0b0000100000000001000,
196    0b0000100000000001001,
197    0b0000100000000001101,
198    0b0000110000000000000,
199    0b0000110000000000001,
200    0b0000110000000000010,
201    0b0000110000000000011,
202    0b0000110000000000100,
203    0b0000110000000000101,
204    0b0000110000000000111,
205    0b0000110000000001001,
206    0b0000110000000001101,
207    0b0000110000000010000,
208    0b0000110000100000000,
209    0b0001000000000000000,
210    0b0001000000000000010,
211    0b0001000000000000100,
212    0b0001000000100000000,
213    0b0010110000000000000,
214    0b0010110000000010000,
215    0b0011000000000000000,
216    0b0011000000100000000,
217    0b0101000000000000000,
218    0b0101000000100000000
219 };
220
/* Gen7 datatype lookup table; brw_init_compaction_tables() selects it when
 * intel->gen == 7.
 *
 * set_datatype_index() compares each entry against a key whose bits 0..14
 * are the low 15 bits of bits1.ud and whose bits 15 and up are bits1.ud
 * shifted right by 29 (presumably the top three bits of a 32-bit dword --
 * TODO confirm the width of bits1.ud).
 */
221 static const uint32_t gen7_datatype_table[32] = {
222    0b001000000000000001,
223    0b001000000000100000,
224    0b001000000000100001,
225    0b001000000001100001,
226    0b001000000010111101,
227    0b001000001011111101,
228    0b001000001110100001,
229    0b001000001110100101,
230    0b001000001110111101,
231    0b001000010000100001,
232    0b001000110000100000,
233    0b001000110000100001,
234    0b001001010010100101,
235    0b001001110010100100,
236    0b001001110010100101,
237    0b001111001110111101,
238    0b001111011110011101,
239    0b001111011110111100,
240    0b001111011110111101,
241    0b001111111110111100,
242    0b000000001000001100,
243    0b001000000000111101,
244    0b001000000010100101,
245    0b001000010000100000,
246    0b001001010010100100,
247    0b001001110010000100,
248    0b001010010100001001,
249    0b001101111110111101,
250    0b001111111110111101,
251    0b001011110110101100,
252    0b001010010100101000,
253    0b001010110100101000
254 };
255
/* Gen7 sub-register lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 7.
 *
 * set_subreg_index() compares each entry against a key that packs the
 * destination sub-register number at bit 0, the src0 sub-register number at
 * bit 5 and the src1 sub-register number at bit 10.
 * set_uncompacted_subreg() unpacks the same three 5-bit fields.
 */
256 static const uint32_t gen7_subreg_table[32] = {
257    0b000000000000000,
258    0b000000000000001,
259    0b000000000001000,
260    0b000000000001111,
261    0b000000000010000,
262    0b000000010000000,
263    0b000000100000000,
264    0b000000110000000,
265    0b000001000000000,
266    0b000001000010000,
267    0b000010100000000,
268    0b001000000000000,
269    0b001000000000001,
270    0b001000010000001,
271    0b001000010000010,
272    0b001000010000011,
273    0b001000010000100,
274    0b001000010000111,
275    0b001000010001000,
276    0b001000010001110,
277    0b001000010001111,
278    0b001000110000000,
279    0b001000111101000,
280    0b010000000000000,
281    0b010000110000000,
282    0b011000000000000,
283    0b011110010000111,
284    0b100000000000000,
285    0b101000000000000,
286    0b110000000000000,
287    0b111000000000000,
288    0b111000000011100
289 };
290
/* Gen7 source-operand lookup table; brw_init_compaction_tables() selects it
 * when intel->gen == 7.
 *
 * get_src_index() compares each entry against a 12-bit key which
 * set_src0_index() takes from bits 13..24 of bits2.ud and set_src1_index()
 * takes from bits 13..24 of bits3.ud.  Both sources share this one table.
 */
291 static const uint32_t gen7_src_index_table[32] = {
292    0b000000000000,
293    0b000000000010,
294    0b000000010000,
295    0b000000010010,
296    0b000000011000,
297    0b000000100000,
298    0b000000101000,
299    0b000001001000,
300    0b000001010000,
301    0b000001110000,
302    0b000001111000,
303    0b001100000000,
304    0b001100000010,
305    0b001100001000,
306    0b001100010000,
307    0b001100010010,
308    0b001100100000,
309    0b001100101000,
310    0b001100111000,
311    0b001101000000,
312    0b001101000010,
313    0b001101001000,
314    0b001101010000,
315    0b001101100000,
316    0b001101101000,
317    0b001101110000,
318    0b001101110001,
319    0b001101111000,
320    0b010001101000,
321    0b010001101001,
322    0b010001101010,
323    0b010110001000
324 };
325
/* Lookup tables currently in use by the compaction and uncompaction helpers
 * in this file.  brw_init_compaction_tables() points them at the gen6 or
 * gen7 arrays; for any other generation it leaves them untouched, so they
 * stay NULL (static storage) unless a previous call set them.
 */
326 static const uint32_t *control_index_table;
327 static const uint32_t *datatype_table;
328 static const uint32_t *subreg_table;
329 static const uint32_t *src_index_table;
330
331 static bool
332 set_control_index(struct intel_context *intel,
333                   struct brw_compact_instruction *dst,
334                   struct brw_instruction *src)
335 {
336    uint32_t *src_u32 = (uint32_t *)src;
337    uint32_t uncompacted = 0;
338
339    uncompacted |= ((src_u32[0] >> 8) & 0xffff) << 0;
340    uncompacted |= ((src_u32[0] >> 31) & 0x1) << 16;
341    /* On gen7, the flag register number gets integrated into the control
342     * index.
343     */
344    if (intel->gen >= 7)
345       uncompacted |= ((src_u32[2] >> 25) & 0x3) << 17;
346
347    for (int i = 0; i < 32; i++) {
348       if (control_index_table[i] == uncompacted) {
349          dst->dw0.control_index = i;
350          return true;
351       }
352    }
353
354    return false;
355 }
356
357 static bool
358 set_datatype_index(struct brw_compact_instruction *dst,
359                    struct brw_instruction *src)
360 {
361    uint32_t uncompacted = 0;
362
363    uncompacted |= src->bits1.ud & 0x7fff;
364    uncompacted |= (src->bits1.ud >> 29) << 15;
365
366    for (int i = 0; i < 32; i++) {
367       if (datatype_table[i] == uncompacted) {
368          dst->dw0.data_type_index = i;
369          return true;
370       }
371    }
372
373    return false;
374 }
375
376 static bool
377 set_subreg_index(struct brw_compact_instruction *dst,
378                  struct brw_instruction *src)
379 {
380    uint32_t uncompacted = 0;
381
382    uncompacted |= src->bits1.da1.dest_subreg_nr << 0;
383    uncompacted |= src->bits2.da1.src0_subreg_nr << 5;
384    uncompacted |= src->bits3.da1.src1_subreg_nr << 10;
385
386    for (int i = 0; i < 32; i++) {
387       if (subreg_table[i] == uncompacted) {
388          dst->dw0.sub_reg_index = i;
389          return true;
390       }
391    }
392
393    return false;
394 }
395
396 static bool
397 get_src_index(uint32_t uncompacted,
398               uint32_t *compacted)
399 {
400    for (int i = 0; i < 32; i++) {
401       if (src_index_table[i] == uncompacted) {
402          *compacted = i;
403          return true;
404       }
405    }
406
407    return false;
408 }
409
410 static bool
411 set_src0_index(struct brw_compact_instruction *dst,
412                struct brw_instruction *src)
413 {
414    uint32_t compacted, uncompacted = 0;
415
416    uncompacted |= (src->bits2.ud >> 13) & 0xfff;
417
418    if (!get_src_index(uncompacted, &compacted))
419       return false;
420
421    dst->dw0.src0_index = compacted & 0x3;
422    dst->dw1.src0_index = compacted >> 2;
423
424    return true;
425 }
426
427 static bool
428 set_src1_index(struct brw_compact_instruction *dst,
429                struct brw_instruction *src)
430 {
431    uint32_t compacted, uncompacted = 0;
432
433    uncompacted |= (src->bits3.ud >> 13) & 0xfff;
434
435    if (!get_src_index(uncompacted, &compacted))
436       return false;
437
438    dst->dw1.src1_index = compacted;
439
440    return true;
441 }
442
443 /**
444  * Tries to compact instruction src into dst.
445  *
446  * It doesn't modify dst unless src is compactable, which is relied on by
447  * brw_compact_instructions().
448  */
449 bool
450 brw_try_compact_instruction(struct brw_compile *p,
451                             struct brw_compact_instruction *dst,
452                             struct brw_instruction *src)
453 {
454    struct brw_context *brw = p->brw;
455    struct intel_context *intel = &brw->intel;
456    struct brw_compact_instruction temp;
457
458    if (src->header.opcode == BRW_OPCODE_IF ||
459        src->header.opcode == BRW_OPCODE_ELSE ||
460        src->header.opcode == BRW_OPCODE_ENDIF ||
461        src->header.opcode == BRW_OPCODE_HALT ||
462        src->header.opcode == BRW_OPCODE_DO ||
463        src->header.opcode == BRW_OPCODE_WHILE) {
464       /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
465        * to be able to handle compacted flow control instructions..
466        */
467       return false;
468    }
469
470    /* FINISHME: immediates */
471    if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
472        src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
473       return false;
474
475    memset(&temp, 0, sizeof(temp));
476
477    temp.dw0.opcode = src->header.opcode;
478    temp.dw0.debug_control = src->header.debug_control;
479    if (!set_control_index(intel, &temp, src))
480       return false;
481    if (!set_datatype_index(&temp, src))
482       return false;
483    if (!set_subreg_index(&temp, src))
484       return false;
485    temp.dw0.acc_wr_control = src->header.acc_wr_control;
486    temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
487    if (intel->gen <= 6)
488       temp.dw0.flag_subreg_nr = src->bits2.da1.flag_subreg_nr;
489    temp.dw0.cmpt_ctrl = 1;
490    if (!set_src0_index(&temp, src))
491       return false;
492    if (!set_src1_index(&temp, src))
493       return false;
494    temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
495    temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
496    temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
497
498    *dst = temp;
499
500    return true;
501 }
502
503 static void
504 set_uncompacted_control(struct intel_context *intel,
505                         struct brw_instruction *dst,
506                         struct brw_compact_instruction *src)
507 {
508    uint32_t *dst_u32 = (uint32_t *)dst;
509    uint32_t uncompacted = control_index_table[src->dw0.control_index];
510
511    dst_u32[0] |= ((uncompacted >> 0) & 0xffff) << 8;
512    dst_u32[0] |= ((uncompacted >> 16) & 0x1) << 31;
513
514    if (intel->gen >= 7)
515       dst_u32[2] |= ((uncompacted >> 17) & 0x3) << 25;
516 }
517
518 static void
519 set_uncompacted_datatype(struct brw_instruction *dst,
520                          struct brw_compact_instruction *src)
521 {
522    uint32_t uncompacted = datatype_table[src->dw0.data_type_index];
523
524    dst->bits1.ud &= ~(0x7 << 29);
525    dst->bits1.ud |= ((uncompacted >> 15) & 0x7) << 29;
526    dst->bits1.ud &= ~0x7fff;
527    dst->bits1.ud |= uncompacted & 0x7fff;
528 }
529
530 static void
531 set_uncompacted_subreg(struct brw_instruction *dst,
532                        struct brw_compact_instruction *src)
533 {
534    uint32_t uncompacted = subreg_table[src->dw0.sub_reg_index];
535
536    dst->bits1.da1.dest_subreg_nr = (uncompacted >> 0)  & 0x1f;
537    dst->bits2.da1.src0_subreg_nr = (uncompacted >> 5)  & 0x1f;
538    dst->bits3.da1.src1_subreg_nr = (uncompacted >> 10) & 0x1f;
539 }
540
541 static void
542 set_uncompacted_src0(struct brw_instruction *dst,
543                      struct brw_compact_instruction *src)
544 {
545    uint32_t compacted = src->dw0.src0_index | src->dw1.src0_index << 2;
546    uint32_t uncompacted = src_index_table[compacted];
547
548    dst->bits2.ud |= uncompacted << 13;
549 }
550
551 static void
552 set_uncompacted_src1(struct brw_instruction *dst,
553                      struct brw_compact_instruction *src)
554 {
555    uint32_t uncompacted = src_index_table[src->dw1.src1_index];
556
557    dst->bits3.ud |= uncompacted << 13;
558 }
559
560 void
561 brw_uncompact_instruction(struct intel_context *intel,
562                           struct brw_instruction *dst,
563                           struct brw_compact_instruction *src)
564 {
565    memset(dst, 0, sizeof(*dst));
566
567    dst->header.opcode = src->dw0.opcode;
568    dst->header.debug_control = src->dw0.debug_control;
569
570    set_uncompacted_control(intel, dst, src);
571    set_uncompacted_datatype(dst, src);
572    set_uncompacted_subreg(dst, src);
573    dst->header.acc_wr_control = src->dw0.acc_wr_control;
574    dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
575    if (intel->gen <= 6)
576       dst->bits2.da1.flag_subreg_nr = src->dw0.flag_subreg_nr;
577    set_uncompacted_src0(dst, src);
578    set_uncompacted_src1(dst, src);
579    dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
580    dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
581    dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
582 }
583
584 void brw_debug_compact_uncompact(struct intel_context *intel,
585                                  struct brw_instruction *orig,
586                                  struct brw_instruction *uncompacted)
587 {
588    fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
589            intel->gen);
590
591    fprintf(stderr, "  before: ");
592    brw_disasm(stderr, orig, intel->gen);
593
594    fprintf(stderr, "  after:  ");
595    brw_disasm(stderr, uncompacted, intel->gen);
596
597    uint32_t *before_bits = (uint32_t *)orig;
598    uint32_t *after_bits = (uint32_t *)uncompacted;
599    printf("  changed bits:\n");
600    for (int i = 0; i < 128; i++) {
601       uint32_t before = before_bits[i / 32] & (1 << (i & 31));
602       uint32_t after = after_bits[i / 32] & (1 << (i & 31));
603
604       if (before != after) {
605          printf("  bit %d, %s to %s\n", i,
606                 before ? "set" : "unset",
607                 after ? "set" : "unset");
608       }
609    }
610 }
611
/**
 * Returns how many more instructions had been compacted before
 * old_target_ip than before old_ip.
 *
 * Both arguments index compacted_counts[].  The result is negative when the
 * target's count is the smaller of the two.
 */
static int
compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
{
   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
}
619
620 static void
621 update_uip_jip(struct brw_instruction *insn, int this_old_ip,
622                int *compacted_counts)
623 {
624    int target_old_ip;
625
626    target_old_ip = this_old_ip + insn->bits3.break_cont.jip;
627    insn->bits3.break_cont.jip -= compacted_between(this_old_ip,
628                                                    target_old_ip,
629                                                    compacted_counts);
630
631    target_old_ip = this_old_ip + insn->bits3.break_cont.uip;
632    insn->bits3.break_cont.uip -= compacted_between(this_old_ip,
633                                                    target_old_ip,
634                                                    compacted_counts);
635 }
636
637 void
638 brw_init_compaction_tables(struct intel_context *intel)
639 {
640    assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
641    assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
642    assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
643    assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
644    assert(gen7_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
645    assert(gen7_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
646    assert(gen7_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
647    assert(gen7_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
648
649    switch (intel->gen) {
650    case 7:
651       control_index_table = gen7_control_index_table;
652       datatype_table = gen7_datatype_table;
653       subreg_table = gen7_subreg_table;
654       src_index_table = gen7_src_index_table;
655       break;
656    case 6:
657       control_index_table = gen6_control_index_table;
658       datatype_table = gen6_datatype_table;
659       subreg_table = gen6_subreg_table;
660       src_index_table = gen6_src_index_table;
661       break;
662    default:
663       return;
664    }
665 }
666
667 void
668 brw_compact_instructions(struct brw_compile *p)
669 {
670    struct brw_context *brw = p->brw;
671    struct intel_context *intel = &brw->intel;
672    void *store = p->store;
673    /* For an instruction at byte offset 8*i before compaction, this is the number
674     * of compacted instructions that preceded it.
675     */
676    int compacted_counts[p->next_insn_offset / 8];
677    /* For an instruction at byte offset 8*i after compaction, this is the
678     * 8-byte offset it was at before compaction.
679     */
680    int old_ip[p->next_insn_offset / 8];
681
682    if (intel->gen < 6)
683       return;
684
685    int src_offset;
686    int offset = 0;
687    int compacted_count = 0;
688    for (src_offset = 0; src_offset < p->nr_insn * 16;) {
689       struct brw_instruction *src = store + src_offset;
690       void *dst = store + offset;
691
692       old_ip[offset / 8] = src_offset / 8;
693       compacted_counts[src_offset / 8] = compacted_count;
694
695       struct brw_instruction saved = *src;
696
697       if (!src->header.cmpt_control &&
698           brw_try_compact_instruction(p, dst, src)) {
699          compacted_count++;
700
701          if (INTEL_DEBUG) {
702             struct brw_instruction uncompacted;
703             brw_uncompact_instruction(intel, &uncompacted, dst);
704             if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
705                brw_debug_compact_uncompact(intel, &saved, &uncompacted);
706             }
707          }
708
709          offset += 8;
710          src_offset += 16;
711       } else {
712          int size = src->header.cmpt_control ? 8 : 16;
713
714          /* It appears that the end of thread SEND instruction needs to be
715           * aligned, or the GPU hangs.
716           */
717          if ((src->header.opcode == BRW_OPCODE_SEND ||
718               src->header.opcode == BRW_OPCODE_SENDC) &&
719              src->bits3.generic.end_of_thread &&
720              (offset & 8) != 0) {
721             struct brw_compact_instruction *align = store + offset;
722             memset(align, 0, sizeof(*align));
723             align->dw0.opcode = BRW_OPCODE_NOP;
724             align->dw0.cmpt_ctrl = 1;
725             offset += 8;
726             old_ip[offset / 8] = src_offset / 8;
727             dst = store + offset;
728          }
729
730          /* If we didn't compact this intruction, we need to move it down into
731           * place.
732           */
733          if (offset != src_offset) {
734             memmove(dst, src, size);
735          }
736          offset += size;
737          src_offset += size;
738       }
739    }
740
741    /* Fix up control flow offsets. */
742    p->next_insn_offset = offset;
743    for (offset = 0; offset < p->next_insn_offset;) {
744       struct brw_instruction *insn = store + offset;
745       int this_old_ip = old_ip[offset / 8];
746       int this_compacted_count = compacted_counts[this_old_ip];
747       int target_old_ip, target_compacted_count;
748
749       switch (insn->header.opcode) {
750       case BRW_OPCODE_BREAK:
751       case BRW_OPCODE_CONTINUE:
752       case BRW_OPCODE_HALT:
753          update_uip_jip(insn, this_old_ip, compacted_counts);
754          break;
755
756       case BRW_OPCODE_IF:
757       case BRW_OPCODE_ELSE:
758       case BRW_OPCODE_ENDIF:
759       case BRW_OPCODE_WHILE:
760          if (intel->gen == 6) {
761             target_old_ip = this_old_ip + insn->bits1.branch_gen6.jump_count;
762             target_compacted_count = compacted_counts[target_old_ip];
763             insn->bits1.branch_gen6.jump_count -= (target_compacted_count -
764                                                    this_compacted_count);
765          } else {
766             update_uip_jip(insn, this_old_ip, compacted_counts);
767          }
768          break;
769       }
770
771       if (insn->header.cmpt_control) {
772          offset += 8;
773       } else {
774          offset += 16;
775       }
776    }
777
778    /* p->nr_insn is counting the number of uncompacted instructions still, so
779     * divide.  We do want to be sure there's a valid instruction in any
780     * alignment padding, so that the next compression pass (for the FS 8/16
781     * compile passes) parses correctly.
782     */
783    if (p->next_insn_offset & 8) {
784       struct brw_compact_instruction *align = store + offset;
785       memset(align, 0, sizeof(*align));
786       align->dw0.opcode = BRW_OPCODE_NOP;
787       align->dw0.cmpt_ctrl = 1;
788       p->next_insn_offset += 8;
789    }
790    p->nr_insn = p->next_insn_offset / 16;
791
792    if (0) {
793       fprintf(stdout, "dumping compacted program\n");
794       brw_dump_compile(p, stdout, 0, p->next_insn_offset);
795
796       int cmp = 0;
797       for (offset = 0; offset < p->next_insn_offset;) {
798          struct brw_instruction *insn = store + offset;
799
800          if (insn->header.cmpt_control) {
801             offset += 8;
802             cmp++;
803          } else {
804             offset += 16;
805          }
806       }
807       fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
808               cmp * 8 * 100 / (offset + cmp * 8));
809    }
810 }