src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 extern "C" {
  29
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "main/uniforms.h"
  35 #include "program/prog_optimize.h"
  36 #include "program/register_allocate.h"
  37 #include "program/sampler.h"
  38 #include "program/hash_table.h"
  39 #include "brw_context.h"
  40 #include "brw_eu.h"
  41 #include "brw_wm.h"
  42 }
  43 #include "brw_fs.h"
  44 #include "../glsl/glsl_types.h"
  45 #include "../glsl/ir_optimization.h"
  46 #include "../glsl/ir_print_visitor.h"
  47
  48 /** @file brw_fs_schedule_instructions.cpp
  49  *
  50  * List scheduling of FS instructions.
  51  *
  52  * The basic model of the list scheduler is to take a basic block,
  53  * compute a DAG of the dependencies (RAW ordering with latency, WAW
  54  * ordering, WAR ordering), and make a list of the DAG heads.
  55  * Heuristically pick a DAG head, then put all the children that are
  56  * now DAG heads into the list of things to schedule.
  57  *
  58  * The heuristic is the important part.  We're trying to be cheap,
  59  * since actually computing the optimal scheduling is NP complete.
  60  * What we do is track a "current clock".  When we schedule a node, we
  61  * update the earliest-unblocked clock time of its children, and
  62  * increment the clock.  Then, when trying to schedule, we just pick
  63  * the earliest-unblocked instruction to schedule.
  64  *
  65  * Note that often there will be many things which could execute
  66  * immediately, and there are a range of heuristic options to choose
  67  * from in picking among those.
  68  */
  69
  70 class schedule_node : public exec_node
  71 {
  72 public:
  73    schedule_node(fs_inst *inst)
  74    {
  75       this->inst = inst;
  76       this->child_array_size = 0;
  77       this->children = NULL;
  78       this->child_latency = NULL;
  79       this->child_count = 0;
  80       this->parent_count = 0;
  81       this->unblocked_time = 0;
  82
  83       int chans = 8;
  84       int math_latency = 22;
  85
  86       switch (inst->opcode) {
  87       case FS_OPCODE_RCP:
  88          this->latency = 1 * chans * math_latency;
  89          break;
  90       case FS_OPCODE_RSQ:
  91          this->latency = 2 * chans * math_latency;
  92          break;
  93       case FS_OPCODE_SQRT:
  94       case FS_OPCODE_LOG2:
  95          /* full precision log.  partial is 2. */
  96          this->latency = 3 * chans * math_latency;
  97          break;
  98       case FS_OPCODE_EXP2:
  99          /* full precision.  partial is 3, same throughput. */
 100          this->latency = 4 * chans * math_latency;
 101          break;
 102       case FS_OPCODE_POW:
 103          this->latency = 8 * chans * math_latency;
 104          break;
 105       case FS_OPCODE_SIN:
 106       case FS_OPCODE_COS:
 107          /* minimum latency, max is 12 rounds. */
 108          this->latency = 5 * chans * math_latency;
 109          break;
 110       default:
 111          this->latency = 2;
 112          break;
 113       }
 114    }
 115
 116    fs_inst *inst;
 117    schedule_node **children;
 118    int *child_latency;
 119    int child_count;
 120    int parent_count;
 121    int child_array_size;
 122    int unblocked_time;
 123    int latency;
 124 };
 125
 126 class instruction_scheduler {
 127 public:
 128    instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
 129    {
 130       this->v = v;
 131       this->mem_ctx = ralloc_context(mem_ctx);
 132       this->virtual_grf_count = virtual_grf_count;
 133       this->instructions.make_empty();
 134       this->instructions_to_schedule = 0;
 135    }
 136
 137    ~instruction_scheduler()
 138    {
 139       ralloc_free(this->mem_ctx);
 140    }
 141    void add_barrier_deps(schedule_node *n);
 142    void add_dep(schedule_node *before, schedule_node *after, int latency);
 143    void add_dep(schedule_node *before, schedule_node *after);
 144
 145    void add_inst(fs_inst *inst);
 146    void calculate_deps();
 147    void schedule_instructions(fs_inst *next_block_header);
 148
 149    bool is_compressed(fs_inst *inst);
 150
 151    void *mem_ctx;
 152
 153    int instructions_to_schedule;
 154    int virtual_grf_count;
 155    exec_list instructions;
 156    fs_visitor *v;
 157 };
 158
 159 void
 160 instruction_scheduler::add_inst(fs_inst *inst)
 161 {
 162    schedule_node *n = new(mem_ctx) schedule_node(inst);
 163
 164    assert(!inst->is_head_sentinel());
 165    assert(!inst->is_tail_sentinel());
 166
 167    this->instructions_to_schedule++;
 168
 169    inst->remove();
 170    instructions.push_tail(n);
 171 }
 172
 173 /**
 174  * Add a dependency between two instruction nodes.
 175  *
 176  * The @after node will be scheduled after @before.  We will try to
 177  * schedule it @latency cycles after @before, but no guarantees there.
 178  */
 179 void
 180 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 181                                int latency)
 182 {
 183    if (!before || !after)
 184       return;
 185
 186    assert(before != after);
 187
 188    for (int i = 0; i < before->child_count; i++) {
 189       if (before->children[i] == after) {
 190          before->child_latency[i] = MAX2(before->child_latency[i], latency);
 191          return;
 192       }
 193    }
 194
 195    if (before->child_array_size <= before->child_count) {
 196       if (before->child_array_size < 16)
 197          before->child_array_size = 16;
 198       else
 199          before->child_array_size *= 2;
 200
 201       before->children = reralloc(mem_ctx, before->children,
 202                                   schedule_node *,
 203                                   before->child_array_size);
 204       before->child_latency = reralloc(mem_ctx, before->child_latency,
 205                                        int, before->child_array_size);
 206    }
 207
 208    before->children[before->child_count] = after;
 209    before->child_latency[before->child_count] = latency;
 210    before->child_count++;
 211    after->parent_count++;
 212 }
 213
 214 void
 215 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
 216 {
 217    if (!before)
 218       return;
 219
 220    add_dep(before, after, before->latency);
 221 }
 222
 223 /**
 224  * Sometimes we really want this node to execute after everything that
 225  * was before it and before everything that followed it.  This adds
 226  * the deps to do so.
 227  */
 228 void
 229 instruction_scheduler::add_barrier_deps(schedule_node *n)
 230 {
 231    schedule_node *prev = (schedule_node *)n->prev;
 232    schedule_node *next = (schedule_node *)n->next;
 233
 234    if (prev) {
 235       while (!prev->is_head_sentinel()) {
 236          add_dep(prev, n, 0);
 237          prev = (schedule_node *)prev->prev;
 238       }
 239    }
 240
 241    if (next) {
 242       while (!next->is_tail_sentinel()) {
 243          add_dep(n, next, 0);
 244          next = (schedule_node *)next->next;
 245       }
 246    }
 247 }
 248
 249 /* instruction scheduling needs to be aware of when an MRF write
 250  * actually writes 2 MRFs.
 251  */
 252 bool
 253 instruction_scheduler::is_compressed(fs_inst *inst)
 254 {
 255    return (v->c->dispatch_width == 16 &&
 256            !inst->force_uncompressed &&
 257            !inst->force_sechalf);
 258 }
 259
 260 void
 261 instruction_scheduler::calculate_deps()
 262 {
 263    schedule_node *last_grf_write[virtual_grf_count];
 264    schedule_node *last_mrf_write[BRW_MAX_MRF];
 265    schedule_node *last_conditional_mod = NULL;
 266    /* Fixed HW registers are assumed to be separate from the virtual
 267     * GRFs, so they can be tracked separately.  We don't really write
 268     * to fixed GRFs much, so don't bother tracking them on a more
 269     * granular level.
 270     */
 271    schedule_node *last_fixed_grf_write = NULL;
 272
 273    /* The last instruction always needs to still be the last
 274     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
 275     * WHILE) and scheduling other things after it would disturb the
 276     * basic block, or it's FB_WRITE and we should do a better job at
 277     * dead code elimination anyway.
 278     */
 279    schedule_node *last = (schedule_node *)instructions.get_tail();
 280    add_barrier_deps(last);
 281
 282    memset(last_grf_write, 0, sizeof(last_grf_write));
 283    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 284
 285    /* top-to-bottom dependencies: RAW and WAW. */
 286    foreach_iter(exec_list_iterator, iter, instructions) {
 287       schedule_node *n = (schedule_node *)iter.get();
 288       fs_inst *inst = n->inst;
 289
 290       /* read-after-write deps. */
 291       for (int i = 0; i < 3; i++) {
 292          if (inst->src[i].file == GRF) {
 293             add_dep(last_grf_write[inst->src[i].reg], n);
 294          } else if (inst->src[i].file == FIXED_HW_REG &&
 295                     (inst->src[i].fixed_hw_reg.file ==
 296                      BRW_GENERAL_REGISTER_FILE)) {
 297             add_dep(last_fixed_grf_write, n);
 298          } else if (inst->src[i].file != BAD_FILE &&
 299                     inst->src[i].file != IMM &&
 300                     inst->src[i].file != UNIFORM) {
 301             assert(inst->src[i].file != MRF);
 302             add_barrier_deps(n);
 303          }
 304       }
 305
 306       for (int i = 0; i < inst->mlen; i++) {
 307          /* It looks like the MRF regs are released in the send
 308           * instruction once it's sent, not when the result comes
 309           * back.
 310           */
 311          add_dep(last_mrf_write[inst->base_mrf + i], n);
 312       }
 313
 314       if (inst->predicated) {
 315          assert(last_conditional_mod);
 316          add_dep(last_conditional_mod, n);
 317       }
 318
 319       /* write-after-write deps. */
 320       if (inst->dst.file == GRF) {
 321          add_dep(last_grf_write[inst->dst.reg], n);
 322          last_grf_write[inst->dst.reg] = n;
 323       } else if (inst->dst.file == MRF) {
 324          int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
 325
 326          add_dep(last_mrf_write[reg], n);
 327          last_mrf_write[reg] = n;
 328          if (is_compressed(inst)) {
 329             if (inst->dst.hw_reg & BRW_MRF_COMPR4)
 330                reg += 4;
 331             else
 332                reg++;
 333             add_dep(last_mrf_write[reg], n);
 334             last_mrf_write[reg] = n;
 335          }
 336       } else if (inst->dst.file == FIXED_HW_REG &&
 337                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 338          last_fixed_grf_write = n;
 339       } else if (inst->dst.file != BAD_FILE) {
 340          add_barrier_deps(n);
 341       }
 342
 343       if (inst->mlen > 0) {
 344          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 345             add_dep(last_mrf_write[inst->base_mrf + i], n);
 346             last_mrf_write[inst->base_mrf + i] = n;
 347          }
 348       }
 349
 350       if (inst->conditional_mod) {
 351          add_dep(last_conditional_mod, n, 0);
 352          last_conditional_mod = n;
 353       }
 354    }
 355
 356    /* bottom-to-top dependencies: WAR */
 357    memset(last_grf_write, 0, sizeof(last_grf_write));
 358    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 359    last_conditional_mod = NULL;
 360    last_fixed_grf_write = NULL;
 361
 362    exec_node *node;
 363    exec_node *prev;
 364    for (node = instructions.get_tail(), prev = node->prev;
 365         !node->is_head_sentinel();
 366         node = prev, prev = node->prev) {
 367       schedule_node *n = (schedule_node *)node;
 368       fs_inst *inst = n->inst;
 369
 370       /* write-after-read deps. */
 371       for (int i = 0; i < 3; i++) {
 372          if (inst->src[i].file == GRF) {
 373             add_dep(n, last_grf_write[inst->src[i].reg]);
 374          } else if (inst->src[i].file == FIXED_HW_REG &&
 375                     (inst->src[i].fixed_hw_reg.file ==
 376                      BRW_GENERAL_REGISTER_FILE)) {
 377             add_dep(n, last_fixed_grf_write);
 378          } else if (inst->src[i].file != BAD_FILE &&
 379                     inst->src[i].file != IMM &&
 380                     inst->src[i].file != UNIFORM) {
 381             assert(inst->src[i].file != MRF);
 382             add_barrier_deps(n);
 383          }
 384       }
 385
 386       for (int i = 0; i < inst->mlen; i++) {
 387          /* It looks like the MRF regs are released in the send
 388           * instruction once it's sent, not when the result comes
 389           * back.
 390           */
 391          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
 392       }
 393
 394       if (inst->predicated) {
 395          add_dep(n, last_conditional_mod);
 396       }
 397
 398       /* Update the things this instruction wrote, so earlier reads
 399        * can mark this as WAR dependency.
 400        */
 401       if (inst->dst.file == GRF) {
 402          last_grf_write[inst->dst.reg] = n;
 403       } else if (inst->dst.file == MRF) {
 404          int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
 405
 406          last_mrf_write[reg] = n;
 407
 408          if (is_compressed(inst)) {
 409             if (inst->dst.hw_reg & BRW_MRF_COMPR4)
 410                reg += 4;
 411             else
 412                reg++;
 413
 414             last_mrf_write[reg] = n;
 415          }
 416       } else if (inst->dst.file == FIXED_HW_REG &&
 417                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 418          last_fixed_grf_write = n;
 419       } else if (inst->dst.file != BAD_FILE) {
 420          add_barrier_deps(n);
 421       }
 422
 423       if (inst->mlen > 0) {
 424          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 425             last_mrf_write[inst->base_mrf + i] = n;
 426          }
 427       }
 428
 429       if (inst->conditional_mod)
 430          last_conditional_mod = n;
 431    }
 432 }
 433
 434 void
 435 instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
 436 {
 437    int time = 0;
 438
 439    /* Remove non-DAG heads from the list. */
 440    foreach_iter(exec_list_iterator, iter, instructions) {
 441       schedule_node *n = (schedule_node *)iter.get();
 442       if (n->parent_count != 0)
 443          n->remove();
 444    }
 445
 446    while (!instructions.is_empty()) {
 447       schedule_node *chosen = NULL;
 448       int chosen_time = 0;
 449
 450       foreach_iter(exec_list_iterator, iter, instructions) {
 451          schedule_node *n = (schedule_node *)iter.get();
 452
 453          if (!chosen || n->unblocked_time < chosen_time) {
 454             chosen = n;
 455             chosen_time = n->unblocked_time;
 456          }
 457       }
 458
 459       /* Schedule this instruction. */
 460       assert(chosen);
 461       chosen->remove();
 462       next_block_header->insert_before(chosen->inst);
 463       instructions_to_schedule--;
 464
 465       /* Bump the clock.  If we expected a delay for scheduling, then
 466        * bump the clock to reflect that.
 467        */
 468       time = MAX2(time + 1, chosen_time);
 469
 470       /* Now that we've scheduled a new instruction, some of its
 471        * children can be promoted to the list of instructions ready to
 472        * be scheduled.  Update the children's unblocked time for this
 473        * DAG edge as we do so.
 474        */
 475       for (int i = 0; i < chosen->child_count; i++) {
 476          schedule_node *child = chosen->children[i];
 477
 478          child->unblocked_time = MAX2(child->unblocked_time,
 479                                       time + chosen->child_latency[i]);
 480
 481          child->parent_count--;
 482          if (child->parent_count == 0) {
 483             instructions.push_tail(child);
 484          }
 485       }
 486
 487       /* Shared resource: the mathbox.  There's one per EU (on later
 488        * generations, it's even more limited pre-gen6), so if we send
 489        * something off to it then the next math isn't going to make
 490        * progress until the first is done.
 491        */
 492       if (chosen->inst->is_math()) {
 493          foreach_iter(exec_list_iterator, iter, instructions) {
 494             schedule_node *n = (schedule_node *)iter.get();
 495
 496             if (n->inst->is_math())
 497                n->unblocked_time = MAX2(n->unblocked_time,
 498                                         time + chosen->latency);
 499          }
 500       }
 501    }
 502
 503    assert(instructions_to_schedule == 0);
 504 }
 505
 506 void
 507 fs_visitor::schedule_instructions()
 508 {
 509    fs_inst *next_block_header = (fs_inst *)instructions.head;
 510    instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next);
 511
 512    while (!next_block_header->is_tail_sentinel()) {
 513       /* Add things to be scheduled until we get to a new BB. */
 514       while (!next_block_header->is_tail_sentinel()) {
 515          fs_inst *inst = next_block_header;
 516          next_block_header = (fs_inst *)next_block_header->next;
 517
 518          sched.add_inst(inst);
 519          if (inst->opcode == BRW_OPCODE_IF ||
 520              inst->opcode == BRW_OPCODE_ELSE ||
 521              inst->opcode == BRW_OPCODE_ENDIF ||
 522              inst->opcode == BRW_OPCODE_DO ||
 523              inst->opcode == BRW_OPCODE_WHILE ||
 524              inst->opcode == BRW_OPCODE_BREAK ||
 525              inst->opcode == BRW_OPCODE_CONTINUE) {
 526             break;
 527          }
 528       }
 529       sched.calculate_deps();
 530       sched.schedule_instructions(next_block_header);
 531    }
 532
 533    this->live_intervals_valid = false;
 534 }